Ejemplo n.º 1
0
    def load_objects(self, is_bedgraph, verbose=False):
        """Read the file behind every entry and store the resulting object.

        Entries typed "regions" are read into a GenomicRegionSet and entries
        typed "genes" into a GeneSet; each object is stored in
        ``self.objectsDict`` under the entry name. Entries of any other type
        are not loaded.

        *Keyword arguments:*

            - is_bedgraph -- Read "regions" files with read_bedgraph instead of read_bed.
            - verbose -- Report progress on stderr (default = False).
        """
        loadable = ("regions", "genes")
        for idx, kind in enumerate(self.types):
            if verbose:
                print("Loading file ", self.files[self.names[idx]],
                      file=sys.stderr)
                if kind not in loadable:
                    print("Cannot load objects", file=sys.stderr)

            if kind not in loadable:
                continue

            label = self.names[idx]
            if kind == "genes":
                loaded = GeneSet(label)
                reader = loaded.read
            else:
                loaded = GenomicRegionSet(label)
                reader = loaded.read_bedgraph if is_bedgraph else loaded.read_bed

            # Readers always receive the absolute form of the stored path.
            reader(os.path.abspath(self.files[label]))
            self.objectsDict[label] = loaded
Ejemplo n.º 2
0
    def load_objects(self, is_bedgraph, verbose=False, test=False):
        """Read the file behind every entry and store the resulting object.

        Entries typed "regions" are read into a GenomicRegionSet and entries
        typed "genes" into a GeneSet; each object is stored in
        ``self.objectsDict`` under the entry name. Entries of any other type
        are not loaded.

        *Keyword arguments:*

            - is_bedgraph -- Read "regions" files with read_bedgraph instead of read_bed.
            - verbose -- Report progress on stderr (default = False).
            - test -- After reading a BED file, keep only the slice ``[0:11]`` of its
              regions (default = False). Not applied to bedGraph input or gene sets.
        """
        for idx, kind in enumerate(self.types):
            known = kind in ("regions", "genes")
            if verbose:
                print("Loading file ", self.files[self.names[idx]],
                      file=sys.stderr)
                if not known:
                    print("Cannot load objects", file=sys.stderr)
            if not known:
                continue

            label = self.names[idx]
            if kind == "regions":
                loaded = GenomicRegionSet(label)
                # Readers always receive the absolute form of the stored path.
                source = os.path.abspath(self.files[label])
                if is_bedgraph:
                    loaded.read_bedgraph(source)
                else:
                    loaded.read_bed(source)
                    if test:
                        loaded.sequences = loaded.sequences[0:11]
            else:
                loaded = GeneSet(label)
                loaded.read(os.path.abspath(self.files[label]))

            self.objectsDict[label] = loaded
Ejemplo n.º 3
0
    def load_objects(self, is_bedgraph, verbose=False, test=False):
        """Read the file behind every entry and store the resulting object.

        Entries typed "regions" are read into a GenomicRegionSet and entries
        typed "genes" into a GeneSet; each object is stored in
        ``self.objectsDict`` under the entry name. Entries of any other type
        are not loaded.

        *Keyword arguments:*

            - is_bedgraph -- Read "regions" files with read_bedgraph instead of read_bed.
            - verbose -- Report progress on stderr (default = False).
            - test -- For BED input, read the file into a temporary set and keep only
              the slice ``[0:11]`` of its regions (default = False). Not applied to
              bedGraph input or gene sets.
        """
        for idx, kind in enumerate(self.types):
            loadable = kind in ("regions", "genes")
            if verbose:
                print("Loading file ", self.files[self.names[idx]],
                      file=sys.stderr)
                if not loadable:
                    print("Cannot load objects", file=sys.stderr)
            if not loadable:
                continue

            label = self.names[idx]
            if kind == "genes":
                loaded = GeneSet(label)
                loaded.read(os.path.abspath(self.files[label]))
            else:
                loaded = GenomicRegionSet(label)
                # Readers always receive the absolute form of the stored path.
                bed_path = os.path.abspath(self.files[label])
                if is_bedgraph:
                    loaded.read_bedgraph(bed_path)
                elif test:
                    # Only the sequences are copied over from the temporary set.
                    probe = GenomicRegionSet(label)
                    probe.read_bed(bed_path)
                    loaded.sequences = probe.sequences[0:11]
                else:
                    loaded.read_bed(bed_path)

            self.objectsDict[label] = loaded
Ejemplo n.º 4
0
    def match_ms_tags(self, field, test=False):
        """Add entries so that untagged names match every tag of the given field.

        For example, reads may carry cell tags such as 'cell_A' and 'cell_B'
        while the regions carry none (they are listed under the "ALL" tag).
        Each such name is then repeated once per tag, as ``<name>_<tag>``, so
        that it can be matched against all reads.

        For every new entry the name, type and file are registered, the new
        name is filed under the corresponding tag of each field in
        ``self.fields[3:]``, and, for "regions" entries, a GenomicRegionSet is
        read from the original file into ``self.objectsDict``. The original
        name is appended to ``self.trash``.

        *Keyword arguments:*

            - field -- Field to add extra entries.
            - test -- Keep only the slice ``[0:11]`` of each region set read here
              (default = False).
        """
        tags_of_field = self.fieldsDict[field]
        if "ALL" not in tags_of_field:
            return

        # Snapshot the concrete tags as a list. On Python 3 dict.keys() is a
        # view without remove(), so the former keys().remove("ALL") raised
        # AttributeError.
        altypes = [tag for tag in tags_of_field if tag != "ALL"]

        for name in tags_of_field["ALL"]:
            i = self.names.index(name)
            for t in altypes:
                n = name + "_" + t
                self.names.append(n)
                self.types.append(self.types[i])
                self.files[n] = self.files[name]

                for f in self.fields[3:]:
                    # In the expanded field the new entry gets the tag it was
                    # created for; in every other field it inherits the tag
                    # of the name it was copied from.
                    tag = t if f == field else self.get_type(name=name, field=f)
                    self.fieldsDict[f].setdefault(tag, []).append(n)

                if self.types[i] == "regions":
                    g = GenomicRegionSet(n)
                    g.read_bed(self.files[name])
                    if test:
                        g.sequences = g.sequences[0:11]
                    self.objectsDict[n] = g
                # Kept inside the tag loop as before: the original name is
                # recorded once per generated entry.
                self.trash.append(name)
Ejemplo n.º 5
0
    def match_ms_tags(self, field):
        """Add entries so that untagged names match every tag of the given field.

        For example, reads may carry cell tags such as 'cell_A' and 'cell_B'
        while the regions carry none (they are listed under the "ALL" tag).
        Each such name is then repeated once per tag, as ``<name>_<tag>``, so
        that it can be matched against all reads.

        For every new entry the name, type and file are registered, the new
        name is filed under the corresponding tag of each field in
        ``self.fields[3:]``, and, for "regions" entries, a GenomicRegionSet is
        read from the original file into ``self.objectsDict``. The original
        name is appended to ``self.trash``.

        *Keyword arguments:*

            - field -- Field to add extra entries.
        """
        tags_of_field = self.fieldsDict[field]
        if "ALL" not in tags_of_field:
            return

        # Snapshot the concrete tags as a list. On Python 3 dict.keys() is a
        # view without remove(), so the former keys().remove("ALL") raised
        # AttributeError.
        altypes = [tag for tag in tags_of_field if tag != "ALL"]

        for name in tags_of_field["ALL"]:
            i = self.names.index(name)
            for t in altypes:
                n = name + "_" + t
                self.names.append(n)
                self.types.append(self.types[i])
                self.files[n] = self.files[name]

                for f in self.fields[3:]:
                    # In the expanded field the new entry gets the tag it was
                    # created for; in every other field it inherits the tag
                    # of the name it was copied from.
                    tag = t if f == field else self.get_type(name=name, field=f)
                    self.fieldsDict[f].setdefault(tag, []).append(n)

                if self.types[i] == "regions":
                    g = GenomicRegionSet(n)
                    g.read_bed(self.files[name])
                    self.objectsDict[n] = g
                # Kept inside the tag loop as before: the original name is
                # recorded once per generated entry.
                self.trash.append(name)
Ejemplo n.º 6
0
##################################################################################
# Command-line interface: three string options, all optional for argparse.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description='Replace TCONs in BED file by assoicated gene names',
)
parser.add_argument(
    '-bed',
    help="BED file or a directory containing BED files",
    type=str,
)
parser.add_argument(
    '-output',
    help="Define the output directory",
    type=str,
)
parser.add_argument(
    '-organism',
    help="Define the organism",
    type=str,
)
args = parser.parse_args()

# Genome data handle built from the organism given on the command line.
genome = GenomeData(args.organism)

# Single BED file: read it, associate its regions with genes, rename the
# regions from that association and write the result to args.output.
if os.path.isfile(args.bed):
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(args.bed)
    # NOTE(review): promoterLength / threshDist look like distances in bp and
    # show_dis like "append the distance to the name" -- confirm against
    # GenomicRegionSet.gene_association.
    gr = regionset.gene_association(organism=args.organism, promoterLength=1000, 
                                    threshDist=500000, show_dis=True)
    regionset.replace_region_name(gr,combine=True)
    
    # In this branch args.output is passed to write_bed directly.
    regionset.write_bed(args.output)

# Directory of BED files: args.output is created as a directory if missing,
# then the tree under args.bed is walked.
elif os.path.isdir(args.bed):
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    for root, dirnames, filenames in os.walk(args.bed):
            
        for filename in filenames:
            # Substring test: matches any file name containing ".bed", not
            # only names that end with it.
            if ".bed" in filename:
                print(filename)
                # os.walk already yields bare file names, so basename returns
                # filename unchanged here.
                fnn = os.path.basename(filename)
Ejemplo n.º 7
0
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats

outdir = ""

back = False

# Positional command-line arguments:
#   argv[1] design file          argv[2] genome name
#   argv[3] gene file            argv[4] randomize (integer)
#   argv[5] background peaks BED (optional)
#   argv[6] distance, integer    (optional, default 50000)
#   argv[7] output directory     (optional, default "")
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])
backGroundPeaks = False
if len(sys.argv) > 5:
    # This length check only guarantees argv[5]. Reading argv[6] here raised
    # IndexError when the background file was the last argument, and made
    # the background file share a slot with the distance argument.
    backGroundPeaksName = sys.argv[5]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 6:
    # Parse the value itself; len() returned the number of characters of the
    # argument string instead of the distance.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)
Ejemplo n.º 8
0
back = False

# Command line: <design file> <annotation path prefix> [<background peaks BED>]
designFile = sys.argv[1]
anotationPath = sys.argv[2]
# The annotation path is used as a plain string prefix, exactly as given.
genomeFile = anotationPath + "chrom.sizes"
geneFile = anotationPath + "association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)

beds = []
geneLists = []

# Collect the distinct region names of the association file.
# (this should be improved)
bedGenes = GenomicRegionSet(geneFile)
bedGenes.read_bed(geneFile)
allgenes = set()
for r in bedGenes:
    allgenes.add(r.name)
allgenes = list(allgenes)

genesets = exps.get_genesets()

if len(sys.argv) > 3:
    back = True
    backGroundPeaks = sys.argv[3]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaks)

# NOTE(review): this rebinding replaces whatever background set was loaded in
# the branch above with a fresh, unread one -- confirm that this is intended.
backBed = GenomicRegionSet("BACK")
Ejemplo n.º 9
0
if __name__ == "__main__":

    import sys

    from rgt.GenomicRegionSet import *

    # Positional arguments:
    #   <bam> <fasta> <bed> <k-mer size> <shift> <output>
    bam_file, fasta_file, bed_file = sys.argv[1], sys.argv[2], sys.argv[3]
    kmer, shift = int(sys.argv[4]), int(sys.argv[5])
    out = sys.argv[6]

    # Regions of the BED file are handed to the bias table.
    regions = GenomicRegionSet("regions")
    regions.read_bed(bed_file)

    table = BiasTable(
        regions=regions,
        dnase_file_name=bam_file,
        genome_file_name=fasta_file,
        k_nb=kmer,
        shift=shift,
    )
    table.write_tables(out)










Ejemplo n.º 10
0
def _exon_region_tag(region):
    """Return the ``REGION_<chrom>_<start>_<end>[_<strand>]`` tag of *region*."""
    parts = ["REGION", region.chrom, str(region.initial), str(region.final)]
    try:
        return "_".join(parts + [region.orientation])
    except (AttributeError, TypeError):
        # No usable strand information: leave the strand suffix out.
        return "_".join(parts)


def _write_transcript_regions(directory, transcript, regions, genome):
    """Write all *regions* of one transcript to ``<directory>/<transcript>.fa``."""
    with open(os.path.join(directory, transcript + ".fa"), "w") as out:
        for g in regions:
            print(">" + " ".join([g.name, _exon_region_tag(g)]), file=out)
            print(genome.fetch(g.chrom, g.initial, g.final), file=out)


def _write_block_exons(gr, directory, genome):
    """Write the exons given by the BED block columns of *gr* to ``<name>.fa``."""
    data = (gr.data or "").split("\t")
    with open(os.path.join(directory, gr.name + ".fa"), "w") as out:
        if len(data) != 7:
            # As before, the (empty) file is still created for such regions.
            print(
                "Warning: The given regions have no block information, please try write_bed_blocks"
            )
            return

        n = int(data[4])
        blocks = [int(b) for b in data[5].split(",") if b]
        starts = [int(s) for s in data[6].split(",") if s]
        reverse = gr.orientation == "-"

        # Build every record before writing, so a failure while fetching does
        # not leave a partially written transcript behind.
        records = []
        for i in range(n):
            start = gr.initial + starts[i]
            end = start + blocks[i]
            # On the minus strand exons are numbered from the far end.
            ex = "exon:" + str(n - i if reverse else i + 1)
            header = ">" + " ".join([
                gr.name, ex,
                "_".join(["REGION", gr.chrom, str(start), str(end),
                          gr.orientation])
            ])
            # NOTE(review): this path fetches (start - 1, end - 1) while the
            # path without block columns fetches (initial, final) unshifted;
            # confirm which coordinate convention is intended.
            seq = genome.fetch(gr.chrom, start - 1, end - 1)
            if reverse:
                seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
            records.append((header, seq))

        if reverse:
            records.reverse()
        for header, seq in records:
            print(header, file=out)
            print(seq, file=out)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts in a BED file.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    When the first region carries these block columns, every region is
    treated as one transcript and its blocks are written as exons. Otherwise
    regions are grouped by name and each group is written as one transcript.

    Output:
        One FASTA file per transcript, ``<directory>/<name>.fa``, containing
        all exons of that transcript. Nothing is written for an empty BED file.

    Arguments:
        bed -- Path of the BED file.
        directory -- Output directory; created if it does not exist.
        genome_path -- Path of the genome FASTA file opened with pysam.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    if not regionset.sequences:
        return

    # Decide the layout from the first region. The flag is always bound now:
    # previously a first region whose data did not have 7 columns left it
    # undefined and the function died with a NameError.
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except AttributeError:
        blockinfor = False

    if not os.path.exists(directory):
        os.makedirs(directory)

    genome = pysam.Fastafile(genome_path)
    try:
        if blockinfor:
            for gr in regionset:
                if not gr.name:
                    print(
                        "Error: For fetching exon sequences, please define the transcript name."
                    )
                    sys.exit()
                _write_block_exons(gr, directory, genome)
        else:
            # Regions of one transcript must be adjacent for the grouping
            # below; unnamed regions sort first and are named afterwards.
            regionset.sequences.sort(key=lambda g: g.name or "")
            current, members = None, []
            for gr in regionset:
                if not gr.name:
                    gr.name = gr.toString()
                if members and gr.name != current:
                    _write_transcript_regions(directory, current, members,
                                              genome)
                    members = []
                current = gr.name
                members.append(gr)
            if members:
                _write_transcript_regions(directory, current, members, genome)
    finally:
        genome.close()
Ejemplo n.º 11
0
def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts in a BED file.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    When the first region carries these block columns, every region is
    treated as one transcript and its blocks are written as exons. Otherwise
    regions are grouped by name and each group is written as one transcript.

    Output:
        One FASTA file per transcript, ``<directory>/<name>.fa``, containing
        all exons of that transcript. Nothing is written for an empty BED file.

    Arguments:
        bed -- Path of the BED file.
        directory -- Output directory; created if it does not exist.
        genome_path -- Path of the genome FASTA file opened with pysam.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    if not regionset.sequences:
        return

    # Always bind the layout flag: a first region whose data did not split
    # into 7 columns used to leave it undefined (NameError further down).
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except AttributeError:
        blockinfor = False

    if not os.path.exists(directory):
        os.makedirs(directory)

    genome = pysam.Fastafile(genome_path)

    def region_tag(g):
        # Strand of the region being written (not of the loop's current one).
        fields = ["REGION", g.chrom, str(g.initial), str(g.final)]
        try:
            return "_".join(fields + [g.orientation])
        except (AttributeError, TypeError):
            return "_".join(fields)

    def flush(transcript, members):
        # One FASTA file per transcript, one record per member region.
        with open(os.path.join(directory, transcript + ".fa"), "w") as f:
            for g in members:
                print(">" + " ".join([g.name, region_tag(g)]), file=f)
                print(genome.fetch(g.chrom, g.initial, g.final), file=f)

    try:
        if blockinfor:
            for gr in regionset:
                if not gr.name:
                    print("Error: For fetching exon sequences, please define the transcript name.")
                    sys.exit()

                data = (gr.data or "").split("\t")
                with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
                    if len(data) != 7:
                        # As before, the (empty) file is still created.
                        print("Warning: The given regions have no block information, please try write_bed_blocks")
                        continue

                    n = int(data[4])
                    blocks = [int(b) for b in data[5].split(",") if b]
                    starts = [int(s) for s in data[6].split(",") if s]
                    minus = gr.orientation == "-"

                    printstr = []
                    for i in range(n):
                        start = gr.initial + starts[i]
                        end = start + blocks[i]
                        # On the minus strand exons are numbered from the far end.
                        ex = "exon:" + (str(n - i) if minus else str(i + 1))
                        title = ">" + " ".join([
                            gr.name, ex,
                            "_".join(["REGION", gr.chrom, str(start), str(end),
                                      gr.orientation])])
                        # NOTE(review): (start - 1, end - 1) here versus the
                        # unshifted (initial, final) in flush(); confirm the
                        # intended coordinate convention.
                        seq = genome.fetch(gr.chrom, start - 1, end - 1)
                        if minus:
                            seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                        printstr.append((title, seq))

                    if minus:
                        printstr = printstr[::-1]
                    for title, seq in printstr:
                        print(title, file=f)
                        print(seq, file=f)
        else:
            # Regions of one transcript must be adjacent for the grouping
            # below; unnamed regions sort first and are named afterwards.
            regionset.sequences.sort(key=lambda g: g.name or "")
            pre_id, members = None, []
            for gr in regionset:
                if not gr.name:
                    gr.name = gr.toString()
                if members and gr.name != pre_id:
                    flush(pre_id, members)
                    members = []
                pre_id = gr.name
                members.append(gr)
            # Last transcript
            if members:
                flush(pre_id, members)
    finally:
        genome.close()