def load_objects(self, is_bedgraph, verbose=False):
    """Load every file of the matrix into its in-memory object.

    Entries typed "regions" become a GenomicRegionSet and entries typed
    "genes" become a GeneSet; both are stored in self.objectsDict under the
    entry name. Entries of any other type are left untouched.

    *Keyword arguments:*

        - is_bedgraph -- If true, region files are parsed with read_bedgraph
          instead of read_bed.
        - verbose -- Report progress on stderr (default = False).
    """
    for idx, kind in enumerate(self.types):
        name = self.names[idx]

        if verbose:
            print("Loading file ", self.files[name], file=sys.stderr)
            if kind not in ("regions", "genes"):
                print("Cannot load objects", file=sys.stderr)

        if kind == "genes":
            loaded = GeneSet(name)
            # The stored (possibly relative) path is made absolute first.
            loaded.read(os.path.abspath(self.files[name]))
        elif kind == "regions":
            loaded = GenomicRegionSet(name)
            reader = loaded.read_bedgraph if is_bedgraph else loaded.read_bed
            # The stored (possibly relative) path is made absolute first.
            reader(os.path.abspath(self.files[name]))
        else:
            # Unsupported type: nothing to register.
            continue

        self.objectsDict[name] = loaded
def load_objects(self, is_bedgraph, verbose=False, test=False):
    """Load every file of the matrix into its in-memory object.

    Entries typed "regions" become a GenomicRegionSet and entries typed
    "genes" become a GeneSet; both are stored in self.objectsDict under the
    entry name. Entries of any other type are left untouched.

    *Keyword arguments:*

        - is_bedgraph -- If true, region files are parsed with read_bedgraph
          instead of read_bed.
        - verbose -- Report progress on stderr (default = False).
        - test -- Keep only the leading regions of each region file, for
          quick test runs (default = False). The slice is [0:11], i.e. up
          to 11 regions are kept.
    """
    for idx, kind in enumerate(self.types):
        name = self.names[idx]

        if verbose:
            print("Loading file ", self.files[name], file=sys.stderr)
            if kind not in ("regions", "genes"):
                print("Cannot load objects", file=sys.stderr)

        if kind == "genes":
            loaded = GeneSet(name)
            # The stored (possibly relative) path is made absolute first.
            loaded.read(os.path.abspath(self.files[name]))
        elif kind == "regions":
            loaded = GenomicRegionSet(name)
            reader = loaded.read_bedgraph if is_bedgraph else loaded.read_bed
            reader(os.path.abspath(self.files[name]))
            if test:
                # Truncation applies to both BED and bedgraph input.
                loaded.sequences = loaded.sequences[0:11]
        else:
            # Unsupported type: nothing to register.
            continue

        self.objectsDict[name] = loaded
def load_objects(self, is_bedgraph, verbose=False, test=False):
    """Load every file of the matrix into its in-memory object.

    Entries typed "regions" become a GenomicRegionSet and entries typed
    "genes" become a GeneSet; both are stored in self.objectsDict under the
    entry name. Entries of any other type are left untouched.

    *Keyword arguments:*

        - is_bedgraph -- If true, region files are parsed with read_bedgraph
          instead of read_bed.
        - verbose -- Report progress on stderr (default = False).
        - test -- For BED input only, keep just the leading regions of each
          file for quick test runs (default = False). The slice is [0:11],
          i.e. up to 11 regions are kept. Ignored for bedgraph input.
    """
    for idx, kind in enumerate(self.types):
        name = self.names[idx]

        if verbose:
            print("Loading file ", self.files[name], file=sys.stderr)
            if kind not in ("regions", "genes"):
                print("Cannot load objects", file=sys.stderr)

        if kind == "genes":
            loaded = GeneSet(name)
            # The stored (possibly relative) path is made absolute first.
            loaded.read(os.path.abspath(self.files[name]))
        elif kind == "regions":
            loaded = GenomicRegionSet(name)
            if is_bedgraph:
                loaded.read_bedgraph(os.path.abspath(self.files[name]))
            elif not test:
                # The stored (possibly relative) path is made absolute first.
                loaded.read_bed(os.path.abspath(self.files[name]))
            else:
                # Read the whole file into a scratch set, then keep only the
                # leading slice in the set that gets registered.
                scratch = GenomicRegionSet(name)
                scratch.read_bed(os.path.abspath(self.files[name]))
                loaded.sequences = scratch.sequences[0:11]
        else:
            # Unsupported type: nothing to register.
            continue

        self.objectsDict[name] = loaded
def match_ms_tags(self, field, test=False):
    """Add more entries to match the missing tags of the given field.

    For example, reads may carry cell tags such as 'cell_A' and 'cell_B'
    while regions carry none (they are filed under "ALL"). Each "ALL" entry
    is then replicated once per concrete tag, as "<name>_<tag>", so that it
    can be matched against every tagged entry. The replicated originals are
    appended to self.trash.

    Nothing happens when the field has no "ALL" group.

    *Keyword arguments:*

        - field -- Field to add extra entries.
        - test -- Keep only the leading regions ([0:11]) of each replicated
          region set, for quick test runs (default = False).
    """
    by_tag = self.fieldsDict[field]
    if "ALL" not in by_tag:
        return

    # Snapshot the concrete tags into a list. On Python 3, dict.keys() is a
    # live view without remove(), so it cannot be edited in place; copying
    # also leaves the "ALL" group itself in the dictionary.
    tags = [tag for tag in by_tag if tag != "ALL"]

    for name in by_tag["ALL"]:
        i = self.names.index(name)
        for tag in tags:
            new_name = name + "_" + tag
            self.names.append(new_name)
            self.types.append(self.types[i])
            self.files[new_name] = self.files[name]

            # Register the copy under every extra field (columns after the
            # first three): the new tag for `field` itself, and whatever
            # the original entry had for all other fields.
            for f in self.fields[3:]:
                if f == field:
                    key = tag
                else:
                    key = self.get_type(name=name, field=f)
                self.fieldsDict[f].setdefault(key, []).append(new_name)

            if self.types[i] == "regions":
                g = GenomicRegionSet(new_name)
                g.read_bed(self.files[name])
                if test:
                    g.sequences = g.sequences[0:11]
                self.objectsDict[new_name] = g

        # The untagged original is superseded by its per-tag copies.
        self.trash.append(name)
def match_ms_tags(self, field):
    """Add more entries to match the missing tags of the given field.

    For example, reads may carry cell tags such as 'cell_A' and 'cell_B'
    while regions carry none (they are filed under "ALL"). Each "ALL" entry
    is then replicated once per concrete tag, as "<name>_<tag>", so that it
    can be matched against every tagged entry. The replicated originals are
    appended to self.trash.

    Nothing happens when the field has no "ALL" group.

    *Keyword arguments:*

        - field -- Field to add extra entries.
    """
    by_tag = self.fieldsDict[field]
    if "ALL" not in by_tag:
        return

    # Snapshot the concrete tags into a list. On Python 3, dict.keys() is a
    # live view without remove(), so it cannot be edited in place; copying
    # also leaves the "ALL" group itself in the dictionary.
    tags = [tag for tag in by_tag if tag != "ALL"]

    for name in by_tag["ALL"]:
        i = self.names.index(name)
        for tag in tags:
            new_name = name + "_" + tag
            self.names.append(new_name)
            self.types.append(self.types[i])
            self.files[new_name] = self.files[name]

            # Register the copy under every extra field (columns after the
            # first three): the new tag for `field` itself, and whatever
            # the original entry had for all other fields.
            for f in self.fields[3:]:
                if f == field:
                    key = tag
                else:
                    key = self.get_type(name=name, field=f)
                self.fieldsDict[f].setdefault(key, []).append(new_name)

            if self.types[i] == "regions":
                g = GenomicRegionSet(new_name)
                g.read_bed(self.files[name])
                self.objectsDict[new_name] = g

        # The untagged original is superseded by its per-tag copies.
        self.trash.append(name)
# Command-line driver: annotate the regions of BED file(s) with gene names.
#
# Options parsed with argparse: -bed (a BED file or a directory of BED
# files), -output (output location) and -organism.
#
# When -bed is a single file, the regions are read, gene_association() is
# called with promoterLength=1000, threshDist=500000 and show_dis=True, the
# region names are replaced by the result (combine=True) and the set is
# written to -output.
#
# When -bed is a directory, the -output directory is created if missing and
# the tree is walked; every file whose name contains ".bed" (a substring
# test, so e.g. "x.bed.gz" also matches) is printed and its basename taken.
# The rest of the per-file loop body is not visible in this fragment.
#
# NOTE(review): `genome` is assigned from GenomeData(args.organism) but not
# used anywhere in the visible statements -- confirm it is needed further on.
################################################################################## parser = argparse.ArgumentParser(description='Replace TCONs in BED file by assoicated gene names', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-bed', type=str, help="BED file or a directory containing BED files") parser.add_argument('-output', type=str, help="Define the output directory") parser.add_argument('-organism', type=str, help="Define the organism") args = parser.parse_args() genome = GenomeData(args.organism) if os.path.isfile(args.bed): regionset = GenomicRegionSet("bed") regionset.read_bed(args.bed) gr = regionset.gene_association(organism=args.organism, promoterLength=1000, threshDist=500000, show_dis=True) regionset.replace_region_name(gr,combine=True) regionset.write_bed(args.output) elif os.path.isdir(args.bed): if not os.path.exists(args.output): os.makedirs(args.output) for root, dirnames, filenames in os.walk(args.bed): for filename in filenames: if ".bed" in filename: print(filename) fnn = os.path.basename(filename)
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats

# Positional command-line arguments:
#   1  design file (experimental matrix)
#   2  genome name
#   3  gene file
#   4  randomize (integer)
#   5  background peaks BED file (optional)
#   6  distance, integer (optional, default 50000)
#   7  output directory (optional, default "")
outdir = ""
back = False
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])

backGroundPeaks = False
if len(sys.argv) > 5:
    # The guard only guarantees that index 5 exists, so the background file
    # is read from there. Index 6 would raise IndexError when exactly five
    # arguments are given, and it is the slot used for the distance below.
    backGroundPeaksName = sys.argv[5]
    backBed = GenomicRegionSet("BACK")
    backBed.read_bed(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 6:
    # Parse the numeric value; len() would yield the number of characters
    # of the argument string rather than the distance it spells.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)
# Script set-up: read the experimental matrix and the gene annotation.
#
# argv[1] is the design file and argv[2] the annotation path. The genome and
# gene file names are built by plain string concatenation, so the annotation
# path must already end with a path separator.
#
# The gene BED file is loaded into a GenomicRegionSet (named after the file
# path) and the unique region names are collected into `allgenes`; the gene
# sets come from exps.get_genesets().
#
# An optional argv[3] names a background-peaks BED file; when present, `back`
# is set to True and the file is read into `backBed`.
#
# NOTE(review): the last statement rebinds `backBed` to a fresh
# GenomicRegionSet("BACK") right after read_bed(), which discards the regions
# just loaded. Whether it belongs inside the `if` cannot be told from this
# fragment -- confirm the intent before relying on `backBed`.
back=False designFile = sys.argv[1] anotationPath = sys.argv[2] genomeFile=anotationPath+"chrom.sizes" geneFile=anotationPath+"association_file.bed" exps=ExperimentalMatrix() exps.read(designFile) beds=[] geneLists=[] #this should be improved bedGenes = GenomicRegionSet(geneFile) bedGenes.read_bed(geneFile) allgenes=[] for r in bedGenes: allgenes.append(r.name) allgenes=list(set(allgenes)) genesets=exps.get_genesets() if len(sys.argv) > 3: back=True backGroundPeaks = sys.argv[3] backBed=GenomicRegionSet("BACK") backBed.read_bed(backGroundPeaks) backBed=GenomicRegionSet("BACK")
if __name__ == "__main__":
    import sys
    from rgt.GenomicRegionSet import *

    # Positional command-line arguments, in order:
    #   1 BAM file, 2 genome FASTA file, 3 BED file with the regions,
    #   4 k-mer size (int), 5 shift (int), 6 output location for the tables.
    bam_file = sys.argv[1]
    fasta_file = sys.argv[2]
    bed_file = sys.argv[3]
    kmer = int(sys.argv[4])
    shift = int(sys.argv[5])
    out = sys.argv[6]

    # Load the regions over which the bias table is built.
    regions = GenomicRegionSet("regions")
    regions.read_bed(bed_file)

    # Build the table and write it out.
    table = BiasTable(
        regions=regions,
        dnase_file_name=bam_file,
        genome_file_name=fasta_file,
        k_nb=kmer,
        shift=shift,
    )
    table.write_tables(out)
def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts and write them as FASTA.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Output:
        One FASTA file per transcript, "<directory>/<name>.fa", containing
        all the exons of that transcript. The directory is created when it
        does not exist.

    Two input layouts are handled, decided on the first region:
        - its extra data has 7 tab-separated fields: every BED line is one
          transcript whose exons are described by the block columns;
        - otherwise: every BED line is one exon, and consecutive lines
          (after sorting by name) sharing a name form one transcript.

    *Keyword arguments:*

        - bed -- Path to the BED file.
        - directory -- Output directory for the FASTA files.
        - genome_path -- Path to the genome FASTA file (opened with pysam).

    An empty BED file produces no output.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    if len(regionset.sequences) == 0:
        # Nothing to export (and no first region to inspect below).
        return

    # Decide the layout from the first region. Any field count other than 7,
    # or missing data, means "no block information"; the flag is always
    # bound, so a short data column cannot leave it undefined.
    try:
        has_blocks = len(regionset.sequences[0].data.split("\t")) == 7
    except (AttributeError, TypeError):
        has_blocks = False

    if not os.path.exists(directory):
        os.makedirs(directory)

    genome = pysam.Fastafile(genome_path)
    try:
        if has_blocks:
            _write_block_transcripts(regionset, directory, genome)
        else:
            # Grouping below relies on equal names being adjacent.
            regionset.sequences.sort(key=lambda g: g.name)
            _write_grouped_transcripts(regionset, directory, genome)
    finally:
        genome.close()


def _write_block_transcripts(regionset, directory, genome):
    """Write one FASTA file per BED line, one record per block (exon)."""
    for gr in regionset:
        if not gr.name:
            print(
                "Error: For fetching exon sequences, please define the transcript name."
            )
            sys.exit()

        data = gr.data.split("\t")
        with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
            if len(data) != 7:
                # The (empty) file is still created for this region.
                print(
                    "Warning: The given regions have no block information, please try write_bed_blocks"
                )
                continue

            n = int(data[4])
            # The size/start lists may end with a trailing comma.
            blocks = [int(b) for b in data[5].split(",") if b]
            starts = [int(s) for s in data[6].split(",") if s]

            records = []
            for i in range(n):
                start = gr.initial + starts[i]
                end = start + blocks[i]
                seq = genome.fetch(gr.chrom, start - 1, end - 1)
                if gr.orientation == "-":
                    # Exons are numbered from the transcript's 5' end, which
                    # is the last block on the minus strand.
                    ex = "exon:" + str(n - i)
                    # NOTE(review): Bio.Alphabet/IUPAC was removed from
                    # recent Biopython releases -- confirm the pinned version.
                    seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                else:
                    ex = "exon:" + str(i + 1)
                region_tag = "_".join(
                    ["REGION", gr.chrom, str(start), str(end), gr.orientation]
                )
                records.append((">" + " ".join([gr.name, ex, region_tag]), seq))

            if gr.orientation == "-":
                # Emit minus-strand exons in transcript order.
                records.reverse()
            for header, seq in records:
                print(header, file=f)
                print(seq, file=f)


def _write_grouped_transcripts(regionset, directory, genome):
    """Write one FASTA file per run of consecutive regions sharing a name."""
    group = []
    for gr in regionset:
        if not gr.name:
            gr.name = gr.toString()
        if group and gr.name != group[0].name:
            _write_region_group(group, directory, genome)
            group = []
        group.append(gr)
    if group:
        # Last transcript.
        _write_region_group(group, directory, genome)


def _write_region_group(group, directory, genome):
    """Write the regions of one transcript to "<directory>/<name>.fa"."""
    with open(os.path.join(directory, group[0].name + ".fa"), "w") as f:
        for g in group:
            tag_parts = ["REGION", g.chrom, str(g.initial), str(g.final)]
            # Tag each record with its own strand (not that of whichever
            # region the caller's loop happens to be on); regions without a
            # strand get the shorter tag.
            if isinstance(g.orientation, str):
                tag_parts.append(g.orientation)
            print(">" + " ".join([g.name, "_".join(tag_parts)]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)
def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts and write them as FASTA.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Output:
        One FASTA file per transcript, "<directory>/<name>.fa", containing
        all the exons of that transcript. The directory is created when it
        does not exist.

    Two input layouts are handled, decided on the first region:
        - its extra data has 7 tab-separated fields: every BED line is one
          transcript whose exons are described by the block columns;
        - otherwise: every BED line is one exon, and consecutive lines
          (after sorting by name) sharing a name form one transcript.

    *Keyword arguments:*

        - bed -- Path to the BED file.
        - directory -- Output directory for the FASTA files.
        - genome_path -- Path to the genome FASTA file (opened with pysam).

    An empty BED file produces no output.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    if len(regionset.sequences) == 0:
        # Nothing to export (and no first region to inspect below).
        return

    # Decide the layout from the first region. Any field count other than 7,
    # or missing data, means "no block information"; the flag is always
    # bound, so a short data column cannot leave it undefined.
    try:
        has_blocks = len(regionset.sequences[0].data.split("\t")) == 7
    except (AttributeError, TypeError):
        has_blocks = False

    if not os.path.exists(directory):
        os.makedirs(directory)

    genome = pysam.Fastafile(genome_path)
    try:
        if has_blocks:
            _write_block_transcripts(regionset, directory, genome)
        else:
            # Grouping below relies on equal names being adjacent.
            regionset.sequences.sort(key=lambda g: g.name)
            _write_grouped_transcripts(regionset, directory, genome)
    finally:
        genome.close()


def _write_block_transcripts(regionset, directory, genome):
    """Write one FASTA file per BED line, one record per block (exon)."""
    for gr in regionset:
        if not gr.name:
            print("Error: For fetching exon sequences, please define the transcript name.")
            sys.exit()

        data = gr.data.split("\t")
        with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
            if len(data) != 7:
                # The (empty) file is still created for this region.
                print("Warning: The given regions have no block information, please try write_bed_blocks")
                continue

            n = int(data[4])
            # The size/start lists may end with a trailing comma.
            blocks = [int(b) for b in data[5].split(",") if b]
            starts = [int(s) for s in data[6].split(",") if s]

            records = []
            for i in range(n):
                start = gr.initial + starts[i]
                end = start + blocks[i]
                seq = genome.fetch(gr.chrom, start - 1, end - 1)
                if gr.orientation == "-":
                    # Exons are numbered from the transcript's 5' end, which
                    # is the last block on the minus strand.
                    ex = "exon:" + str(n - i)
                    # NOTE(review): Bio.Alphabet/IUPAC was removed from
                    # recent Biopython releases -- confirm the pinned version.
                    seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
                else:
                    ex = "exon:" + str(i + 1)
                region_tag = "_".join(
                    ["REGION", gr.chrom, str(start), str(end), gr.orientation]
                )
                records.append((">" + " ".join([gr.name, ex, region_tag]), seq))

            if gr.orientation == "-":
                # Emit minus-strand exons in transcript order.
                records.reverse()
            for header, seq in records:
                print(header, file=f)
                print(seq, file=f)


def _write_grouped_transcripts(regionset, directory, genome):
    """Write one FASTA file per run of consecutive regions sharing a name."""
    group = []
    for gr in regionset:
        if not gr.name:
            gr.name = gr.toString()
        if group and gr.name != group[0].name:
            _write_region_group(group, directory, genome)
            group = []
        group.append(gr)
    if group:
        # Last transcript.
        _write_region_group(group, directory, genome)


def _write_region_group(group, directory, genome):
    """Write the regions of one transcript to "<directory>/<name>.fa"."""
    with open(os.path.join(directory, group[0].name + ".fa"), "w") as f:
        for g in group:
            tag_parts = ["REGION", g.chrom, str(g.initial), str(g.final)]
            # Tag each record with its own strand (not that of whichever
            # region the caller's loop happens to be on); regions without a
            # strand get the shorter tag.
            if isinstance(g.orientation, str):
                tag_parts.append(g.orientation)
            print(">" + " ".join([g.name, "_".join(tag_parts)]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)