def _write_exon_blocks(fasta_path, gr, genome):
    """Write the exons (BED blocks) of one transcript to ``fasta_path``.

    ``gr.data`` is split on tabs; when it has exactly 7 fields, field 4 is
    read as the block count, field 5 as the comma-separated block sizes and
    field 6 as the comma-separated block starts (relative to ``gr.initial``).
    Any other field count prints a warning and leaves the file empty.
    """
    with open(fasta_path, "w") as f:
        data = gr.data.split("\t")
        if len(data) != 7:
            print("Warning: The given regions have no block information, please try write_bed_blocks")
            return

        n = int(data[4])
        blocks = [int(b) for b in data[5].split(",") if b]
        starts = [int(s) for s in data[6].split(",") if s]
        reverse = gr.orientation == "-"

        records = []
        # Iterate over the declared block count (not the list lengths) so a
        # malformed line still fails loudly with an IndexError.
        for i in range(n):
            start = gr.initial + starts[i]
            end = start + blocks[i]
            # Exons are numbered along the transcript, so the numbering is
            # reversed for minus-strand transcripts.
            exon_no = n - i if reverse else i + 1
            header = ">" + " ".join([
                gr.name,
                "exon:" + str(exon_no),
                "_".join(["REGION", gr.chrom, str(start), str(end), gr.orientation]),
            ])
            # NOTE(review): the sequence is fetched from start-1 to end-1 while
            # the header reports start/end -- confirm the intended coordinate
            # convention against the BED reader and pysam's fetch().
            seq = genome.fetch(gr.chrom, start - 1, end - 1)
            if reverse:
                seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
            records.append((header, seq))

        if reverse:
            # Emit minus-strand exons in transcript order (exon:1 first).
            records.reverse()
        for header, seq in records:
            print(header, file=f)
            print(seq, file=f)


def _write_region_group(fasta_path, regions, genome):
    """Write each region in ``regions`` as one FASTA record into ``fasta_path``."""
    with open(fasta_path, "w") as f:
        for g in regions:
            fields = ["REGION", g.chrom, str(g.initial), str(g.final)]
            try:
                # Use the strand of the region being written, not of whatever
                # region the caller's loop happens to be positioned on.
                regiontag = "_".join(fields + [g.orientation])
            except (AttributeError, TypeError):
                # No usable strand on this region: leave it out of the tag.
                regiontag = "_".join(fields)
            print(">" + " ".join([g.name, regiontag]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts in a BED file.

    Input BED format should contain:
        blockCount  - The number of blocks (exons) in the BED line.
        blockSizes  - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    Output:
        One FASTA file per transcript, written to ``directory`` as
        ``<name>.fa`` and containing all exons of that transcript.

    If the first region carries block information (7 tab-separated fields in
    its ``data``), every region is treated as one transcript and its blocks
    are written as exons. Otherwise the regions are sorted by name and each
    run of consecutive regions sharing a name is written to one file.

    Args:
        bed: Path of the BED file to read.
        directory: Output directory; created if it does not exist.
        genome_path: Path of the FASTA file opened with ``pysam.Fastafile``.

    Raises:
        SystemExit: in block mode, when a region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read(bed)
    regionset.sort()

    genome = pysam.Fastafile(genome_path)
    try:
        if not regionset.sequences:
            # Nothing to write; previously this crashed on an unbound variable.
            return

        try:
            blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
        except (AttributeError, IndexError, TypeError):
            # First region has no usable extra-column data.
            blockinfor = False

        # Both branches write into `directory`, so make sure it exists first.
        if not os.path.exists(directory):
            os.makedirs(directory)

        if blockinfor:
            for gr in regionset:
                if not gr.name:
                    print("Error: For fetching exon sequences, please define the transcript name.")
                    sys.exit()
                _write_exon_blocks(os.path.join(directory, gr.name + ".fa"), gr, genome)
            return

        # No block information: group consecutive regions by name.
        regionset.sequences.sort(key=lambda g: g.name)
        group_name = None
        group = None
        for gr in regionset:
            if not gr.name:
                gr.name = gr.toString()
            if group is not None and gr.name == group_name:
                group.add(gr)
                continue
            if group is not None:
                # Name changed: flush the finished transcript.
                _write_region_group(os.path.join(directory, group_name + ".fa"), group, genome)
            group_name = gr.name
            group = GenomicRegionSet(gr.name)
            group.add(gr)

        # Last transcript
        if group is not None:
            _write_region_group(os.path.join(directory, group_name + ".fa"), group, genome)
    finally:
        genome.close()
################################################################################## parser = argparse.ArgumentParser( description='Replace TCONs in BED file by assoicated gene names', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-bed', type=str, help="BED file or a directory containing BED files") parser.add_argument('-output', type=str, help="Define the output directory") parser.add_argument('-organism', type=str, help="Define the organism") args = parser.parse_args() genome = GenomeData(args.organism) if os.path.isfile(args.bed): regionset = GenomicRegionSet("bed") regionset.read(args.bed) gr = regionset.gene_association(organism=args.organism, promoter_length=1000, thresh_dist=500000, show_dis=True) regionset.replace_region_name(gr, combine=True) regionset.write(args.output) elif os.path.isdir(args.bed): if not os.path.exists(args.output): os.makedirs(args.output) for root, dirnames, filenames in os.walk(args.bed): for filename in filenames: if ".bed" in filename:
from rgt.ExperimentalMatrix import *
#from fisher import pvalue
import scipy.stats

# Positional command-line arguments, as read below:
#   1  design file              (required)
#   2  genome name              (required)
#   3  gene file                (required)
#   4  randomize, parsed as int (required)
#   5  background peaks file    (optional)
#   6  distance, parsed as int  (optional, default 50000)
#   7  output directory         (optional, default "")
outdir = ""
back = False
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])

backGroundPeaks = False
if len(sys.argv) > 5:
    # The index has to match the guard: reading argv[6] here raised an
    # IndexError with exactly six arguments and collided with the distance
    # argument below.
    backGroundPeaksName = sys.argv[5]
    backBed = GenomicRegionSet("BACK")
    backBed.read(backGroundPeaksName)
    backGroundPeaks = True

distance = 50000
if len(sys.argv) > 6:
    # Parse the numeric value; len() returned the length of the argument string.
    distance = int(sys.argv[6])

if len(sys.argv) > 7:
    outdir = sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps = ExperimentalMatrix()
exps.read(designFile)
#from fisher import pvalue
import scipy.stats

# Positional command-line arguments, as read below:
#   1  design file              (required)
#   2  genome name              (required)
#   3  gene file                (required)
#   4  randomize, parsed as int (required)
#   5  background peaks file    (optional)
#   6  distance, parsed as int  (optional, default 50000)
#   7  output directory         (optional, default "")
outdir=""
back=False
designFile = sys.argv[1]
genomeName = sys.argv[2]
geneFile = sys.argv[3]
randomize = int(sys.argv[4])

backGroundPeaks=False
if len(sys.argv) > 5:
    # The index has to match the guard: reading argv[6] here raised an
    # IndexError with exactly six arguments and collided with the distance
    # argument below.
    backGroundPeaksName = sys.argv[5]
    backBed=GenomicRegionSet("BACK")
    backBed.read(backGroundPeaksName)
    backGroundPeaks=True

distance=50000
if len(sys.argv) > 6:
    # Parse the numeric value; len() returned the length of the argument string.
    distance=int(sys.argv[6])

if len(sys.argv) > 7:
    outdir=sys.argv[7]

#genomeFile=anotationPath+"chrom.sizes"
#geneFile=anotationPath+"association_file.bed"

exps=ExperimentalMatrix()
exps.read(designFile)
################################################################################## parser = argparse.ArgumentParser(description='Replace TCONs in BED file by assoicated gene names', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-bed', type=str, help="BED file or a directory containing BED files") parser.add_argument('-output', type=str, help="Define the output directory") parser.add_argument('-organism', type=str, help="Define the organism") args = parser.parse_args() genome = GenomeData(args.organism) if os.path.isfile(args.bed): regionset = GenomicRegionSet("bed") regionset.read(args.bed) gr = regionset.gene_association(organism=args.organism, promoter_length=1000, thresh_dist=500000, show_dis=True) regionset.replace_region_name(gr,combine=True) regionset.write(args.output) elif os.path.isdir(args.bed): if not os.path.exists(args.output): os.makedirs(args.output) for root, dirnames, filenames in os.walk(args.bed): for filename in filenames: if ".bed" in filename: print(filename) fnn = os.path.basename(filename)