def TransUnitIterator(handle, **kwargs):
    """Yield TransUnit records parsed from a tab-delimited text stream.

    :param handle: an iterable of text lines, or a ``str`` path that is
        opened (and closed again) by this generator.
    :param kwargs: accepted for interface compatibility; not used.
    :raises ValueError: if ``handle`` is a path that cannot be opened.

    Blank lines and lines starting with ``#`` are skipped. A line that is
    ``//`` after stripping ends the current record: the TransUnit built so
    far is yielded and a new one is started. Anything accumulated after the
    last ``//`` is never yielded. Every other line is split on tabs and
    dispatched on its second field.
    """
    opened_here = False
    if isinstance(handle, str):
        path = handle
        try:
            handle = open(path, "r")
        except (IOError, OSError):
            # Only I/O failures mean "can't open"; anything else propagates.
            raise ValueError("Can't open file %s" % path)
        opened_here = True
    try:
        TU = TransUnit()
        for line in handle:
            line = line.strip()
            if not line or line[0] == "#":
                continue
            # strip() already removed trailing blanks, so "// " can never
            # occur here and needs no separate comparison.
            if line == "//":
                yield TU
                TU = TransUnit()  # Reset
                continue
            x = line.split("\t")
            tag = x[1]
            if tag == "OverlapGene:":
                TU.append_overlap_gene(Bed(x[2:]))
            elif tag == "OverlapFeat:":
                TU.append_overlap_feat(Bed(x[2:]))
            elif tag == "NearbyPromoter:":
                TU.append_promoter(x[2])
            elif tag == "Promoter Info:":
                TU.promoterInfo = x[2]
            else:
                TU.processHeader(x)
    finally:
        # Close only what this generator opened; caller-owned handles are
        # left untouched.
        if opened_here:
            handle.close()
def Main():
    """Insert per-sample RPKM columns for both ends of every interaction.

    Reads the command line through ``ParseArg()``. For every data file the
    total read count is determined (from ``pysam.idxstats`` for ``bam``
    input, otherwise by iterating the file as ``bed``) and the file is
    indexed with ``DBI.init``. Each non-blank row of ``args.input`` is then
    written to ``args.output`` with the values returned by ``RPKM`` inserted
    after the first seven columns of each interaction end. Rows with either
    end on ``chrM`` are counted but not written.

    Exits with status 1 when the number of data files and names differ.
    """
    args = ParseArg()

    if len(args.data) != len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    # store data information
    data = {}
    total_reads = {}
    for i, temp_name in enumerate(args.name):
        print >> sys.stderr, "\n Reading data file:" + temp_name + "..."
        total_reads[temp_name] = 0
        if args.format[i] == "bam":
            # Sum the third tab-separated field of every idxstats line.
            total_reads[temp_name] = sum(
                int(row.rstrip('\n').split('\t')[2])
                for row in pysam.idxstats(args.data[i]))
        else:
            for b in TableIO.parse(args.data[i], "bed"):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    print >> sys.stderr, " reading %d reads..\r" % (
                        total_reads[temp_name]),
        data[temp_name] = DBI.init(args.data[i], args.format[i])

    # Freeze the sample order once so the header and the value columns
    # are guaranteed to line up.
    sample_names = list(data)

    # header
    header = ["chr", "start", "end", "type", "name", "subtype", "count"
              ] + sample_names

    with open(args.output, 'w') as output, open(args.input, 'r') as Input:
        print >> output, "\t".join(g + "_%d" % (f) for f in [1, 2] for g in header) + "\tinteraction\tp-value"

        num = 0
        print >> sys.stderr, "Start process interactions:"
        for l in Input:
            l = l.strip()
            if l == '':
                continue
            l = l.split('\t')
            num = num + 1
            if l[0] == "chrM" or l[7] == "chrM":
                continue
            C1 = Bed([l[0], int(l[1]), int(l[2])])
            C2 = Bed([l[7], int(l[8]), int(l[9])])
            rpkm1 = "\t".join(
                str(RPKM(C1, data[n], total_reads[n], n))
                for n in sample_names)
            rpkm2 = "\t".join(
                str(RPKM(C2, data[n], total_reads[n], n))
                for n in sample_names)
            print >> output, "\t".join(
                str(f)
                for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]])
            if num % 1000 == 0:
                print >> sys.stderr, " Output interaction: %d\r" % (num),
def Main():
    """Run ``Generate_output_str`` on every pair of query regions.

    Publishes ``args``, ``sp1`` and ``sp2`` as module globals (presumably so
    that the pool workers can read them -- confirm against
    ``Generate_output_str``). The two files in ``args.q_region`` are read in
    lockstep; reading stops at the first line that is empty or blank in
    either file. A pair is kept only if both chromosomes are ``chrX``,
    ``chrY`` or ``chr`` followed by digits. Non-empty results are written,
    one per line, to ``args.output``.

    Exits with status 204 if the numbers of histone marks disagree, and with
    the first argument of the exception (or 1 if it has none) when a worker
    fails.
    """
    global args
    args = ParseArg()

    # Index peak files
    global sp1, sp2
    sp1, sp2 = ReadHistones(args.bg)

    if len(sp1) != len(sp2) or len(sp1) != len(args.histone):
        print >> sys.stderr, "The number of histone marks must be identical for the two species. Provided histone mark names should match the peak files."
        sys.exit(204)

    def _is_wanted(chrom):
        # NOTE: lstrip("chr") removes any leading run of the characters
        # 'c', 'h', 'r', not just the literal prefix "chr".
        return (chrom == "chrX" or chrom == "chrY"
                or chrom.lstrip("chr").isdigit())

    input_list = []
    with open(args.q_region[0], "r") as fbed1, open(args.q_region[1],
                                                    "r") as fbed2:
        while True:
            line1 = fbed1.readline().strip().split()
            line2 = fbed2.readline().strip().split()
            if len(line1) == 0 or len(line2) == 0:
                break
            bed1 = Bed(line1)
            bed2 = Bed(line2)
            if _is_wanted(bed1.chr) and _is_wanted(bed2.chr):
                input_list.append((bed1, bed2))

    p = Pool(args.p_num)
    try:
        out_queue = p.map(Generate_output_str, input_list)
        p.close()
        p.join()
    except Exception as e:
        p.terminate()
        print >> sys.stderr, e.args
        # e.args may be empty; fall back to a generic failure status.
        sys.exit(e.args[0] if e.args else 1)

    # The output file is opened only here; the earlier, never-closed
    # open() of the same path has been removed.
    with open(args.output, "w") as fout:
        for item in out_queue:
            if item:
                print >> fout, item
def Extend_liftPeaks(ori_dict, lift_dict, lift_back_peak):
    '''
    Extend a remapped peak if it became too short when remapped back.

    A lifted peak is extended when the region obtained by remapping it back
    is shorter than 0.9 * the original length. Lines of ``lift_back_peak``
    whose region does not overlap the original peak (``Overlap`` with
    ``ifStrand=True``) are ignored.

    :param ori_dict: original peaks keyed by peak name (no ``$`` in names).
    :param lift_dict: lifted peaks keyed by region name (names contain ``$``).
    :param lift_back_peak: path of a whitespace-delimited file with the peak
        regions of species 1 remapped back to species 1; column 4 is the
        region name, whose part before ``$`` is the original peak name.
    :return: ``(sp1_dict, sp2_dict)``, two OrderedDicts keyed by region name
        holding the original peak and the (possibly extended) lifted peak.
    '''
    N = 0.9
    sp1_dict = OrderedDict()
    sp2_dict = OrderedDict()
    with open(lift_back_peak, "r") as fin:
        for line in fin:
            fields = line.strip().split()
            peak_name = fields[3].split("$")[0]
            back_bed = Bed(fields)
            ori_bed = ori_dict[peak_name]
            lift_bed = lift_dict[back_bed.id]
            if not Overlap(ori_bed, back_bed, ifStrand=True):
                # if no overlap, no need to proceed.
                continue
            ori_len = ori_bed.stop - ori_bed.start
            back_len = back_bed.stop - back_bed.start
            # Convert before dividing: with integer coordinates the old
            # float(back_len / ori_len) floored the ratio first, so every
            # peak even slightly shorter than the original was extended.
            if float(back_len) / ori_len < N:
                left_ext_len = max(0, back_bed.start - ori_bed.start)
                right_ext_len = max(0, ori_bed.stop - back_bed.stop)
                if lift_bed.strand == ori_bed.strand:
                    lift_new_start = max(1, lift_bed.start - left_ext_len)
                    lift_new_stop = lift_bed.stop + right_ext_len + 1
                else:
                    # Opposite strands: the missing left part of the
                    # original lies on the right of the lifted peak.
                    lift_new_start = max(1, lift_bed.start - right_ext_len)
                    lift_new_stop = lift_bed.stop + left_ext_len + 1
                sp1_dict[back_bed.id] = ori_bed
                sp2_dict[back_bed.id] = Bed([
                    lift_bed.chr, lift_new_start, lift_new_stop, lift_bed.id,
                    lift_bed.score, lift_bed.strand
                ])
            else:
                sp1_dict[back_bed.id] = ori_bed
                sp2_dict[back_bed.id] = lift_bed
    return sp1_dict, sp2_dict
def Annotate(self, dbi1, dbi2, dbi3):
    """
    Annotate this region once; later calls are no-ops.

    If ``self.annotated`` is already true nothing happens. Otherwise, when
    ``self.chr`` contains ``"chr"``, ``annotation()`` is called for the
    region and its four results are stored in ``self.type``, ``self.name``,
    ``self.subtype`` and ``self.proper``.

    :param dbi1: the `DBI.init <http://bam2xwiki.appspot.com/DBI>`_ object (from BAM2X) for bed6 file of all kinds of RNA
    :param dbi2: the `DBI.init <http://bam2xwiki.appspot.com/DBI>`_ object for bed12 file of lincRNA and mRNA with intron, exon, UTR
    :param dbi3: the `DBI.init <http://bam2xwiki.appspot.com/DBI>`_ object for bed6 file of mouse repeat

    Example:

    >>> str="chr13\t40975747\t40975770\t+"
    >>> a=annotated_bed(str)
    >>> a.Cluster(3)
    >>> ref_allRNA=DBI.init("../../Data/all_RNAs-rRNA_repeat.txt.gz","bed")
    >>> ref_detail=DBI.init("../../Data/Ensembl_mm9.genebed.gz","bed")
    >>> ref_repeat=DBI.init("../../Data/mouse.repeat.txt.gz","bed")
    >>> a.Annotate(ref_allRNA,ref_detail,ref_repeat)
    >>> print a
    "chr13\t40975747\t40975770\tprotein_coding\tgcnt2\tintron\t3"
    """
    if self.annotated:
        return
    if "chr" in self.chr:
        region = Bed([self.chr, self.start, self.end])
        looked_up = annotation(region, dbi1, dbi2, dbi3)
        [self.type, self.name, self.subtype, self.proper] = looked_up
    # NOTE(review): nesting reconstructed from a flattened source line; the
    # flag is assumed to be set even when no lookup was made -- confirm.
    self.annotated = True
def read(self,handle):
    '''
    Append every item of a list or iterator to this index.

    Items whose type is exactly ``list`` or ``tuple`` are wrapped in ``Bed``
    first; anything else is appended unchanged.

    Usage:
        Example:
            f=TableIO.parse("filename","genebed")
            data=binindex(f)
        Example:
            f=TableIO.parse("filename","vcf")
            data=binindex(f)
          equals to
            f=TableIO.parse("filename","vcf")
            data=binindex()
            data.read(f)
        Example:
            data.read(bedlist)
    '''
    for item in handle:
        if type(item) in (list, tuple):
            self.append(Bed(item))
        else:
            self.append(item)
def find_nearest(bed, dbi, extends=50000, **kwargs):
    """Find the feature in ``dbi`` closest to ``bed``.

    The query window is ``bed`` widened by ``extends`` on both sides, with
    the start clamped at 0. Only features whose ``distance()`` to ``bed`` is
    below ``2 * extends`` are considered; the first one found wins ties.

    :param bed: the reference region.
    :param dbi: object with a ``query(bed, **kwargs)`` method.
    :param extends: number of positions the window is widened on each side.
    :param kwargs: forwarded unchanged to ``dbi.query``.
    :return: ``(distance, feature, strand)``, or ``(None, None, None)`` when
        nothing qualifies. ``strand`` is ``"."`` if either strand is ``"."``,
        ``"+"`` if both strands are equal and ``"-"`` otherwise.
    """
    window = Bed([bed.chr, max(bed.start - extends, 0), bed.stop + extends])
    best_distance = 2 * extends
    nearest = None
    strand = None
    found = False
    for result in dbi.query(window, **kwargs):
        current = distance(bed, result)  # computed once per candidate
        if current < best_distance:
            best_distance = current
            nearest = result
            if result.strand == "." or bed.strand == ".":
                strand = "."
            elif result.strand == bed.strand:
                strand = "+"
            else:
                strand = "-"
            found = True
    if not found:
        return (None, None, None)
    return (best_distance, nearest, strand)
def BamToBedIterator(filename, **kwargs):
    '''
    Iterate over a bam file, yielding Bed objects instead of
    pysam.AlignedRead objects.

    Reads with ``tid < 0`` are skipped. Every other read becomes
    ``Bed([reference name, pos, aend, qname, mapq, strand])`` where strand
    is "-" for reverse reads and "+" otherwise. The bam file is closed when
    the iterator is exhausted or discarded. ``kwargs`` is accepted but not
    used.

    Usage:
        from xplib.TableIO.BamIO import BamToBedIterator
        for read in BamToBedIterator(filename):
            print read

    Wrapper in TableIO.parse(filename,"bam2bed")
    Usage:
        for i in TableIO.parse(filename,"bam2bed"):
            print i

    A simple bam2bed.py which reads a bam file and prints the aligned reads
    in bed format:

        import sys
        from xplib import TableIO
        filename=sys.argv[1]
        for i in TableIO.parse(filename,"bam2bed"):
            print i
    '''
    f = pysam.Samfile(filename, "rb")
    try:
        for i in f:
            if i.tid < 0:
                continue
            strand = "-" if i.is_reverse else "+"
            yield Bed([f.references[i.tid], i.pos, i.aend, i.qname, i.mapq,
                       strand])
    finally:
        # Runs on exhaustion, on error and when the generator is closed.
        f.close()
def Main():
    """Run ``Generate_output_str`` on every pair of query regions.

    Publishes ``args``, ``sp1`` and ``sp2`` as module globals. The two files
    in ``args.q_region`` are read in lockstep; reading stops at the first
    line whose first tab-separated field is empty in either file. A pair is
    kept only if each chromosome is ``chrX``, ``chrY`` or parses as an
    integer after ``lstrip("chr")``. Non-empty results are written, one per
    line, to ``args.output``.

    Exits with status 1 if the numbers of histone marks disagree.
    """
    global args
    args=ParseArg()
    print >>sys.stderr, "Indexing..."
    global sp1, sp2
    sp1, sp2=ReadHistones(args.bg)
    print >>sys.stderr, "Done indexing"

    if len(sp1)!=len(sp2) or len(sp1)!=len(args.histone):
        print >>sys.stderr, "Check the number of histone modifications!"
        # An input error must not report success to the calling shell.
        sys.exit(1)

    def _is_wanted(chrom):
        if chrom=="chrX" or chrom=="chrY":
            return True
        try:
            # NOTE: lstrip("chr") removes any leading run of the characters
            # 'c', 'h', 'r', not just the literal prefix "chr".
            int(chrom.lstrip("chr"))
        except ValueError:
            return False
        return True

    input_list=[]
    with open(args.q_region[0],"r") as fbed1, open(args.q_region[1],"r") as fbed2:
        while True:
            line1=fbed1.readline().strip().split("\t")
            line2=fbed2.readline().strip().split("\t")
            if line1[0]=="" or line2[0]=="":
                break
            bed1=Bed(line1)
            bed2=Bed(line2)
            if _is_wanted(bed1.chr) and _is_wanted(bed2.chr):
                input_list.append((bed1, bed2))

    p=Pool(args.p_num)
    try:
        out_queue=p.map(Generate_output_str,input_list)
    finally:
        # Release the worker processes whether or not map() succeeded.
        p.close()
        p.join()

    # The output file is opened only here; the earlier, never-closed
    # open() of the same path has been removed.
    with open(args.output,"w") as fout:
        for item in out_queue:
            if item:
                print >>fout, item
def test(): if len(sys.argv) == 1: print >> sys.stderr, "Usage: Utils.py file.bed" exit() a = TableIO.parse(sys.argv[1], 'bed') data = readIntoBinIndex(a) bed = Bed(["chr1", 100000, 200000, ".", ".", "."]) g = getOverlapFeatures(bed, data) print "Overlap with", bed for i in g: print i
def Construct_dict(fname):
    """Load a whitespace-delimited bed file into an ordered name -> Bed map.

    :param fname: path of the file; every non-blank line is passed, split on
        whitespace, to ``Bed``.
    :return: an OrderedDict keyed by ``bed.id`` in file order.

    A repeated id is reported on stderr and terminates the program with
    exit status 1.
    """
    peak_dict = OrderedDict()
    with open(fname, "r") as fin:
        for line in fin:
            fields = line.strip().split()
            if not fields:
                # Blank lines carry no record; skip them rather than hand
                # an empty field list to Bed.
                continue
            bed = Bed(fields)
            if bed.id in peak_dict:
                print >> sys.stderr, "Duplicated names: " + bed.id
                # Non-zero so the failure is visible to the calling shell.
                sys.exit(1)
            peak_dict[bed.id] = bed
    return peak_dict
def parse_string_to_bed(string):
    """Turn a ``chromosome:start-end`` string into a three-field Bed.

    The start is stored as ``int(start) - 1`` and the end as ``int(end)``.
    If the string does not split into exactly two parts on ``:``, or the
    second part not into exactly two parts on ``-``, a format hint is
    written to stderr and the program exits with status 1.
    """
    def _reject():
        print >> sys.stderr, "String Format should be\n chromsome:start-end"
        exit(1)

    parts = string.split(":")
    if len(parts) != 2:
        _reject()
    chrom, span = parts
    bounds = span.split("-")
    if len(bounds) != 2:
        _reject()
    return Bed([chrom, int(bounds[0]) - 1, int(bounds[1])])
def BedIterator(handle):
    """Yield one Bed per data line of a tab-delimited bed stream.

    :param handle: an iterable of text lines, or a ``str`` path that is
        opened (and closed again) by this generator.
    :raises ValueError: if ``handle`` is a path that cannot be opened.

    Blank lines and lines starting with ``#`` are skipped; every other line
    is stripped, split on tabs and passed to ``Bed``.
    """
    opened_here = False
    if isinstance(handle, str):
        path = handle
        try:
            handle = open(path, "r")
        except (IOError, OSError):
            # Only I/O failures mean "can't open"; anything else propagates.
            raise ValueError("Can't open file %s" % path)
        opened_here = True
    try:
        for line in handle:
            line = line.strip()
            # Test for emptiness first: indexing line[0] on a blank line
            # would raise IndexError.
            if not line or line[0] == "#":
                continue
            yield Bed(line.split("\t"))
    finally:
        # Close only what this generator opened.
        if opened_here:
            handle.close()
def Main():
    """Write the genes found near each input region.

    Every line of ``args.input`` is parsed as a Bed. The region is replaced
    by a window of ``args.ext_dis`` on both sides of its midpoint and passed,
    together with the original start and stop, to ``findNearbyGene``. For
    each returned gene one tab-separated line is written to ``args.output``:
    the region id, ``gene[1]`` and ``gene[0]``.
    """
    args = ParseArg()
    anno = DBI.init(args.annotation, "bed")
    ext_dis = args.ext_dis
    target_num = args.target_num
    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            region = Bed(line.strip().split())
            ori_start, ori_stop = region.start, region.stop
            mid_point = (ori_start + ori_stop) / 2
            # Re-centre the region on its midpoint before searching.
            region.start = mid_point - ext_dis
            region.stop = mid_point + ext_dis
            nearby = findNearbyGene(region, anno, ori_start, ori_stop,
                                    target_num)
            for gene in nearby:
                columns = [region.id, gene[1], str(gene[0])]
                print >> fout, "\t".join(columns)
def test(): if len(sys.argv) == 1: print >> sys.stderr, "Usage: Utils.py file.bed" exit() a = TableIO.parse(sys.argv[1], 'genebed') data = readIntoBinIndex(a) bed = Bed(["chr12", 54380000, 54392000, "HOXC", 0, "+"]) g = getOverlapFeatures(bed, data) Overlap_dict = Classify_Overlap(bed, g) overlap_string = '' for k, v in Overlap_dict.iteritems(): if v: overlap_string += "".join( [str(k + '_' + each) + ';' for each in v]) print bed, overlap_string
def iterOverlapFeature(bed, data):
    '''
    Iterate over the features in data that overlap bed.

    bed may be a Bed-like object (with chr, start and stop) or a plain tuple
    or list, whose first three items are taken as (chrom, start, stop).
    data maps a chromosome name to a list of bins, each a list of features.
    Nothing is yielded for a chromosome that is not in data. A feature
    overlaps when f.start < bed.stop and f.stop > bed.start.

    Usage:
        from xplib.Annotation.Utils import *
        for i in iterOverlapFeature(bed,data):
            print i
    '''
    # The old test compared the type with the list [1, 2, 3] itself, which
    # is never equal, so lists were not converted.
    if type(bed) in (tuple, list):
        bed = Bed(bed[0:3])  # guess (chrome,start,stop)
    if bed.chr not in data:
        # A plain return ends a generator; raising StopIteration inside one
        # is turned into RuntimeError by PEP 479.
        return
    D = data[bed.chr]
    for bin_id in iterRangeOverlapBins(bed.start, bed.stop):
        for f in D[bin_id]:
            if f.start < bed.stop and f.stop > bed.start:
                yield f
def Main():
    """Append weighted read counts per sample to every nucleosome row.

    Each bed file in ``args.beds`` is indexed with ``DBI.init`` under the
    name at the same position in ``args.name``. Nucleosomes are read from
    ``args.nucleosome`` (``metabed`` with header); those on chrX, chrY or
    chrM are skipped. For every sample, the reads returned by the index
    around the nucleosome centre are weighted by
    ``max(min(1, (ma - |read_centre - centre|) / 25.0), 0)`` and summed.
    The original row plus one count per sample is written to
    ``args.output``.

    ``ma`` and ``find_center`` are module-level names not defined here.
    """
    args = ParseArg()

    #store bed files with indexing and count information:
    bed = {}
    print >> sys.stderr, "Starting index bed files:"
    for i in range(len(args.beds)):
        temp_name = args.name[i]
        print >> sys.stderr, " #Indexing for bed file of", temp_name, "\r",
        bed[temp_name] = DBI.init(args.beds[i], 'bed')
    # Sample names in command-line order, one per bed file. Counting in this
    # order (not in dict-key order) keeps the columns under their header.
    sample_names = [args.name[i] for i in range(len(args.beds))]

    half_len = int(args.len)
    print >> sys.stderr
    print >> sys.stderr, "Reading nucleosome peak xls file from Danpos."
    nucleosomes = TableIO.parse(args.nucleosome, 'metabed', header=True)

    print >> sys.stderr, "Start Counting..."
    with open(args.nucleosome, 'r') as header_source:
        line_head = header_source.readline().strip()
    line_head = line_head + "\t" + "\t".join(str(f) for f in args.name)

    with open(args.output, "w") as out:
        print >> out, line_head
        for i in nucleosomes:
            chrom = i.chr
            if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
                continue
            center = int(i.start + i.end) / 2
            count = np.zeros(len(args.beds), dtype="float")
            line = str(i)
            window = Bed([
                chrom, center - ma - (half_len - 75),
                center + ma + (half_len - 75)
            ])
            for k, name in enumerate(sample_names):
                for j in bed[name].query(window):
                    j_center = find_center(j, half_len)
                    weight = max(
                        min(1, (ma - abs(j_center - center)) / 25.0), 0)
                    count[k] += weight
            line = line + "\t" + "\t".join(str(f) for f in count)
            print >> out, line
def compare(isoforms):
    """Report how many reads of a transcript region the isoforms explain.

    Resets the module globals ``isoforms_set``, ``selected_isoforms_set``,
    ``reads_set`` and ``selected_reads_set``. The region spans the smallest
    start and largest stop of ``isoforms`` on the chromosome of the first
    isoform; its reads are fetched from ``dbi`` with ``method="fetch12"``.
    ``find_max_compatible()`` is then called repeatedly until at least 99%
    of the reads are selected or it returns a true value. A REGION line, an
    ISOFORM_INPUT_NUMBER line, a SUMMARY line and a closing ``//`` are
    written to ``out``.

    ``dbi``, ``out`` and ``find_max_compatible`` are module-level names.
    """
    global reads_set,isoforms_set,selected_reads_set,selected_isoforms_set
    isoforms_set=[]
    selected_isoforms_set=[]
    first=isoforms[0]
    chrom=first.chr
    region_start=first.start
    region_stop=first.stop
    for isoform in isoforms:
        region_start=min(region_start,isoform.start)
        region_stop=max(region_stop,isoform.stop)
        isoforms_set.append(isoform)
    transcript_region=Bed([chrom,region_start,region_stop])
    print >>out,"REGION\t",chrom,"\t",region_start,"\t",region_stop
    print >>out,"ISOFORM_INPUT_NUMBER\t",len(isoforms_set)

    # reading all the reads in this transcript region
    reads_set=[]
    selected_reads_set=[]
    for read in dbi.query(transcript_region,method="fetch12"):
        reads_set.append(read)
    reads_num=len(reads_set)

    # compare two sets
    if reads_num==0:
        print >>out,"SUMMARY\t0 / 0 (0.0%) reads were generated from", len(selected_isoforms_set), "isoforms"
    else:
        while float(len(selected_reads_set))/reads_num < 0.99:
            if find_max_compatible():
                # no more isoform to select
                break
        print >>out,"SUMMARY\t",len(selected_reads_set) ,"/", reads_num ,
        ratio=float(len(selected_reads_set))/reads_num
        print >>out,"(%.4f)"%ratio,
        print >>out," reads were generated from ",len(selected_isoforms_set)," isoforms"
    print >>out,"//"
def readIntoBinIndex(handle):
    '''
    Read a list or iterator into the BinIndex data structure.

    The result maps each chromosome name to a list of
    4096 + 512 + 64 + 8 + 1 bins; every feature is appended to the bin
    returned by binFromRangeStandard(start, stop). Items whose type is
    exactly list or tuple are converted with Bed first, and it is the
    converted object that is stored.

    Usage:
        Example:
            f=TableIO.parse("filename","genebed")
            data=readIntoBinIndex(f)
        Example:
            data=readIntoBinIndex(bedlist)
    '''
    data = {}
    for i in handle:
        a = Bed(i) if type(i) in (list, tuple) else i
        if a.chr not in data:
            data[a.chr] = [[] for _ in range(4096 + 512 + 64 + 8 + 1)]
        bin_id = binFromRangeStandard(a.start, a.stop)
        # Store the Bed, not the raw list: a raw list has no .start/.stop
        # for later overlap queries to read.
        data[a.chr][bin_id].append(a)
    return data
def test(): if len(sys.argv)==1: print >>sys.stderr,"Usage: binindex.py file.bed" exit() a=TableIO.parse(sys.argv[1],'bed') data=binindex(a) data2=binindex() bed=Bed(["chr1",100000,2000000,".",".","."]) for i in data.query(bed): print "before remove:",len(data) data.remove(i) print "after remove:",len(data) data2.append(i) print data2 print "data finalize:" data.merge(data2) print data print data+data2 print data print data.uniq() print data
def query(self,bed,**kwargs):
    '''
    Iterate over the features in this index that overlap bed.

    bed may be a Bed-like object (with chr, start and stop) or a plain tuple
    or list, whose first three items are taken as (chrom, start, stop).
    Nothing is yielded for a chromosome that is not in self.data. A feature
    overlaps when f.start < bed.stop and f.stop > bed.start.

    Usage:
        data=bedindex(file.bed)
        for i in data.query(bed):
            print i

    **kwargs for further extension for annotation with no pre define format
    '''
    # The old test compared the type with the list [1,2,3] itself, which is
    # never equal, so lists were not converted.
    if type(bed) in (tuple, list):
        bed=Bed(bed[0:3]) # guess (chrome,start,stop)
    if bed.chr not in self.data:
        # A plain return ends a generator; raising StopIteration inside one
        # is turned into RuntimeError by PEP 479.
        return
    D=self.data[bed.chr]
    for bin_id in binindex.iter_range_overlap_bins(bed.start,bed.stop):
        for f in D[bin_id]:
            if f.start < bed.stop and f.stop > bed.start:
                yield f
def readIntoBinIndex(handle):
    '''
    Build a BinIndex data structure from a list or iterator.

    Every item is added with appendIntoBinIndex; items whose type is exactly
    list or tuple are wrapped in Bed first.

    Usage:
        Example:
            f=TableIO.parse("filename","genebed")
            data=readIntoBinIndex(f)
        Example:
            f=TableIO.parse("filename","vcf")
            data=readIntoBinIndex(f)
        Example:
            data=readIntoBinIndex(bedlist)
    '''
    data = {}
    for entry in handle:
        if type(entry) in (list, tuple):
            entry = Bed(entry)
        appendIntoBinIndex(data, entry)
    return data
def GetAnnotationName(pAnno, hasAnno, dbi, hasRepeat, dbirepeat):
    """Assign ``pAnno.id`` and return ``"<type>:<id>:<proper>"``.

    :param pAnno: annotation record; ``source``, ``chr``, ``start``,
        ``end``, ``name``, ``type`` and ``proper`` are read, ``id`` (and,
        for ``.fa`` sources, ``chr``) are written.
    :param hasAnno: whether ``dbi`` should be searched.
    :param dbi: gene annotation index with a ``query(bed)`` method.
    :param hasRepeat: whether ``dbirepeat`` should be searched.
    :param dbirepeat: repeat annotation index with a ``query(bed)`` method.
    :raises Exception: for a non-tRNA ``genome`` record that matches nothing
        although at least one index was searched.

    For ``source == "genome"`` the indexes are searched in order (genes,
    then repeats); a tRNA that matches nothing is reported under its name
    and leaves ``pAnno.id`` untouched. For a source containing ``.fa`` the
    id is the part of ``chr`` before the first ``_`` and ``chr`` becomes the
    remainder, if any. Otherwise the id is the name.
    """
    if pAnno.source == "genome":
        if hasAnno or hasRepeat:
            # Built for either lookup: it used to be created only under
            # hasAnno, so a repeat-only search hit an undefined name.
            bed = Bed([pAnno.chr, pAnno.start, pAnno.end, '.', 0.0, '+'])
        if hasAnno:
            for hit in dbi.query(bed):
                name_component = hit.id.split(".", 2)[2]
                id_component = hit.id.split(".", 2)[1]
                if pAnno.name == name_component or pAnno.name.split(
                        "_", 2)[0] == id_component:
                    pAnno.id = id_component
                    return ":".join(
                        str(f) for f in [pAnno.type, pAnno.id, pAnno.proper])
        if hasRepeat:
            for hit in dbirepeat.query(bed):
                tempname = hit.id.split("&")
                if pAnno.name == tempname[0]:
                    pAnno.id = "".join(
                        str(f) for f in [pAnno.chr, pAnno.name, hit.start])
                    return ":".join(
                        str(f) for f in [pAnno.type, pAnno.id, pAnno.proper])
        if pAnno.type == 'tRNA':
            return ":".join(
                str(f) for f in [pAnno.type, pAnno.name, pAnno.proper])
        if hasAnno or hasRepeat:
            raise Exception('pAnno not found! ' + pAnno.chr + ':' + pAnno.name)
        pAnno.id = "".join(
            str(f) for f in [pAnno.type, pAnno.chr, pAnno.name, pAnno.start])
    elif ".fa" in pAnno.source:
        pieces = pAnno.chr.split("_", 1)
        pAnno.id = pieces[0]
        # Without an underscore there is no remainder; chr stays as it is.
        if len(pieces) > 1:
            pAnno.chr = pieces[1]
    else:
        pAnno.id = pAnno.name
    return pAnno.type + ":" + pAnno.id + ":" + pAnno.proper
def genome_annotation(outputbam,
                      annotationfile,
                      detail,
                      annotationRepeat,
                      mapq_thred,
                      strandenforced=False,
                      posstrand=True,
                      requireUnique=False,
                      results_dict=None):
    """Annotate the alignments of a bam file and merge them into a result map.

    :param outputbam: iterable of alignment records that also provides
        ``getrname(tid)``.
    :param annotationfile: bed file of annotations; when false no annotation
        is looked up and only position, strand and sequence are recorded.
    :param detail: the db_detail bed file.
    :param annotationRepeat: bed file of repeats.
    :param mapq_thred: threshold forwarded to ``Included``.
    :param strandenforced: if true, keep only alignments whose annotation
        reports ``'ProperStrand'``.
    :param posstrand: if false, the strand used for annotation is flipped.
    :param requireUnique: forwarded to ``Included`` as the filter mode.
    :param results_dict: earlier results; not modified.
    :return: a new dict with the entries of ``results_dict`` overridden by
        the entries found here, keyed by read name. A value is a
        tab-separated string for a single hit that passes
        ``Included(record, True, mapq_thred)`` and a list of such strings
        otherwise.
    """
    if annotationfile:
        dbi1 = DBI.init(annotationfile, "bed")
        dbi2 = DBI.init(detail, "bed")
        dbi3 = DBI.init(annotationRepeat, "bed")

    newdict = dict()

    def _remember(record, fields):
        # Store one hit for the read, switching to a list as soon as the
        # read is non-unique or a second hit arrives.
        text = '\t'.join(str(f) for f in fields)
        qname = record.qname
        if qname not in newdict:
            if Included(record, True, mapq_thred):
                newdict[qname] = text
            else:
                # not unique
                newdict[qname] = [text]
        else:
            if type(newdict[qname]) is str:
                newdict[qname] = [newdict[qname]]
            newdict[qname].append(text)

    for record in outputbam:
        if "N" not in record.cigarstring:
            anno_start = record.pos
            anno_end = record.aend
            bed_start = record.pos
            bed_end = record.aend
        else:
            # Spliced alignment: report every block, comma-separated.
            bed_list, anno_start, anno_end = Exon_junction(record)
            bed_start = ",".join([str(f[0]) for f in bed_list])
            bed_end = ",".join([str(f[1]) for f in bed_list])

        if not Included(record, requireUnique, mapq_thred):
            continue

        strandactual = ("+" if posstrand else "-")
        strand = "+"
        if record.is_reverse:
            strandactual = ("-" if posstrand else "+")
            strand = "-"

        if annotationfile:
            bed = Bed([
                outputbam.getrname(record.tid), anno_start, anno_end, '.',
                0.0, strandactual
            ])
            [typ, name, subtype, strandcol] = annotation(
                bed, dbi1, dbi2, dbi3)
            if (not strandenforced) or strandcol == 'ProperStrand':
                _remember(record, [
                    outputbam.getrname(record.tid), bed_start, bed_end,
                    strand, record.seq, 'genome', typ, name, subtype,
                    strandcol
                ])
        else:
            strandcol = '.'
            _remember(record, [
                outputbam.getrname(record.tid),
                record.aend - record.alen + 1, record.aend, strand,
                record.seq, 'genome', strandcol
            ])

    # Copy instead of concatenating item lists; new entries win.
    newanno = dict(results_dict) if results_dict else dict()
    newanno.update(newdict)
    return newanno
def compare_reads(isoforms):
    """Estimate isoform proportions from the reads of one transcript region.

    The region spans the smallest start and largest stop of ``isoforms`` on
    the chromosome of the first isoform; its reads are fetched from ``dbi``
    with ``method="fetch12"``. Each read is reduced to a bit code recording
    which isoforms it is compatible with
    (``Tools.compatible_with_transcript``). Proportions start uniform and
    are refined by EM until two successive estimates differ by less than
    1e-05 according to ``distance()`` (with ``args.BYY`` the EM phase also
    stops after more than 10 iterations and is followed by a hard-cut phase
    that gives every read to its most probable compatible isoform).

    Writes a REGION line, an ISOFORM_INPUT_NUMBER line, one ``HT`` line per
    isoform above ``args.threshold`` (and, unless ``args.hits_only``, one
    ``NT`` line per other isoform) and a closing ``//`` to ``out``. An
    isoform whose score is 0.0 gets its proportion as score before it is
    printed.

    ``dbi``, ``out``, ``args``, ``distance`` and ``get_bit_n`` are
    module-level names not defined here.
    """
    # global dbi,out
    isoforms_set=[]
    chrom=isoforms[0].chr
    min_start=isoforms[0].start
    max_stop=isoforms[0].stop
    for isoform in isoforms:
        if isoform.start < min_start: min_start=isoform.start
        if isoform.stop > max_stop: max_stop=isoform.stop
        isoforms_set.append(isoform)
    transcript_region=Bed([chrom,min_start,max_stop])
    print >>out,"REGION\t",chrom,"\t",min_start,"\t",max_stop
    print >>out,"ISOFORM_INPUT_NUMBER\t",len(isoforms_set)

    # reading all the reads in this transcript region
    reads_set=[]
    for read in dbi.query(transcript_region,method="fetch12"):
        reads_set.append(read)
    reads_num=len(reads_set)

    # compare two sets
    l=len(isoforms_set)
    bincodes={}
    total=reads_num
    if total==0: total=0.001  # avoids division by zero below
    for read in reads_set:
        bincode=0
        for isoform in isoforms_set:
            if Tools.compatible_with_transcript(read,isoform):
                bincode = (bincode<<1)+1
            else:
                bincode = bincode<<1
        bincodes[bincode]=bincodes.get(bincode,0)+1

    # For every code, the isoform indices it is compatible with.
    compatible={}
    for code in bincodes:
        compatible[code]=[j for j in range(l) if get_bit_n(j,l,code)]

    # EM Initialize
    proportion=[ 1.0/l for i in range(l) ]
    iterate_time=0
    while True:
        # E step
        totals=[0.0 for i in range(l)]
        for code in bincodes:
            row_total=0.0
            for j in compatible[code]:
                row_total+=proportion[j]
            for j in compatible[code]:
                totals[j]+=bincodes[code] * proportion[j] / row_total
        # M step. A fresh list every round: reusing one list and then
        # aliasing it as `proportion` made the next distance() compare the
        # list with itself, so the loop always stopped after two rounds.
        new_proportion=[totals[i]/total for i in range(l)]
        dis=distance(proportion,new_proportion)
        proportion=new_proportion
        iterate_time+=1
        if dis<1e-05: break
        if args.BYY and iterate_time > 10: break

    # BYY Hard Cut Algorithm
    while args.BYY:
        totals=[0.0 for i in range(l)]
        for code in bincodes:
            maxj=-1
            for j in compatible[code]:
                if maxj==-1:
                    maxj=j
                elif proportion[j] > proportion[maxj]:
                    maxj=j
            if maxj!=-1: totals[maxj]+=bincodes[code]
        # M step (fresh list, see above)
        new_proportion=[totals[i]/total for i in range(l)]
        dis=distance(proportion,new_proportion)
        if dis<1e-05: break
        proportion=new_proportion

    # print isoforms
    for i,x in enumerate(isoforms_set):
        if proportion[i] > args.threshold:
            label="HT\t"
        elif not args.hits_only:
            label="NT\t"
        else:
            continue
        if x.score==0.0:
            x.score=proportion[i]
        print >>out,label,x,"\t",proportion[i]
    print >>out,"//"
def randRegion(length, RNAs):
    '''
    Pick a random region of the given length inside a randomly chosen RNA.

    The RNA type comes from selectType(length) and the RNA is drawn from
    RNAs[Type]. The start is drawn uniformly from RNA.start up to and
    including max(RNA.stop - length, RNA.start).

    Returns (Bed of chr, start, start + length on the RNA strand, Type).
    '''
    Type = selectType(length)
    RNA = random.choice(RNAs[Type])
    last_start = max(RNA.stop - length, RNA.start)
    start = random.randrange(RNA.start, last_start + 1)
    region = Bed([RNA.chr, start, start + length], strand=RNA.strand)
    return region, Type
def genome_annotation(outputbam, annotationfile, detail, readfilename, unmapfilename, strandenforced = False, posstrand = True, requireUnique = False, results_dict = None, repeatfile = "/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt"):
    """Annotate the alignments of a bam file and merge them into a result map.

    :param outputbam: iterable of alignment records that also provides
        ``getrname(tid)``.
    :param annotationfile: bed file of annotations; when false no annotation
        is looked up and only position, strand and sequence are recorded.
    :param detail: the db_detail bed file.
    :param readfilename: not used by this function.
    :param unmapfilename: fasta file (always created) that receives every
        record that was not recorded; reverse reads are written
        reverse-complemented.
    :param strandenforced: if true, keep only alignments whose annotation
        reports ``'ProperStrand'``.
    :param posstrand: if false, the strand used for annotation is flipped.
    :param requireUnique: forwarded to ``Included`` as the filter mode.
    :param results_dict: earlier results; not modified.
    :param repeatfile: bed file of repeats; defaults to the path that used
        to be hard-coded here.
    :return: a new dict with the entries of ``results_dict`` overridden by
        the entries found here, keyed by read name. A value is a
        tab-separated string for a single hit that passes
        ``Included(record, True)`` and a list of such strings otherwise.
    """
    if annotationfile:
        dbi1=DBI.init(annotationfile,"bed")
        dbi2=DBI.init(detail,"bed")
        dbi3=DBI.init(repeatfile,"bed")

    newdict = dict()

    def _remember(record, fields):
        # Store one hit for the read, switching to a list as soon as the
        # read is non-unique or a second hit arrives.
        text = '\t'.join(str(f) for f in fields)
        qname = record.qname
        if qname not in newdict:
            if Included(record, True):
                newdict[qname] = text
            else:
                # not unique
                newdict[qname] = [text]
        else:
            if type(newdict[qname]) is str:
                newdict[qname] = [newdict[qname]]
            newdict[qname].append(text)

    # The with-block closes the fasta file even if a record fails.
    with open(unmapfilename, 'w') as funmap:
        for record in outputbam:
            IsMapped = False
            if Included(record, requireUnique):
                strandactual = ("+" if posstrand else "-")
                strand = "+"
                if record.is_reverse:
                    strandactual = ("-" if posstrand else "+")
                    strand = "-"
                if annotationfile:
                    bed=Bed([outputbam.getrname(record.tid), record.pos, record.aend,'.',0.0,strandactual])
                    [typ, name, subtype, strandcol] = annotation(bed,dbi1,dbi2,dbi3)
                    if (not strandenforced) or strandcol == 'ProperStrand':
                        _remember(record, [outputbam.getrname(record.tid), record.pos, record.aend, strand, record.seq, 'genome', typ, name, subtype, strandcol])
                        IsMapped = True
                else:
                    strandcol = '.'
                    _remember(record, [outputbam.getrname(record.tid), record.aend - record.alen + 1, record.aend, strand, record.seq, 'genome', strandcol])
                    IsMapped = True

            if not IsMapped:
                # output all pairs that cannot be mapped on both sides as
                # unmapped pairs into the fasta file
                seq = record.seq
                if record.is_reverse:
                    seq = revcomp(record.seq, rev_table)
                unmap_rec = SeqRecord(Seq(seq, IUPAC.unambiguous_dna), id = record.qname, description='')
                SeqIO.write(unmap_rec, funmap, "fasta")

    # Copy instead of concatenating item lists; new entries win.
    newanno = dict(results_dict) if results_dict else dict()
    newanno.update(newdict)
    return newanno
def Main():
    '''
    Count the donor and acceptor sites of all introns in the input reads.

    Reads are parsed from args.input as bam2bed12. For every intron of at
    least args.intron_min_length, the site returned by intron.head() is
    counted as a donor and the one returned by intron.tail() as an
    acceptor, keyed by bedToID(). After a comment header, the sorted donor
    sites and then the sorted acceptor sites are written to args.output,
    each named donor_<n> or acceptor_<n> with its count as the fifth Bed
    field. When args.genome is given, the upper-cased result of
    genome.query() for the site follows in an extra column.
    '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    donorSites={}
    acceptorSites={}
    if args.genome is not None:
        genome=DBI.init(args.genome,"genome")
    else:
        genome=None

    for j,i in enumerate(TableIO.parse(fin,"bam2bed12",references=fin.references,strand=args.strand)):
        if j%1000==0: print >>sys.stderr,"processing ",j,"reads \r",
        for intron in i.Introns():
            if len(intron)< args.intron_min_length: continue
            donorID=bedToID(intron.head())
            donorSites[donorID]=donorSites.get(donorID,0)+1
            acceptorID=bedToID(intron.tail())
            acceptorSites[acceptorID]=acceptorSites.get(acceptorID,0)+1

    def _report(site_counts, placeholder, prefix):
        # Turn the counted ids back into Beds, sort them and print them
        # numbered in sorted order.
        sites=[]
        for key in site_counts:
            a=key.split("\t")
            sites.append(Bed([a[0],a[1],a[2],placeholder,site_counts[key],a[3]]))
        sites.sort()
        for i,x in enumerate(sites):
            x.id=prefix+str(i)
            if genome is not None:
                print >>out,x,"\t",genome.query(x).upper()
            else:
                # No genome given: there is no sequence to look up, so
                # print the site alone instead of calling query on None.
                print >>out,x

    _report(donorSites,"noname_donor","donor_")
    _report(acceptorSites,"noname_acceptor","acceptor_")
def Main():
    """Append read counts and best offsets per sample to nucleosome rows.

    Each file in ``args.bams`` is indexed with ``DBI.init`` (format
    ``args.fmt``) under the name at the same position in ``args.name``.
    Nucleosomes are read from ``args.nucleosome`` (``metabed`` with header);
    those with ``smt_pval`` or ``fuzziness_pval`` above 0.01 and those on
    chrX, chrY or chrM are skipped. For every sample the centres of the
    reads around the nucleosome centre are passed to ``getCount``, which
    returns an offset and a count. The original row, one count per sample
    and one offset per sample are written to ``args.output``. With
    ``args.verbose`` the shifted nucleosome of every sample is also written
    to ``<name>_shift_nucleosomes.bed``.

    ``ma``, ``half_len``, ``rangeS``, ``find_center`` and ``getCount`` are
    module-level names not defined here.
    """
    args=ParseArg()

    #store bed files with indexing and count information:
    bam={}
    print >>sys.stderr,"Starting index bed files:"
    for i in range(len(args.bams)):
        temp_name=args.name[i]
        print >>sys.stderr," #Indexing for bed file of",temp_name,"\r",
        bam[temp_name]=DBI.init(args.bams[i],args.fmt)
    # Sample names in command-line order, one per input file. Counting in
    # this order (not in dict-key order) keeps count[k], offset[k] and
    # out_mark[k] under the header column of the same sample.
    sample_names=[args.name[i] for i in range(len(args.bams))]

    print >>sys.stderr
    print >>sys.stderr,"Reading nucleosome peak xls file from Danpos."
    nucleosomes=TableIO.parse(args.nucleosome,'metabed',header=True)

    print >>sys.stderr,"Initial output files..."
    out=open(args.output,"w")
    out_mark=[]
    try:
        # -- for verbose ---
        if args.verbose:
            for n in args.name:
                out_mark.append(open(n+'_shift_nucleosomes.bed','w'))
        # ------------------

        with open(args.nucleosome,'r') as header_source:
            line_head=header_source.readline().strip()
        line_head=line_head+"\t"+"\t".join(str(f) for f in args.name)+'\t'+"\t".join(str(f)+'_off' for f in args.name)
        print >>out,line_head

        print >>sys.stderr,"Start Counting..."
        num=0
        t0 = time()
        for i in nucleosomes:
            chrom=i.chr
            # only choose nucleosomes with high value and low fuzziness
            if i.smt_pval>0.01 or i.fuzziness_pval>0.01: continue
            if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
                continue
            num=num+1
            center=int(i.start+i.end)/2
            count=np.zeros(len(args.bams),dtype="float")
            offset=np.zeros(len(args.bams),dtype='int')
            line=str(i)
            window=Bed([chrom,center-ma-(half_len-75)-rangeS,center+ma+(half_len-75)+rangeS])
            for k,name in enumerate(sample_names):
                if args.fmt=='bam':
                    query=bam[name].query(window,method='fetch')
                else:
                    query=bam[name].query(window)
                read_centers=[find_center(j,args.fmt) for j in query]
                [o,c]=getCount(read_centers,center)
                count[k]=c
                offset[k]=o
                # -- for verbose ---
                if args.verbose:
                    print >>out_mark[k],chrom+'\t%d\t%d'%(i.start+o,i.end+o)
                # ------------------
            line = line + "\t" + "\t".join(str(f) for f in count) + '\t' + "\t".join(str(f) for f in offset)
            if num%20000==0:
                t1 = time()
                print >>sys.stderr,"processing %dth nucleosome..., time: %.2fs."%(num,t1-t0),'\r',
                t0 = time()
            print >>out,line
        print
    finally:
        # Close every output file, also when counting fails half-way.
        out.close()
        for marked in out_mark:
            marked.close()