def scan_fp(plusdnase, minusdnase, bed, out, upstream, downstream):
    """Append per-bin plus/minus strand bigWig signal to each BED record.

    For every record of ``bed`` the window
    ``[start - upstream, end + downstream)`` is summarized in both bigWig
    files using as many bins as the window has bases.  The first six BED
    columns, followed by the plus-strand and then the minus-strand
    ``sum_data`` values, are written to ``out`` as one tab-separated line.

    Records are skipped when their chromosome is not listed in the
    chromosome tree of ``plusdnase`` or when ``start < upstream`` (the
    window would begin before position 0).

    Args:
        plusdnase: Path of the plus-strand bigWig file.
        minusdnase: Path of the minus-strand bigWig file.
        bed: Path of the whitespace-separated input BED file.
        out: Path of the output file (overwritten).
        upstream: Number of bases added before each record start.
        downstream: Number of bases added after each record end.
    """
    # Only the chromosome names are needed, so keep them in a set.
    known_chroms = set(
        node['key'] for node in BwIO(plusdnase).chromosomeTree['nodes'])

    # Context managers close every handle even if a record fails.
    with open(plusdnase, 'rb') as plus_fh, \
            open(minusdnase, 'rb') as minus_fh, \
            open(bed) as inf, \
            open(out, 'w') as outf:
        plus_bw = BigWigFile(plus_fh)
        minus_bw = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrom = ll[0]
            if chrom not in known_chroms:
                continue
            if int(ll[1]) < upstream:
                continue
            win_start = int(ll[1]) - upstream
            win_end = int(ll[2]) + downstream
            nbins = win_end - win_start
            plus_sig = plus_bw.summarize(chrom, win_start, win_end, nbins)
            minus_sig = minus_bw.summarize(chrom, win_start, win_end, nbins)
            fields = ll[:6]
            fields.extend(str(v) for v in plus_sig.sum_data)
            fields.extend(str(v) for v in minus_sig.sum_data)
            outf.write("\t".join(fields) + "\n")
def getsignal(inputfile, outputfile, pcut, DHT, Veh, pspan):
    """Annotate each input record with cut-count and signal statistics.

    The motif length ``ml`` is taken from the first record
    (``end - start``) and reused for every record.  For each record a
    window of ``2 * pspan`` bases centred on ``start + ml // 2`` is read
    from ``pcut`` with one bin per base, and these values are appended to
    the record:

    * ``TC``  -- sum of all bins in the window;
    * ``FOS`` -- ``-((C + 1) / (R + 1) + (C + 1) / (L + 1))`` where ``C``
      is the sum over the ``ml`` central bins and ``L`` / ``R`` are the
      sums over the ``ml`` bins immediately left / right of them;
    * ``dhtnum`` / ``vehnum`` -- the window sum in ``DHT`` / ``Veh``
      (summarized in two bins) plus one.

    Args:
        inputfile: Path of the whitespace-separated input file.
        outputfile: Path of the tab-separated output file (overwritten).
        pcut: Path of the cut-count bigWig file.
        DHT: Path of the first signal bigWig file.
        Veh: Path of the second signal bigWig file.
        pspan: Half-width, in bases, of the window around the centre.
    """
    with open(pcut, 'rb') as pcut_fh, \
            open(DHT, 'rb') as dht_fh, \
            open(Veh, 'rb') as veh_fh, \
            open(inputfile) as inf, \
            open(outputfile, 'w') as outf:
        pcutbw = BigWigFile(pcut_fh)
        dht = BigWigFile(dht_fh)
        veh = BigWigFile(veh_fh)

        first = inf.readline().split()
        if not first:
            # Empty input: leave an empty output instead of raising.
            return
        ml = int(first[2]) - int(first[1])
        half = ml // 2
        inf.seek(0)

        # Offset of the motif start inside the 2 * pspan window.
        # NOTE(review): when pspan < ml // 2 + ml the left slice start goes
        # negative and wraps around, as in the original code -- confirm.
        motif_off = pspan - half
        for line in inf:
            ll = line.split()
            center = int(ll[1]) + half
            left = center - pspan
            right = center + pspan
            cut = list(
                pcutbw.summarize(ll[0], left, right, 2 * pspan).sum_data)
            TC = sum(cut)
            C = sum(cut[motif_off:(motif_off + ml)])
            L = sum(cut[(motif_off - ml):motif_off])
            R = sum(cut[(motif_off + ml):(motif_off + 2 * ml)])
            FOS = -1 * ((C + 1) / (R + 1) + (C + 1) / (L + 1))
            dhtnum = sum(
                list(dht.summarize(ll[0], left, right, 2).sum_data)) + 1
            vehnum = sum(
                list(veh.summarize(ll[0], left, right, 2).sum_data)) + 1
            newll = ll + [TC, FOS, dhtnum, vehnum]
            outf.write("\t".join(map(str, newll)) + "\n")
def get_regionLevel_simplex_parameters(inputbed, outputbed, plusbw, minusbw,
                                       biasmat, ext, genome2bit):
    """Accumulate cut-weighted bias parameters for each BED region.

    Each region is re-centred on ``(start + end) // 2`` and extended by
    ``ext`` bases on both sides (clamped at 0).  For every position with a
    positive cut count, the ``2 * flank``-mer starting at that position is
    mapped through ``permuteSeq8mer.txt`` and ``seq2biasParm``, and the
    resulting vector, scaled by the cut count, is added to the strand
    total.  The input columns plus the plus-strand and minus-strand totals
    are written to ``outputbed``.

    Args:
        inputbed: Path of the whitespace-separated input BED file.
        outputbed: Path of the tab-separated output file (overwritten).
        plusbw: Path of the plus-strand bigWig file.
        minusbw: Path of the minus-strand bigWig file.
        biasmat: Bias matrix file handed to ``readBG``.
        ext: Half-width, in bases, of the window around the region centre.
        genome2bit: Path of the 2bit genome sequence file.
    """
    simplex_code = encoding()
    biasdict, flank = readBG(biasmat)
    # Only the first value returned by paramest is used here.
    B, _B0, _B1, _B2 = paramest(biasdict)

    # Lookup table read from a file in the current working directory.
    permuteSeq = {}
    with open("permuteSeq8mer.txt") as perm_fh:
        for line in perm_fh:
            ll = line.split()
            permuteSeq[ll[0]] = ll[1]

    genome = twobitreader.TwoBitFile(genome2bit)
    kmer_len = 2 * flank

    with open(plusbw, 'rb') as plus_fh, \
            open(minusbw, 'rb') as minus_fh, \
            open(inputbed) as inf, \
            open(outputbed, 'w') as outf:
        plusBWH = BigWigFile(plus_fh)
        minusBWH = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            center = (int(ll[1]) + int(ll[2])) // 2
            start = max(0, center - ext)
            end = center + ext
            plus_summary = plusBWH.summarize(chrm, start, end, end - start)
            minus_summary = minusBWH.summarize(chrm, start, end, end - start)
            # The original compared type(...) to None, which is never true,
            # and read .sum_data first; test the summary objects themselves.
            if plus_summary is None or minus_summary is None:
                continue
            plusSig = plus_summary.sum_data
            minusSig = minus_summary.sum_data

            # NOTE(review): start - flank can be negative near position 0;
            # the slice is passed to twobitreader unchanged -- confirm.
            plusSequence = genome[chrm][(start - flank):(end + flank)].upper()
            minusSequence = genome[chrm][(start - flank + 1):
                                         (end + flank + 1)].upper()
            plus_data = numpy.zeros(len(B))
            minus_data = numpy.zeros(len(B))

            for i, pcuts in enumerate(plusSig):
                if pcuts > 0:
                    pseq = plusSequence[i:(i + kmer_len)].upper()
                    if "N" not in pseq:
                        p_out = seq2biasParm(permuteSeq[pseq], B,
                                             simplex_code)
                        plus_data += pcuts * p_out

            for i, mcuts in enumerate(minusSig):
                if mcuts > 0:
                    tmpseq = minusSequence[i:(i + kmer_len)]
                    if "N" not in tmpseq:
                        mseq = revcomp(tmpseq).upper()
                        m_out = seq2biasParm(permuteSeq[mseq], B,
                                             simplex_code)
                        minus_data += mcuts * m_out

            newll = ll + list(plus_data) + list(minus_data)
            outf.write("\t".join(map(str, newll)) + "\n")
def get_signal(inputfile, output, vp, vm, dp, dm):
    """Append the summed signal of four bigWig tracks to each record.

    A fixed header line is written first.  For every input record whose
    chromosome is listed in the chromosome tree of ``vp``, the window
    ``[start - 50, end + 50)`` is summarized as a single bin in ``vp``,
    ``vm``, ``dp`` and ``dm`` (in that order) and the four sums are
    appended to the record.  Input lines starting with ``"chrom"`` are
    treated as a header and skipped.

    The window start is clamped at 0 so records within 50 bases of the
    chromosome start do not request a negative coordinate.

    Args:
        inputfile: Path of the whitespace-separated input file.
        output: Path of the tab-separated output file (overwritten).
        vp, vm, dp, dm: Paths of the four bigWig files, in output order.
    """
    known_chroms = set(
        node['key'] for node in BwIO(vp).chromosomeTree['nodes'])
    colnames = [
        "chrom", "start", "end", "seq", "motifscore", "strand",
        "LncapARsignal", "LncapDNaseCutsite", "LncapDNaseFrag",
        "K562DNaseFrag", "LncapFP", "K562FP", "overARpeak", "VehPlus",
        "VehMinus", "DHTPlus", "DHTMinus"
    ]

    with open(vp, 'rb') as vp_fh, \
            open(vm, 'rb') as vm_fh, \
            open(dp, 'rb') as dp_fh, \
            open(dm, 'rb') as dm_fh, \
            open(inputfile) as inf, \
            open(output, 'w') as outf:
        # Order matters: it fixes the order of the appended columns.
        tracks = [BigWigFile(fh) for fh in (vp_fh, vm_fh, dp_fh, dm_fh)]
        outf.write("\t".join(colnames) + "\n")
        for line in inf:
            if line.startswith("chrom"):
                continue
            ll = line.split()
            chrom = ll[0]
            if chrom not in known_chroms:
                continue
            win_start = max(int(ll[1]) - 50, 0)
            win_end = int(ll[2]) + 50
            for track in tracks:
                signal = track.summarize(chrom, win_start, win_end, 1)
                # One bin was requested, so take its single value explicitly.
                ll.append(str(float(signal.sum_data[0])))
            outf.write("\t".join(ll) + "\n")
def summary(bwfile, bedfile, topnumber, out):
    """Write per-base signal for the ``topnumber`` highest-mean BED regions.

    First pass: every BED record whose chromosome is listed in the
    chromosome tree of ``bwfile`` is summarized as a single bin and its
    mean (``sum_data / valid_count``, or 0 when no base is covered) is
    recorded.  Column 4 of each record is replaced by ``"-"``.

    Second pass: the records are sorted by descending mean and, for the
    first ``topnumber`` of them (or all, when fewer are available), the
    region is summarized with one bin per base and the comma-joined values
    are appended as a final column.

    Elapsed times of both passes are printed to standard output.

    Args:
        bwfile: Path of the bigWig file.
        bedfile: Path of the whitespace-separated BED file (>= 4 columns).
        topnumber: Maximum number of regions to write.
        out: Path of the tab-separated output file (overwritten).
    """
    known_chroms = set(
        node['key'] for node in BwIO(bwfile).chromosomeTree['nodes'])
    total_result = []

    with open(bwfile, 'rb') as bw_fh:
        bwHandle = BigWigFile(bw_fh)
        t = time.time()
        with open(bedfile) as inf:
            for line in inf:
                ll = line.split()
                ll[3] = "-"
                if ll[0] not in known_chroms:
                    continue
                region = bwHandle.summarize(ll[0], int(ll[1]), int(ll[2]), 1)
                if region.valid_count == 0:
                    mean_value = 0
                else:
                    mean_value = (region.sum_data / region.valid_count)[0]
                total_result.append(ll + [mean_value])
        total_result.sort(reverse=True, key=lambda rec: rec[-1])

        with open(out, 'w') as outf:
            # Single-argument print() emits the same text under Python 2
            # as the original `print "scaning 1st ", elapsed`.
            print("scaning 1st  %s" % (time.time() - t))
            t = time.time()
            # min() prevents an IndexError when fewer regions qualified
            # than were requested.
            for i in range(min(len(total_result), topnumber)):
                ll = total_result[i]
                start = int(ll[1])
                end = int(ll[2])
                region = bwHandle.summarize(ll[0], start, end, end - start)
                additional_value = ",".join(
                    str(v) for v in region.sum_data)
                result = [str(v) for v in ll + [additional_value]]
                outf.write("\t".join(result) + "\n")
    print("scaning 2nd  %s" % (time.time() - t))
def count_cut_nmers(fp, w_plus, w_minus, lflank, rflank, single_nmer_cutoff,
                    sequence):
    """Count n-mer occurrences and the cuts associated with each n-mer.

    For every region read from ``fp`` the sequence
    ``[start - lflank, end + rflank)`` is fetched in upper case and both
    bigWig files are summarized with one bin per base.  For position ``k``
    of the region:

    * the plus-strand n-mer is ``seq[k : k + lflank + rflank]``;
    * the minus-strand n-mer is the same window shifted right by one base
      and passed through ``rev`` (defined elsewhere in this file).

    An n-mer is counted only if it contains no ``N`` and the cut count at
    that position does not exceed ``single_nmer_cutoff``.

    Args:
        fp: Open file-like object yielding whitespace-separated BED lines.
        w_plus: Path of the plus-strand bigWig file.
        w_minus: Path of the minus-strand bigWig file.
        lflank: Number of bases taken left of each position.
        rflank: Number of bases taken right of each position.
        single_nmer_cutoff: Positions with more cuts than this are ignored.
        sequence: Path of the 2bit genome sequence file.

    Returns:
        tuple: ``(seq_nmer_dict, cut_nmer_dict)`` mapping each n-mer to its
        number of occurrences and to its summed cut count respectively.
    """
    nmer_len = lflank + rflank
    seq_nmer_dict = {}
    cut_nmer_dict = {}

    def _tally(nmer, cuts):
        # dict.get replaces the original bare try/except increments.
        cut_nmer_dict[nmer] = cut_nmer_dict.get(nmer, 0) + cuts
        seq_nmer_dict[nmer] = seq_nmer_dict.get(nmer, 0) + 1

    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        genome = twobitreader.TwoBitFile(sequence)
        for line in fp:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            seq = genome[chrm][(start - lflank):(end + rflank)].upper()
            cp = list(
                w_plus_H.summarize(chrm, start, end, end - start).sum_data)
            cn = list(
                w_minus_H.summarize(chrm, start, end, end - start).sum_data)
            for k, p_cut in enumerate(cp):
                n_cut = cn[k]
                p_seq = seq[k:(k + nmer_len)]
                n_seq = seq[(k + 1):(k + nmer_len + 1)]
                if 'N' not in p_seq and p_cut <= single_nmer_cutoff:
                    _tally(p_seq, p_cut)
                if 'N' not in n_seq and n_cut <= single_nmer_cutoff:
                    _tally(rev(n_seq), n_cut)
    return seq_nmer_dict, cut_nmer_dict
def sitepro_scan(peak, outname, w_plus, w_minus, Cspan):
    """Write per-base cut proportions for each peak as two bedGraph files.

    For every base of a peak, the value written is
    ``(cuts + 1) / sum(cuts + 1)`` where the sum runs over the
    ``2 * Cspan`` bases ``[pos - Cspan, pos + Cspan)``, rounded to four
    decimals.  Plus-strand values go to ``<outname>_propcutPlus.bdg`` and
    minus-strand values to ``<outname>_propcutMinus.bdg``.

    Peaks with ``start - Cspan < 0`` are printed to standard output and
    skipped.  When a bigWig summary is unavailable the strand is treated
    as having zero cuts everywhere.

    Args:
        peak: Path of the whitespace-separated peak (BED) file.
        outname: Prefix of the two output files.
        w_plus: Path of the plus-strand bigWig file.
        w_minus: Path of the minus-strand bigWig file.
        Cspan: Half-width, in bases, of the normalisation window.
    """
    roundN = 4
    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outname + "_propcutPlus.bdg", 'w') as outf_propPlus, \
            open(outname + "_propcutMinus.bdg", 'w') as outf_propMinus:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            if start - Cspan < 0:
                print(ll)
                continue
            width = end - start + 2 * Cspan
            plus_obj = w_plus_H.summarize(chrm, start - Cspan, end + Cspan,
                                          width)
            minus_obj = w_minus_H.summarize(chrm, start - Cspan, end + Cspan,
                                            width)
            # The fallback vector must be float: the original int array made
            # the proportion floor to 0 under Python 2 integer division.
            if not plus_obj:
                plus_vector = numpy.zeros(width) + 1
            else:
                plus_vector = plus_obj.sum_data + 1
            if not minus_obj:
                minus_vector = numpy.zeros(width) + 1
            else:
                minus_vector = minus_obj.sum_data + 1

            for outpos in range(Cspan, end - start + Cspan):
                lo = outpos - Cspan
                hi = outpos + Cspan
                this_plus_cuts_prop = round(
                    plus_vector[outpos] / sum(plus_vector[lo:hi]), roundN)
                this_minus_cuts_prop = round(
                    minus_vector[outpos] / sum(minus_vector[lo:hi]), roundN)
                out_start = start + outpos - Cspan
                out_end = out_start + 1
                outf_propPlus.write("\t".join(map(
                    str, [chrm, out_start, out_end, this_plus_cuts_prop]))
                    + "\n")
                outf_propMinus.write("\t".join(map(
                    str, [chrm, out_start, out_end, this_minus_cuts_prop]))
                    + "\n")
def sitepro_scan(peak, outp, outn, w_plus, w_minus, bgmatrix, span, gen,
                 lflank, rflank):
    """Write observed and bias-expected cuts per base for both strands.

    For every base of every peak one line is written to ``outp`` (plus
    strand) and one to ``outn`` (minus strand) with four tab-separated
    values: the observed cut count at the base, the total cuts in the
    ``2 * span`` window starting ``span`` bases upstream, the background
    bias of the n-mer at the base, and the expected cuts
    ``bias / sum(window biases) * window total``.

    Peaks are skipped when ``start - span - lflank <= 0`` or when the
    fetched sequence contains an ``N``.

    Args:
        peak: Path of the whitespace-separated peak (BED) file.
        outp: Path of the plus-strand output file (overwritten).
        outn: Path of the minus-strand output file (overwritten).
        w_plus: Path of the plus-strand bigWig file.
        w_minus: Path of the minus-strand bigWig file.
        bgmatrix: Background matrix file handed to ``readBG``.
        span: Half-width, in bases, of the normalisation window.
        gen: Path of the 2bit genome sequence file.
        lflank: Number of bases left of the cut forming the n-mer.
        rflank: Number of bases right of the cut forming the n-mer.
    """
    nmer = lflank + rflank
    genome = twobitreader.TwoBitFile(gen)
    pBG, nBG = readBG(bgmatrix)

    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outp, 'w') as outfp, \
            open(outn, 'w') as outfn:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            # Drop peaks whose flanked window would leave the chromosome.
            if start - span - lflank <= 0:
                continue
            width = end - start + 2 * span
            p_sum = list(w_plus_H.summarize(
                chrm, start - span, end + span, width).sum_data)
            n_sum = list(w_minus_H.summarize(
                chrm, start - span, end + span, width).sum_data)

            seq = genome[chrm][(start - span - lflank):
                               (end + span + rflank)]
            if 'N' in seq.upper():
                continue
            # The minus-strand n-mers are read one base to the right.
            pseq = seq[:-1]
            nseq = seq[1:]
            p = []
            n = []
            for k in range(len(pseq) + 1 - nmer):
                p.append(pBG[pseq[k:(k + nmer)].upper()])
                n.append(nBG[nseq[k:(k + nmer)].upper()])

            for bp in range(len(p_sum) - 2 * span):
                win = slice(bp, bp + 2 * span)
                ptotal = sum(p_sum[win])
                ntotal = sum(n_sum[win])
                pc = int(p_sum[bp + span])
                nc = int(n_sum[bp + span])
                pbias = p[bp + span]
                nbias = n[bp + span]
                paraw = (pbias / sum(p[win])) * ptotal
                naraw = (nbias / sum(n[win])) * ntotal
                outfp.write(
                    "\t".join(map(str, [pc, ptotal, pbias, paraw])) + "\n")
                # Fixed: the minus-strand file used to receive the
                # plus-strand expectation (paraw).
                outfn.write(
                    "\t".join(map(str, [nc, ntotal, nbias, naraw])) + "\n")
def get_signal(inputfile, output, Pbw, Nbw, score_range):
    """Append total signal and two strand-imbalance scores to each record.

    For each record whose chromosome is listed in the chromosome tree of
    ``Pbw``, 200 bins covering ``[max(start - 100, 0), start + 100)`` are
    read from both bigWig files.  With ``up`` the ``score_range`` bins
    before bin 100 and ``down`` the ``score_range`` bins after the motif
    (bin ``100 + motif_len`` onwards), four sums are formed relative to the
    record strand (column 6) and combined into::

        FPscore1 = log2((up_same + 0.2) * (down_diff + 0.2)
                        / ((up_diff + 0.2) * (down_same + 0.2)))
        FPscore2 = sqrt(up_same) + sqrt(down_diff)
                   - sqrt(up_diff) - sqrt(down_same)

    The total of all 400 bins, ``FPscore1`` and ``FPscore2`` are appended
    to the record.  A strand other than ``+`` or ``-`` prints the line and
    terminates the process with exit status 1.

    Args:
        inputfile: Path of the whitespace-separated input file.
        output: Path of the tab-separated output file (overwritten).
        Pbw: Path of the plus-strand bigWig file.
        Nbw: Path of the minus-strand bigWig file.
        score_range: Number of bins summed on each side of the motif.
    """
    persudo = 0.2  # pseudocount keeping the log ratio finite
    known_chroms = set(
        node['key'] for node in BwIO(Pbw).chromosomeTree['nodes'])

    with open(Pbw, 'rb') as p_fh, \
            open(Nbw, 'rb') as n_fh, \
            open(inputfile) as inf, \
            open(output, 'w') as outf:
        PH = BigWigFile(p_fh)
        NH = BigWigFile(n_fh)
        for line in inf:
            ll = line.split()
            if ll[0] not in known_chroms:
                continue
            motif_len = int(ll[2]) - int(ll[1])
            # NOTE(review): when start < 100 the clamped window is shorter
            # than 200 bases, so bins no longer map 1:1 to bases -- confirm.
            win_start = max(int(ll[1]) - 100, 0)
            win_end = int(ll[1]) + 100
            Psignal = list(
                PH.summarize(ll[0], win_start, win_end, 200).sum_data)
            Nsignal = list(
                NH.summarize(ll[0], win_start, win_end, 200).sum_data)
            DNase = sum(Psignal) + sum(Nsignal)

            before = slice(100 - score_range, 100)
            after = slice(100 + motif_len, 100 + motif_len + score_range)
            if ll[5] == '+':
                S_up_same = sum(Psignal[before])
                S_up_diff = sum(Nsignal[before])
                S_down_same = sum(Psignal[after])
                S_down_diff = sum(Nsignal[after])
            elif ll[5] == '-':
                S_up_same = sum(Nsignal[after])
                S_up_diff = sum(Psignal[after])
                S_down_same = sum(Nsignal[before])
                S_down_diff = sum(Psignal[before])
            else:
                print(line)
                # SystemExit still unwinds the with-block, closing files.
                sys.exit(1)

            FPscore1 = math.log(
                (S_up_same + persudo) * (S_down_diff + persudo) /
                ((S_up_diff + persudo) * (S_down_same + persudo)), 2)
            FPscore2 = (math.sqrt(S_up_same) + math.sqrt(S_down_diff) -
                        math.sqrt(S_up_diff) - math.sqrt(S_down_same))
            ll.extend([DNase, FPscore1, FPscore2])
            outf.write("\t".join(map(str, ll)) + "\n")
def summary(bwfile1, bwfile2, bwfile_add, bedfile, topnumber, out):
    """Write per-base signal of several tracks for the top BED regions.

    First pass: every BED record whose chromosome is listed in the
    chromosome trees of both ``bwfile1`` and ``bwfile2`` is scored with the
    sum of its mean signal in the two files (a file with no covered base
    contributes 0).  Column 4 of each record is replaced by ``"-"``.

    Second pass: records are sorted by descending score and, for at most
    ``topnumber`` of them, the region is summarized with one bin per base
    in ``bwfile1``, ``bwfile2`` and every file of ``bwfile_add``; each
    track is appended as one comma-joined column.

    Elapsed times of both passes are printed to standard output.

    Args:
        bwfile1: Path of the first bigWig file.
        bwfile2: Path of the second bigWig file.
        bwfile_add: Iterable of paths of additional bigWig files.
        bedfile: Path of the whitespace-separated BED file (>= 4 columns).
        topnumber: Maximum number of regions to write.
        out: Path of the tab-separated output file (overwritten).
    """
    chroms1 = set(
        node['key'] for node in BwIO(bwfile1).chromosomeTree['nodes'])
    chroms2 = set(
        node['key'] for node in BwIO(bwfile2).chromosomeTree['nodes'])

    def _region_mean(bw_handle, rec):
        region = bw_handle.summarize(rec[0], int(rec[1]), int(rec[2]), 1)
        if region.valid_count == 0:
            return 0
        return (region.sum_data / region.valid_count)[0]

    total_result = []
    extra_fhs = []
    try:
        with open(bwfile1, 'rb') as fh1, open(bwfile2, 'rb') as fh2:
            bwHandle1 = BigWigFile(fh1)
            bwHandle2 = BigWigFile(fh2)
            t = time.time()
            with open(bedfile) as inf:
                for line in inf:
                    ll = line.split()
                    ll[3] = "-"
                    if ll[0] in chroms1 and ll[0] in chroms2:
                        score = (_region_mean(bwHandle1, ll) +
                                 _region_mean(bwHandle2, ll))
                        total_result.append(ll + [score])
            total_result.sort(reverse=True, key=lambda rec: rec[-1])

            for path in bwfile_add:
                extra_fhs.append(open(path, 'rb'))
            tracks = [bwHandle1, bwHandle2]
            tracks.extend(BigWigFile(fh) for fh in extra_fhs)

            with open(out, 'w') as outf:
                # Single-argument print() emits the same text under
                # Python 2 as the original two-item print statement.
                print("scaning 1st  %s" % (time.time() - t))
                t = time.time()
                for i in range(min(len(total_result), topnumber)):
                    ll = total_result[i]
                    start = int(ll[1])
                    end = int(ll[2])
                    result = [str(v) for v in ll]
                    for track in tracks:
                        region = track.summarize(ll[0], start, end,
                                                 end - start)
                        result.append(
                            ",".join(str(v) for v in region.sum_data))
                    outf.write("\t".join(result) + "\n")
    finally:
        # The additional files are opened dynamically, so close them here.
        for fh in extra_fhs:
            fh.close()
    print("scaning 2nd  %s" % (time.time() - t))
def sitepro_scan(peak, outname, w_plus, w_minus, Cspan):
    """Write per-base cuts and window cut totals to ``<outname>_Cuts.txt``.

    For every base of every peak one tab-separated line is written with
    ``chrom:pos-(pos+1)``, the plus-strand cuts at the base, the
    plus-strand cuts summed over ``[pos - Cspan, pos + Cspan)``, and the
    same two values for the minus strand.

    Peaks with ``start - Cspan < 0`` are printed to standard output and
    skipped.  When a bigWig summary is unavailable the strand is treated
    as having zero cuts everywhere.

    Args:
        peak: Path of the whitespace-separated peak (BED) file.
        outname: Prefix of the output file.
        w_plus: Path of the plus-strand bigWig file.
        w_minus: Path of the minus-strand bigWig file.
        Cspan: Half-width, in bases, of the summation window.
    """
    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outname + "_Cuts.txt", 'w') as outf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            if start - Cspan < 0:
                print(ll)
                continue
            width = end - start + 2 * Cspan
            plus_obj = w_plus_H.summarize(chrm, start - Cspan, end + Cspan,
                                          width)
            minus_obj = w_minus_H.summarize(chrm, start - Cspan, end + Cspan,
                                            width)
            if not plus_obj:
                plus_vector = numpy.array([0] * width)
            else:
                plus_vector = plus_obj.sum_data
            if not minus_obj:
                minus_vector = numpy.array([0] * width)
            else:
                minus_vector = minus_obj.sum_data

            for outpos in range(Cspan, end - start + Cspan):
                lo = outpos - Cspan
                hi = outpos + Cspan
                out_start = start + outpos - Cspan
                out_end = out_start + 1
                fields = [
                    chrm + ":" + str(out_start) + "-" + str(out_end),
                    plus_vector[outpos],
                    sum(plus_vector[lo:hi]),
                    minus_vector[outpos],
                    sum(minus_vector[lo:hi]),
                ]
                outf.write("\t".join(map(str, fields)) + "\n")
def make_template(data, flank, pflank, topmotif, out, pbw, mbw):
    """Build and plot an average strand-specific cut profile around motifs.

    The motif length ``ml`` is taken from the first record of ``data``.
    Records are sorted by descending score (column 5).  For each record the
    ``2 * flank`` bases around its start are read from both bigWig files
    with one bin per base, and a ``2 * pflank`` slice is collected:

    * ``+`` records contribute the plus track to the "same strand" profile
      and the minus track to the "opposite strand" profile;
    * ``-`` records contribute the reversed minus track to the "same
      strand" profile and the reversed plus track to the other one.

    Both collections are averaged with ``apply_mean`` and handed to
    ``plot_template`` together with ``out``.

    Args:
        data: Path of the whitespace-separated motif file (>= 6 columns).
        flank: Half-width, in bases, of the window read from the bigWigs.
        pflank: Half-width, in bases, of the slice kept for the template.
        topmotif: Unused; the top-N cutoff is disabled in the code.
        out: Output target handed to ``plot_template``.
        pbw: Path of the plus-strand bigWig file.
        mbw: Path of the minus-strand bigWig file.
    """
    with open(data) as inf:
        templatelist = [line.split() for line in inf]
    first = templatelist[0]
    ml = int(first[2]) - int(first[1])
    templatelist.sort(key=lambda rec: float(rec[4]), reverse=True)

    # Slice bounds are the same for every record, so compute them once.
    fwd_mid = flank + 1 + ml // 2
    rev_mid = fwd_mid - ml
    fwd = slice(fwd_mid - pflank, fwd_mid + pflank)
    rev = slice(rev_mid - pflank, rev_mid + pflank)

    pp = []
    pm = []
    with open(pbw, 'rb') as plus_fh, open(mbw, 'rb') as minus_fh:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for ll in templatelist:
            start = int(ll[1])
            p_sum = list(w_plus_H.summarize(
                ll[0], start - flank, start + flank, 2 * flank).sum_data)
            m_sum = list(w_minus_H.summarize(
                ll[0], start - flank, start + flank, 2 * flank).sum_data)
            if ll[5] == "+":
                pp.append(p_sum[fwd])
                pm.append(m_sum[fwd])
            if ll[5] == '-':
                pm.append(p_sum[::-1][rev])
                pp.append(m_sum[::-1][rev])

    meanp = apply_mean(pp)
    meanm = apply_mean(pm)
    P = [meanp[i] for i in range(len(meanp))]
    M = [meanm[i] for i in range(len(meanp))]
    plot_template(P, M, out)
def get_signal(inputfile, output, plusBW, minusBW, bwfolder, extend):
    """Append strand-oriented per-base signal to each input record.

    Each record is extended by ``extend`` bases on both sides (start
    clamped at 0) and summarized with one bin per base in both bigWig
    files.  For records on the ``-`` strand (column 6) the appended values
    are the reversed minus track followed by the reversed plus track;
    otherwise the plus track followed by the minus track.

    Records whose chromosome name contains ``"_"`` are skipped.  When a
    summary fails or is unavailable the record is written with zeros in
    place of the signal, so every output row keeps its column count.
    (Previously such rows reused the preceding record's values, or raised
    ``NameError`` when it happened on the first record.)

    Args:
        inputfile: Path of the whitespace-separated input file.
        output: Path of the tab-separated output file (overwritten).
        plusBW: File name of the plus-strand bigWig inside ``bwfolder``.
        minusBW: File name of the minus-strand bigWig inside ``bwfolder``.
        bwfolder: Directory of the bigWig files; empty means ``"./"``.
        extend: Number of bases added on each side of every record.
    """
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    with open(bwfolder + plusBW, 'rb') as plus_fh, \
            open(bwfolder + minusBW, 'rb') as minus_fh, \
            open(inputfile) as inf, \
            open(output, 'w') as outf:
        plus = BigWigFile(plus_fh)
        minus = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            if "_" in ll[0]:
                continue
            reverse_strand = len(ll) >= 6 and ll[5] == "-"
            S = max(0, int(ll[1]) - extend)
            E = int(ll[2]) + extend

            # Default used when no signal can be read for this record.
            thisdata = [0] * (2 * (E - S))
            try:
                plus_signal = plus.summarize(ll[0], S, E, E - S)
                minus_signal = minus.summarize(ll[0], S, E, E - S)
                if plus_signal and minus_signal:
                    plus_tmp = list(plus_signal.sum_data)
                    minus_tmp = list(minus_signal.sum_data)
                    if reverse_strand:
                        thisdata = minus_tmp[::-1] + plus_tmp[::-1]
                    else:
                        thisdata = plus_tmp + minus_tmp
            except Exception:
                # Best effort, as before: keep the zero-filled default.
                pass
            outf.write("\t".join(map(str, ll + thisdata)) + "\n")
def summarize(self, interval, bins=None, method='summarize', function='mean'):
    """Return the bigWig signal over ``interval`` as a NumPy array.

    Args:
        interval: Object exposing ``chrom``, ``start`` and ``stop``.
        bins: Number of bins; ``None`` returns one value per base.
        method: ``'get_as_array'`` (also used when ``bins`` is None),
            ``'ucsc_summarize'`` (delegates to ``self.ucsc_summarize``) or
            ``'summarize'``.
        function: Statistic for the binned methods.  ``'summarize'``
            handles ``sum``, ``mean``, ``min``, ``max`` and ``std``;
            ``'ucsc_summarize'`` accepts ``mean``, ``min``, ``max``,
            ``std`` and ``coverage``.

    Returns:
        An array of values with NaN (or infinite min/max) entries replaced
        by 0; an all-zero array when the file has no data for the interval.

    Raises:
        ValueError: If ``method`` is ``'ucsc_summarize'`` and ``function``
            is not one it accepts.
    """
    # Dividing by a zero valid_count is expected here, so silence NumPy's
    # "invalid" warning.  errstate restores the caller's settings on every
    # exit path; the previous manual reset wrote the saved value to the
    # "divide" key and was skipped by the early return and the raise.
    with np.errstate(invalid='ignore'):
        if (bins is None) or (method == 'get_as_array'):
            # bigWig is binary: open in 'rb' and close the handle again.
            with open(self.fn, 'rb') as fh:
                s = BigWigFile(fh).get_as_array(
                    interval.chrom, interval.start, interval.stop)
            if s is None:
                s = np.zeros((interval.stop - interval.start,))
            else:
                s[np.isnan(s)] = 0

        elif method == 'ucsc_summarize':
            if function in ['mean', 'min', 'max', 'std', 'coverage']:
                return self.ucsc_summarize(interval, bins, function=function)
            # Fixed: the message was never interpolated and lacked a space.
            raise ValueError(
                'function "%s" not supported by UCSC\'s bigWigSummary'
                % function)

        else:
            with open(self.fn, 'rb') as fh:
                s = BigWigFile(fh).summarize(
                    interval.chrom, interval.start, interval.stop, bins)
            if s is None:
                s = np.zeros((bins,))
            else:
                if function == 'sum':
                    s = s.sum_data
                if function == 'mean':
                    s = s.sum_data / s.valid_count
                    s[np.isnan(s)] = 0
                if function == 'min':
                    s = s.min_val
                    s[np.isinf(s)] = 0
                if function == 'max':
                    s = s.max_val
                    s[np.isinf(s)] = 0
                if function == 'std':
                    # NOTE(review): this is the mean of squares, not a
                    # standard deviation -- kept as is, confirm intent.
                    s = (s.sum_squares / s.valid_count)
                    s[np.isnan(s)] = 0
    return s
def Readbw(bwfile, chrm, start, end, n):
    """Return binned mean signal of ``bwfile`` over ``chrm:start-end``.

    The region is summarized in ``(end - start) // n`` bins and each bin's
    summed signal is divided by its valid-base count after passing that
    count through ``sudocount`` (defined elsewhere in this file).

    Args:
        bwfile: Path of the bigWig file.
        chrm: Chromosome name.
        start: Region start (converted with ``int``).
        end: Region end (converted with ``int``).
        n: Bin width in bases.

    Returns:
        list: One score per bin.
    """
    start = int(start)
    end = int(end)
    with open(bwfile, 'rb') as bw_fh:
        region = BigWigFile(bw_fh).summarize(
            chrm, start, end, (end - start) // n)
    # A real list (not a lazy map) so the array division also works on
    # Python 3; the local no longer shadows the builtin ``sum``.
    counts = [sudocount(c) for c in region.valid_count]
    return list(region.sum_data / counts)
def sitepro_scan(peak, out, w_plus, w_minus, bgmatrix, span, gen, lflank,
                 rflank):
    """Append observed cuts, sequence bias and expected cuts to each peak.

    For every peak the window ``[start - span, end + span)`` is read from
    both bigWig files with one bin per base and the background bias of the
    n-mer at every position is looked up.  The output line holds, in order:
    the input columns, the plus and minus cuts inside the peak, the plus
    and minus biases inside the peak, and the plus and minus expected cuts
    ``bias / sum(window biases) * window cuts`` computed over the
    ``2 * span`` window starting ``span`` bases upstream of each base.

    Peaks are skipped when ``start - span - lflank <= 0`` or when the
    fetched sequence contains an ``N``.

    Args:
        peak: Path of the whitespace-separated peak (BED) file.
        out: Path of the tab-separated output file (overwritten).
        w_plus: Path of the plus-strand bigWig file.
        w_minus: Path of the minus-strand bigWig file.
        bgmatrix: Background matrix file handed to ``readBG``.
        span: Half-width, in bases, of the normalisation window.
        gen: Path of the 2bit genome sequence file.
        lflank: Number of bases left of the cut forming the n-mer.
        rflank: Number of bases right of the cut forming the n-mer.
    """
    nmer = lflank + rflank
    genome = twobitreader.TwoBitFile(gen)
    pBG, nBG = readBG(bgmatrix)

    # The input file and the bigWig handles used to be left open.
    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(out, 'w') as outf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            if start - span - lflank <= 0:
                continue
            width = end - start + 2 * span
            p_sum = list(w_plus_H.summarize(
                chrm, start - span, end + span, width).sum_data)
            n_sum = list(w_minus_H.summarize(
                chrm, start - span, end + span, width).sum_data)

            seq = genome[chrm][(start - span - lflank):
                               (end + span + rflank)]
            if 'N' in seq.upper():
                continue
            # The minus-strand n-mers are read one base to the right.
            pseq = seq[:-1]
            nseq = seq[1:]
            p = []
            n = []
            for k in range(len(pseq) + 1 - nmer):
                p.append(pBG[pseq[k:(k + nmer)].upper()])
                n.append(nBG[nseq[k:(k + nmer)].upper()])

            p_assign = []
            n_assign = []
            for bp in range(len(p_sum) - 2 * span):
                win = slice(bp, bp + 2 * span)
                ptotal = sum(p_sum[win])
                ntotal = sum(n_sum[win])
                pbias_per = p[bp + span] * 1.0 / sum(p[win])
                nbias_per = n[bp + span] * 1.0 / sum(n[win])
                p_assign.append(pbias_per * ptotal)
                n_assign.append(nbias_per * ntotal)

            newll = (ll +
                     p_sum[span:(len(p_sum) - span)] +
                     n_sum[span:(len(n_sum) - span)] +
                     p[span:(len(p) - span)] +
                     n[span:(len(n) - span)] +
                     p_assign + n_assign)
            outf.write("\t".join(map(str, newll)) + "\n")
def summarize(self, interval, bins=None, method='summarize', function='mean'):
    """Return the bigWig signal over ``interval`` as a NumPy array.

    Args:
        interval: Object exposing ``chrom``, ``start`` and ``stop``.
        bins: Number of bins; ``None`` returns one value per base.
        method: ``'get_as_array'`` (also used when ``bins`` is None),
            ``'ucsc_summarize'`` (delegates to ``self.ucsc_summarize``) or
            ``'summarize'``.
        function: Statistic for the binned methods.  ``'summarize'``
            handles ``sum``, ``mean``, ``min``, ``max`` and ``std``;
            ``'ucsc_summarize'`` accepts ``mean``, ``min``, ``max``,
            ``std`` and ``coverage``.

    Returns:
        An array of values with NaN (or infinite min/max) entries replaced
        by 0; an all-zero array when the file has no data for the interval.

    Raises:
        ValueError: If ``method`` is ``'ucsc_summarize'`` and ``function``
            is not one it accepts.
    """
    # 0 / 0 is expected when a bin has no valid bases, so NumPy's "invalid"
    # warning is suppressed for the duration of the call.  The context
    # manager puts the caller's settings back on return and on raise; the
    # earlier hand-written reset targeted the "divide" key by mistake.
    with np.errstate(invalid='ignore'):
        per_base = (bins is None) or (method == 'get_as_array')
        if per_base:
            # Binary mode plus a with-block: correct on every platform and
            # the descriptor is released immediately.
            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).get_as_array(
                    interval.chrom,
                    interval.start,
                    interval.stop,
                )
            if s is None:
                s = np.zeros((interval.stop - interval.start,))
            else:
                s[np.isnan(s)] = 0

        elif method == 'ucsc_summarize':
            if function not in ['mean', 'min', 'max', 'std', 'coverage']:
                # Fixed: interpolate the name and restore the missing space.
                raise ValueError(
                    'function "%s" not supported by UCSC\'s bigWigSummary'
                    % function)
            return self.ucsc_summarize(interval, bins, function=function)

        else:
            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).summarize(
                    interval.chrom,
                    interval.start,
                    interval.stop,
                    bins)
            if s is None:
                s = np.zeros((bins,))
            else:
                if function == 'sum':
                    s = s.sum_data
                if function == 'mean':
                    s = s.sum_data / s.valid_count
                    s[np.isnan(s)] = 0
                if function == 'min':
                    s = s.min_val
                    s[np.isinf(s)] = 0
                if function == 'max':
                    s = s.max_val
                    s[np.isinf(s)] = 0
                if function == 'std':
                    # NOTE(review): sum_squares / valid_count is the mean
                    # square rather than a standard deviation -- confirm.
                    s = (s.sum_squares / s.valid_count)
                    s[np.isnan(s)] = 0
    return s
def sitepro_scan(peak, out, w_plus, w_minus):
    """Append per-base plus and minus strand signal to each peak record.

    Every region ``[start, end)`` is summarized with one bin per base in
    both bigWig files; the plus-strand values followed by the minus-strand
    values are appended to the input columns.

    Args:
        peak: Path of the whitespace-separated peak (BED) file.
        out: Path of the tab-separated output file (overwritten).
        w_plus: Path of the plus-strand bigWig file.
        w_minus: Path of the minus-strand bigWig file.
    """
    # The input file and the bigWig handles used to be left open.
    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(out, 'w') as outf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            start = int(ll[1])
            end = int(ll[2])
            p_sum = list(
                w_plus_H.summarize(ll[0], start, end, end - start).sum_data)
            m_sum = list(
                w_minus_H.summarize(ll[0], start, end, end - start).sum_data)
            outf.write("\t".join(map(str, ll + p_sum + m_sum)) + "\n")
def getsignal(inputfile, outputfile, ATAC100, ATAC247, ATACall, DNase, pspan):
    """Append the mean signal of four bigWig tracks around each region.

    For every record (records on ``chrY`` are skipped) a window of
    ``2 * pspan`` bases centred on ``(start + end) // 2`` is summarized as
    a single bin in each track, and the sum divided by ``2 * pspan`` is
    appended in the order ``ATAC100``, ``ATAC247``, ``ATACall``, ``DNase``.

    Args:
        inputfile: Path of the whitespace-separated input file.
        outputfile: Path of the tab-separated output file (overwritten).
        ATAC100, ATAC247, ATACall, DNase: Paths of the four bigWig files.
        pspan: Half-width, in bases, of the window around the centre.
    """
    with open(ATAC100, 'rb') as fh100, \
            open(ATAC247, 'rb') as fh247, \
            open(ATACall, 'rb') as fhall, \
            open(DNase, 'rb') as fhdnase, \
            open(inputfile) as inf, \
            open(outputfile, 'w') as outf:
        # Order matters: it fixes the order of the appended columns.
        tracks = [BigWigFile(fh) for fh in (fh100, fh247, fhall, fhdnase)]
        window = 2 * pspan
        for line in inf:
            ll = line.split()
            if ll[0] == 'chrY':
                continue
            center = (int(ll[1]) + int(ll[2])) // 2
            signals = []
            for track in tracks:
                region = track.summarize(ll[0], center - pspan,
                                         center + pspan, 1)
                # One bin was requested, so take its single value explicitly.
                signals.append(float(region.sum_data[0]) / window)
            outf.write("\t".join(map(str, ll + signals)) + "\n")
def fragment_v_predcut(pefrag, output, bwp, bwn):
    """Pair each fragment's two cut positions with their bigWig values.

    For every fragment the plus cut is column 2 and the minus cut is
    column 3 minus one.  The value of ``bwp`` at the plus cut and of
    ``bwn`` at the minus cut are read as single-base summaries, and the
    line ``chrom, pcut, ncut, fragment length, plus value, minus value`` is
    written.  Fragments with either value equal to ``-1`` are dropped.

    Args:
        pefrag: Path of the whitespace-separated fragment file.
        output: Path of the tab-separated output file (overwritten).
        bwp: Path of the plus-strand bigWig file.
        bwn: Path of the minus-strand bigWig file.
    """
    with open(pefrag) as inf, \
            open(output, 'w') as outf, \
            open(bwp, 'rb') as plus_fh, \
            open(bwn, 'rb') as minus_fh:
        pH = BigWigFile(plus_fh)
        mH = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            pcut = int(ll[1])
            ncut = int(ll[2]) - 1
            fraglen = ncut - pcut + 1
            # One bin was requested, so take its single value explicitly.
            pPred = float(pH.summarize(chrm, pcut, pcut + 1, 1).sum_data[0])
            nPred = float(mH.summarize(chrm, ncut, ncut + 1, 1).sum_data[0])
            # NOTE(review): -1 presumably marks "no prediction" in the
            # bigWig tracks -- confirm against how they were produced.
            if pPred == -1 or nPred == -1:
                continue
            newll = [chrm, pcut, ncut, fraglen, pPred, nPred]
            outf.write("\t".join(map(str, newll)) + "\n")
def getsignal(inputfile, outputfile, pcut, pspan, FPregion):
    """Annotate each record with cut statistics and footprint-track range.

    The motif length ``ml`` is taken from the first record and reused for
    every record.  For each record a window of ``2 * pspan`` bases centred
    on ``start + ml // 2`` is read from ``pcut`` with one bin per base and
    these values are appended:

    * ``TC``  -- sum of all bins in the window;
    * ``FOS`` -- ``-((C + 1) / (R + 1) + (C + 1) / (L + 1))`` where ``C``
      is the sum over the ``ml`` central bins and ``L`` / ``R`` are the
      sums over the ``ml`` bins immediately left / right of them;
    * the minimum and maximum per-base value of ``FPregion`` over
      ``[start, end)``; zeros are used when that track cannot be read.

    Args:
        inputfile: Path of the whitespace-separated input file.
        outputfile: Path of the tab-separated output file (overwritten).
        pcut: Path of the cut-count bigWig file.
        pspan: Half-width, in bases, of the window around the centre.
        FPregion: Path of the second bigWig file.
    """
    with open(pcut, 'rb') as pcut_fh, \
            open(FPregion, 'rb') as fp_fh, \
            open(inputfile) as inf, \
            open(outputfile, 'w') as outf:
        pcutbw = BigWigFile(pcut_fh)
        FPbw = BigWigFile(fp_fh)

        first = inf.readline().split()
        if not first:
            # Empty input: leave an empty output instead of raising.
            return
        ml = int(first[2]) - int(first[1])
        half = ml // 2
        inf.seek(0)

        # Offset of the motif start inside the 2 * pspan window.
        motif_off = pspan - half
        for line in inf:
            ll = line.split()
            start = int(ll[1])
            end = int(ll[2])
            center = start + half
            cut = list(pcutbw.summarize(
                ll[0], center - pspan, center + pspan, 2 * pspan).sum_data)
            TC = sum(cut)
            C = sum(cut[motif_off:(motif_off + ml)])
            L = sum(cut[(motif_off - ml):motif_off])
            R = sum(cut[(motif_off + ml):(motif_off + 2 * ml)])
            FOS = -1 * ((C + 1) / (R + 1) + (C + 1) / (L + 1))

            try:
                # A real list: min() and max() both need to traverse it.
                FP_bw = [float(v) for v in FPbw.summarize(
                    ll[0], start, end, end - start).sum_data]
            except Exception:
                # Best effort, as before: fall back to an all-zero track.
                FP_bw = [0.0] * (end - start)
            newll = ll + [TC, FOS, min(FP_bw), max(FP_bw)]
            outf.write("\t".join(map(str, newll)) + "\n")
def extract_phastcons(bedfile, phas_chrnames, width, pf_res):
    """Return the average binned score profile around BED region centres.

    Each region is replaced by a window of ``width`` bases centred on its
    midpoint (shifted to ``[0, width)`` when it would start before 0) and
    summarized in ``width // pf_res`` bins from ``<chrom>.bw`` in the
    current working directory.  Bin means are averaged across regions
    position by position, ignoring NaN values.

    Args:
        bedfile: Path of the BED file handed to ``parse_BED``.
        phas_chrnames: Chromosome names to process; only those present in
            the BED file are used, in this order.
        width: Window width in bases.
        pf_res: Bin width in bases.

    Returns:
        list: One average per bin; all zeros when some bin has no non-NaN
        value; empty when no region could be summarized.
    """
    info("read bed file...")
    nbins = width // pf_res
    sumscores = []
    # Keep the BED handle open until the end: whether parse_BED reads the
    # file eagerly is not visible from here.
    with open(bedfile) as bfhd:
        bed = parse_BED(bfhd)
        available = set(bed.peaks.keys())
        chrs = [c for c in phas_chrnames if c in available]

        for chrom in chrs:
            info("processing chromosome: %s" % chrom)
            pchrom = bed.peaks[chrom]
            # One bigWig per chromosome; close it before opening the next.
            with open(chrom + '.bw', 'rb') as bw_fh:
                bw = BigWigFile(bw_fh)
                for peak in pchrom:
                    mid = int((peak[0] + peak[1]) / 2)
                    left = int(mid - width // 2)
                    right = int(mid + width // 2)
                    if left < 0:
                        left = 0
                        right = width
                    region = bw.summarize(chrom, left, right, nbins)
                    if not region:
                        continue
                    sumscores.append(region.sum_data / region.valid_count)

    # Transpose: one list per bin holding that bin's score in every region.
    per_bin = [list(column) for column in zip(*sumscores)]
    per_bin = [[v for v in column if not math.isnan(v)]
               for column in per_bin]
    try:
        conscores = [sum(column) / len(column) for column in per_bin]
    except ZeroDivisionError:
        conscores = [0] * nbins
    return conscores
class TestBigWig(unittest.TestCase):
    """Checks ``BigWigFile`` queries against ``test_data/bbi_tests/test.bw``."""

    # Expected mean of each of the 10 bins over chr1:10000-20000.
    EXPECTED_MEANS = [
        -0.17557571594973645, -0.054009292602539061, -0.056892242431640622,
        -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557,
        0.036949024200439454, 0.076638259887695306, 0.043518108367919923,
        0.01554749584197998
    ]

    def setUp(self):
        # bigWig is a binary format; register the close so the handle is
        # released after every test instead of leaking.
        f = open("test_data/bbi_tests/test.bw", "rb")
        self.addCleanup(f.close)
        self.bw = BigWigFile(file=f)

    def test_get_summary(self):
        data = self.bw.query("chr1", 10000, 20000, 10)
        means = [x['mean'] for x in data]
        print(means)
        assert numpy.allclose([float(m) for m in means], self.EXPECTED_MEANS)

        # Summarize variant
        sd = self.bw.summarize("chr1", 10000, 20000, 10)
        assert numpy.allclose(sd.sum_data / sd.valid_count,
                              self.EXPECTED_MEANS)

        # Test min and max for this entire summary region
        data = self.bw.query("chr1", 10000, 20000, 1)
        maxs = [x['max'] for x in data]
        mins = [x['min'] for x in data]
        self.assertEqual([float(v) for v in maxs], [0.289000004529953])
        self.assertEqual([float(v) for v in mins], [-3.9100000858306885])

    def test_get_leaf(self):
        data = self.bw.query("chr1", 11000, 11005, 5)
        means = [x['mean'] for x in data]
        assert numpy.allclose([float(m) for m in means], [
            0.050842501223087311, -2.4589500427246094, 0.050842501223087311,
            0.050842501223087311, 0.050842501223087311
        ])

        # Test min and max for this entire leaf region
        data = self.bw.query("chr1", 11000, 11005, 1)
        maxs = [x['max'] for x in data]
        mins = [x['min'] for x in data]
        self.assertEqual([float(v) for v in maxs], [0.050842501223087311])
        self.assertEqual([float(v) for v in mins], [-2.4589500427246094])

    def test_wrong_nochrom(self):
        # A chromosome absent from the file yields None.
        data = self.bw.query("chr2", 0, 10000, 10)
        self.assertEqual(data, None)
def get_signal(inputfile, output, plus, minus, fulllen):
    """Append strand-oriented per-base signal of a fixed-width window.

    For each record a window of ``fulllen`` bases is placed so that it
    extends ``fulllen // 2 - motiflen // 2`` bases upstream of the motif
    (relative to the strand in column 6) and summarized with one bin per
    base.  ``+`` records get the plus track followed by the minus track;
    all other records get the reversed minus track followed by the
    reversed plus track.

    When the signal cannot be read the record is printed to standard
    output and written with ``fulllen`` zeros for each strand.

    Args:
        inputfile: Path of the whitespace-separated input file.
        output: Path of the tab-separated output file (overwritten).
        plus: Path of the plus-strand bigWig file.
        minus: Path of the minus-strand bigWig file.
        fulllen: Window width in bases.
    """
    with open(plus, 'rb') as plus_fh, \
            open(minus, 'rb') as minus_fh, \
            open(inputfile) as inf, \
            open(output, 'w') as outf:
        plusbw = BigWigFile(plus_fh)
        minusbw = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            motiflen = int(ll[2]) - int(ll[1])
            upstream_ext = fulllen // 2 - motiflen // 2
            try:
                if ll[5] == "+":
                    start = int(ll[1]) - upstream_ext
                    end = start + fulllen
                    forward_signal = list(plusbw.summarize(
                        ll[0], start, end, end - start).sum_data)
                    reverse_signal = list(minusbw.summarize(
                        ll[0], start, end, end - start).sum_data)
                else:
                    end = int(ll[2]) + upstream_ext
                    start = end - fulllen
                    forward_signal = list(minusbw.summarize(
                        ll[0], start, end, end - start).sum_data)[::-1]
                    reverse_signal = list(plusbw.summarize(
                        ll[0], start, end, end - start).sum_data)[::-1]
            except Exception:
                print(ll)
                # The window is always fulllen wide; using it directly
                # avoids reading start/end, which may be unset or left
                # over from the previous record when the failure is early.
                forward_signal = [0] * fulllen
                reverse_signal = [0] * fulllen
            newll = ll + forward_signal + reverse_signal
            outf.write("\t".join(map(str, newll)) + "\n")
def check_position(chrom, start, end):
    """Tell whether enough bigWig files cover the region ``[start, end]``.

    Every ``*.bigWig`` and ``*.bw`` file found under ``DATAPATH + "data"``
    (searched recursively) is summarized over ``[start, end + 1)`` as a
    single bin.  A file counts as valid when at least 10% of the region's
    bases have data.

    Args:
        chrom: Chromosome name.
        start: First base of the region.
        end: Last base of the region (inclusive).

    Returns:
        bool: True when at least 75% of the files are valid; False when no
        file was found at all.
    """
    valids = 0.
    wrong = 0.
    region_len = end - start + 1
    for directory, _subdirs, _files in os.walk(DATAPATH + "data"):
        for filename in (glob(directory + "/*.bigWig") +
                         glob(directory + "/*.bw")):
            # Close each file right away; many files may be scanned.
            with open(filename, "rb") as f:
                summary = BigWigFile(file=f).summarize(
                    chrom, start, end + 1, 1)
            if summary.valid_count * 10 < region_len:
                wrong += 1
            else:
                valids += 1
    total = valids + wrong
    if not total:
        # No bigWig file found: report "not covered" rather than dividing
        # by zero.
        return False
    return valids / total >= 0.75
def get_signal(inputfile, output, signalbw):
    """Append the summed bigWig signal around each record.

    For every record whose chromosome is listed in the chromosome tree of
    ``signalbw``, the window ``[max(start - 50, 0), end + 50)`` is
    summarized as a single bin and the sum is appended to the record.

    Args:
        inputfile: Path of the whitespace-separated input file.
        output: Path of the tab-separated output file (overwritten).
        signalbw: Path of the bigWig file.
    """
    known_chroms = set(
        node['key'] for node in BwIO(signalbw).chromosomeTree['nodes'])

    with open(signalbw, 'rb') as bw_fh, \
            open(inputfile) as inf, \
            open(output, 'w') as outf:
        bwHandle = BigWigFile(bw_fh)
        for line in inf:
            ll = line.split()
            if ll[0] not in known_chroms:
                continue
            signal = bwHandle.summarize(
                ll[0], max(int(ll[1]) - 50, 0), int(ll[2]) + 50, 1)
            # One bin was requested, so take its single value explicitly.
            ll.append(str(float(signal.sum_data[0])))
            outf.write("\t".join(ll) + "\n")
def count_mean_signal(enhancers, bigwig_file, name):
    """Flag enhancers with insufficient bigWig coverage and save the flags.

    For each enhancer the region ``[start, end + 1)`` is summarized as a
    single bin.  The flag is 1 when fewer than 10% of the region's bases
    have data and 0 otherwise.  One ``"<id>\\t<flag>"`` line per enhancer is
    written to ``<bigwig_file>.means.<basename of name>``.

    Nothing is computed when that output file already exists.  Progress
    and timing messages are printed to standard output.

    Args:
        enhancers: Iterable of objects exposing ``id``, ``chromosome``,
            ``start`` and ``end``.
        bigwig_file: Path of the bigWig file.
        name: Label whose last ``/``-separated component names the output.

    Returns:
        list: The 0/1 flags in enhancer order, or ``[]`` when the output
        file already existed.
    """
    # Single-argument print() emits the same text under Python 2 as the
    # original comma-separated print statements.
    print("processing %s" % (bigwig_file,))
    means_path = bigwig_file + ".means." + name.split('/')[-1]
    if os.path.exists(means_path):
        print("file exists: %s" % (means_path,))
        return []

    # time.clock is used where available (as before); newer interpreters
    # that dropped it fall back to perf_counter.
    clock = time.clock if hasattr(time, 'clock') else time.perf_counter

    fails = []
    # bigWig is binary, so open in 'rb'; both files are closed on error.
    with open(bigwig_file, "rb") as f:
        print("bigwig file opened")
        bigwig = BigWigFile(file=f)
        # NOTE(review): the result is not used below; the call is kept in
        # case it matters to callers -- confirm it can be dropped.
        count_chrom_mean(bigwig)
        print("chromosome means counted")
        with open(means_path, "w") as output2:
            print("output file opened %s" % (means_path,))
            start = clock()
            for i, enh in enumerate(enhancers):
                if i % 10000 == 0:
                    print("%s %s %s" % (bigwig_file, name, i))
                # end + 1 because the summary end point is exclusive.
                summary = bigwig.summarize(enh.chromosome, enh.start,
                                           enh.end + 1, 1)
                if summary.valid_count * 10 < enh.end - enh.start + 1:
                    flag = 1
                else:
                    flag = 0
                fails.append(flag)
                output2.write("%d\t%f\n" % (enh.id, flag))
    print("output written to: %s.means.%s" % (bigwig_file,
                                              name.split('/')[-1]))
    end = clock()
    print("time: %.2f s" % (end - start))
    return fails
def summary(bwfile, bedfile, out, central_max, central_min, flanking_max,
            flanking_min, cutoff):
    """Scan BED regions for footprints in a bigWig track and write them out.

    Each region of ``bedfile`` whose chromosome exists in ``bwfile`` is read
    at one value per base and handed to ``caculate_footprint`` together with
    the five tuning parameters. Every footprint it returns is written to
    ``out`` as ``chrom, region_start + ft[0], region_start + ft[1],
    <BED column 4>, ft[2]`` (tab separated). The elapsed time is printed at
    the end.

    Args:
        bwfile: path of the bigWig signal file.
        bedfile: path of the input regions; at least 4 columns are read.
        out: path of the output BED file.
        central_max, central_min, flanking_max, flanking_min, cutoff:
            passed through unchanged to ``caculate_footprint``.
    """
    chrom_len = {}
    for node in BwIO(bwfile).chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    # Context managers close the handles even when a region fails.
    with open(bwfile, 'rb') as bw_fh, open(bedfile) as inf, \
            open(out, 'w') as outf:
        bw_handle = BigWigFile(bw_fh)
        t = time.time()
        for line in inf:
            ll = line.split()
            if ll[0] not in chrom_len:
                continue
            start, end = int(ll[1]), int(ll[2])
            # One bin per base pair of the region.
            region = bw_handle.summarize(ll[0], start, end, end - start)
            digital = list(region.sum_data)
            footprints = caculate_footprint(digital, central_max, central_min,
                                            flanking_max, flanking_min, cutoff)
            for ft in footprints:
                fields = [ll[0], start + ft[0], start + ft[1], ll[3], ft[2]]
                outf.write("\t".join(map(str, fields)) + "\n")
    print("scaning 1st  %s" % (time.time() - t))
class TestBigWig(unittest.TestCase):
    """Regression tests for BigWigFile.query / summarize on test.bw."""

    def setUp(self):
        # bigWig is binary; keep the handle so tearDown can release it.
        self._fh = open("test_data/bbi_tests/test.bw", "rb")
        self.bw = BigWigFile(file=self._fh)

    def tearDown(self):
        self._fh.close()

    def test_get_summary(self):
        """Ten-bin summary of chr1:10000-20000 via query() and summarize()."""
        expected_means = [
            -0.17557571594973645, -0.054009292602539061,
            -0.056892242431640622, -0.03650328826904297,
            0.036112907409667966, 0.0064466032981872557,
            0.036949024200439454, 0.076638259887695306,
            0.043518108367919923, 0.01554749584197998,
        ]
        data = self.bw.query("chr1", 10000, 20000, 10)
        means = [float(x['mean']) for x in data]
        self.assertTrue(numpy.allclose(means, expected_means))
        # Summarize variant
        sd = self.bw.summarize("chr1", 10000, 20000, 10)
        self.assertTrue(
            numpy.allclose(sd.sum_data / sd.valid_count, expected_means))
        # Test min and max for this entire summary region
        data = self.bw.query("chr1", 10000, 20000, 1)
        maxs = [float(x['max']) for x in data]
        mins = [float(x['min']) for x in data]
        self.assertEqual(maxs, [0.289000004529953])
        self.assertEqual(mins, [-3.9100000858306885])

    def test_get_leaf(self):
        """Per-base values of a five-base window."""
        data = self.bw.query("chr1", 11000, 11005, 5)
        means = [float(x['mean']) for x in data]
        self.assertTrue(numpy.allclose(
            means,
            [0.050842501223087311, -2.4589500427246094,
             0.050842501223087311, 0.050842501223087311,
             0.050842501223087311]))
        # Test min and max for this entire leaf region
        data = self.bw.query("chr1", 11000, 11005, 1)
        maxs = [float(x['max']) for x in data]
        mins = [float(x['min']) for x in data]
        self.assertEqual(maxs, [0.050842501223087311])
        self.assertEqual(mins, [-2.4589500427246094])

    def test_wrong_nochrom(self):
        """Querying a chromosome missing from the file returns None."""
        data = self.bw.query("chr2", 0, 10000, 10)
        self.assertEqual(data, None)
def get_regionLevel_reads(inbed, outputname, plusbw, minusbw, species, flank):
    """Count cut-weighted sequence contexts per BED region.

    For every region of ``inbed`` the plus and minus cut tracks are read at
    one value per base. For each position with a positive cut count, the
    ``2 * flank``-long genomic window is looked up (reverse-complemented for
    the minus strand; its genomic window is shifted by +1) and the cut count
    is added to that sequence's counter. Windows containing ``N`` are
    ignored.

    Output ``<outputname>_seqtype.bed`` starts with a header line
    (``C0..Cn`` for the input columns, then the sorted keys of the counter
    template) followed by one line per region: the input columns plus the
    counters in the same sorted order. Regions for which either track yields
    no summary are still written, with their counters untouched.

    NOTE(review): the genome path is hard-coded to
    ``/scratch/sh8tv/Data/Genome/<species>/<species>.2bit``.

    Args:
        inbed: path of the input regions (chrom, start, end, ...).
        outputname: prefix of the output file.
        plusbw, minusbw: paths of the plus / minus strand cut bigWig files.
        species: genome name used to build the 2bit path.
        flank: half-width of the sequence context.
    """
    genome = twobitreader.TwoBitFile(
        "/scratch/sh8tv/Data/Genome/%s/%s.2bit" % (species, species))
    countdict_template = make_nmer_dict(2 * flank)
    # Every per-region dict is a copy of the template, so the sorted key
    # order is computed once instead of once per region.
    seqtypes = sorted(countdict_template.keys())
    window = 2 * flank
    random.seed(1228)

    with open(plusbw, 'rb') as plus_fh, open(minusbw, 'rb') as minus_fh, \
            open(inbed) as inf, \
            open(outputname + "_seqtype.bed", 'w') as outf:
        plusBWH = BigWigFile(plus_fh)
        minusBWH = BigWigFile(minus_fh)

        # Header: one generic name per input column, then the sequences.
        n_input_cols = len(inf.readline().split())
        header = ["C" + str(i) for i in range(n_input_cols)] + seqtypes
        outf.write("\t".join(header) + "\n")
        inf.seek(0)

        for line in inf:
            Sdict = deepcopy(countdict_template)
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            plusSig_obj = plusBWH.summarize(chrm, start, end, end - start)
            minusSig_obj = minusBWH.summarize(chrm, start, end, end - start)
            if plusSig_obj and minusSig_obj:
                plusSequence = genome[chrm][(start - flank):(end + flank)].upper()
                minusSequence = genome[chrm][(start - flank + 1):(end + flank + 1)].upper()
                for i, pcuts in enumerate(plusSig_obj.sum_data):
                    if pcuts > 0:
                        pseq = plusSequence[i:(i + window)]
                        if "N" not in pseq:
                            Sdict[pseq] += pcuts
                for i, mcuts in enumerate(minusSig_obj.sum_data):
                    if mcuts > 0:
                        mseq = revcomp(minusSequence[i:(i + window)]).upper()
                        if "N" not in mseq:
                            Sdict[mseq] += mcuts
            newll_seq = ll + [Sdict[x] for x in seqtypes]
            outf.write("\t".join(map(str, newll_seq)) + "\n")
def sitepro_scan(pattern,peak,out,w_plus,w_minus,trunk):
    """Scan peaks for the best pattern-matching windows (footprints).

    ``pattern`` holds two comma-separated lines: the plus and the minus
    strand cut pattern. Both are normalized by their combined sum. Each peak
    whose chromosome exists in both bigWig files is read at one value per
    base, values are capped at ``trunk``, and every window of the pattern
    length is scored with ``match_pattern``. Among overlapping windows only
    the highest-scoring one is kept. Each kept window is written to ``out``
    as ``chrom, start + 3, end - 3, score`` in genomic coordinates.

    Args:
        pattern: path of the two-line pattern file.
        peak: path of the regions to scan (chrom, start, end, ...).
        out: path of the tab-separated footprint file to write.
        w_plus, w_minus: paths of the plus / minus strand cut bigWig files.
        trunk: upper cap applied to every per-base cut count.
    """
    with open(pattern) as inf:
        pattern_plus = [float(v) for v in inf.readline().strip().split(",")]
        pattern_minus = [float(v) for v in inf.readline().strip().split(",")]
    total = sum(pattern_plus) + sum(pattern_minus)
    p_plus = [v / total for v in pattern_plus]
    p_minus = [v / total for v in pattern_minus]
    width = len(pattern_plus)
    p0 = [1.0 / (2 * width)] * width  # flat background

    chrom_len1 = {}
    chrom_len2 = {}
    for node in BwIO(w_plus).chromosomeTree['nodes']:
        chrom_len1[node['key']] = node['chromSize']
    for node in BwIO(w_minus).chromosomeTree['nodes']:
        chrom_len2[node['key']] = node['chromSize']

    footprint = []
    count = 0
    t = time.time()
    with open(peak) as inf, open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            if ll[0] in chrom_len1 and ll[0] in chrom_len2:
                start = int(ll[1])
                end = int(ll[2])
                p_sum = list(w_plus_H.summarize(ll[0], start, end, end - start).sum_data)
                m_sum = list(w_minus_H.summarize(ll[0], start, end, end - start).sum_data)
                # Best window of the current run of overlapping windows.
                best_start = best_end = best_score = None
                for i in range(len(p_sum) - width):
                    o_plus = [min(float(v), trunk) for v in p_sum[i:i + width]]
                    o_minus = [min(float(v), trunk) for v in m_sum[i:i + width]]
                    score = match_pattern(p_plus, p_minus, p0, p0,
                                          o_plus, o_minus, width)
                    if score == "NA":
                        continue
                    if best_start is None:
                        best_start, best_end, best_score = i, i + width, score
                    elif i >= best_end:
                        # No overlap with the kept window: emit it and start
                        # a new run.
                        footprint.append([ll[0], start + best_start + 3,
                                          start + best_end - 3, best_score])
                        best_start, best_end, best_score = i, i + width, score
                    elif score > best_score:
                        best_start, best_end, best_score = i, i + width, score
                # A region shorter than the pattern, or one where no window
                # could be scored, has nothing to emit (this used to raise
                # TypeError from adding the "NA" placeholder to an int).
                if best_start is not None:
                    footprint.append([ll[0], start + best_start + 3,
                                      start + best_end - 3, best_score])
            if count % 100 == 0:
                print(time.time() - t)
                t = time.time()
            count += 1

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
def sitepro_scan(peak,out_bed,w_plus,w_minus,bg0,span,gen,lflank,rflank,offset,bpshift,weight):
    """Write observed and bias-predicted cleavage profiles for each peak.

    For every line of ``peak`` the plus and minus bigWig tracks are read at
    one value per base over ``[start - span, end + span)``. Three per-base
    bias estimates are computed for both strands:

    * ``px`` / ``nx``: product of two background-table lookups (``pBG`` and
      ``nBG``) for the sequence at the cut and the one ``offset`` away;
    * ``pnew`` / ``nnew``: ``e ** (forward + reverse predict2 output)`` using
      the parameters returned by ``paramest``;
    * ``pweight`` / ``nweight``: the same using the parameters returned by
      ``apply_weight``.

    Each estimate is multiplied by the observed cut total of the ``2 * span``
    window starting at that position. The output line is the input columns
    followed by the observed plus and minus cuts (``span`` trimmed from both
    ends) and the six scaled profiles. Peaks are skipped when the required
    genomic sequence contains ``N`` or would start before position 0.

    NOTE(review): ``readBG``, ``paramest``, ``apply_weight`` and ``predict2``
    are defined elsewhere; the shape of what they return is only known here
    from how the values are used. The literal ``8`` passed to ``predict2``
    presumably matches ``lflank + rflank`` -- confirm.
    """
    nmer = lflank + rflank  # not used below
    genome = twobitreader.TwoBitFile(gen)
    pBG,nBG = readBG(bg0)
    #p2BG,n2BG = readBG(bg2)
    code = encoding()  # not used below
    b0s0,b1s0,b2s0 = paramest(pBG)
    new_b0,forward_b1,forward_b2,reverse_b1,reverse_b2 = apply_weight( b0s0,b1s0,b2s0,weight)
    inf = open(peak)
    w_plus_H=BigWigFile(open(w_plus, 'rb'))
    w_minus_H=BigWigFile(open(w_minus, 'rb'))
    outf = open(out_bed,'w')
    for line in inf:### chr start end name motifscore strand FP DNase chip
        ll = line.split()##### 3 below is flanking length
        chrm = ll[0]
        start = int(ll[1])
        end = int(ll[2])
        # Observed cuts, one value per base, with `span` extra bases per side.
        p_sum = list(w_plus_H.summarize(chrm,start-span,end+span,end-start+2*span).sum_data)
        n_sum = list(w_minus_H.summarize(chrm,start-span,end+span,end-start+2*span).sum_data)
        praw=[]
        nraw=[]
        px = []
        nx = []
        # pnew / nnew are reassigned further down, before they are read.
        pnew=[]
        nnew=[]
        # Skip the peak when any base needed by the lookups below is N.
        if 'N' in genome[chrm][min((start-span+1-offset +bpshift-lflank),(start-span+1 -bpshift-rflank) ):max((end+span+offset+lflank-bpshift),(end+span + bpshift + rflank))].upper():
            # print line
            # print genome[chrm][min((start-span+1-offset +bpshift-lflank),(start-span+1 -bpshift-rflank) ):max((end+span+offset+lflank-bpshift),(end+span + bpshift + rflank))].upper()
            continue
        # Table-lookup bias for every position of the extended region.
        for bp1 in range(-span,end-start+span):
            loci = start + bp1
            pseq = genome[chrm][(loci + bpshift - lflank) : (loci + bpshift + rflank)].upper()
            pseq_apart = genome[chrm][(loci+offset -bpshift-rflank):(loci+offset -bpshift+lflank)].upper()
            nseq = genome[chrm][(loci+1 -bpshift-rflank) : (loci+1 -bpshift+lflank)].upper()
            nseq_apart = genome[chrm][(loci+1-offset +bpshift-lflank):(loci+1-offset +bpshift+rflank)].upper()
            # praw.append(pBG[pseq])
            # nraw.append(nBG[nseq])
            px.append(pBG[pseq] * nBG[pseq_apart])
            nx.append(nBG[nseq] * pBG[nseq_apart])
            # p_yf = predict2(pseq,b0s0,b1s0,b2s0,8)
            # p_yr = predict2(revcomp(pseq_apart),b0s0,b1s0,b2s0,8)[::-1]
            # pnew.append(pow(numpy.e,(p_yf+p_yr)[0]))
            # n_yr= predict2(nseq_apart,b0s0,b1s0,b2s0,8)
            # n_yf = predict2(revcomp(nseq),b0s0,b1s0,b2s0,8)[::-1]
            # nnew.append(pow(numpy.e,(n_yf+n_yr)[0]))
        ### new method
        ### 1. fetch seq
        # One genome read covering both the forward and the reverse window.
        seq_start_plus = start - span - offset + 1 + bpshift - lflank
        seq_start_minus = start - span + 1 - bpshift - rflank
        seq_end_plus = end + span -1 + bpshift + rflank
        seq_end_minus = end + span -1 + offset - bpshift + lflank
        seq_start = min(seq_start_plus,seq_start_minus)
        if seq_start < 0 :
            #print line
            continue
        seq_end = max(seq_end_plus,seq_end_minus)
        code_seq = genome[chrm][seq_start:seq_end].upper()
        if 'N' in code_seq :
            #print line
            continue
        ### 2. fetch base
        seq_f = code_seq[ (seq_start_plus - seq_start) : (seq_end_plus - seq_start) ]
        seq_r = code_seq[ (seq_start_minus - seq_start) : (seq_end_minus - seq_start) ]
        yf = predict2(seq_f,b0s0,b1s0,b2s0,8)
        # print len(seq_f),len(yf)
        # The reverse-strand prediction is flipped back into forward order.
        yr = predict2(revcomp(seq_r),b0s0,b1s0,b2s0,8)[::-1]
        z = yf + yr
        # Plus and minus profiles are the same vector shifted by offset - 1.
        pnew = list(pow(numpy.e,z[(offset-1):]))
        nnew = list(pow(numpy.e,z[:(-offset+1)]))
        # Same computation with the weighted parameters; the forward and
        # reverse coefficient sets are swapped between the two strands.
        yfp_weight = predict2(seq_f,new_b0,forward_b1,forward_b2,8)
        yrp_weight = predict2(revcomp(seq_r),0,reverse_b1,reverse_b2,8)[::-1]
        z_p = yfp_weight + yrp_weight
        yfn_weight = predict2(seq_f,new_b0,reverse_b1,reverse_b2,8)
        yrn_weight = predict2(revcomp(seq_r),0,forward_b1,forward_b2,8)[::-1]
        z_n = yfn_weight + yrn_weight
        pweight = list(pow(numpy.e,z_p[(offset-1):]))
        nweight = list(pow(numpy.e,z_n[:(-offset+1)]))
        ## get predicted seqbias
        praw_assign =[]
        nraw_assign =[]
        px_assign = []
        nx_assign = []
        pnew_assign =[]
        nnew_assign =[]
        pweight_assign=[]
        nweight_assign=[]
        # Scale each bias value by the observed cuts of the 2*span window
        # that starts at bp; the bias is read at index bp + span.
        for bp in range(len(p_sum)- 2*span):
            ptotal = sum(p_sum[bp:(bp+2*span)])*1.0
            ntotal = sum(n_sum[bp:(bp+2*span)])*1.0
            px_assign.append(ptotal * px[bp+span])
            nx_assign.append(ntotal * nx[bp+span])
            pnew_assign.append(ptotal * pnew[bp+span])
            nnew_assign.append(ntotal * nnew[bp+span])
            pweight_assign.append(ptotal * pweight[bp+span])
            nweight_assign.append(ntotal * nweight[bp+span])
            # pnewlin_assign.append(ptotal * pnew_linear[bp+span]/sum(pnew_linear[bp:(bp+2*span)]))
            # nnewlin_assign.append(ntotal * nnew_linear[bp+span]/sum(nnew_linear[bp:(bp+2*span)]))
            #pf6_assign.append(ptotal * pnew_f6[bp+span]/sum(pnew_f6[bp:(bp+2*span)]))
            #pf8_assign.append(ptotal * pnew_f8[bp+span]/sum(pnew_f8[bp:(bp+2*span)]))
        #print type(pnew)
        #print type(pnew_assign)
        ### write real cleavage , seqbias , seqbias predicted cleavage
        newll = ll + p_sum[span:(len(p_sum)-span)] + n_sum[span:(len(n_sum)-span)] +px_assign +nx_assign+ pnew_assign + nnew_assign + pweight_assign + nweight_assign
        # newll = ll + p_sum[span:(len(p_sum)-span)] + n_sum[span:(len(n_sum)-span)] + praw_assign +nraw_assign +px_assign +nx_assign+ pnew_assign + nnew_assign
        # newll = ll + p_sum[span:(len(p_sum)-span)] + n_sum[span:(len(n_sum)-span)] + pnew[span:(len(n_sum)-span)] + nnew[span:(len(n_sum)-span)] + pnew_assign + nnew_assign + pf6_assign + pf8_assign
        outf.write("\t".join(map(str,newll))+"\n")
        #print "predict cut time :",time.time()-t
    outf.close()
    inf.close()
class HilbertMatrixBigWig(HilbertMatrix):
    """HilbertMatrix whose cells are filled from a bigWig file.

    Only build() is overridden; everything else behaves like HilbertMatrix.
    """

    def __init__(self, *args, **kwargs):
        """
        Subclass of HilbertMatrix specifically for bigWig format files
        """
        super(HilbertMatrixBigWig, self).__init__(*args, **kwargs)

    def build(self):
        """
        Build the matrix.

        Since bigWig files are essentially pre-summarized, this just extracts
        the chrom/start/stop represented by each cell in the matrix and fills
        it with the mean value from the bigWig file. Cells without any data
        are set to 0.
        """
        # bigWig is a binary format. The handle stays open because the
        # BigWigFile object is kept on the instance.
        self.bigwig = BigWigFile(open(self.file, 'rb'))
        chrom_rc, chrom_bins = self.chrom2rc()
        if self.chrom == 'genome':
            chroms = list(self.chromdict)
        else:
            chroms = [self.chrom]

        for chrom in chroms:
            rc = chrom_rc[chrom]
            nbins = chrom_bins[chrom]
            start, stop = self.chromdict[chrom]
            results = self.bigwig.summarize(chrom, start, stop, nbins)
            # Bins without data give 0/0 = nan, replaced by 0 right below;
            # the matching numpy warnings are therefore silenced.
            with np.errstate(divide='ignore', invalid='ignore'):
                values = results.sum_data / results.valid_count
            values[np.isnan(values)] = 0
            self.matrix[rc[:, 0], rc[:, 1]] = values

        self._cleanup()

    def chrom2rc(self):
        """
        Return a dictionary of {chrom: (rows, cols)} and {chrom: nbins}

        Each chromosome receives a consecutive slice of the precomputed
        row/column table, sized in proportion to its stop coordinate
        relative to ``self.chrom_length``.
        """
        npz_path = os.path.join(os.path.dirname(__file__), 'precomputed.npz')
        # Indexing loads the array into memory, so the archive can be closed
        # as soon as the table for this matrix size has been read.
        with np.load(npz_path) as precomputed:
            rc = precomputed['_%s' % self.matrix_dim]

        total_cells = self.matrix_dim * self.matrix_dim
        d = {}
        bins = {}
        last_stop = 0
        for chrom, (_start, stop) in self.chromdict.items():
            frac = stop / float(self.chrom_length)
            nbins = int(frac * total_cells)
            d[chrom] = rc[last_stop:last_stop + nbins, :]
            bins[chrom] = nbins
            last_stop += nbins
        return d, bins
def _sample_bigwig_profile(path, chroms, chrom_len, step):
    """Return the binned mean signal of one bigWig file as R-ready strings."""
    profile = []
    with open(path, 'rb') as fh:
        bw = BigWigFile(fh)
        for chrom in chroms:
            # One bin per `step` kb; `//` keeps this an integer on both
            # Python 2 and Python 3.
            nbins = chrom_len[chrom] // step // 1000
            # The too-short chromosome will cause error in bw.summarize
            # function below, so filter them out
            if nbins == 0:
                warn("A very-short chromosome (%s) found and skipped"%chrom)
                continue
            summary = bw.summarize(chrom, 0, chrom_len[chrom], nbins)
            if not summary:
                continue
            # Bins without data give nan, which R reads as NA.
            means = summary.sum_data / summary.valid_count
            profile.extend(str(t).replace('nan', 'NA') for t in means)
    return profile


def _write_plot_commands(rfhd, options, imgfmt, wiglabel, wigfilenum):
    """Write the R code that filters the samples and draws the figure."""
    rfhd.write("c <- cbind(p0")
    for i in range(wigfilenum-1):
        rfhd.write(",p%d" % (i+1))
    rfhd.write(")\n")

    # Keep only the rows whose values all lie within [minscore, maxscore].
    rfhd.write("c <- c[ c[,1]<=%f & c[,1]>=%f " % (options.maxscore,options.minscore))
    for i in range(wigfilenum-1):
        rfhd.write("& c[,%d]<=%f & c[,%d]>=%f " % (i+2,options.maxscore,i+2,options.minscore))
    rfhd.write(",]\n")

    if imgfmt == 'PDF':
        rfhd.write("pdf(\"%s.pdf\",width=%d,height=%d)\n" % (options.rfile,options.imgsize,options.imgsize))
    elif imgfmt == 'PNG':
        rfhd.write("png(\"%s.png\",units=\"in\",res=150,width=%d,height=%d)\n" % (options.rfile,options.imgsize,options.imgsize))

    labels = ",".join(map(lambda x:"\""+x+"\"",wiglabel))
    if options.heatmap:
        rfhd.write('library(gplots)\n')
        rfhd.write('\nm <- cor(c, method="pearson", use="pairwise.complete.obs")\n')
        rfhd.write("rownames(m) <- c(%s)\n" % labels)
        rfhd.write("colnames(m) <- c(%s)\n" % labels)
        rfhd.write('# draw the heatmap using gplots heatmap.2\n')
        rfhd.write('mn <- -1\n')
        rfhd.write('mx <- 1\n')
        rfhd.write('n <- 98\n')
        rfhd.write('bias <- 1\n')
        rfhd.write('mc <- matrix(as.character(round(m, 2)), ncol=dim(m)[2])\n')
        rfhd.write('breaks <- seq(mn, mx, (mx-mn)/(n))\n')
        rfhd.write('cr <- colorRampPalette(colors = c("#2927FF","#FFFFFF","#DF5C5C"), bias=bias)\n')
        rfhd.write('heatmap.2(m, col = cr(n), breaks=breaks, trace="none", cellnote=mc, notecol="black", notecex=1.8, keysize=0.5, density.info="histogram", margins=c(27.0,27.0), cexRow=2.20, cexCol=2.20, revC=T, symm=T)\n')
    else:
        # Scatterplot matrix. The R statements must sit on separate lines.
        rfhd.write(
            '\n'
            'panel.plot <- function( x,y, ... )\n'
            '{\n'
            '  par(new=TRUE)\n'
            '  m <- cbind(x,y)\n'
            '  plot(m,col=densCols(m),pch=20)\n'
            '  lines(lowess(m[!is.na(m[,1])&!is.na(m[,2]),]),col="red")\n'
            '}\n'
            '\n'
            'panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...)\n'
            '{\n'
            '  usr <- par("usr"); on.exit(par(usr))\n'
            '  par(usr = c(0, 1, 0, 1))\n'
            '  r <- cor(x, y,use="complete.obs")\n'
            '  txt <- format(round(r,2),width=5,nsmall=2)\n'
            '  #format(c(r, 0.123456789), digits=digits)[1]\n'
            '  txt <- paste(prefix, txt, sep="")\n'
            '  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)\n'
            '  #text(0.5, 0.5, txt, cex = cex.cor * abs(r))\n'
            '  text(0.5, 0.5, txt, cex = cex.cor)\n'
            '}\n'
        )
        rfhd.write(
            '\n'
            'pairs(c, lower.panel=panel.plot, upper.panel=panel.cor, labels=c(%s))\n'
            % (labels))
    rfhd.write("dev.off()\n")


def main():
    """Command-line entry point: correlation plot for several bigWig files.

    Parses the options, checks the input files, samples every bigWig file
    into bins of ``--step`` kb over the chromosomes common to all files,
    writes an R script (``-r``) that draws a scatterplot matrix or, with
    ``-H``, a correlation heatmap, and finally runs the script with Rscript.

    Exits with status 1 on any invalid argument or when the files share no
    chromosome.
    """
    usage = "usage: %prog <-r rfile> [options] <bigwig files> ..."
    description = "Draw correlation plot for many bigwig files. Based on qc_chIP_whole.py"

    optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False)
    optparser.add_option("-h","--help",action="help",help="Show this help message and exit.")
    optparser.add_option("-r","--rfile",dest="rfile",
                         help="R output file. If not set, do not save R file.")
    optparser.add_option("-s","--step",dest="step",type="int",
                         help="sampling step in kbps. default: 100, minimal: 1",default=100)
    optparser.add_option("-z","--imgsize",dest="imgsize",type="int",
                         help="image size in inches, note the PNG dpi is 72. default: 10, minimal: 10",default=10)
    optparser.add_option("-f","--format",dest="imgformat",type="string",
                         help="image format. PDF or PNG",default='PDF')
    optparser.add_option("-l","--wig-label",dest="wiglabel",type="string",action="append",
                         help="the wiggle file labels in the figure. No space is allowed. This option should be used same times as wiggle files, and please input them in the same order as -w option. default: will use the wiggle file filename as labels.")
    optparser.add_option("--min-score",dest="minscore",type="float",default=-10000,
                         help="minimum score included in calculation. Points w/ score lower than this will be discarded.")
    optparser.add_option("--max-score",dest="maxscore",type="float",default=10000,
                         help="maximum score included in calculation. Points w/ score larger than this will be discarded.")
    optparser.add_option("-H","--heatmap",dest="heatmap",action="store_true",default=False,
                         help="If True, a heatmap image will be generated instead of paired scatterplot image.")

    (options,wigfiles) = optparser.parse_args()

    imgfmt = options.imgformat.upper()
    if imgfmt != 'PDF' and imgfmt != 'PNG':
        print("unrecognized format: %s" % imgfmt)
        sys.exit(1)

    wigfilenum = len(wigfiles)
    if wigfilenum < 2:
        error("must provide >=2 wiggle files")
        optparser.print_help()
        sys.exit(1)
    # The R script is mandatory; report it with its own message instead of
    # the misleading ">=2 wiggle files" one.
    if not options.rfile:
        error("must provide the R output file with -r")
        optparser.print_help()
        sys.exit(1)

    # wig labels
    if options.wiglabel and len(options.wiglabel) == wigfilenum:
        wiglabel = options.wiglabel
    else:
        # or use the filename
        wiglabel = [os.path.basename(x) for x in wigfiles]

    if options.step < 1:
        error("Step can not be lower than 1!")
        sys.exit(1)
    if options.imgsize < 10:
        error("Image size can not be lower than 10!")
        sys.exit(1)

    # check the files
    for f in wigfiles:
        if not os.path.isfile(f):
            error("%s is not valid!" % f)
            sys.exit(1)

    info("number of bigwig files: %d" % wigfilenum)

    # Chromosome lengths come from the first file.
    first_nodes = BwIO(wigfiles[0]).chromosomeTree['nodes']
    chrom_len = {}
    for node in first_nodes:
        chrom_len[node['key']] = node['chromSize']

    # get the common chromosome list (sorted so the R file is reproducible)
    chrset = set(chrom_len)
    for bw in wigfiles[1:]:
        chrset &= set(t['key'] for t in BwIO(bw).chromosomeTree['nodes'])
    chroms = sorted(chrset)
    if not chroms:
        error('No common chrom found')
        sys.exit(1)
    info("common chromosomes are %s." % ",".join(chroms))

    # Write the R file; `with` guarantees it is flushed before Rscript runs.
    with open(options.rfile,"w") as rfhd:
        rfhd.write('''require("RColorBrewer") ## from CRAN\n''')
        # for each wig file, sample...
        for i, wigfile in enumerate(wigfiles):
            info("read wiggle track from bigwig file #%d" % (i+1))
            profile = _sample_bigwig_profile(wigfile, chroms, chrom_len,
                                             options.step)
            info("write values to r file")
            rfhd.write("p%d <- c(%s)\n" %(i, ','.join(profile)))
        _write_plot_commands(rfhd, options, imgfmt, wiglabel, wigfilenum)

    # try to call R
    try:
        status = subprocess.call(['Rscript',options.rfile])
    except OSError:
        # Rscript is not installed or cannot be started.
        status = None
    if status == 0:
        # The image is written with a lower-case extension.
        info("Please check %s" % (options.rfile+'.'+imgfmt.lower()))
    else:
        info("Please check %s" % options.rfile)