def filter_a_chrom(chrom, oriPeaks, subPeaks, chromWig, args):
    '''
    Filter the sub-peaks of one chromosome against the peaks that contain them.

    oriPeaks and subPeaks are lists of peaks from the same chromosome. Both
    peak lists are sorted by coordinates on the chromosome. A peak is an
    indexable record whose element [1] is the start and element [2] the end.

    chromWig is a 2-D array whose column 0 holds the (sorted) positions; it is
    searched with bisect and expanded with wig.expandWig.

    For every peak in oriPeaks, the sub-peaks whose start lies inside
    [peak start, peak end] are examined. A sub-peak is kept when the maximum
    of the expanded signal inside the sub-peak is greater than both
    args.threshfrac * (maximum of the whole peak) and args.cutoff.

    Kept sub-peaks are modified in place: element [4] is set to the sub-peak
    maximum and element [5] to args.strand. The returned list holds those
    same (mutated) records, not copies.

    Sub-peaks whose window is empty after clipping to the expanded signal are
    skipped. The chrom argument is not used.
    '''
    # Column view taken once; bisect only needs a sorted sequence.
    positions = chromWig[:, 0]
    sub_peak_starts = [k[1] for k in subPeaks]
    result = []
    for op in oriPeaks:
        startIdx = bisect.bisect_left(positions, op[1])
        endIdx = bisect.bisect_right(positions, op[2])
        subStartIdx = bisect.bisect_left(sub_peak_starts, op[1])
        subEndIdx = bisect.bisect_right(sub_peak_starts, op[2])
        # No wig data or no sub-peak inside this peak: nothing to report.
        if endIdx <= startIdx or subEndIdx <= subStartIdx:
            continue

        currWig = wig.expandWig(chromWig[startIdx:endIdx, :], 0, 1, smooth=False)
        startPos = chromWig[startIdx, 0]
        maxV = np.max(currWig)  # The maximum value over the whole peak
        lastIdx = currWig.shape[0] - 1

        for subI in range(subStartIdx, subEndIdx):
            sub = subPeaks[subI]
            # Cast to int: the difference is a float whenever the position
            # column is stored as float, and NumPy rejects float slice bounds.
            relStart = int(max(0, sub[1] - startPos))
            relEnd = int(min(lastIdx, sub[2] - startPos))
            # np.max raises ValueError on an empty slice, so skip sub-peaks
            # that fall outside the expanded signal after clipping.
            if relEnd <= relStart:
                continue
            subMaxV = np.max(currWig[relStart:relEnd])
            if subMaxV > args.threshfrac * maxV and subMaxV > args.cutoff:
                sub[4] = subMaxV
                sub[5] = args.strand
                result.append(sub)
    return result
def add_chrom_data(taskQ, outQ, processID, args, strand):
    '''
    Worker loop that turns per-chromosome wig arrays into variableStep text.

    Tasks are read from taskQ with taskQ.get() until the 'STOP' sentinel is
    received. Each task is a (chrom, chromWig) pair. The array is expanded
    with wig.expandWig (using args.method, and (args.bw, args.nbw) when the
    method is 'g', otherwise (args.r, args.mean)), and one output line is
    produced for every expanded value greater than 0.8. The resulting list of
    strings, starting with the "variableStep" header, is put on outQ.

    The function reads a name `offset` that it does not define itself, so it
    must be provided by the enclosing module.

    Both the expanded array and the incoming chromWig are resized to length 0
    in place before the result is queued (presumably to release their
    buffers), so the caller's chromWig is no longer usable afterwards.
    '''
    kargs = (args.bw, args.nbw) if args.method == 'g' else (args.r, args.mean)

    while True:
        task = taskQ.get()
        if task == 'STOP':
            break
        chrom, chromWig = task

        # Single formatted argument so the output is the same space-separated
        # text whether print is a statement or a function.
        print("%s %s %s %s" % ("add chrom data Process ", processID,
                               " is processing ", chrom))

        first_pos = chromWig[0, 0]
        expanded = wig.expandWig(chromWig, offset, 1, strand=strand,
                                 method=args.method, kargs=kargs)

        lines = ["variableStep chrom=%s\n" % (chrom,)]
        lines.extend(
            "%d\t%f\n" % (int(i + first_pos - offset), expanded[i])
            for i in range(expanded.shape[0])
            if expanded[i] > 0.8
        )

        # Grow-then-shrink resize of both arrays, expanded first.
        for arr in (expanded, chromWig):
            arr.resize(100000, refcheck=False)
            arr.resize(0, refcheck=False)

        outQ.put(lines)
        gc.collect()
def _match_preferences(fprefer, n_reverse):
    '''
    Match forward peaks to reverse peaks, highest score first.

    fprefer[i] is the candidate collection of forward peak i: tuples
    (reverse index, score, distance, rpos, height) kept in ascending score
    order, supporting len(), [-1] and remove(). The collections are consumed.

    Every forward peak proposes to its remaining candidates from the best
    score downwards. A reverse peak accepts a proposal only when its score is
    strictly greater than the score it currently holds (initially 0); the
    forward peak it held before is released and queued again. A forward peak
    stops proposing as soon as it is accepted.

    Returns (pairF, pairR). Unmatched entries are (-1, 0, 0); matched entries
    are (index of the mate, score, distance, rpos, height).
    '''
    pairF = [(-1, 0, 0)] * len(fprefer)
    pairR = [(-1, 0, 0)] * n_reverse

    # Work list walked by index so it can grow while it is being processed.
    pending = list(range(len(fprefer)))
    cursor = 0
    while cursor < len(pending):
        u = pending[cursor]
        cursor += 1
        prefs = fprefer[u]
        while len(prefs) > 0:
            cand = prefs[-1]
            # A candidate is tried once, whether or not it accepts.
            prefs.remove(cand)
            ridx = cand[0]
            if pairR[ridx][1] < cand[1]:
                displaced = pairR[ridx][0]
                # >= 0, not > 0: index 0 is a valid forward peak.
                if displaced >= 0:
                    pairF[displaced] = (-1, 0, 0)
                    pending.append(displaced)
                pairR[ridx] = (u,) + tuple(cand[1:])
                pairF[u] = cand
                break
    return pairF, pairR


def _pair_chrom(chrom, fp, rp, fwig, rwig, ulimit, dlimit, offset, expandCol):
    '''
    Score and match the forward (fp) and reverse (rp) peaks of one chromosome.

    Returns (singletons, pairs): the unmatched peak records and the GFF-style
    rows built for the matched ones. Element [1] of the peak records is
    decremented in place.
    '''
    fw = fwig[chrom]
    rw = rwig[chrom]
    expandedFw = WIG.expandWig(fw, offset, expandCol, smooth=False)
    expandedRw = WIG.expandWig(rw, offset, expandCol, smooth=False)
    fOrigin = fw[0, 0]
    rOrigin = rw[0, 0]

    rstarts = [r[1] for r in rp]
    rends = [r[2] for r in rp]

    # One score-ordered candidate collection per forward peak.
    fprefer = []
    for currfp in fp:
        start = currfp[1]
        end = currfp[2]
        currFw = expandedFw[max(0, start - fOrigin) + offset:
                            max(0, end - fOrigin) + offset + 1]
        # Candidates: reverse peaks ending at or after start - ulimit and
        # starting at or before end + dlimit.
        si = bisect.bisect_left(rends, start - ulimit)
        ei = bisect.bisect_right(rstarts, end + dlimit)
        prefs = SortedCollection(key=itemgetter(1))
        for idx in range(si, ei):
            rstart = rp[idx][1]
            rend = rp[idx][2]
            currRw = expandedRw[max(0, rstart - rOrigin) + offset:
                                max(0, rend - rOrigin) + offset + 1]
            tempScore, tempDist, tempRpos, tempHeight = getScore(
                currFw, currRw, start, rstart)
            prefs.insert((idx, tempScore, tempDist, tempRpos, tempHeight))
        fprefer.append(prefs)

    pairF, pairR = _match_preferences(fprefer, len(rp))

    singletons = []
    pairs = []
    for i, f in enumerate(pairF):
        fp[i][1] -= 1
        if f[0] == -1:
            singletons.append(fp[i])
        else:
            # NOTE(review): the mate's start is decremented here and once more
            # in the reverse loop below, i.e. twice for paired reverse peaks.
            # Paired reverse peaks are not written out, so only the caller's
            # list is affected - confirm whether that is intended.
            rp[f[0]][1] -= 1
            # NOTE(review): with int operands `/` is integer division under
            # Python 2 and true division under Python 3 - confirm the types
            # returned by getScore before porting.
            pairStart = (2 * f[3] - f[2]) / 2
            pairEnd = pairStart + 1
            pairs.append([fp[i][0], '.', '.', pairStart - 10, pairEnd + 10,
                          f[4], '.', '.', 'cw_distance=' + str(f[2])])
    for i, f in enumerate(pairR):
        rp[i][1] -= 1
        if f[0] == -1:
            singletons.append(rp[i])

    singletons.sort(key=lambda k: (k[0], k[1], k[2]))
    # Columns 1 and 2 of a pair row are both '.', so this stable sort orders
    # by column 0 only and otherwise keeps forward-peak order.
    pairs.sort(key=lambda k: (k[0], k[1], k[2]))
    return singletons, pairs


def pair(fpeaks, rpeaks, fwig, rwig, ulimit, dlimit, prefix):
    '''
    Pair forward-strand peaks with reverse-strand peaks and write the result.

    Assumes that the peaks on one strand are mutually exclusive, i.e. they do
    not overlap with each other. In that case sorting the peaks by start also
    sorts them by end, which the bisect-based candidate search relies on.

    fpeaks / rpeaks map a chromosome name to its list of peak records
    (element [0] is written as the first GFF column, [1] is the start,
    [2] the end). fwig / rwig map a chromosome name to its wig array.
    Chromosomes that are missing from rpeaks are skipped.

    For each forward peak, every reverse peak that ends at or after
    start - ulimit and starts at or before end + dlimit is scored with
    getScore. Peaks are then matched so that each peak has at most one mate
    and a higher score always wins over a lower one.

    Output (written per chromosome):
      prefix + "_singletons_dev.bed"  tab-separated records of unmatched peaks
      prefix + "_pairs_dev.gff"       one row per matched pair

    Side effects: element [1] of the peak records of every processed
    chromosome is decremented in place; progress is printed to stdout.
    Returns None.
    '''
    # Single formatted argument keeps the printed text identical whether
    # print is a statement or a function.
    print("%s %s" % ("fwig: ", fwig))
    print("%s %s" % ("rwig: ", rwig))
    offset = 5
    expandCol = 1
    # `with` guarantees both files are closed even if scoring fails midway.
    with open(prefix + "_singletons_dev.bed", 'w') as out1, \
            open(prefix + "_pairs_dev.gff", "w") as out2:
        print(fpeaks.keys())
        for chrom in fpeaks:
            if chrom not in rpeaks:
                continue
            print(chrom)
            singletons, pairs = _pair_chrom(
                chrom, fpeaks[chrom], rpeaks[chrom], fwig, rwig,
                ulimit, dlimit, offset, expandCol)
            print("%s %s" % ("singletons: ", len(singletons)))
            print("%s %s" % ("pairs: ", len(pairs)))
            for s in singletons:
                out1.write('\t'.join([str(i) for i in s]))
                out1.write('\n')
            for p in pairs:
                out2.write('\t'.join([str(i) for i in p]))
                out2.write('\n')