def findInsertions(bwFile, bedData, interval, x):
    """Fetch the bigWig signal for a window around one BED record.

    Args:
        bwFile: Path to the bigWig file.
        bedData: Indexable collection of BED records; record ``x`` provides
            the chromosome, start and end in fields 0, 1 and 2.
        interval: ``'start'`` or ``'end'`` to anchor both window edges on that
            coordinate; any other value spans the record from start to end.
        x: Index of the record inside ``bedData``.

    Returns:
        The value returned by ``BigWigFile.get_as_array`` for the window, or
        an all-NaN array of the window length when the lookup raises
        ``OverflowError``.

    The window is extended by ``options.l`` on the left and ``options.r`` on
    the right, where ``options`` is a module-level object.
    """
    record = bedData[x]
    if interval == 'start':
        left_anchor = right_anchor = int(record[1])
    elif interval == 'end':
        left_anchor = right_anchor = int(record[2])
    else:
        left_anchor, right_anchor = int(record[1]), int(record[2])
    sL = left_anchor - options.l
    sR = right_anchor + options.r

    # The context manager guarantees the handle is released even when the
    # lookup fails with something other than OverflowError.
    with open(bwFile, "rb") as f:
        bigwig_class = BigWigFile(f)
        try:
            signal = bigwig_class.get_as_array(record[0], sL, sR)
        except OverflowError:
            signal = np.array([np.nan] * (sR - sL))

    # NOTE(review): an earlier revision computed a nan-mean here and then
    # overwrote it with the raw signal before returning; the dead mean
    # computation was dropped and the raw signal is still what is returned.
    return signal
Ejemplo n.º 2
0
def get_mean_phastcons(bedtool, phastcons_location, sample_size=1000):
    """Get mean phastcons scores for a random sample of intervals.

    Args:
        bedtool: Bedtool to extract intervals from; must support ``len()``
            and ``random_subset(n)``.
        phastcons_location: Location of the phastcons bigWig file.
        sample_size: Upper bound on the number of intervals sampled.

    Returns:
        A list with one mean per sampled interval (0 for an empty value
        array). Intervals whose lookup yields ``None`` are skipped.
    """
    data = []

    # bigWig is a binary format, so the file is opened in binary mode.
    with open(phastcons_location, 'rb') as bw_file:
        bw = BigWigFile(bw_file)
        n_intervals = min(len(bedtool), sample_size)

        for bedline in bedtool.random_subset(n_intervals):
            conservation_values = bw.get_as_array(bedline.chrom, bedline.start,
                                                  bedline.stop)
            # The original relied on len(None) raising TypeError and swallowed
            # it; the skip is now explicit so unrelated TypeErrors surface.
            if conservation_values is None:
                continue
            if len(conservation_values) > 0:
                data.append(np.mean(conservation_values))
            else:
                data.append(0)
    return data
Ejemplo n.º 3
0
def getsignal(inputfile, outputfile, pcut, pspan):
    """Append total cut count (TC) and a footprint score (FOS) to each row.

    Args:
        inputfile: Whitespace-separated interval file (chrom, start, end, ...).
            The width of the first record is used for every record.
        outputfile: Path of the tab-separated output file.
        pcut: Path to the bigWig file with cut counts.
        pspan: Half-width of the window summarised around each record centre.

    For every record, ``2 * pspan`` bins centred on ``start + width // 2``
    are fetched. TC is their sum; FOS is
    ``-((C + 1) / (R + 1) + (C + 1) / (L + 1))`` where C, L and R are the
    sums over the central ``width`` bins and the ``width`` bins on either
    side. An empty input file produces an empty output file.
    """
    with open(pcut, 'rb') as bw_handle, open(inputfile) as inf:
        pcutbw = BigWigFile(bw_handle)

        first_line = inf.readline()
        if not first_line:
            # Nothing to process: still create the (empty) output file.
            open(outputfile, 'w').close()
            return
        first_fields = first_line.split()
        ml = int(first_fields[2]) - int(first_fields[1])
        # Floor division keeps the offsets usable as slice indices regardless
        # of whether "/" is integer or true division in the interpreter.
        half = ml // 2
        core = pspan - half
        inf.seek(0)

        with open(outputfile, 'w') as outf:
            for line in inf:
                ll = line.split()
                centre = int(ll[1]) + half
                cut = list(
                    pcutbw.summarize(ll[0], centre - pspan, centre + pspan,
                                     2 * pspan).sum_data)
                TC = sum(cut)
                C = sum(cut[core:core + ml])
                L = sum(cut[core - ml:core])
                R = sum(cut[core + ml:core + 2 * ml])
                FOS = -1 * ((C + 1) / (R + 1) + (C + 1) / (L + 1))
                newll = ll + [TC, FOS]
                outf.write("\t".join(map(str, newll)) + "\n")
Ejemplo n.º 4
0
def Readbw(bwfile, chrm, start, end, n):
    """Return per-bin mean signal for a region of a bigWig file.

    Args:
        bwfile: Path to the bigWig file.
        chrm: Chromosome name.
        start: Region start (converted with ``int``).
        end: Region end (converted with ``int``).
        n: Bin width; the region is summarised into ``(end - start) // n``
            bins.

    Returns:
        A list with ``sum_data / sudocount(valid_count)`` for each bin, where
        ``sudocount`` is a helper defined elsewhere in the module.
    """
    region_start, region_end = int(start), int(end)
    with open(bwfile, 'rb') as handle:
        summary = BigWigFile(handle).summarize(
            chrm, region_start, region_end, (region_end - region_start) // n)
    # A real list (not a lazy map object) so the element-wise division works
    # on every interpreter; also avoids shadowing the builtin ``sum``.
    counts = [sudocount(valid) for valid in summary.valid_count]
    return list(summary.sum_data / counts)
Ejemplo n.º 5
0
def getChromatinDataSeries(bigwigFile, libraryTable, sgInfoTable, tssTable, colname = '', naValue = 0):
    """Build a Series of mean bigWig signal over each sgRNA's footprint.

    Args:
        bigwigFile: Path to the bigWig file.
        libraryTable: Table whose index becomes the index of the result.
        sgInfoTable: Table iterated row by row; rows need ``gene_name``,
            ``transcript_list``, ``strand``, ``pam coordinate`` and ``length``.
        tssTable: Table whose ``chromosome`` column maps
            ``(gene_name, joined transcript list)`` to a chromosome.
        colname: Name given to the returned Series.
        naValue: Replacement for missing scores.

    Returns:
        A ``pd.Series`` of nan-means, with NaN (rows absent from
        ``tssTable`` or without signal) replaced by ``naValue``.
    """
    chromDict = tssTable['chromosome'].to_dict()
    chromatinScores = []

    # bigWig is binary; the handle is closed once all rows are processed.
    with open(bigwigFile, 'rb') as bigwig_handle:
        bwindex = BigWigFile(bigwig_handle)

        for name, sgInfo in sgInfoTable.iterrows():
            geneTup = (sgInfo['gene_name'], ','.join(sgInfo['transcript_list']))

            if geneTup not in chromDict:  # negative controls
                chromatinScores.append(np.nan)
                continue

            pam = sgInfo['pam coordinate']
            if sgInfo['strand'] == '+':
                sgRange = pam + sgInfo['length']
            else:
                sgRange = pam - sgInfo['length']

            chrom = chromDict[geneTup]
            chromatinArray = bwindex.get_as_array(chrom, min(pam, sgRange), max(pam, sgRange))
            if chromatinArray is not None and len(chromatinArray) > 0:
                chromatinScores.append(np.nanmean(chromatinArray))
            else:  # often chrY when using K562 data..
                chromatinScores.append(np.nan)

    chromatinSeries = pd.Series(chromatinScores, index=libraryTable.index, name = colname)

    return chromatinSeries.fillna(naValue)
Ejemplo n.º 6
0
def createMappabilityList(fragmentsMap, bwfile, fragmentCount, options):
    """Return the mean bigWig mappability of every fragment.

    Args:
        fragmentsMap: Mapping of fragment id to ``(chrom, start, end)``;
            ids are used as indices into the result array.
        bwfile: Path to the mappability bigWig file.
        fragmentCount: Length of the returned array.
        options: Object with a boolean ``vverbose`` attribute that enables
            progress and error output.

    Returns:
        A float array of length ``fragmentCount``. Entries stay 0 for
        fragments that are absent, whose mean is NaN, or whose query failed.
    """
    # lazy load
    from bx.bbi.bigwig_file import BigWigFile

    # keep record which fragment has decent mappability
    # (``np.float`` was only an alias of the builtin and is gone from
    # current NumPy releases)
    mappable = np.zeros((fragmentCount, ), dtype=float)

    with open(bwfile, 'rb') as bw_handle:
        bw = BigWigFile(bw_handle)

        for fragmentId, (chrom, start, end) in fragmentsMap.items():

            if options.vverbose:
                sys.stdout.write("- process %s %d-%d \n" % (chrom, start, end))

            try:
                mappable[fragmentId] = bw.query(chrom, start, end, 1)[0]["mean"]
                if np.isnan(mappable[fragmentId]):
                    mappable[fragmentId] = 0
            except Exception:
                # Best effort: a failed query counts as unmappable, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                mappable[fragmentId] = 0.
                if options.vverbose:
                    sys.stderr.write("Problem with bw file at %s %d-%d\n" % (
                        chrom, start, end))
                    sys.stdout.write(traceback.format_exc() + "\n")

    return mappable
Ejemplo n.º 7
0
def main():
    """Annotate each BED line with the mean bigWig signal over its interval.

    Positional arguments: input BED path, bigWig path, output BED path.
    Every input line is echoed to stdout and written to the output with the
    mean (rounded to 5 decimals) appended as an extra tab-separated column.
    With ``--debug`` only the first 10 lines are processed.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-D', '--debug', action='store_true', dest='debug')
    options, args = p.parse_args()
    if len(args) < 3:
        p.error('expected three arguments: BED file, bigWig file, output file')
    debug_c = 0

    # NOTE(review): the 'rU' mode is kept from the original; it is rejected by
    # Python 3.11+ - switch to 'r' if this script is run there.
    with open(args[0], 'rU') as BEDFILE, \
            open(args[1], 'rb') as bw_handle, \
            open(args[2], 'w') as BEDout:
        BW = BigWigFile(file=bw_handle)

        for line in BEDFILE:
            print(line)
            line = line.strip().split('\t')
            x = BW.query(line[0], int(line[1]), int(line[2]), 1)
            line.append(str(round(x[0]['mean'], 5)))
            BEDout.write("\t".join(line) + "\n")

            if options.debug:
                debug_c += 1
                if debug_c >= 10:
                    # The original left this ``break`` unindented, which made
                    # the whole file fail to compile.
                    break


# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
Ejemplo n.º 8
0
def summary(bwfile,bedfile,topnumber,out):
    """Rank BED intervals by mean bigWig signal and write the strongest ones.

    Args:
        bwfile: Path to the bigWig file.
        bedfile: Whitespace-separated interval file with at least 4 columns;
            column 4 is overwritten with ``"-"``.
        topnumber: Maximum number of top-ranked intervals to write.
        out: Path of the tab-separated output file.

    Intervals on chromosomes missing from the bigWig chromosome tree are
    dropped. Each output row is the (modified) input row, its mean signal,
    and a comma-joined list of per-base sums over the interval. Timing of
    both passes is printed to stdout.
    """
    p = BwIO(bwfile)
    known_chroms = set(node['key'] for node in p.chromosomeTree['nodes'])

    total_result = []
    with open(bwfile, 'rb') as bw_file:
        bwHandle = BigWigFile(bw_file)

        t = time.time()
        with open(bedfile) as inf:
            for line in inf:
                ll = line.split()
                ll[3] = "-"
                if ll[0] not in known_chroms:
                    continue
                # Named ``stats`` so the function's own name is not shadowed.
                stats = bwHandle.summarize(ll[0], int(ll[1]), int(ll[2]), 1)
                if stats.valid_count == 0:
                    mean_value = 0
                else:
                    mean_value = (stats.sum_data / stats.valid_count)[0]
                total_result.append(ll + [mean_value])
        total_result.sort(reverse=True, key=lambda record: record[-1])

        with open(out, 'w') as outf:
            print("scaning 1st  %s" % (time.time() - t,))
            t = time.time()
            # Slicing (instead of indexing range(topnumber)) tolerates a
            # request for more records than were collected.
            for ll in total_result[:max(topnumber, 0)]:
                width = int(ll[2]) - int(ll[1])
                stats = bwHandle.summarize(ll[0], int(ll[1]), int(ll[2]), width)
                additional_value = ",".join(map(str, list(stats.sum_data)))
                result = map(str, (ll + [additional_value]))
                outf.write("\t".join(result) + "\n")
    print("scaning 2nd  %s" % (time.time() - t,))
Ejemplo n.º 9
0
def main():
    """Combine two bigWig files position by position into a wig file.

    Both tracks are read chunk by chunk for every chromosome in the
    chromosome-size file; the function of ``twoList`` named by ``--action``
    merges the two chunks, and every non-zero result is written as a
    ``variableStep`` entry. Help is printed (exit status 0) when any of the
    required options, including ``--action``, is missing.
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)

    parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="BigWig files")
    parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="BigWig files")
    parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file")
    parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options,args)=parser.parse_args()

    # --action is required too: without it getattr() below would be handed
    # None and fail with an unhelpful TypeError.
    if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig
            and options.chromSize and options.action):
        parser.print_help()
        sys.exit(0)

    # Resolve the merge function once instead of once per chunk.
    call_back = getattr(twoList, options.action)
    chrom_sizes = load_chromsize(options.chromSize)

    with open(options.output_wig, 'w') as OUT, \
            open(options.BigWig_File1, 'rb') as bw_handle1, \
            open(options.BigWig_File2, 'rb') as bw_handle2:
        bw1 = BigWigFile(file=bw_handle1)
        bw2 = BigWigFile(file=bw_handle2)

        for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
            sys.stderr.write("Processing " + chr_name + " ...\n")
            OUT.write('variableStep chrom=' + chr_name + '\n')
            for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size, stepSize=options.chunk_size):
                coord = interval[1]
                bw_signal1 = bw1.get_as_array(chr_name, interval[1], interval[2])
                bw_signal2 = bw2.get_as_array(chr_name, interval[1], interval[2])
                if all_nan(bw_signal1) and all_nan(bw_signal2):
                    continue
                bw_signal1 = replace_nan(bw_signal1)
                bw_signal2 = replace_nan(bw_signal2)

                for v in call_back(bw_signal1, bw_signal2):
                    # wig coordinates are 1-based, hence the pre-increment.
                    coord += 1
                    if v != 0:
                        OUT.write("%d\t%.2f\n" % (coord, v))
Ejemplo n.º 10
0
def get_mean_phastcons(bedtool, phastcons_location):
    """Get mean phastcons scores for all intervals in a bed tool.

    Args:
        bedtool: Bedtool to extract intervals from; must support ``len()``
            and iteration.
        phastcons_location: Location of the phastcons bigWig file.

    Returns:
        A float array with one mean per interval, in iteration order. The
        entry is 0 when the value array is empty or the lookup yields
        ``None`` (the original crashed on ``None``).
    """
    # Zero-initialised so intervals without data keep the 0 default.
    data = np.zeros(len(bedtool))

    # bigWig is binary; the handle is closed when the loop finishes.
    with open(phastcons_location, 'rb') as f:
        bw = BigWigFile(file=f)
        for i, bedline in enumerate(bedtool):
            conservation_values = bw.get_as_array(bedline.chrom, bedline.start, bedline.stop)
            if conservation_values is not None and len(conservation_values) > 0:
                data[i] = np.mean(conservation_values)

    return data
Ejemplo n.º 11
0
def createMappabilityList(fragmentsMap, bwfile, fragmentCount, options):
    """Return the mean bigWig mappability of every fragment.

    Args:
        fragmentsMap: Mapping of fragment id to ``(chrom, start, end)``;
            ids are used as indices into the result array.
        bwfile: Path to the mappability bigWig file.
        fragmentCount: Length of the returned array.
        options: Object with a boolean ``vverbose`` attribute that enables
            progress and error output.

    Returns:
        A float array of length ``fragmentCount``. Entries stay 0 for
        fragments that are absent, whose mean is NaN, or whose query failed.
    """
    # lazy load
    from bx.bbi.bigwig_file import BigWigFile

    # keep record which fragment has decent mappability
    # (``np.float`` was only an alias of the builtin and is gone from
    # current NumPy releases)
    mappable = np.zeros((fragmentCount,), dtype=float)

    with open(bwfile, 'rb') as bw_handle:
        bw = BigWigFile(bw_handle)

        for fragmentId, (chrom, start, end) in fragmentsMap.items():

            if options.vverbose:
                sys.stdout.write("- process %s %d-%d \n" % (chrom, start, end))

            try:
                mappable[fragmentId] = bw.query(chrom, start, end, 1)[0]["mean"]
                if np.isnan(mappable[fragmentId]):
                    mappable[fragmentId] = 0
            except Exception:
                # Best effort: a failed query counts as unmappable, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                mappable[fragmentId] = 0.
                if options.vverbose:
                    sys.stderr.write("Problem with bw file at %s %d-%d\n" % (chrom, start, end))
                    sys.stdout.write(traceback.format_exc() + "\n")

    return mappable
Ejemplo n.º 12
0
def get_mean_phastcons(bedtool, phastcons_location, sample_size = 1000):
    """Get mean phastcons scores for a random sample of intervals.

    Args:
        bedtool: Bedtool to extract intervals from; must support ``len()``
            and ``random_subset(n)``.
        phastcons_location: Location of the phastcons bigWig file.
        sample_size: Upper bound on the number of intervals sampled.

    Returns:
        A list with one mean per sampled interval (0 for an empty value
        array). Intervals whose lookup yields ``None`` are skipped.
    """
    data = []

    # bigWig is a binary format, so the file is opened in binary mode.
    with open(phastcons_location, 'rb') as bw_file:
        bw = BigWigFile(bw_file)
        n_intervals = min(len(bedtool), sample_size)

        for bedline in bedtool.random_subset(n_intervals):
            conservation_values = bw.get_as_array(bedline.chrom, bedline.start, bedline.stop)
            # The original relied on len(None) raising TypeError and swallowed
            # it; the skip is now explicit so unrelated TypeErrors surface.
            if conservation_values is None:
                continue
            if len(conservation_values) > 0:
                data.append(np.mean(conservation_values))
            else:
                data.append(0)
    return data
def count_cut_nmers(fp, w_minus, lflank, rflank, single_nmer_cutoff, sequence,
                    offset):
    """Count minus-strand cuts associated with each n-mer.

    Args:
        fp: Iterable of whitespace-separated interval lines (chrom, start,
            end, ...).
        w_minus: Path to the minus-strand cut bigWig file.
        lflank: Number of bases taken to the left of each position.
        rflank: Number of bases taken to the right of each position.
        single_nmer_cutoff: Positions with a cut count above this value are
            ignored.
        sequence: Path to the 2bit genome file.
        offset: Position of the cut to be associated with each n-mer; with
            ``offset = 0`` the first base of the tag is lined up with the
            n-mer start.

    Returns:
        ``(seq_nmer_dict, cut_nmer_dict)``: how often each n-mer was seen,
        and the summed cut counts per n-mer. Keys are ``rev(n_seq)``, where
        ``rev`` is a helper defined elsewhere; n-mers containing ``N`` are
        skipped.
    """
    genome = twobitreader.TwoBitFile(sequence)
    nmer_len = lflank + rflank

    # occurrences of each n-mer, and cuts accumulated on each n-mer
    seq_nmer_dict = {}
    cut_nmer_dict = {}

    with open(w_minus, 'rb') as bw_handle:
        w_minus_H = BigWigFile(bw_handle)

        for line in fp:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            nseq = genome[chrm][(start - lflank - offset):(end + rflank -
                                                           offset)].upper()
            cn = list(w_minus_H.summarize(chrm, start, end, end - start).sum_data)

            for k, n_cut in enumerate(cn):
                n_seq = nseq[(k + 1):(k + nmer_len + 1)]
                if 'N' not in n_seq and n_cut <= single_nmer_cutoff:
                    rev_n_seq = rev(n_seq)
                    # dict.get replaces the bare ``except:`` the original
                    # used to initialise missing keys.
                    cut_nmer_dict[rev_n_seq] = cut_nmer_dict.get(rev_n_seq, 0) + n_cut
                    seq_nmer_dict[rev_n_seq] = seq_nmer_dict.get(rev_n_seq, 0) + 1
    return seq_nmer_dict, cut_nmer_dict
Ejemplo n.º 14
0
def refine_with_summit(_soft,_mark,_tissue):
    """Re-centre peaks on their bigWig summit and run the enrichment scripts.

    Args:
        _soft: Peak-caller name used in the BED file names.
        _mark: Mark name used in the BED and bigWig file names.
        _tissue: Tissue name used in the BED and bigWig file names.

    For each peak the bigWig interval with the highest positive value is
    located and a 2 kb window around its midpoint is written to the
    enrich_bed file (``1-2000`` when the window would start at or before 0).
    The windows are then sorted and merged with bedtools, and
    ``get_enrich.sh`` is invoked through the external ``sh`` helper.
    """
    with open("/Data/adam/dnase/top_bed/{0}.{1}.{2}.bed"\
              .format(_soft,_mark,_tissue)) as _peak_file:
        _temp_peak = [i.rstrip().split('\t') for i in _peak_file]

    with open("/Data/adam/dnase/bigwig/{0}.{1}.rep0.bw".format(_mark,_tissue), 'rb') as _temp_bw, \
            open("/Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue),'w') as _temp_enrich:
        _bw = BigWigFile(file=_temp_bw)

        for line in _temp_peak:
            vals = tuple(_bw.get(line[0],int(line[1]),int(line[2])))
            # Reset per peak: the original kept ``summit`` from the previous
            # peak (or hit an unbound name) when no value exceeded 0.
            summit = None
            maxs = 0
            for _key in vals:
                if float(_key[2])>maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            if summit is None:
                # NOTE(review): peaks without any positive signal are skipped
                # rather than reusing a stale summit - confirm this is wanted.
                continue
            summit_p=int((float(summit[0])+float(summit[1]))/2)
            if summit_p-1000>0:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0],str(summit_p-1000),str(summit_p+999)))
            else:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0],1,2000))

    sh('sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed| bedtools merge -i stdin\
     >/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed'.format(_soft,_mark,_tissue))

    sh('bash ../get_enrich.sh /Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed {1} {2} {3}'\
       .format(_soft,_mark,_tissue,_soft))
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """Append a 60-bin signal profile per bigWig file to every interval.

    Args:
        inputfile: Whitespace-separated interval file (chrom, start, end,
            ...; column 6, when present, is the strand).
        output: Path of the tab-separated output file.
        bwfiles: Comma-separated bigWig file names; names not starting with
            ``/`` are resolved relative to ``bwfolder``.
        bwfolder: Folder holding the bigWig files (``./`` when empty).
        extend: Width of the flanking region on each side of the interval.

    For each bigWig file, 20 bins upstream, 20 bins over the interval and
    20 bins downstream are summarised and divided by the bin length. The 60
    values are reversed for ``-`` strand records and replaced by zeros when
    the lookup fails. Records whose chromosome name contains ``_`` are
    skipped.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    n_bins = 20
    no_signal = [0] * (3 * n_bins)

    bw_files = []
    try:
        bwHs = []
        for sb in signalbw:
            bw_path = sb if sb.startswith('/') else bwfolder + sb
            bw_file = open(bw_path, 'rb')
            bw_files.append(bw_file)
            bwHs.append(BigWigFile(bw_file))

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                S = int(ll[1])
                E = int(ll[2])
                reverse = len(ll) >= 6 and ll[5] == "-"

                for bwHandle in bwHs:
                    try:
                        signal1 = bwHandle.summarize(ll[0], max(0, S - extend), S, n_bins)
                        signal2 = bwHandle.summarize(ll[0], S, E, n_bins)
                        signal3 = bwHandle.summarize(ll[0], E, E + extend, n_bins)
                        binlen1 = extend * 1.0 / n_bins
                        binlen2 = (E - S) * 1.0 / n_bins
                        binlen3 = extend * 1.0 / n_bins
                        # ``type(x) == None`` in the original was always
                        # False; an identity test is what was meant.
                        if (signal1.sum_data is None
                                or signal2.sum_data is None
                                or signal3.sum_data is None):
                            addsig = list(no_signal)
                        else:
                            addsig = (list(signal1.sum_data / binlen1)
                                      + list(signal2.sum_data / binlen2)
                                      + list(signal3.sum_data / binlen3))
                    except Exception:
                        # Best effort: any lookup failure yields a zero profile.
                        addsig = list(no_signal)
                    ll.extend(addsig[::-1] if reverse else addsig)
                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for bw_file in bw_files:
            bw_file.close()
Ejemplo n.º 16
0
def getsignal(inputfile,outputfile,BGmatrix,pcut,ncut,Ipcut,Incut,pspan,tspan,gen,left,right,fetch_length=100):
    """Write each input record extended with cut profiles and background values.

    For every record of ``inputfile`` the output row is the original fields
    followed by ``TC``, ``FOS`` (from ``makeTCFOS``), the four ``make_cut``
    results (plus/minus for ``pcut``/``ncut`` and ``Ipcut``/``Incut``) and the
    per-position background lookups from ``pBG``/``nBG``. Records are skipped
    when the first cut result is ``'NA'`` or the fetched sequence contains
    ``N``. For ``-`` strand records the plus/minus results are swapped and
    the background lists are swapped and reversed.

    ``make_cut``, ``readBG`` and ``makeTCFOS`` are defined elsewhere; their
    exact return shapes are not visible here.
    """
    
 #   p=BwIO(pcut)
 #   chrom_len = {}
 #   for i in p.chromosomeTree['nodes']:
 #       chrom_len[i['key']] = i['chromSize']
    genome = twobitreader.TwoBitFile(gen)
    # NOTE(review): these four bigWig handles are never closed explicitly.
    pcutbw = BigWigFile(open(pcut, 'rb'))
    ncutbw = BigWigFile(open(ncut, 'rb'))
    Ipcutbw = BigWigFile(open(Ipcut, 'rb'))
    Incutbw = BigWigFile(open(Incut, 'rb'))

    inf = open(inputfile)    
    # The width of the first record is taken as the width of every record.
    testll = inf.readline().split()
    ml = int(testll[2]) - int(testll[1])
    # NOTE(review): ``ml/2`` is integer division only under Python 2; under
    # Python 3 ``pspan`` becomes a float and the slice below would fail.
    pspan = pspan - ml/2
    inf.seek(0)
    pBG,nBG = readBG(BGmatrix)
    outf = open(outputfile,'w')
    for line in inf:
        ll = line.split()

        chrom = ll[0]
        start = int(ll[1])
        end = int(ll[2])
        strand = ll[5]
        # Sequence covering the record plus ``pspan`` and the k-mer flanks.
        seq = genome[chrom][(start-pspan-left):(end + pspan+right)]
        pout = make_cut(pcutbw,ll,pspan,fetch_length)
        nout = make_cut(ncutbw,ll,pspan,fetch_length)
        Ipout = make_cut(Ipcutbw,ll,pspan,fetch_length)
        Inout = make_cut(Incutbw,ll,pspan,fetch_length)

        if strand == "-":
            pout,nout = nout,pout
            Ipout,Inout = Inout,Ipout
        # NOTE(review): only ``pout`` (after the strand swap) is checked for
        # 'NA'; the other three results are assumed usable - confirm.
        if pout == 'NA':
            continue        

        if 'N' in seq.upper():
            continue
        #print 1
        # The two strands use windows shifted by one base against each other.
        pseq = seq[:-1]
        nseq = seq[1:]
        p=[]
        n=[]
        # Look up the background value of every (left+right)-mer.
        for k in range(len(pseq)  +1 - left-right):
            p.append(pBG[pseq[k:k+left+right].upper()])
            n.append(nBG[nseq[k:k+left+right].upper()])
        if strand != '-':
            pbglist = p
            nbglist = n
        else:
            pbglist = n[::-1]
            nbglist = p[::-1]
        TC,FOS = makeTCFOS(pcutbw,ncutbw,ll,tspan,ml)
        newll = ll  + [TC,FOS] + pout + nout + Ipout + Inout + pbglist + nbglist
        outf.write("\t".join(map(str,newll))+"\n")
    outf.close()
    inf.close()
Ejemplo n.º 17
0
def Main():
    """Average a bigWig track in fixed windows and export BED, wig and bigWig.

    Reads the command-line arguments via ``ParseArg`` (stored in the global
    ``args``), averages the signal of every chromosome in windows of
    ``args.window`` bases (NaN counted as 0), optionally smooths the window
    means with ``Smooth``, writes them to ``<output>/<name>.bed`` and
    ``<output>/<name>.wig``, and finally shells out to ``wigToBigWig`` to
    produce ``<output>/<name>.bw``.
    """
    global args
    args = ParseArg()
    # NOTE(review): opened in text mode and never closed - confirm this is
    # acceptable for the interpreter/platform in use.
    bw = BigWigFile(open(args.bigwig))
    CheckFolderExist(args.output)
    fout = WriteToFile(args.output + '/' + args.name + '.bed')
    wout = WriteToFile(args.output + '/' + args.name + '.wig')
    genome = LoadGenome(args.genome)
    if args.smooth:
        logging("Options: turn on smooth mode")
    for chrom in SortGenome(genome):
        chrom_size = genome[chrom]
        logging("Process: %s\t%d" % (chrom, chrom_size))
        array = bw.get_as_array(chrom, 0, chrom_size)
        # NOTE(review): this would fail if get_as_array returned None -
        # confirm every chromosome of the genome exists in the bigWig.
        invalid = np.isnan(array)
        array[invalid] = 0
        # Mean signal per window; the last window is clipped to the array end.
        agg_array = []
        start = 0
        stop = args.window
        for nn in range(int(math.ceil(len(array) / float(args.window)))):
            if stop >= len(array):
                stop = len(array)
                agg_array.append(np.mean(array[start:stop]))
                break
            agg_array.append(np.mean(array[start:stop]))
            start += args.window
            stop += args.window
        agg_array = np.array(agg_array)
        if args.smooth:
            smooth_array = Smooth(agg_array)
        else:
            smooth_array = agg_array
        print >> wout, "variableStep chrom=%s span=%d" % (chrom, args.window)
        for nn, value in enumerate(smooth_array):
            if nn == 0:
                # First window: BED starts at 0, wig position is 1.
                print >> fout, "%s\t0\t%d\t%.6f" % (chrom,
                                                    (nn + 1) * args.window,
                                                    float(value))
                print >> wout, "%d\t%.6f" % (nn + 1, value)
            elif nn == len(smooth_array) - 1:
                # Last window: ends at the chromosome size, so the wig gets a
                # new header with the shortened span.
                print >> fout, "%s\t%d\t%d\t%.6f" % (chrom, nn * args.window,
                                                     chrom_size, float(value))
                print >> wout, "variableStep chrom=%s span=%d" % (
                    chrom, chrom_size - ((nn) * args.window))
                print >> wout, "%d\t%.6f" % (nn * args.window + 1,
                                             float(value))
            else:
                print >> fout, "%s\t%d\t%d\t%.6f" % (chrom, nn * args.window,
                                                     (nn + 1) * args.window,
                                                     float(value))
                print >> wout, "%d\t%.6f" % (nn * args.window + 1,
                                             float(value))
    # Flush before the external converter reads the wig file.
    fout.flush()
    wout.flush()
    # NOTE(review): the command is built by string interpolation and run
    # through the shell; paths containing spaces or shell metacharacters
    # would break it.
    wig2bw = "wigToBigWig -clip %s %s %s" % (args.output + '/' + args.name +
                                             '.wig', args.genome, args.output +
                                             '/' + args.name + '.bw')
    os.system(wig2bw)
    logging("Finish: TSA_smooth DONE!!!")
Ejemplo n.º 18
0
    def summarize(self, interval, bins=None, method='summarize',
                  function='mean'):
        """Return the bigWig signal of ``self.fn`` over ``interval``.

        Args:
            interval: Object with ``chrom``, ``start`` and ``stop``.
            bins: Number of bins; ``None`` requests per-base values.
            method: ``'get_as_array'`` for per-base values,
                ``'ucsc_summarize'`` to delegate to ``self.ucsc_summarize``,
                anything else for binned summaries.
            function: ``'sum'``, ``'mean'``, ``'min'``, ``'max'`` or
                ``'std'`` for binned summaries; ``'mean'``, ``'min'``,
                ``'max'``, ``'std'`` or ``'coverage'`` for
                ``'ucsc_summarize'``.

        Returns:
            An array with NaN/inf replaced by 0 (zeros when no data is
            returned). For an unrecognised ``function`` in binned mode the
            raw summary object is returned unchanged.

        Raises:
            ValueError: ``method='ucsc_summarize'`` with an unsupported
                ``function``.
        """
        # We may be dividing by zero in some cases, which raises a warning in
        # NumPy based on the IEEE 754 standard. That's OK -- we're expecting
        # that to happen sometimes. ``errstate`` silences it for this method
        # only and restores the previous setting on every exit path (the
        # original restored the saved 'invalid' value under the 'divide' key
        # and skipped the restore on early return/raise).
        with np.errstate(invalid='ignore'):
            if (bins is None) or (method == 'get_as_array'):
                with open(self.fn, 'rb') as handle:
                    s = BigWigFile(handle).get_as_array(
                        interval.chrom,
                        interval.start,
                        interval.stop,)
                if s is None:
                    s = np.zeros((interval.stop - interval.start,))
                else:
                    s[np.isnan(s)] = 0

            elif method == 'ucsc_summarize':
                if function in ['mean', 'min', 'max', 'std', 'coverage']:
                    return self.ucsc_summarize(interval, bins,
                                               function=function)
                raise ValueError('function "%s" not supported by UCSC\'s '
                                 'bigWigSummary' % function)

            else:
                with open(self.fn, 'rb') as handle:
                    s = BigWigFile(handle).summarize(
                        interval.chrom,
                        interval.start,
                        interval.stop, bins)
                if s is None:
                    s = np.zeros((bins,))
                elif function == 'sum':
                    s = s.sum_data
                elif function == 'mean':
                    s = s.sum_data / s.valid_count
                    s[np.isnan(s)] = 0
                elif function == 'min':
                    s = s.min_val
                    s[np.isinf(s)] = 0
                elif function == 'max':
                    s = s.max_val
                    s[np.isinf(s)] = 0
                elif function == 'std':
                    # NOTE(review): this is the mean of squares, not a
                    # standard deviation - kept as in the original.
                    s = (s.sum_squares / s.valid_count)
                    s[np.isnan(s)] = 0

        return s
Ejemplo n.º 19
0
def wig_reader(infile, chrom_sizes=None, informat='wiggle', bin_size=2000):
    '''Yield (chrom, start, end, score) runs from a wiggle or bigwig file.

    infile: either a wiggle or bigwig format file (for bigwig, a path)
    chrom_sizes: mapping of chrom_name to size, only needed if the format
        is bigwig
    informat: either 'wiggle' or 'bigwig' (case-insensitive)
    bin_size: chunk size used to walk each chromosome of a bigwig file

    For bigwig input, NaN is treated as 0, consecutive positions with the
    same score are merged into one run, and chunks whose values sum to 0 are
    skipped. Runs never span a chunk boundary.

    Raises Exception for an unknown format.
    '''
    fmt = informat.upper()
    if fmt == 'WIGGLE':
        for chrom, start, end, strand, score in bx.wiggle.IntervalReader(
                infile):
            yield (chrom, start, end, score)

    elif fmt == 'BIGWIG':
        # bigWig is binary; the handle is closed when the generator finishes.
        with open(infile, 'rb') as bw_handle:
            bw_obj = BigWigFile(file=bw_handle)
            for chr_name, chr_size in list(chrom_sizes.items()):
                for chrom, st, end in BED.tillingBed(chrName=chr_name,
                                                     chrSize=chr_size,
                                                     stepSize=bin_size):
                    sig_list = bw_obj.get_as_array(chrom, st, end)
                    if sig_list is None:
                        continue
                    sig_list = numpy.nan_to_num(sig_list)
                    if numpy.sum(sig_list) == 0:
                        continue
                    low_bound = st
                    point_num = 1
                    score = sig_list[0]
                    for value in sig_list[1:]:
                        if value == score:
                            point_num += 1
                        else:
                            yield (chrom, low_bound, low_bound + point_num,
                                   score)
                            score = value
                            low_bound = low_bound + point_num
                            point_num = 1
                    # Emit the run still open at the end of the chunk; the
                    # original dropped it, losing the tail of every chunk.
                    yield (chrom, low_bound, low_bound + point_num, score)
    else:
        raise Exception("Unknown format. Must be 'wiggle' or 'bigwig'")
Ejemplo n.º 20
0
def get_signal(inputfile, output, bwfiles, extend, N, bwfolder):
    """Append an N-bin signal profile per bigWig file to every interval.

    Args:
        inputfile: Whitespace-separated interval file (chrom, start, end,
            ...; column 6, when present, is the strand).
        output: Path of the tab-separated output file.
        bwfiles: Comma-separated bigWig file names. Names starting with
            ``/``, ``./`` or ``../`` are used as given; all others are
            resolved relative to ``bwfolder``.
        extend: Half-width of the window around the anchor position.
        N: Number of bins per profile.
        bwfolder: Folder holding the bigWig files (``./`` when empty).

    The anchor is the interval end for ``-`` strand records and the start
    otherwise. Each profile is the per-bin sum divided by the bin length,
    reversed for ``-`` strand records, or ``"na"`` values when the lookup
    fails. Records whose chromosome name contains ``_`` are skipped.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    if not bwfolder:
        bwfolder = "./"
    # The original condition carried a double negative that could never be
    # true, so the separator was never appended.
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    bw_files = []
    try:
        bwHs = []
        for sb in signalbw:
            # The original called a bare ``startswith(...)`` (NameError) for
            # the relative prefixes.
            if sb.startswith(('/', './', '../')):
                bw_path = sb
            else:
                bw_path = bwfolder + sb
            bw_file = open(bw_path, 'rb')
            bw_files.append(bw_file)
            bwHs.append(BigWigFile(bw_file))

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                reverse = len(ll) >= 6 and ll[5] == "-"
                start = int(ll[2]) if reverse else int(ll[1])
                S = max(0, start - extend)
                E = start + extend

                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, N)
                        binlen = (E - S) * 1.0 / N
                        # ``type(x) == None`` in the original was always
                        # False; an identity test is what was meant.
                        if signal.sum_data is None:
                            print('c1 ' + line)
                            addsig = ["na"] * N
                        else:
                            addsig = list(signal.sum_data * 1.0 / binlen)
                    except Exception:
                        # Best effort: any lookup failure yields "na" bins.
                        print('c2 ' + line)
                        addsig = ["na"] * N
                    ll.extend(addsig[::-1] if reverse else addsig)

                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for bw_file in bw_files:
            bw_file.close()
Ejemplo n.º 21
0
def bigwig_to_wav(args):
    """Convert the bigWig signal of every BED region into a wav file.

    ``args`` must provide ``bigwig_file``, ``bed_file``, ``output_dir``,
    ``smooth`` (``'boxcar'``, ``'gaussian'`` or ``'none'``), ``window_size``,
    ``zoom``, ``sample_rate`` and ``n_channels``.

    For each BED line the signal is fetched (NaN replaced by 0), resampled
    with ``scipy.ndimage.zoom``, optionally smoothed and rescaled, and handed
    to the external ``modulate`` helper, which writes
    ``<output_dir>/<chrom>:<start>-<end>.wav``.
    """
    import numpy as np
    from bx.bbi.bigwig_file import BigWigFile
    from scipy.signal import convolve
    from scipy.stats import norm
    from scipy.ndimage import zoom

    logger.info('read input BigWigfile: ' + args.bigwig_file)
    logger.info('read input BED file: ' + args.bed_file)
    # Both inputs are closed once every region has been processed.
    with open(args.bigwig_file, 'rb') as f_bigwig, \
            open(args.bed_file, 'r') as f_bed:
        bigwig = BigWigFile(f_bigwig)

        smooth_filter = None
        scale_factors = None
        if args.smooth == 'boxcar':
            smooth_filter = np.ones(args.window_size, dtype=np.float32)
        elif args.smooth == 'gaussian':
            smooth_filter = norm.pdf(
                np.linspace(-3, 3, args.window_size * 3,
                            endpoint=True)).astype(np.float32)
        if args.smooth != 'none':
            # Response of the filter to a constant signal, used below to
            # rescale the edges of the smoothed signal.
            scale_factors = convolve(np.ones(smooth_filter.shape[0]),
                                     smooth_filter,
                                     mode='same')

        if not os.path.exists(args.output_dir):
            logger.info('create output directory: ' + args.output_dir)
            os.makedirs(args.output_dir)

        for line in f_bed:
            c = line.strip().split('\t')
            chrom = c[0]
            start = int(c[1])
            end = int(c[2])
            x = np.nan_to_num(bigwig.get_as_array(chrom, start, end))
            # zoom the signals
            x = zoom(x, args.zoom)
            if args.smooth != 'none':
                # smooth the raw signal with a moving window
                x = convolve(x, smooth_filter, mode='same')
                # scale the signal; floor division keeps the bounds usable
                # as slice indices under true division as well
                filter_length = smooth_filter.shape[0]
                head = filter_length // 2
                tail = -filter_length // 2
                x[:head] /= scale_factors[:head]
                x[tail:] /= scale_factors[tail:]
                if x.shape[0] > filter_length:
                    x[head:tail] /= np.sum(smooth_filter)

            wav_file = os.path.join(args.output_dir,
                                    '%s:%d-%d.wav' % (chrom, start, end))
            logger.info('create wav file: ' + wav_file)
            modulate(x,
                     wav_file,
                     sample_rate=args.sample_rate,
                     n_channels=args.n_channels)
Ejemplo n.º 22
0
def refine_with_summit(_soft,_mark,_tissue,_reps):
    """Re-centre peaks on their bigWig summit and rebuild the precision table.

    All inputs and outputs are hard-coded paths under ``/Data/adam/dnase``:

    1. Every peak of ``sort_bed/<soft>.<mark>.<tissue>.bed`` is looked up in
       the bigWig; the midpoint of the highest-valued interval becomes the
       summit, and the window ``summit-1000 .. summit+999`` (or ``1 .. 2000``
       when ``summit-1000`` is not positive) is written to ``enrich_all_bed``.
    2. The windows are sorted, merged and numbered, and ``roc_pr.sh`` is run,
       both through the external ``sh`` helper.
    3. ``roc_pr_value/<soft>.<mark>.<tissue>.bed`` is read back.  Rows whose
       columns 3-4 are zero receive the last seen negative counts, rows whose
       columns 1-2 are zero receive the last seen positive counts, a
       precision column ``col1 / (col1 + col3)`` is appended and the rows are
       written to ``roc_pr_final``.

    _soft, _mark, _tissue -- name components used to build the paths.
    _reps -- NOTE(review): used here as the third component of the bigWig
        file name.  The original formatted a three-field pattern with only
        two values (an unconditional IndexError) and never used ``_reps``;
        confirm that this is the intended file name.
    """
    def _is_zero(field):
        # Columns arrive as text, so compare numerically; anything that is
        # not a number is simply "not zero".
        try:
            return float(field) == 0
        except ValueError:
            return False

    peak_path = "/Data/adam/dnase/sort_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue)
    with open(peak_path) as peak_file:
        peaks = [row.rstrip().split('\t') for row in peak_file]

    bw_path = "/Data/adam/dnase/bigwig/{0}.{1}.{2}.bw".format(_mark,_tissue,_reps)
    enrich_path = "/Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue)
    with open(bw_path, 'rb') as bw_handle, open(enrich_path, 'w') as enrich:
        _bw = BigWigFile(file=bw_handle)
        for peak in peaks:
            vals = _bw.get(peak[0],int(peak[1]),int(peak[2]))
            if not vals:
                continue
            # Highest-valued interval wins; max() keeps the first on ties.
            # Unlike the original running maximum (seeded with 0) this also
            # yields a summit when no value is positive, instead of reusing
            # the previous peak's summit or failing on an unbound name.
            best = max(vals, key=lambda interval: float(interval[2]))
            summit_p = int((float(best[0])+float(best[1]))/2)
            if summit_p-1000>0:
                enrich.write("{0}\t{1}\t{2}\n".format(peak[0],str(summit_p-1000),str(summit_p+999)))
            else:
                enrich.write("{0}\t{1}\t{2}\n".format(peak[0],1,2000))

    awk_args='{printf "%s\\t%s\\n", $0,NR}'
    sh("sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed| bedtools merge -i stdin\
     | awk '{3}'>/Data/adam/dnase/enrich_all_merge_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue,awk_args))
    enhancer_dir ="/Data/adam/dnase/enhancer/tissue_enhancer/"
    sh("bash /Data/adam/dnase/enhancer/roc_pr.sh {0}{1}_enhancer.txt {0}negative_enhancer.txt \
    {2} {3} {1}".format(enhancer_dir, _tissue,_soft,_mark))

    pr_path = "/Data/adam/dnase/roc_pr_value/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue)
    with open(pr_path) as pr_file:
        raw_pr = [row.rstrip().split('\t') for row in pr_file]

    pr_refine = []
    # Kept as text so the rows can be joined without conversion.
    temp_positive = ['0','0']
    temp_negative = ['0','0']
    for _line in raw_pr:
        out_line = _line[:]
        if _is_zero(_line[3]) and _is_zero(_line[4]):
            temp_positive = _line[1:3]
            out_line[3:5] = temp_negative
        elif _is_zero(_line[1]) and _is_zero(_line[2]):
            temp_negative = _line[3:5]
            out_line[1:3] = temp_positive
        else:
            print("error in {0}.{1}.{2}, {3}".format(_soft,_mark,_tissue,_line))
            continue
        hits = float(out_line[1])
        total = hits + float(out_line[3])
        # Precision is undefined when both counts are zero.
        precision = hits/total if total else float('nan')
        out_line.append(str(precision))
        pr_refine.append(out_line)

    with open("/Data/adam/dnase/roc_pr_final/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue),'w') as f:
        for out_line in pr_refine:
            f.write('\t'.join(map(str, out_line)) + "\n")
Ejemplo n.º 23
0
def evaluateTC(args):
  """Count the signal in a window around the midpoint of one region.

  args -- a single 4-tuple ``(signalFileName, chrom, start, end)``.  It is
      unpacked in the body because tuple parameters in the signature are
      Python 2 only; callers still pass one tuple.

  The window is ``halfWindow`` (module level) on each side of the midpoint,
  clipped at 0 on the left.  Returns the integer sum of
  ``correctBW(bw.get(chrom, p1, p2), p1, p2)``, or 0 if that computation
  raises.
  """
  signalFileName, chrom, start, end = args
  # Floor division keeps the midpoint an int on Python 2 and 3 alike.
  mid = (int(start)+int(end))//2
  p1 = max(mid - halfWindow,0)
  p2 = mid + halfWindow
  # bigWig is a binary format; the context manager closes the file even if
  # BigWigFile() itself raises.
  with open(signalFileName,"rb") as signalFile:
    bw = BigWigFile(signalFile)
    try: nCount = int(sum(correctBW(bw.get(chrom,p1,p2),p1,p2)))
    except Exception: nCount = 0
  return nCount
Ejemplo n.º 24
0
def get_signal(inputfile, output, bwfiles, bwfolder, extend, N):
    """Append binned bigWig signal around each region's midpoint.

    inputfile -- whitespace-separated regions: chrom, start, end, and
        optionally a strand in column 6.
    output -- path of the tab-separated file to write.
    bwfiles -- comma-separated bigWig names; names not starting with ``/``
        are looked up in ``bwfolder``.
    bwfolder -- directory for relative names; empty means ``./``.
    extend -- half-width of the window around the midpoint.
    N -- number of bins per window.

    For every region whose chromosome name has no ``_`` the window
    ``[max(0, mid - extend), mid + extend)`` is summarised into ``N`` bins per
    bigWig.  Each bin sum is divided by the nominal bin length
    ``2 * extend / N`` and appended to the input columns, reversed when
    column 6 is ``-``.  If a summary cannot be obtained, ``N`` zeros are
    appended instead.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    binlen = extend * 2.0 / N
    bw_handles = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            bw_handles.append(open(path, 'rb'))
        bwHs = [BigWigFile(handle) for handle in bw_handles]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                # Skip blank lines and chromosomes with "_" in their name.
                if not ll or "_" in ll[0]:
                    continue
                # Floor division: same midpoint on Python 2 and 3.
                C = (int(ll[1]) + int(ll[2])) // 2
                S = max(0, C - extend)
                E = C + extend
                # Decide the orientation before ll grows, so that an appended
                # value can never be mistaken for the strand column.
                reverse = len(ll) >= 6 and ll[5] == "-"

                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, N)
                        if signal is None:
                            # The original tested `type(...) == None`, which
                            # can never be true; this is the no-data case it
                            # was aiming for.
                            addsig = [0] * N
                        else:
                            addsig = list(signal.sum_data / binlen)
                    except Exception:
                        # Best effort: a failing track contributes zeros.
                        addsig = [0] * N
                    ll.extend(addsig[::-1] if reverse else addsig)
                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for handle in bw_handles:
            handle.close()
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """Write per-base bigWig signal for every region of a BED-like file.

    inputfile -- whitespace-separated regions: chrom, start, end.
    output -- path of the tab-separated file to write.
    bwfiles -- comma-separated bigWig names; names not starting with ``/``
        are looked up in ``bwfolder``.
    bwfolder -- directory for relative names.  Empty now means ``./``; the
        original turned an empty value into ``/`` and so resolved relative
        names against the filesystem root.
    extend -- unused; kept so existing callers keep working.

    For every region whose chromosome name has no ``_``, each bigWig is
    summarised with one bin per base and one output line
    ``chrom, pos, pos + 1, value...`` is written per base, values rounded to
    four decimals.  A track whose summary is unavailable contributes ``NA``.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    bw_handles = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            bw_handles.append(open(path, 'rb'))
        bwHs = [BigWigFile(handle) for handle in bw_handles]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                # Skip blank lines and chromosomes with "_" in their name.
                if not ll or "_" in ll[0]:
                    continue
                S = int(ll[1])
                E = int(ll[2])
                outdata = []
                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, (E - S))
                        if signal:
                            # A real list: map() is lazy on Python 3 and
                            # could not be indexed per position.
                            thisdata = [round(v, 4) for v in signal.sum_data]
                        else:
                            thisdata = ["NA"] * (E - S)
                    except Exception:
                        # Best effort: a failing track is reported as NA.
                        thisdata = ["NA"] * (E - S)
                    outdata.append(thisdata)

                # zip(*outdata) walks all tracks position by position.
                for pos, values in enumerate(zip(*outdata)):
                    newll = [ll[0], S + pos, S + pos + 1]
                    newll.extend(values)
                    outf.write("\t".join(map(str, newll)) + "\n")
    finally:
        for handle in bw_handles:
            handle.close()
Ejemplo n.º 26
0
def big_wig_summary_worker(args):
    """Compute the mean bigWig signal of a slice of BED regions.

    args -- a single 5-tuple ``(span, bw_list, region_bed_file_name,
        nb_proc, verbose)``.  It is unpacked in the body because tuple
        parameters in the signature are Python 2 only; callers still pass
        one tuple.

        span -- ``(first, last)`` slice bounds into the regions of the BED
            file.
        bw_list -- paths of the bigWig files to query.
        region_bed_file_name -- BED file read with pybedtools.
        nb_proc -- unused here.
        verbose -- when true, progress messages go to stderr.

    Returns a list of ``("chrom:start", label, mean)`` tuples, where
    ``label`` is the bigWig file name without directory and extension and
    ``mean`` is rounded to two decimals (NaN becomes 0; 0.0 when the query
    yields nothing).
    """
    span, bw_list, region_bed_file_name, nb_proc, verbose = args

    results = list()
    bw_label = [os.path.splitext(os.path.basename(p))[0] for p in bw_list]

    if verbose:
        sys.stderr.write("Processing: " + region_bed_file_name)

    for cpt, big_wig in enumerate(bw_list):
        # bigWig is binary; the context manager releases the handle once
        # this file has been processed.
        with open(big_wig, "rb") as bw_handle:
            bigwig = BigWigFile(bw_handle)
            if verbose:
                sys.stderr.write("Computing coverage for file: " + big_wig + " [" +
                                 str(multiprocessing.current_process()) + "], " +
                                 str(span[1] - span[0]) + " chunks to process.\n")

            bed_windows = pybedtools.BedTool(region_bed_file_name)

            # Loop through bed lines (features object)
            for i in bed_windows[slice(span[0], span[1])]:
                # Note: bigWig is zero-based/half open as bed.
                bw_sum = bigwig.query(i.chrom, i.start, i.end, 1)

                if bw_sum is not None:
                    bw_sum = bw_sum[0]['mean']
                    bw_sum = np.nan_to_num(bw_sum)
                    bw_sum = np.round(bw_sum, 2)
                else:
                    bw_sum = 0.00

                results.append(
                    (i.chrom + ":" + str(i.start), bw_label[cpt], float(bw_sum)))

    # Guarded: with an empty bw_list there is no "last file" to report.
    if verbose and bw_list:
        sys.stderr.write("Computing coverage for file: " + bw_list[-1] + " [" +
                         str(multiprocessing.current_process()) +
                         "]. Job done.\n")

    return results
Ejemplo n.º 27
0
def get_mappability(mappability_file,
                    vcf_file,
                    out_file,
                    region=None,
                    append_chr=True):
    """Annotate every VCF record with the mean mappability around it.

    mappability_file -- bigWig file holding the mappability track.
    vcf_file -- VCF file read with ``vcf.Reader``.
    out_file -- destination passed to
        ``csvutils.write_dataframe_to_csv_and_yaml``.
    region -- optional region string, parsed with ``parse_region_for_vcf``,
        restricting the records; if the VCF has no data for it a message is
        printed and no records are processed.
    append_chr -- when true, ``chr`` is prefixed to the record's chromosome
        for the bigWig lookup (the output keeps the VCF name).

    For each record the mean over ``[POS - 100, POS + 100)`` (start clipped
    at 0) is queried; a query that yields ``None`` is recorded as 0.
    """
    data = []

    # The context manager closes the bigWig once all records are annotated;
    # the original never closed it.
    with open(mappability_file, 'rb') as map_handle:
        map_reader = BigWigFile(map_handle)

        vcf_reader = vcf.Reader(filename=vcf_file)

        if region is not None:
            chrom, beg, end = parse_region_for_vcf(region)
            try:
                vcf_reader = vcf_reader.fetch(chrom, start=beg, end=end)
            except ValueError:
                print("no data for region {} in vcf".format(region))
                vcf_reader = []

        for record in vcf_reader:
            if append_chr:
                chrom = 'chr{0}'.format(record.CHROM)
            else:
                chrom = record.CHROM

            coord = record.POS
            beg = max(coord - 100, 0)
            end = coord + 100

            result = map_reader.query(chrom, beg, end, 1)
            mappability = 0 if result is None else result[0]['mean']

            data.append({
                'chrom': record.CHROM,
                'coord': record.POS,
                'mappability': mappability
            })

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes())
Ejemplo n.º 28
0
def main():
    """Append a bigWig score column to a tab-separated file.

    Command line: ``input output loc_file loc_key chrom_col start_col``
    (column numbers are 1-based).  The bigWig path is looked up with
    ``LocationFile(loc_file).get_values(loc_key)``.

    For each input line with enough columns, the score of the single base at
    ``start`` is appended formatted as ``%1.3f``, or ``NA`` when the bigWig
    has no value there.  More than one value for a single base is reported
    through ``die``.  Lines with too few columns are copied unchanged.
    """
    input_filename, output_filename, loc_filename, loc_key, chrom_col, start_col = sys.argv[
        1:]

    # open input, output, and bigwig files
    location_file = LocationFile(loc_filename)
    bigwig_filename = location_file.get_values(loc_key)
    bwfh = open_or_die(bigwig_filename,
                       message='Error opening BigWig file %s' %
                       bigwig_filename)
    bw = BigWigFile(file=bwfh)
    ifh = open_or_die(input_filename,
                      message='Error opening input file %s' % input_filename)
    ofh = open_or_die(output_filename,
                      mode='w',
                      message='Error opening output file %s' % output_filename)

    try:
        # make column numbers 0-based
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        min_cols = max(chrom_col, start_col)

        # add score column to input file
        for line_number, line in enumerate(ifh, 1):
            line = line.rstrip('\r\n')
            elems = line.split('\t')
            if len(elems) > min_cols:
                chrom = elems[chrom_col].strip()
                # base-0 position in chrom
                start = int(elems[start_col])
                # NOTE(review): `or []` treats a None result (presumably an
                # unknown chromosome) like "no value" - confirm against the
                # bx-python version in use.
                score_list = bw.get(chrom, start, start + 1) or []
                score_list_len = len(score_list)
                if score_list_len == 1:
                    beg, end, score = score_list[0]
                    score_val = '%1.3f' % score
                elif score_list_len == 0:
                    score_val = 'NA'
                else:
                    die('%s line %d: chrom=%s, start=%d, score_list_len = %d' %
                        (input_filename, line_number, chrom, start,
                         score_list_len))
                # write() instead of the Python 2-only `print >> ofh`.
                ofh.write('\t'.join([line, score_val]) + '\n')
            else:
                ofh.write(line + '\n')
    finally:
        # Close everything even when a line fails to parse.
        bwfh.close()
        ifh.close()
        ofh.close()
Ejemplo n.º 29
0
def extract_phastcons ( bedfile, phas_chrnames, width, pf_res ):
    """Extract phastcons scores from a bed file.

    bedfile -- BED file parsed with ``parse_BED``.
    phas_chrnames -- chromosome names for which a ``<chrom>.bw`` file exists
        in the current directory; only those also present in the BED file
        are used, in this order.
    width -- total window width around each region's midpoint (integer).
    pf_res -- resolution; the window is split into ``width // pf_res`` bins.

    Return the average score per bin over all regions, ignoring NaN.  If any
    bin has no valid value at all, a list of zeros is returned instead.
    """
    n_bins = width // pf_res
    half = width // 2

    info("read bed file...")
    sumscores = []
    with open(bedfile) as bfhd:
        bed = parse_BED(bfhd)

        # Only membership matters here, so a set replaces the original
        # keys()/sort() pair (which fails on Python 3).
        bchrs = set(bed.peaks.keys())
        chrs = [c for c in phas_chrnames if c in bchrs]

        for chrom in chrs:
            info("processing chromosome: %s" %chrom)
            pchrom = bed.peaks[chrom]
            with open(chrom+'.bw', 'rb') as bw_handle:
                bw = BigWigFile(bw_handle)
                for peak in pchrom:
                    # middle point of the region, extended left and right
                    # by 1/2 width
                    mid = int((peak[0]+peak[1]) // 2)
                    left = mid - half
                    right = mid + half

                    if left < 0:
                        left = 0
                        right = width

                    summarize = bw.summarize(chrom, left, right, n_bins)
                    if not summarize:
                        continue
                    sumscores.append(summarize.sum_data / summarize.valid_count)

    ## one list per bin, holding that bin's score from every region,
    ## with NaN excluded
    columns = [[v for v in column if not math.isnan(v)]
               for column in zip(*sumscores)]
    try:
        conscores = [sum(column)/len(column) for column in columns]
    except ZeroDivisionError:
        conscores = [0] * n_bins

    return conscores
Ejemplo n.º 30
0
def extract_phastcons ( bedfile, phas_chrnames, width, pf_res ):
    """Extract phastcons scores from a bed file.

    bedfile -- BED file parsed with ``parse_BED``.
    phas_chrnames -- chromosome names for which a ``<chrom>.bw`` file exists
        in the current directory; only those also present in the BED file
        are used, in this order.
    width -- total window width around each region's midpoint (integer).
    pf_res -- resolution; the window is split into ``width // pf_res`` bins.

    Return the average score per bin over all regions, ignoring NaN.  If any
    bin has no valid value at all, a list of zeros is returned instead.
    """
    n_bins = width // pf_res
    half = width // 2

    info("read bed file...")
    sumscores = []
    with open(bedfile) as bfhd:
        bed = parse_BED(bfhd)

        # Only membership matters here, so a set replaces the original
        # keys()/sort() pair (which fails on Python 3).
        bchrs = set(bed.peaks.keys())
        chrs = [c for c in phas_chrnames if c in bchrs]

        for chrom in chrs:
            info("processing chromosome: %s" %chrom)
            pchrom = bed.peaks[chrom]
            with open(chrom+'.bw', 'rb') as bw_handle:
                bw = BigWigFile(bw_handle)
                for peak in pchrom:
                    # middle point of the region, extended left and right
                    # by 1/2 width
                    mid = int((peak[0]+peak[1]) // 2)
                    left = mid - half
                    right = mid + half

                    if left < 0:
                        left = 0
                        right = width

                    summarize = bw.summarize(chrom, left, right, n_bins)
                    if not summarize:
                        continue
                    sumscores.append(summarize.sum_data / summarize.valid_count)

    ## one list per bin, holding that bin's score from every region,
    ## with NaN excluded
    columns = [[v for v in column if not math.isnan(v)]
               for column in zip(*sumscores)]
    try:
        conscores = [sum(column)/len(column) for column in columns]
    except ZeroDivisionError:
        conscores = [0] * n_bins

    return conscores
Ejemplo n.º 31
0
def summary(bwfile1, bwfile2, bwfile_add, bedfile, topnumber, out):
    """Rank regions by combined mean signal and dump per-base profiles.

    bwfile1, bwfile2 -- the two bigWig files used for ranking.
    bwfile_add -- iterable of further bigWig paths whose per-base profile is
        appended to every reported region.
    bedfile -- whitespace-separated regions; column 4 is overwritten with
        ``-`` (so at least four columns are required).
    topnumber -- number of top-ranked regions to write.
    out -- output path.

    First pass: for every region whose chromosome is present in both ranking
    bigWigs, the mean signal of each (0 when there is no valid base) is
    computed and their sum is stored as the ranking score.  Second pass: the
    ``topnumber`` highest-scoring regions are written, tab-separated, as the
    input columns, the score and one comma-joined per-base ``sum_data``
    profile per bigWig (bwfile1, bwfile2, then bwfile_add in order).  The
    elapsed time of each pass is printed.
    """
    def chrom_sizes(path):
        return dict((node['key'], node['chromSize'])
                    for node in BwIO(path).chromosomeTree['nodes'])

    def mean_signal(bw, chrom, start, end):
        stats = bw.summarize(chrom, start, end, 1)
        if stats.valid_count == 0:
            return 0
        return (stats.sum_data / stats.valid_count)[0]

    def per_base(bw, chrom, start, end):
        stats = bw.summarize(chrom, start, end, end - start)
        return ",".join(map(str, list(stats.sum_data)))

    handles = []

    def open_bw(path):
        # Remember every handle so that all of them are closed at the end.
        handle = open(path, 'rb')
        handles.append(handle)
        return BigWigFile(handle)

    total_result = []
    chrom_len1 = chrom_sizes(bwfile1)
    chrom_len2 = chrom_sizes(bwfile2)
    try:
        bwHandle1 = open_bw(bwfile1)
        bwHandle2 = open_bw(bwfile2)
        t = time.time()
        with open(bedfile) as inf:
            for line in inf:
                ll = line.split()
                ll[3] = "-"
                if ll[0] in chrom_len1 and ll[0] in chrom_len2:
                    start, end = int(ll[1]), int(ll[2])
                    mean_value1 = mean_signal(bwHandle1, ll[0], start, end)
                    mean_value2 = mean_signal(bwHandle2, ll[0], start, end)
                    total_result.append(ll + [mean_value1 + mean_value2])
        total_result.sort(reverse=True, key=lambda x: x[-1])
        bwHs = [open_bw(path) for path in bwfile_add]
        with open(out, 'w') as outf:
            # A single formatted string prints the same on Python 2 and 3.
            print("scaning 1st  %s" % (time.time() - t))
            t = time.time()
            for ll in total_result[:max(0, topnumber)]:
                start, end = int(ll[1]), int(ll[2])
                result = [str(field) for field in ll]
                for bw in [bwHandle1, bwHandle2] + bwHs:
                    result.append(per_base(bw, ll[0], start, end))
                outf.write("\t".join(result) + "\n")
        print("scaning 2nd  %s" % (time.time() - t))
    finally:
        for handle in handles:
            handle.close()
Ejemplo n.º 32
0
def check_position(chrom, start, end):
    """Tell whether enough tracks cover the region ``[start, end]``.

    Every ``*.bigWig`` / ``*.bw`` file in any directory under
    ``DATAPATH + "data"`` is summarised over ``[start, end + 1)``.  A track
    counts as valid when its number of covered bases is at least 10% of the
    region length.  Returns True when at least 75% of the tracks are valid,
    and False when no track is found at all (the original divided by zero in
    that case).
    """
    valids = 0
    wrong = 0
    region_length = end - start + 1
    for directory in [x[0] for x in os.walk(DATAPATH + "data")]:
        for filename in glob(directory + "/*.bigWig") + glob(directory +
                                                             "/*.bw"):
            # bigWig is binary; close each track as soon as it is summarised
            # so a large data directory does not exhaust file descriptors.
            with open(filename, "rb") as f:
                summary = BigWigFile(file=f).summarize(chrom, start, end + 1, 1)
            # NOTE(review): a missing summary (presumably an unknown
            # chromosome) is counted as "not covered" - confirm.
            if summary is None or summary.valid_count * 10 < region_length:
                wrong += 1
            else:
                valids += 1
    total = valids + wrong
    if total == 0:
        return False
    return (valids / float(total) >= 0.75)
Ejemplo n.º 33
0
def get_signal(inputfile,output,signalbw,extend):
    """Append the mean bigWig signal around each region's midpoint.

    inputfile -- whitespace-separated regions: chrom, start, end, ...
    output -- path of the tab-separated file to write.
    signalbw -- comma-separated bigWig paths.
    extend -- number of bases added on each side of the midpoint base.

    Regions whose chromosome is absent from the first bigWig are skipped.
    For the others one column per bigWig is appended: the mean over
    ``[max(0, mid - extend), mid + 1 + extend)``, or ``0`` when no base is
    covered.  A region is dropped entirely if summarising it fails for any
    bigWig.
    """
    signalbw = signalbw.strip().strip(',').split(',')

    p=BwIO(signalbw[0])
    chrom_len = {}
    for i in p.chromosomeTree['nodes']:
        chrom_len[i['key']] = i['chromSize']

    handles = []
    try:
        for k in signalbw:
            handles.append(open(k, 'rb'))
        bwHandle = [BigWigFile(handle) for handle in handles]

        with open(inputfile) as inf, open(output,'w') as outf:
            for line in inf:
                ll = line.split()
                # Skip blank lines and chromosomes unknown to the first track.
                if not ll or ll[0] not in chrom_len:
                    continue
                # Floor division: same midpoint on Python 2 and 3.
                mid = (int(ll[1]) + int(ll[2]))//2
                S = max(0, mid-extend)
                E = mid + 1 + extend
                values = []
                for bwH in bwHandle:
                    try:
                        signal=bwH.summarize(ll[0],S,E,1)
                    except Exception:
                        break
                    if float(signal.valid_count) == 0:
                        values.append('0')
                    else:
                        values.append(str(float(signal.sum_data/signal.valid_count)))
                else:
                    # Reached only when no track failed (no `break`).
                    outf.write("\t".join(ll + values)+"\n")
    finally:
        for handle in handles:
            handle.close()
def test_summaries_from_file():
    """Yield one summary check per line of ``test.expectation``.

    Each expectation line holds ``chrom start end n type values...``; the
    check summarises the region into ``n`` bins and compares the mean, min
    or max (according to ``type``) with the listed values, where ``n/a``
    stands for NaN.  Other types are not checked.
    """
    # bigWig is a binary format, so open it in binary mode.
    bw = BigWigFile(file=open("test_data/bbi_tests/test.bw", 'rb'))

    def check_summary(line):
        fields = line.split()
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        n = int(fields[3])
        t = fields[4]
        values = [float(v.replace('n/a', 'NaN')) for v in fields[5:]]
        sd = bw.summarize(chrom, start, end, n)
        if t == 'mean':
            # Single-argument print() behaves the same on Python 2 and 3.
            print(sd.sum_data / sd.valid_count)
            print(values)
            assert allclose(sd.sum_data / sd.valid_count, values)
        elif t == 'min':
            assert allclose(sd.min_val, values)
        elif t == 'max':
            assert allclose(sd.max_val, values)
        #elif t == 'std':
        #    assert numpy.allclose( sd.max_val, values )

    for line in open("test_data/bbi_tests/test.expectation"):
        yield check_summary, line
Ejemplo n.º 35
0
class BigWigWrapper(object):
    """A wrapper for bx-python BigWig file.

    Indexing the wrapper with an interval object (anything exposing
    ``chrom``, ``start`` and ``end``) returns
    ``get_as_array(chrom, start, end)`` of the wrapped file.
    """
    def __init__(self, filepath):
        # bigWig is a binary format; text mode breaks on Python 3 (and on
        # Windows with Python 2).  The handle stays open for the wrapper's
        # lifetime because the bigWig is read lazily on each lookup.
        self.bw = BigWigFile(open(filepath, 'rb'))

    def __getitem__(self, iv):
        return self.bw.get_as_array(iv.chrom, iv.start, iv.end)
Ejemplo n.º 36
0
    def build(self):
        """
        Build the matrix.

        Since bigWig files are essentially pre-summarized, this just extracts
        the chrom/start/stop represented by each cell in the matrix and fills
        it with the value from the bigWig file.

        For every chromosome (all of ``self.chromdict`` when ``self.chrom``
        is ``'genome'``, otherwise just ``self.chrom``) the range from
        ``self.chromdict`` is summarised into the number of bins given by
        ``self.chrom2rc()``; the per-bin mean, with NaN replaced by 0, is
        written to the matrix cells listed for that chromosome.
        """
        # bigWig is a binary format, so open it in binary mode.
        self.bigwig = BigWigFile(open(self.file, 'rb'))

        chrom_rc, chrom_bins = self.chrom2rc()

        if self.chrom == 'genome':
            chroms = self.chromdict.keys()

        else:
            chroms = [self.chrom]

        for chrom in chroms:
            rc = chrom_rc[chrom]
            nbins = chrom_bins[chrom]

            start, stop = self.chromdict[chrom]
            results = self.bigwig.summarize(chrom, start, stop, nbins)
            # Bins without valid bases divide by zero; the resulting NaN is
            # zeroed just below, so the numpy warning is only noise.
            with np.errstate(divide='ignore', invalid='ignore'):
                values = results.sum_data / results.valid_count
            values[np.isnan(values)] = 0

            self.matrix[rc[:,0], rc[:, 1]] = values

        self._cleanup()
Ejemplo n.º 37
0
def test_summaries_from_file():
    """Generate one described check per line of ``test.expectation``.

    An expectation line reads ``chrom start end n type values...``.  Each
    generated check summarises the region into ``n`` bins and compares the
    mean, min or max (chosen by ``type``) against the listed values, with
    ``n/a`` read as NaN; any other type is accepted without comparison.
    """
    bigwig = BigWigFile(file=open("test_data/bbi_tests/test.bw", 'rb'))

    def check_summary(line):
        tokens = line.split()
        chrom = tokens[0]
        start, end, n = int(tokens[1]), int(tokens[2]), int(tokens[3])
        stat = tokens[4]
        expected = [float(tok.replace('n/a', 'NaN')) for tok in tokens[5:]]
        stats = bigwig.summarize(chrom, start, end, n)
        if stat == 'mean':
            # Show both sides to ease debugging of a failing comparison.
            print(stats.sum_data / stats.valid_count)
            print(expected)
            observed = stats.sum_data / stats.valid_count
        elif stat == 'min':
            observed = stats.min_val
        elif stat == 'max':
            observed = stats.max_val
        else:
            # e.g. 'std' - not verified.
            return
        assert allclose(observed, expected)

    for index, text in enumerate(open("test_data/bbi_tests/test.expectation")):
        case = partial(check_summary, text)
        case.description = "Test summaries line %d: %s" % (index, text[:40])
        yield (case, )
Ejemplo n.º 38
0
def load_annos(args):
    """
    Populate a dictionary of Tabixfile handles for
    each annotation file.  Other modules can then
    access a given handle and fetch data from it
    as follows:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)

    Files ending in ``.gz`` are opened with ``pysam.Tabixfile`` and files
    ending in ``.bw`` with ``BigWigFile``; other files are ignored.  The
    handles are stored in the module-level ``annos`` mapping.  The process
    exits with an explanatory message if a file cannot be opened.
    """
    anno_files = get_anno_files(args)
    for anno in anno_files:
        path = anno_files[anno]
        try:
            # .gz denotes Tabix files.
            if path.endswith(".gz"):
                annos[anno] = pysam.Tabixfile(path)
            # .bw denotes BigWig files (binary, hence 'rb').
            elif path.endswith(".bw"):
                annos[anno] = BigWigFile(open(path, 'rb'))

        except IOError:
            # "\\#" yields the same text as the original "\#" without
            # relying on an invalid escape sequence.
            sys.exit("Gemini cannot open this annotation file: %s. \n"
                     "Have you installed the annotation files?  If so, "
                     "have they been moved or deleted? Exiting...\n\n"
                     "For more details:\n\t"
                     "http://gemini.readthedocs.org/en/latest/content/"
                     "#installation.html\\#installing-annotation-files\n" %
                     path)
Ejemplo n.º 39
0
class BigWigWrapper(object):

    """A wrapper for bx-python BigWig file.

    Indexing the wrapper with an interval object (anything exposing
    ``chrom``, ``start`` and ``end``) returns
    ``get_as_array(chrom, start, end)`` of the wrapped file.
    """

    def __init__(self, filepath):
        # bigWig is a binary format; text mode breaks on Python 3 (and on
        # Windows with Python 2).  The handle stays open for the wrapper's
        # lifetime because the bigWig is read lazily on each lookup.
        self.bw = BigWigFile(open(filepath, 'rb'))

    def __getitem__(self, iv):
        return self.bw.get_as_array(iv.chrom, iv.start, iv.end)
Ejemplo n.º 40
0
def get_phastcons(bedtool, phastcons_location, species=None, index=None, ):
    
    """
    
    Get phastcons scores for intervals in a bed tool
    
    bedtool -- either a single interval (an object with ``chrom``, ``start``
        and ``stop``) or a sized iterable of such intervals.
    phastcons_location -- path of the phastcons bigWig file.
    species, index -- not used for the lookup; a message is printed when
        both are None.
    
    Returns the mean score (0 when there is no value) as a scalar for a
    single interval, or as an array with one entry per interval otherwise.
    
    """
    
    if species is None and index is None:
        print("Error, must select species or index")

    def mean_score(bw, interval):
        # NOTE(review): `or []` treats a None result (presumably an unknown
        # chromosome) like "no values" - confirm.
        vals = bw.get(interval.chrom, interval.start, interval.stop) or []
        consvals = [val[-1] for val in vals]
        if len(consvals) > 0:
            return np.mean(consvals)
        return 0

    # bigWig is binary; the context manager closes the file on every path.
    with open(phastcons_location, 'rb') as f:
        bw = BigWigFile(file=f)

        # A single interval exposes `chrom`; probing for it replaces the
        # original bare `except`, which also hid genuine lookup errors.
        if hasattr(bedtool, 'chrom'):
            return mean_score(bw, bedtool)

        # Allocate once: the original re-created the array on every
        # iteration, so only the last entry was ever meaningful.
        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = mean_score(bw, bedline)

    #returns mean phastcons score for each line 
    #returns inconsistent data types, need to convert so it just returns an array 
    return data
Ejemplo n.º 41
0
def main():
    """Append a bigWig score column to a tab-separated file.

    Command line: ``input output loc_file loc_key chrom_col start_col``
    (column numbers are 1-based).  The bigWig path is looked up with
    ``LocationFile(loc_file).get_values(loc_key)``.

    For each input line with enough columns, the score of the single base at
    ``start`` is appended formatted as ``%1.3f``, or ``NA`` when the bigWig
    has no value there.  More than one value for a single base is reported
    through ``die``.  Lines with too few columns are copied unchanged.
    """
    input_filename, output_filename, loc_filename, loc_key, chrom_col, start_col = sys.argv[1:]

    # open input, output, and bigwig files
    location_file = LocationFile( loc_filename )
    bigwig_filename = location_file.get_values( loc_key )
    bwfh = open_or_die( bigwig_filename, message='Error opening BigWig file %s' % bigwig_filename )
    bw = BigWigFile( file=bwfh )
    ifh = open_or_die( input_filename, message='Error opening input file %s' % input_filename )
    ofh = open_or_die( output_filename, mode='w', message='Error opening output file %s' % output_filename )

    try:
        # make column numbers 0-based
        chrom_col = int( chrom_col ) - 1
        start_col = int( start_col ) - 1
        min_cols = max( chrom_col, start_col )

        # add score column to input file
        for line_number, line in enumerate( ifh, 1 ):
            line = line.rstrip( '\r\n' )
            elems = line.split( '\t' )
            if len( elems ) > min_cols:
                chrom = elems[chrom_col].strip()
                # base-0 position in chrom
                start = int( elems[start_col] )
                # NOTE(review): `or []` treats a None result (presumably an
                # unknown chromosome) like "no value" - confirm against the
                # bx-python version in use.
                score_list = bw.get( chrom, start, start + 1 ) or []
                score_list_len = len( score_list )
                if score_list_len == 1:
                    beg, end, score = score_list[0]
                    score_val = '%1.3f' % score
                elif score_list_len == 0:
                    score_val = 'NA'
                else:
                    die( '%s line %d: chrom=%s, start=%d, score_list_len = %d' % ( input_filename, line_number, chrom, start, score_list_len ) )
                ofh.write( '\t'.join( [line, score_val] ) + '\n' )
            else:
                ofh.write( line + '\n' )
    finally:
        # Close everything even when a line fails to parse.
        bwfh.close()
        ifh.close()
        ofh.close()
Ejemplo n.º 42
0
def getNumberOfFragmentsPerRegionFromBigWig(bw, chromSizes):
    """
    Get the number of all mapped fragments per region in all chromosomes
    from a bigWig. Utilizing bx-python.

    bw -- path of the bigWig file.
    chromSizes -- iterable of ``(chromosome name, size)`` pairs.

    The value of every interval returned for ``[0, size)`` of each
    chromosome is added up once per interval, regardless of its length.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Get number of fragments in sample.
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile1, [('3R', 200)])
    3.0
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile2, [('3R', 200)])
    4.0
    """
    mapped = 0
    # The context manager closes the file; the original left it open.
    with open(bw, "rb") as handle:
        bwh = BigWigFile(handle)
        for cname, csize in chromSizes:
            # Each entry is (start, end, value).
            # NOTE(review): `or ()` skips a None result (presumably an
            # unknown chromosome) instead of failing - confirm.
            regions = bwh.get(cname, 0, csize) or ()
            for region in regions:
                mapped += region[2]
    return mapped
Ejemplo n.º 43
0
 def get_GA_from_bw(self, plus, minus, GTF, filterfxn):
     """Load stranded bigWig signal into an ``HTSeq.GenomicArray``.

     plus, minus -- paths of the bigWig files for the ``+`` and ``-`` strand.
     GTF -- iterable of features exposing ``iv`` (chrom/start/end); it is
         iterated once per strand, so it must be re-iterable.
     filterfxn -- features for which it returns exactly ``False`` are
         skipped.

     For each kept feature the intervals returned by the bigWig are read and
     the value of each one is stored at that interval's start position on
     the corresponding strand.  Returns the filled array.
     """
     # NOTE(review): the original comment states that bx-python's `get` is
     # 0 based and fully closed - not verifiable from this code.
     ga = HTSeq.GenomicArray( "auto", typecode='d' , stranded = True)
     for path, strand in ((plus, '+'), (minus, '-')):
         # bigWig is a binary format, so open it in binary mode.
         with open(path, 'rb') as f:
             bw_file = BigWigFile(file=f)
             for GF in GTF:
                 # `== False` is deliberate: a None result must not skip.
                 if filterfxn( GF ) == False: continue
                 window = GF.iv
                 chrom = window.chrom
                 # NOTE(review): `or ()` skips a None result (presumably an
                 # unknown chromosome) - confirm.
                 vals = bw_file.get(chrom, window.start, window.end) or ()
                 for iv_start, iv_stop, value in vals:
                     ga[ HTSeq.GenomicPosition(chrom, iv_start, strand) ] = value
     return ga
Ejemplo n.º 44
0
def profile_bwfile(inbed,bwfile):
	'''retrieve signal from bigwig file for each entry in input bed file

	inbed -- whitespace-separated file with chrom, start, end per line;
		blank lines and lines starting with "#", "track" or "browser" are
		skipped.
	bwfile -- path of the bigWig file.

	Prints one line per region to stdout: chrom, start, end and the
	comma-joined per-base values.  Malformed lines are reported on stderr
	and skipped.
	'''
	# bigWig is binary; both files are closed when the function returns.
	with open(bwfile, 'rb') as bw_handle, open(inbed) as bed_handle:
		bw = BigWigFile( file=bw_handle )

		for line in bed_handle:
			if line.startswith(('#', 'track', 'browser')) or not line.strip():
				continue
			try:
				fields = line.rstrip('\r\n').split()
				chrom = fields[0]
				start = int(fields[1])
				end = int(fields[2])
			except (IndexError, ValueError):
				# Always end the message with exactly one newline.
				sys.stderr.write("Must be  chrom [space] start [space] end: " + line.rstrip('\r\n') + "\n")
				continue
			bw_signal = bw.get_as_array(chrom,start,end)
			if bw_signal is None:
				# NOTE(review): presumably returned for an unknown
				# chromosome; the region is reported with an empty signal
				# instead of failing - confirm.
				bw_signal = []
			# Single-argument print() behaves the same on Python 2 and 3.
			print(chrom +'\t'+ str(start) +'\t'+ str(end) + '\t' + ','.join(str(i) for i in bw_signal))
Ejemplo n.º 45
0
def findInsertions(bwFile, bedData, x):
    """Return the bigWig signal around region ``x`` of ``bedData``.

    bwFile -- bigWig path; replaced by
        ``options.b + options.tn5 + "." + chrom + ".Scores.bw"`` when
        ``options.tn5`` is set.
    bedData -- indexable table of regions: chrom, start, end and optionally
        a strand in column 4.
    x -- index of the region to read.

    The region is widened by ``options.l`` to the left and ``options.r`` to
    the right.  Returns whatever ``get_as_array`` yields for that window
    (an all-NaN array of the window length if it raises OverflowError),
    reversed when the strand is ``-``.
    """
    if options.tn5 is not None:
        bwFile = options.b + options.tn5 + "." + bedData[x][0] + ".Scores.bw"

    sL = int(bedData[x][1]) - options.l
    sR = int(bedData[x][2]) + options.r

    # get signal data; the context manager closes the file on every path
    with open(bwFile, "rb") as f:
        bw = BigWigFile(f)
        try:
            signal = bw.get_as_array(bedData[x][0], sL, sR)
        except OverflowError:
            signal = np.array([np.nan] * (sR - sL))

    try:
        reverse = bedData[x][3] == "-"
    except IndexError:
        # No strand column.
        reverse = False

    # A missing signal (None) cannot be reversed; return it unchanged.
    if reverse and signal is not None:
        return signal[::-1]
    return signal
Ejemplo n.º 46
0
def get_phastcons(bedtool, species=None, index=None):
    """
    Get phastcons scores for intervals in a bed tool

    bedtool -- either a single interval (an object with ``chrom``, ``start``
        and ``stop``) or a sized iterable of such intervals.
    species -- ``"mm9"`` or ``"hg19"``; selects a bigWig below ``basedir``
        when ``index`` is not given.
    index -- explicit path of the phastcons bigWig file.

    Returns the mean score (0 when there is no value) as a scalar for a
    single interval, or as an array with one entry per interval otherwise.
    """
    if species is None and index is None:
        print("Error, must select species or index")
    if species is not None and index is None:
        if species == "mm9":
            index= basedir + "/yeolab/Conservation/phastCons/mm9_30way/placental/mm9_phastcons.bw"
        elif species == "hg19":
            index = basedir + "/yeolab/Conservation/phastCons/hg19_46way/placentalMammals/reformat/hg19_phastcons.bw"

    def mean_score(bw, interval):
        # NOTE(review): `or []` treats a None result (presumably an unknown
        # chromosome) like "no values" - confirm.
        vals = bw.get(interval.chrom, interval.start, interval.stop) or []
        consvals = [val[-1] for val in vals]
        if len(consvals) > 0:
            return np.mean(consvals)
        return 0

    # bigWig is binary; the context manager closes the file on every path.
    with open(index, 'rb') as f:
        bw = BigWigFile(file=f)

        # A single interval exposes `chrom`; probing for it replaces the
        # original bare `except`, which also hid genuine lookup errors.
        if hasattr(bedtool, 'chrom'):
            return mean_score(bw, bedtool)

        # Allocate once: the original re-created the array on every
        # iteration, so only the last entry was ever meaningful.
        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = mean_score(bw, bedline)
    return data
Ejemplo n.º 47
0
class TestBigWig(unittest.TestCase):
    """Checks ``BigWigFile.query`` / ``summarize`` against the ``test.bw`` fixture."""

    def setUp(self):
        # bigWig is a binary format, so read the fixture in binary mode and
        # keep the handle around so tearDown can release it.
        self._bw_handle = open("test_data/bbi_tests/test.bw", "rb")
        self.bw = BigWigFile(file=self._bw_handle)

    def tearDown(self):
        self._bw_handle.close()

    def test_get_summary(self):
        expected_means = [-0.17557571594973645, -0.054009292602539061, -0.056892242431640622, -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557, 0.036949024200439454, 0.076638259887695306, 0.043518108367919923, 0.01554749584197998]

        data = self.bw.query("chr1", 10000, 20000, 10)
        means = [float(x['mean']) for x in data]
        assert numpy.allclose(means, expected_means)

        # Summarize variant
        sd = self.bw.summarize("chr1", 10000, 20000, 10)
        assert numpy.allclose(sd.sum_data / sd.valid_count, expected_means)

        # Test min and max for this entire summary region
        data = self.bw.query("chr1", 10000, 20000, 1)
        maxs = [float(x['max']) for x in data]
        mins = [float(x['min']) for x in data]
        self.assertEqual(maxs, [0.289000004529953])
        self.assertEqual(mins, [-3.9100000858306885])

    def test_get_leaf(self):
        data = self.bw.query("chr1", 11000, 11005, 5)
        means = [float(x['mean']) for x in data]
        assert numpy.allclose(means, [0.050842501223087311, -2.4589500427246094, 0.050842501223087311, 0.050842501223087311, 0.050842501223087311])

        # Test min and max for this entire leaf region
        data = self.bw.query("chr1", 11000, 11005, 1)
        maxs = [float(x['max']) for x in data]
        mins = [float(x['min']) for x in data]
        self.assertEqual(maxs, [0.050842501223087311])
        self.assertEqual(mins, [-2.4589500427246094])

    def test_wrong_nochrom(self):
        # a chromosome that is absent from the fixture yields None
        data = self.bw.query("chr2", 0, 10000, 10)
        self.assertEqual(data, None)
Ejemplo n.º 48
0
import numpy as np
from bx.bbi.bigwig_file import BigWigFile

# NOTE(review): sys, read, bedtools and plt are not imported in this excerpt;
# presumably they are imported earlier in the script -- confirm.

# Command line: <bigwig file> <distance>
fl = sys.argv[1]
dist = int(sys.argv[2])

genes = read.dat("/home/ssaberi/resources/list.genes.txt", '\t')
table = read.dat("/projects/epigenomics/MarcoJuliaPon/peaks.txt", '\t')
mygenes = read.dat("/projects/epigenomics/MarcoJuliaPon/mygenes.txt", '\t')

# For every requested gene keep column 1 of the first annotation row whose
# column 0 contains the gene name.
ens = []
for i in mygenes:
    for gn in genes:
        if i in gn[0]:
            ens.append(gn[1])
            break

genespos = read.read_gene_pos('/home/ssaberi/resources/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage')
genesbed = bedtools.makebed_genpos(ens, genespos, 100000)

# Collect one row of bigWig data per gene window; the handle is closed as
# soon as all windows have been read.
mat = []
with open(fl, 'rb') as f:
    bw = BigWigFile(file=f)
    for bed_i in genesbed:
        vals = bw.get(bed_i[0], bed_i[1], bed_i[2])
        mat.append(np.array(vals))
mat = np.array(mat)
plt.matshow(mat, aspect='auto', cmap='YlOrBr')

# Name the figure after the input file: strip the directory and the last
# extension (works for paths without '/' or '.' as well).
fl = fl.rsplit('/', 1)[-1].rsplit('.', 1)[0]
plt.savefig(fl + ".pdf")
Ejemplo n.º 49
0
def output(fragmentsMap, fragmentList, fragmentPairs, fragmentCount, fragmentsChrom):
    '''
    Write the fragment list, the pairwise contact counts and, optionally,
    dense 2D contact matrices.

    Every file name starts with options.outputDir followed by
    options.outputFilename or, when that is empty, the basename of args[0].

    <prefix>.fragmentLists.gz (one line per fragment, sorted by fragment id):
    "chr    fragmentMid     NA      mappable"

    <prefix>.contactCounts.gz:
    "chr1   fragmentMid1    chr2    fragmentMid2    contactCount"

    <prefix>.matrix (only with options.create2DMatrix):
    the symmetric fragmentCount x fragmentCount contact matrix

    <prefix>.<chr>.matrix (only with options.create2DMatrixPerChr):
    the sub-matrix of each chromosome with a header row and row labels

    "mappable" is the mean of options.mappability (a bigWig file) over
    midpoint +/- options.resolution/2, or 0 when that lookup fails.  Without
    a mappability file it is 1 for fragments with contacts and 0 otherwise.

    fragmentsMap   - fragment id -> sequence with chrom at [0], midpoint at [1]
    fragmentList   - fragment id -> contact count of that fragment
    fragmentPairs  - (fragment id, fragment id) -> contact count of the pair
    fragmentCount  - dimension of the full contact matrix
    fragmentsChrom - chrom -> sequence with the first and the past-the-end
                     fragment index of that chromosome at [0] and [1]

    NOTE(review): an earlier description of the first file listed five
    columns ("chr extraField fragmentMid marginalizedContactCount
    mappable?"); the four-column layout actually written is kept unchanged --
    confirm which layout downstream tools expect.
    '''

    if options.verbose:
        sys.stdout.write("- %s START   : output data \n" % (timeStamp()))

    # common prefix of every output file
    if options.outputFilename != "":
        prefix = options.outputDir + options.outputFilename
    else:
        prefix = options.outputDir + os.path.basename(args[0])

    halfResolution = options.resolution / 2

    bw = None
    bwHandle = None
    outfile1 = gzip.open(prefix + ".fragmentLists.gz", "wb")
    try:
        # lookup mean mappability ratio
        if options.mappability != "":
            # lazy load
            from bx.bbi.bigwig_file import BigWigFile
            bwHandle = open(options.mappability, "rb")
            bw = BigWigFile(bwHandle)

        for fragmentId in sorted(fragmentsMap.keys()):
            chrom = fragmentsMap[fragmentId][0]
            midpoint = fragmentsMap[fragmentId][1]

            if options.vverbose:
                sys.stdout.write("- process %s %d \n" % (chrom, midpoint))

            contactCounts = fragmentList.get(fragmentId, 0)

            # reset for every fragment so that a fragment without contacts
            # never reuses the previous value (or hits an unbound name)
            mappable = 0
            if bw is not None:
                try:
                    mappable = bw.query(chrom, midpoint - halfResolution, midpoint + halfResolution, 1)[0]["mean"]
                except Exception:
                    # problem with invalid values
                    if options.vverbose:
                        sys.stderr.write("Problem with bw file at %s %d-%d\n" % (chrom, midpoint - halfResolution, midpoint + halfResolution))
                        sys.stdout.write(traceback.format_exc() + "\n")
            elif contactCounts > 0:
                mappable = 1

            outfile1.write("%s\t%d\t%s\t%f\n" % (chrom, midpoint, "NA", mappable))
    finally:
        outfile1.close()
        if bwHandle is not None:
            bwHandle.close()

    outfile2 = gzip.open(prefix + ".contactCounts.gz", "wb")
    try:
        for pairIds, pairCount in fragmentPairs.iteritems():
            chrom1 = fragmentsMap[pairIds[0]][0]
            midpoint1 = fragmentsMap[pairIds[0]][1]

            chrom2 = fragmentsMap[pairIds[1]][0]
            midpoint2 = fragmentsMap[pairIds[1]][1]

            outfile2.write("%s\t%d\t%s\t%d\t%d\n" % (chrom1, midpoint1, chrom2, midpoint2, pairCount))
    finally:
        outfile2.close()

    if options.create2DMatrix or options.create2DMatrixPerChr:
        # lazy loading
        from scipy.sparse import lil_matrix
        import numpy

        # populate sparse matrix (symmetric)
        A = lil_matrix((fragmentCount, fragmentCount), dtype='i')
        for pairIds, pairCount in fragmentPairs.iteritems():
            A[pairIds[0], pairIds[1]] = pairCount
            A[pairIds[1], pairIds[0]] = pairCount
        # convert to coordinate format
        B = A.tocoo()

        if options.create2DMatrix:
            outfile3 = prefix + ".matrix"

            if options.verbose:
                sys.stdout.write("- save 2Dmatrix to %s \n" % (outfile3))

            C = B.tocsr()
            with open(outfile3, 'w') as f_handle:
                for i in xrange(fragmentCount):
                    numpy.savetxt(f_handle, C[i].toarray(), fmt='%i', delimiter='\t')

        if options.create2DMatrixPerChr:
            # column-sliceable form, computed once for all chromosomes
            Bcsc = B.tocsc()
            for chromName in fragmentsChrom.keys():
                first = fragmentsChrom[chromName][0]
                last = fragmentsChrom[chromName][1]

                C = Bcsc[:, first:last].tocsr()[first:last, :]

                fragmentRange = last - first
                header = ['d'] + ["r%d" % i for i in range(fragmentRange)]

                outfile3 = prefix + "." + chromName + ".matrix"

                if options.verbose:
                    sys.stdout.write("- save 2Dmatrix for chromosome %s to %s \n" % (chromName, outfile3))

                with open(outfile3, 'w') as f_handle:
                    f_handle.write('\t'.join(header) + "\n")
                    for i in xrange(fragmentRange):
                        # row label, then the row itself
                        f_handle.write(header[i + 1] + "\t")
                        numpy.savetxt(f_handle, C[i].toarray(), fmt='%i', delimiter='\t')

    if options.verbose:
        sys.stdout.write("- %s FINISHED: output data\n" % (timeStamp()))
Ejemplo n.º 50
0
#data = np.array(range(1,10));
#for f in allFunctions:
#	data = applyFunction(data,f);
#	print(data);

#print(allFunctions);
#exit;

# Read the chromosome sizes table: "<name>\t<size>" per line, '#' starts a comment.
chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile,'r')
chromSizes = {}
for line in chromSizesFile:
	line = line.rstrip()
	# skip blank lines as well, they have no size column to parse
	if line == "" or line[0]=="#": continue
	data = line.split("\t")
	chromSizes[data[0]] = int(data[1])

# bigWig is a binary format
curBW = BigWigFile(open(args.inBW, 'rb'))

outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w")
outStream.write("track type=wiggle_0\n")


for chr in chromSizes.keys():
	last = 0
	final = chromSizes[chr]
	sys.stderr.write("Outputting data for %s:\n"%(chr))
	while last!=final: # this breaks it up into chunks so that I'm not piping entire (human) chromosomes at once
		# the chunk end has to be known before it can be reported
		curLast = np.min([last+args.chunks,final])
		if args.verbose>0: sys.stderr.write("  Section %i - %i:\n"%(last,curLast))
		# read the chunk plus flanks, clipped to the chromosome
		curEnd = np.min([curLast+additionalFlankSize, final])
		curSt = np.max([last-additionalFlankSize,0])
		values = curBW.get_as_array( chr, curSt, curEnd )
		# NOTE(review): the rest of the loop body (including advancing `last`) is not part of this excerpt
Ejemplo n.º 51
0
# For every chromosome draw a boolean mask that keeps each position with
# probability args.sample; totalLength counts the kept positions.
useThese = {}
totalLength = 0
for chr in chrOrder:
	#sample positions
	useThese[chr] = np.random.random_sample((chromSizes[chr]))<args.sample
	totalLength = totalLength + np.sum(useThese[chr])


#make a matrix of the data
allData = np.empty([totalLength,len(IDs)])
if args.eliminateMissing>0:
	keepThese = np.ones([totalLength]).astype(bool) # only those for which data was observed in all tracks

for i in range(0,len(IDs)):
	#input GB tracks
	curBW = BigWigFile(open(files[i], 'rb'))
	curTot = 0
	if args.verbose>1: sys.stderr.write("Inputting data for %s.\n"%(IDs[i]))
	# iterate over a copy: chromosomes missing from a track are removed
	# from chrOrder inside the loop, which would otherwise skip the
	# chromosome that follows the removed one
	for chr in list(chrOrder):
		if args.verbose>1: sys.stderr.write("  Inputting data for %s.\n"%(chr))
		if args.verbose>2: sys.stderr.write("    Getting data from BW.\n")
		values = curBW.get_as_array( chr, 0, chromSizes[chr] )
		if values is None:
			sys.stderr.write("%s is missing %s... skipping it for all\n"%(IDs[i],chr))
			chrOrder.remove(chr)
			# drop the rows that were reserved for this chromosome
			numDropped = np.sum(useThese[chr])
			droppedRows = range(curTot, (curTot+numDropped))
			allData = np.delete(allData, droppedRows, 0)
			if args.eliminateMissing>0:
				keepThese = np.delete(keepThese, droppedRows, 0)
			totalLength = totalLength - numDropped
			del useThese[chr]
			del chromSizes[chr]
Ejemplo n.º 52
0
opts.add_option("-a", help="<bw> Accepts a bigwig file")
opts.add_option("-g", help="<Genome Size file>")
opts.add_option("-w", default=150,type='int', help="<Int> window size")
opts.add_option("-s", default=20,type='int', help="<Int> step size (span)")
options, arguments = opts.parse_args()

# return usage information if no argvs given
if len(sys.argv)==1:
    # ask the parser directly instead of re-running the script in a shell
    opts.print_help()
    sys.exit()

##### DEFINE FUNCTIONS ##### 

##### INPUTS AND OUTPUTS #####
# open bigwig (binary format)
bw = BigWigFile(open(options.a, 'rb'))

# get gSize file
gSizes = np.loadtxt(options.g,'str')
chunkSize = 1000000
padLen = 5000

# open out file: it lives next to the input bigwig; any previous result is
# removed first because the file is opened in append mode below

outName = os.path.join(os.path.dirname(options.a), 'out.smooth.bed')
try: os.remove(outName)
except OSError: pass

print("Saving to %s.." % outName)
outF = open(outName, 'a')
Ejemplo n.º 53
0
 def setUp(self):
     """Open the bigWig fixture and expose it to the tests as ``self.bw``."""
     # bigWig is a binary format, so the fixture is read in binary mode
     f = open( "test_data/bbi_tests/test.bw", "rb" )
     self.bw = BigWigFile(file=f)
	if line is None or line == "" or line[0]=="#": continue
	data=line.rstrip().split("\t");
	for i in range(0,len(data)):
		oldToNew[data[i]] = data[0];
		transChrs.append(data[i]);

inFile.close()

# Read the chromosome sizes table: "<name>\t<size>" per line, '#' starts a comment.
chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile,'r')
chromSizes = {}
for line in chromSizesFile:
	line = line.rstrip()
	# skip blank lines as well, they have no size column to parse
	if line == "" or line[0]=="#": continue
	data = line.split("\t")
	chromSizes[data[0]] = int(data[1])

# bigWig is a binary format
curBW = BigWigFile(open(args.inBW, 'rb'))

outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w")
outStream.write("track type=wiggle_0\n")

# Dump every chromosome found in the bigWig under its new name.
for chr in transChrs:
	values = curBW.get_as_array( chr, 0, chromSizes[oldToNew[chr]] )
	#print(chr);
	if values is not None:
		sys.stderr.write("Adding %s -> %s\n"%(chr, oldToNew[chr]))
		outStream.write("fixedStep chrom=%s start=1 step=1\n"%(oldToNew[chr]))
		outStream.write("\n".join(map(str,values)))
		outStream.write("\n")

# The wiggle file must be flushed and closed before wigToBigWig reads it,
# otherwise the converter may see a truncated file.
outStream.close()

toBW = subprocess.Popen(["wigToBigWig","%s.wig.gz"%(args.outFPre),args.chrsFile,"%s.bw"%(args.outFPre)])
temp = toBW.communicate()
Ejemplo n.º 55
0
         if tokens[3] == '+':
             annotated_5p.add((tokens[0], int(tokens[1]) - 1))
             annotated_3p.add((tokens[0], int(tokens[2]) - 1))
         elif tokens[3] == '-':
             annotated_3p.add((tokens[0], int(tokens[1]) - 1))
             annotated_5p.add((tokens[0], int(tokens[2]) - 1))
         else:
             raise RuntimeError(
                     'Invalid line in annotation file: "{}".'.format(line)
                 )
 # Counters keyed by splice site; defaultdict(int) makes unseen keys start at 0.
 unannotated_fivep_splice_site_counts = defaultdict(int)
 unannotated_threep_splice_site_counts = defaultdict(int)
 annotated_fivep_splice_site_counts = defaultdict(int)
 annotated_threep_splice_site_counts = defaultdict(int)
 # Lazy import: the bigWig reader is only needed from this point on.
 from bx.bbi.bigwig_file import BigWigFile
 # Open the bigWig given by args.phylop_bw in binary mode.
 bw = BigWigFile(open(args.phylop_bw, 'rb'))
 # '\x1b[K' is the ANSI "erase to end of line" sequence; presumably it clears
 # a progress line written earlier with a trailing '\r' -- confirm.
 print >>sys.stderr, '\x1b[KDone. Computing/writing matrix elements...'
 with open(
         allincidence
     ) as incidence_stream, open(
         args.out, 'w'
     ) as output_stream:
     unannotated_line_counts = defaultdict(int)
     annotated_line_counts = defaultdict(int)
     splice_sites = 0
     for key, group in itertools.groupby(
                             incidence_stream, lambda x: x.split('\t')[0]
                         ):
         for line in group:
             print >>sys.stderr, (
                     'Processed {} splice sites...\r'.format(
Create a site profile vector showing the average signal accumulated from a
bigwig file around the center of each interval from a BED file.

Output is the average signal value at that relative position across the 
intervals.

usage: %prog bigwig_file.bw padding < bed_file.bed 
"""

import sys
from numpy import *

from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

# bigWig is a binary format
bw = BigWigFile( open( sys.argv[1], "rb" ) )
padding = int( sys.argv[2] )
# Per relative position: summed signal and number of intervals with data
totals = zeros( padding*2, dtype=float64 )
valid = zeros( padding*2, dtype=int32 )

for interval in GenomicIntervalReader( sys.stdin ):
    # integer midpoint, so the window coordinates stay integers
    center = ( interval.start + interval.end ) // 2
    try:
        values = bw.get_as_array( interval.chrom, center - padding, center + padding )
    except OverflowError:
        # presumably a window that starts before position 0 -- it cannot
        # contribute a full profile, so it is left out
        values = None
    if values is None:
        # no data for this interval (e.g. chromosome absent from the bigwig)
        continue
    # Determine which positions had data and mask the rest for totalling
    invalid = isnan( values )
    values[ invalid ] = 0
    totals += values
    valid += ( ~ invalid )

# Positions that never had data come out as nan (0/0)
savetxt( sys.stdout, totals/valid )
Ejemplo n.º 57
0
	lengthCount+=1;
	lengthSum+=curLen;
	if scanThese[i][GENOMEDATA.STR]!="+" and scanThese[i][GENOMEDATA.STR]!="-":
		numStrandless+=1
# divide as floats so that round() really rounds instead of seeing an
# already truncated integer quotient
avgLength = int(round(float(lengthSum)/lengthCount))

if numStrandless>0 and args.inFile2 is not None:
	raise Exception("Error: loci contain strandless entries, but genome tracks provided for each strand!!");
if numStrandless>0:
	sys.stderr.write("Warning: Strandless loci detected; assuming forward orientation.\n");
padding = int(args.flank);

#read in track file(s); inFile2 is only opened when a second track was given
if args.format=="BIGWIG" or args.format=="BW":
	from bx.bbi.bigwig_file import BigWigFile
	# bigWig/bigBed are binary formats
	inFile1 = BigWigFile(open(args.inFile, 'rb'))
	if args.inFile2 is not None:
		inFile2 = BigWigFile(open(args.inFile2, 'rb'))
elif args.format=="BIGBED" or args.format=="BB":
	# BigBedFile is defined in bx.bbi.bigbed_file, not in bigwig_file
	from bx.bbi.bigbed_file import BigBedFile
	inFile1 = BigBedFile(open(args.inFile, 'rb'))
	if args.inFile2 is not None:
		inFile2 = BigBedFile(open(args.inFile2, 'rb'))
elif args.format=="WIG" or args.format=="W":
	from bx.arrays.wiggle import WiggleReader
	inFile1 = WiggleReader(open(args.inFile))
	if args.inFile2 is not None:
		inFile2 = WiggleReader(open(args.inFile2))
elif args.format=="BEDGR" or args.format=="BG":
	from bx.arrays.bed import BedReader
	inFile1 = BedReader(open(args.inFile))
Ejemplo n.º 58
0
def main(args):
    """Open ``args.bigWigFile`` and request the ``chrom``:``st``-``end`` region from it.

    The fetched array is discarded and nothing is returned.

    NOTE(review): ``chrom``, ``st`` and ``end`` are not defined in this
    function; presumably they are module-level names -- confirm.
    """
    # binary mode for the binary bigWig format; the handle is closed on exit
    with open(args.bigWigFile, 'rb') as handle:
        bw_file = BigWigFile(handle)
        bw_file.get_as_array(chrom, st, end)
Ejemplo n.º 59
0
class HilbertMatrixBigWig(HilbertMatrix):
    # Need to override build(), but otherwise just like a HilbertMatrix
    def __init__(self, *args, **kwargs):
        """
        Subclass of HilbertMatrix specifically for bigWig format files
        """
        super(HilbertMatrixBigWig, self).__init__(*args, **kwargs)

    def build(self):
        """
        Build the matrix.

        Since bigWig files are essentially pre-summarized, this just extracts
        the chrom/start/stop represented by each cell in the matrix and fills
        it with the value from the bigWig file.

        The bigWig file is only kept open while the matrix is filled;
        ``self.bigwig`` refers to a closed file afterwards.
        """
        # bigWig is a binary format
        with open(self.file, 'rb') as handle:
            self.bigwig = BigWigFile(handle)

            chrom_rc, chrom_bins = self.chrom2rc()

            if self.chrom == 'genome':
                chroms = self.chromdict.keys()
            else:
                chroms = [self.chrom]

            for chrom in chroms:
                rc = chrom_rc[chrom]
                nbins = chrom_bins[chrom]

                start, stop = self.chromdict[chrom]
                results = self.bigwig.summarize(chrom, start, stop, nbins)
                # mean per bin; bins without valid data divide 0 by 0, which
                # is expected here and mapped to 0 right below
                with np.errstate(divide='ignore', invalid='ignore'):
                    values = results.sum_data / results.valid_count
                values[np.isnan(values)] = 0

                self.matrix[rc[:, 0], rc[:, 1]] = values

            self._cleanup()

    def chrom2rc(self):
        """
        Return a dictionary of {chrom: (rows, cols)} and {chrom: nbins}

        Each chromosome receives a consecutive slice of the precomputed
        row/column table (stored next to this module in 'precomputed.npz'),
        sized by ``stop / chrom_length`` of the matrix cells.
        """
        npz_path = os.path.join(os.path.dirname(__file__), 'precomputed.npz')
        precomputed = np.load(npz_path)
        try:
            rc = precomputed['_%s' % self.matrix_dim]
        finally:
            # release the archive once the table has been read
            precomputed.close()

        total_cells = self.matrix_dim * self.matrix_dim
        d = {}
        bins = {}
        last_stop = 0
        for chrom, (start, stop) in self.chromdict.items():
            frac = stop / float(self.chrom_length)
            nbins = int(frac * total_cells)
            d[chrom] = rc[last_stop:last_stop + nbins, :]
            bins[chrom] = nbins
            last_stop += nbins
        return d, bins