class TestBigWig(unittest.TestCase):
    def setUp(self):
        f = open("test_data/bbi_tests/test.bw")
        self.bw = BigWigFile(file=f)

    def test_get_summary(self):
        data = self.bw.query("chr1", 10000, 20000, 10)
        means = [x['mean'] for x in data]
        assert numpy.allclose(map(float, means), [
            -0.17557571594973645, -0.054009292602539061, -0.056892242431640622,
            -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557,
            0.036949024200439454, 0.076638259887695306, 0.043518108367919923,
            0.01554749584197998])

        # Summarize variant
        sd = self.bw.summarize("chr1", 10000, 20000, 10)
        assert numpy.allclose(sd.sum_data / sd.valid_count, [
            -0.17557571594973645, -0.054009292602539061, -0.056892242431640622,
            -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557,
            0.036949024200439454, 0.076638259887695306, 0.043518108367919923,
            0.01554749584197998])

        # Test min and max for this entire summary region
        data = self.bw.query("chr1", 10000, 20000, 1)
        maxs = [x['max'] for x in data]
        mins = [x['min'] for x in data]
        self.assertEqual(map(float, maxs), [0.289000004529953])
        self.assertEqual(map(float, mins), [-3.9100000858306885])

    def test_get_leaf(self):
        data = self.bw.query("chr1", 11000, 11005, 5)
        means = [x['mean'] for x in data]
        assert numpy.allclose(map(float, means), [
            0.050842501223087311, -2.4589500427246094, 0.050842501223087311,
            0.050842501223087311, 0.050842501223087311])

        # Test min and max for this entire leaf region
        data = self.bw.query("chr1", 11000, 11005, 1)
        maxs = [x['max'] for x in data]
        mins = [x['min'] for x in data]
        self.assertEqual(map(float, maxs), [0.050842501223087311])
        self.assertEqual(map(float, mins), [-2.4589500427246094])

    def test_wrong_nochrom(self):
        data = self.bw.query("chr2", 0, 10000, 10)
        self.assertEqual(data, None)
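# A minimal standalone sketch of the query API the tests above exercise,
# assuming a local bx-python install and the same test_data/bbi_tests/test.bw
# fixture; the region coordinates are illustrative.
from bx.bbi.bigwig_file import BigWigFile

bw = BigWigFile(file=open("test_data/bbi_tests/test.bw"))

summaries = bw.query("chr1", 10000, 20000, 10)
if summaries is None:
    # query() returns None when the chromosome is absent from the file
    raise ValueError("chromosome not found in bigWig file")
for s in summaries:
    # each summary bin is a dict carrying at least 'mean', 'min' and 'max'
    print("mean=%s min=%s max=%s" % (s["mean"], s["min"], s["max"]))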
def main():
    p = optparse.OptionParser(__doc__)
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-D', '--debug', action='store_true', dest='debug')
    options, args = p.parse_args()

    debug_c = 0
    BEDFILE = open(args[0], 'rU')
    BW = BigWigFile(file=open(args[1]))
    BEDout = open(args[2], 'w')

    for line in BEDFILE:
        line = line.strip().split('\t')
        x = BW.query(line[0], int(line[1]), int(line[2]), 1)
        if x is None:
            continue  # chromosome absent from the bigWig file
        line.append(str(round(x[0]['mean'], 5)))
        BEDout.write("\t".join(line) + "\n")
        if options.debug:
            debug_c += 1
            if debug_c >= 10:
                break

if __name__ == '__main__':
    main()
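# A hypothetical invocation of the script above (the script and file names
# are placeholders): each interval in regions.bed is annotated with its mean
# signal from signal.bw and written to annotated.bed.
#
#   python annotate_bed.py regions.bed signal.bw annotated.bed
#   python annotate_bed.py -D regions.bed signal.bw annotated.bed   # stop after 10 lines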
def createMappabilityList(fragmentsMap, bwfile, fragmentCount, options):
    # keep record which fragment has decent mappability
    mappable = np.zeros((fragmentCount,), dtype=np.float)

    # lazy load
    from bx.bbi.bigwig_file import BigWigFile
    bw = BigWigFile(open(bwfile))

    for fragmentId in fragmentsMap.keys():
        (chrom, start, end) = fragmentsMap[fragmentId]

        if (options.vverbose):
            print >> sys.stdout, "- process %s %d-%d" % (chrom, start, end)

        try:
            mappable[fragmentId] = bw.query(chrom, start, end, 1)[0]["mean"]
            if (np.isnan(mappable[fragmentId])):
                mappable[fragmentId] = 0
        except:
            mappable[fragmentId] = 0.
            # problem with invalid values
            if (options.vverbose):
                print >> sys.stderr, "Problem with bw file at %s %d-%d" % (
                    chrom, start, end)
                print traceback.format_exc()

    return mappable
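# Illustrative call of the helper above; the fragment map, bigWig path and
# options object are placeholders, not part of the original program.
fragmentsMap = {0: ("chr1", 10000, 20000), 1: ("chr1", 20000, 30000)}

class _Options(object):
    vverbose = False

mappability = createMappabilityList(fragmentsMap, "mappability.bw", 2, _Options())
# fragments whose mean mappability could not be computed are scored 0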
def big_wig_summary_worker((span, bw_list, region_bed_file_name, nb_proc,
                            verbose)):
    results = list()
    bw_label = [os.path.splitext(os.path.basename(p))[0] for p in bw_list]

    if verbose:
        sys.stderr.write("Processing: " + region_bed_file_name + "\n")

    for big_wig, cpt in zip(bw_list, range(len(bw_list))):
        bigwig = BigWigFile(open(big_wig, "rb"))

        if verbose:
            sys.stderr.write("Computing coverage for file: " + big_wig +
                             " [" + str(multiprocessing.current_process()) +
                             "], " + str(span[1] - span[0]) +
                             " chunks to process.\n")

        bed_windows = pybedtools.BedTool(region_bed_file_name)

        chr_cur = None

        # Loop through bed lines (features object)
        for i in bed_windows[slice(span[0], span[1])]:
            if chr_cur is None or i.chrom != chr_cur:
                chr_cur = i.chrom

            # Note: bigWig is zero-based/half-open, like BED.
            bw_sum = bigwig.query(i.chrom, i.start, i.end, 1)

            if bw_sum is not None:
                bw_sum = bw_sum[0]['mean']
                bw_sum = np.nan_to_num(bw_sum)
                bw_sum = np.round(bw_sum, 2)
            else:
                bw_sum = 0.00

            results.append(
                (i.chrom + ":" + str(i.start), bw_label[cpt], float(bw_sum)))

        if verbose:
            sys.stderr.write("Computing coverage for file: " + big_wig +
                             " [" + str(multiprocessing.current_process()) +
                             "]. Job done.\n")

    return results
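# A dispatch sketch for the worker above, assuming the caller knows how many
# windows the BED file holds (n_windows); the span boundaries, pool size and
# the run_workers name are illustrative, not part of the original program.
import multiprocessing

def run_workers(bw_list, region_bed_file_name, n_windows, nb_proc=4,
                verbose=False):
    # split [0, n_windows) into roughly equal half-open spans, one per process
    step = max(1, n_windows // nb_proc)
    spans = [(i, min(i + step, n_windows)) for i in range(0, n_windows, step)]
    args = [(span, bw_list, region_bed_file_name, nb_proc, verbose)
            for span in spans]
    pool = multiprocessing.Pool(nb_proc)
    # each call returns a list of (position, label, coverage) tuples
    chunks = pool.map(big_wig_summary_worker, args)
    pool.close()
    pool.join()
    return [row for chunk in chunks for row in chunk]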
def get_mappability(mappability_file, vcf_file, out_file, region=None,
                    append_chr=True):
    map_reader = BigWigFile(open(mappability_file, 'rb'))

    vcf_reader = vcf.Reader(filename=vcf_file)

    if region is not None:
        chrom, beg, end = parse_region_for_vcf(region)
        try:
            vcf_reader = vcf_reader.fetch(chrom, start=beg, end=end)
        except ValueError:
            print("no data for region {} in vcf".format(region))
            vcf_reader = []

    data = []

    for record in vcf_reader:
        if append_chr:
            chrom = 'chr{0}'.format(record.CHROM)
        else:
            chrom = record.CHROM

        coord = record.POS
        beg = max(coord - 100, 0)
        end = coord + 100

        result = map_reader.query(chrom, beg, end, 1)

        if result is None:
            mappability = 0
        else:
            mappability = result[0]['mean']

        data.append({
            'chrom': record.CHROM,
            'coord': record.POS,
            'mappability': mappability
        })

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes())
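# parse_region_for_vcf() is used above but not shown here; a plausible
# stand-in for "chrom:start-end" region strings (an assumption about its
# contract, not the project's actual helper):
def parse_region_for_vcf(region):
    if ':' not in region:
        return region, None, None  # whole-chromosome region
    chrom, span = region.split(':', 1)
    beg, end = span.split('-', 1)
    return chrom, int(beg), int(end)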
def output(fragmentsMap, fragmentList, fragmentPairs, fragmentCount,
           fragmentsChrom):
    '''
    Outputs two files: the first contains
    "chr extraField fragmentMid marginalizedContactCount mappable? (0/1)",
    the second contains
    "chr1 fragmentMid1 chr2 fragmentMid2 contactCount".
    Optionally outputs the 2D contact matrix.
    '''

    if (options.verbose):
        print >> sys.stdout, "- %s START   : output data " % (timeStamp())

    if (options.outputFilename != ""):
        outfile1 = gzip.open(options.outputDir + options.outputFilename +
                             ".fragmentLists.gz", "wb")
    else:
        outfile1 = gzip.open(options.outputDir + os.path.basename(args[0]) +
                             ".fragmentLists.gz", "wb")

    fragmentIds = fragmentsMap.keys()
    fragmentIds.sort()

    # lookup mean mappability ratio
    bw = ""
    if (options.mappability != ""):
        # lazy load
        from bx.bbi.bigwig_file import BigWigFile
        bw = BigWigFile(open(options.mappability))

    for fragmentId in fragmentIds:
        contactCounts = 0
        mappable = 0
        chrom = fragmentsMap[fragmentId][0]
        midpoint = fragmentsMap[fragmentId][1]

        if (options.vverbose):
            print >> sys.stdout, "- process %s %d " % (chrom, midpoint)

        if (fragmentList.has_key(fragmentId)):
            contactCounts = fragmentList[fragmentId]

        if (bw != ""):
            try:
                mappable = bw.query(chrom, midpoint - options.resolution / 2,
                                    midpoint + options.resolution / 2,
                                    1)[0]["mean"]
            except:
                mappable = 0
                # problem with invalid values
                if (options.vverbose):
                    print >> sys.stderr, "Problem with bw file at %s %d-%d" % (
                        chrom, midpoint - options.resolution / 2,
                        midpoint + options.resolution / 2)
                    print traceback.format_exc()
        elif (contactCounts > 0):
            mappable = 1

        outfile1.write("%s\t%d\t%s\t%f\n" % (chrom, midpoint, "NA", mappable))

    outfile1.close()

    if (options.outputFilename != ""):
        outfile2 = gzip.open(options.outputDir + options.outputFilename +
                             ".contactCounts.gz", "wb")
    else:
        outfile2 = gzip.open(options.outputDir + os.path.basename(args[0]) +
                             ".contactCounts.gz", "wb")

    for fragmentIds, contactCounts in fragmentPairs.iteritems():
        chrom1 = fragmentsMap[fragmentIds[0]][0]
        midpoint1 = fragmentsMap[fragmentIds[0]][1]
        chrom2 = fragmentsMap[fragmentIds[1]][0]
        midpoint2 = fragmentsMap[fragmentIds[1]][1]

        outfile2.write("%s\t%d\t%s\t%d\t%d\n" % (chrom1, midpoint1, chrom2,
                                                 midpoint2, contactCounts))
    outfile2.close()

    if (options.create2DMatrix or options.create2DMatrixPerChr):
        # lazy loading
        from scipy.sparse import lil_matrix
        import numpy

        # populate sparse matrix
        A = lil_matrix((fragmentCount, fragmentCount), dtype='i')
        for fragmentIds, contactCounts in fragmentPairs.iteritems():
            A[fragmentIds[0], fragmentIds[1]] = contactCounts
            A[fragmentIds[1], fragmentIds[0]] = contactCounts

        # convert to coordinate format
        B = A.tocoo()

        if (options.create2DMatrix):
            if (options.outputFilename != ""):
                outfile3 = options.outputDir + options.outputFilename + ".matrix"
            else:
                outfile3 = options.outputDir + os.path.basename(args[0]) + ".matrix"

            if (options.verbose):
                print >> sys.stdout, "- save 2Dmatrix to %s " % (outfile3)

            f_handle = open(outfile3, 'w')
            C = B.tocsr()
            for i in xrange(fragmentCount):
                numpy.savetxt(f_handle, C[i].toarray(), fmt='%i', delimiter='\t')
            f_handle.close()

        if (options.create2DMatrixPerChr):
            for chr in fragmentsChrom.keys():
                C = B.tocsc()[:, fragmentsChrom[chr][0]:fragmentsChrom[chr][1]].tocsr()[fragmentsChrom[chr][0]:fragmentsChrom[chr][1], :]
                fragmentRange = fragmentsChrom[chr][1] - fragmentsChrom[chr][0]
                header = ['d'] + ["%s%d" % i for i in zip(['r'] * fragmentRange,
                                                          range(fragmentRange))]

                if (options.outputFilename != ""):
                    outfile3 = options.outputDir + options.outputFilename + "." + chr + ".matrix"
                else:
                    outfile3 = options.outputDir + os.path.basename(args[0]) + "." + chr + ".matrix"

                if (options.verbose):
                    print >> sys.stdout, "- save 2Dmatrix for chromosome %s to %s " % (chr, outfile3)

                f_handle = open(outfile3, 'w')
                f_handle.write('\t'.join(header) + "\n")
                for i in xrange(fragmentRange):
                    f_handle.write(header[i + 1] + "\t")
                    numpy.savetxt(f_handle, C[i].toarray(), fmt='%i', delimiter='\t')
                f_handle.close()

    if (options.verbose):
        print >> sys.stdout, "- %s FINISHED: output data" % (timeStamp())
def processChromatin(offBed, f_configuration):
    """ process chromatin data into blocks of specific resolution """

    if (options.verbose):
        print "Start chromatin processing"

    if (not f_configuration["conf"].has_key("genomesize")):
        calculateChromosomeSpecifics(chromsizes, f_configuration)

    step_size = int(f_configuration["conf"]["genomesize"] *
                    options.chromatinResolution /
                    (360. - f_configuration["conf"]["target_radius"]))

    if (options.verbose):
        print "chromatin stepsize: %d nts" % (step_size)

    samfile = ""
    bigwig = ""
    # check if chromatin file has been supplied
    if (options.chromatinFormat == "bam"):
        import pysam  # lazy import of required module
        # try to open the bam file
        try:
            samfile = pysam.Samfile(options.chromatin, "rb")
        except:
            print >> sys.stderr, "[ERROR] chromatin file (bam) could not be read : %s" % (options.chromatin)
            traceback.print_exc()
            exit(1)
    elif (options.chromatinFormat == "bigwig"):
        from bx.bbi.bigwig_file import BigWigFile  # lazy import of required module
        # try to open the bigwig file
        try:
            bigwig = BigWigFile(open(options.chromatin))
        except:
            print >> sys.stderr, "[ERROR] chromatin file (bigwig) could not be read : %s" % (options.chromatin)
            traceback.print_exc()
            exit(1)

    if (options.verbose):
        print "... chromatin file opened"

    max_value = 0.

    # open output file
    if (os.path.exists(options.output_dir + "chromatin.txt")):
        if (options.verbose):
            print "using existing chromatin file"
        for line in fileinput.input([options.output_dir + "chromatin.txt"]):
            c_value = float(line.strip().split("\t")[3])
            if (c_value > max_value):
                max_value = c_value
    else:
        f_chromatin = open(options.output_dir + "chromatin.txt", "w")

        # calculate mean chromatin density value for each region
        for region in offBed.values():
            chrom = region[0]
            if (options.verbose):
                print "Collecting chromatin data for chromosome %s" % (chrom)

            for span_start in range(region[1], region[2], step_size):
                span_end = min(span_start + step_size, region[2])
                c_value = 0.
                if (span_start == span_end):
                    continue

                if (options.chromatinFormat == "bam"):
                    try:
                        on_chromatin_features = 0.
                        for pileupcolumn in samfile.pileup(chrom, span_start, span_end):
                            on_chromatin_features += pileupcolumn.n
                        c_value = on_chromatin_features / (span_end - span_start)
                    except:
                        if (options.verbose):
                            print >> sys.stderr, "[WARN] bam file exception : %s:%d-%d" % (chrom, span_start, span_end)
                elif (options.chromatinFormat == "bigwig"):
                    try:
                        bwsummary = bigwig.query(chrom, span_start, span_end, 1)
                        c_value = bwsummary[0]["mean"]
                    except:
                        if (options.verbose):
                            print >> sys.stderr, "[WARN] bigwig file exception : %s:%d-%d" % (chrom, span_start, span_end)

                if (math.isnan(c_value)):
                    c_value = 0.

                # add pseudocount to circumvent log issues
                c_value += 0.001

                if (c_value > max_value):
                    max_value = c_value

                f_chromatin.write("%s\t%d\t%d\t%.3f\n" % (chrom, span_start, span_end, c_value))

        # close filehandle
        f_chromatin.close()

    if (options.verbose):
        print "Maximal chromatin value: %.3f" % (max_value)

    # add track to configuration file
    if (not f_configuration.has_key("plots")):
        f_configuration["plots"] = {}
        f_configuration["plots"]["type"] = "heatmap"
        f_configuration["plots"]["color"] = "spectral-9-div"
        f_configuration["plots"]["stroke_thickness"] = "1"
        f_configuration["plots"]["stroke_color"] = "black"
    if (not f_configuration["plots"].has_key("plot")):
        f_configuration["plots"]["plot"] = []

    stroke_thickness = 0
    if (options.chromatinResolution >= 0.5):
        stroke_thickness = 1

    f_configuration["plots"]["plot"] += ['''
<plot>
show             = conf(show_histogram)
type             = histogram
fill_under       = yes
fill_color       = lgrey
file             = %s
r0               = 0.95r
r1               = 0.997r
scale_log_base   = 5
stroke_thickness = %d
color            = black
min              = 0.0
max              = %.3f
<axes>
show      = data
thickness = 1
color     = lgrey
<axis>
spacing = 0.1r
</axis>
</axes>
</plot>
''' % (options.output_dir + "chromatin.txt", stroke_thickness, max_value)]

    if (options.verbose):
        print "Finished chromatin processing"
# chroms, trScores, maxScore and maxGene are assumed to be initialised
# earlier in the script
pol2 = BigWigFile(open('/Users/JME/Documents/CSE549/hg19.bigWig'))
annotations = open('/Users/JME/Documents/CSE549/hg19.txt', 'r')

for line in annotations:
    lines = line.split('\t')
    if (lines[1] in chroms.keys()):
        chroms[lines[1]][int(lines[3])] = [int(lines[4]), lines[2]]

for i in chroms.keys():
    for j in chroms[i]:
        if not j in trScores.keys():
            if chroms[i][j][1] == '-':
                start = chroms[i][j][0]
                end = j
            else:
                start = j
                end = chroms[i][j][0]

            # mean signal around the transcription start vs. the gene body
            startTr = pol2.query(i, start - 30, start + 300, 1)
            endTr = pol2.query(i, start + 301, end, 1)

            if startTr and endTr:
                if startTr[0]['mean'] > 0 and endTr[0]['mean'] > 0:
                    trScores[j] = (startTr[0]['mean']) / (endTr[0]['mean'])
                    if trScores[j] > maxScore:
                        maxScore = trScores[j]
                        maxGene = j

pickle.dump(chroms, open('chrom_data.p', 'wb'))
pickle.dump(trScores, open('scores.p', 'wb'))

print("Max: " + str(maxScore) + ", gene: " + str(maxGene))
def process():
    # get the data
    readdata()

    # set chromatin flag
    withChromatin = 'false'
    if (options.chromatin != ""):
        withChromatin = 'true'

    samfile = ""
    bigwig = ""
    # check if chromatin file has been supplied
    if (options.chromatin != "" and options.chromatinFormat == "bam"):
        import pysam  # lazy import of required module
        # try to open the bam file
        try:
            samfile = pysam.Samfile(options.chromatin, "rb")
        except:
            exit(1)
    elif (options.chromatin != "" and options.chromatinFormat == "bigwig"):
        from bx.bbi.bigwig_file import BigWigFile  # lazy import of required module
        # try to open the bigwig file
        try:
            bigwig = BigWigFile(open(options.chromatin))
        except:
            exit(1)

    processors = 1
    if (options.processors <= 0):
        # figure out number of processors
        try:
            processors = multiprocessing.cpu_count()
        except:
            processors = 1
    else:
        processors = options.processors

    output_ontargets = ""  # gets filled with a json object for on-targets

    # process one on-target region/locus of interest (LOI) at a time
    for loi in targets.keys():
        # info for whole region
        bed = loiBed[loi]
        chrom = bed[0]
        cstart = int(bed[1])
        cstop = int(bed[2])
        clabel = bed[3]
        cstrand = bed[5]
        tts = records[loi].seq

        # process individual eligible target sites in this region/locus
        for target in targets[loi]:
            tstart = int(target[1])
            tstop = int(target[2])
            targetid = target[3]
            tstrand = target[5]
            tsize = tstop - tstart
            lflank = min(tstart - cstart, options.flanks)
            rflank = min(cstop - tstop, options.flanks)

            if (cstrand == "+"):
                targetSeq = tts[tstart - cstart - lflank:tstop - cstart + rflank]
            else:
                targetSeq = tts.reverse_complement()[tstart - cstart - lflank:tstop - cstart + rflank]

            targetStr = [" " for i in range(lflank + tsize + rflank)]
            for i in range(lflank, lflank + tsize):
                targetStr[i] = '+'

            if (target[8] != ""):
                for s in target[8].strip('d').split('d'):
                    if (cstrand == "+"):
                        targetStr[lflank + int(s)] = '_'
                    else:
                        targetStr[lflank + tsize - int(s) - 1] = '_'

            on_chromatin_features = 0.
            if (options.chromatin != "" and options.chromatinFormat == "bam"):
                try:
                    for pileupcolumn in samfile.pileup(chrom, tstart - lflank, tstop + rflank):
                        on_chromatin_features += pileupcolumn.n
                    on_chromatin_features = "%.3f" % (on_chromatin_features / (lflank + tsize + rflank))
                except:
                    print >> sys.stderr, "[WARN] bam file exception : %s:%d-%d" % (chrom, tstart - lflank, tstop + rflank)
            elif (options.chromatin != "" and options.chromatinFormat == "bigwig"):
                try:
                    bwsummary = bigwig.query(chrom, tstart - lflank, tstop + rflank, 1)
                    on_chromatin_features = "%.3f" % (bwsummary[0]["mean"])
                except:
                    print >> sys.stderr, "[WARN] bigwig file exception : %s:%d-%d" % (chrom, tstart - lflank, tstop + rflank)
            else:
                on_chromatin_features = "-"

            # hashing start positions (w/r/t on-target) and encoding
            # (grouping off-targets)
            off_targets = {}

            # branch point
            if (processors == 1):
                # no need to multiprocess
                for tfo_region in tfotargets[targetid].keys():
                    # process each error category
                    for tfo_error in tfotargets[targetid][tfo_region].keys():
                        processOfftarget(off_targets, targetid, tfo_region, tfo_error, lflank, rflank, tsize, tstrand, samfile, bigwig)
            else:
                # distribute workload
                # load up work queue
                work_queue = multiprocessing.JoinableQueue(processors * 3)
                # create a queue to pass to workers to store the results
                result_queue = multiprocessing.Queue()

                num_jobs = 0
                for tfo_region in tfotargets[targetid].keys():
                    num_jobs += len(tfotargets[targetid][tfo_region])

                # spawn workers
                for i in range(min(num_jobs, processors)):
                    worker = Worker(work_queue, result_queue, options.chromatin, options.chromatinFormat)
                    worker.daemon = True
                    worker.start()

                # process off-targets w/r/t the on-target,
                # one intersecting interval at a time
                for tfo_region in tfotargets[targetid].keys():
                    # process each error category
                    for tfo_error in tfotargets[targetid][tfo_region].keys():
                        work_queue.put(Job(targetid, tfo_region, tfo_error, lflank, rflank, tsize, tstrand))

                work_queue.close()
                work_queue.join()

                for job in range(num_jobs):
                    result = result_queue.get()
                    for offset in result.keys():
                        if (not off_targets.has_key(offset)):
                            off_targets[offset] = {}
                        for encoding in result[offset].keys():
                            if (not off_targets[offset].has_key(encoding)):
                                off_targets[offset][encoding] = result[offset][encoding]
                # merge point

            output_sequence_w = ""
            output_sequence_c = ""
            for i in range(len(targetStr)):
                if (targetStr[i] == " "):
                    output_sequence_w += targetSeq.lower()[i]
                    output_sequence_c += targetSeq.complement().lower()[i]
                else:
                    output_sequence_w += targetSeq.upper()[i]
                    output_sequence_c += targetSeq.complement().upper()[i]

            # generate aoColumns (data table header)
            output_offtargets = ""
            for i in range(len(annotations)):
                output_offtargets += (',\n        { "sTitle":"%s", "sToolTip":"number of off-targets intersecting with annotation data: %s" }' % (annotations[i], annotations[i]))
            output_offtargets += (',\n        { "sTitle":"","bSortable":false,"bSearchable":false,"sClass":"btn_details","sToolTip":"Toggle detailed off-target inspector" }\n    ],\n    "aaData":[')

            # generate aaData (data table content)
            for offset in off_targets.keys():
                for encoding in off_targets[offset]:
                    ots = off_targets[offset][encoding]
                    # format chromatin data if available
                    if (options.chromatin != ""):
                        ots["chromatin"] = "%.3f" % (ots["chromatin_max"])
                    else:
                        ots["chromatin"] = "-"

                    output_offtargets += '\n        ["%s",0,%d,%d,"%d-%d",%d,"%s",%d,[' % (encoding, ots["errors"], offset, offset - lflank, offset + len(encoding) - lflank, ots["copies"], ots["chromatin"], ots["length"],)
                    for ot in ots["data"]:
                        output_offtargets += '\n            ["%s",%d,%d,"%s",%s],' % (ot)
                    output_offtargets = output_offtargets[:-1] + "\n        ]"
                    if (len(annotations) > 0):
                        output_offtargets += ",%s" % (",".join(["%s" % el for el in ots["annotation"]]))
                    output_offtargets += ',""],'

            # finalise off-target json object and write output
            output_offtargets = ('''{
    "bPaginate": true,
    "bProcessing": true,
    "bAutoWidth": false,
    "bInfo": true,
    "bLengthChange": true,
    "bFilter": true,
    "iDisplayLength": 10,
    "aLengthMenu": [[10, 50, 100, -1], [10, 50, 100, "All"]],
    "bJQueryUI": true,
    "bDeferRender": true,
    "aaSorting": [[2, "asc"]],
    "aoColumns": [
        { "sTitle":"%s","sClass":"monospace left", "sToolTip":"shows the part of the primary target that is responsible for off-targets" },
        { "sTitle":"overlap", "sType": "numeric","sToolTip":"number of nucleotide positions off-targets overlap the primary target" },
        { "sTitle":"errors", "sType": "numeric","sToolTip":"expected number of mismatches between the triplex-forming molecule designed against the primary target and off-targets" },
        { "sTitle":"offset","sType": "numeric","bSearchable":false,"bVisible":false, "sToolTip":"the offset is used internally for computing the lefthand-side alignment" },
        { "sTitle":"sub region", "sToolTip":"region of the primary target this off-target spans" },
        { "sTitle":"copies", "sType": "numeric","sToolTip":"number of copies of this off-target category" },
        { "sTitle":"max. chromatin", "bVisible": %s, "sToolTip":"maximal chromatin score observed in any of the off-targets" },
        { "sTitle":"length", "sType": "numeric","sToolTip":"length of the off-target", "bVisible":false },
        { "sTitle":"offtargets","bVisible": false, "sToolTip":"detailed off-target list with location and annotation" }
''' + output_offtargets[:-1] + "\n    ]\n}") % (output_sequence_w + "<br/>" + output_sequence_c, withChromatin)

            output = open(options.output_dir + targetid + "_off_targets.json", "w")
            output.write(output_offtargets)
            output.close()

            # pretty sequence encoding for on-target region
            output_sequence_w = ""
            output_sequence_c = ""
            lastpos = " "
            interruptions = 0
            for i in range(len(targetStr)):
                if (targetStr[i] != lastpos):
                    if (lastpos != " "):
                        output_sequence_w += "</span>"
                        output_sequence_c += "</span>"
                    if (targetStr[i] == "+"):
                        output_sequence_w += "<span class='valid_triad'>"
                        output_sequence_c += "<span class='valid_triad'>"
                    elif (targetStr[i] == "_"):
                        output_sequence_w += "<span class='invalid_triad'>"
                        output_sequence_c += "<span class='invalid_triad'>"
                    lastpos = targetStr[i]
                if (targetStr[i] == "_"):
                    interruptions += 1
                if (targetStr[i] == " "):
                    output_sequence_w += targetSeq.lower()[i]
                    output_sequence_c += targetSeq.complement().lower()[i]
                else:
                    output_sequence_w += targetSeq.upper()[i]
                    output_sequence_c += targetSeq.complement().upper()[i]
            if (lastpos == "+" or lastpos == "_"):
                output_sequence_w += "</span>"
                output_sequence_c += "</span>"

            output_sequence = "<span class='monospace flank_triad'>5'-" + output_sequence_w + "-3'<br/>3'-" + output_sequence_c + "-5'</span>"

            # count the number of off-targets
            offtarget_counter = 0
            for tfo_region in tfotargets[targetid].keys():
                for tfo_errors in tfotargets[targetid][tfo_region].keys():
                    offtarget_counter += len(tfotargets[targetid][tfo_region][tfo_errors])

            output_ontargets += '\n\t\t["%s","%s",%d,%d,%d,"%s",%d,"%s",%d,"%s",%d,%d],' % (targetid, chrom, tstart, tstop, (tstop - tstart), on_chromatin_features, interruptions, output_sequence, offtarget_counter, tstrand, lflank, rflank)

    # finalise on-target json object and write output
    output_ontargets = '''{
    "bPaginate": true,
    "bProcessing": true,
    "bAutoWidth": false,
    "bInfo": true,
    "bLengthChange": true,
    "bFilter": false,
    "iDisplayLength": 10,
    "bJQueryUI": true,
    "aLengthMenu": [[5, 10, 25, -1], [5, 10, 25, "All"]],
    "aoColumns": [
        { "sTitle": "region Id", "sToolTip":"internal id used to identify regions of overlapping putative primary targets" },
        { "sTitle": "chr", "sToolTip":"chromosome this region is located in" },
        { "sTitle": "start", "sType": "numeric", "sToolTip":"chromosomal start position of this region" },
        { "sTitle": "end", "sType": "numeric", "sToolTip":"chromosomal end position of this region" },
        { "sTitle": "length", "sType": "numeric", "sToolTip":"number of nucleotides spanned by this target region" },
        { "sTitle": "chromatin", "bVisible": ''' + withChromatin + ''', "sToolTip":"chromatin score averaged over the shown region" },
        { "sTitle": "Y-interruptions", "sType": "numeric", "sToolTip":"number of pyrimidine interruptions in the polypurine/polypyrimidine tract of the region" },
        { "sTitle": "on-target region", "bSortable": false, "sClass": "left", "sToolTip":"sequence of the region (plus additional flanking positions if specified)" },
        { "sTitle": "off-targets", "bSearchable": false, "bVisible": false, "sToolTip":"total number of off-targets accumulated over all putative primary targets in this region" },
        { "sTitle": "strand", "bSearchable": false, "bVisible": false, "sToolTip":"strand on which the purine tract is located" },
        { "sTitle": "offset left", "sType": "numeric", "bSearchable": false, "bVisible": false, "sToolTip":"number of upstream flanking positions shown in the sequence" },
        { "sTitle": "offset right", "sType": "numeric", "bSearchable": false, "bVisible": false, "sToolTip":"number of downstream flanking positions shown in the sequence" }
    ],
    "aaData": [''' + output_ontargets[:-1] + '\n    ]\n}'

    output = open(options.output_dir + "primary_target_regions.json", "w")
    output.write(output_ontargets)
    output.close()

    # close file handle
    if (options.chromatin != "" and options.chromatinFormat == "bam"):
        try:
            samfile.close()
        except:
            exit(1)
class BigWig(object):
    def __init__(self, filename):
        self.filename = filename
        self.determine_sizes()
        self.bwf = BigWigFile(open(filename))

    def determine_sizes(self):
        self.sizes = {}
        fh = open(self.filename, "rb")
        # read magic number to guess endianness
        magic = fh.read(4)
        if magic == '&\xfc\x8f\x88':
            endianness = '<'
        elif magic == '\x88\x8f\xfc&':
            endianness = '>'
        else:
            raise IOError("The file is not in bigwig format")
        # read the header
        info = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
        self.version = info[0]
        self.zoom_levels = info[1]
        self.chromosome_tree_offset = info[2]
        self.full_data_offset = info[3]
        self.full_index_offset = info[4]
        self.field_count = info[5]
        self.defined_field_count = info[6]
        self.auto_SQL_offset = info[7]
        self.total_summary_offset = info[8]
        self.uncompress_buf_size = info[9]
        # go to the data
        fh.seek(self.chromosome_tree_offset)
        # read magic again
        magic = fh.read(4)
        if magic == '\x91\x8c\xcax':
            endianness = '<'
        elif magic == 'x\xca\x8c\x91':
            endianness = '>'
        else:
            raise ValueError("Wrong magic for this bigwig data file")
        info2 = struct.unpack(endianness + 'IIIQQ', fh.read(28))
        self.block_size = info2[0]
        self.key_size = info2[1]
        self.val_size = info2[2]
        self.item_count = info2[3]
        info3 = struct.unpack(endianness + 'BBH', fh.read(4))
        self.is_leaf = info3[0]
        self.count = info3[2]
        for n in range(self.count):
            format_code = endianness + str(self.key_size) + 'sII'
            info = struct.unpack(format_code, fh.read(self.key_size + 2 * 4))
            key, chrom_id, chrom_size = info
            key = key.replace('\x00', '')
            self.sizes[key] = chrom_size

    def get_as_array(self, chrom, start, end):
        return self.bwf.get_as_array(chrom, start, end)

    def get(self, chrom, start, end):
        return self.bwf.get(chrom, start, end)

    def query(self, chrom, start, end, number):
        return self.bwf.query(chrom, start, end, number)
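# Illustrative use of the wrapper above, assuming an existing bigWig file
# (the path and region are placeholders):
bw = BigWig("signal.bw")
print("chr1 length: %s" % bw.sizes.get("chr1"))  # parsed from the header B-tree
summary = bw.query("chr1", 0, 1000, 10)
if summary is not None:
    print([s["mean"] for s in summary])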