def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)

    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        chrom = mungeChromosome(chrom, fh.references)

        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
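
# The fragment-length filters above call a getTLen() helper defined elsewhere.
# Below is a minimal sketch of what it is assumed to do; this is an
# illustration under stated assumptions, not the library's actual helper.
def getTLen(read):
    # Absolute template length from the TLEN field of a pysam AlignedSegment
    tLen = abs(read.template_length)
    if tLen == 0:
        # Unpaired read or unset TLEN: approximate with the aligned length
        tLen = read.query_alignment_length
    return tLen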
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)

    olist = []
    for f in args.bamfiles:
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        chrom = mungeChromosome(chrom, fh.references)

        prev_start_pos = None  # to store the start positions
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            if args.minFragmentLength > 0 and abs(read.template_length) < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and abs(read.template_length) > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates and prev_start_pos \
                    and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                continue
            prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features
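
# A worker with this (chrom, start, end, args, ...) calling convention is
# typically dispatched over many genomic chunks in parallel. Below is a
# minimal dispatch sketch; run_workers and nproc are hypothetical names, and
# the real tool dispatches through its own map/reduce machinery rather than
# this exact code.
import multiprocessing

def run_workers(regions, args, defaultFragmentLength, nproc=4):
    # regions: iterable of (chrom, start, end) tuples
    tasks = [(chrom, start, end, args, defaultFragmentLength)
             for chrom, start, end in regions]
    with multiprocessing.Pool(nproc) as pool:
        # One task per region; each result is the worker's return tuple
        return pool.map(getEnrichment_worker, tasks)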
def preload(self, regions, tmpDir=None):
    """
    Given a sample and a set of regions, write a bigWig file containing
    the underlying signal. This function returns the file name, which
    needs to be deleted by the calling function at some point.

    This sends queries one chromosome at a time, due to memory limits on deepBlue
    """
    startTime = datetime.datetime.now()
    regions2 = mergeRegions(regions)

    # Make a temporary file
    f = tempfile.NamedTemporaryFile(delete=False, dir=tmpDir)
    fname = f.name
    f.close()

    # Start with the bigWig file
    bw = pyBigWig.open(fname, "w")
    bw.addHeader(self.chromsTuple, maxZooms=0)  # This won't work in IGV!

    # Make a string out of everything in a reasonable order
    for k, v in self.chromsTuple:
        # Munge chromosome names as appropriate
        chrom = mungeChromosome(k, regions2.keys())
        if not chrom:
            continue
        if chrom not in regions2 or len(regions2) == 0:
            continue

        regionsStr = "\n".join(["{}\t{}\t{}".format(k, reg[0], reg[1]) for reg in regions2[chrom]])
        regionsStr += "\n"

        # Send the regions
        (status, regionsID) = self.server.input_regions(self.genome, regionsStr, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while sending regions for '{}': {}".format(regionsID, self.sample))

        # Get the experiment information
        (status, queryID) = self.server.select_experiments(self.sample, k, None, None, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while running select_experiments on file '{}': {}".format(self.sample, queryID))
        if not queryID:
            raise RuntimeError("Somehow, we received None as a query ID (file '{}')".format(self.sample))

        # Intersect
        (status, intersectID) = self.server.intersection(queryID, regionsID, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while running intersection on file '{}': {}".format(self.sample, intersectID))
        if not intersectID:
            raise RuntimeError("Somehow, we received None as an intersect ID (file '{}')".format(self.sample))

        # Query the regions
        (status, reqID) = self.server.get_regions(intersectID, "START,END,VALUE", self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while fetching regions in file '{}': {}".format(self.sample, reqID))

        # Wait for the server to process the data
        (status, info) = self.server.info(reqID, self.userKey)
        request_status = info[0]["state"]
        while request_status != "done" and request_status != "failed":
            time.sleep(0.1)
            (status, info) = self.server.info(reqID, self.userKey)
            request_status = info[0]["state"]

        # Get the actual data
        (status, resp) = self.server.get_request_data(reqID, self.userKey)
        if status != "okay":
            raise RuntimeError("Received the following error while fetching data in file '{}': {}".format(self.sample, resp))

        for intervals in resp.split("\n"):
            interval = intervals.split("\t")
            if interval[0] == '':
                continue
            bw.addEntries([k], [int(interval[0])], ends=[int(interval[1])], values=[float(interval[2])])
    bw.close()
    sys.stderr.write("{} done (took {})\n".format(self.sample, datetime.datetime.now() - startTime))
    sys.stderr.flush()
    return fname
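
# The docstring above makes the caller responsible for deleting the returned
# bigWig file. Below is a minimal usage sketch; 'loader' and 'regions' are
# hypothetical placeholders for whatever object exposes preload() and its
# region set.
import os
import pyBigWig

fname = loader.preload(regions, tmpDir="/tmp")
try:
    bw = pyBigWig.open(fname)
    # ... consume the preloaded signal ...
    bw.close()
finally:
    os.remove(fname)  # preload() never deletes its own temporary file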
def getFiltered_worker(arglist):
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        chromUse = utilities.mungeChromosome(chrom, fh.references)
        prev_pos = set()
        lpos = None

        minMapq = 0
        samFlagInclude = 0
        samFlagExclude = 0
        internalDupes = 0
        externalDupes = 0
        singletons = 0
        filterRNAstrand = 0
        nFiltered = 0
        total = 0  # This is only used to estimate the percentage affected
        for read in fh.fetch(chromUse, start, end):
            filtered = 0
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered = 1
                minMapq += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered = 1
                samFlagInclude += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered = 1
                samFlagExclude += 1
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if read.tlen >= 0:
                    s = read.pos
                    e = s + read.tlen
                else:
                    s = read.pnext
                    e = s - read.tlen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    filtered = 1
                    internalDupes += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            if read.is_duplicate:
                filtered = 1
                externalDupes += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered = 1
                singletons += 1

            # filterRNAstrand
            if args.filterRNAstrand:
                if read.is_paired:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                else:
                    if args.filterRNAstrand == 'forward':
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1
                    elif args.filterRNAstrand == 'reverse':
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered = 1
                            filterRNAstrand += 1

            total += 1
            nFiltered += filtered
        fh.close()

        # Append a tuple to the output
        tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude, internalDupes, externalDupes, singletons, filterRNAstrand)
        o.append(tup)
    return o
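
# Each tuple appended above is per-region and per-file; summing them
# element-wise across regions gives the overall picture for one BAM file.
# Below is a minimal aggregation sketch (summarize_filtering is a hypothetical
# name). Note that one read can trip several filters, so nFiltered is not
# simply the sum of the individual counters.
def summarize_filtering(tuples):
    totals = [sum(col) for col in zip(*tuples)]
    total, nFiltered = totals[0], totals[1]
    frac = float(nFiltered) / total if total > 0 else 0.0
    return totals, frac

# Example with made-up counts from two regions of one BAM file
tuples = [(100, 10, 4, 0, 0, 3, 2, 1, 0),
          (200, 30, 10, 0, 0, 12, 5, 3, 0)]
totals, frac = summarize_filtering(tuples)
print(totals, frac)  # [300, 40, 14, 0, 0, 15, 7, 4, 0] 0.133...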
def getEnrichment_worker(arglist):
    """
    This is the worker function of plotEnrichment.

    In short, given a region, iterate over all reads **starting** in it.
    Filter/extend them as requested and check each for an overlap with
    findOverlaps. For each overlap, increment the counter for that feature.
    """
    chrom, start, end, args, defaultFragmentLength = arglist
    if args.verbose:
        sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))

    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels)

    olist = []
    total = [0] * len(args.bamfiles)
    for idx, f in enumerate(args.bamfiles):
        odict = dict()
        for x in gtf.features:
            odict[x] = 0

        fh = openBam(f)
        chrom = mungeChromosome(chrom, fh.references)

        lpos = None
        prev_pos = set()
        for read in fh.fetch(chrom, start, end):
            # Filter
            if read.pos < start:
                # Ensure that a given alignment is processed only once
                continue
            if read.flag & 4:
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                continue
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                continue
            tLen = getTLen(read)
            if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
                continue
            if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
                continue
            if args.ignoreDuplicates:
                # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                if tLen >= 0:
                    s = read.pos
                    e = s + tLen
                else:
                    s = read.pnext
                    e = s - tLen
                if read.reference_id != read.next_reference_id:
                    e = read.pnext
                if lpos is not None and lpos == read.reference_start \
                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                    continue
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
            total[idx] += 1

            # Get blocks, possibly extending
            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))

            if features is not None and len(features) > 0:
                for x in features:
                    odict[x] += 1
        olist.append(odict)
    return olist, gtf.features, total
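
# The set-based bookkeeping above catches duplicates even when several reads
# share a start position, unlike the single prev_start_pos comparison in the
# earlier variant. A standalone illustration of the key logic, with made-up
# values:
reads = [
    # (reference_start, tlen, pnext, next_reference_id, is_reverse)
    (1000, 150, 1100, 0, False),
    (1000, 150, 1100, 0, False),  # identical fragment: flagged as duplicate
    (1000, -150, 900, 0, True),   # same start, different fragment: kept
]

lpos = None
prev_pos = set()
kept = 0
for ref_start, tlen, pnext, next_ref, is_rev in reads:
    # Use the fragment bounds as the duplicate key, as in the worker above
    if tlen >= 0:
        s, e = ref_start, ref_start + tlen
    else:
        s, e = pnext, pnext - tlen
    key = (s, e, next_ref, is_rev)
    if lpos is not None and lpos == ref_start and key in prev_pos:
        continue  # duplicate
    if lpos != ref_start:
        prev_pos.clear()
    lpos = ref_start
    prev_pos.add(key)
    kept += 1
print(kept)  # 2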