def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """Count reads and compute the GC content of sampled genomic windows.

    The positions to evaluate are obtained from ``getPositionsToSample``
    for the region (start, end). For every sampled position ``i`` the
    window ``[i, i + regionSize)`` is evaluated: its GC content is taken
    from the 2bit file at ``global_vars['2bit']`` and its reads are
    counted in the BAM file at ``global_vars['bam']``.

    chromNameBam    -- chromosome name as used in the BAM file
    start, end      -- limits of the region, passed to getPositionsToSample
    stepSize        -- sampling step, passed to getPositionsToSample
    regionSize      -- length of every evaluated window
    chrNameBamToBit -- dict mapping BAM chromosome names to 2bit names
    verbose         -- if True, print the window and the error whenever
                       the GC content of a window cannot be computed

    Returns a list of ``(number_of_reads, gc)`` tuples, one per window
    whose GC content could be computed.
    """
    from contextlib import closing

    chromNameBit = chrNameBamToBit[chromNameBam]
    sub_reads_per_gc = []

    # both handles are closed on exit, also when an exception propagates
    with closing(py2bit.open(global_vars['2bit'])) as tbit, \
            closing(bamHandler.openBam(global_vars['bam'])) as bam:
        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)
        # the chromosome length does not change while iterating
        chrom_length = tbit.chroms(chromNameBit)

        for i in positions_to_sample:
            region_end = i + regionSize
            # stop if region extends over the chromosome end
            if chrom_length < region_end:
                break

            try:
                gc = getGC_content(tbit, chromNameBit, int(i), int(region_end))
            except Exception as detail:
                # best effort: windows without a usable GC value are skipped
                if verbose:
                    print("{}:{}-{}".format(chromNameBit, i, region_end))
                    print(detail)
                continue

            numberReads = bam.count(chromNameBam, i, region_end)
            sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """Count reads and compute the GC content of sampled genomic windows.

    The positions to evaluate are obtained from ``getPositionsToSample``
    for the region (start, end). For every sampled position ``i`` the
    window ``[i, i + regionSize)`` is evaluated: its sequence is read
    from the 2bit file at ``global_vars['2bit']`` and its reads are
    counted in the BAM file at ``global_vars['bam']``.

    chromNameBam    -- chromosome name as used in the BAM file
    start, end      -- limits of the region, passed to getPositionsToSample
    stepSize        -- sampling step, passed to getPositionsToSample
    regionSize      -- length of every evaluated window
    chrNameBamToBit -- dict mapping BAM chromosome names to 2bit names
    verbose         -- if True, print the window and the error whenever
                       the GC content of a window cannot be computed

    Returns a list of ``(number_of_reads, gc)`` tuples, one per window
    whose GC content could be computed.
    """
    from contextlib import closing

    chromNameBit = chrNameBamToBit[chromNameBam]
    sub_reads_per_gc = []

    # 2bit is a binary format, hence 'rb'; both handles are closed on exit,
    # also when an exception propagates
    with open(global_vars['2bit'], 'rb') as twobit_file, \
            closing(bamHandler.openBam(global_vars['bam'])) as bam:
        tbit = twobit.TwoBitFile(twobit_file)
        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)

        for i in positions_to_sample:
            region_end = i + regionSize
            # stop if region extends over the chromosome end
            if tbit[chromNameBit].size < region_end:
                break

            try:
                gc = getGC_content(tbit[chromNameBit].get(i, region_end))
            except Exception as detail:
                # best effort: windows without a usable GC value are skipped
                if verbose:
                    # single-argument print() behaves the same on
                    # Python 2 and Python 3
                    print("{}:{}-{}".format(chromNameBit, i, region_end))
                    print(detail)
                continue

            numberReads = bam.count(chromNameBam, i, region_end)
            sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment a read belongs to, scaled to
    'fragmentLength'.

    The fragments for forward and reverse reads are defined as follows::

           |- read.pos       |- read.aend
        ---+=================>-----------------------+---------    Forward strand

           |-fragStart                               |-fragEnd

        ---+-----------------------<=================+---------    Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                            read.tlen

    For proper pairs whose absolute template length is below
    2 * fragmentLength the fragment limits are derived from read.tlen.
    In any other case the fragment is assumed to span 'fragmentLength'
    bases starting at the read start (forward reads) or ending at the
    read end (reverse reads).

    tbit           -- 2bit handle, indexed by chromosome name and sliced
                      by position
    read           -- aligned read (pos, aend, tlen, qlen, is_paired,
                      is_proper_pair and is_reverse are used)
    fragmentLength -- fragment length used for unpaired reads and for
                      scaling the result
    chrNameBit     -- chromosome name as used in the 2bit file

    Returns the rounded product of the value returned by getGC_content
    and fragmentLength as an int, or None if getGC_content fails or
    returns None.
    """
    fragStart = None
    fragEnd = None
    if read.is_paired and read.is_proper_pair and \
            abs(read.tlen) < 2 * fragmentLength:
        if read.is_reverse and read.tlen < 0:
            fragEnd = read.aend
            fragStart = read.aend + read.tlen
        elif read.tlen >= read.qlen:
            fragStart = read.pos
            fragEnd = read.pos + read.tlen

    # 'is None' and not a truthiness test: a fragment that starts at
    # position 0 is valid and must not be replaced by the default one
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.aend
            fragStart = read.aend - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength

    # reverse reads close to the chromosome start can yield a negative start
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit[chrNameBit][fragStart:fragEnd],
                           as_fraction=True)
    except Exception:
        # best effort: reads without a usable GC value are reported as None
        return None
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    return int(np.round(gc * fragmentLength))
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment a read belongs to, scaled to
    'fragmentLength'.

    The fragments for forward and reverse reads are defined as follows::

           |- read.pos       |- read.aend
        ---+=================>-----------------------+---------    Forward strand

           |-fragStart                               |-fragEnd

        ---+-----------------------<=================+---------    Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                            read.tlen

    In the code, read.pos, read.aend and read.tlen are accessed through
    their current pysam names reference_start, reference_end and
    template_length.

    For proper pairs whose absolute template length is below
    2 * fragmentLength the fragment limits are derived from the template
    length. In any other case the fragment is assumed to span
    'fragmentLength' bases starting at the read start (forward reads) or
    ending at the read end (reverse reads).

    tbit           -- 2bit handle passed on to getGC_content
    read           -- aligned read (pysam AlignedSegment attributes are used)
    fragmentLength -- fragment length used for unpaired reads and for
                      scaling the result
    chrNameBit     -- chromosome name as used in the 2bit file

    Returns the rounded product of the value returned by getGC_content
    and fragmentLength as an int, or None if getGC_content fails or
    returns None.
    """
    fragStart = None
    fragEnd = None
    template_length = read.template_length
    if read.is_paired and read.is_proper_pair and \
            abs(template_length) < 2 * fragmentLength:
        if read.is_reverse and template_length < 0:
            fragEnd = read.reference_end
            fragStart = read.reference_end + template_length
        elif template_length >= read.query_alignment_length:
            fragStart = read.reference_start
            fragEnd = read.reference_start + template_length

    # 'is None' and not a truthiness test: a fragment that starts at
    # position 0 is valid and must not be replaced by the default one
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.reference_end
            fragStart = read.reference_end - fragmentLength
        else:
            fragStart = read.reference_start
            fragEnd = fragStart + fragmentLength

    # reverse reads close to the chromosome start can yield a negative start
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit, chrNameBit, fragStart, fragEnd)
    except Exception:
        # best effort: reads without a usable GC value are reported as None
        return None
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    return int(np.round(gc * fragmentLength))
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment a read belongs to, scaled to
    'fragmentLength'.

    The fragments for forward and reverse reads are defined as follows::

           |- read.pos       |- read.aend
        ---+=================>-----------------------+---------    Forward strand

           |-fragStart                               |-fragEnd

        ---+-----------------------<=================+---------    Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                            read.tlen

    For proper pairs whose absolute template length is below
    2 * fragmentLength the fragment limits are derived from read.tlen.
    In any other case the fragment is assumed to span 'fragmentLength'
    bases starting at the read start (forward reads) or ending at the
    read end (reverse reads).

    tbit           -- 2bit handle, indexed by chromosome name; the
                      sequence is read with .get(start, end)
    read           -- aligned read (pos, aend, tlen, qlen, is_paired,
                      is_proper_pair and is_reverse are used)
    fragmentLength -- fragment length used for unpaired reads and for
                      scaling the result
    chrNameBit     -- chromosome name as used in the 2bit file

    Returns the rounded product of the value returned by getGC_content
    and fragmentLength as an int, or None if getGC_content fails or
    returns None.
    """
    fragStart = None
    fragEnd = None
    if read.is_paired and read.is_proper_pair and \
            abs(read.tlen) < 2 * fragmentLength:
        if read.is_reverse and read.tlen < 0:
            fragEnd = read.aend
            fragStart = read.aend + read.tlen
        elif read.tlen >= read.qlen:
            fragStart = read.pos
            fragEnd = read.pos + read.tlen

    # 'is None' and not a truthiness test: a fragment that starts at
    # position 0 is valid and must not be replaced by the default one
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.aend
            fragStart = read.aend - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength

    # reverse reads close to the chromosome start can yield a negative
    # start; clamp it instead of requesting a negative coordinate
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit[chrNameBit].get(fragStart, fragEnd),
                           as_fraction=True)
    except Exception:
        # best effort: reads without a usable GC value are reported as None
        return None
    if gc is None:
        # nothing to scale; avoids a TypeError in the multiplication below
        return None

    # match the gc to the given fragmentLength
    return int(np.round(gc * fragmentLength))
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r"""
    Given a genome region, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    chromNameBam    -- chromosome name as used in the BAM file
    start, end      -- limits of the region, passed to getPositionsToSample
    stepSize        -- sampling step, passed to getPositionsToSample
    fragmentLength  -- dict; fragmentLength['median'] is the length of
                       the evaluated fragments
    chrNameBamToBit -- dict mapping BAM chromosome names to 2bit names
    verbose         -- if True, print timing and progress information

    Returns the tuple (subN_gc, subF_gc) of integer arrays of length
    fragmentLength['median'] + 1. subN_gc[g] is the number of sampled
    positions whose fragment has GC value g; subF_gc[g] is the number of
    forward reads starting at those positions. Positions having
    global_vars['max_reads'] or more reads add to subN_gc only.

    Raises NameError if start is bigger than end.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}

    Test for the filter out option

    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ...                                       {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]

    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option

    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ...                                {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]
    """
    from contextlib import closing

    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]
    frag_len = fragmentLength['median']

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(frag_len + 1, dtype='int')
    subF_gc = np.zeros(frag_len + 1, dtype='int')

    progress_msg = "%s processing %d (%.1f per sec) @ %s:%s-%s %s"

    # both handles are closed on exit, also when an exception propagates
    with closing(py2bit.open(global_vars['2bit'])) as tbit, \
            closing(bamHandler.openBam(global_vars['bam'])) as bam:
        startTime = time.time()

        if verbose:
            print("[{:.3f}] computing positions to "
                  "sample".format(time.time() - startTime))

        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)

        # Optimize IO.
        # if the sample regions are far apart from each
        # other is faster to go to each location and fetch
        # the reads found there.
        # Otherwise, if the regions to sample are close to
        # each other, is faster to load all the reads in
        # a large region into memory and consider only
        # those falling into the positions to sample.
        # The following code gets the reads
        # that are at sampling positions that lie close together.
        # None means that no cache was built. The mean distance is
        # undefined (nan) for less than two positions, so that case
        # never uses the cache.
        read_counts = None
        if len(positions_to_sample) > 1 and \
                np.mean(np.diff(positions_to_sample)) < 1000:
            start_pos = min(positions_to_sample)
            end_pos = max(positions_to_sample)
            if verbose:
                print("[{:.3f}] caching reads".format(
                    time.time() - startTime))

            counts = np.bincount(
                [r.pos - start_pos
                 for r in bam.fetch(chromNameBam, start_pos, end_pos + 1)
                 if not r.is_reverse and r.pos >= start_pos],
                minlength=end_pos - start_pos + 2)

            read_counts = counts[positions_to_sample - start_pos]
            if verbose:
                print("[{:.3f}] finish caching reads.".format(
                    time.time() - startTime))

        countTime = time.time()
        # the chromosome length does not change while iterating
        chrom_length = tbit.chroms(chromNameBit)

        # defined up front so that the final report also works when
        # there is nothing to sample
        index = 0
        for index, i in enumerate(positions_to_sample):
            # stop if the end of the chromosome is reached
            if i + frag_len > chrom_length:
                break

            try:
                gc = getGC_content(tbit, chromNameBit, int(i),
                                   int(i + frag_len), fraction=False)
            except Exception as detail:
                # best effort: positions without a usable GC are skipped
                if verbose:
                    print(detail)
                continue

            subN_gc[gc] += 1

            # count all reads at position 'i'
            if read_counts is None:  # case when no cache was done
                num_reads = sum(1 for x in bam.fetch(chromNameBam, i, i + 1)
                                if not x.is_reverse and x.pos == i)
            else:
                num_reads = read_counts[index]

            # positions covered by too many reads do not add to subF_gc
            if num_reads >= global_vars['max_reads']:
                continue

            subF_gc[gc] += num_reads
            if verbose and index % 50000 == 0:
                elapsed = time.time() - countTime
                # a coarse clock can report no elapsed time at all
                rate = index / elapsed if elapsed > 0 else 0.0
                print(progress_msg % (multiprocessing.current_process().name,
                                      index, rate,
                                      chromNameBit, start, end, stepSize))

        if verbose:
            endTime = time.time()
            elapsed = endTime - countTime
            rate = index / elapsed if elapsed > 0 else 0.0
            print(progress_msg % (multiprocessing.current_process().name,
                                  index, rate,
                                  chromNameBit, start, end, stepSize))
            print("%s total time %.1f @ %s:%s-%s %s" %
                  (multiprocessing.current_process().name,
                   (endTime - startTime), chromNameBit, start, end,
                   stepSize))

    return subN_gc, subF_gc
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r"""
    Given a genome region, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    chromNameBam    -- chromosome name as used in the BAM file
    start, end      -- limits of the region, passed to getPositionsToSample
    stepSize        -- sampling step, passed to getPositionsToSample
    fragmentLength  -- dict; fragmentLength['median'] is the length of
                       the evaluated fragments
    chrNameBamToBit -- dict mapping BAM chromosome names to 2bit names
    verbose         -- if True, print timing and progress information

    Returns the tuple (subN_gc, subF_gc) of integer arrays of length
    fragmentLength['median'] + 1. subN_gc[g] is the number of sampled
    positions whose fragment has GC value g; subF_gc[g] is the number of
    forward reads starting at those positions. Positions having
    global_vars['max_reads'] or more reads add to subN_gc only.

    Raises NameError if start is bigger than end.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}

    Test for the filter out option

    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ...                                       {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]

    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option

    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ...                                {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]
    """
    from contextlib import closing

    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]
    frag_len = fragmentLength['median']

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(frag_len + 1, dtype='int')
    subF_gc = np.zeros(frag_len + 1, dtype='int')

    progress_msg = "%s processing %d (%.1f per sec) @ %s:%s-%s %s"

    # 2bit is a binary format, hence 'rb'; both handles are closed on exit,
    # also when an exception propagates
    with open(global_vars['2bit'], 'rb') as twobit_file, \
            closing(bamHandler.openBam(global_vars['bam'])) as bam:
        tbit = twobit.TwoBitFile(twobit_file)
        startTime = time.time()

        # single-argument print() behaves the same on Python 2 and 3
        if verbose:
            print("[{:.3f}] computing positions to "
                  "sample".format(time.time() - startTime))

        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)

        # Optimize IO.
        # if the sample regions are far apart from each
        # other is faster to go to each location and fetch
        # the reads found there.
        # Otherwise, if the regions to sample are close to
        # each other, is faster to load all the reads in
        # a large region into memory and consider only
        # those falling into the positions to sample.
        # The following code gets the reads
        # that are at sampling positions that lie close together.
        # None means that no cache was built. The mean distance is
        # undefined (nan) for less than two positions, so that case
        # never uses the cache.
        read_counts = None
        if len(positions_to_sample) > 1 and \
                np.mean(np.diff(positions_to_sample)) < 1000:
            start_pos = min(positions_to_sample)
            end_pos = max(positions_to_sample)
            if verbose:
                print("[{:.3f}] caching reads".format(
                    time.time() - startTime))

            counts = np.bincount(
                [r.pos - start_pos
                 for r in bam.fetch(chromNameBam, start_pos, end_pos + 1)
                 if not r.is_reverse and r.pos >= start_pos],
                minlength=end_pos - start_pos + 2)

            read_counts = counts[positions_to_sample - start_pos]
            if verbose:
                print("[{:.3f}] finish caching reads.".format(
                    time.time() - startTime))

        countTime = time.time()

        # defined up front so that the final report also works when
        # there is nothing to sample
        index = 0
        for index, i in enumerate(positions_to_sample):
            # stop if the end of the chromosome is reached
            if i + frag_len > tbit[chromNameBit].size:
                break

            try:
                gc = getGC_content(
                    tbit[chromNameBit].get(i, i + frag_len),
                    as_fraction=False)
            except Exception as detail:
                # best effort: positions without a usable GC are skipped
                if verbose:
                    print(detail)
                continue

            subN_gc[gc] += 1

            # count all reads at position 'i'
            if read_counts is None:  # case when no cache was done
                num_reads = sum(1 for x in bam.fetch(chromNameBam, i, i + 1)
                                if not x.is_reverse and x.pos == i)
            else:
                num_reads = read_counts[index]

            # positions covered by too many reads do not add to subF_gc
            if num_reads >= global_vars['max_reads']:
                continue

            subF_gc[gc] += num_reads
            if verbose and index % 50000 == 0:
                elapsed = time.time() - countTime
                # a coarse clock can report no elapsed time at all
                rate = index / elapsed if elapsed > 0 else 0.0
                print(progress_msg % (multiprocessing.current_process().name,
                                      index, rate,
                                      chromNameBit, start, end, stepSize))

        if verbose:
            endTime = time.time()
            elapsed = endTime - countTime
            rate = index / elapsed if elapsed > 0 else 0.0
            print(progress_msg % (multiprocessing.current_process().name,
                                  index, rate,
                                  chromNameBit, start, end, stepSize))
            print("%s total time %.1f @ %s:%s-%s %s" %
                  (multiprocessing.current_process().name,
                   (endTime - startTime), chromNameBit, start, end,
                   stepSize))

    return subN_gc, subF_gc