def writeBedGraph_worker(chrom, start, end, tileSize, defaultFragmentLength,
                         bamFilesList, func, funcArgs, extendPairedEnds=True,
                         smoothLength=0, zerosToNans=True,
                         minMappingQuality=None, ignoreDuplicates=False,
                         fragmentFromRead_func=None, centerRead=False,
                         samFlag=None):
    r"""
    Writes a bedgraph having as base a number of bam files.

    The per-tile coverage of every bam file is computed with
    ``getCoverageOfRegion`` and the given ``func`` is called as
    ``func(tileCoverage, funcArgs)`` to reduce the list of per-file
    coverages of a tile into the single value that is written.
    Consecutive tiles having the same value are merged into one
    bedgraph line; runs whose value is NaN are not written.

    Parameters
    ----------
    chrom, start, end :
        region to process. ``start`` must not be bigger than ``end``.
    tileSize : int
        length in bp covered by each coverage value.
    defaultFragmentLength, extendPairedEnds, zerosToNans,
    minMappingQuality, ignoreDuplicates, fragmentFromRead_func,
    centerRead, samFlag :
        passed unchanged to ``getCoverageOfRegion``.
    bamFilesList : list
        bam files to open with ``openBam``.
    func : callable
        called as ``func(tileCoverage, funcArgs)`` for every tile.
    funcArgs : dict
        second argument given to ``func``.
    smoothLength : int
        if bigger than zero, each tile value is the mean of the coverage
        in the range returned by ``getSmoothRange``.

    Returns
    -------
    str
        name of the temporary bedgraph file that was written.

    Raises
    ------
    NameError
        if ``start`` is bigger than ``end``.

    >>> test = Tester()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs, True, 0, False)
    >>> open(tempFile, 'r').readlines()
    ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test the file being writen for single end reads with
    no extension and no smoothing
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test scaling
    >>> funcArgs = {'scaleFactor': 3.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t3.0\n']
    >>> os.remove(tempFile)

    Test ignore duplicates
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, ignoreDuplicates=True)
    >>> open(tempFile, 'r').readlines()
    ['3R\t50\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test smoothing
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 20, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, smoothLength=60)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t120\t1.00\n', '3R\t120\t140\t1.67\n', '3R\t140\t160\t2.00\n', '3R\t160\t180\t2.33\n', '3R\t180\t200\t2.0\n']
    >>> os.remove(tempFile)

    Test ratio (needs two bam files)
    >>> funcArgs = {}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 50, 0,
    ... [test.bamFile1, test.bamFile2], ratio , funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t150\t1.00\n', '3R\t150\t200\t0.5\n']
    >>> os.remove(tempFile)

    Test minMapping quality
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, minMappingQuality=40)
    >>> open(tempFile, 'r').readlines()
    ['3R\t150\t200\t1.0\n']
    >>> os.remove(tempFile)
    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))

    coverage = []
    for bamFile in bamFilesList:
        bamHandle = openBam(bamFile)
        try:
            coverage.append(
                getCoverageOfRegion(
                    bamHandle, chrom, start, end, tileSize,
                    defaultFragmentLength, extendPairedEnds, zerosToNans,
                    ignoreDuplicates=ignoreDuplicates,
                    minMappingQuality=minMappingQuality,
                    fragmentFromRead_func=fragmentFromRead_func,
                    centerRead=centerRead,
                    samFlag=samFlag))
        finally:
            # release the handle even if the coverage computation fails
            bamHandle.close()

    lengthCoverage = len(coverage[0])
    previousValue = None

    with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
        tempFileName = _file.name
        for tileIndex in range(lengthCoverage):
            if smoothLength > 0:
                # the smoothing window only depends on the tile, not on
                # the bam file, so it is computed once per tile
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage = [np.mean(cov[vectorStart:vectorEnd])
                                for cov in coverage]
            else:
                tileCoverage = [cov[tileIndex] for cov in coverage]

            value = func(tileCoverage, funcArgs)

            if previousValue is None:
                # first tile: open the first run
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value
            elif previousValue == value:
                # same value as the open run: extend it
                writeEnd = min(writeEnd + tileSize, end)
            elif previousValue != value:
                # value changed (NaN != NaN also lands here): flush the
                # open run unless it is NaN and start a new one
                if not np.isnan(previousValue):
                    _file.write(
                        "{}\t{}\t{}\t{:.2f}\n".format(chrom, writeStart,
                                                      writeEnd,
                                                      previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

        # write remaining value if not a nan. ``is not None`` is used
        # instead of truthiness so that a trailing run of 0.0 is written
        # like any inner run of zeros.
        if previousValue is not None and writeStart != end and \
                not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                end, previousValue))

    return tempFileName
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some
    reads in order to compensate for the GC bias. **This is a stochastic
    method.**

    Every mapped read that starts after ``start`` gets the tags ``YG``
    (GC percentage of the fragment, -1 if unknown) and, when the GC
    content is known, ``YC`` (``1 / R_gc[gc]`` rounded to two decimals)
    and ``YN`` (number of copies of the read to write). Unless
    ``tag_but_not_change_number`` is set, the read is written ``YN``
    times (renamed with a ``_<n>`` suffix from the second copy on).

    Parameters
    ----------
    chrNameBam : chromosome name used to fetch reads from the bam file.
    chrNameBit : chromosome name given to ``getReadGCcontent`` and used
        in the progress messages.
    start, end : region to process.
    step : unused, kept for interface compatibility.
    tag_but_not_change_number : if True every read is written exactly
        once, only the tags are added.
    verbose : print progress and a summary of removed duplicates.

    Returns
    -------
    str
        name of the temporary BAM file.

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    # the list above is already restricted to mapped reads starting
    # after `start`, so the enumerate index is the index of the read
    # in `reads`
    for r_index, read in enumerate(reads):
        copies = None
        gc = None

        # check if a mate has already been procesed
        # to apply the same correction
        try:
            mateInfo = matePairs.pop(read.qname)
        except KeyError:
            # this happens when a mate is not present. This could
            # happen because of removal of the mate by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        else:
            copies = mateInfo['copies']
            gc = mateInfo['gc']

        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this
        # must be fixed!
        # It turns out that the "with_value_type" option only started
        # working in pysam-0.8.4, so we can't reliably add tags on earlier
        # versions without potentially creating BAM files that break
        # HTSJDK/IGV/etc.
        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # look at the type field of the FIRST tag. The previous
                # check (readTag[2]) looked at the third tag tuple, which
                # failed on reads with fewer than three tags and never
                # detected an integer type code.
                if isinstance(readTag[0][2], int):
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the correction of the forward mate so that the reverse
        # mate gets exactly the same number of copies
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(
                          multiprocessing.current_process().name,
                          i, i / (endTime - startTime),
                          chrNameBit, start, end))
        i += 1

    outfile.close()
    bam.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Every mapped read fetched from the region adds ``1 / R_gc[gc]`` to
    the positions covered by its fragment, where ``gc`` is the value
    returned by ``getReadGCcontent``. Reads without GC information,
    reads exceeding ``global_vars['max_dup_gc'][gc]`` repetitions of the
    same position/orientation/mate position and reads for which no
    fragment can be computed are skipped. The corrected coverage is
    averaged in bins of ``step`` bp and bins with a mean above zero are
    written.

    Parameters
    ----------
    chrNameBam : chromosome name used to fetch reads from the bam file.
    chrNameBit : chromosome name given to ``getReadGCcontent`` and
        written to the bedgraph.
    start, end : region to process.
    step : bin length in bp of the bedgraph.

    Returns
    -------
    str or None
        name of the temporary bedgraph file, or None if no read
        contributed to the coverage.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are asigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    for r_index, read in enumerate(reads):
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            # this exception happens when the end of a
            # chromosome is reached
            print(detail)
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(
                read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    if debug:
        endTime = time.time()
        # template and .format() call belong to the same print; they used
        # to be two statements, which printed the raw template only
        print("{}, processing {} ({:.1f} per sec) "
              "reads @ {}:{}-{}".format(
                  multiprocessing.current_process().name,
                  i, i / (endTime - startTime),
                  chrNameBit, start, end))

    if i == 0:
        return None

    # save in bedgraph format
    with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
        tempFileName = _file.name
        for binStart in range(0, len(cvg_corr), step):
            # binStart is an index into cvg_corr; slicing past the end of
            # the array is clamped, so the last bin may be shorter
            value = np.mean(cvg_corr[binStart:binStart + step])
            if value > 0:
                writeStart = start + binStart
                writeEnd = min(start + binStart + step, end)
                _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                    writeEnd, value))

    return tempFileName
def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args,
                         bed_regions_list=None):
    r"""Writes a bedgraph based on the read coverage found on bamFiles

    The given func is called to compute the desired bedgraph value
    using the funcArgs. Consecutive bins having the same value are
    merged into one bedgraph line; runs whose value is NaN are not
    written.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    func_to_call : callable
        function called as ``func_to_call(tileCoverage, func_args)`` to
        convert the list of coverages computed for each bam file at each
        position into a single value. An example is a function
        that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func_to_call`.
    bed_regions_list: list
        List of tuples of the form (chrom, start, end)
        corresponding to bed regions to be processed.
        Not used by this method.

    Notes
    -----
    The bin length and the smoothing distance are read from
    ``self.binLength`` and ``self.smoothLength``; the bam files from
    ``self.bamFilesList``.

    Returns
    -------
    temporary file with the bedgraph results for the region queried.

    Raises
    ------
    NameError
        if ``start`` is bigger than ``end``.

    Examples
    --------
    >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
    >>> bamFile1 = test_path + "testA.bam"
    >>> bin_length = 50
    >>> number_of_samples = 0 # overruled by step_size
    >>> func_to_call = scaleCoverage
    >>> funcArgs = {'scaleFactor': 1.0}

    >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
    >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)
    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))

    coverage = []
    for bamFile in self.bamFilesList:
        bam = bamHandler.openBam(bamFile)
        try:
            coverage.append(
                self.get_coverage_of_region(bam, chrom, start, end,
                                            self.binLength))
        finally:
            # release the handle even if the coverage computation fails
            bam.close()

    length_coverage = len(coverage[0])
    previous_value = None

    with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
        tempfilename = _file.name
        for tileIndex in range(length_coverage):
            if self.smoothLength > 0:
                # the smoothing window only depends on the bin, not on
                # the bam file, so it is computed once per bin
                vector_start, vector_end = self.getSmoothRange(
                    tileIndex, self.binLength, self.smoothLength,
                    length_coverage)
                tileCoverage = [np.mean(cov[vector_start:vector_end])
                                for cov in coverage]
            else:
                tileCoverage = [cov[tileIndex] for cov in coverage]

            value = func_to_call(tileCoverage, func_args)

            if previous_value is None:
                # first bin: open the first run
                writeStart = start + tileIndex * self.binLength
                writeEnd = min(writeStart + self.binLength, end)
                previous_value = value
            elif previous_value == value:
                # same value as the open run: extend it
                writeEnd = min(writeEnd + self.binLength, end)
            elif previous_value != value:
                # value changed (NaN != NaN also lands here): flush the
                # open run unless it is NaN and start a new one
                if not np.isnan(previous_value):
                    _file.write(
                        "{}\t{}\t{}\t{:.2f}\n".format(chrom, writeStart,
                                                      writeEnd,
                                                      previous_value))
                previous_value = value
                writeStart = writeEnd
                writeEnd = min(writeStart + self.binLength, end)

        # write remaining value if not a nan. ``is not None`` is used
        # instead of truthiness so that a trailing run of 0.0 is written
        # like any inner run of zeros.
        if previous_value is not None and writeStart != end and \
                not np.isnan(previous_value):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                end, previous_value))

    return tempfilename
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Every mapped read fetched from the region adds ``1 / R_gc[gc]`` to
    the positions covered by its fragment, where ``gc`` is the value
    returned by ``getReadGCcontent``. Reads without GC information,
    reads exceeding ``global_vars['max_dup_gc'][gc]`` repetitions of the
    same position/orientation/mate position and reads for which no
    fragment can be computed are skipped. The corrected coverage is
    averaged in bins of ``step`` bp and bins with a mean above zero are
    written.

    Parameters
    ----------
    chrNameBam : chromosome name used to fetch reads from the bam file.
    chrNameBit : chromosome name given to ``getReadGCcontent`` and
        written to the bedgraph.
    start, end : region to process.
    step : bin length in bp of the bedgraph.

    Returns
    -------
    str or None
        name of the temporary bedgraph file, or None if no read
        contributed to the coverage.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are asigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    # unmapped reads were already dropped when building `reads`, so the
    # enumerate index is the index of the read in `reads`
    for r_index, read in enumerate(reads):
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            # this exception happens when the end of a
            # chromosome is reached
            print(detail)
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(
                read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            # template and .format() call belong to the same print; they
            # used to be two statements, which printed the raw template
            print("{}, processing {} ({:.1f} per sec) "
                  "reads @ {}:{}-{}".format(
                      multiprocessing.current_process().name,
                      i, i / (endTime - startTime),
                      chrNameBit, start, end))
    except NameError:
        # `debug` is an optional module level flag
        pass

    if i == 0:
        return None

    # save in bedgraph format
    with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
        tempFileName = _file.name
        for binStart in range(0, len(cvg_corr), step):
            # binStart is an index into cvg_corr; slicing past the end of
            # the array is clamped, so the last bin may be shorter
            value = np.mean(cvg_corr[binStart:binStart + step])
            if value > 0:
                writeStart = start + binStart
                writeEnd = min(start + binStart + step, end)
                _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                    writeEnd, value))

    return tempFileName
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some
    reads in order to compensate for the GC bias. **This is a stochastic
    method.**

    Every mapped read that starts after ``start`` gets the tags ``YG``
    (GC percentage of the fragment, -1 if unknown) and, when the GC
    content is known, ``YC`` (``1 / R_gc[gc]`` rounded to two decimals)
    and ``YN`` (number of copies of the read to write). Unless
    ``tag_but_not_change_number`` is set, the read is written ``YN``
    times (renamed with a ``_<n>`` suffix from the second copy on).

    Parameters
    ----------
    chrNameBam : chromosome name used to fetch reads from the bam file.
    chrNameBit : chromosome name given to ``getReadGCcontent`` and used
        in the progress messages.
    start, end : region to process.
    step : unused, kept for interface compatibility.
    tag_but_not_change_number : if True every read is written exactly
        once, only the tags are added.
    verbose : print progress and a summary of removed duplicates.

    Returns
    -------
    str
        name of the temporary BAM file.

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    # the list above is already restricted to mapped reads starting
    # after `start`, so the enumerate index is the index of the read
    # in `reads`
    for r_index, read in enumerate(reads):
        copies = None
        gc = None

        # check if a mate has already been procesed
        # to apply the same correction
        try:
            mateInfo = matePairs.pop(read.qname)
        except KeyError:
            # this happens when a mate is not present. This could
            # happen because of removal of the mate by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        else:
            copies = mateInfo['copies']
            gc = mateInfo['gc']

        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this
        # must be fixed!
        # It turns out that the "with_value_type" option only started
        # working in pysam-0.8.4, so we can't reliably add tags on earlier
        # versions without potentially creating BAM files that break
        # HTSJDK/IGV/etc.
        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # look at the type field of the FIRST tag. The previous
                # check (readTag[2]) looked at the third tag tuple, which
                # failed on reads with fewer than three tags and never
                # detected an integer type code.
                if isinstance(readTag[0][2], int):
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the correction of the forward mate so that the reverse
        # mate gets exactly the same number of copies
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(
                          multiprocessing.current_process().name,
                          i, i / (endTime - startTime),
                          chrNameBit, start, end))
        i += 1

    outfile.close()
    bam.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
def main(args=None):
    """Correct the GC bias of a BAM file and write a BAM, bedgraph or
    bigwig file.

    The GC bias frequencies are loaded from
    ``args.GCbiasFrequenciesFile`` into the module level ``F_gc``,
    ``N_gc`` and ``R_gc`` arrays, the genome is divided in chunks and
    every chunk is processed by ``writeCorrectedSam_wrapper`` (output
    name ending in 'bam') or ``writeCorrected_wrapper`` (output name
    ending in 'bg' or 'bw'). The per-chunk temporary files are then
    merged into ``args.correctedFile`` and removed.

    Parameters
    ----------
    args : passed to ``process_args``.
    """
    global F_gc, N_gc, R_gc
    global global_vars

    args = process_args(args)

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print("{} {}".format(chrNameBitToBam, chrNameBamToBit))

    for chrom, size in chromSizes:
        if chrom not in chrNameBamToBit:
            # warn once per chromosome instead of once per chunk
            print("no sequence information for "
                  "chromosome {} in 2bit file".format(chrom))
            print("Reads in this chromosome will be skipped")
            continue
        for i in range(regionStart, size, chunkSize):
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))

    def run_workers(worker):
        # Run `worker` over all chunks, in a pool only when that pays off.
        # The pool is always shut down, also when a task fails.
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            pool = multiprocessing.Pool(args.numberOfProcessors)
            try:
                return pool.map_async(worker, mp_args).get(9999999)
            finally:
                pool.terminate()
                pool.join()
        # list() so that len() and repeated iteration work on Python 3
        return list(map(worker, mp_args))

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} "
                  "number of tasks".format(args.numberOfProcessors,
                                           len(mp_args)))
        res = run_workers(writeCorrectedSam_wrapper)

        if len(res) == 1:
            # copy the content without going through a shell, so that file
            # names with spaces or shell metacharacters are handled
            shutil.copyfile(res[0], args.correctedFile.name)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb",
                               template=header)
            header.close()
            for chunkName in res:
                chunk = pysam.Samfile(chunkName)
                for e in chunk.fetch(until_eof=True):
                    of.write(e)
                chunk.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith(('bg', 'bw')):
        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        res = run_workers(writeCorrected_wrapper)

        # concatenate intermediary bedgraph files
        with open(_temp_bg_file_name, 'w') as _temp_bg_file:
            for tempFileName in res:
                # workers return None for chunks without reads
                if tempFileName:
                    # both files are opened in text mode so that the copy
                    # also works on Python 3
                    with open(tempFileName, 'r') as chunk:
                        shutil.copyfileobj(chunk, _temp_bg_file)
                    os.remove(tempFileName)
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)
        else:
            chromSizes = [(x, bit[x].size) for x in bit.keys()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            # os.remove() needs the path, not the (closed) file object
            os.remove(_temp_bg_file_name)
def filterWorker(arglist):
    """Filter the reads of one region of ``args.bam`` into temporary BAMs.

    ``arglist`` is ``(chrom, start, end, args, chromDict)``. Reads
    starting before ``start`` are ignored. Every other read is counted
    and tested, in this order, against: the unmapped flag, the minimum
    mapping quality, the SAM flags to include/exclude, the minimum and
    maximum fragment length, duplicate removal and the RNA strand
    filter. Rejected reads are counted and, if ``args.filteredOutReads``
    is set, written to a second temporary BAM. Surviving reads are
    optionally shifted with ``shiftRead`` and written to the first
    temporary BAM.

    Returns ``(tid, start, total, nFiltered, oname, onameFiltered)``
    where ``onameFiltered`` is None when rejected reads are not kept.
    """
    chrom, start, end, args, chromDict = arglist
    bamIn = openBam(args.bam)
    writeMode = 'wbu'

    oname = getTempFileName(suffix='.bam')
    onameFiltered = None
    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix='.bam')

    ofh = pysam.AlignmentFile(oname, mode=writeMode, template=bamIn)
    ofiltered = None
    if onameFiltered:
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=writeMode,
                                        template=bamIn)

    # state of the duplicate detection: fragments already seen at the
    # current start position and that position itself
    seenFragments = set()
    lastStart = [None]

    def is_rejected(aln):
        # True if any of the enabled filters discards the read
        flag = aln.flag
        if flag & 4:
            # Ignore unmapped reads, they were counted already
            return True
        if args.minMappingQuality and aln.mapq < args.minMappingQuality:
            return True
        if args.samFlagInclude and \
                flag & args.samFlagInclude != args.samFlagInclude:
            return True
        if args.samFlagExclude and flag & args.samFlagExclude != 0:
            return True

        tLen = getTLen(aln)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            return True
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            return True

        if args.ignoreDuplicates:
            # Assuming more or less concordant reads, use the fragment
            # bounds, otherwise the start positions
            if tLen >= 0:
                fragStart = aln.pos
                fragEnd = fragStart + tLen
            else:
                fragStart = aln.pnext
                fragEnd = fragStart - tLen
            if aln.reference_id != aln.next_reference_id:
                fragEnd = aln.pnext
            fragment = (fragStart, fragEnd, aln.next_reference_id,
                        aln.is_reverse)
            if lastStart[0] is not None \
                    and lastStart[0] == aln.reference_start \
                    and fragment in seenFragments:
                return True
            if lastStart[0] != aln.reference_start:
                seenFragments.clear()
            lastStart[0] = aln.reference_start
            seenFragments.add(fragment)

        # filterRNAstrand
        strand = args.filterRNAstrand
        if strand:
            if aln.is_paired:
                if strand == 'forward':
                    return not (flag & 144 == 128 or flag & 96 == 64)
                if strand == 'reverse':
                    return not (flag & 144 == 144 or flag & 96 == 96)
            else:
                if strand == 'forward':
                    return flag & 16 != 16
                if strand == 'reverse':
                    return flag & 16 != 0
        return False

    nFiltered = 0
    total = 0
    for read in bamIn.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case
            # distanceBetweenBins == 0)
            continue
        total += 1

        if is_rejected(read):
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.shift:
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = bamIn.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    bamIn.close()

    return tid, start, total, nFiltered, oname, onameFiltered
def main(args=None):
    """Correct the GC bias of a BAM file and write a BAM, bedgraph or
    bigwig file.

    The GC bias frequencies are loaded from
    ``args.GCbiasFrequenciesFile`` into the module level ``F_gc``,
    ``N_gc`` and ``R_gc`` arrays, the genome is divided in chunks and
    every chunk is processed by ``writeCorrectedSam_wrapper`` (output
    name ending in 'bam') or ``writeCorrected_wrapper`` (output name
    ending in 'bg' or 'bw'). The per-chunk temporary files are then
    merged into ``args.correctedFile`` and removed.

    Parameters
    ----------
    args : passed to ``process_args``.
    """
    global F_gc, N_gc, R_gc
    global global_vars

    args = process_args(args)

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [
        binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
        if F_gc[x] > 0 and N_gc[x] > 0 else 1 for x in range(len(F_gc))
    ]

    global_vars['max_dup_gc'] = max_dup_gc

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print("{} {}".format(chrNameBitToBam, chrNameBamToBit))

    for chrom, size in chromSizes:
        if chrom not in chrNameBamToBit:
            # warn once per chromosome instead of once per chunk
            print("no sequence information for "
                  "chromosome {} in 2bit file".format(chrom))
            print("Reads in this chromosome will be skipped")
            continue
        for i in range(regionStart, size, chunkSize):
            length = min(size, i + chunkSize)
            mp_args.append(
                (chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))

    def run_workers(worker):
        # Run `worker` over all chunks, in a pool only when that pays off.
        # The pool is always shut down, also when a task fails.
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            pool = multiprocessing.Pool(args.numberOfProcessors)
            try:
                return pool.map_async(worker, mp_args).get(9999999)
            finally:
                pool.terminate()
                pool.join()
        # list() so that len() and repeated iteration work on Python 3
        return list(map(worker, mp_args))

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} "
                  "number of tasks".format(args.numberOfProcessors,
                                           len(mp_args)))
        res = run_workers(writeCorrectedSam_wrapper)

        if len(res) == 1:
            # copy the content without going through a shell, so that file
            # names with spaces or shell metacharacters are handled
            shutil.copyfile(res[0], args.correctedFile.name)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb",
                               template=header)
            header.close()
            for chunkName in res:
                chunk = pysam.Samfile(chunkName)
                for e in chunk.fetch(until_eof=True):
                    of.write(e)
                chunk.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith(('bg', 'bw')):
        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        res = run_workers(writeCorrected_wrapper)

        # concatenate intermediary bedgraph files
        with open(_temp_bg_file_name, 'w') as _temp_bg_file:
            for tempFileName in res:
                # workers return None for chunks without reads
                if tempFileName:
                    # both files are opened in text mode so that the copy
                    # also works on Python 3
                    with open(tempFileName, 'r') as chunk:
                        shutil.copyfileobj(chunk, _temp_bg_file)
                    os.remove(tempFileName)
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)
        else:
            chromSizes = [(x, bit[x].size) for x in bit.keys()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            # os.remove() needs the path, not the (closed) file object
            os.remove(_temp_bg_file_name)
def writeBedGraph_worker(chrom, start, end, tileSize, defaultFragmentLength,
                         bamFilesList, func, funcArgs, extendPairedEnds=True,
                         smoothLength=0, zerosToNans=True,
                         minMappingQuality=None, ignoreDuplicates=False,
                         fragmentFromRead_func=None, centerRead=False):
    r"""
    Writes a bedgraph having as base a number of bam files.

    The coverage of every bam file in ``bamFilesList`` is computed with
    ``getCoverageOfRegion`` for the region ``chrom:start-end`` in tiles of
    ``tileSize``. For each tile the list holding one coverage value per bam
    file is passed to ``func(tileCoverage, funcArgs)`` and the returned value
    is what gets written. Consecutive tiles with the same value are merged
    into a single bedgraph line; runs whose value is nan are not written.

    ``defaultFragmentLength``, ``extendPairedEnds``, ``zerosToNans``,
    ``minMappingQuality``, ``ignoreDuplicates``, ``fragmentFromRead_func`` and
    ``centerRead`` are forwarded unchanged to ``getCoverageOfRegion``.
    If ``smoothLength`` is greater than zero, the per-file value of a tile is
    the mean of the coverage over the window given by ``getSmoothRange``.

    Returns the name of the temporary bedgraph file that was written. The
    caller is responsible for removing it.

    Raises NameError if ``start`` is bigger than ``end``.

    Note that intermediate lines are written with two decimals while the
    last line of the region is written with one decimal (see the examples).

    >>> test = Tester()
    >>> funcArgs = {'scaleFactor': 1.0}

    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs, True, 0, False)
    >>> open(tempFile, 'r').readlines()
    ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test the file being writen for single end reads with
    no extension and no smoothing
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test scaling
    >>> funcArgs = {'scaleFactor': 3.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile1], scaleCoverage, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t200\t3.0\n']
    >>> os.remove(tempFile)

    Test ignore duplicates
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, ignoreDuplicates=True)
    >>> open(tempFile, 'r').readlines()
    ['3R\t50\t200\t1.0\n']
    >>> os.remove(tempFile)

    Test smoothing
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 20, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, smoothLength=60)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t120\t1.00\n', '3R\t120\t140\t1.67\n', '3R\t140\t160\t2.00\n', '3R\t160\t180\t2.33\n', '3R\t180\t200\t2.0\n']
    >>> os.remove(tempFile)

    Test ratio (needs two bam files)
    >>> funcArgs = {}
    >>> tempFile = writeBedGraph_worker( '3R', 100, 200, 50, 0,
    ... [test.bamFile1, test.bamFile2], ratio , funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t100\t150\t1.00\n', '3R\t150\t200\t0.5\n']
    >>> os.remove(tempFile)

    Test minMapping quality
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> tempFile = writeBedGraph_worker( '3R', 0, 200, 50, 0,
    ... [test.bamFile2], scaleCoverage, funcArgs, minMappingQuality=40)
    >>> open(tempFile, 'r').readlines()
    ['3R\t150\t200\t1.0\n']
    >>> os.remove(tempFile)
    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))

    coverage = []
    for bamFile in bamFilesList:
        bamHandle = openBam(bamFile)
        # close the handle even when the coverage computation fails
        try:
            coverage.append(
                getCoverageOfRegion(
                    bamHandle, chrom, start, end, tileSize,
                    defaultFragmentLength, extendPairedEnds, zerosToNans,
                    ignoreDuplicates=ignoreDuplicates,
                    minMappingQuality=minMappingQuality,
                    fragmentFromRead_func=fragmentFromRead_func,
                    centerRead=centerRead))
        finally:
            bamHandle.close()

    # computed before the output file is opened so that a failure here
    # does not leave an open file behind
    lengthCoverage = len(coverage[0])

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    tempFileName = _file.name
    try:
        previousValue = None
        for tileIndex in xrange(lengthCoverage):
            if smoothLength > 0:
                # the smoothing window only depends on the tile, not on
                # the bam file, so it is computed once per tile
                vectorStart, vectorEnd = getSmoothRange(
                    tileIndex, tileSize, smoothLength, lengthCoverage)
                tileCoverage = [np.mean(bamCoverage[vectorStart:vectorEnd])
                                for bamCoverage in coverage]
            else:
                tileCoverage = [bamCoverage[tileIndex]
                                for bamCoverage in coverage]

            value = func(tileCoverage, funcArgs)

            if previousValue is None:
                # first tile: start the first run
                writeStart = start + tileIndex * tileSize
                writeEnd = min(writeStart + tileSize, end)
                previousValue = value
            elif previousValue == value:
                # same value as the current run: extend it
                writeEnd = min(writeEnd + tileSize, end)
            else:
                # value changed (nan never compares equal, so every nan
                # tile ends up here): flush the finished run, start a new one
                if not np.isnan(previousValue):
                    _file.write("{}\t{}\t{}\t{:.2f}\n".format(
                        chrom, writeStart, writeEnd, previousValue))
                previousValue = value
                writeStart = writeEnd
                writeEnd = min(writeStart + tileSize, end)

        # write remaining value if not a nan. The comparison with None is
        # explicit because 0.0 is a legitimate value for the last run and
        # must not be mistaken for "nothing pending".
        if previousValue is not None and writeStart != end \
                and not np.isnan(previousValue):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                  end, previousValue))
    finally:
        _file.close()

    return tempFileName
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a SAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    Reads are fetched from the bam file ``global_vars['bam']`` for the region
    ``chrNameBam:start-end``; the GC content of each read is obtained with
    ``getReadGCcontent`` using the 2bit file ``global_vars['2bit']`` and the
    chromosome name ``chrNameBit``. The number of copies of a read that is
    written is given by ``numCopiesOfRead(1 / R_gc[gc])``. The second mate of
    a proper pair reuses the values computed for the first (forward) mate.
    Reads at the same position, orientation and mate position as the previous
    read are dropped once ``global_vars['max_dup_gc'][gc]`` repetitions are
    reached.

    Every written read gets a 'GC' tag (GC percentage, or -1 when no GC
    content is available) and, when GC content is available, a 'CO' tag
    (rounded correction factor) and a 'CP' tag (number of copies).

    ``step`` is accepted but not used by this function.
    If ``tag_but_not_change_number`` is True every read is written exactly
    once (tagged only). If ``verbose`` is True progress is printed.

    Returns the name of a temporary bam file (the intermediate SAM file is
    converted with samtools and then removed).

    First, check if samtools can be executed, otherwise the test will fail
    >>> resp = cfg.checkProgram(samtools, 'view', '')

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ...                tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ...     writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ...     tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    twoBitHandle = open(global_vars['2bit'])
    bam = None
    outfile = None
    try:
        tbit = twobit.TwoBitFile(twoBitHandle)
        bam = pysam.Samfile(global_vars['bam'])
        tempFileName = utilities.getTempFileName(suffix='.sam')

        outfile = pysam.Samfile(tempFileName, 'wh', template=bam)
        startTime = time.time()
        matePairs = {}
        read_repetitions = 0
        removed_duplicated_reads = 0

        # cache data
        # r.flag & 4 == 0 is to filter unmapped reads that
        # have a genomic position
        # NOTE(review): reads whose position equals `start` are excluded
        # by the strict comparison -- confirm this is intended.
        reads = [r for r in bam.fetch(chrNameBam, start, end)
                 if r.pos > start and r.flag & 4 == 0]

        for r_index, read in enumerate(reads):
            # check if a mate has already been processed
            # to apply the same correction
            mate = matePairs.pop(read.qname, None)
            if mate is not None:
                copies = mate['copies']
                gc = mate['gc']
            else:
                # no mate was recorded for this read. This could
                # happen because of removal of the mate
                # by some filtering
                gc = getReadGCcontent(tbit, read, fragmentLength,
                                      chrNameBit)
                if gc:
                    copies = numCopiesOfRead(float(1) / R_gc[gc])
                else:
                    copies = 1

            # is this read in the same orientation and position
            # as the previous?
            same_as_previous = False
            if gc and r_index > 0:
                previous = reads[r_index - 1]
                same_as_previous = (
                    read.pos == previous.pos and
                    read.is_reverse == previous.is_reverse and
                    read.pnext == previous.pnext)

            if same_as_previous:
                read_repetitions += 1
                if read_repetitions >= global_vars['max_dup_gc'][gc]:
                    # in other words do not take into account this read
                    copies = 0
                    removed_duplicated_reads += 1
            else:
                read_repetitions = 0

            readName = read.qname
            readTag = read.tags
            if gc:
                GC = int(100 * np.round(float(gc) / fragmentLength,
                                        decimals=2))
                readTag.append(
                    ('CO', float(round(float(1) / R_gc[gc], 2))))
                readTag.append(('CP', copies))
            else:
                GC = -1

            readTag.append(('GC', GC))
            read.tags = readTag

            # remember the correction of the forward mate of a proper
            # pair so that the other mate gets the same treatment
            if read.is_paired and read.is_proper_pair \
                    and not read.mate_is_unmapped \
                    and not read.is_reverse:
                matePairs[readName] = {'copies': copies,
                                       'gc': gc}

            if tag_but_not_change_number:
                outfile.write(read)
                continue

            for numCop in range(1, copies + 1):
                # the read has to be renamed such that newly
                # formed pairs will match
                if numCop > 1:
                    read.qname = readName + "_%d" % (numCop)
                outfile.write(read)

            if verbose:
                if i % 500000 == 0 and i > 0:
                    endTime = time.time()
                    print("{}, processing {} ({:.1f} per sec) reads "
                          "@ {}:{}-{}".format(
                              multiprocessing.current_process().name,
                              i, i / (endTime - startTime),
                              chrNameBit, start, end))
            i += 1
    finally:
        # release the handles in reverse order of acquisition, also when
        # an error interrupts the processing. The SAM file must be closed
        # before samtools reads it below.
        if outfile is not None:
            outfile.close()
        if bam is not None:
            bam.close()
        twoBitHandle.close()

    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(
                  multiprocessing.current_process().name,
                  i, i / (endTime - startTime),
                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    # convert sam to bam.
    command = '{0} view -bS {1} 2> /dev/null > {1}.bam'.format(
        samtools, tempFileName)
    if verbose:
        sys.stderr.write("running {}\n".format(command))

    run_shell_command(command)
    os.remove(tempFileName)
    return tempFileName + ".bam"
def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args,
                         smooth_length=0, bed_regions_list=None):
    r"""Writes a bedgraph based on the read coverage found on bamFiles

    The given func is called to compute the desired bedgraph value
    using the funcArgs. Consecutive tiles with the same value are merged
    into one bedgraph line and runs whose value is nan are not written.
    Intermediate lines are written with two decimals, the last line of
    the region with one decimal (see the example).

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    func_to_call : callable
        function called as ``func_to_call(tileCoverage, func_args)`` to
        convert the list of coverages computed for each bam file at each
        position into a single value. An example is a function
        that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func_to_call`.
    smooth_length : int
        Distance in bp for smoothing the coverage per tile. Smoothing is
        only applied when the value is greater than zero.
    bed_regions_list: list
        List of tuples of the form (chrom, start, end)
        corresponding to bed regions to be processed.
        If not bed file was passed to the object constructor
        then this list is empty. The value is not used by this method.

    Returns
    -------
    str
        name of the temporary file with the bedgraph results for the
        region queried.

    Raises
    ------
    NameError
        if `start` is bigger than `end`.

    Example
    -------
    >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
    >>> bamFile1 = test_path + "testA.bam"
    >>> bin_length = 50
    >>> number_of_samples = 0 # overruled by step_size
    >>> func_to_call = scaleCoverage
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
    >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
    >>> open(tempFile, 'r').readlines()
    ['3R\t0\t100\t0.00\n', '3R\t100\t200\t1.0\n']
    >>> os.remove(tempFile)
    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))

    coverage = []
    for bam_file in self.bamFilesList:
        bam = bamHandler.openBam(bam_file)
        # close the handle even when the coverage computation fails
        try:
            coverage.append(
                self.get_coverage_of_region(bam, chrom, start, end,
                                            self.binLength))
        finally:
            bam.close()

    # computed before the output file is opened so that a failure here
    # does not leave an open file behind
    length_coverage = len(coverage[0])

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    tempfilename = _file.name
    try:
        previous_value = None
        for tileIndex in xrange(length_coverage):
            if smooth_length > 0:
                # the smoothing window only depends on the tile, not on
                # the bam file, so it is computed once per tile
                vector_start, vector_end = self.getSmoothRange(
                    tileIndex, self.binLength, smooth_length,
                    length_coverage)
                tileCoverage = [
                    np.mean(bam_coverage[vector_start:vector_end])
                    for bam_coverage in coverage]
            else:
                tileCoverage = [bam_coverage[tileIndex]
                                for bam_coverage in coverage]

            value = func_to_call(tileCoverage, func_args)

            if previous_value is None:
                # first tile: start the first run
                writeStart = start + tileIndex * self.binLength
                writeEnd = min(writeStart + self.binLength, end)
                previous_value = value
            elif previous_value == value:
                # same value as the current run: extend it
                writeEnd = min(writeEnd + self.binLength, end)
            else:
                # value changed (nan never compares equal, so every nan
                # tile ends up here): flush the finished run, start a new one
                if not np.isnan(previous_value):
                    _file.write("{}\t{}\t{}\t{:.2f}\n".format(
                        chrom, writeStart, writeEnd, previous_value))
                previous_value = value
                writeStart = writeEnd
                writeEnd = min(writeStart + self.binLength, end)

        # write remaining value if not a nan. The comparison with None is
        # explicit because 0.0 is a legitimate value for the last run and
        # must not be mistaken for "nothing pending".
        if previous_value is not None and writeStart != end \
                and not np.isnan(previous_value):
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrom, writeStart,
                                                  end, previous_value))
    finally:
        _file.close()

    return tempfilename
def writeBedGraph_worker(self, chrom, start, end, func_to_call, func_args,
                         bed_regions_list=None):
    r"""Writes a bedgraph based on the read coverage found on bamFiles

    The given func is called to compute the desired bedgraph value
    using the funcArgs. Consecutive tiles with the same value are merged
    into one bedgraph line and runs whose value is nan are not written.

    Smoothing is controlled by ``self.smoothLength`` (applied only when it
    is not None and greater than zero). When ``self.skipZeroOverZero`` is
    set, tiles whose coverages sum to zero are left out of the output;
    the run that precedes such a tile ends at that tile.

    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    func_to_call : callable
        function called as ``func_to_call(tileCoverage, func_args)`` to
        convert the coverages computed for each bam file at each
        position into a single value. An example is a function
        that takes the ratio between the coverage of two
        bam files.
    func_args : dict
        dict of arguments to pass to `func_to_call`.
    bed_regions_list: list
        List of tuples of the form (chrom, start, end)
        corresponding to bed regions to be processed.
        If not bed file was passed to the object constructor
        then this list is empty. The value is not used by this method.

    Returns
    -------
    tuple
        (chromosome, start, end, temporary file), where the temporary
        file contains the bedgraph results for the region queried.

    Raises
    ------
    NameError
        if `start` is bigger than `end`.

    Examples
    --------
    >>> test_path = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
    >>> bamFile1 = test_path + "testA.bam"
    >>> bin_length = 50
    >>> number_of_samples = 0 # overruled by step_size
    >>> func_to_call = scaleCoverage
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> c = WriteBedGraph([bamFile1], bin_length, number_of_samples, stepSize=50)
    >>> tempFile = c.writeBedGraph_worker( '3R', 0, 200, func_to_call, funcArgs)
    >>> f = open(tempFile[3], 'r')
    >>> f.readlines()
    ['3R\t0\t100\t0\n', '3R\t100\t200\t1\n']
    >>> f.close()
    >>> os.remove(tempFile[3])
    """
    if start > end:
        raise NameError("start position ({0}) bigger "
                        "than end position ({1})".format(start, end))

    coverage, _ = self.count_reads_in_region(chrom, start, end)

    line_string = "{}\t{}\t{}\t{:g}\n"
    num_tiles = coverage.shape[0]
    smooth = self.smoothLength is not None and self.smoothLength > 0

    # the with block guarantees the file is closed if func_to_call raises
    with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
        tempfilename = _file.name
        previous_value = None

        for tileIndex in range(num_tiles):
            if smooth:
                vector_start, vector_end = self.getSmoothRange(
                    tileIndex, self.binLength, self.smoothLength, num_tiles)
                tileCoverage = np.mean(
                    coverage[vector_start:vector_end, :], axis=0)
            else:
                tileCoverage = coverage[tileIndex, :]

            if self.skipZeroOverZero and np.sum(tileCoverage) == 0:
                # The skipped tile interrupts the pending run: flush it
                # and forget it. Otherwise the next tile would extend (or
                # chain from) a writeEnd that lies before the skipped
                # tile and every following coordinate would be shifted.
                if previous_value is not None:
                    if not np.isnan(previous_value):
                        _file.write(
                            line_string.format(chrom, writeStart,
                                               writeEnd, previous_value))
                    previous_value = None
                continue

            value = func_to_call(tileCoverage, func_args)

            if previous_value is None:
                # no pending run (first tile, or first tile after a
                # skipped one): anchor the run at the tile's own position
                writeStart = start + tileIndex * self.binLength
                writeEnd = min(writeStart + self.binLength, end)
                previous_value = value
            elif previous_value == value:
                # same value as the current run: extend it
                writeEnd = min(writeEnd + self.binLength, end)
            else:
                # value changed (nan never compares equal, so every nan
                # tile ends up here): flush the finished run, start a new one
                if not np.isnan(previous_value):
                    _file.write(
                        line_string.format(chrom, writeStart,
                                           writeEnd, previous_value))
                previous_value = value
                writeStart = writeEnd
                writeEnd = min(writeStart + self.binLength, end)

        # write remaining value if not a nan; a run is only pending here
        # when the last tile was not skipped
        if previous_value is not None and writeStart != end \
                and not np.isnan(previous_value):
            _file.write(line_string.format(chrom, writeStart,
                                           end, previous_value))

    return chrom, start, end, tempfilename