def __init__(self):
    """Build the test fixture paths and initialise the shared module globals.

    Resolves the ``test/test_corrGC`` data files relative to this module,
    reads the samtools location from the configuration, and (re)sets the
    module-level ``debug`` flag and ``global_vars`` dictionary.
    """
    import os

    self.root = os.path.dirname(
        os.path.abspath(__file__)) + "/test/test_corrGC/"
    self.tbitFile = self.root + "sequence.2bit"
    self.bamFile = self.root + "test.bam"
    self.mappability = self.root + "mappability.bw"
    self.chrNameBam = '2L'
    self.chrNameBit = 'chr2L'
    self.samtools = cfg.config.get('external_tools', 'samtools')

    bam = bamHandler.openBam(self.bamFile)
    # 2bit is a binary format: open it in binary mode and release the
    # handle as soon as the sequence sizes have been read.
    with open(self.tbitFile, 'rb') as tbit_handle:
        bit = twobit.TwoBitFile(tbit_handle)
        genome_size = sum(bit[x].size for x in bit.index)

    global debug
    debug = 0
    global global_vars
    global_vars = {
        '2bit': self.tbitFile,
        'bam': self.bamFile,
        'filter_out': None,
        'mappability': self.mappability,
        'extra_sampling_file': None,
        'max_reads': 5,
        'min_reads': 0,
        'reads_per_bp': 0.3,
        'total_reads': bam.mapped,
        'genome_size': genome_size,
    }
def chunk_scaffolds(genome, size=10000000):
    """Split a many-scaffold 2bit genome into temporary FASTA chunks.

    Scaffolds are appended to a temporary ``.fasta`` file until more than
    `size` bases have been written; that file is then closed and the next
    scaffold starts a new one.  The last, partially filled chunk is returned
    as well, so every scaffold ends up in exactly one chunk.

    :param genome: path to the 2bit file to split
    :param size: approximate chunk size in bp (a chunk is closed once it
        holds more than this many bases)
    :return: list of the chunk file names, each with ``[multiple]`` appended
    """
    print('\tChunking files into {0} bp...'.format(size))
    chromos = []
    temp = None
    out = None
    length = 0
    with open(genome, 'rb') as genome_handle:
        tb = twobit.TwoBitFile(genome_handle)
        for seq in tb.keys():
            if temp is None:
                # open chunk files lazily so that no empty temp file is left
                # behind when the previous chunk consumed the last scaffold
                fd, out = tempfile.mkstemp(suffix='.fasta')
                os.close(fd)
                temp = open(out, 'w')
                length = 0
            sequence = tb[seq][0:]
            length += len(sequence)
            # write it to the outfile
            temp.write('>{0}\n{1}\n'.format(seq, sequence))
            if length > size:
                temp.close()
                temp = None
                # put tempfile name on stack
                chromos.append(out + '[multiple]')
    if temp is not None:
        # flush the final chunk that never exceeded `size`
        temp.close()
        chromos.append(out + '[multiple]')
    return chromos
def countReadsPerGC_worker(chromNameBam, start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """Count reads and GC content for sampled regions of a genome interval.

    For every position returned by ``getPositionsToSample`` for
    (start, end), the GC content of the ``regionSize`` bases starting there
    is computed and the reads overlapping that region are counted.

    :param chromNameBam: chromosome name as used in the BAM file
    :param start: start of the interval to sample
    :param end: end of the interval to sample
    :param stepSize: distance between sampled positions
    :param regionSize: length of each evaluated region
    :param chrNameBamToBit: mapping from BAM to 2bit chromosome names
    :param verbose: print the region and the error when GC content cannot
        be computed
    :return: list of ``(number_of_reads, gc)`` tuples
    """
    chromNameBit = chrNameBamToBit[chromNameBam]
    sub_reads_per_gc = []

    # 2bit is binary data; the handle is closed once sampling is finished
    with open(global_vars['2bit'], 'rb') as tbit_handle:
        tbit = twobit.TwoBitFile(tbit_handle)
        bam = bamHandler.openBam(global_vars['bam'])
        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)

        for i in positions_to_sample:
            # stop if region extends over the chromosome end
            if tbit[chromNameBit].size < i + regionSize:
                break

            try:
                gc = getGC_content(tbit[chromNameBit].get(i, i + regionSize))
            except Exception as detail:
                # regions whose GC content cannot be computed are skipped
                if verbose:
                    print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                    print(detail)
                continue

            numberReads = bam.count(chromNameBam, i, i + regionSize)
            sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
def main():
    """Align the query against every genome listed in the configuration.

    Genomes in the ``chromos`` section are processed one 2bit sequence at a
    time; genomes in the ``scaffolds`` section are first split into
    temporary chunks with ``chunk_scaffolds``.  Work is spread over a
    process pool when more than one process is requested.
    """
    args = get_args()
    pool = Pool(args.nprocs) if args.nprocs > 1 else None

    def run_alignments(work):
        # same dispatch rule as before: pool only when nprocs > 1
        if pool is not None:
            return pool.map(align_query_to_genomes, work)
        return [align_query_to_genomes(item) for item in work]

    try:
        # get and print start time
        begin_run = start()
        conf = ConfigParser.ConfigParser()
        conf.read(args.conf)
        params = (args.query, args.coverage, args.identity)
        # get align types ("Chromos"/"Scaffolds")
        if conf.has_section("chromos"):
            for name, f in conf.items("chromos"):
                print("{0}\nWorking on {1}\n{0}\n".format("=" * 30, name))
                # only the sequence names are needed, so the 2bit handle can
                # be released immediately
                with open(f, 'rb') as twobit_handle:
                    sequence_names = list(
                        twobit.TwoBitFile(twobit_handle).keys())
                chromos = [os.path.join(f, chromo)
                           for chromo in sequence_names]
                work = [(chromo, params) for chromo in chromos]
                results = run_alignments(work)
                save_results_and_cleanup(args.output, name, results)
        if conf.has_section("scaffolds"):
            for name, f in conf.items("scaffolds"):
                print("{0}\nWorking on {1}\n{0}\n".format("=" * 30, name))
                chunks = chunk_scaffolds(f)
                work = [(chunk, params) for chunk in chunks]
                results = run_alignments(work)
                save_results_and_cleanup(args.output, name, results, chunks)
        # get and print end time
        stop(begin_run)
    finally:
        if pool is not None:
            # pool.map is synchronous, so no work is outstanding here;
            # terminate() mirrors what the Pool context manager does on exit
            pool.terminate()
            pool.join()
def __init__(self):
    """Build the test fixture paths and initialise the shared module globals.

    Resolves the ``test/test_corrGC`` data files relative to this module and
    (re)sets the module-level ``debug`` flag and ``global_vars`` dictionary
    from the test BAM and 2bit files.
    """
    import os

    self.root = os.path.dirname(
        os.path.abspath(__file__)) + "/test/test_corrGC/"
    self.tbitFile = self.root + "sequence.2bit"
    self.bamFile = self.root + "test.bam"
    self.chrNameBam = '2L'
    self.chrNameBit = 'chr2L'

    # only the mapped-read total is needed from the BAM file
    bam = pysam.Samfile(self.bamFile)
    total_reads = bam.mapped
    bam.close()

    # 2bit is a binary format: open it in binary mode and release the
    # handle as soon as the sequence sizes have been read.
    with open(self.tbitFile, 'rb') as tbit_handle:
        bit = twobit.TwoBitFile(tbit_handle)
        genome_size = sum(bit[x].size for x in bit.index)

    global debug
    debug = 0
    global global_vars
    global_vars = {
        '2bit': self.tbitFile,
        'bam': self.bamFile,
        'filter_out': None,
        'extra_sampling_file': None,
        'max_reads': 5,
        'min_reads': 0,
        'reads_per_bp': 0.3,
        'total_reads': total_reads,
        'genome_size': genome_size,
    }
def test_3_deletions(self):
    """Convert BEDPE breakends that form a deletion.
    """
    # keep the 2bit handle open while the breakends are pulled, and close it
    # deterministically afterwards
    with open(self.genome_file, "rb") as genome_handle:
        genome_2bit = twobit.TwoBitFile(genome_handle)
        parts = _get_vcf_breakends(self.in_file, genome_2bit,
                                   {"max_single_size": 5000})
        # next() works on both Python 2 and Python 3 iterators
        deletion = next(parts)
        assert deletion.alt == "<DEL>", deletion
        assert "SVLEN=-4348" in deletion.info
def test_2_vcf_parts(self):
    """Convert BEDPE input line into VCF output parts.
    """
    # keep the 2bit handle open for all conversions, then close it
    with open(self.genome_file, "rb") as genome_handle:
        genome_2bit = twobit.TwoBitFile(genome_handle)
        breakends = hydra_parser(self.in_file)

        # next() works on both Python 2 and Python 3 iterators
        brend1, brend2 = build_vcf_parts(next(breakends), genome_2bit)
        assert brend1.alt == "G]chr22:10112]"
        assert brend2.alt == "C]chr22:9764]"
        assert brend2.info == "SVTYPE=BND;MATEID=hydra1a;IMPRECISE;CIPOS=0,102", brend2.info

        brend1, brend2 = build_vcf_parts(next(breakends), genome_2bit)
        assert brend1.alt == "A[chr22:12112["
        assert brend2.alt == "]chr22:7764]G"

        brend1, brend2 = build_vcf_parts(next(breakends), genome_2bit)
        assert brend1.alt == "[chr22:11112[A"
        assert brend2.alt == "[chr22:8764[T"

        brend1, brend2 = build_vcf_parts(next(breakends), genome_2bit)
        assert brend1.alt == "]chr22:13112]G", brend1.alt
        assert brend2.alt == "A[chr22:9764[", brend2.alt
def main():
    """Write one FASTA file per configured genome from its lastz matches.

    For every ``(name, twobit_name)`` pair obtained from the configuration,
    each row of ``<lastz dir>/<name>.lastz`` is sliced out of the 2bit genome
    (with ``args.flank``) and written to ``<output dir>/<name>.fasta``.
    """
    args = get_args()
    conf = ConfigParser.ConfigParser()
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf)
    for name, twobit_name in all_files:
        out_file = os.path.join(args.output, name) + ".fasta"
        lz = os.path.join(args.lastz, name) + ".lastz"
        out = fasta.FastaWriter(out_file)
        try:
            # the 2bit handle is only needed while slicing sequences
            with open(twobit_name, 'rb') as twobit_handle:
                tb = twobit.TwoBitFile(twobit_handle)
                count = 0
                for row in lastz.Reader(lz, long_format=True):
                    sequence = slice_and_return_fasta(tb, row, args.flank)
                    out.write(sequence)
                    count += 1
            print("\t{} sequences written to {}".format(count, out_file))
        finally:
            # close the writer even when reading or slicing fails
            out.close()
def main():
    """Write one FASTA file per configured genome from its lastz matches.

    Genomes whose short name is listed in ``args.exclude`` are skipped.  For
    the others, each row of ``<lastz dir>/<long_name>`` is sliced out of the
    2bit genome (with ``args.flank``) and written to
    ``<output dir>/<short_name>.fasta``.
    """
    args = get_args()
    conf = ConfigParser.ConfigParser()
    # keep option names case-sensitive
    conf.optionxform = str
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf, args.pattern)
    for short_name, long_name, twobit_name in all_files:
        if args.exclude and short_name in args.exclude:
            continue
        out_file = os.path.join(args.output, short_name) + ".fasta"
        lz = os.path.join(args.lastz, long_name)
        out = fasta.FastaWriter(out_file)
        try:
            # the 2bit handle is only needed while slicing sequences
            with open(twobit_name, 'rb') as twobit_handle:
                tb = twobit.TwoBitFile(twobit_handle)
                count = 0
                for row in lastz.Reader(lz, long_format=True):
                    sequence = slice_and_return_fasta(tb, row, args.flank)
                    out.write(sequence)
                    count += 1
            print("\t{} sequences written to {}".format(count, out_file))
        finally:
            # close the writer even when reading or slicing fails
            out.close()
def fix_nonref_positions(in_file, ref_file):
    """Fix Genotyping VCF positions where the bases are all variants.

    The plink/pseq output does not handle these correctly, and has all
    reference/variant bases reversed.

    Header lines are copied unchanged.  For every other line the chromosome
    name is reconciled with the 2bit reference (``chr`` prefix removed, or
    ``23`` mapped to ``X``/``chrX``), the reference base is looked up and the
    line is passed through ``fix_vcf_line``; lines for which that returns
    ``None`` are dropped.

    :param in_file: input VCF; its name must contain ``-raw.vcf``
    :param ref_file: 2bit reference genome
    :return: path of the corrected VCF (``-raw.vcf`` replaced by ``.vcf``)
    :raises ValueError: if the output name would equal the input name, which
        would truncate the input before it is read
    """
    ignore_chrs = ["."]
    out_file = in_file.replace("-raw.vcf", ".vcf")
    if out_file == in_file:
        raise ValueError(
            "Input file name must contain '-raw.vcf': %s" % in_file)

    with open(ref_file, "rb") as ref_handle:
        ref2bit = twobit.TwoBitFile(ref_handle)
        # sequence names do not change while converting; look them up once
        ref_names = set(ref2bit.keys())
        with open(in_file) as in_handle, open(out_file, "w") as out_handle:
            for line in in_handle:
                if line.startswith("#"):
                    out_handle.write(line)
                    continue
                parts = line.rstrip("\r\n").split("\t")
                pos = int(parts[1])
                if parts[0] not in ref_names:
                    stripped = parts[0].replace("chr", "")
                    if stripped in ref_names:
                        # handle chr/non-chr naming
                        parts[0] = stripped
                    elif parts[0] == "23":
                        # handle X chromosome
                        for test in ["X", "chrX"]:
                            if test in ref_names:
                                parts[0] = test
                ref_base = None
                if parts[0] not in ignore_chrs:
                    try:
                        ref_base = ref2bit[parts[0]].get(pos - 1, pos).upper()
                    except Exception as msg:
                        # ref_base stays None; fix_vcf_line decides what to
                        # do with a line that has no reference base
                        print(
                            f"Skipping line. Failed to retrieve reference base for {str(parts)}\n{msg}"
                        )
                parts = fix_vcf_line(parts, ref_base)
                if parts is not None:
                    out_handle.write("\t".join(parts) + "\n")
    return out_file
def fix_nonref_positions(in_file, ref_file):
    """Fix Genotyping VCF positions where the bases are all variants.

    The plink/pseq output does not handle these correctly, and has all
    reference/variant bases reversed.

    Header lines are copied unchanged.  For every other line the chromosome
    name is prefixed with ``chr`` when it is not a 2bit sequence name, the
    reference base is looked up and the line is passed through
    ``fix_vcf_line``; surviving lines are written with ``chr`` removed from
    the chromosome name.

    :param in_file: input VCF
    :param ref_file: 2bit reference genome
    :return: path of the corrected VCF (``-fix`` inserted before the
        extension of `in_file`)
    """
    ignore_chrs = ["."]
    out_file = "{0}-fix{1}".format(*os.path.splitext(in_file))

    with open(ref_file, "rb") as ref_handle:
        ref2bit = twobit.TwoBitFile(ref_handle)
        # sequence names do not change while converting; look them up once
        ref_names = set(ref2bit.keys())
        with open(in_file) as in_handle:
            with open(out_file, "w") as out_handle:
                for line in in_handle:
                    if line.startswith("#"):
                        out_handle.write(line)
                        continue
                    parts = line.rstrip("\r\n").split("\t")
                    pos = int(parts[1])
                    ref_base = None
                    # test the ignore list before renaming: otherwise "."
                    # becomes "chr.", is no longer ignored and the lookup
                    # below fails
                    if parts[0] not in ignore_chrs:
                        # handle chr/non-chr naming
                        if parts[0] not in ref_names:
                            parts[0] = "chr" + parts[0]
                        try:
                            ref_base = ref2bit[parts[0]].get(pos - 1,
                                                             pos).upper()
                        except Exception as msg:
                            # off the end of the chromosome
                            if str(msg).startswith("end before start"):
                                print(msg)
                            else:
                                print(parts)
                                raise
                    parts = fix_vcf_line(parts, ref_base)
                    if parts is not None:
                        parts[0] = parts[0].replace("chr", "")
                        out_handle.write("\t".join(parts) + "\n")
    return out_file
def main():
    """Re-extract FASTA records from a 2bit genome with extra flanking bases.

    The coordinates of each record are taken from the second ``|``-separated
    field of its description.  ``delta`` is the difference between
    ``args.buffer_length`` and the record length; when it is smaller than
    ``args.buffer_length`` the locus is widened by ``delta / 2`` on each
    side, re-read from the genome and kept only if it passes the N-count and
    masking filters.  All other records are copied through unchanged.  A
    summary line is printed at the end.
    """
    args = get_args()
    filtered = 0
    kept = 0
    skipped = 0
    # the 2bit handle is needed for the whole run and closed afterwards
    with open(args.twobit, 'rb') as twobit_handle:
        tb = twobit.TwoBitFile(twobit_handle)
        with open(args.output, 'w') as outf:
            with open(args.fasta, 'rU') as fasta:
                for record in SeqIO.parse(fasta, 'fasta'):
                    chromo, start, end = get_positions_from_coords(
                        record.description.split('|')[1])
                    delta = args.buffer_length - (end - start)
                    if not delta < args.buffer_length:
                        outf.write(record.format('fasta'))
                        skipped += 1
                        continue
                    split = int(round(delta / 2.))
                    # never reach before the start of the sequence
                    new_start = max(start - split, 0)
                    new_end = end + split
                    sequence = tb[str(chromo)][new_start:new_end]
                    if n_count(sequence) <= args.max_n and \
                            not sequence_is_masked(args.mask, sequence):
                        seq = create_sequence_object(record.id, sequence,
                                                     chromo, new_start,
                                                     new_end)
                        outf.write(seq.format('fasta'))
                        kept += 1
                    else:
                        filtered += 1
    print("Total {} sequences. Expanded {} and filtered {} with > {}% masked bases or > {} masked bases. Kept {}.".format(
        kept + filtered + skipped, kept + filtered, filtered,
        args.mask * 100, args.max_n, kept + skipped))
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a SAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    Every read receives a ``GC`` tag and, when its fragment GC content is
    known, the ``CO`` (correction factor) and ``CP`` (number of copies) tags.
    Unless `tag_but_not_change_number` is set, each read is written ``CP``
    times.  The SAM file is finally converted to BAM with samtools and the
    name of the BAM file is returned.  `step` is not used by this function.

    First, check if samtools can be executed, otherwise the test will fail
    >>> resp = cfg.checkProgram(samtools, 'view', '')
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> res = os.system("{} index {}".format(test.samtools, tempFile))
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['CP'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    # the 2bit handle is needed while the GC content of reads is computed
    with open(global_vars['2bit'], 'rb') as tbit_handle:
        tbit = twobit.TwoBitFile(tbit_handle)
        bam = pysam.Samfile(global_vars['bam'])
        tempFileName = utilities.getTempFileName(suffix='.sam')

        outfile = pysam.Samfile(tempFileName, 'wh', template=bam)
        startTime = time.time()
        matePairs = {}
        read_repetitions = 0
        removed_duplicated_reads = 0
        # cache data
        # r.flag & 4 == 0 is to filter unmapped reads that
        # have a genomic position
        reads = [r for r in bam.fetch(chrNameBam, start, end)
                 if r.pos > start and r.flag & 4 == 0]

        for r_index, read in enumerate(reads):
            # check if a mate has already been processed
            # to apply the same correction
            try:
                mate = matePairs.pop(read.qname)
            except KeyError:
                # no mate was recorded for this read (e.g. the mate was
                # removed by some filtering): compute the correction here
                gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
                if gc:
                    copies = numCopiesOfRead(float(1) / R_gc[gc])
                else:
                    copies = 1
            else:
                copies = mate['copies']
                gc = mate['gc']

            # is this read in the same orientation and position as the
            # previous?
            if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                    and read.is_reverse == reads[r_index - 1].is_reverse \
                    and read.pnext == reads[r_index - 1].pnext:
                read_repetitions += 1
                if read_repetitions >= global_vars['max_dup_gc'][gc]:
                    # in other words do not take into account this read
                    copies = 0
                    removed_duplicated_reads += 1
            else:
                read_repetitions = 0

            readName = read.qname
            readTag = read.tags
            if gc:
                GC = int(100 * np.round(float(gc) / fragmentLength,
                                        decimals=2))
                readTag.append(('CO', float(round(float(1) / R_gc[gc], 2))))
                readTag.append(('CP', copies))
            else:
                GC = -1

            readTag.append(('GC', GC))
            read.tags = readTag

            # remember the correction so the mate gets the same one
            if read.is_paired and read.is_proper_pair \
                    and not read.mate_is_unmapped \
                    and not read.is_reverse:
                matePairs[readName] = {'copies': copies, 'gc': gc}

            if tag_but_not_change_number:
                outfile.write(read)
                continue

            for numCop in range(1, copies + 1):
                # the read has to be renamed such that newly
                # formed pairs will match
                if numCop > 1:
                    read.qname = readName + "_%d" % (numCop)
                outfile.write(read)

            if verbose:
                if i % 500000 == 0 and i > 0:
                    endTime = time.time()
                    print("{}, processing {} ({:.1f} per sec) reads "
                          "@ {}:{}-{}".format(
                              multiprocessing.current_process().name,
                              i, i / (endTime - startTime),
                              chrNameBit, start, end))
            i += 1

        outfile.close()
        bam.close()

    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    # convert sam to bam.
    command = '{0} view -bS {1} 2> /dev/null > {1}.bam'.format(
        samtools, tempFileName)
    if verbose:
        sys.stderr.write("running {}\n".format(command))

    run_shell_command(command)
    os.remove(tempFileName)
    return tempFileName + ".bam"
def main(svevents_file, genome_file):
    """Print the VCF lines for every structural-variant event in a file.

    :param svevents_file: file read with ``svevent_reader``
    :param genome_file: 2bit genome; it is opened and parsed so that a
        missing or malformed file still fails up front, but the conversion
        below does not consume it
    """
    with open(genome_file, "rb") as genome_handle:
        twobit.TwoBitFile(genome_handle)
    for event in svevent_reader(svevents_file):
        for vcf_line in _svevent_to_vcf(event):
            print(vcf_line)
def main(hydra_file, genome_file, min_support=0):
    """Convert a Hydra breakpoint file into a VCF file next to it.

    The output is written to `hydra_file` with its extension replaced by
    ``.vcf``.

    :param hydra_file: Hydra input file
    :param genome_file: 2bit reference genome
    :param min_support: passed to the writer as the ``min_support`` option
    """
    options = {"min_support": min_support, "max_single_size": 10000}
    out_file = "{0}.vcf".format(os.path.splitext(hydra_file)[0])
    # keep the genome open only for the duration of the conversion
    with open(genome_file, "rb") as genome_handle:
        genome_2bit = twobit.TwoBitFile(genome_handle)
        with open(out_file, "w") as out_handle:
            hydra_to_vcf_writer(hydra_file, genome_2bit, options, out_handle)
def main(args=None):
    """Compute the GC-bias frequencies of a BAM file and save them.

    Fills the module-level ``global_vars`` with the input files, genome size,
    read density and the read-count thresholds, tabulates the GC content
    with ``tabulateGCcontent``, writes the result with ``np.savetxt`` and
    optionally produces the bias plot.

    :param args: argument list handed to the parser (``None`` by default)
    """
    args = parse_arguments().parse_args(args)

    # only the names of the optional files are kept; their handles are
    # closed right away
    filter_out_file = None
    if args.filterOut:
        filter_out_file = args.filterOut.name
        args.filterOut.close()

    extra_sampling_file = None
    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()

    global global_vars
    global_vars = {
        '2bit': args.genome,
        'bam': args.bamfile,
        'filter_out': filter_out_file,
        'extra_sampling_file': extra_sampling_file,
    }

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = {'median': args.fragmentLength}
    else:
        # no fragment length given: estimate it from the BAM file
        fragment_len_dict, __ = get_read_and_fragment_length(
            args.bamfile, None,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)
        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)

    global_vars['genome_size'] = sum(bit[name].size for name in bit.index)
    global_vars['total_reads'] = bam.mapped
    reads_per_bp = float(global_vars['total_reads']) / args.effectiveGenomeSize
    global_vars['reads_per_bp'] = reads_per_bp

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = list(zip(bam.references, bam.lengths))

    # A Poisson distribution is used to identify peaks that should be
    # discarded. The rate is multiplied by 4 because the real distribution
    # of reads varies with the GC content and the global number of reads
    # per bp may be too low; empirically, a value of at least 4 times the
    # reads_per_bp was found. Similarly, the rate for the minimum is
    # divided by 4.
    median_length = fragment_len_dict['median']
    upper_model = poisson(4 * global_vars['reads_per_bp'] * median_length)
    global_vars['max_reads'] = upper_model.isf(confidence_p_value)
    # the minimum is of little use unless the sequencing depth is really
    # high, as this value is close to 0
    lower_model = poisson(0.25 * global_vars['reads_per_bp'] * median_length)
    global_vars['min_reads'] = lower_model.ppf(confidence_p_value)

    for key, value in global_vars.items():
        print("{}: {}".format(key, value))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))

    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(
            args.regionSize,
            chrNameBitToBam, stepSize * 10,
            chromSizes,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize,
                   image_format=args.plotFileFormat)
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    Returns the tuple ``(subN_gc, subF_gc)``: for every GC value, the number
    of sampled genome fragments with that GC content and the number of
    forward reads starting at those sampled positions.  Positions holding
    ``global_vars['max_reads']`` reads or more are left out of the read
    tally.  Raises NameError when `start` is bigger than `end`.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]
    fragment_size = fragmentLength['median']

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragment_size + 1, dtype='int')
    subF_gc = np.zeros(fragment_size + 1, dtype='int')

    # the 2bit handle is released when the tabulation is finished
    with open(global_vars['2bit'], 'rb') as tbit_handle:
        tbit = twobit.TwoBitFile(tbit_handle)
        bam = bamHandler.openBam(global_vars['bam'])
        startTime = time.time()

        if verbose:
            print("[{:.3f}] computing positions to "
                  "sample".format(time.time() - startTime))

        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)

        read_counts = []
        # Optimize IO.
        # if the sample regions are far apart from each
        # other is faster to go to each location and fetch
        # the reads found there.
        # Otherwise, if the regions to sample are close to
        # each other, is faster to load all the reads in
        # a large region into memory and consider only
        # those falling into the positions to sample.
        # The following code gets the reads
        # that are at sampling positions that lie close together
        if np.mean(np.diff(positions_to_sample)) < 1000:
            start_pos = min(positions_to_sample)
            end_pos = max(positions_to_sample)
            if verbose:
                print("[{:.3f}] caching reads".format(
                    time.time() - startTime))

            counts = np.bincount(
                [r.pos - start_pos
                 for r in bam.fetch(chromNameBam, start_pos, end_pos + 1)
                 if not r.is_reverse and r.pos >= start_pos],
                minlength=end_pos - start_pos + 2)

            read_counts = counts[positions_to_sample -
                                 min(positions_to_sample)]
            if verbose:
                print("[{:.3f}] finish caching reads.".format(
                    time.time() - startTime))

        countTime = time.time()

        # predefine `index` so the summary below also works when there is
        # nothing to sample
        index = 0
        for index, i in enumerate(positions_to_sample):
            # stop if the end of the chromosome is reached
            if i + fragment_size > tbit[chromNameBit].size:
                break

            try:
                gc = getGC_content(
                    tbit[chromNameBit].get(i, i + fragment_size),
                    as_fraction=False)
            except Exception as detail:
                # positions whose GC content cannot be computed are skipped
                if verbose:
                    print(detail)
                continue

            subN_gc[gc] += 1

            # count all reads at position 'i'
            if len(read_counts) == 0:  # case when no cache was done
                num_reads = len(
                    [x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                     if x.is_reverse is False and x.pos == i])
            else:
                num_reads = read_counts[index]

            if num_reads >= global_vars['max_reads']:
                # positions with too many reads (peaks) do not contribute
                # to the read tally
                continue

            subF_gc[gc] += num_reads
            if verbose:
                if index % 50000 == 0:
                    endTime = time.time()
                    print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                          (multiprocessing.current_process().name,
                           index, index / (endTime - countTime),
                           chromNameBit, start, end, stepSize))

    if verbose:
        endTime = time.time()
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (
            multiprocessing.current_process().name,
            (endTime - startTime), chromNameBit, start, end, stepSize))

    return (subN_gc, subF_gc)
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Every usable read adds ``1 / R_gc[gc]`` to the coverage of the bases
    spanned by its fragment; the coverage is then averaged over bins of
    `step` bases and bins with a positive mean are written.  Returns the
    name of the temporary bedgraph file, or ``None`` when no read
    contributed.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    # the 2bit handle is needed while the GC content of reads is computed
    with open(global_vars['2bit'], 'rb') as tbit_handle:
        tbit = twobit.TwoBitFile(tbit_handle)
        bam = pysam.Samfile(global_vars['bam'])
        read_repetitions = 0
        startTime = time.time()

        # caching seems to be faster
        # r.flag & 4 == 0 is to skip unmapped
        # reads that nevertheless are assigned
        # to a genomic position
        reads = [r for r in bam.fetch(chrNameBam, start, end)
                 if r.flag & 4 == 0]
        bam.close()

        for r_index, read in enumerate(reads):
            try:
                # calculate GC content of read fragment
                gc = getReadGCcontent(tbit, read, fragmentLength,
                                      chrNameBit)
            except Exception as detail:
                # this exception happens when the end of a
                # chromosome is reached
                print(detail)
                continue
            if not gc:
                continue

            # is this read in the same orientation and position as the
            # previous?
            if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                    read.is_reverse == reads[r_index - 1].is_reverse \
                    and read.pnext == reads[r_index - 1].pnext:
                read_repetitions += 1
                if read_repetitions >= global_vars['max_dup_gc'][gc]:
                    # too many copies of the same read: drop this one
                    continue
            else:
                read_repetitions = 0

            try:
                fragmentStart, fragmentEnd = getFragmentFromRead(
                    read, fragmentLength, extendPairedEnds=True)
                vectorStart = max(fragmentStart - start, 0)
                vectorEnd = min(fragmentEnd - start, end - start)
            except TypeError:
                # getFragmentFromRead returns None in some cases.
                # Those cases are to be skipped, hence the continue line.
                continue

            cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
            i += 1

    if debug:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) "
              "reads @ {}:{}-{}".format(
                  multiprocessing.current_process().name,
                  i, i / (endTime - startTime),
                  chrNameBit, start, end))

    if i == 0:
        return None

    tempFileName = utilities.getTempFileName(suffix='.bg')
    # save in bedgraph format
    with open(tempFileName, 'w') as bedgraph:
        for offset in range(0, len(cvg_corr), step):
            # slicing clips at the end of the array, so the last bin may be
            # shorter than `step`
            value = np.mean(cvg_corr[offset:offset + step])
            if value > 0:
                writeStart = start + offset
                writeEnd = min(start + offset + step, end)
                bedgraph.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit,
                                                       writeStart,
                                                       writeEnd, value))
    return tempFileName
def main(args=None):
    """Apply the GC-bias correction and write a BAM, bedgraph or bigwig file.

    Loads the GC frequencies (``F_gc``, ``N_gc``, ``R_gc``) into module
    globals, fills ``global_vars``, splits the genome (or the requested
    region) into chunks and processes them with the worker wrappers.  The
    intermediate files are merged into ``args.correctedFile`` and removed.

    :param args: argument list handed to ``process_args`` (``None`` by
        default)
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    # the 2bit file stays open: sequence sizes are read again for bigwig
    bit = twobit.TwoBitFile(open(global_vars['2bit'], 'rb'))
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum(bit[x].size for x in bit.index)
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)
    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    print("{} {}".format(chrNameBitToBam, chrNameBamToBit))
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            if chrom not in chrNameBamToBit:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom],
                            i, length, bedGraphStep))

    use_pool = len(mp_args) > 1 and args.numberOfProcessors > 1
    corrected_name = args.correctedFile.name
    pool = multiprocessing.Pool(args.numberOfProcessors)
    try:
        if corrected_name.endswith('bam'):
            if use_pool:
                print("using {} processors for {} "
                      "number of tasks".format(args.numberOfProcessors,
                                               len(mp_args)))
                res = pool.map_async(writeCorrectedSam_wrapper,
                                     mp_args).get(9999999)
            else:
                res = [writeCorrectedSam_wrapper(task) for task in mp_args]

            if len(res) == 1:
                command = "cp {} {}".format(res[0], corrected_name)
                run_shell_command(command)
            else:
                print("concatenating (sorted) intermediate BAMs")
                header = pysam.Samfile(res[0])
                of = pysam.Samfile(corrected_name, "wb", template=header)
                header.close()
                for part_name in res:
                    part = pysam.Samfile(part_name)
                    for e in part.fetch(until_eof=True):
                        of.write(e)
                    part.close()
                of.close()

            print("indexing BAM")
            pysam.index(corrected_name)

            for tempFileName in res:
                os.remove(tempFileName)

        if corrected_name.endswith('bg') or corrected_name.endswith('bw'):
            _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
            if use_pool:
                res = pool.map_async(writeCorrected_wrapper,
                                     mp_args).get(9999999)
            else:
                res = [writeCorrected_wrapper(task) for task in mp_args]

            # concatenate intermediary bedgraph files; both sides are
            # opened in binary mode so the bytes are copied unchanged
            with open(_temp_bg_file_name, 'wb') as _temp_bg_file:
                for tempFileName in res:
                    # workers return None for chunks without any reads
                    if tempFileName:
                        with open(tempFileName, 'rb') as chunk_file:
                            shutil.copyfileobj(chunk_file, _temp_bg_file)
                        os.remove(tempFileName)
            args.correctedFile.close()

            if corrected_name.endswith('bg'):
                shutil.move(_temp_bg_file_name, corrected_name)
            else:
                chromSizes = [(x, bit[x].size) for x in bit.keys()]
                writeBedGraph.bedGraphToBigWig(chromSizes,
                                               _temp_bg_file_name,
                                               corrected_name)
                # remove the temporary bedgraph by name (os.remove needs a
                # path, not the file object)
                os.remove(_temp_bg_file_name)
    finally:
        # all results were collected synchronously above, so the workers
        # are idle; release them
        pool.terminate()
        pool.join()
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    Every read receives a ``YG`` tag and, when its fragment GC content is
    known, the ``YC`` (correction factor) and ``YN`` (number of copies)
    tags.  Unless `tag_but_not_change_number` is set, each read is written
    ``YN`` times.  The name of the temporary BAM file is returned.  `step`
    is not used by this function.

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> from StringIO import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    # the 2bit handle is needed while the GC content of reads is computed
    with open(global_vars['2bit'], 'rb') as tbit_handle:
        tbit = twobit.TwoBitFile(tbit_handle)
        bam = pysam.Samfile(global_vars['bam'])
        tempFileName = utilities.getTempFileName(suffix='.bam')

        outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
        startTime = time.time()
        matePairs = {}
        read_repetitions = 0
        removed_duplicated_reads = 0
        # cache data
        # r.flag & 4 == 0 is to filter unmapped reads that
        # have a genomic position
        reads = [r for r in bam.fetch(chrNameBam, start, end)
                 if r.pos > start and r.flag & 4 == 0]

        for r_index, read in enumerate(reads):
            # check if a mate has already been processed
            # to apply the same correction
            try:
                mate = matePairs.pop(read.qname)
            except KeyError:
                # no mate was recorded for this read (e.g. the mate was
                # removed by some filtering): compute the correction here
                gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
                if gc:
                    copies = numCopiesOfRead(float(1) / R_gc[gc])
                else:
                    copies = 1
            else:
                copies = mate['copies']
                gc = mate['gc']

            # is this read in the same orientation and position as the
            # previous?
            if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                    and read.is_reverse == reads[r_index - 1].is_reverse \
                    and read.pnext == reads[r_index - 1].pnext:
                read_repetitions += 1
                if read_repetitions >= global_vars['max_dup_gc'][gc]:
                    # in other words do not take into account this read
                    copies = 0
                    removed_duplicated_reads += 1
            else:
                read_repetitions = 0

            readName = read.qname
            # Each tag is a tuple of (tag name, value, type)
            # Note that get_tags() returns ord(type) rather than type and
            # this must be fixed!
            # It turns out that the "with_value_type" option only started
            # working in pysam-0.8.4, so we can't reliably add tags on
            # earlier versions without potentially creating BAM files that
            # break HTSJDK/IGV/etc.
            readTag = read.get_tags(with_value_type=True)
            replace_tags = False
            if len(readTag) > 0:
                if len(readTag[0]) == 3:
                    # look at the type field of the first tag (not at the
                    # third tag) to decide whether type codes are integers
                    if isinstance(readTag[0][2], int):
                        readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                    replace_tags = True
            else:
                replace_tags = True

            if gc:
                GC = int(100 * np.round(float(gc) / fragmentLength,
                                        decimals=2))
                readTag.append(
                    ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
                readTag.append(('YN', copies, "i"))
            else:
                GC = -1

            readTag.append(('YG', GC, "i"))
            if replace_tags:
                read.set_tags(readTag)

            # remember the correction so the mate gets the same one
            if read.is_paired and read.is_proper_pair \
                    and not read.mate_is_unmapped \
                    and not read.is_reverse:
                matePairs[readName] = {'copies': copies, 'gc': gc}

            if tag_but_not_change_number:
                outfile.write(read)
                continue

            for numCop in range(1, copies + 1):
                # the read has to be renamed such that newly
                # formed pairs will match
                if numCop > 1:
                    read.qname = readName + "_%d" % (numCop)
                outfile.write(read)

            if verbose:
                if i % 500000 == 0 and i > 0:
                    endTime = time.time()
                    print("{}, processing {} ({:.1f} per sec) reads "
                          "@ {}:{}-{}".format(
                              multiprocessing.current_process().name,
                              i, i / (endTime - startTime),
                              chrNameBit, start, end))
            i += 1

        outfile.close()
        bam.close()

    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName