# Imports needed by the functions below. Note that `twobit` is assumed to be
# the `twobitreader` module in the first main() (path-based constructor,
# sequence_sizes()) and bx-python's `bx.seq.twobit` in the last main()
# (file-object constructor, .index). The remaining undefined names
# (process_args, parse_arguments, openBam, bamHandler, tbitToBamChrName,
# run_shell_command, get_read_and_fragment_length, tabulateGCcontent,
# countReadsPerGC, plotGCbias, writeCorrectedSam_wrapper,
# writeCorrected_wrapper, mapReduce, utilities, writeBedGraph) are helpers
# defined elsewhere in the deepTools codebase.
import multiprocessing
import os
import shutil

import numpy as np
import pysam
import py2bit
from scipy.stats import binom, poisson


def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # Compute, per GC bin, the read count above which additional reads at a
    # position are considered redundant (duplicates). A binomial model is
    # used: each of the F_gc fragments lands on a given position with
    # probability 1 / N_gc.
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = twobit.TwoBitFile(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum(tbit.sequence_sizes().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # Divide the genome into chunks containing about 4e5 reads each; that
    # many reads take roughly 20 seconds per core to process
    # (measured on 48 cores, 256 GB memory).
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of (chromosome name, length) tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize

    chrNameBitToBam = tbitToBamChrName(list(tbit.sequence_sizes().keys()),
                                       bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)

    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for chromosome {} "
                      "in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} tasks".format(
                args.numberOfProcessors, len(mp_args)))

            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb",
                               template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        # concatenate all intermediate tempfiles into one bedGraph file
        _temp_bg_file = open(_temp_bg_file_name, 'wb')
        for tempFileName in res:
            if tempFileName:
                shutil.copyfileobj(open(tempFileName, 'rb'), _temp_bg_file)
                os.remove(tempFileName)
        _temp_bg_file.close()
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)
        else:
            chromSizes = [(k, v) for k, v in tbit.sequence_sizes().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            os.remove(_temp_bg_file_name)
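# A minimal sketch (not part of the original module; values invented) of the
# duplicate-threshold logic in main() above: a GC bin observed in f_gc
# fragments spread over n_gc candidate genomic positions means each fragment
# lands on a given position with probability 1 / n_gc, so per-position read
# counts follow Binomial(f_gc, 1 / n_gc). binom.isf(pvalue, ...) inverts the
# survival function, returning the count exceeded only with probability
# <= pvalue; reads beyond it are treated as redundant duplicates.
def _example_max_duplicates(f_gc=50000, n_gc=2000000, pvalue=1e-7):
    from scipy.stats import binom
    # Smallest count k with P(X > k) <= pvalue for X ~ Binomial(f_gc, 1/n_gc)
    return binom.isf(pvalue, f_gc, 1.0 / n_gc)

# For these illustrative values the threshold is a small integer (a few
# reads): seeing more than that many reads at a single position in this GC
# bin is deemed artifactual.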
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # Compute, per GC bin, the read count above which additional reads at a
    # position are considered redundant (duplicates), using the same
    # binomial model as in the previous version.
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True,
                                           nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # Divide the genome into chunks containing about 4e5 reads each; that
    # many reads take roughly 20 seconds per core to process
    # (measured on 48 cores, 256 GB memory).
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of (chromosome name, length) tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()),
                                       bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)

    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for chromosome {} "
                      "in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length,
                            bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print("using {} processors for {} tasks".format(
                args.numberOfProcessors, len(mp_args)))

            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb",
                               template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            # concatenate the intermediate bedGraph files into the output
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
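# A hypothetical sketch (names and numbers invented) of the chunking
# arithmetic shared by both correctGCBias variants above: chunks are sized
# to hold roughly 4e5 reads given the genome-wide read density, and every
# chromosome is cut into (chrom, start, end) tasks of at most that size.
def _example_partition(chrom_sizes, total_reads=100000000,
                       effective_genome_size=2913022398):
    reads_per_bp = float(total_reads) / effective_genome_size
    chunk_size = int(4e5 / reads_per_bp)  # bp covered by ~4e5 reads
    tasks = []
    for chrom, size in chrom_sizes:
        for start in range(0, size, chunk_size):
            # (chromosome, chunk start, chunk end), clipped at chrom end
            tasks.append((chrom, start, min(size, start + chunk_size)))
    return tasks

# Example: _example_partition([('chr1', 248956422)]) yields about 22 tasks
# of roughly 11.7 Mbp each for these illustrative depth figures.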
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = {'median': args.fragmentLength}
    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()),
                                       bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of (chromosome name, length) tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # Use a Poisson distribution to identify peaks that should be
    # discarded. The rate is multiplied by 4 because the real read
    # distribution varies with GC content, so the global reads-per-bp
    # average may be too low; empirically, a factor of at least 4 was
    # needed. The minimum threshold is likewise divided by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # The minimum threshold is rarely useful unless the sequencing depth is
    # very high, as this value is close to 0.
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # The GC content of the genome is sampled every stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize,
                   image_format=args.plotFileFormat)
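# A small sketch (illustrative values, not from the module) of the Poisson
# cutoffs computed in main() above. With rate lambda = reads_per_bp times
# the median fragment length, isf() gives the read count exceeded only with
# probability confidence_p_value (positions with more reads are discarded
# as peaks), and ppf() gives the count undershot with that same probability
# (suspiciously read-poor positions).
def _example_read_count_cutoffs(reads_per_bp=0.03, fragment_len=200,
                                sample_size=50000):
    from scipy.stats import poisson
    confidence_p_value = 1.0 / sample_size
    lam = reads_per_bp * fragment_len
    # The module scales lambda by 4 (and by 1/4 for the minimum) because
    # read density varies with GC content, so the global average alone
    # would flag too many positions.
    max_reads = poisson(4 * lam).isf(confidence_p_value)
    min_reads = poisson(0.25 * lam).ppf(confidence_p_value)
    return max_reads, min_reads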
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    # bx-python's TwoBitFile expects an open (binary) file handle rather
    # than a path.
    bit = twobit.TwoBitFile(open(global_vars['2bit'], 'rb'))
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = {'median': args.fragmentLength}
    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(bit.index.keys()),
                                       bam.references)

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of (chromosome name, length) tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # Use a Poisson distribution to identify peaks that should be
    # discarded. The rate is multiplied by 4 because the real read
    # distribution varies with GC content, so the global reads-per-bp
    # average may be too low; empirically, a factor of at least 4 was
    # needed. The minimum threshold is likewise divided by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # The minimum threshold is rarely useful unless the sequencing depth is
    # very high, as this value is close to 0.
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # The GC content of the genome is sampled every stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize,
                   image_format=args.plotFileFormat)
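# A hypothetical, simplified stand-in for the tbitToBamChrName() helper used
# in all four functions above: it pairs 2bit chromosome names with BAM
# reference names, tolerating a missing or extra "chr" prefix. The real
# deepTools helper covers more naming conventions; this sketch is for
# illustration only.
def _example_chrom_name_map(tbit_names, bam_refs):
    bam_set = set(bam_refs)
    mapping = {}
    for name in tbit_names:
        if name in bam_set:
            mapping[name] = name
        elif name.startswith('chr') and name[3:] in bam_set:
            # 2bit uses "chr1" while the BAM uses "1"
            mapping[name] = name[3:]
        elif 'chr' + name in bam_set:
            # 2bit uses "1" while the BAM uses "chr1"
            mapping[name] = 'chr' + name
    return mapping

# Example: _example_chrom_name_map(['chr1', 'chr2'], ['1', '2'])
# returns {'chr1': '1', 'chr2': '2'}.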