def get_num_kept_reads(args):
    """
    Subtracts from the total number of mapped reads in a BAM file the reads
    that fall completely within blacklisted regions, then scales the result
    by the fraction of reads that survive filtering.

    :return: tuple (num_kept_reads, bam_mapped_total)
    """
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped_total = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
    if args.blackListFileName:
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization,
                                                      args.blackListFileName, args.numberOfProcessors)
        print("There are {0} alignments, of which {1} are completely "
              "within a blacklist region.".format(bam_mapped_total, blacklisted))
        num_kept_reads = bam_mapped_total - blacklisted
    else:
        num_kept_reads = bam_mapped_total
    ftk = fraction_kept(args)
    if ftk < 1:
        num_kept_reads *= ftk
        print("Due to filtering, {0}% of the aforementioned alignments "
              "will be used ({1}).".format(100 * ftk, num_kept_reads))

    return num_kept_reads, bam_mapped_total
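
# Hedged sketch (not part of the module): _sketch_num_kept_reads is a
# hypothetical helper that isolates the arithmetic performed by
# get_num_kept_reads() with invented numbers, so no BAM file or deepTools
# helpers are needed.
def _sketch_num_kept_reads(bam_mapped_total=10_000_000, blacklisted=250_000, ftk=0.9):
    num_kept_reads = bam_mapped_total - blacklisted  # drop blacklisted alignments
    if ftk < 1:
        num_kept_reads *= ftk                        # scale by the fraction surviving filters
    return num_kept_reads, bam_mapped_total
# _sketch_num_kept_reads() -> (8775000.0, 10000000)
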
def fraction_kept(args):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Blacklisted regions are already accounted for. This works by sampling the
    genome (by default, we'll iterate until we sample 1% of the alignments or
    100,000 alignments, whichever is smaller; if there are fewer than 100,000
    alignments, everything is sampled). The sampling works by dividing the
    genome into bins and only looking at the first 50000 bases of each bin.
    If this doesn't yield sufficient alignments then the bin size is halved.
    """
    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
    num_needed_to_sample = max(bam_mapped if bam_mapped <= 100000 else 0,
                               min(100000, 0.01 * bam_mapped))
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx])
                       for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    while total < num_needed_to_sample and distanceBetweenBins > 50000:
        # If we've iterated, then halve distanceBetweenBins
        distanceBetweenBins /= 2
        if distanceBetweenBins < 50000:
            distanceBetweenBins = 50000

        res = mapReduce.mapReduce((bam_handle.filename, args),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            filtered, total = np.sum(res, axis=0)

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
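
# Hedged sketch: _sketch_sampling_target is a hypothetical helper (not in the
# module) that isolates the num_needed_to_sample expression above: sample 1%
# of the mapped reads or 100,000 alignments, whichever is smaller, unless the
# file holds at most 100,000 alignments, in which case sample them all.
def _sketch_sampling_target(bam_mapped):
    return max(bam_mapped if bam_mapped <= 100000 else 0,
               min(100000, 0.01 * bam_mapped))
# _sketch_sampling_target(50_000)      -> 50000   (few reads: sample everything)
# _sketch_sampling_target(2_000_000)   -> 20000.0 (1% of the reads)
# _sketch_sampling_target(50_000_000)  -> 100000  (capped at 100,000)
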
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Get the total counts, excluding blacklisted regions and filtered reads
    totalCounts = []
    fhs = [openBam(x) for x in args.bamfiles]
    for i, bam_handle in enumerate(fhs):
        bam_mapped = utilities.bam_total_reads(bam_handle, None)
        blacklisted = utilities.bam_blacklisted_reads(bam_handle, None, args.blackListFileName, args.numberOfProcessors)
        if args.verbose:
            print("There are {0} alignments in {1}, of which {2} are completely "
                  "within a blacklist region.".format(bam_mapped, args.bamfiles[i], blacklisted))
        bam_mapped -= blacklisted

        args.bam = args.bamfiles[i]
        args.ignoreForNormalization = None
        ftk = fraction_kept(args)
        bam_mapped *= ftk
        totalCounts.append(bam_mapped)

    # Get fragment size and chromosome dict
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired-end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired-end data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    for x in res:
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\n".format(x, k, (100.0 * v) / totalCounts[i]))
        of.close()
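
# Hedged sketch: _sketch_merge_feature_counts is a hypothetical helper
# mirroring the featureCounts merge loop in main() above; the `res` chunks and
# feature names below are invented for illustration only.
def _sketch_merge_feature_counts(res, n_bamfiles, features):
    featureCounts = [{f: 0 for f in features} for _ in range(n_bamfiles)]
    for chunk in res:                            # one entry per genome chunk
        for i, per_file in enumerate(chunk[0]):  # one dict per BAM file
            for feature, count in per_file.items():
                featureCounts[i][feature] += count
    return featureCounts
# Example with one BAM file and two genome chunks:
# _sketch_merge_feature_counts(
#     [([{"exon": 10, "intron": 5}], ["exon", "intron"]),
#      ([{"exon": 3, "intron": 7}], ["exon", "intron"])],
#     1, ["exon", "intron"])
# -> [{"exon": 13, "intron": 12}]
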
def fraction_kept(args, stats):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Blacklisted regions are already accounted for. This works by sampling the
    genome (by default, we'll sample 10% of the alignments or 1,000,000
    alignments, whichever is larger; if there are fewer than 1,000,000
    alignments, everything is sampled). The sampling works by dividing the
    genome into bins and only looking at a 50000-base window within each bin.
    If this doesn't yield sufficient alignments, additional non-overlapping
    windows (offsets) within each bin are sampled.
    """
    # Do we even need to proceed?
    if (not args.minMappingQuality or args.minMappingQuality == 0) and \
       (not args.samFlagInclude or args.samFlagInclude == 0) and \
       (not args.samFlagExclude or args.samFlagExclude == 0) and \
       (not args.minFragmentLength or args.minFragmentLength == 0) and \
       (not args.maxFragmentLength or args.maxFragmentLength == 0):
        if hasattr(args, "filterRNAstrand"):
            if args.filterRNAstrand not in ["forward", "reverse"]:
                return 1.0
        else:
            return 1.0

    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
    if bam_mapped < 1000000:
        num_needed_to_sample = bam_mapped
    else:
        if 0.1 * bam_mapped >= 1000000:
            num_needed_to_sample = 0.1 * bam_mapped
        else:
            num_needed_to_sample = 1000000
    if args.exactScaling:
        num_needed_to_sample = bam_mapped
    if num_needed_to_sample == bam_mapped:
        distanceBetweenBins = 55000
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx])
                       for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    offset = 0
    # Iterate over bins at various non-overlapping offsets until we have enough data
    while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000):
        res = mapReduce.mapReduce((bam_handle.filename, args, offset),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            foo, bar = np.sum(res, axis=0)
            filtered += foo
            total += bar
        offset += 1

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
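
# Hedged sketch: _sketch_max_offsets is a hypothetical helper showing the
# upper bound on the offset loop above. It assumes each pass samples a
# different, non-overlapping 50 kb window within every genome bin, so at most
# ceil(distanceBetweenBins / 50000) passes are possible. `np` is the
# module-level numpy import used elsewhere in this file.
def _sketch_max_offsets(distanceBetweenBins=2_000_000, window=50_000):
    return int(np.ceil(distanceBetweenBins / window))
# _sketch_max_offsets()        -> 40
# _sketch_max_offsets(55_000)  -> 2  (used when every alignment must be sampled)
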