def get_scale_factor(args):
    """Return the factor used to scale the coverage computed from ``args.bam``.

    The computation starts from the user supplied ``args.scaleFactor`` and
    multiplies it by a normalization term:

    * ``args.normalizeTo1x`` set: ``1 / current_coverage`` with
      ``current_coverage = mapped_reads * fragment_length / args.normalizeTo1x``.
    * otherwise, ``args.normalizeUsingRPKM`` set:
      ``1 / (mapped reads in millions * args.binSize in kb)``.
    * neither option set: ``args.scaleFactor`` is returned unchanged.

    ``mapped_reads`` is the value returned by ``parserCommon.bam_total_reads``
    minus the value returned by ``parserCommon.bam_blacklisted_reads``.

    The fragment length used for the 1x normalization is chosen as follows:

    * ``args.extendReads`` falsy: the median read length.
    * ``args.extendReads is True``: the median fragment length estimated
      from the BAM file (requires a non-empty fragment length result).
    * any other value: ``args.extendReads`` itself, which must lie in the
      range ``[1, 2000]``.

    :param args: parsed command line options. Attributes read: ``bam``,
        ``scaleFactor``, ``ignoreForNormalization``, ``blackListFileName``,
        ``normalizeTo1x``, ``normalizeUsingRPKM``, ``extendReads``,
        ``binSize``, ``numberOfProcessors`` and ``verbose``.
    :return: the resulting scale factor.
    :raises SystemExit: if ``args.extendReads`` is ``True`` but no fragment
        length could be estimated, or if a numeric ``args.extendReads`` is
        smaller than 1 or larger than 2000.

    NOTE(review): ``debug``, ``bamHandler`` and ``parserCommon`` are expected
    to be defined at module level; they are not created in this function.
    """
    # sys.exit() instead of exit(): the latter is injected by the ``site``
    # module and is meant for interactive sessions only.
    import sys

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = parserCommon.bam_total_reads(bam_handle,
                                              args.ignoreForNormalization)
    blacklisted = parserCommon.bam_blacklisted_reads(bam_handle,
                                                     args.ignoreForNormalization,
                                                     args.blackListFileName)
    # whatever is reported as blacklisted does not count towards the total
    bam_mapped -= blacklisted

    if args.normalizeTo1x:
        # estimate read length and, for paired-end data, fragment length
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            return_lengths=False,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)

        if not args.extendReads:
            # no extension requested: use the read length as fragment length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(fragment_length))
        elif args.extendReads is True:
            # extension requested without a length: rely on the estimate,
            # which is only available for paired-end libraries
            if not frag_len_dict:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            fragment_length = frag_len_dict['median']
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {}".format(fragment_length))
        elif args.extendReads < 1:
            sys.exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            fragment_length = args.extendReads

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            # report the computed factor, not the untouched user input
            print("Scaling factor {}".format(scale_factor))

    elif args.normalizeUsingRPKM:
        # the RPKM is the # reads per tile /
        #    ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000
        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)
        if debug:
            # report the computed factor, not the untouched user input
            print("scale factor using RPKM is {0}".format(scale_factor))

    return scale_factor
def get_scale_factors(args):
    """Return the pair of scale factors for ``args.bamfile1`` and ``args.bamfile2``.

    The factors come from one of three sources:

    * ``args.scaleFactors`` (a ``"f1:f2"`` string), parsed into a list of
      floats, when given;
    * ``estimateScaleFactor`` (its ``'size_factors'`` entry) when
      ``args.scaleFactorsMethod == 'SES'``;
    * ``min(mapped1, mapped2) / [mapped1, mapped2]`` when
      ``args.scaleFactorsMethod == 'readCount'``.

    If ``args.ratio == 'subtract'`` and ``args.scaleFactors`` is ``None``,
    both factors are additionally multiplied by a coverage normalization
    term. That term is based on the sample whose factor equals 1 (the first
    sample if its factor is 1, the second one otherwise):

    * ``args.normalizeTo1x`` set: ``1 / current_coverage`` with
      ``current_coverage = mapped_reads * fragment_length / args.normalizeTo1x``;
    * otherwise: ``1 / (mapped reads in millions * args.binSize in kb)``
      (``args.normalizeUsingRPKM`` is not consulted here).

    :param args: parsed command line options. Attributes read: ``bamfile1``,
        ``bamfile2``, ``ignoreForNormalization``, ``scaleFactors``,
        ``scaleFactorsMethod``, ``sampleLength``, ``numberOfSamples``,
        ``numberOfProcessors``, ``verbose``, ``ratio``, ``normalizeTo1x``,
        ``extendReads`` and ``binSize``.
    :return: the two scale factors; a list when taken verbatim from
        ``args.scaleFactors``, a numpy array when computed from read counts
        or after normalization.
    :raises SystemExit: if no scale factors were given and
        ``args.scaleFactorsMethod`` is neither ``'SES'`` nor ``'readCount'``;
        if ``args.extendReads`` is ``True`` but no fragment length could be
        estimated; or if a numeric ``args.extendReads`` is outside
        ``[1, 2000]``.
    """
    # sys.exit() instead of exit(): the latter is injected by the ``site``
    # module and is meant for interactive sessions only.
    import sys

    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        # a real list (not a lazy ``map`` object) because it is indexed below
        scale_factors = [float(value) for value in args.scaleFactors.split(":")]
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [bam1.filename, bam2.filename],
            args.sampleLength, args.numberOfSamples,
            1,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("size factor if the number of mapped "
                  "reads would have been used:")
            # NOTE: uses the ``mapped`` attribute of the handles, i.e. not
            # the counts obtained with ``ignoreForNormalization`` above
            print(tuple(
                float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))
    elif args.scaleFactorsMethod == 'readCount':
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))
    else:
        # without this branch ``scale_factors`` would be unbound below
        sys.exit("*ERROR*: unknown scale factors method: {} ".format(args.scaleFactorsMethod))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        # user supplied factors are returned as given, without normalization
        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # estimate read length and, for paired-end data, fragment length
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    bamfile,
                    return_lengths=False,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

                if not args.extendReads:
                    # no extension requested: use the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(fragment_length))
                elif args.extendReads is True:
                    # extension requested without a length: rely on the
                    # estimate, only available for paired-end libraries
                    if not frag_len_dict:
                        sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                    fragment_length = frag_len_dict['median']
                    if args.verbose:
                        print("Fragment length based on paired en data "
                              "estimated to be {}".format(fragment_length))
                elif args.extendReads < 1:
                    sys.exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                elif args.extendReads > 2000:
                    sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                else:
                    fragment_length = args.extendReads

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    # single expression so the value is always part of the message
                    print("scale factor for "
                          "RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
def get_scale_factor(args):
    """Return the factor used to scale the coverage computed from ``args.bam``.

    The computation starts from the user supplied ``args.scaleFactor`` and
    multiplies it by a normalization term:

    * ``args.normalizeTo1x`` set: ``1 / current_coverage`` with
      ``current_coverage = mapped_reads * fragment_length / args.normalizeTo1x``.
    * otherwise, ``args.normalizeUsingRPKM`` set:
      ``1 / (mapped reads in millions * args.binSize in kb)``.
    * neither option set: ``args.scaleFactor`` is returned unchanged.

    ``mapped_reads`` is the value returned by ``parserCommon.bam_total_reads``
    for the BAM file opened with ``args.bam`` and ``args.bamIndex``.

    The fragment length used for the 1x normalization is chosen as follows:

    * ``args.extendReads`` falsy: the median read length.
    * ``args.extendReads is True``: the median fragment length estimated
      from the BAM file (requires a non-empty fragment length result).
    * any other value: ``args.extendReads`` itself, which must lie in the
      range ``[1, 2000]``.

    :param args: parsed command line options. Attributes read: ``bam``,
        ``bamIndex``, ``scaleFactor``, ``ignoreForNormalization``,
        ``normalizeTo1x``, ``normalizeUsingRPKM``, ``extendReads``,
        ``binSize``, ``numberOfProcessors`` and ``verbose``.
    :return: the resulting scale factor.
    :raises SystemExit: if ``args.extendReads`` is ``True`` but no fragment
        length could be estimated, or if a numeric ``args.extendReads`` is
        smaller than 1 or larger than 2000.

    NOTE(review): ``debug``, ``bamHandler`` and ``parserCommon`` are expected
    to be defined at module level; they are not created in this function.
    """
    # sys.exit() instead of exit(): the latter is injected by the ``site``
    # module and is meant for interactive sessions only.
    import sys

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam, args.bamIndex)
    bam_mapped = parserCommon.bam_total_reads(bam_handle,
                                              args.ignoreForNormalization)

    if args.normalizeTo1x:
        # estimate read length and, for paired-end data, fragment length
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            args.bamIndex,
            return_lengths=False,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)

        if not args.extendReads:
            # no extension requested: use the read length as fragment length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(fragment_length))
        elif args.extendReads is True:
            # extension requested without a length: rely on the estimate,
            # which is only available for paired-end libraries
            if not frag_len_dict:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            fragment_length = frag_len_dict['median']
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {}".format(fragment_length))
        elif args.extendReads < 1:
            sys.exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            fragment_length = args.extendReads

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            # report the computed factor, not the untouched user input
            print("Scaling factor {}".format(scale_factor))

    elif args.normalizeUsingRPKM:
        # the RPKM is the # reads per tile /
        #    ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000
        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)
        if debug:
            # report the computed factor, not the untouched user input
            print("scale factor using RPKM is {0}".format(scale_factor))

    return scale_factor
def get_scale_factors(args):
    """Return the pair of scale factors for ``args.bamfile1`` and ``args.bamfile2``.

    The factors come from one of three sources:

    * ``args.scaleFactors`` (a ``"f1:f2"`` string), parsed into a list of
      floats, when given;
    * ``estimateScaleFactor`` (its ``'size_factors'`` entry) when
      ``args.scaleFactorsMethod == 'SES'``;
    * ``min(mapped1, mapped2) / [mapped1, mapped2]`` when
      ``args.scaleFactorsMethod == 'readCount'``.

    If ``args.ratio == 'subtract'`` and ``args.scaleFactors`` is ``None``,
    both factors are additionally multiplied by a coverage normalization
    term. That term is based on the sample whose factor equals 1 (the first
    sample if its factor is 1, the second one otherwise):

    * ``args.normalizeTo1x`` set: ``1 / current_coverage`` with
      ``current_coverage = mapped_reads * fragment_length / args.normalizeTo1x``;
    * otherwise: ``1 / (mapped reads in millions * args.binSize in kb)``
      (``args.normalizeUsingRPKM`` is not consulted here).

    :param args: parsed command line options. Attributes read: ``bamfile1``,
        ``bamIndex1``, ``bamfile2``, ``bamIndex2``,
        ``ignoreForNormalization``, ``scaleFactors``, ``scaleFactorsMethod``,
        ``sampleLength``, ``numberOfSamples``, ``numberOfProcessors``,
        ``verbose``, ``ratio``, ``normalizeTo1x``, ``extendReads`` and
        ``binSize``.
    :return: the two scale factors; a list when taken verbatim from
        ``args.scaleFactors``, a numpy array when computed from read counts
        or after normalization.
    :raises SystemExit: if no scale factors were given and
        ``args.scaleFactorsMethod`` is neither ``'SES'`` nor ``'readCount'``;
        if ``args.extendReads`` is ``True`` but no fragment length could be
        estimated; or if a numeric ``args.extendReads`` is outside
        ``[1, 2000]``.
    """
    # sys.exit() instead of exit(): the latter is injected by the ``site``
    # module and is meant for interactive sessions only.
    import sys

    bam1 = bamHandler.openBam(args.bamfile1, args.bamIndex1)
    bam2 = bamHandler.openBam(args.bamfile2, args.bamIndex2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        # a real list (not a lazy ``map`` object) because it is indexed below
        scale_factors = [float(value) for value in args.scaleFactors.split(":")]
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [bam1.filename, bam2.filename],
            args.sampleLength, args.numberOfSamples,
            1,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("size factor if the number of mapped "
                  "reads would have been used:")
            # NOTE: uses the ``mapped`` attribute of the handles, i.e. not
            # the counts obtained with ``ignoreForNormalization`` above
            print(tuple(
                float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))
    elif args.scaleFactorsMethod == 'readCount':
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(
            [bam1_mapped, bam2_mapped])
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))
    else:
        # without this branch ``scale_factors`` would be unbound below
        sys.exit("*ERROR*: unknown scale factors method: {} ".format(args.scaleFactorsMethod))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
            bamindex = args.bamIndex1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2
            bamindex = args.bamIndex2

        # user supplied factors are returned as given, without normalization
        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # estimate read length and, for paired-end data, fragment length
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    bamfile,
                    bamindex,
                    return_lengths=False,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

                if not args.extendReads:
                    # no extension requested: use the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(fragment_length))
                elif args.extendReads is True:
                    # extension requested without a length: rely on the
                    # estimate, only available for paired-end libraries
                    if not frag_len_dict:
                        sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                    fragment_length = frag_len_dict['median']
                    if args.verbose:
                        print("Fragment length based on paired en data "
                              "estimated to be {}".format(fragment_length))
                elif args.extendReads < 1:
                    sys.exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                elif args.extendReads > 2000:
                    sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                else:
                    fragment_length = args.extendReads

                current_coverage = float(
                    mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(
                        current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    # single expression so the value is always part of the message
                    print("scale factor for "
                          "RPKM is {0}".format(coverage_scale_factor))

    return scale_factors