def get_scale_factors(args, statsList, mappedList):
    """Return the scale factors to apply to the two BAM files named in ``args``.

    The source of the factors is chosen in this order:

    1. ``args.scaleFactors`` -- a colon separated string (e.g. ``"0.5:1"``)
       whose fields are converted to floats.
    2. ``args.scaleFactorsMethod == 'SES'`` -- the ``'size_factors'`` entry of
       the dictionary returned by ``estimateScaleFactor``.
    3. ``args.scaleFactorsMethod == 'readCount'`` -- the smaller of the two
       kept-read counts divided by each file's kept-read count.
    4. ``args.scaleFactorsMethod == 'None'`` -- no scaling (``None``).

    Side effects: in the ``'readCount'`` case ``args.scaleFactor`` is set to
    1.0 and ``args.bam`` is left pointing at ``args.bamfile2``.

    :param args: namespace with the command line options read above.
    :param statsList: two-element sequence; element ``i`` is handed to
        ``get_num_kept_reads`` together with BAM file ``i``.
    :param mappedList: per-file mapped read counts; forwarded to
        ``estimateScaleFactor`` as ``mappingStatsList`` and used for the
        informational message printed in verbose mode.
    :return: list of floats, the SES size factors, a numpy array, or ``None``.
    :raises ValueError: if no explicit factors are given and
        ``args.scaleFactorsMethod`` is not one of the supported values
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if args.scaleFactors:
        # Explicit factors always win over the estimation method.
        scale_factors = [float(value) for value in args.scaleFactors.split(":")]

    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength,
            args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))
            # For comparison only: what plain read-count scaling would give.
            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0
        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])
        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    else:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(
            "Unknown scaleFactorsMethod {!r}; expected 'SES', 'readCount' "
            "or 'None'.".format(args.scaleFactorsMethod))

    return scale_factors
def get_scale_factors(args, statsList, mappedList):
    """Compute the pair of scale factors for ``args.bamfile1``/``args.bamfile2``.

    Precedence of the factor sources:

    * ``args.scaleFactors`` (colon separated string) -- parsed into floats.
    * ``args.scaleFactorsMethod == 'SES'`` -- ``'size_factors'`` from the
      dictionary returned by ``estimateScaleFactor``.
    * ``args.scaleFactorsMethod == 'readCount'`` -- minimum kept-read count
      divided by each file's kept-read count (numpy array).
    * ``args.scaleFactorsMethod == 'None'`` -- ``None`` (no scaling).

    The ``'readCount'`` path mutates ``args``: ``args.scaleFactor`` becomes 1.0
    and ``args.bam`` ends up equal to ``args.bamfile2``.

    :param args: options namespace.
    :param statsList: indexable with 0 and 1; each entry is passed to
        ``get_num_kept_reads`` for the corresponding BAM file.
    :param mappedList: mapped read counts per file, passed on as
        ``mappingStatsList`` and used in the verbose comparison message.
    :return: the scale factors, or ``None`` when scaling is disabled.
    :raises ValueError: for an unsupported ``args.scaleFactorsMethod`` when no
        explicit factors were given (this used to fail with an
        ``UnboundLocalError``).
    """
    if args.scaleFactors:
        # User-supplied factors take priority over any estimation.
        scale_factors = list(map(float, args.scaleFactors.split(":")))

    elif args.scaleFactorsMethod == 'SES':
        bam_files = [args.bamfile1, args.bamfile2]
        scalefactors_dict = estimateScaleFactor(
            bam_files,
            args.sampleLength,
            args.numberOfSamples,
            1,
            mappingStatsList=mappedList,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))
            # Shown for reference: the factors plain read counts would yield.
            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(float(min(mappedList)) / np.array(mappedList)))

    elif args.scaleFactorsMethod == 'readCount':
        # change the scaleFactor to 1.0
        args.scaleFactor = 1.0

        # get num of kept reads for bam file 1
        args.bam = args.bamfile1
        bam1_mapped, _ = get_num_kept_reads(args, statsList[0])

        # get num of kept reads for bam file 2
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args, statsList[1])

        mapped_reads = [bam1_mapped, bam2_mapped]

        # new scale_factors (relative to min of two bams)
        scale_factors = float(min(mapped_reads)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    elif args.scaleFactorsMethod == 'None':
        scale_factors = None

    else:
        # Without this branch the function would die at ``return`` with an
        # UnboundLocalError that does not name the offending option.
        raise ValueError(
            "Unknown scaleFactorsMethod {!r}; expected 'SES', 'readCount' "
            "or 'None'.".format(args.scaleFactorsMethod))

    return scale_factors
def get_scale_factors(args):
    """Return the scale factors for ``args.bamfile1`` and ``args.bamfile2``.

    Both BAM files are opened and their total read counts obtained through
    ``parserCommon.bam_total_reads`` (honouring
    ``args.ignoreForNormalization``).  The factors then come from:

    * ``args.scaleFactors`` -- colon separated string parsed into floats;
    * ``args.scaleFactorsMethod == 'SES'`` -- ``'size_factors'`` from
      ``estimateScaleFactor``;
    * ``args.scaleFactorsMethod == 'readCount'`` -- the smaller read count
      divided by each file's read count.

    When ``args.ratio == 'subtract'`` and no explicit factors were supplied,
    the factors are additionally multiplied by a coverage normalisation
    computed for the sample whose factor equals 1 (the second sample
    otherwise): 1x normalisation if ``args.normalizeTo1x`` is set, RPKM
    (based on ``args.binSize``) otherwise.

    :param args: options namespace.
    :return: list of floats, the SES size factors, or a numpy array.
    :raises ValueError: if no explicit factors are given and the method is
        neither ``'SES'`` nor ``'readCount'`` (previously an
        ``UnboundLocalError``).
    :raises SystemExit: through ``exit()`` when the read extension is invalid
        or cannot be estimated.
    """
    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    try:
        bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
        bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

        if args.scaleFactors:
            # list() keeps the result indexable on Python 3 as well.
            scale_factors = list(map(float, args.scaleFactors.split(":")))

        elif args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength,
                args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print("Size factors using SES: {}".format(scale_factors))
                print("%s regions of size %s where used " %
                      (scalefactors_dict['sites_sampled'], args.sampleLength))
                # For comparison: factors from the handles' raw mapped counts.
                print("size factor if the number of mapped "
                      "reads would have been used:")
                print(tuple(
                    float(min(bam1.mapped, bam2.mapped)) /
                    np.array([bam1.mapped, bam2.mapped])))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / \
                np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print("Size factors using total number "
                      "of mapped reads: {}".format(scale_factors))

        else:
            raise ValueError(
                "Unknown scaleFactorsMethod {!r}; expected 'SES' or "
                "'readCount'.".format(args.scaleFactorsMethod))
    finally:
        # The handles are only needed for the counts and file names above.
        bam1.close()
        bam2.close()

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    bamfile,
                    return_lengths=False,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print("scale factor for "
                          "RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
def correctReadCounts(bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
                      outFileName, outFileFormat, outFileNameCorr=None, region=None,
                      extendPairedEnds=True, numberOfProcessors=1, Nsigmas=2,
                      maxSignalRatio=10, blackListFileName=None, verbose=False):
    """Write a treatment-versus-control track for two BAM files.

    ``bamFilesList[0]`` is treated as the treatment and ``bamFilesList[1]`` as
    the control.  The SES estimates returned by ``estimateScaleFactor``
    (``'meanSES'`` and ``'size_factors'``) are turned into the ratios passed,
    as ``funcArgs``, to ``writeBedGraph.writeBedGraph``.  The first output is
    computed with ``computePvalue``; if ``outFileNameCorr`` is given, a second
    output is computed with ``computeCorrectedReadcounts``.

    A short summary of the estimated means and ratios is always printed.

    :param bamFilesList: ``[treatment_bam, control_bam]``.
    :param binLength: bin size; used for SES sampling and as ``tileSize``.
    :param numberOfSamples: forwarded to ``estimateScaleFactor``.
    :param defaultFragmentLength: forwarded to ``estimateScaleFactor`` and
        ``writeBedGraph.writeBedGraph``.
    :param outFileName: destination of the ``computePvalue`` track.
    :param outFileFormat: forwarded as ``format``.
    :param outFileNameCorr: optional destination of the corrected-count track.
    :param region: forwarded to ``writeBedGraph.writeBedGraph``.
    :param extendPairedEnds: forwarded to ``writeBedGraph.writeBedGraph``.
    :param numberOfProcessors: forwarded to both helpers.
    :param Nsigmas: not used by the current implementation; kept so existing
        callers keep working.
    :param maxSignalRatio: not used by the current implementation; kept so
        existing callers keep working.
    :param blackListFileName: forwarded to both helpers.
    :param verbose: forwarded to ``estimateScaleFactor``.
    :raises ZeroDivisionError: if the control BAM reports zero mapped reads.
    """
    # The handles are only needed for their mapped read counts, so release
    # them right away instead of leaking them for the rest of the run.
    bam1 = writeBedGraph.openBam(bamFilesList[0])
    treatmentMapped = bam1.mapped
    bam1.close()

    bam2 = writeBedGraph.openBam(bamFilesList[1])
    controlMapped = bam2.mapped
    bam2.close()

    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table containing number of reads in a sample from the genome.
    #    Only regions for which both samples have more than zero counts are considered
    scaleFactorsDict = estimateScaleFactor(
        bamFilesList,
        binLength,
        numberOfSamples,
        defaultFragmentLength,
        1,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose)

    treatmentMean, controlMean = scaleFactorsDict['meanSES']
    size_factors = scaleFactorsDict['size_factors']
    treatmentControlRatio = size_factors[1] / size_factors[0]

    # The signal ratio is taken to be the SES ratio itself.
    treatmentSignalRatio = treatmentControlRatio
    controlRatio = controlSignalRatio = 1.0 / treatmentControlRatio

    treatmentTotal = treatmentMapped
    controlTotal = controlMapped

    print("Treatment mean: {:.2f}, Treatment total:{:.2f}".format(
        treatmentMean, treatmentTotal))
    print("Control mean: {:.2f}, Control total:{}".format(
        controlMean, controlTotal))
    print("the ratio of treatment vs. control for enriched regions is: {:.2f}".format(
        treatmentSignalRatio))
    print("the ratio of treatment vs. control ratio: {:.2f} (if based on mapped reads: {:.2f})".format(
        treatmentControlRatio, treatmentControlRatioMapped))

    funcArgs = {
        'controlMean': controlMean,
        'treatmentMean': treatmentMean,
        'controlSignalRatio': controlSignalRatio,
        'controlRatio': controlRatio,
        'treatmentControlRatio': treatmentControlRatio,
    }

    # Options shared by both output tracks.
    write_options = dict(
        tileSize=binLength,
        region=region,
        format=outFileFormat,
        zerosToNans=False,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors,
        extendPairedEnds=extendPairedEnds)

    writeBedGraph.writeBedGraph(bamFilesList, outFileName, defaultFragmentLength,
                                computePvalue, funcArgs, **write_options)

    if outFileNameCorr:
        writeBedGraph.writeBedGraph(bamFilesList, outFileNameCorr, defaultFragmentLength,
                                    computeCorrectedReadcounts, funcArgs, **write_options)
def correctReadCounts(bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
                      outFileName, outFileFormat, outFileNameCorr=None, region=None,
                      extendPairedEnds=True, numberOfProcessors=1, Nsigmas=2,
                      maxSignalRatio=10, verbose=False):
    """Write a treatment-versus-control track for two BAM files.

    ``bamFilesList[0]`` is treated as the treatment and ``bamFilesList[1]`` as
    the control.  The SES estimates returned by ``estimateScaleFactor``
    (``'meanSES'`` and ``'size_factors'``) are turned into the ratios passed,
    as ``funcArgs``, to ``writeBedGraph.writeBedGraph``.  The first output is
    computed with ``computePvalue``; if ``outFileNameCorr`` is given, a second
    output is computed with ``computeCorrectedReadcounts``.

    A short summary of the estimated means and ratios is always printed.

    :param bamFilesList: ``[treatment_bam, control_bam]``.
    :param binLength: bin size; used for SES sampling and as ``tileSize``.
    :param numberOfSamples: forwarded to ``estimateScaleFactor``.
    :param defaultFragmentLength: forwarded to ``estimateScaleFactor`` and
        ``writeBedGraph.writeBedGraph``.
    :param outFileName: destination of the ``computePvalue`` track.
    :param outFileFormat: forwarded as ``format``.
    :param outFileNameCorr: optional destination of the corrected-count track.
    :param region: forwarded to ``writeBedGraph.writeBedGraph``.
    :param extendPairedEnds: forwarded to ``writeBedGraph.writeBedGraph``.
    :param numberOfProcessors: forwarded to both helpers.
    :param Nsigmas: not used by the current implementation; kept so existing
        callers keep working.
    :param maxSignalRatio: not used by the current implementation; kept so
        existing callers keep working.
    :param verbose: forwarded to ``estimateScaleFactor``.
    :raises ZeroDivisionError: if the control BAM reports zero mapped reads.
    """
    # The handles are only needed for their mapped read counts, so release
    # them right away instead of leaking them for the rest of the run.
    bam1 = writeBedGraph.openBam(bamFilesList[0])
    treatmentMapped = bam1.mapped
    bam1.close()

    bam2 = writeBedGraph.openBam(bamFilesList[1])
    controlMapped = bam2.mapped
    bam2.close()

    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table containing number of reads in a sample from the genome.
    #    Only regions for which both samples have more than zero counts are considered
    scaleFactorsDict = estimateScaleFactor(
        bamFilesList,
        binLength,
        numberOfSamples,
        defaultFragmentLength,
        1,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose)

    treatmentMean, controlMean = scaleFactorsDict['meanSES']
    size_factors = scaleFactorsDict['size_factors']
    treatmentControlRatio = size_factors[1] / size_factors[0]

    # The signal ratio is taken to be the SES ratio itself.
    treatmentSignalRatio = treatmentControlRatio
    controlRatio = controlSignalRatio = 1.0 / treatmentControlRatio

    treatmentTotal = treatmentMapped
    controlTotal = controlMapped

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("Treatment mean: {:.2f}, Treatment total:{:.2f}".format(
        treatmentMean, treatmentTotal))
    print("Control mean: {:.2f}, Control total:{}".format(
        controlMean, controlTotal))
    print("the ratio of treatment vs. control for enriched regions is: {:.2f}".format(
        treatmentSignalRatio))
    print("the ratio of treatment vs. control ratio: {:.2f} (if based on mapped reads: {:.2f})".format(
        treatmentControlRatio, treatmentControlRatioMapped))

    funcArgs = {
        'controlMean': controlMean,
        'treatmentMean': treatmentMean,
        'controlSignalRatio': controlSignalRatio,
        'controlRatio': controlRatio,
        'treatmentControlRatio': treatmentControlRatio,
    }

    # Options shared by both output tracks.
    write_options = dict(
        tileSize=binLength,
        region=region,
        format=outFileFormat,
        zerosToNans=False,
        numberOfProcessors=numberOfProcessors,
        extendPairedEnds=extendPairedEnds)

    writeBedGraph.writeBedGraph(bamFilesList, outFileName, defaultFragmentLength,
                                computePvalue, funcArgs, **write_options)

    if outFileNameCorr:
        writeBedGraph.writeBedGraph(bamFilesList, outFileNameCorr, defaultFragmentLength,
                                    computeCorrectedReadcounts, funcArgs, **write_options)
def get_scale_factors(args):
    """Return the scale factors for ``args.bamfile1`` and ``args.bamfile2``.

    Both BAM files are opened (with ``args.bamIndex1``/``args.bamIndex2``) and
    their total read counts obtained through ``parserCommon.bam_total_reads``
    (honouring ``args.ignoreForNormalization``).  The factors then come from:

    * ``args.scaleFactors`` -- colon separated string parsed into floats;
    * ``args.scaleFactorsMethod == 'SES'`` -- ``'size_factors'`` from
      ``estimateScaleFactor``;
    * ``args.scaleFactorsMethod == 'readCount'`` -- the smaller read count
      divided by each file's read count.

    When ``args.ratio == 'subtract'`` and no explicit factors were supplied,
    the factors are additionally multiplied by a coverage normalisation
    computed for the sample whose factor equals 1 (the second sample
    otherwise): 1x normalisation if ``args.normalizeTo1x`` is set, RPKM
    (based on ``args.binSize``) otherwise.

    :param args: options namespace.
    :return: list of floats, the SES size factors, or a numpy array.
    :raises ValueError: if no explicit factors are given and the method is
        neither ``'SES'`` nor ``'readCount'`` (previously an
        ``UnboundLocalError``).
    :raises SystemExit: through ``exit()`` when the read extension is invalid
        or cannot be estimated.
    """
    bam1 = bamHandler.openBam(args.bamfile1, args.bamIndex1)
    bam2 = bamHandler.openBam(args.bamfile2, args.bamIndex2)

    try:
        bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
        bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

        if args.scaleFactors:
            # list() keeps the result indexable on Python 3 as well.
            scale_factors = list(map(float, args.scaleFactors.split(":")))

        elif args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength,
                args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print("Size factors using SES: {}".format(scale_factors))
                print("%s regions of size %s where used " %
                      (scalefactors_dict['sites_sampled'], args.sampleLength))
                # For comparison: factors from the handles' raw mapped counts.
                print("size factor if the number of mapped "
                      "reads would have been used:")
                print(tuple(
                    float(min(bam1.mapped, bam2.mapped)) /
                    np.array([bam1.mapped, bam2.mapped])))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / \
                np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print("Size factors using total number "
                      "of mapped reads: {}".format(scale_factors))

        else:
            raise ValueError(
                "Unknown scaleFactorsMethod {!r}; expected 'SES' or "
                "'readCount'.".format(args.scaleFactorsMethod))
    finally:
        # The handles are only needed for the counts and file names above.
        bam1.close()
        bam2.close()

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
            bamindex = args.bamIndex1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2
            bamindex = args.bamIndex2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    bamfile,
                    bamindex,
                    return_lengths=False,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print("scale factor for "
                          "RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
def get_scale_factors(args):
    """Return the scale factors for ``args.bamfile1`` and ``args.bamfile2``.

    The factors come from, in order of precedence:

    * ``args.scaleFactors`` -- colon separated string parsed into floats;
    * ``args.scaleFactorsMethod == 'SES'`` -- ``'size_factors'`` from
      ``estimateScaleFactor``;
    * ``args.scaleFactorsMethod == 'readCount'`` -- the smaller kept-read
      count divided by each file's kept-read count.

    When ``args.ratio == 'subtract'``, ``args.normalizeTo1x`` and
    ``args.normalizeUsingRPKM`` are switched off while reads are counted and
    restored afterwards.  If no explicit factors were given, the factors are
    then multiplied by a coverage normalisation computed for the sample whose
    factor equals 1 (the second sample otherwise): 1x normalisation if
    ``args.normalizeTo1x`` is set, RPKM (based on ``args.binSize``) otherwise.

    Side effects: ``args.bam`` and ``args.scaleFactor`` may be overwritten.

    :param args: options namespace.
    :return: list of floats, the SES size factors, or a numpy array.
    :raises ValueError: if no explicit factors are given and the method is
        neither ``'SES'`` nor ``'readCount'`` (previously an
        ``UnboundLocalError``).  Raised before ``args`` is modified.
    :raises SystemExit: through ``exit()`` when the read extension is invalid
        or cannot be estimated.
    """
    if not args.scaleFactors and args.scaleFactorsMethod not in ('SES', 'readCount'):
        raise ValueError(
            "Unknown scaleFactorsMethod {!r}; expected 'SES' or "
            "'readCount'.".format(args.scaleFactorsMethod))

    subtract = args.ratio == 'subtract'
    if subtract:
        # We need raw counts in this case
        normalizeTo1x = args.normalizeTo1x
        normalizeUsingRPKM = args.normalizeUsingRPKM
        args.normalizeTo1x = False
        args.normalizeUsingRPKM = False

    # This is only used if we subtract
    mapped_reads = [None, None]

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))

    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength,
            args.numberOfSamples,
            1,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            # The files are opened only to report their raw mapped counts;
            # make sure the handles are released even if reading fails.
            bam1 = bamHandler.openBam(args.bamfile1)
            bam2 = bamHandler.openBam(args.bamfile2)
            try:
                raw_mapped = [bam1.mapped, bam2.mapped]
            finally:
                bam1.close()
                bam2.close()

            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))
            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(float(min(raw_mapped)) / np.array(raw_mapped)))

    else:  # 'readCount' (validated above)
        args.bam = args.bamfile1
        args.scaleFactor = 1.0
        bam1_mapped, _ = get_num_kept_reads(args)
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args)

        mapped_reads = [bam1_mapped, bam2_mapped]
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if subtract:
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor due to RPKM or normalize1x
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if args.scaleFactors is None:
            # check which of the two samples is not scaled down
            if scale_factors[0] == 1:
                args.bam = args.bamfile1
                mapped_reads = mapped_reads[0]
            else:
                args.bam = args.bamfile2
                mapped_reads = mapped_reads[1]
            if mapped_reads is None:
                # Not counted yet (SES path): count while normalisation is off.
                mapped_reads, _ = get_num_kept_reads(args)

        # Replace the arguments
        args.normalizeTo1x = normalizeTo1x
        args.normalizeUsingRPKM = normalizeUsingRPKM

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    args.bam,
                    return_lengths=False,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mapped_reads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mapped_reads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print("Scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
def get_scale_factors(args):
    """Compute the scale factors for ``args.bamfile1`` and ``args.bamfile2``.

    Precedence of the factor sources:

    * ``args.scaleFactors`` -- colon separated string parsed into floats;
    * ``args.scaleFactorsMethod == 'SES'`` -- ``'size_factors'`` from
      ``estimateScaleFactor``;
    * ``args.scaleFactorsMethod == 'readCount'`` -- the smaller kept-read
      count divided by each file's kept-read count.

    For ``args.ratio == 'subtract'`` the flags ``args.normalizeTo1x`` and
    ``args.normalizeUsingRPKM`` are disabled while reads are counted and put
    back afterwards.  Unless explicit factors were given, the result is then
    multiplied by a coverage normalisation derived from the sample whose
    factor equals 1 (the second sample otherwise): 1x normalisation when
    ``args.normalizeTo1x`` is set, RPKM (using ``args.binSize``) otherwise.

    Side effects: ``args.bam`` and ``args.scaleFactor`` may be overwritten.

    :param args: options namespace.
    :return: list of floats, the SES size factors, or a numpy array.
    :raises ValueError: if no explicit factors are given and the method is
        neither ``'SES'`` nor ``'readCount'`` (this used to surface as an
        ``UnboundLocalError``).  Raised before ``args`` is modified.
    :raises SystemExit: through ``exit()`` when the read extension is invalid
        or cannot be estimated.
    """
    if not args.scaleFactors and args.scaleFactorsMethod not in ('SES', 'readCount'):
        raise ValueError(
            "Unknown scaleFactorsMethod {!r}; expected 'SES' or "
            "'readCount'.".format(args.scaleFactorsMethod))

    is_subtract = args.ratio == 'subtract'
    if is_subtract:
        # We need raw counts in this case
        normalizeTo1x = args.normalizeTo1x
        normalizeUsingRPKM = args.normalizeUsingRPKM
        args.normalizeTo1x = False
        args.normalizeUsingRPKM = False

    # This is only used if we subtract
    mapped_reads = [None, None]

    if args.scaleFactors:
        scale_factors = [float(value) for value in args.scaleFactors.split(":")]

    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength,
            args.numberOfSamples,
            1,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            # Only the raw mapped counts are needed from the files; release
            # the handles even if reading the counts fails.
            bam1 = bamHandler.openBam(args.bamfile1)
            bam2 = bamHandler.openBam(args.bamfile2)
            try:
                raw_mapped = [bam1.mapped, bam2.mapped]
            finally:
                bam1.close()
                bam2.close()

            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))
            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(float(min(raw_mapped)) / np.array(raw_mapped)))

    else:  # 'readCount' (validated above)
        args.bam = args.bamfile1
        args.scaleFactor = 1.0
        bam1_mapped, _ = get_num_kept_reads(args)
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args)

        mapped_reads = [bam1_mapped, bam2_mapped]
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads)
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if is_subtract:
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor due to RPKM or normalize1x
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if args.scaleFactors is None:
            # check which of the two samples is not scaled down
            if scale_factors[0] == 1:
                args.bam = args.bamfile1
                mapped_reads = mapped_reads[0]
            else:
                args.bam = args.bamfile2
                mapped_reads = mapped_reads[1]
            if mapped_reads is None:
                # Not counted yet (SES path): count while normalisation is off.
                mapped_reads, _ = get_num_kept_reads(args)

        # Replace the arguments
        args.normalizeTo1x = normalizeTo1x
        args.normalizeUsingRPKM = normalizeUsingRPKM

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    args.bam,
                    return_lengths=False,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))
                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads
                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mapped_reads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mapped_reads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print("Scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors