Example #1
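The snippets on this page are extracted from a larger module and assume module-level imports that are not shown here: numpy as np, plus deepTools-style helpers such as bamHandler, getNumReadsPerBin and writeBedGraph. compareSignal additionally expects computePvalue and computeLambda to be defined in the same module.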
def compareSignal(bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
                  outFileName, outFileFormat, outFileNameLambda=None, region=None,
                  extendPairedEnds=True, numberOfProcessors=1, Nsigmas=2,
                  maxSignalRatio=10, verbose=False):
    
    bam1 = bamHandler.openBam(bamFilesList[0])
    genomeSize = sum(bam1.lengths)

    bam2 = bamHandler.openBam(bamFilesList[1])

    treatmentMapped = bam1.mapped
    controlMapped = bam2.mapped
    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table with the number of reads per sampled region of the genome.
    #    Only regions for which both samples have non-zero counts are considered.

    num_reads_per_region = getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                                             defaultFragmentLength, numberOfProcessors,
                                             skipZeros=True, verbose=verbose)

    if verbose:
        print("number of non-zero regions sampled: {}".format(num_reads_per_region.shape[0]))
    
    # 2. Get the mean and std of the treatment (column 1) and control (column 2)

    treatmentMean, controlMean = np.mean(num_reads_per_region, axis=0)  # axis=0 means per column
    treatmentStd, controlStd = np.std(num_reads_per_region, axis=0)
    treatmentTotal, controlTotal = np.sum(num_reads_per_region, axis=0)

    # 3. Calculate the residual in the treatment and control data at regions
    #    where the treatment signal exceeds mean + std * Nsigmas.
    #    (These are expected to be regions where the signal is above the mean,
    #    so the residual signal is positive.)

    overRows = np.where(num_reads_per_region[:, 0] >= treatmentMean + treatmentStd * Nsigmas)[0]
    over_Nsigma_regions = num_reads_per_region[overRows, :]
    
    treatmentSigMean, controlSigMean = np.mean(over_Nsigma_regions, axis=0)

    treatmentExtraSignal = treatmentSigMean - treatmentMean
    controlExtraSignal   = controlSigMean - controlMean

    treatmentControlRatio = float(treatmentTotal) / controlTotal
    adjSignalRatio = maxSignalRatio * treatmentControlRatio
    treatmentSignalRatio = float(treatmentExtraSignal) / controlExtraSignal

    # cap the estimated ratio at maxSignalRatio times the overall treatment/control ratio
    if treatmentSignalRatio > adjSignalRatio:
        treatmentSignalRatio = adjSignalRatio

    if treatmentSignalRatio < 1:
        raise NameError("estimated signal in control file {} is greater than the "
                        "estimated signal in treatment file {}. Perhaps the file "
                        "names are swapped?".format(bamFilesList[1], bamFilesList[0]))
    else:
        controlSignalRatio = 1.0 / treatmentSignalRatio

    controlRatio = 1.0 / treatmentControlRatio

    print("Treatment mean: {:.2f}, treatment total: {:.2f}".format(treatmentMean, treatmentTotal))
    print("Control mean: {:.2f}, control total: {:.2f}".format(controlMean, controlTotal))
    print("ratio of treatment vs. control for enriched regions: {:.2f}".format(treatmentSignalRatio))
    print("overall treatment vs. control ratio: {:.2f} "
          "(based on mapped reads: {:.2f})".format(treatmentControlRatio,
                                                   treatmentControlRatioMapped))
    funcArgs = {'controlMean': controlMean,
                'treatmentMean': treatmentMean,
                'controlSignalRatio': controlSignalRatio,
                'controlRatio': controlRatio,
                'treatmentControlRatio': treatmentControlRatio
                }


    writeBedGraph.writeBedGraph(bamFilesList,
                                outFileName,
                                defaultFragmentLength, computePvalue,
                                funcArgs, tileSize=binLength, region=region,
                                format=outFileFormat,
                                zerosToNans=False,
                                numberOfProcessors=numberOfProcessors,
                                extendPairedEnds=extendPairedEnds)

    if outFileNameLambda:
        writeBedGraph.writeBedGraph(bamFilesList,
                                    outFileNameLambda,
                                    defaultFragmentLength, computeLambda,
                                    funcArgs, tileSize=binLength, region=region,
                                    format=outFileFormat,
                                    zerosToNans=False,
                                    numberOfProcessors=numberOfProcessors,
                                    extendPairedEnds=extendPairedEnds)
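
A minimal usage sketch for the function above. The file names and parameter values here are hypothetical, and it assumes the module-level imports noted earlier plus computePvalue and computeLambda defined in the same module:

# hypothetical indexed BAM files; treatment first, control second
compareSignal(['treatment.bam', 'control.bam'],
              binLength=50,
              numberOfSamples=100000,
              defaultFragmentLength=200,
              outFileName='signal_compare.bg',
              outFileFormat='bedgraph',
              numberOfProcessors=4,
              verbose=True)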
Example #2
def estimateScaleFactor(bamFilesList,
                        binLength,
                        numberOfSamples,
                        defaultFragmentLength,
                        normalizationLength,
                        avg_method='median',
                        numberOfProcessors=1,
                        verbose=False,
                        chrsToSkip=[]):
    r"""
    Subdivides the genome into chunks to be analyzed in parallel
    using several processors. The code handles the creation of
    workers that compute fragment counts (coverage) for different
    regions and then collect and integrates the results.


    The arguments are:
         'bamFilesList', list of bam files to normalize
         'binLength', the window size in bp, where reads are going to be
                         counted.
         'numberOfSamples', Number of sites to sample.

         'defaultFragmentLength', if the reads are not paired, this value
                      is used extend the reads.
         'normalizationLength', length, in bp, to normalize the data.
                        For a value of 1, are given such that on average
                        1 fragment per base pair is found
         'avg_method', defines how the different values are to be summarized.
                       The options are 'mean' and 'median'

         'chrsToSkip', name of the chromosomes to be excluded from the
                       scale stimation. Usually the chrX is included.

    For example, to test about 1 million regions of length 500 bp,
    the binLength will be 500 and the numberOfSamples is going
    to be the size of the genome divided by the 1 million. This number
    is not exact because regions in which all counts
    are 0 are not taken into  account

    The test data contains reads for 200 bp
    >>> test = Tester()

    >>> dict = estimateScaleFactor([test.bamFile1, test.bamFile2], 50, 4, 0, 1)
    >>> dict['size_factors']
    array([ 1. ,  0.5])
    >>> dict['size_factors_based_on_mean']
    array([ 1. ,  0.5])
    """
    if len(bamFilesList) > 2:
        raise NameError("SES scale factors are only defined for 2 files")

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    mappedReads = [x.mapped for x in bamFilesHandlers]

    sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

    sizeFactorBasedOnMappedReads = \
        sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

    num_reads_per_bin = getNumReadsPerBin(
        bamFilesList,
        binLength,
        numberOfSamples,
        defaultFragmentLength,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose,
        chrsToSkip=chrsToSkip)

    sitesSampled = len(num_reads_per_bin)

    # the transpose is taken to easily iterate by columns, which are now
    # converted to rows
    num_reads_per_bin = num_reads_per_bin.transpose()

    # size factors based on order statistics; see the signal extraction
    # scaling (SES) method in: Diaz et al. (2012) Normalization, bias
    # correction, and peak calling for ChIP-seq. Statistical Applications
    # in Genetics and Molecular Biology, 11(3).

    # using the same names as in the Diaz et al. paper:
    # p refers to ChIP, q to input

    p = np.sort(num_reads_per_bin[0, :]).cumsum()
    q = np.sort(num_reads_per_bin[1, :]).cumsum()

    # p[-1] and q[-1] are the maximum values in the arrays;
    # both p and q are normalized by this value
    diff = np.abs(p / p[-1] - q / q[-1])
    # get the lowest rank for which the difference is the maximum
    maxIndex = np.flatnonzero(diff == diff.max())[0]
    # take a lower rank to move to a region with probably
    # fewer peaks and more background
    maxIndex = int(maxIndex * 0.8)
    while maxIndex < len(p):
        # in rare cases the maxIndex maps to a zero value.
        # In such cases, the next index is used until
        # a non zero value appears.
        cumSum = np.array([float(p[maxIndex]), float(q[maxIndex])])
        if cumSum.min() > 0:
            break
        maxIndex += 1

    meanSES = [
        np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
        np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])
    ]

    # maxIndex may be too close to the signal regions, so a more
    # conservative (lower) rank is used

    sizeFactorsSES = cumSum.min() / cumSum
    median = np.median(num_reads_per_bin, axis=1)

    # consider only read counts below the 90th percentile
    # to estimate the mean and std
    mean = []
    std = []
    for values in num_reads_per_bin:
        maxNumReads = np.percentile(values, 90)
        if maxNumReads == 0:
            maxNumReads = np.percentile(values, 99)
            if maxNumReads == 0:
                print("all genomic regions sampled from one "
                      "of the bam files have no reads.\n")
        # keep only the counts below the percentile cutoff
        values = values[values <= maxNumReads]

        mean.append(np.mean(values))
        std.append(np.std(values))

    mean = np.array(mean)
    readsPerBin = mean if avg_method == 'mean' else median

    sizeFactor = sizeFactorsSES

    return {
        'size_factors': sizeFactor,
        'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
        'size_factors_SES': sizeFactorsSES,
        'size_factors_based_on_mean': mean.min() / mean,
        'size_factors_based_on_median': median.min() / median,
        'mean': mean,
        'meanSES': meanSES,
        'median': median,
        'reads_per_bin': readsPerBin,
        'std': std,
        'sites_sampled': sitesSampled
    }
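
The order-statistics core of the SES method (Diaz et al., 2012) used above can be illustrated on its own. The sketch below is a self-contained re-implementation on synthetic counts, not deepTools code: it sorts the per-bin counts of both samples, finds the rank where the normalized cumulative sums diverge most, backs off to a more conservative rank as the function above does, and derives the scale factors from the cumulative counts at that rank.

import numpy as np

def ses_scale_factors(chip_counts, input_counts):
    # p refers to ChIP, q to input, following the Diaz et al. notation
    p = np.sort(chip_counts).cumsum()
    q = np.sort(input_counts).cumsum()
    # normalize both cumulative sums by their maxima and find the rank
    # of maximum divergence
    diff = np.abs(p / p[-1] - q / q[-1])
    max_index = np.flatnonzero(diff == diff.max())[0]
    # back off to a lower rank, away from likely peak regions
    max_index = int(max_index * 0.8)
    # advance past ranks where either cumulative sum is still zero
    while max_index < len(p):
        cum_sum = np.array([float(p[max_index]), float(q[max_index])])
        if cum_sum.min() > 0:
            break
        max_index += 1
    return cum_sum.min() / cum_sum

# synthetic example: the "ChIP" sample has twice the background coverage
rng = np.random.RandomState(0)
chip = rng.poisson(10, size=1000)
inp = rng.poisson(5, size=1000)
print(ses_scale_factors(chip, inp))  # expect values close to [0.5, 1.]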