Code example #1
def tabulateGCcontent(fragmentLength,
                      chrNameBitToBam,
                      stepSize,
                      chromSizes,
                      numberOfProcessors=None,
                      verbose=False,
                      region=None):
    r"""
    Subdivides the genome or the reads into chunks to be analyzed in parallel
    using several processors. This code handles the creation of
    workers that tabulate the GC content for small regions and then
    collects and integrates the results.
    >>> test = Tester()
    >>> arg = test.testTabulateGCcontent()
    >>> res = tabulateGCcontent(*arg)
    >>> res
    array([[   0.        ,   18.        ,    1.        ],
           [   3.        ,   63.        ,    0.42857143],
           [   7.        ,  159.        ,    0.39622642],
           [  25.        ,  192.        ,    1.171875  ],
           [  28.        ,  215.        ,    1.17209302],
           [  16.        ,  214.        ,    0.6728972 ],
           [  12.        ,   95.        ,    1.13684211],
           [   9.        ,   24.        ,    3.375     ],
           [   3.        ,   11.        ,    2.45454545],
           [   0.        ,    0.        ,    1.        ],
           [   0.        ,    0.        ,    1.        ]])
    """
    global global_vars

    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    chromSizes = [(k, v) for k, v in chromSizes
                  if k in list(chrNameBamToBit.keys())]

    imap_res = mapReduce.mapReduce(
        (stepSize, fragmentLength, chrNameBamToBit, verbose),
        tabulateGCcontent_wrapper,
        chromSizes,
        genomeChunkLength=chunkSize,
        numberOfProcessors=numberOfProcessors,
        region=region)

    for subN_gc, subF_gc in imap_res:
        try:
            F_gc += subF_gc
            N_gc += subN_gc
        except NameError:
            F_gc = subF_gc
            N_gc = subN_gc

    scaling = sum(N_gc) // sum(F_gc)

    R_gc = np.array([
        float(F_gc[x]) / N_gc[x] * scaling if N_gc[x] and F_gc[x] > 0 else 1
        for x in range(len(F_gc))
    ])

    data = np.transpose(np.vstack((F_gc, N_gc, R_gc)))
    return data
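
The ratio column produced above is the per-GC-bin count of observed fragments divided by the count of genomic positions with that GC, multiplied by a global scaling factor. A minimal, self-contained sketch of that arithmetic on the doctest's own counts (NumPy only; the integer division mirrors this example's doctest values):

import numpy as np

# Per-GC-bin counts: F_gc = observed fragment starts, N_gc = genomic positions
F_gc = np.array([0, 3, 7, 25, 28, 16, 12, 9, 3, 0, 0])
N_gc = np.array([18, 63, 159, 192, 215, 214, 95, 24, 11, 0, 0])

# Global scaling so that the ratios are comparable across bins
scaling = N_gc.sum() // F_gc.sum()  # 991 // 103 == 9

# Observed/expected ratio per bin, defaulting to 1 for empty bins
R_gc = np.array([F_gc[x] / N_gc[x] * scaling if N_gc[x] and F_gc[x] > 0 else 1
                 for x in range(len(F_gc))])
print(R_gc)  # [1.  0.42857143  0.39622642 ...]
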
Code example #2
File: getScaleFactor.py Project: hrk2109/deepTools
def fraction_kept(args):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Black list regions are already accounted for. This works by sampling the
    genome (by default, we'll iterate until we sample 1% of the alignments or
    100,000 alignments, whichever is smaller, unless there are fewer than
    100,000 alignments, in which case everything is sampled).

    The sampling works by dividing the genome into bins and only looking at the
    first 50,000 bases of each bin. If this doesn't yield sufficient alignments,
    then the bin size is halved.
    """
    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle,
                                           args.ignoreForNormalization)
    num_needed_to_sample = max(bam_mapped if bam_mapped <= 100000 else 0,
                               min(100000, 0.01 * bam_mapped))
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx])
                       for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    while total < num_needed_to_sample and distanceBetweenBins > 50000:
        # If we've iterated, then halve distanceBetweenBins
        distanceBetweenBins /= 2
        if distanceBetweenBins < 50000:
            distanceBetweenBins = 50000

        res = mapReduce.mapReduce((bam_handle.filename, args),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            filtered, total = np.sum(res, axis=0)

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
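
The sampling loop above starts with widely spaced bins and keeps halving the spacing (down to a 50,000 bp floor) until enough alignments have been seen. A standalone sketch of just that back-off schedule, with the actual counting (which happens in getFractionKept_wrapper) left out:

def back_off_schedule(start=2000000, floor=50000):
    # Yield the successive distanceBetweenBins values the loop would try;
    # the real loop also stops early once num_needed_to_sample is reached.
    d = start
    while d > floor:
        d /= 2
        if d < floor:
            d = floor
        yield int(d)

print(list(back_off_schedule()))
# [1000000, 500000, 250000, 125000, 62500, 50000]
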
Code example #3
def getScorePerBin(bigwigFilesList, binLength,
                   numberOfProcessors=1, skipZeros=True,
                   verbose=False, region=None,
                   bedFile=None,
                   stepSize=None,
                   chrsToSkip=[]):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 5,))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    """

    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.

    # get list of common chromosome names and sizes
    chromSizes = getChromSizes(bigwigFilesList)

    # Skip the chromosomes in the list. This is usually done for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skipped chromosomes may also include heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    if stepSize is None:
        stepSize = binLength  # for consecutive bins
    chunkSize = int(stepSize * 500 / len(bigwigFilesList))
    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)
    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigwigFilesList, stepSize, binLength, skipZeros),
                                    countReadsInRegions_wrapper,
                                    chromSizes,
                                    genomeChunkLength=chunkSize,
                                    bedFile=bedFile,
                                    region=region,
                                    numberOfProcessors=numberOfProcessors)

    score_per_bin = np.concatenate(imap_res, axis=0)
    return score_per_bin
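
The genomeChunkLength heuristic above gives each worker roughly the same amount of I/O: chunks shrink as more files have to be read per position. A worked example of the formula, assuming the doctest's setup of two bigWig files and consecutive 50 bp bins:

binLength = 50
stepSize = binLength                      # consecutive bins
numberOfFiles = 2
chunkSize = int(stepSize * 500 / numberOfFiles)
print(chunkSize)                          # 12500 bp of genome per worker chunk
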
Code example #4
File: computeGCBias.py Project: fw1121/deepTools
def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize,
                      chromSizes, numberOfProcessors=None, verbose=False,
                      region=None):
    r"""
    Subdivides the genome or the reads into chunks to be analyzed in parallel
    using several processors. This code handles the creation of
    workers that tabulate the GC content for small regions and then
    collects and integrates the results.
    >>> test = Tester()
    >>> arg = test.testTabulateGCcontent()
    >>> res = tabulateGCcontent(*arg)
    >>> res
    array([[   0.        ,   18.        ,    1.        ],
           [   3.        ,   63.        ,    0.45815996],
           [   7.        ,  159.        ,    0.42358185],
           [  25.        ,  192.        ,    1.25278115],
           [  28.        ,  215.        ,    1.25301422],
           [  16.        ,  214.        ,    0.71935396],
           [  12.        ,   95.        ,    1.21532959],
           [   9.        ,   24.        ,    3.60800971],
           [   3.        ,   11.        ,    2.62400706],
           [   0.        ,    0.        ,    1.        ],
           [   0.        ,    0.        ,    1.        ]])
    """
    global global_vars

    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    chromSizes = [(k, v) for k, v in chromSizes if k in list(chrNameBamToBit.keys())]

    imap_res = mapReduce.mapReduce((stepSize,
                                    fragmentLength, chrNameBamToBit,
                                    verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    for subN_gc, subF_gc in imap_res:
        try:
            F_gc += subF_gc
            N_gc += subN_gc
        except NameError:
            F_gc = subF_gc
            N_gc = subN_gc

    if sum(F_gc) == 0:
        sys.exit("No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter")
    scaling = float(sum(N_gc)) / float(sum(F_gc))

    R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling
                     if N_gc[x] and F_gc[x] > 0 else 1
                     for x in range(len(F_gc))])

    data = np.transpose(np.vstack((F_gc, N_gc, R_gc)))
    return data
Code example #5
def fraction_kept(args):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Black list regions are already accounted for. This works by sampling the
    genome (by default, we'll iterate until we sample 1% of the alignments or
    100,000 alignments, whichever is smaller, unless there are fewer than
    100,000 alignments, in which case everything is sampled).

    The sampling works by dividing the genome into bins and only looking at the
    first 50,000 bases of each bin. If this doesn't yield sufficient alignments,
    then the bin size is halved.
    """
    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization)
    num_needed_to_sample = max(bam_mapped if bam_mapped <= 100000 else 0, min(100000, 0.01 * bam_mapped))
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    while total < num_needed_to_sample and distanceBetweenBins > 50000:
        # If we've iterated, then halve distanceBetweenBins
        distanceBetweenBins /= 2
        if distanceBetweenBins < 50000:
            distanceBetweenBins = 50000

        res = mapReduce.mapReduce((bam_handle.filename, args),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            filtered, total = np.sum(res, axis=0)

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
Code example #6
def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize,
                      chromSizes, numberOfProcessors=None, verbose=False,
                      region=None):
    r"""
    Subdivides the genome or the reads into chunks to be analyzed in parallel
    using several processors. This code handles the creation of
    workers that tabulate the GC content for small regions and then
    collects and integrates the results.
    >>> test = Tester()
    >>> arg = test.testTabulateGCcontent()
    >>> res = tabulateGCcontent(*arg)
    >>> res
    array([[   0.        ,   18.        ,    1.        ],
           [   3.        ,   63.        ,    0.42857143],
           [   7.        ,  159.        ,    0.39622642],
           [  25.        ,  192.        ,    1.171875  ],
           [  28.        ,  215.        ,    1.17209302],
           [  16.        ,  214.        ,    0.6728972 ],
           [  12.        ,   95.        ,    1.13684211],
           [   9.        ,   24.        ,    3.375     ],
           [   3.        ,   11.        ,    2.45454545],
           [   0.        ,    0.        ,    1.        ],
           [   0.        ,    0.        ,    1.        ]])
    """
    global global_vars

    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))
    chromSizes = [(k, v) for k, v in chromSizes if k in chrNameBamToBit.keys()]

    imap_res = mapReduce.mapReduce((stepSize,
                                    fragmentLength, chrNameBamToBit,
                                    verbose),
                                   tabulateGCcontent_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    for subN_gc, subF_gc in imap_res:
        try:
            F_gc += subF_gc
            N_gc += subN_gc
        except NameError:
            F_gc = subF_gc
            N_gc = subN_gc

    scaling = sum(N_gc) // sum(F_gc)  # integer scaling, matching the doctest above

    R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling
                     if N_gc[x] and F_gc[x] > 0 else 1
                     for x in range(len(F_gc))])

    data = np.transpose(np.vstack((F_gc, N_gc, R_gc)))
    return data
Code example #7
File: bamHandler.py Project: wenmm/deepTools
def getMappingStats(bam, nThreads):
    """
    This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless

    This requires pysam > 0.13.0
    """
    header = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads)

    mapped = sum([x[0] for x in res])
    unmapped = sum([x[1] for x in res])
    stats = {x[0]: [0, 0] for x in header}
    for r in res:
        stats[r[2]][0] += r[0]
        stats[r[2]][1] += r[1]

    # We need to count the number of unmapped reads as well
    unmapped += bam.count("*")

    return mapped, unmapped, stats
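
Each mapReduce worker returns a (mapped, unmapped, chromosome) triple, and the triples are folded into per-chromosome totals. A self-contained sketch of that reduction with fabricated worker results (the tuple layout is inferred from the indexing above):

header = [("chr1", 1000), ("chr2", 500)]
# Hypothetical per-chunk results: (mapped, unmapped, chromosome)
res = [(120, 3, "chr1"), (80, 2, "chr1"), (40, 1, "chr2")]

mapped = sum(x[0] for x in res)              # 240
unmapped = sum(x[1] for x in res)            # 6
stats = {name: [0, 0] for name, _ in header}
for m, u, chrom in res:
    stats[chrom][0] += m
    stats[chrom][1] += u
print(mapped, unmapped, stats)
# 240 6 {'chr1': [200, 5], 'chr2': [40, 1]}
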
Code example #8
File: computeGCBias.py Project: linijoseph/deepTools
def countReadsPerGC(regionSize,
                    chrNameBitToBam,
                    stepSize,
                    chromSizes,
                    numberOfProcessors=None,
                    verbose=False,
                    region=None):
    r"""
    Computes, for each region of size regionSize, the GC content of the
    region and the number of reads that overlap it.
    >>> test = Tester()
    >>> arg = test.testCountReadsPerGC()
    >>> reads_per_gc = countReadsPerGC(*arg)
    >>> reads_per_gc[0:5,:]
    array([[ 132.        ,    0.44      ],
           [ 132.        ,    0.44      ],
           [ 133.        ,    0.44      ],
           [ 134.        ,    0.43666667],
           [ 134.        ,    0.44      ]])
    """
    global global_vars

    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))

    imap_res = mapReduce.mapReduce(
        (stepSize, regionSize, chrNameBamToBit, verbose),
        countReadsPerGC_wrapper,
        chromSizes,
        genomeChunkLength=chunkSize,
        numberOfProcessors=numberOfProcessors,
        region=region)

    reads_per_gc = []
    for sub_reads_per_gc in imap_res:
        reads_per_gc += sub_reads_per_gc

    reads_per_gc = np.asarray(reads_per_gc)
    return reads_per_gc
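
The returned array pairs a read count with a GC fraction for every sampled region, which makes downstream summaries straightforward. A hedged sketch of one such summary (plain NumPy, not part of deepTools), using the doctest's first rows:

import numpy as np

reads_per_gc = np.array([[132, 0.44], [132, 0.44], [133, 0.44],
                         [134, 0.43666667], [134, 0.44]])

# Mean read count per 5% GC bin
bins = np.arange(0.0, 1.05, 0.05)
idx = np.digitize(reads_per_gc[:, 1], bins)
for b in np.unique(idx):
    counts = reads_per_gc[idx == b, 0]
    print("GC {:.2f}-{:.2f}: mean reads {:.1f}".format(bins[b - 1], bins[b], counts.mean()))
# GC 0.40-0.45: mean reads 133.0
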
Code example #9
def countReadsPerGC(regionSize, chrNameBitToBam, stepSize,
                    chromSizes, numberOfProcessors=None, verbose=False,
                    region=None):
    r"""
    Computes, for each region of size regionSize, the GC content of the
    region and the number of reads that overlap it.
    >>> test = Tester()
    >>> arg = test.testCountReadsPerGC()
    >>> reads_per_gc = countReadsPerGC(*arg)
    >>> reads_per_gc[0:5,:]
    array([[ 132.        ,    0.44      ],
           [ 132.        ,    0.44      ],
           [ 133.        ,    0.44      ],
           [ 134.        ,    0.43666667],
           [ 134.        ,    0.44      ]])
    """
    global global_vars

    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp']))

    imap_res = mapReduce.mapReduce((stepSize,
                                    regionSize, chrNameBamToBit,
                                    verbose),
                                   countReadsPerGC_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   region=region)

    reads_per_gc = []
    for sub_reads_per_gc in imap_res:
        reads_per_gc += sub_reads_per_gc

    reads_per_gc = np.asarray(reads_per_gc)
    return reads_per_gc
Code example #10
    def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
        r"""
        Given a list of BAM files, a function, and the function's arguments,
        this method writes a bedGraph (or bigWig) file
        for a partition of the genome into tiles of a given size
        and a value for each tile that corresponds to the given function
        and that is related to the coverage underlying the tile.

        Parameters
        ----------
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.


        """
        self.__dict__["smoothLength"] = smoothLength
        getStats = len(self.mappedList) < len(self.bamFilesList)
        bam_handles = []
        for x in self.bamFilesList:
            if getStats:
                bam, mapped, unmapped, stats = bamHandler.openBam(x, returnStats=True, nThreads=self.numberOfProcessors)
                self.mappedList.append(mapped)
                self.statsList.append(stats)
            else:
                bam = bamHandler.openBam(x)
            bam_handles.append(bam)

        genome_chunk_length = getGenomeChunkLength(bam_handles, self.binLength, self.mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handles, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in list(self.__dict__.keys()):
            if x in ["mappedList", "statsList"]:
                continue
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # Determine the sorted order of the temp files
        chrom_order = dict()
        for i, _ in enumerate(chrom_names_and_size):
            chrom_order[_[0]] = i
        res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
        res.sort()

        if format == 'bedgraph':
            out_file = open(out_file_name, 'wb')
            for r in res:
                if r[3]:
                    _foo = open(r[3], 'rb')
                    shutil.copyfileobj(_foo, out_file)
                    _foo.close()
                    os.remove(r[3])
            out_file.close()
        else:
            bedGraphToBigWig(chrom_names_and_size, [x[3] for x in res], out_file_name)
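
The temporary bedGraph files come back from the workers in arbitrary order, so each result is keyed by its chromosome's position in the header before concatenation. A standalone sketch of that ordering step, with file contents replaced by placeholder names:

chrom_names_and_size = [("chr1", 1000), ("chr2", 500), ("chrX", 800)]
chrom_order = {name: i for i, (name, _) in enumerate(chrom_names_and_size)}

# Hypothetical worker results: (chrom, start, end, temp_file)
res = [("chrX", 0, 800, "tmpX"), ("chr1", 0, 1000, "tmp1"), ("chr2", 0, 500, "tmp2")]
res = sorted([chrom_order[c], s, e, f] for c, s, e, f in res)
print([r[3] for r in res])  # ['tmp1', 'tmp2', 'tmpX'] -- header order
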
Code example #11
def main(args=None):
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write(
            "\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n"
        )
        sys.exit(1)

    if args.outFile is None:
        of = sys.stdout
    else:
        of = open(args.outFile, "w")

    bhs = [
        bamHandler.openBam(x,
                           returnStats=True,
                           nThreads=args.numberOfProcessors)
        for x in args.bamfiles
    ]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = []
        for bh in bhs:
            blacklisted.append(
                utilities.bam_blacklisted_reads(bh, None,
                                                args.blackListFileName,
                                                args.numberOfProcessors))
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [x + y for x, y in list(zip(mapped, unmappedList))]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for x in bhs:
        x.close()

    # Get the remaining metrics
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    totals = [0] * len(args.bamfiles)
    nFiltered = [0] * len(args.bamfiles)
    MAPQs = [0] * len(args.bamfiles)
    flagIncludes = [0] * len(args.bamfiles)
    flagExcludes = [0] * len(args.bamfiles)
    internalDupes = [0] * len(args.bamfiles)
    externalDupes = [0] * len(args.bamfiles)
    singletons = [0] * len(args.bamfiles)
    rnaStrand = [0] * len(args.bamfiles)
    for x in res:
        for idx, r in enumerate(x):
            totals[idx] += r[0]
            nFiltered[idx] += r[1]
            MAPQs[idx] += r[2]
            flagIncludes[idx] += r[3]
            flagExcludes[idx] += r[4]
            internalDupes[idx] += r[5]
            externalDupes[idx] += r[6]
            singletons[idx] += r[7]
            rnaStrand[idx] += r[8]

    # Print some output
    of.write(
        "Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n"
    )
    for idx, _ in enumerate(args.bamfiles):
        if args.sampleLabels:
            of.write(args.sampleLabels[idx])
        else:
            of.write(args.bamfiles[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx],
                                       blacklisted[idx]))
        # nFiltered
        metric = 0.0
        if totals[idx] > 0:
            metric = blacklisted[idx] + float(nFiltered[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # MAPQ
        metric = 0.0
        if totals[idx] > 0:
            metric = float(MAPQs[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagInclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagIncludes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # samFlagExclude
        metric = 0.0
        if totals[idx] > 0:
            metric = float(flagExcludes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Internally determined duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(internalDupes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Externally marked duplicates
        metric = 0.0
        if totals[idx] > 0:
            metric = float(externalDupes[idx]) / float(
                totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # Singletons
        metric = 0.0
        if totals[idx] > 0:
            metric = float(singletons[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # filterRNAstrand
        metric = 0.0
        if totals[idx] > 0:
            metric = float(rnaStrand[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
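
Each filter count is observed only on the sampled alignments, so it is rescaled from the sampled total to the full mapped count and then capped at the number of mapped reads. A worked numeric example of that estimate:

mapped = 1000000    # mapped reads in the BAM file
totals = 50000      # alignments actually sampled
MAPQs = 2500        # sampled alignments failing --minMappingQuality

metric = float(MAPQs) / float(totals) * mapped
print(min(round(metric, 1), mapped))  # 50000.0 estimated reads below MAPQ
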
Code example #12
    def run(self, allArgs=None):
        # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
        # workers for analysis. If too short, too much time is spent loading the files;
        # if too long, some processors end up idle.
        # The following values are empirical.
        bamFilesHandlers = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except:
                # not a BAM file; assume it is a bigWig file
                y = pyBigWig.open(x)
            bamFilesHandlers.append(y)
        chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose)

        # Skip the chromosomes in the list. This is usually done for the
        # X chromosome, which may have either one copy in a male sample
        # or a mixture of male/female samples and is therefore unreliable.
        # The skipped chromosomes may also include heterochromatic regions and
        # mitochondrial DNA.
        if len(self.chrsToSkip):
            chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromSizes))

        genomeSize = sum(chrLengths)
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

        # the estimate is more reliable when the number of samples is large
        if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

        max_mapped = []
        for x in bamFilesHandlers:
            try:
                max_mapped.append(x.mapped)
            except:
                # bigWig, use a fixed value
                max_mapped.append(0)
        max_mapped = max(max_mapped)

        # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
        if max_mapped == 0:
            chunkSize = 10000 * self.binLength
            self.stepSize = self.binLength
        else:
            reads_per_bp = float(max_mapped) / genomeSize
            chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
        [bam_h.close() for bam_h in bamFilesHandlers]

        # Ensure that chunkSize is always at least self.stepSize
        if chunkSize < self.stepSize:
            chunkSize = self.stepSize

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromSizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors,
                                       transcriptID=transcriptID,
                                       exonID=exonID,
                                       keepExons=keepExons,
                                       transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
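
For BAM input the chunk length is chosen so that the expected number of reads per chunk stays roughly constant: denser files get shorter chunks. A worked example of the formula above, assuming a 3 Gb genome, 300 million mapped reads, one file, and a 1,000 bp step:

genomeSize = 3e9
max_mapped = 3e8
stepSize = 1000
numberOfFiles = 1

reads_per_bp = float(max_mapped) / genomeSize                      # 0.1
chunkSize = int(stepSize * 1e3 / (reads_per_bp * numberOfFiles))   # 10,000,000 bp
chunkSize = max(chunkSize, stepSize)  # never below stepSize
print(chunkSize)
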
Code example #13
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    d : tuple
        tuple of two dictionaries, one for the fragment lengths and the other
        for the read lengths. Each dictionary summarises the mean, median, etc. values
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    # doubled here because the sampling loop below halves it before counting
    distanceBetweenBins *= 2
    fl = []

    # Fix issue #522, allow distanceBetweenBins == 0
    if distanceBetweenBins == 0:
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=binSize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)
        fl = np.concatenate(imap_res)

    # Try to ensure we have at least 1000 regions from which to compute statistics, halving the intra-bin distance as needed
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length),
                                 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))),
                                 'qtile10': np.percentile(fragment_length, 10),
                                 'qtile20': np.percentile(fragment_length, 20),
                                 'qtile30': np.percentile(fragment_length, 30),
                                 'qtile40': np.percentile(fragment_length, 40),
                                 'qtile60': np.percentile(fragment_length, 60),
                                 'qtile70': np.percentile(fragment_length, 70),
                                 'qtile80': np.percentile(fragment_length, 80),
                                 'qtile90': np.percentile(fragment_length, 90),
                                 'qtile99': np.percentile(fragment_length, 99)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length),
                         'mad': np.median(np.abs(read_length - np.median(read_length))),
                         'qtile10': np.percentile(read_length, 10),
                         'qtile20': np.percentile(read_length, 20),
                         'qtile30': np.percentile(read_length, 30),
                         'qtile40': np.percentile(read_length, 40),
                         'qtile60': np.percentile(read_length, 60),
                         'qtile70': np.percentile(read_length, 70),
                         'qtile80': np.percentile(read_length, 80),
                         'qtile90': np.percentile(read_length, 90),
                         'qtile99': np.percentile(read_length, 99)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
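
A hedged usage sketch: given an indexed, coordinate-sorted BAM file (the path here is hypothetical), the two returned dictionaries can be queried for the sampled medians:

frag_stats, read_stats = get_read_and_fragment_length("sample.bam",
                                                      return_lengths=False,
                                                      numberOfProcessors=4)
if read_stats is not None:
    print("median read length:", read_stats["median"])
if frag_stats is not None:  # typically None for single-end data
    print("median fragment length:", frag_stats["median"])
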
Code example #14
File: alignmentSieve.py Project: fidelram/deepTools
def main(args=None):
    args = parseArguments().parse_args(args)
    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            convertBED(args.filteredOutReads, tmpFiles, chromDict, args)

    if args.filterMetrics:
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]

        of = open(args.filterMetrics, "w")
        of.write("#bamFilterReads --filterMetrics\n")
        of.write("#File\tReads Remaining\tTotal Initial Reads\n")
        of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))
        of.close()

    return 0
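
With two --shift values, the complement for the reverse strand is derived automatically: [s1, s2] becomes [s1, s2, -s2, -s1], which for [4, -5] reproduces the fixed --ATACshift preset. A one-line check of that extension logic:

shift = [4, -5]
shift.extend([-shift[1], -shift[0]])
print(shift)  # [4, -5, 5, -4] -- identical to the --ATACshift preset
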
Code example #15
def get_read_and_fragment_length(bamFile,
                                 return_lengths=False,
                                 blackListFileName=None,
                                 binSize=50000,
                                 distanceBetweenBins=1000000,
                                 numberOfProcessors=None,
                                 verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    d : tuple
        tuple of two dictionaries, one for the fragment lengths and the other
        for the read lengths. Each dictionary summarises the mean, median, etc. values
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    # doubled here because the sampling loop below halves it before counting
    distanceBetweenBins *= 2
    fl = []

    # Fix issue #522, allow distanceBetweenBins == 0
    if distanceBetweenBins == 0:
        imap_res = mapReduce.mapReduce(
            (bam_handle.filename, distanceBetweenBins),
            getFragmentLength_wrapper,
            chrom_sizes,
            genomeChunkLength=binSize,
            blackListFileName=blackListFileName,
            numberOfProcessors=numberOfProcessors,
            verbose=verbose)
        fl = np.concatenate(imap_res)

    # Try to ensure we have at least 1000 regions from which to compute statistics, halving the intra-bin distance as needed
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce(
            (bam_handle.filename, distanceBetweenBins),
            getFragmentLength_wrapper,
            chrom_sizes,
            genomeChunkLength=stepsize,
            blackListFileName=blackListFileName,
            numberOfProcessors=numberOfProcessors,
            verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length),
                                 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))),
                                 'qtile10': np.percentile(fragment_length, 10),
                                 'qtile20': np.percentile(fragment_length, 20),
                                 'qtile30': np.percentile(fragment_length, 30),
                                 'qtile40': np.percentile(fragment_length, 40),
                                 'qtile60': np.percentile(fragment_length, 60),
                                 'qtile70': np.percentile(fragment_length, 70),
                                 'qtile80': np.percentile(fragment_length, 80),
                                 'qtile90': np.percentile(fragment_length, 90),
                                 'qtile99': np.percentile(fragment_length, 99)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {
            'sample_size': len(read_length),
            'min': read_length.min(),
            'qtile25': np.percentile(read_length, 25),
            'mean': np.mean(read_length),
            'median': np.median(read_length),
            'qtile75': np.percentile(read_length, 75),
            'max': read_length.max(),
            'std': np.std(read_length),
            'mad': np.median(np.abs(read_length - np.median(read_length))),
            'qtile10': np.percentile(read_length, 10),
            'qtile20': np.percentile(read_length, 20),
            'qtile30': np.percentile(read_length, 30),
            'qtile40': np.percentile(read_length, 40),
            'qtile60': np.percentile(read_length, 60),
            'qtile70': np.percentile(read_length, 70),
            'qtile80': np.percentile(read_length, 80),
            'qtile90': np.percentile(read_length, 90),
            'qtile99': np.percentile(read_length, 99)
        }
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
Code example #16
    def run(self, allArgs=None):
        bamFilesHandles = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                y = pyBigWig.open(x)
            bamFilesHandles.append(y)

        chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose)

        # Skip the chromosomes in the list. This is usually done for the
        # X chromosome, which may have either one copy in a male sample
        # or a mixture of male/female samples and is therefore unreliable.
        # The skipped chromosomes may also include heterochromatic regions and
        # mitochondrial DNA.
        if len(self.chrsToSkip):
            chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromsizes))

        genomeSize = sum(chrLengths)

        if self.bedFile is None:
            chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths)
        else:
            chunkSize = None

        [bam_h.close() for bam_h in bamFilesHandles]

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromsizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors,
                                       transcriptID=transcriptID,
                                       exonID=exonID,
                                       keepExons=keepExons,
                                       transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
Code example #17
File: getScaleFactor.py Project: fidelram/deepTools
def fraction_kept(args, stats):
    """
    Count the following:
    (A) The total number of alignments sampled
    (B) The total number of alignments ignored due to any of the following:
        --samFlagInclude
        --samFlagExclude
        --minMappingQuality
        --ignoreDuplicates
        --minFragmentLength
        --maxFragmentLength

    Black list regions are already accounted for. This works by sampling the
    genome (by default, we'll iterate until we sample 10% of the alignments or
    1,000,000 alignments, whichever is larger, unless there are fewer than
    1,000,000 alignments, in which case everything is sampled).

    The sampling works by dividing the genome into bins and only looking at the
    first 50,000 bases of each bin. If this doesn't yield sufficient alignments,
    the sampling is repeated at non-overlapping offsets within each bin.
    """
    # Do we even need to proceed?
    if (not args.minMappingQuality or args.minMappingQuality == 0) and \
       (not args.samFlagInclude or args.samFlagInclude == 0) and \
       (not args.samFlagExclude or args.samFlagExclude == 0) and \
       (not args.minFragmentLength or args.minFragmentLength == 0) and \
       (not args.maxFragmentLength or args.maxFragmentLength == 0):
        if hasattr(args, "filterRNAstrand"):
            if args.filterRNAstrand not in ["forward", "reverse"]:
                return 1.0
        else:
            return 1.0

    filtered = 0
    total = 0
    distanceBetweenBins = 2000000
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats)
    if bam_mapped < 1000000:
        num_needed_to_sample = bam_mapped
    else:
        if 0.1 * bam_mapped >= 1000000:
            num_needed_to_sample = 0.1 * bam_mapped
        else:
            num_needed_to_sample = 1000000
    if args.exactScaling:
        num_needed_to_sample = bam_mapped
    if num_needed_to_sample == bam_mapped:
        distanceBetweenBins = 55000
    if args.ignoreForNormalization:
        chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references)
                       if chrom_name not in args.ignoreForNormalization]
    else:
        chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    offset = 0
    # Iterate over bins at various non-overlapping offsets until we have enough data
    while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000):
        res = mapReduce.mapReduce((bam_handle.filename, args, offset),
                                  getFractionKept_wrapper,
                                  chrom_sizes,
                                  genomeChunkLength=distanceBetweenBins,
                                  blackListFileName=args.blackListFileName,
                                  numberOfProcessors=args.numberOfProcessors,
                                  verbose=args.verbose)

        if len(res):
            foo, bar = np.sum(res, axis=0)
            filtered += foo
            total += bar
        offset += 1

    if total == 0:
        # This should never happen
        total = 1

    return 1.0 - float(filtered) / float(total)
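
When everything must be sampled, distanceBetweenBins is set to 55,000 so that the staggered passes cover nearly the whole genome: the loop allows up to ceil(distanceBetweenBins / 50000) offsets. A small sketch of that loop bound:

import numpy as np

distanceBetweenBins = 55000
max_offsets = int(np.ceil(distanceBetweenBins / 50000))  # 2
print(list(range(max_offsets)))  # [0, 1] -- two staggered sampling passes
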
Code example #18
def main(args=None):
    args = parseArguments().parse_args(args)

    if not args.sampleLabels and args.smartLabels:
        args.sampleLabels = smartLabels(args.bamfiles)

    if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles):
        sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n")
        sys.exit(1)

    if args.outFile is None:
        of = sys.stdout
    else:
        of = open(args.outFile, "w")

    bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles]
    mapped = [x[1] for x in bhs]
    unmappedList = [x[2] for x in bhs]
    bhs = [x[0] for x in bhs]

    # Get the reads in blacklisted regions
    if args.blackListFileName:
        blacklisted = []
        for bh in bhs:
            blacklisted.append(utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors))
    else:
        blacklisted = [0] * len(bhs)

    # Get the total and mapped reads
    total = [x + y for x, y in list(zip(mapped, unmappedList))]

    chrom_sizes = list(zip(bhs[0].references, bhs[0].lengths))
    for x in bhs:
        x.close()

    # Get the remaining metrics
    res = mapReduce([args],
                    getFiltered_worker,
                    chrom_sizes,
                    genomeChunkLength=args.binSize + args.distanceBetweenBins,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    totals = [0] * len(args.bamfiles)
    nFiltered = [0] * len(args.bamfiles)
    MAPQs = [0] * len(args.bamfiles)
    flagIncludes = [0] * len(args.bamfiles)
    flagExcludes = [0] * len(args.bamfiles)
    internalDupes = [0] * len(args.bamfiles)
    externalDupes = [0] * len(args.bamfiles)
    singletons = [0] * len(args.bamfiles)
    rnaStrand = [0] * len(args.bamfiles)
    for x in res:
        for idx, r in enumerate(x):
            totals[idx] += r[0]
            nFiltered[idx] += r[1]
            MAPQs[idx] += r[2]
            flagIncludes[idx] += r[3]
            flagExcludes[idx] += r[4]
            internalDupes[idx] += r[5]
            externalDupes[idx] += r[6]
            singletons[idx] += r[7]
            rnaStrand[idx] += r[8]

    # Print some output
    of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n")
    for idx, _ in enumerate(args.bamfiles):
        if args.sampleLabels:
            of.write(args.sampleLabels[idx])
        else:
            of.write(args.bamfiles[idx])
        of.write("\t{}\t{}\t{}".format(total[idx], mapped[idx], blacklisted[idx]))
        # nFiltered
        metric = 0.0
        if totals[idx] > 0:
            metric = blacklisted[idx] + float(nFiltered[idx]) / float(totals[idx]) * mapped[idx]
        of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        # The remaining per-filter metrics are all scaled the same way, so
        # loop over the counters in output-column order: MAPQ, samFlagInclude,
        # samFlagExclude, internal duplicates, marked duplicates, singletons,
        # filterRNAstrand
        for counter in (MAPQs, flagIncludes, flagExcludes, internalDupes,
                        externalDupes, singletons, rnaStrand):
            metric = 0.0
            if totals[idx] > 0:
                metric = float(counter[idx]) / float(totals[idx]) * mapped[idx]
            of.write("\t{}".format(min(round(metric, 1), mapped[idx])))
        of.write("\n")

    if args.outFile is not None:
        of.close()

    return 0
Code example #21
def writeBedGraph(bamFilesList,
                  outputFileName,
                  fragmentLength,
                  func,
                  funcArgs,
                  tileSize=25,
                  region=None,
                  numberOfProcessors=None,
                  format="bedgraph",
                  extendPairedEnds=True,
                  zerosToNans=True,
                  smoothLength=0,
                  minMappingQuality=None,
                  ignoreDuplicates=False,
                  fragmentFromRead_func=None,
                  centerRead=False):
    r"""
    Given a list of BAM files, a function, and the function's arguments,
    this method writes a bedgraph (or bigwig) file
    for a partition of the genome into tiles of the given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    >>> test = Tester()
    >>> import tempfile
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph( [test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()

    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamFilesList, func, funcArgs,
         extendPairedEnds, smoothLength, zerosToNans, minMappingQuality,
         ignoreDuplicates, fragmentFromRead_func, centerRead),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            _foo = open(tempFileName, 'rb')
            shutil.copyfileobj(_foo, outFile)
            _foo.close()
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName,
                         False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
Code example #22
def writeBedGraph(bamOrBwFileList,
                  outputFileName,
                  fragmentLength,
                  func,
                  funcArgs,
                  tileSize=25,
                  region=None,
                  blackListFileName=None,
                  numberOfProcessors=None,
                  format="bedgraph",
                  extendPairedEnds=True,
                  missingDataAsZero=False,
                  smoothLength=0,
                  fixed_step=False):
    r"""
    Given a list of BAM files, a function, and the function's arguments,
    this method writes a bedgraph (or bigwig) file
    for a partition of the genome into tiles of the given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    """

    bamHandlers = [
        bamHandler.openBam(indexedFile)
        for indexedFile, fileFormat in bamOrBwFileList if fileFormat == 'bam'
    ]
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [
            fileName for fileName, fileFormat in bamOrBwFileList
            if fileFormat == 'bigwig'
        ]
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in list(bwh.chroms().items()):
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "bigwig files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bigwigs[0], size, bw))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
         extendPairedEnds, smoothLength, missingDataAsZero, fixed_step),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            _foo = open(tempFileName, 'rb')
            shutil.copyfileobj(_foo, outFile)
            _foo.close()
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(bedGraphFile)
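An illustrative call, assuming the (fileName, fileFormat) tuple convention visible in the loops above; the bigwig paths are placeholders and scaleCoverage is a stand-in reducer borrowed from the doctests of the sibling variants:

funcArgs = {'scaleFactor': 1.0}
bamOrBwFileList = [("sample1.bw", "bigwig"), ("sample2.bw", "bigwig")]
writeBedGraph(bamOrBwFileList,
              "scores.bg",
              fragmentLength=0,
              func=scaleCoverage,   # placeholder reducer
              funcArgs=funcArgs,
              tileSize=25,
              numberOfProcessors=4)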
Code example #23
def main(args=None):
    args = parseArguments().parse_args(args)
    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit(
                "The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
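            # e.g. --shift 4 -5 expands to [4, -5, 5, -4]: the appended pair
            # mirrors the first so the reverse strand is shifted symmetrically
            # (compare the --ATACshift default below)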
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bam,
                                           returnStats=True,
                                           nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce([args, chromDict],
                    filterWorker,
                    chrom_sizes,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(
            tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(
                tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            # write the filtered-out reads to their own file, not the main output
            convertBED(args.filteredOutReads, tmpFiles, chromDict, args)

    if args.filterMetrics:
        sampleName = args.bam
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bam])[0]

        of = open(args.filterMetrics, "w")
        of.write("#bamFilterReads --filterMetrics\n")
        of.write("#File\tReads Remaining\tTotal Initial Reads\n")
        of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered,
                                       total))
        of.close()

    return 0
Code example #24
File: writeBedGraph.py Project: avilella/deepTools
def writeBedGraph(bamFilesList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True, smoothLength=0,
                  minMappingQuality=None, ignoreDuplicates=False,
                  fragmentFromRead_func=None,
                  centerRead=False, samFlag=None):

    r"""
    Given a list of BAM files, a function, and the function's arguments,
    this method writes a bedgraph (or bigwig) file
    for a partition of the genome into tiles of the given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    >>> test = Tester()
    >>> import tempfile
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph( [test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()

    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamFilesList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, minMappingQuality,
                               ignoreDuplicates,
                               fragmentFromRead_func, centerRead, samFlag),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            _foo = open(tempFileName, 'rb')
            shutil.copyfileobj(_foo, outFile)
            _foo.close()
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, sort=True)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
Code example #25
    def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
        r"""
        Given a list of BAM files, a function, and the function's arguments,
        this method writes a bedgraph (or bigwig) file
        for a partition of the genome into tiles of the given size
        and a value for each tile that corresponds to the given function
        and that is related to the coverage underlying the tile.

        Parameters
        ----------
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.


        """
        self.__dict__["smoothLength"] = smoothLength
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = getGenomeChunkLength(bam_handlers, self.binLength)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in self.__dict__.keys():
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files
        out_file = open(out_file_name + ".bg", 'wb')
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                _foo = open(tempfilename, 'rb')
                shutil.copyfileobj(_foo, out_file)
                _foo.close()
                os.remove(tempfilename)

        bedgraph_file = out_file.name
        out_file.close()
        if format == 'bedgraph':
            os.rename(bedgraph_file, out_file_name)
            if self.verbose:
                print "output file: {}".format(out_file_name)
        else:
            bedGraphToBigWig(
                chrom_names_and_size, bedgraph_file, out_file_name, True)
            if self.verbose:
                print "output file: {}".format(out_file_name)
            os.remove(bedgraph_file)
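run() receives the reducer by name as a string, per its docstring; the sketch below only illustrates the kind of per-tile reduction such a function performs, and both its signature and the pseudocount handling are assumptions:

import numpy as np

def log2_ratio(tile_coverage, args):
    # tile_coverage is assumed to hold [coverage_bam1, coverage_bam2] for one tile
    pseudo = args.get('pseudocount', 1.0)
    return np.log2((tile_coverage[0] + pseudo) / (tile_coverage[1] + pseudo))

func_args = {'pseudocount': 1.0}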
Code example #26
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    d : tuple
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median, etc.
        values; each may be None when nothing could be sampled.
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    distanceBetweenBins *= 2
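    # doubled here so that the first pass of the loop below, which halves
    # distanceBetweenBins before sampling, starts at the requested distance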
    fl = []
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
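A short usage sketch; the BAM path is a placeholder and the None checks mirror the early-exit branches above:

frag_len_dict, read_len_dict = get_read_and_fragment_length(
    "example.bam", return_lengths=False, numberOfProcessors=4)
if read_len_dict is not None:
    print("median read length: {}".format(read_len_dict['median']))
if frag_len_dict is not None:
    print("median fragment length: {}".format(frag_len_dict['median']))
else:
    print("no fragment lengths sampled; library may not be paired-end")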
Code example #27
def getScorePerBin(bigWigFiles,
                   binLength,
                   numberOfProcessors=1,
                   verbose=False,
                   region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=[],
                   out_file_for_raw_data=None,
                   allArgs=None):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()
    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files;
    # if too long, some processors end up idle.
    # The following is a heuristic.

    # get list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)
    # skip chromosomes in the list. This is usually done for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = list(zip(*chrom_sizes))
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunksize based on number of processors used
    chunkSize = max(sum(chrlengths) / numberOfProcessors, int(1e6))
    # make chunkSize multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        print("step size is {}".format(stepSize))

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)
    # mapReduce( (staticArgs), func, chromSize, etc. )
    if out_file_for_raw_data:
        save_file = True
    else:
        save_file = False

    # Handle GTF options
    transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(
        allArgs)

    imap_res = mapReduce.mapReduce(
        (bigWigFiles, stepSize, binLength, save_file),
        countReadsInRegions_wrapper,
        chrom_sizes,
        genomeChunkLength=chunkSize,
        bedFile=bedFile,
        blackListFileName=blackListFileName,
        region=region,
        numberOfProcessors=numberOfProcessors,
        transcriptID=transcriptID,
        exonID=exonID,
        keepExons=keepExons,
        transcript_id_designator=transcript_id_designator)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write(
                "*Warning*\nThe resulting bed file does not contain information for "
                "the chromosomes that were not common between the bigwig files\n"
            )

        # concatenate intermediary bedgraph files
        ofile = open(out_file_for_raw_data, "w")
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                f = open(tempFileName, 'r')
                shutil.copyfileobj(f, ofile)
                f.close()
                os.remove(tempFileName)

        ofile.close()

    # the matrix scores are in the first element of each of the entries in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin
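A worked instance of the chunk-size heuristic above, with invented chromosome sizes; // stands in for the original / to make the intended integer arithmetic explicit:

chrlengths = (23011544, 21146708, 24543557, 27905053)   # invented sizes
numberOfProcessors = 4
binLength = 50
chunkSize = max(sum(chrlengths) // numberOfProcessors, int(1e6))   # 24151715
chunkSize -= chunkSize % binLength   # 24151700, now a multiple of binLength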
Code example #28
def getScorePerBin(bigWigFiles, binLength,
                   numberOfProcessors=1,
                   verbose=False, region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=[],
                   out_file_for_raw_data=None):
    """
    This function returns a matrix containing scores (median) for the coverage
    of fragments within a region. Each row corresponds to a sampled region.
    Likewise, each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()
    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    """

    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files;
    # if too long, some processors end up idle.
    # The following is a heuristic.

    # get list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)
    # skip chromosomes in the list. This is usually done for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = zip(*chrom_sizes)
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunksize based on number of processors used
    chunkSize = max(sum(chrlengths) / numberOfProcessors, int(1e6))
    # make chunkSize multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)
    # mapReduce( (staticArgs), func, chromSize, etc. )
    if out_file_for_raw_data:
        save_file = True
    else:
        save_file = False

    imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
                                   countReadsInRegions_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   blackListFileName=blackListFileName,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                _foo = open(tempFileName, 'r')
                shutil.copyfileobj(_foo, out_file_for_raw_data)
                _foo.close()
                os.remove(tempFileName)

        out_file_for_raw_data.close()

    # the matrix scores are in the first element of each of the entries in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin
Code example #29
File: plotEnrichment.py Project: fidelram/deepTools
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Treat an empty attributeKey as though it were not given
    if args.attributeKey == "":
        args.attributeKey = None

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
        of.close()
Code example #30
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=1,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        skipZeroOverZero=False, smoothLength=0, fixed_step=False, verbose=False):
    r"""
    Given a list of BAM files, a function, and the function's arguments,
    this method writes a bedgraph (or bigwig) file
    for a partition of the genome into tiles of the given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    """
    bamHandles = []
    mappedList = []
    for indexedFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bam, mapped, unmapped, stats = bamHandler.openBam(indexedFile, returnStats=True, nThreads=numberOfProcessors)
            bamHandles.append(bam)
            mappedList.append(mapped)

    if len(bamHandles):
        genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize, mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose)
    else:
        genomeChunkLength = int(10e6)
        cCommon = []
        chromNamesAndSize = {}
        for fileName, fileFormat in bamOrBwFileList:
            if fileFormat == 'bigwig':
                fh = pyBigWig.open(fileName)
            else:
                continue

            for chromName, size in list(fh.chroms().items()):
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "input files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bamOrBwFileList[0][0], size, fileName))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            fh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               skipZeroOverZero, missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=numberOfProcessors,
                              verbose=verbose)

    # Determine the sorted order of the temp files
    chrom_order = dict()
    for i, _ in enumerate(chromNamesAndSize):
        chrom_order[_[0]] = i
    res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
    res.sort()

    if format == 'bedgraph':
        of = open(outputFileName, 'wb')
        for r in res:
            if r is not None:
                _ = open(r[3], 'rb')
                shutil.copyfileobj(_, of)
                _.close()
                os.remove(r[3])
        of.close()
    else:
        bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res], outputFileName)
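A tiny illustration of the reordering step above, with invented chunk tuples; sorting the [chromosome index, start, end, filename] lists restores genome order before the temp files are concatenated:

chromNamesAndSize = [("chr1", 1000), ("chr2", 800)]
chrom_order = {name: i for i, (name, _size) in enumerate(chromNamesAndSize)}
res = [("chr2", 0, 800, "/tmp/c"),
       ("chr1", 500, 1000, "/tmp/b"),
       ("chr1", 0, 500, "/tmp/a")]
res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
res.sort()   # temp files now come out as /tmp/a, /tmp/b, /tmp/c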
Code example #31
    def run(self, allArgs=None):
        bamFilesHandles = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except SystemExit:
                sys.exit(sys.exc_info()[1])
            except:
                y = pyBigWig.open(x)
            bamFilesHandles.append(y)

        chromsizes, non_common = deeptools.utilities.getCommonChrNames(
            bamFilesHandles, verbose=self.verbose)

        # skip chromosomes in the list. This is usually done for the
        # X chromosome, which may have either one copy in a male sample
        # or a mixture of male/female samples and is therefore unreliable.
        # The skip list may also contain heterochromatic regions and
        # mitochondrial DNA
        if len(self.chrsToSkip):
            chromsizes = [x for x in chromsizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromsizes))

        genomeSize = sum(chrLengths)

        if self.bedFile is None:
            chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize,
                                              chromsizes, chrLengths)
        else:
            chunkSize = None

        [bam_h.close() for bam_h in bamFilesHandles]

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(
            allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce(
            [],
            countReadsInRegions_wrapper,
            chromsizes,
            self_=self,
            genomeChunkLength=chunkSize,
            bedFile=self.bedFile,
            blackListFileName=self.blackListFileName,
            region=self.region,
            numberOfProcessors=self.numberOfProcessors,
            transcriptID=transcriptID,
            exonID=exonID,
            keepExons=keepExons,
            transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write(
                    "*Warning*\nThe resulting bed file does not contain information for "
                    "the chromosomes that were not common between the bigwig files\n"
                )

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res],
                                               axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit(
                    '\nNo coverage values could be computed.\n\n'
                    'Please check that the chromosome names in the BED file are found in the bam files.\n\n'
                    'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit(
                    '\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                    'contain mapped reads.')
Code example #32
def writeBedGraph(bamOrBwFileList,
                  outputFileName,
                  fragmentLength,
                  func,
                  funcArgs,
                  tileSize=25,
                  region=None,
                  blackListFileName=None,
                  numberOfProcessors=1,
                  format="bedgraph",
                  extendPairedEnds=True,
                  missingDataAsZero=False,
                  smoothLength=0,
                  fixed_step=False,
                  verbose=False):
    r"""
    Given a list of BAM files, a function, and the function's arguments,
    this method writes a bedgraph (or bigwig) file
    for a partition of the genome into tiles of the given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    """
    bamHandles = []
    mappedList = []
    for indexedFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bam, mapped, unmapped, stats = bamHandler.openBam(
                indexedFile, returnStats=True, nThreads=numberOfProcessors)
            bamHandles.append(bam)
            mappedList.append(mapped)

    if len(bamHandles):
        genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize,
                                                 mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose)
    else:
        genomeChunkLength = int(10e6)
        cCommon = []
        chromNamesAndSize = {}
        for fileName, fileFormat in bamOrBwFileList:
            if fileFormat == 'bigwig':
                fh = pyBigWig.open(fileName)
            else:
                continue

            for chromName, size in list(fh.chroms().items()):
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "input files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bamOrBwFileList[0][0], size, fileName))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            fh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
         extendPairedEnds, smoothLength, missingDataAsZero, fixed_step),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose)

    # Determine the sorted order of the temp files
    chrom_order = dict()
    for i, _ in enumerate(chromNamesAndSize):
        chrom_order[_[0]] = i
    res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
    res.sort()

    if format == 'bedgraph':
        of = open(outputFileName, 'wb')
        for r in res:
            if r is not None:
                _ = open(r[3], 'rb')
                shutil.copyfileobj(_, of)
                _.close()
                os.remove(r[3])
        of.close()
    else:
        bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res],
                         outputFileName)
Code example #33
    def run(self,
            func_to_call,
            func_args,
            out_file_name,
            blackListFileName=None,
            format="bedgraph",
            smoothLength=0):
        r"""
        Given a list of BAM files, a function, and the function's arguments,
        this method writes a bedgraph (or bigwig) file
        for a partition of the genome into tiles of the given size
        and a value for each tile that corresponds to the given function
        and that is related to the coverage underlying the tile.

        Parameters
        ----------
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.


        """
        self.__dict__["smoothLength"] = smoothLength
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = getGenomeChunkLength(bam_handlers,
                                                   self.binLength)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers,
                                                             verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        for x in list(self.__dict__.keys()):
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files
        out_file = open(out_file_name + ".bg", 'wb')
        for tempfilename in res:
            if tempfilename:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                _foo = open(tempfilename, 'rb')
                shutil.copyfileobj(_foo, out_file)
                _foo.close()
                os.remove(tempfilename)

        bedgraph_file = out_file.name
        out_file.close()
        if format == 'bedgraph':
            os.rename(bedgraph_file, out_file_name)
            if self.verbose:
                print("output file: {}".format(out_file_name))
        else:
            bedGraphToBigWig(chrom_names_and_size, bedgraph_file,
                             out_file_name, True)
            if self.verbose:
                print("output file: {}".format(out_file_name))
            os.remove(bedgraph_file)
Code example #34
    def run(self, allArgs=None):
        # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
        # workers for analysis. If too short, too much time is spent loading the files;
        # if too long, some processors end up idle.
        # The following values are empirical.
        bamFilesHandlers = []
        for x in self.bamFilesList:
            try:
                y = bamHandler.openBam(x)
            except:
                y = pyBigWig.open(x)
            bamFilesHandlers.append(y)
        chromSizes, non_common = deeptools.utilities.getCommonChrNames(
            bamFilesHandlers, verbose=self.verbose)

        # skip chromosomes in the list. This is usually done for the
        # X chromosome, which may have either one copy in a male sample
        # or a mixture of male/female samples and is therefore unreliable.
        # The skip list may also contain heterochromatic regions and
        # mitochondrial DNA
        if len(self.chrsToSkip):
            chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = list(zip(*chromSizes))

        genomeSize = sum(chrLengths)
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(
                    int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start,
                 end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(
                    int(float(end - start) / self.numberOfSamples), 1)

        # number of samples is better if large
        if np.mean(chrLengths) < self.stepSize and self.bedFile is None:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError(
                "numberOfSamples has to be bigger than {} ".format(
                    min_num_of_samples))

        max_mapped = []
        for x in bamFilesHandlers:
            try:
                max_mapped.append(x.mapped)
            except:
                # bigWig, use a fixed value
                max_mapped.append(0)
        max_mapped = max(max_mapped)

        # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin
        if max_mapped == 0:
            chunkSize = 10000 * self.binLength
            self.stepSize = self.binLength
        else:
            reads_per_bp = float(max_mapped) / genomeSize
            chunkSize = int(self.stepSize * 1e3 /
                            (reads_per_bp * len(bamFilesHandlers)))
        [bam_h.close() for bam_h in bamFilesHandlers]

        # Ensure that chunkSize is always at least self.stepSize
        if chunkSize < self.stepSize:
            chunkSize = self.stepSize

        if self.verbose:
            print("step size is {}".format(self.stepSize))

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Handle GTF options
        transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(
            allArgs)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce(
            [],
            countReadsInRegions_wrapper,
            chromSizes,
            self_=self,
            genomeChunkLength=chunkSize,
            bedFile=self.bedFile,
            blackListFileName=self.blackListFileName,
            region=self.region,
            numberOfProcessors=self.numberOfProcessors,
            transcriptID=transcriptID,
            exonID=exonID,
            keepExons=keepExons,
            transcript_id_designator=transcript_id_designator)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write(
                    "*Warning*\nThe resulting bed file does not contain information for "
                    "the chromosomes that were not common between the bigwig files\n"
                )

            # concatenate intermediary bedgraph files
            ofile = open(self.out_file_for_raw_data, "w")
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    _foo = open(tempFileName, 'r')
                    shutil.copyfileobj(_foo, ofile)
                    _foo.close()
                    os.remove(tempFileName)

            ofile.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res],
                                               axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit(
                    '\nNo coverage values could be computed.\n\n'
                    'Please check that the chromosome names in the BED file are found in the bam files.\n\n'
                    'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit(
                    '\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                    'contain mapped reads.')
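A worked instance of the sampling heuristic in run() above, with invented numbers:

genomeSize = 120000000        # invented genome size
numberOfSamples = 100000
stepSize = max(int(float(genomeSize) / numberOfSamples), 1)    # 1200 bp
max_mapped = 24000000         # deepest of the BAM files, invented
reads_per_bp = float(max_mapped) / genomeSize                  # 0.2
n_handles = 2                 # two BAM files open
chunkSize = int(stepSize * 1e3 / (reads_per_bp * n_handles))   # 3000000 bp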
Code example #35
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit(
            "Error: You need to specify at least one of --plotFile or --outRawCounts!\n"
        )

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit(
            "Error: The number of labels ({0}) does not match the number of BAM files ({1})!"
            .format(len(args.labels), len(args.bamfiles)))

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        args.bamfiles[0],
        return_lengths=False,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit(
                    "*ERROR*: library is not paired-end. Please provide an extension length."
                )
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write(
                "*WARNING*: read extension is smaller than read length (read length = {}). "
                "Reads will not be extended.\n".format(
                    int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit(
                "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                .format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\n")
            for i, x in enumerate(args.labels):
                for k, v in featureCounts[i].items():
                    of.write("{0}\t{1}\t{2:5.2f}\n".format(
                        x, k, (100.0 * v) / totalCounts[i]))
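To make the reduce step above concrete, here is a minimal, self-contained sketch of how per-chunk results are folded into per-file feature counts and totals, and how the percentages written to --outRawCounts are derived. All numbers are invented; each chunk mirrors the (count dicts, feature names, totals) structure the loop above expects from mapReduce.

# Self-contained sketch of the reduce step; all values are invented.
# Each chunk: (per-file feature-count dicts, feature names, per-file totals).
chunks = [
    ([{'exon': 10, 'intron': 5}, {'exon': 2, 'intron': 8}],
     ['exon', 'intron'], [100, 50]),
    ([{'exon': 4, 'intron': 1}, {'exon': 6, 'intron': 3}],
     ['exon', 'intron'], [40, 60]),
]

nFiles = 2
features = chunks[0][1]
featureCounts = [{x: 0 for x in features} for _ in range(nFiles)]
totalCounts = [0] * nFiles
for counts, _, totals in chunks:
    for i, y in enumerate(totals):
        totalCounts[i] += y
    for i, y in enumerate(counts):
        for k, v in y.items():
            featureCounts[i][k] += v

# Percentages as written to --outRawCounts: 100 * count / total, per file.
for i in range(nFiles):
    for k, v in featureCounts[i].items():
        print("file{0}\t{1}\t{2:5.2f}".format(i, k, 100.0 * v / totalCounts[i]))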
Code example #36
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=None,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        smoothLength=0, fixed_step=False, verbose=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    """

    bamHandlers = [bamHandler.openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=verbose)
    else:
        genomeChunkLength = int(10e6)
        cCommon = []
        chromNamesAndSize = {}
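        # Collect chromosome sizes from every bigWig input; a chromosome seen
        # in more than one file is recorded as common, and conflicting sizes
        # are reconciled by keeping the smallest.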
        for fileName, fileFormat in bamOrBwFileList:
            if fileFormat == 'bigwig':
                fh = pyBigWig.open(fileName)
            else:
                continue

            for chromName, size in fh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print("\nWARNING\n"
                              "Chromosome {} length reported in the "
                              "input files differ.\n{} for {}\n"
                              "{} for {}.\n\nThe smallest "
                              "length will be used".format(
                                  chromName, chromNamesAndSize[chromName],
                                  bamOrBwFileList[0][0], size, fileName))
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            fh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)
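        # e.g. a region string such as "chr1:0:1000" would become
        # "chr1:0:1000:25" (the exact region format is parsed by mapReduce)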

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              blackListFileName=blackListFileName,
                              numberOfProcessors=numberOfProcessors,
                              verbose=verbose)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            _foo = open(tempFileName, 'rb')
            shutil.copyfileobj(_foo, outFile)
            _foo.close()
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if verbose:
            print("output file: %s" % outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, True)
        if verbose:
            print("output file: %s" % outputFileName)
        os.remove(bedGraphFile)
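For orientation, a call might look like the sketch below. The input file name, the scaling function, and the funcArgs dict are hypothetical placeholders; the real callers in deepTools pass their own func/funcArgs pairs, and the exact calling convention of func is defined in writeBedGraph_wrapper, which is not shown in this snippet. Only the writeBedGraph signature itself comes from the code above.

# Hypothetical usage sketch; 'sample.bam' and scaleBy are placeholders.
def scaleBy(tileCoverage, args):
    # Assumed convention: tileCoverage holds one value per input file and
    # funcArgs is forwarded by the wrapper (not shown in this snippet).
    return args['scaleFactor'] * tileCoverage[0]

writeBedGraph(
    [('sample.bam', 'bam')],        # (fileName, fileFormat) pairs
    'coverage.bw',                  # output file name
    200,                            # fragment length
    scaleBy,                        # func applied to each tile's coverage
    {'scaleFactor': 1.0},           # funcArgs forwarded to func
    tileSize=50,
    format='bigwig',
    numberOfProcessors=4,
    verbose=True)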