def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, numberOfProcessors=None,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        smoothLength=0, fixed_step=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    """

    bamHandlers = [bamHandler.openBam(indexedFile) for
                   indexedFile,
                   fileFormat in bamOrBwFileList if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName,
                   fileFormat in bamOrBwFileList if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print "\nWARNING\n" \
                            "Chromosome {} length reported in the " \
                            "bigwig files differ.\n{} for {}\n" \
                            "{} for {}.\n\nThe smallest " \
                            "length will be used".format(
                                chromName, chromNamesAndSize[chromName],
                                bigwigs[0], size, bw)
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
def writeBedGraph(
    bamOrBwFileList,
    outputFileName,
    fragmentLength,
    func,
    funcArgs,
    tileSize=25,
    region=None,
    numberOfProcessors=None,
    format="bedgraph",
    extendPairedEnds=True,
    missingDataAsZero=False,
    smoothLength=0,
    fixed_step=False,
):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    """

    bamHandlers = [
        bamHandler.openBam(indexedFile) for indexedFile, fileFormat in bamOrBwFileList if fileFormat == "bam"
    ]
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList if fileFormat == "bigwig"]
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print "\nWARNING\n" "Chromosome {} length reported in the " "bigwig files differ.\n{} for {}\n" "{} for {}.\n\nThe smallest " "length will be used".format(
                            chromName, chromNamesAndSize[chromName], bigwigs[0], size, bw
                        )
                        chromNamesAndSize[chromName] = min(chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems() if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (
            tileSize,
            fragmentLength,
            bamOrBwFileList,
            func,
            funcArgs,
            extendPairedEnds,
            smoothLength,
            missingDataAsZero,
            fixed_step,
        ),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        numberOfProcessors=numberOfProcessors,
    )

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", "wb")
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, "rb"), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == "bedgraph":
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None,
                      ignoreDuplicates=False,
                      chrsToSkip=[],
                      stepSize=None,
                      samFlag=None):

    r"""
    This function visits a number of sites and returs a matrix containing read
    counts. Each row to one sampled site and each column correspond to each of
    the bamFiles.

    If the chrsToSkip is given, then counts are filter out from this
    chromosome which, unless a female sample is used, the counts are less
    compared to autosomes. For most applications this is irrelevant but for
    other cases, like when stimating the best scaling factor, this is
    important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1
    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """

    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to 
    # workers for analysis. If too short, too much time is spend loading the files
    # if too long, some processors end up free.
    # the following values are empirical

    bamFilesHandlers = [ bamHandler.openBam(x) for x in bamFilesList ]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy  in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(chrsToSkip): chromSizes = [ x for x in chromSizes if x[0] not in chrsToSkip ]

    chrNames, chrLengths = zip(*chromSizes)

    genomeSize = sum(chrLengths)
    max_mapped = max( [ x.mapped for x in  bamFilesHandlers ] )

    reads_per_bp = float(max_mapped) / genomeSize
#    chunkSize =  int(100 / ( reads_per_bp  * len(bamFilesList)) )

    if stepSize is None:
        stepSize = max(int( float(genomeSize) / numberOfSamples ), 1 )

    chunkSize =  int (stepSize * 1e3 / ( reads_per_bp  * len(bamFilesHandlers)) )
    [ bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce( (bamFilesList, stepSize, binLength,
                                     defaultFragmentLength, skipZeros,
                                     extendPairedEnds, minMappingQuality,
                                     ignoreDuplicates, samFlag),
                                    countReadsInRegions_wrapper,
                                    chromSizes,
                                    genomeChunkLength=chunkSize,
                                    bedFile=bedFile,
                                    region=region,
                                    numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, numberOfProcessors=None,
        format="bedgraph", extendPairedEnds=True, zerosToNans=True,
        smoothLength=0, fixed_step=False):

    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([(test.bamFile1, 'bam')], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()


    """

    bigwig_info = cfg.config.get('external_tools', 'bigwig_info')
    bamHandlers = [openBam(indexedFile) for
                   indexedFile,
                   fileFormat in bamOrBwFileList if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName,
                   fileFormat in bamOrBwFileList if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            inBlock = False
            for line in os.popen(
                    "{} -chroms {}".format(bigwig_info, bw)).readlines():

                if line[0:10] == "chromCount":
                    inBlock = True
                    continue

                if line[0:5] == "bases":
                    break
                if inBlock:
                    chromName, id, size = line.strip().split(" ")
                    size = int(size)
                    if chromName in chromNamesAndSize:
                        cCommon.append(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print "\nWARNING\n" \
                                "Chromosome {} length reported in the " \
                                "bigwig files differ.\n{} for {}\n" \
                                "{} for {}.\n\nThe smallest " \
                                "length will be used".format(
                                chromName, chromNamesAndSize[chromName],
                                bigwigs[0], size, bigwigs[1])
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
Beispiel #5
0
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None,
                      ignoreDuplicates=False,
                      chrsToSkip=[],
                      stepSize=None):

    r"""
    This function visits a number of sites and returs a matrix containing read
    counts. Each row to one sampled site and each column correspond to each of
    the bamFiles.

    If the chrsToSkip is given, then counts are filter out from this
    chromosome which, unless a female sample is used, the counts are less
    compared to autosomes. For most applications this is irrelevant but for
    other cases, like when stimating the best scaling factor, this is
    important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1
    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """

    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to 
    # workers for analysis. If too short, too much time is spend loading the files
    # if too long, some processors end up free.
    # the following values are empirical

    bamFilesHandlers = [ bamHandler.openBam(x) for x in bamFilesList ]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy  in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(chrsToSkip): chromSizes = [ x for x in chromSizes if x[0] not in chrsToSkip ]

    chrNames, chrLengths = zip(*chromSizes)

    genomeSize = sum(chrLengths)
    max_mapped = max( [ x.mapped for x in  bamFilesHandlers ] )

    reads_per_bp = float(max_mapped) / genomeSize
#    chunkSize =  int(100 / ( reads_per_bp  * len(bamFilesList)) )

    if stepSize is None:
        stepSize = max(int( float(genomeSize) / numberOfSamples ), 1 )

    chunkSize =  int (stepSize * 1e3 / ( reads_per_bp  * len(bamFilesHandlers)) )
    [ bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce( (bamFilesList, stepSize, binLength,
                                     defaultFragmentLength, skipZeros,
                                     extendPairedEnds, minMappingQuality,
                                     ignoreDuplicates),
                                    countReadsInRegions_wrapper,
                                    chromSizes,
                                    genomeChunkLength=chunkSize,
                                    bedFile=bedFile,
                                    region=region,
                                    numberOfProcessors=numberOfProcessors)

    num_reads_per_bin = np.concatenate(imap_res, axis=0)
    return num_reads_per_bin
Beispiel #6
0
def writeBedGraph(bamFilesList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True, smoothLength=0,
                  minMappingQuality=None, ignoreDuplicates=False,
                  fragmentFromRead_func=None):

    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph( [test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()

    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamFilesList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, minMappingQuality,
                               ignoreDuplicates,
                               fragmentFromRead_func),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)
Beispiel #7
0
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None,
                      ignoreDuplicates=False,
                      chrsToSkip=[],
                      stepSize=None,
                      samFlag=None):

    r"""
    This function collects read counts (coverage) from several bam files and returns
    an numpy array with the results. This function does not explicitly do the
    coverage computation, instead divides the work into smaller chunks that are
    sent to individual processors.

    Parameters
    ----------
    bamFilesList : list
        List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']

    binLength : int
        Length of the window/bin. This value is overruled by ``bedFile`` if present.

    numberOfSamples : int
        Total number of samples. The genome is divided into ``numberOfSamples``, each
        with a window/bin length equal to ``binLength``. This value is overruled
        by ``stepSize`` in case such value is present and by ``bedFile`` in which
        case the number of samples and bins are defined in the bed file

    defaultFragmentLength : int
        fragment length to extend reads that are not paired. Paired reads are extended to
        the fragment length defined by the mate distance. For Illumina reads, usual values
        are around 300. This value can be determined using the peak caller MACS2 or can be
        approximated by the fragment lengths computed when preparing the library for sequencing.

    numberOfProcessors : int
        Number of processors to use. Default is 4

    skipZeros : bool
        Default is True. This option decides if regions having zero coverage in all bam files
        should be skipped or kept.

    verbose : bool
        Output messages. Default: False

    region : str
        Region to limit the computation in the form chrom:start:end.

    bedFile : str
        Name of a bed file containing the regions for wich to compute the coverage. This option
        overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.
    extendPairedEnds : bool
        Whether coverage should be computed for the extended read length (i.e. the region covered
        by the two mates or the regions expected to be covered by single-reads). Default: true

    minMappingQuality : int
        Reads of a mapping quality less than the give value are not considered. Default: None

    ignoreDuplicates : bool
        Whether read duplicates (same start, end position. If paired-end, same start-end for mates) are
        to be excluded. Default: false

    chrToSkip: list
        List with names of chromosomes that do not want to be included in the coverage computation.
        This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').

    stepSize : int
        the positions for which the coverage is computed are defined as follows:
        ``range(start, end, stepSize)``. Thus, a stepSize of 1, will compute
        the coverage at each base pair. If the stepSize is equal to the
        binLength then the coverage is computed for consecutive bins. If seepSize is
        smaller than the binLength, then teh bins will overlap.

    samFlag : int
        If given, only reads having such flag are considered. For example, to get only
        reads that are the first mates a samFlag of 64 could be used. Similarly, the
        samFlag can be used to select only reads mapping on the forward (or reverse) strand
        or to get only properly paired reads.

    Returns
    -------
    numpy array

        Each row correspond to each bin/bed region and each column correspond to each of
        the bamFiles. If ``skipZeros`` is used, then the result may have less rows
        than expected


    Examples
    --------

    The test data contains reads for 200 bp.

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """

    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to 
    # workers for analysis. If too short, too much time is spend loading the files
    # if too long, some processors end up free.
    # the following values are empirical

    bamFilesHandlers = [ bamHandler.openBam(x) for x in bamFilesList ]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosome in the list. This is usually for the
    # X chromosome which may have either one copy  in a male sample
    # or a mixture of male/female and is unreliable.
    # Also the skip may contain heterochromatic regions and
    # mitochondrial DNA
    if len(chrsToSkip): chromSizes = [ x for x in chromSizes if x[0] not in chrsToSkip ]

    chrNames, chrLengths = zip(*chromSizes)

    genomeSize = sum(chrLengths)
    max_mapped = max( [ x.mapped for x in  bamFilesHandlers ] )

    reads_per_bp = float(max_mapped) / genomeSize
#    chunkSize =  int(100 / ( reads_per_bp  * len(bamFilesList)) )

    if stepSize is None:
        stepSize = max(int( float(genomeSize) / numberOfSamples ), 1 )

    chunkSize =  int (stepSize * 1e3 / ( reads_per_bp  * len(bamFilesHandlers)) )
    [ bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce( (bamFilesList, stepSize, binLength,
                                     defaultFragmentLength, skipZeros,
                                     extendPairedEnds, minMappingQuality,
                                     ignoreDuplicates, samFlag),
                                    countReadsInRegions_wrapper,
                                    chromSizes,
                                    genomeChunkLength=chunkSize,
                                    bedFile=bedFile,
                                    region=region,
                                    numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin
Beispiel #8
0
def writeBedGraph(
        bamOrBwFileList, outputFileName, fragmentLength,
        func, funcArgs, tileSize=25, region=None, numberOfProcessors=None,
        format="bedgraph", extendPairedEnds=True, missingDataAsZero=False,
        smoothLength=0, fixed_step=False):

    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([(test.bamFile1, 'bam')], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()


    """

    bigwig_info = cfg.config.get('external_tools', 'bigwig_info')
    bamHandlers = [openBam(indexedFile) for
                   indexedFile,
                   fileFormat in bamOrBwFileList if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName,
                   fileFormat in bamOrBwFileList if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            inBlock = False
            for line in os.popen(
                    "{} -chroms {}".format(bigwig_info, bw)).readlines():

                if line[0:10] == "chromCount":
                    inBlock = True
                    continue

                if line[0:5] == "bases":
                    break
                if inBlock:
                    chromName, id, size = line.strip().split(" ")
                    size = int(size)
                    if chromName in chromNamesAndSize:
                        cCommon.append(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print "\nWARNING\n" \
                                "Chromosome {} length reported in the " \
                                "bigwig files differ.\n{} for {}\n" \
                                "{} for {}.\n\nThe smallest " \
                                "length will be used".format(
                                chromName, chromNamesAndSize[chromName],
                                bigwigs[0], size, bigwigs[1])
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one
            # bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(
            chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)