# Shared imports assumed by the snippets below. These functions are excerpts
# from deepTools modules, so the exact import paths may vary by version;
# names such as getFragmentLength_wrapper, countReadsInRegions_wrapper,
# getGenomeChunkLength, getChromSizes, bedGraphToBigWig, scaleCoverage,
# Tester, cfg and debug come from the surrounding modules and are not
# defined here.
import os
import shutil
import sys

import numpy as np
import pyBigWig

import deeptools.utilities
from deeptools import bamHandler, mapReduce


def peFragmentSize(bamFile, bamFileIndex=None, return_lengths=False,
                   numberOfProcessors=None, verbose=False):

    bamHandle = bamHandler.openBam(bamFile, bamFileIndex)
    chromSizes = zip(bamHandle.references, bamHandle.lengths)
    chunkSize = int(float(sum(bamHandle.lengths)) * 0.3 /
                    max(numberOfProcessors, len(bamHandle.lengths)))
    imap_res = mapReduce.mapReduce((bamHandle.filename, ),
                                   getFragmentLength_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)
    if len(fl):
        fragLength = {'sample_size': len(fl),
                      'min': fl.min(),
                      'qtile25': np.percentile(fl, 25),
                      'mean': np.mean(fl),
                      'median': np.median(fl),
                      'qtile75': np.percentile(fl, 75),
                      'max': fl.max(),
                      'std': np.std(fl)}
        if return_lengths:
            fragLength['lengths'] = fl
    else:
        fragLength = None
    return fragLength

def getScorePerBin(bigwigFilesList, binLength, numberOfProcessors=1,
                   skipZeros=True, verbose=False, region=None,
                   bedFile=None, chrsToSkip=[]):
    """
    This function returns a matrix containing scores (median) for the
    coverage of fragments within a region. Each row corresponds to a
    sampled region. Likewise, each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 5,))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])
    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files; if too long, some processors end up idle.
    # The following values are empirical.

    # get the list of common chromosome names and sizes
    chromSizes = getChromSizes(bigwigFilesList)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max(map(lambda x: getNumberOfFragmentsPerRegionFromBigWig(x, chromSizes),
                         bigwigFilesList))
    reads_per_bp = float(max_mapped) / genomeSize
    stepSize = binLength  # for consecutive bins
    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bigwigFilesList)))

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    imap_res = mapReduce.mapReduce((bigwigFilesList, stepSize, binLength, skipZeros),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    score_per_bin = np.concatenate(imap_res, axis=0)
    return score_per_bin

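# A minimal worked sketch (hypothetical numbers, not from the source) of the
# chunkSize heuristic above: with an average of 0.5 reads per base pair, two
# bigwig files and consecutive 50 bp bins, each worker task receives roughly
# 50 kb of genome.
reads_per_bp = 0.5   # assumed average coverage
n_files = 2          # number of bigwig files
step_size = 50       # binLength, for consecutive bins
chunk_size = int(step_size * 1e3 / (reads_per_bp * n_files))
assert chunk_size == 50000
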
def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, missingDataAsZero=False,
                  smoothLength=0, fixed_step=False):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.
    """
    bamHandlers = [bamHandler.openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if all bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            for chromName, size in bwh.chroms().items():
                if chromName in chromNamesAndSize:
                    cCommon.append(chromName)
                    if chromNamesAndSize[chromName] != size:
                        print "\nWARNING\n" \
                            "Chromosome {} lengths reported in the " \
                            "bigwig files differ.\n{} for {}\n" \
                            "{} for {}.\n\nThe smallest " \
                            "length will be used".format(
                                chromName, chromNamesAndSize[chromName],
                                bigwigs[0], size, bw)
                        chromNamesAndSize[chromName] = min(
                            chromNamesAndSize[chromName], size)
                else:
                    chromNamesAndSize[chromName] = size
            bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

def get_read_and_fragment_length(bamFile, bamFileIndex=None,
                                 return_lengths=False,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    :param bamFile: bam file name
    :param bamFileIndex: bam file index name
    :param return_lengths: bool, whether to also return the raw length arrays
    :param numberOfProcessors:
    :param verbose:
    :return: tuple of two dictionaries, one for the fragment length and the
        other for the read length. The dictionaries summarise the mean,
        median etc. values
    """

    bam_handle = bamHandler.openBam(bamFile, bamFileIndex)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    chunk_size = int(float(sum(bam_handle.lengths)) * 0.3 /
                     max(numberOfProcessors, len(bam_handle.lengths)))
    # avoid overly small chunk sizes when splitting the computation
    chunk_size = max(chunk_size, 100000)
    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)
    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None
        # guard against fragment_len_dict being None (e.g. single-end data)
        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict

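# Hypothetical usage sketch ('example.bam' and the processor count are
# placeholders, not from the source). The fragment-length dictionary is None
# for single-end data, so both results are checked before use.
frag_len_d, read_len_d = get_read_and_fragment_length('example.bam',
                                                      numberOfProcessors=4)
if read_len_d is not None:
    print "median read length: {}".format(read_len_d['median'])
if frag_len_d is not None:
    print "median fragment length: {}".format(frag_len_d['median'])
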
def run(self):
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers,
                                                                   verbose=self.verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    if self.stepSize is None:
        if self.region is None:
            self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

    # a large number of samples gives a better estimate
    if np.mean(chrLengths) < self.stepSize:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {}".format(min_num_of_samples))

    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if self.verbose:
        print "step size is {}".format(self.stepSize)

    if self.region:
        # in case a region is used, append the tile size
        self.region += ":{}".format(self.binLength)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bam files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                shutil.copyfileobj(open(tempFileName, 'r'), self.out_file_for_raw_data)
                os.remove(tempFileName)

        # self.out_file_for_raw_data.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')

def run(self):
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
    chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers,
                                                                   verbose=self.verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(self.chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    if self.stepSize is None:
        if self.region is None:
            self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
        else:
            # compute the step size based on the number of samples
            # and the length of the region studied
            (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
            self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

    # a large number of samples gives a better estimate
    if np.mean(chrLengths) < self.stepSize:
        min_num_of_samples = int(genomeSize / np.mean(chrLengths))
        raise ValueError("numberOfSamples has to be bigger than {}".format(min_num_of_samples))

    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if self.verbose:
        print "step size is {}".format(self.stepSize)

    if self.region:
        # in case a region is used, append the tile size
        self.region += ":{}".format(self.binLength)

    # use map reduce to call countReadsInRegions_wrapper
    imap_res = mapReduce.mapReduce([],
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   self_=self,
                                   genomeChunkLength=chunkSize,
                                   bedFile=self.bedFile,
                                   blackListFileName=self.blackListFileName,
                                   region=self.region,
                                   numberOfProcessors=self.numberOfProcessors)

    if self.out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bam files\n")

        # concatenate intermediary bedgraph files
        for _values, tempFileName in imap_res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                shutil.copyfileobj(open(tempFileName, 'r'), self.out_file_for_raw_data)
                os.remove(tempFileName)

        self.out_file_for_raw_data.close()

    try:
        num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
        return num_reads_per_bin
    except ValueError:
        if self.bedFile:
            sys.exit('\nNo coverage values could be computed.\n\n'
                     'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                     'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                     'contain mapped reads.')

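# Worked example (hypothetical human-sized genome, not from the source) of
# the step-size rule above: sampling one million positions from a 3.1 Gb
# genome places the sampled bins about 3.1 kb apart.
genome_size = 3100000000
number_of_samples = 1000000
sampled_step = max(int(float(genome_size) / number_of_samples), 1)
assert sampled_step == 3100
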
def get_read_and_fragment_length(bamFile, return_lengths=False,
                                 blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    tuple
        Tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc.
        values.
    """
    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    # the loop below halves distanceBetweenBins before each round,
    # so double it up front to start at the requested spacing
    distanceBetweenBins *= 2
    fl = []
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict

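# A minimal sketch (hypothetical round count, not from the source) of the
# sampling back-off above: the spacing between sampled bins starts at 1 Mb,
# and every round that yields fewer than 1000 fragments halves it, so the
# genome is sampled ever more densely until enough fragments are seen.
distance = 1000000 * 2
spacings = []
for _round in range(4):  # pretend four rounds were needed
    distance //= 2
    spacings.append(distance)
assert spacings == [1000000, 500000, 250000, 125000]
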
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None, samFlag=None):
    r"""
    This function visits a number of sites and returns a matrix containing
    read counts. Each row corresponds to one sampled site and each column
    to one of the bam files.

    If chrsToSkip is given, counts from those chromosomes are filtered out.
    Unless a female sample is used, counts on such chromosomes (e.g. the X
    chromosome) are lower than on autosomes. For most applications this is
    irrelevant, but for other cases, like estimating the best scaling
    factor, it is important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])
    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin

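# Hypothetical call illustrating samFlag above (the bam file names and the
# sampling parameters are placeholders, not from the source): SAM flag 64
# (0x40) marks the first mate of a pair, so only first mates contribute
# to the counts here.
first_mate_counts = getNumReadsPerBin(['sample1.bam', 'sample2.bam'],
                                      50, 10000, 300,
                                      numberOfProcessors=4, samFlag=64)
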
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None):
    r"""
    This function visits a number of sites and returns a matrix containing
    read counts. Each row corresponds to one sampled site and each column
    to one of the bam files.

    If chrsToSkip is given, counts from those chromosomes are filtered out.
    Unless a female sample is used, counts on such chromosomes (e.g. the X
    chromosome) are lower than on autosomes. For most applications this is
    irrelevant, but for other cases, like estimating the best scaling
    factor, it is important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])
    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    num_reads_per_bin = np.concatenate(imap_res, axis=0)
    return num_reads_per_bin

def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True,
                  smoothLength=0, fixed_step=False):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([(test.bamFile1, 'bam')], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()
    """
    bigwig_info = cfg.config.get('external_tools', 'bigwig_info')
    bamHandlers = [openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if all bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            inBlock = False
            for line in os.popen(
                    "{} -chroms {}".format(bigwig_info, bw)).readlines():
                if line[0:10] == "chromCount":
                    inBlock = True
                    continue
                if line[0:5] == "bases":
                    break
                if inBlock:
                    chromName, _, size = line.strip().split(" ")
                    size = int(size)
                    if chromName in chromNamesAndSize:
                        cCommon.append(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print "\nWARNING\n" \
                                "Chromosome {} lengths reported in the " \
                                "bigwig files differ.\n{} for {}\n" \
                                "{} for {}.\n\nThe smallest " \
                                "length will be used".format(
                                    chromName, chromNamesAndSize[chromName],
                                    bigwigs[0], size, bw)
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

def writeBedGraph(bamFilesList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, zerosToNans=True, smoothLength=0,
                  minMappingQuality=None, ignoreDuplicates=False,
                  fragmentFromRead_func=None):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()
    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if all bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamFilesList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               zerosToNans, minMappingQuality,
                               ignoreDuplicates,
                               fragmentFromRead_func),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

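# Hedged sketch of the func/funcArgs contract implied by the doctests above:
# a tile-level transform that receives the coverage computed for a tile plus
# the funcArgs dictionary and returns the value written to the bedgraph.
# This mirrors how scaleCoverage with funcArgs={'scaleFactor': 1.0} appears
# to be used, but it is an assumption, not the actual deepTools
# implementation.
def scaleCoverageSketch(tileCoverage, args):
    # multiply the tile coverage by a user-supplied scale factor
    return args['scaleFactor'] * tileCoverage
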
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None, ignoreDuplicates=False,
                      chrsToSkip=[], stepSize=None, samFlag=None):
    r"""
    This function collects read counts (coverage) from several bam files
    and returns a numpy array with the results. This function does not
    explicitly do the coverage computation; instead it divides the work
    into smaller chunks that are sent to individual processors.

    Parameters
    ----------
    bamFilesList : list
        List containing the names of indexed bam files. E.g.
        ['file1.bam', 'file2.bam']

    binLength : int
        Length of the window/bin. This value is overruled by ``bedFile``
        if present.

    numberOfSamples : int
        Total number of samples. The genome is divided into
        ``numberOfSamples``, each with a window/bin length equal to
        ``binLength``. This value is overruled by ``stepSize`` in case
        such a value is present and by ``bedFile``, in which case the
        number of samples and bins are defined in the bed file.

    defaultFragmentLength : int
        fragment length to extend reads that are not paired. Paired reads
        are extended to the fragment length defined by the mate distance.
        For Illumina reads, usual values are around 300. This value can
        be determined using the peak caller MACS2 or can be approximated
        by the fragment lengths computed when preparing the library for
        sequencing.

    numberOfProcessors : int
        Number of processors to use. Default is 1.

    skipZeros : bool
        Default is True. This option decides if regions having zero
        coverage in all bam files should be skipped or kept.

    verbose : bool
        Output messages. Default: False

    region : str
        Region to limit the computation in the form chrom:start:end.

    bedFile : str
        Name of a bed file containing the regions for which to compute
        the coverage. This option overrules ``binLength``,
        ``numberOfSamples`` and ``stepSize``.

    extendPairedEnds : bool
        Whether coverage should be computed for the extended read length
        (i.e. the region covered by the two mates or the regions expected
        to be covered by single reads). Default: true

    minMappingQuality : int
        Reads with a mapping quality less than the given value are not
        considered. Default: None

    ignoreDuplicates : bool
        Whether read duplicates (same start and end position; if
        paired-end, same start-end for mates) are to be excluded.
        Default: false

    chrsToSkip : list
        List with names of chromosomes that should not be included in the
        coverage computation. This is useful to remove unwanted
        chromosomes (e.g. 'random' or 'Het').

    stepSize : int
        the positions for which the coverage is computed are defined as
        follows: ``range(start, end, stepSize)``. Thus, a stepSize of 1
        will compute the coverage at each base pair. If the stepSize is
        equal to the binLength, then the coverage is computed for
        consecutive bins. If stepSize is smaller than the binLength, then
        the bins will overlap.

    samFlag : int
        If given, only reads having such a flag are considered. For
        example, to get only reads that are the first mates, a samFlag
        of 64 could be used. Similarly, the samFlag can be used to select
        only reads mapping on the forward (or reverse) strand or to get
        only properly paired reads.

    Returns
    -------
    numpy array
        Each row corresponds to a bin/bed region and each column to one
        of the bam files. If ``skipZeros`` is used, then the result may
        have fewer rows than expected.

    Examples
    --------
    The test data contains reads for 200 bp.

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam
    file 1.

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """
    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.
    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # skip chromosomes in the list. This is usually for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)
    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])
    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))
    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin

def writeBedGraph(bamOrBwFileList, outputFileName, fragmentLength,
                  func, funcArgs, tileSize=25, region=None,
                  numberOfProcessors=None, format="bedgraph",
                  extendPairedEnds=True, missingDataAsZero=False,
                  smoothLength=0, fixed_step=False):
    r"""
    Given a list of bam files, a function and its arguments,
    this method writes a bedgraph (or bigwig) file for a partition of the
    genome into tiles of a given size and a value for each tile that
    corresponds to the given function and that is related to the coverage
    underlying the tile.

    >>> test = Tester()
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph([(test.bamFile1, 'bam')], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()
    """
    bigwig_info = cfg.config.get('external_tools', 'bigwig_info')
    bamHandlers = [openBam(indexedFile)
                   for indexedFile, fileFormat in bamOrBwFileList
                   if fileFormat == 'bam']
    if len(bamHandlers):
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if all bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [fileName for fileName, fileFormat in bamOrBwFileList
                   if fileFormat == 'bigwig']
        cCommon = []
        chromNamesAndSize = {}
        for bw in bigwigs:
            inBlock = False
            for line in os.popen(
                    "{} -chroms {}".format(bigwig_info, bw)).readlines():
                if line[0:10] == "chromCount":
                    inBlock = True
                    continue
                if line[0:5] == "bases":
                    break
                if inBlock:
                    chromName, _, size = line.strip().split(" ")
                    size = int(size)
                    if chromName in chromNamesAndSize:
                        cCommon.append(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print "\nWARNING\n" \
                                "Chromosome {} lengths reported in the " \
                                "bigwig files differ.\n{} for {}\n" \
                                "{} for {}.\n\nThe smallest " \
                                "length will be used".format(
                                    chromName, chromNamesAndSize[chromName],
                                    bigwigs[0], size, bw)
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.iteritems()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList,
                               func, funcArgs, extendPairedEnds, smoothLength,
                               missingDataAsZero, fixed_step),
                              writeBedGraph_wrapper,
                              chromNamesAndSize,
                              genomeChunkLength=genomeChunkLength,
                              region=region,
                              numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files
    outFile = open(outputFileName + ".bg", 'wb')
    for tempFileName in res:
        if tempFileName:
            # concatenate all intermediate tempfiles into one bedgraph file
            shutil.copyfileobj(open(tempFileName, 'rb'), outFile)
            os.remove(tempFileName)

    bedGraphFile = outFile.name
    outFile.close()
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print "output file: %s" % (outputFileName)
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, False)
        if debug:
            print "output file: %s" % (outputFileName)
        os.remove(bedGraphFile)

def get_read_and_fragment_length(bamFile, return_lengths=False,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    tuple
        Tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc.
        values.
    """
    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    chunk_size = int(float(sum(bam_handle.lengths)) * 0.3 /
                     max(numberOfProcessors, len(bam_handle.lengths)))
    # avoid overly small chunk sizes when splitting the computation
    chunk_size = max(chunk_size, 100000)
    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)
    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None
        # guard against fragment_len_dict being None (e.g. single-end data)
        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict