def getFragmentLength_worker(chrom, start, end, bamFile):
    """
    Queries the reads at the given region for the distance between
    reads and the read length
    :param chrom: chromosome name
    :param start: int region start
    :param end:  int region end
    :param bamFile: bamfile name
    :return: np.array, where first column is fragment length, the
     second is for read length
    """
    bam = bamHandler.openBam(bamFile)
    end = min(end, start + 50000)
    if chrom in bam.references:
        reads = np.array([(abs(r.template_length), r.query_length)
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1])
        if not len(reads):
            # if the previous operation produced an empty list,
            # the data may not be paired; retry without the
            # proper-pair filter
            reads = np.array([(abs(r.template_length), r.query_length)
                              for r in bam.fetch(chrom, start, end)])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        reads = np.array([]).reshape(0, 2)

    return reads
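
A minimal usage sketch for this worker. The file name and region are placeholders, and the deeptools import path is an assumption; the worker only needs an indexed, coordinate-sorted BAM:

import numpy as np
from deeptools import bamHandler  # assumed import path for the helper above

# 'example.bam' is a placeholder, not a file shipped with the project
lengths = getFragmentLength_worker('chr2', 0, 50000, 'example.bam')
if len(lengths):
    # column 0 holds fragment (template) lengths, column 1 read lengths
    print "median fragment length: {}".format(np.median(lengths[:, 0]))
    print "median read length: {}".format(np.median(lengths[:, 1]))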
Example #2
def peFragmentSize(bamFile, bamFileIndex=None,
                   return_lengths=False,
                   numberOfProcessors=None, verbose=False):
    """
    Estimates the paired-end fragment length distribution by sampling
    the BAM file in parallel chunks; returns a summary dict, or None if
    no fragments were sampled.
    """
    bamHandle = bamHandler.openBam(bamFile, bamFileIndex)
    chromSizes = zip(bamHandle.references, bamHandle.lengths)

    chunkSize = int(
        float(sum(bamHandle.lengths)) * 0.3 / max(numberOfProcessors,
                                                  len(bamHandle.lengths)))
    imap_res = mapReduce.mapReduce((bamHandle.filename, ),
                                   getFragmentLength_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)
    if len(fl):
        fragLength = {'sample_size': len(fl),
                      'min': fl.min(),
                      'qtile25': np.percentile(fl, 25),
                      'mean': np.mean(fl),
                      'median': np.median(fl),
                      'qtile75': np.percentile(fl, 75),
                      'max': fl.max(),
                      'std': np.std(fl)}
        if return_lengths:
            fragLength['lengths'] = fl
    else:
        fragLength = None
    return fragLength
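
A hedged usage sketch for peFragmentSize; 'example.bam' is a placeholder and the keys are the ones built above:

stats = peFragmentSize('example.bam', numberOfProcessors=4)
if stats is None:
    print "no properly paired reads were sampled"
else:
    # available keys: sample_size, min, qtile25, mean, median, qtile75, max, std
    print "median fragment length: {}".format(stats['median'])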
Example #3
def getFragmentLength_worker(chrom, start, end, bamFile):
    # Single-column variant: returns only the template lengths of
    # properly paired read-1 reads in the region.
    bam = bamHandler.openBam(bamFile)
    end = min(end, start + 50000)
    reads = np.array([])
    if chrom in bam.references:
        reads = np.array([abs(r.tlen)
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    return reads
Example #4
 def getRead(self, readType):
     """ prepare arguments for test
     """
     bam = bamHandler.openBam(self.bamFile_PE)
     if readType == 'paired-reverse':
         read = [x for x in bam.fetch('chr2', 5000081, 5000082)][0]
     elif readType == 'single-forward':
         read = [x for x in bam.fetch('chr2', 5001491, 5001492)][0]
     elif readType == 'single-reverse':
         read = [x for x in bam.fetch('chr2', 5001700, 5001701)][0]
     else:  # by default a forward paired read is returned
         read = [x for x in bam.fetch('chr2', 5000027, 5000028)][0]
     return read
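
A sketch of how the test suite might consume this helper; the assertions are illustrative, but the flags are real pysam AlignedSegment attributes:

# hypothetical test body; self is the test-case instance that defines getRead
read = self.getRead('paired-reverse')
assert read.is_paired and read.is_reverse
forward_read = self.getRead('single-forward')
assert not forward_read.is_reverse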
Example #6
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array where the first column is the fragment length
        and the second is the read length
    """
    bam = bamHandler.openBam(bamFile)
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        reads = np.array([(abs(r.template_length), r.infer_query_length())
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1])
        if not len(reads):
            # if the previous operation produced an empty list,
            # the data may not be paired; retry without the
            # proper-pair filter
            reads = np.array([(abs(r.template_length), r.query_length)
                              for r in bam.fetch(chrom, start, end)])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        reads = np.array([]).reshape(0, 2)

    return reads
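
The getFragmentLength_wrapper handed to mapReduce.mapReduce by the callers in this section is not shown here. deepTools wrappers conventionally unpack the single argument tuple that mapReduce assembles and forward it to the worker; a plausible minimal sketch (the exact argument order is an assumption):

def getFragmentLength_wrapper(args):
    # args is the tuple built by mapReduce:
    # (chrom, start, end, bamFile, distanceBetweenBins)
    return getFragmentLength_worker(*args)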
Example #8
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False,
                               extendPairedEnds=True,
                               minMappingQuality=None,
                               ignoreDuplicates=False,
                               samFlag=None,
                               bedRegions=None
                               ):
    """ counts the reads in each bam file at each 'stepSize' position
    within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    between each other and are  not one directly next *after* the other.

    If a list of bedRegions is given, then the number of reads
    that overlap each region is counted.

    The result is a numpy array.
    >>> test = Tester()

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to true, those cases in which *all* of the
    bamfiles have zero counts for a certain bin are ignored
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 200, 200, 0))
    array([[ 2.],
           [ 4.]])

    Test min mapping quality
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, minMappingQuality=40))
    array([[ 0.,  0.,  0.,  1.],
           [ 0.,  0.,  0.,  1.]])

    Test ignore duplicates
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, ignoreDuplicates=True))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  1.]])

    Test bed regions:
    >>> bedRegions = [(test.chrom, 10, 20), (test.chrom, 150, 160)]
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 0, 200, 0, bedRegions=bedRegions))
    array([[ 0.,  1.],
           [ 0.,  2.]])

    """

    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    # array to keep the read counts for the regions
    subNum_reads_per_bin = []

    rows = 0
    startTime = time.time()
    extendPairedEnds = True
    zerosToNans = False

    bamHandlers = [bamHandler.openBam(bam) for bam in bamFilesList]

    regionsToConsider = []

    if bedRegions:
        for chrom, start, end in bedRegions:
            regionsToConsider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, stepSize):
            if i + binLength > end:
                break
            regionsToConsider.append((chrom, i, i + binLength, binLength))

    for chrom, start, end, binLength in regionsToConsider:
        avgReadsArray = []
        for bam in bamHandlers:
            avgReadsArray.append(
                getCoverageOfRegion(bam,
                                    chrom, start, end,
                                    binLength,
                                    defaultFragmentLength,
                                    extendPairedEnds,
                                    zerosToNans,
                                    minMappingQuality=minMappingQuality,
                                    ignoreDuplicates=ignoreDuplicates,
                                    samFlag=samFlag
                                    )[0])
        # skip if any of the bam files returns a NaN
        if np.isnan(sum(avgReadsArray)):
            continue

        if skipZeros and sum(avgReadsArray) == 0:
            continue
        subNum_reads_per_bin.extend(avgReadsArray)
        rows += 1

    if debug:
        endTime = time.time()
        print "%s countReadsInRegions_worker: processing %d " \
            "(%.1f per sec) @ %s:%s-%s"  % \
            (multiprocessing.current_process().name,
             rows, rows / (endTime - startTime), chrom, start, end )

    return np.array(subNum_reads_per_bin).reshape(rows, len(bamFilesList))
Example #9
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False):
    """ counts the reads in each bam file at each 'stepSize' position
    within the interval start, end 
    for a 'binLength' window.
    The idea is to get counts for window positions at different positions
    for sampling. That is why the bins are not consecutive. 
    The result is a list of tuples. 
    >>> test = Tester()

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200, [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to true, those cases in which *all* of the
    bamfiles have zero counts for a certain bin are ignored
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200, [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200, [test.bamFile1, test.bamFile2], 200, 200, 0))
    array([[ 2.],
           [ 4.]])

     """

    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    # array to keep the read counts for the regions
    subNum_reads_per_bin = []

    rows = 0
    startTime = time.time()
    extendPairedEnds = True
    zerosToNans = False
    
    bamHandlers = [bamHandler.openBam(bam) for bam in bamFilesList]
    for i in xrange(start, end, stepSize):
        if i + binLength > end:
            break
        avgReadsArray = []
        for bam in bamHandlers:
            avgReadsArray.append(
                getCoverageOfRegion(bam,
                                    chrom, i, i + binLength,
                                    binLength,
                                    defaultFragmentLength,
                                    extendPairedEnds,
                                    zerosToNans)[0])

        if np.isnan(sum(avgReadsArray)):
            continue

        if skipZeros and sum(avgReadsArray) == 0:
            continue

        subNum_reads_per_bin.extend(avgReadsArray)

        rows += 1

    if debug:
        endTime = time.time()
        print "%s countReadsInRegions_worker: processing %d (%.1f per sec) @ %s:%s-%s" % \
            (multiprocessing.current_process().name,
             rows, rows / (endTime - startTime), chrom, start, end)

    return np.array(subNum_reads_per_bin).reshape(rows, len(bamFilesList))
Example #10
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False,
                               extendPairedEnds=True,
                               minMappingQuality=None,
                               ignoreDuplicates=False,
                               samFlag=None,
                               bedRegions=None
                               ):
    """Counts the reads in each bam file at each 'stepSize' position
    within the interval (start, end) for a window or bin of size binLength.
    Because the idea is to get counts for window/bin positions at

    The stepSize controls the distance between bins. For example,
    a step size of 20 and a bin size of 20 will create bins next to
    each other. if the step size is smaller than the bin size the
    bins will overlap.


    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.


    Parameters
    ----------
    chrom : str
        Chrom name
    start : int
        start coordinate
    end : int
        end coordinate
    bamFilesList : list
        List of names of indexed bam files.
    stepSize : int
        the positions for which the coverage is computed are defined as follows:
        ``range(start, end, stepSize)``. Thus, a stepSize of 1 will compute
        the coverage at each base pair. If the stepSize is equal to the
        binLength then the coverage is computed for consecutive bins. If stepSize is
        smaller than the binLength, then the bins will overlap.
    binLength : int
        length of the window/bin
    defaultFragmentLength : int
        see :meth:`deepTools.countReadsPerBin.getFragmentFromRead` method

    Returns
    -------
    numpy array
        The result is a numpy array with one row per bin
        and one column per BAM file.


    Examples
    --------
    Initialize some useful values

    >>> test = Tester()

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to true, those cases in which *all* of the
    bamfiles have zero counts for a certain bin are ignored.

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])


    """

    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    # array to keep the read counts for the regions
    subNum_reads_per_bin = []

    rows = 0
    startTime = time.time()
    extendPairedEnds = True
    zerosToNans = False

    bamHandlers = [bamHandler.openBam(bam) for bam in bamFilesList]

    regionsToConsider = []
    if bedRegions:
        for chrom, start, end in bedRegions:
            regionsToConsider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, stepSize):
            if i + binLength > end:
                break
            regionsToConsider.append((chrom, i, i + binLength, binLength))

    for chrom, start, end, binLength in regionsToConsider:
        avgReadsArray = []
        for bam in bamHandlers:
            avgReadsArray.append(
                getCoverageOfRegion(bam,
                                    chrom, start, end,
                                    binLength,
                                    defaultFragmentLength,
                                    extendPairedEnds,
                                    zerosToNans,
                                    minMappingQuality=minMappingQuality,
                                    ignoreDuplicates=ignoreDuplicates,
                                    samFlag=samFlag
                                    )[0])
        # skip if any of the bam files returns a NaN
        if np.isnan(sum(avgReadsArray)):
            continue

        if skipZeros and sum(avgReadsArray) == 0:
            continue
        subNum_reads_per_bin.extend(avgReadsArray)
        rows += 1

    if debug:
        endTime = time.time()
        print "%s countReadsInRegions_worker: processing %d " \
            "(%.1f per sec) @ %s:%s-%s"  % \
            (multiprocessing.current_process().name,
             rows, rows / (endTime - startTime), chrom, start, end )

    return np.array(subNum_reads_per_bin).reshape(rows, len(bamFilesList))
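
The stepSize/binLength relationship described in the docstring falls directly out of the bin-generation loop; a small standalone illustration with arbitrary numbers:

# stepSize (50) larger than binLength (25): equally spaced, non-adjacent bins
bins = [(i, i + 25) for i in xrange(0, 200, 50)]
# -> [(0, 25), (50, 75), (100, 125), (150, 175)]

# stepSize (25) smaller than binLength (50): overlapping bins;
# the worker breaks out of the loop once i + binLength > end
overlapping = [(i, i + 50) for i in xrange(0, 200, 25) if i + 50 <= 200]
# -> [(0, 50), (25, 75), ..., (150, 200)]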
Example #11
    def run(self):
        # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
        # workers for analysis. If too short, too much time is spent loading the files;
        # if too long, some processors end up idle.
        # The following values are empirical.

        bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose)

        # Skip the chromosomes in the list. This is usually done for the
        # X chromosome, which may be present in one copy in a male sample
        # or in a male/female mixture and is therefore unreliable.
        # The skip list may also contain heterochromatic regions and
        # mitochondrial DNA.
        if len(self.chrsToSkip):
            chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = zip(*chromSizes)

        genomeSize = sum(chrLengths)
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

        # a larger number of samples gives a more reliable estimate
        if np.mean(chrLengths) < self.stepSize:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

        max_mapped = max([x.mapped for x in bamFilesHandlers])

        reads_per_bp = float(max_mapped) / genomeSize
        # chunkSize =  int(100 / ( reads_per_bp  * len(bamFilesList)) )

        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
        [bam_h.close() for bam_h in bamFilesHandlers]

        if self.verbose:
            print "step size is {}".format(self.stepSize)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromSizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    shutil.copyfileobj(open(tempFileName, 'r'), self.out_file_for_raw_data)
                    os.remove(tempFileName)

            # self.out_file_for_raw_data.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
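
A worked example of the chunkSize heuristic used in run(); all numbers are illustrative:

# 100 Mb genome, 10 million mapped reads in the deepest file,
# two BAM files and a step size of 1000 bp
genomeSize = 100e6
max_mapped = 10e6
reads_per_bp = float(max_mapped) / genomeSize      # 0.1
chunkSize = int(1000 * 1e3 / (reads_per_bp * 2))   # 5,000,000 bp per worker chunk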
Example #12
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            If no bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array with one row per bin
            and one column per BAM file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[ 0.,  0.],
               [ 0.,  1.],
               [ 1.,  1.],
               [ 1.,  2.]])

        """

        if start > end:
            raise NameError("start %d bigger than end %d" % (start, end))

        if self.stepSize is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        rows = 0
        start_time = time.time()

        bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList]

        regionsToConsider = []
        if bed_regions_list is not None:
            for chrom, start, end in bed_regions_list:
                regionsToConsider.append((chrom, start, end, end - start))
        else:
            for i in xrange(start, end, self.stepSize):
                if i + self.binLength > end:
                    break
                regionsToConsider.append((chrom, i, i + self.binLength, self.binLength))

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        for chrom, start, end, region_length in regionsToConsider:
            coverage_array = []
            for bam in bam_handlers:
                coverage_array.append(
                    self.get_coverage_of_region(bam, chrom, start, end, region_length)[0])

            subnum_reads_per_bin.extend(coverage_array)
            rows += 1

            if self.save_data:
                _file.write("\t".join(map(str, [chrom, start, end])) + "\t")
                _file.write("\t".join(["{}".format(x) for x in coverage_array]) + "\n")

        if self.verbose:
            endTime = time.time()
            print "%s countReadsInRegions_worker: processing %d " \
                  "(%.1f per sec) @ %s:%s-%s" % \
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end)
        if self.save_data:
            _file.close()

        return np.array(subnum_reads_per_bin).reshape(rows, len(self.bamFilesList)), _file_name
Example #13
    def count_reads_in_region(self, chrom, start, end, bed_regions_list=None):
        """Counts the reads in each bam file at each 'stepSize' position
        within the interval (start, end) for a window or bin of size binLength.

        The stepSize controls the distance between bins. For example,
        a step size of 20 and a bin size of 20 will create bins next to
        each other. If the step size is smaller than the bin size the
        bins will overlap.

        If a list of bedRegions is given, then the number of reads
        that overlaps with each region is counted.

        Parameters
        ----------
        chrom : str
            Chrom name
        start : int
            start coordinate
        end : int
            end coordinate
        bed_regions_list: list
            List of tuples of the form (chrom, start, end)
            corresponding to bed regions to be processed.
            If no bed file was passed to the object constructor
            then this list is empty.

        Returns
        -------
        numpy array
            The result is a numpy array with one row per bin
            and one column per BAM file.


        Examples
        --------
        Initialize some useful values

        >>> test = Tester()
        >>> c = CountReadsPerBin([test.bamFile1, test.bamFile2], 25, 0, stepSize=50)

        The transpose is used to get better looking numbers. The first line
        corresponds to the number of reads per bin in the first bamfile.

        >>> _array, __ = c.count_reads_in_region(test.chrom, 0, 200)
        >>> _array
        array([[ 0.,  0.],
               [ 0.,  1.],
               [ 1.,  1.],
               [ 1.,  2.]])

        """

        if start > end:
            raise NameError("start %d bigger than end %d" % (start, end))

        if self.stepSize is None:
            raise ValueError("stepSize is not set!")
        # array to keep the read counts for the regions
        subnum_reads_per_bin = []

        rows = 0
        start_time = time.time()

        bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList]

        regionsToConsider = []
        if bed_regions_list is not None:
            for chrom, start, end in bed_regions_list:
                if mapReduce.blOverlap(self.blackList, chrom, [start, end]):
                    continue
                regionsToConsider.append((chrom, start, end, end - start))
        else:
            for i in xrange(start, end, self.stepSize):
                if i + self.binLength > end:
                    break
                if mapReduce.blOverlap(self.blackList, chrom, [i, i + self.binLength]):
                    continue
                regionsToConsider.append((chrom, i, i + self.binLength, self.binLength))

        if self.save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name
        else:
            _file_name = ''

        for chrom, start, end, region_length in regionsToConsider:
            coverage_array = []
            for bam in bam_handlers:
                coverage_array.append(
                    self.get_coverage_of_region(bam, chrom, start, end, region_length)[0])

            subnum_reads_per_bin.extend(coverage_array)
            rows += 1

            if self.save_data:
                _file.write("\t".join(map(str, [chrom, start, end])) + "\t")
                _file.write("\t".join(["{}".format(x) for x in coverage_array]) + "\n")

        if self.verbose:
            endTime = time.time()
            print "%s countReadsInRegions_worker: processing %d " \
                  "(%.1f per sec) @ %s:%s-%s" % \
                  (multiprocessing.current_process().name,
                   rows, rows / (endTime - start_time), chrom, start, end)
        if self.save_data:
            _file.close()

        return np.array(subnum_reads_per_bin).reshape(rows, len(self.bamFilesList)), _file_name
Example #14
def get_read_and_fragment_length(bamFile,
                                 return_lengths=False,
                                 blackListFileName=None,
                                 binSize=50000,
                                 distanceBetweenBins=1000000,
                                 numberOfProcessors=None,
                                 verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool
    binSize : int
    distanceBetweenBins : int

    Returns
    -------
    d : tuple
        A tuple of two dictionaries, one for the fragment lengths and the
        other for the read lengths. The dictionaries summarise the mean,
        median, etc. values.
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    distanceBetweenBins *= 2
    fl = []
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce(
            (bam_handle.filename, distanceBetweenBins),
            getFragmentLength_wrapper,
            chrom_sizes,
            genomeChunkLength=stepsize,
            blackListFileName=blackListFileName,
            numberOfProcessors=numberOfProcessors,
            verbose=verbose)

        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {
                'sample_size': len(fragment_length),
                'min': fragment_length.min(),
                'qtile25': np.percentile(fragment_length, 25),
                'mean': np.mean(fragment_length),
                'median': np.median(fragment_length),
                'qtile75': np.percentile(fragment_length, 75),
                'max': fragment_length.max(),
                'std': np.std(fragment_length)
            }
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {
            'sample_size': len(read_length),
            'min': read_length.min(),
            'qtile25': np.percentile(read_length, 25),
            'mean': np.mean(read_length),
            'median': np.median(read_length),
            'qtile75': np.percentile(read_length, 75),
            'max': read_length.max(),
            'std': np.std(read_length)
        }
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
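
A hedged usage sketch for get_read_and_fragment_length; 'example.bam' is a placeholder:

frag_stats, read_stats = get_read_and_fragment_length('example.bam',
                                                      numberOfProcessors=4)
if frag_stats is None:
    # no properly paired reads were sampled (e.g. single-end data)
    print "no fragment length estimate available"
else:
    print "fragment length: median={}, std={}".format(frag_stats['median'],
                                                      frag_stats['std'])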
Example #16
    def run(self):
        # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
        # workers for analysis. If too short, too much time is spent loading the files;
        # if too long, some processors end up idle.
        # The following values are empirical.

        bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose)

        # Skip the chromosomes in the list. This is usually done for the
        # X chromosome, which may be present in one copy in a male sample
        # or in a male/female mixture and is therefore unreliable.
        # The skip list may also contain heterochromatic regions and
        # mitochondrial DNA.
        if len(self.chrsToSkip):
            chromSizes = [x for x in chromSizes if x[0] not in self.chrsToSkip]

        chrNames, chrLengths = zip(*chromSizes)

        genomeSize = sum(chrLengths)
        if self.stepSize is None:
            if self.region is None:
                self.stepSize = max(int(float(genomeSize) / self.numberOfSamples), 1)
            else:
                # compute the step size, based on the number of samples
                # and the length of the region studied
                (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3]
                self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1)

        # a larger number of samples gives a more reliable estimate
        if np.mean(chrLengths) < self.stepSize:
            min_num_of_samples = int(genomeSize / np.mean(chrLengths))
            raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples))

        max_mapped = max([x.mapped for x in bamFilesHandlers])

        reads_per_bp = float(max_mapped) / genomeSize
        # chunkSize =  int(100 / ( reads_per_bp  * len(bamFilesList)) )

        chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
        [bam_h.close() for bam_h in bamFilesHandlers]

        if self.verbose:
            print "step size is {}".format(self.stepSize)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # use map reduce to call countReadsInRegions_wrapper
        imap_res = mapReduce.mapReduce([],
                                       countReadsInRegions_wrapper,
                                       chromSizes,
                                       self_=self,
                                       genomeChunkLength=chunkSize,
                                       bedFile=self.bedFile,
                                       blackListFileName=self.blackListFileName,
                                       region=self.region,
                                       numberOfProcessors=self.numberOfProcessors)

        if self.out_file_for_raw_data:
            if len(non_common):
                sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                                 "the chromosomes that were not common between the bigwig files\n")

            # concatenate intermediary bedgraph files
            for _values, tempFileName in imap_res:
                if tempFileName:
                    # concatenate all intermediate tempfiles into one
                    shutil.copyfileobj(open(tempFileName, 'r'), self.out_file_for_raw_data)
                    os.remove(tempFileName)

            self.out_file_for_raw_data.close()

        try:
            num_reads_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
            return num_reads_per_bin

        except ValueError:
            if self.bedFile:
                sys.exit('\nNo coverage values could be computed.\n\n'
                         'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                         'The valid chromosome names are:\n{}'.format(chrNames))
            else:
                sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                         'contain mapped reads.')
Example #17
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None,
                      ignoreDuplicates=False,
                      chrsToSkip=[],
                      stepSize=None,
                      samFlag=None):

    r"""
    This function collects read counts (coverage) from several bam files and returns
    a numpy array with the results. The function does not explicitly do the
    coverage computation; instead it divides the work into smaller chunks that are
    sent to individual processors.

    Parameters
    ----------
    bamFilesList : list
        List containing the names of indexed bam files. E.g. ['file1.bam', 'file2.bam']

    binLength : int
        Length of the window/bin. This value is overruled by ``bedFile`` if present.

    numberOfSamples : int
        Total number of samples. The genome is divided into ``numberOfSamples``
        windows, each of length ``binLength``. This value is overruled by
        ``stepSize`` if present, and by ``bedFile``, in which case the number
        of samples and bins is defined in the bed file.

    defaultFragmentLength : int
        fragment length to extend reads that are not paired. Paired reads are extended to
        the fragment length defined by the mate distance. For Illumina reads, usual values
        are around 300. This value can be determined using the peak caller MACS2 or can be
        approximated by the fragment lengths computed when preparing the library for sequencing.

    numberOfProcessors : int
        Number of processors to use. Default is 1.

    skipZeros : bool
        Default is True. This option decides if regions having zero coverage in all bam files
        should be skipped or kept.

    verbose : bool
        Output messages. Default: False

    region : str
        Region to limit the computation in the form chrom:start:end.

    bedFile : str
        Name of a bed file containing the regions for which to compute the coverage. This option
        overrules ``binLength``, ``numberOfSamples`` and ``stepSize``.
    extendPairedEnds : bool
        Whether coverage should be computed for the extended read length (i.e. the region covered
        by the two mates or the regions expected to be covered by single-reads). Default: true

    minMappingQuality : int
        Reads with a mapping quality less than the given value are not considered. Default: None

    ignoreDuplicates : bool
        Whether read duplicates (same start and end position; if paired-end,
        same start/end for both mates) are to be excluded. Default: false

    chrsToSkip : list
        List of chromosome names to exclude from the coverage computation.
        This is useful to remove unwanted chromosomes (e.g. 'random' or 'Het').

    stepSize : int
        the positions for which the coverage is computed are defined as follows:
        ``range(start, end, stepSize)``. Thus, a stepSize of 1 will compute
        the coverage at each base pair. If the stepSize is equal to the
        binLength then the coverage is computed for consecutive bins. If stepSize is
        smaller than the binLength, then the bins will overlap.

    samFlag : int
        If given, only reads having such flag are considered. For example, to get only
        reads that are the first mates a samFlag of 64 could be used. Similarly, the
        samFlag can be used to select only reads mapping on the forward (or reverse) strand
        or to get only properly paired reads.

    Returns
    -------
    numpy array

        Each row corresponds to a bin/bed region and each column to one of
        the bamFiles. If ``skipZeros`` is used, then the result may have
        fewer rows than expected


    Examples
    --------

    The test data contains reads for 200 bp.

    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1

    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])
    """

    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # Skip the chromosomes in the list. This is usually done for the
    # X chromosome, which may be present in one copy in a male sample
    # or in a male/female mixture and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)

    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))

    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin
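
In deepTools this count matrix typically feeds downstream sample-correlation plots; a minimal sketch of that use (the file names are placeholders):

counts = getNumReadsPerBin(['file1.bam', 'file2.bam'],
                           binLength=50, numberOfSamples=10000,
                           defaultFragmentLength=200, skipZeros=True)
# rows are sampled bins, columns are BAM files; the Pearson correlation
# between the two samples is then a single np.corrcoef call
corr = np.corrcoef(counts[:, 0], counts[:, 1])[0, 1]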
Example #18
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None,
                      ignoreDuplicates=False,
                      chrsToSkip=[],
                      stepSize=None):

    r"""
    This function visits a number of sites and returns a matrix containing read
    counts. Each row corresponds to one sampled site and each column to one of
    the bamFiles.

    If chrsToSkip is given, counts from those chromosomes are filtered out;
    unless a female sample is used, their counts are lower than those of the
    autosomes. For most applications this is irrelevant, but for others, like
    estimating the best scaling factor, it is important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1
    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """

    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # Skip the chromosomes in the list. This is usually done for the
    # X chromosome, which may be present in one copy in a male sample
    # or in a male/female mixture and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)

    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))

    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    num_reads_per_bin = np.concatenate(imap_res, axis=0)
    return num_reads_per_bin
Example #19
def getNumReadsPerBin(bamFilesList, binLength, numberOfSamples,
                      defaultFragmentLength, numberOfProcessors=1,
                      skipZeros=True, verbose=False, region=None,
                      bedFile=None, extendPairedEnds=True,
                      minMappingQuality=None,
                      ignoreDuplicates=False,
                      chrsToSkip=[],
                      stepSize=None,
                      samFlag=None):

    r"""
    This function visits a number of sites and returns a matrix containing read
    counts. Each row corresponds to one sampled site and each column to one of
    the bamFiles.

    If chrsToSkip is given, counts from those chromosomes are filtered out;
    unless a female sample is used, their counts are lower than those of the
    autosomes. For most applications this is irrelevant, but for others, like
    estimating the best scaling factor, it is important.

    The test data contains reads for 200 bp
    >>> test = Tester()

    The transpose function is used to get a nicer looking output.
    The first line corresponds to the number of reads per bin in bam file 1
    >>> np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> aa = np.transpose(getNumReadsPerBin([test.bamFile1, test.bamFile2],
    ... 50, 4, 0, skipZeros=True))
    >>> np.savez('/tmp/aa', aa)
    """

    # Try to determine an optimal fraction of the genome (chunkSize) that is sent to
    # workers for analysis. If too short, too much time is spent loading the files;
    # if too long, some processors end up idle.
    # The following values are empirical.

    bamFilesHandlers = [bamHandler.openBam(x) for x in bamFilesList]
    chromSizes = getCommonChrNames(bamFilesHandlers, verbose=verbose)

    # Skip the chromosomes in the list. This is usually done for the
    # X chromosome, which may be present in one copy in a male sample
    # or in a male/female mixture and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if len(chrsToSkip):
        chromSizes = [x for x in chromSizes if x[0] not in chrsToSkip]

    chrNames, chrLengths = zip(*chromSizes)

    genomeSize = sum(chrLengths)
    max_mapped = max([x.mapped for x in bamFilesHandlers])

    reads_per_bp = float(max_mapped) / genomeSize
    # chunkSize = int(100 / (reads_per_bp * len(bamFilesList)))

    if stepSize is None:
        stepSize = max(int(float(genomeSize) / numberOfSamples), 1)

    chunkSize = int(stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers)))
    [bam_h.close() for bam_h in bamFilesHandlers]

    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(binLength)

    imap_res = mapReduce.mapReduce((bamFilesList, stepSize, binLength,
                                    defaultFragmentLength, skipZeros,
                                    extendPairedEnds, minMappingQuality,
                                    ignoreDuplicates, samFlag),
                                   countReadsInRegions_wrapper,
                                   chromSizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    try:
        num_reads_per_bin = np.concatenate(imap_res, axis=0)
    except ValueError:
        if bedFile:
            exit('\nNo coverage values could be computed.\n\n'
                 'Please check that the chromosome names in the BED file are found on the bam files.\n\n'
                 'The valid chromosome names are:\n{}'.format(chrNames))
        else:
            exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and '
                 'contain mapped reads.')

    return num_reads_per_bin
Example #20
def openBam(bamFile, bamIndex=None):
    return bamHandler.openBam(bamFile, bamIndex)
Example #21
def countReadsInRegions_worker(chrom, start, end, bamFilesList,
                               stepSize, binLength, defaultFragmentLength,
                               skipZeros=False,
                               extendPairedEnds=True,
                               minMappingQuality=None,
                               ignoreDuplicates=False,
                               bedRegions=None):
    """ counts the reads in each bam file at each 'stepSize' position
    within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    between each other and are  not one directly next *after* the other.

    If a list of bedRegions is given, then the number of reads
    that overlap each region is counted.

    The result is a numpy array.
    >>> test = Tester()

    The transpose is used to get better looking numbers. The first line
    corresponds to the number of reads per bin in the first bamfile.
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  2.]])

    When skipZeros is set to True, the cases in which *all* of the
    BAM files have zero counts for a certain bin are ignored
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, skipZeros=True))
    array([[ 0.,  1.,  1.],
           [ 1.,  1.,  2.]])

    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 200, 200, 0))
    array([[ 2.],
           [ 4.]])

    Test min mapping quality
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, minMappingQuality=40))
    array([[ 0.,  0.,  0.,  1.],
           [ 0.,  0.,  0.,  1.]])

    Test ignore duplicates
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 50, 25, 0, ignoreDuplicates=True))
    array([[ 0.,  0.,  1.,  1.],
           [ 0.,  1.,  1.,  1.]])

    Test bed regions:
    >>> bedRegions = [(test.chrom, 10, 20), (test.chrom, 150, 160)]
    >>> np.transpose(countReadsInRegions_worker(test.chrom, 0, 200,
    ... [test.bamFile1, test.bamFile2], 0, 200, 0, bedRegions=bedRegions))
    array([[ 0.,  1.],
           [ 0.,  2.]])

    """

    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    # array to keep the read counts for the regions
    subNum_reads_per_bin = []

    rows = 0
    startTime = time.time()
    # note: these two settings are hard-coded here and override the
    # extendPairedEnds argument passed by the caller
    extendPairedEnds = True
    zerosToNans = False

    bamHandlers = [bamHandler.openBam(bam) for bam in bamFilesList]

    regionsToConsider = []

    if bedRegions:
        for chrom, start, end in bedRegions:
            regionsToConsider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, stepSize):
            if i + binLength > end:
                break
            regionsToConsider.append((chrom, i, i + binLength, binLength))

    for chrom, start, end, binLength in regionsToConsider:
        avgReadsArray = []
        for bam in bamHandlers:
            avgReadsArray.append(
                getCoverageOfRegion(bam,
                                    chrom, start, end,
                                    binLength,
                                    defaultFragmentLength,
                                    extendPairedEnds,
                                    zerosToNans,
                                    minMappingQuality=minMappingQuality,
                                    ignoreDuplicates=ignoreDuplicates,
                                    )[0])
        # skip if any of the bam files returns a NaN
        if np.isnan(sum(avgReadsArray)):
            continue

        if skipZeros and sum(avgReadsArray) == 0:
            continue
        subNum_reads_per_bin.extend(avgReadsArray)
        rows += 1

    if debug:
        endTime = time.time()
        print "%s countReadsInRegions_worker: processing %d " \
            "(%.1f per sec) @ %s:%s-%s" % \
            (multiprocessing.current_process().name,
             rows, rows / (endTime - startTime), chrom, start, end)

    return np.array(subNum_reads_per_bin).reshape(rows, len(bamFilesList))
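
# A minimal sketch (not part of the original module) of the window spacing
# described in the docstring above: windows of binLength are placed every
# stepSize bases, so positions are sampled rather than tiled back to back.
def _sampling_windows_sketch(start, end, stepSize, binLength):
    windows = []
    for i in xrange(start, end, stepSize):
        if i + binLength > end:
            break
        windows.append((i, i + binLength))
    return windows

# e.g. _sampling_windows_sketch(0, 200, 50, 25)
# -> [(0, 25), (50, 75), (100, 125), (150, 175)]
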
def get_read_and_fragment_length(bamFile,
                                 bamFileIndex=None,
                                 return_lengths=False,
                                 numberOfProcessors=None,
                                 verbose=False):
    """
    Estimates the fragment length and read length through sampling
    :param bamFile: bamfile name
    :param bamFileIndex: bamfile index name
    :param return_lengths: bool, whether to also return the raw array of
                    sampled lengths under the 'lengths' key
    :param numberOfProcessors: int, number of processors to use
    :param verbose: bool
    :return: tuple of two dictionaries, one for the fragment length and the
                    other for the read length. The dictionaries summarise the
                    mean, median and other statistics.
    """

    bam_handle = bamHandler.openBam(bamFile, bamFileIndex)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    chunk_size = int(
        float(sum(bam_handle.lengths)) * 0.3 /
        max(numberOfProcessors, len(bam_handle.lengths)))
    # avoid chunk sizes so small that the computation is split
    # into too many pieces
    chunk_size = max(chunk_size, 100000)
    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {
                'sample_size': len(fragment_length),
                'min': fragment_length.min(),
                'qtile25': np.percentile(fragment_length, 25),
                'mean': np.mean(fragment_length),
                'median': np.median(fragment_length),
                'qtile75': np.percentile(fragment_length, 75),
                'max': fragment_length.max(),
                'std': np.std(fragment_length)
            }
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {
            'sample_size': len(read_length),
            'min': read_length.min(),
            'qtile25': np.percentile(read_length, 25),
            'mean': np.mean(read_length),
            'median': np.median(read_length),
            'qtile75': np.percentile(read_length, 75),
            'max': read_length.max(),
            'std': np.std(read_length)
        }
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
def get_read_and_fragment_length(bamFile, return_lengths=False,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    d : tuple
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median and
        other statistics.
    """

    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = zip(bam_handle.references, bam_handle.lengths)

    chunk_size = int(float(sum(bam_handle.lengths)) * 0.3 / max(numberOfProcessors, len(bam_handle.lengths)))
    # avoid chunk sizes so small that the computation is split
    # into too many pieces
    chunk_size = max(chunk_size, 100000)
    imap_res = mapReduce.mapReduce((bam_handle.filename, ),
                                   getFragmentLength_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunk_size,
                                   numberOfProcessors=numberOfProcessors,
                                   verbose=verbose)

    fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
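
# Hedged usage sketch for get_read_and_fragment_length; 'reads.bam' is a
# placeholder path, not a file shipped with the module. Either summary can
# be None (e.g. no reads, or single-end data with no fragment lengths).
frag_stats, read_stats = get_read_and_fragment_length('reads.bam',
                                                      return_lengths=False,
                                                      numberOfProcessors=4,
                                                      verbose=False)
if frag_stats is not None:
    print "fragment length: median={}, mean={:.1f}".format(
        frag_stats['median'], frag_stats['mean'])
if read_stats is not None:
    print "read length: median={}".format(read_stats['median'])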