Example #1
0
def correlateCMapsBinWise(ccMapObjOne,
                          ccMapObjTwo,
                          corrType='pearson',
                          cutoffPercentile=None,
                          workDir=None,
                          outFile=None,
                          logHandler=None):
    """To calculate correlation between two Hi-C maps

    This function can be used to calculate either Pearson or Spearman rank-order correlation
    between two Hi-C maps. It also ignore lower-triangular matrix with diagonal offset to avoid duplicate and large values.


    Parameters
    ----------
    ccMapObjOne : :class:`gcMapExplorer.lib.ccmap.CCMAP`
        First :class:`gcMapExplorer.lib.ccmap.CCMAP` instance containing Hi-C data
    ccMapObjTwo : :class:`gcMapExplorer.lib.ccmap.CCMAP`
        Second :class:`gcMapExplorer.lib.ccmap.CCMAP` instance containing Hi-C data
    ignore_triangular : bool
        Whether entire matrix is considered or only one half triangular region of matrixis considered.
    diagonal_offset : int
        If ``ignore_triangular=True``, it is used to determine how much bins are ignored from the diagonal in one half triangular region of matrix.

        ``diagonal_offset = 0`` is the main diagonal, ``diagonal_offset > 0`` means ignore this many bins from the diagonal.

    corrType : str
        Correlation type. For Pearson and Spearman rank-order correlation, use ``pearson`` and ``spearman``, respectively.
    blockSize : str
        To calculate block-wise correlations by sliding block of given size along diagonals. It should be in resolution.
        For example, ``1mb``, ``500kb``, ``5mb``, ``2.5mb`` etc.
        If ``None``, correlation of whole map is calculated. Sliding step of block depends on ``slideStepSize``.
    slideStepSize : int
        Step-size in bins by which blocks will be shifted for block-wise correlation. If slideStepSize is large then blocks might not be overlapped.

    workDir : str
        Name of working directory, where temporary files will be kept.If ``workDir = None``, file will be generated in OS based temporary directory.

    outFile : str
        Name of output file. Only written for block-wise correlation.

    Returns
    -------
    corr : float or list
        Correlation coefficient
    pvalue/centers : float or list
        If ``blockSize=None`` 2-tailed p-value is returned. For block-wise correlation, list of block-center is returned.


    .. seealso::
        * `scipy.stats.stats.pearsonr <http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html#scipy.stats.pearsonr>`_ for Pearson correlation.
        * `scipy.stats.stats.spearmanr <http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html#scipy.stats.spearmanr>`_ for Spearman rank-order correlation.


    """
    # logger
    logger = logging.getLogger('correlateCMaps')
    if logHandler is not None:
        logger.propagate = False
        logger.addHandler(logHandler)
    logger.setLevel(logging.INFO)

    # To check, user have given correct keywords
    if not (corrType == 'pearson' or corrType == 'spearman'
            or corrType == 'kendell-tau'):
        raise NotImplementedError(
            '{0} not implemented. Use "pearson" or "spearman".'.format(
                corrType))

    # To check if resolution of maps are same
    if ccMapObjOne.binsize != ccMapObjTwo.binsize:
        raise AssertionError(
            "Resolution of first Hi-C map does not match with Second Hi-C map."
        )

    # Determine smallest shape between two maps
    if ccMapObjOne.shape[0] >= ccMapObjTwo.shape[0]:
        smallest_shape = ccMapObjTwo.shape[0]
    else:
        smallest_shape = ccMapObjOne.shape[0]

    corr, centers = [], []
    try:

        # generate boolean arrays to store mask
        m1 = cmh.MemoryMappedArray(shape=(smallest_shape, smallest_shape),
                                   dtype=np.bool,
                                   workDir=workDir)
        m2 = cmh.MemoryMappedArray(shape=(smallest_shape, smallest_shape),
                                   dtype=np.bool,
                                   workDir=workDir)
        mask = cmh.MemoryMappedArray(shape=(smallest_shape, smallest_shape),
                                     dtype=np.bool,
                                     workDir=workDir)

        # Determine masks for two maps and combine it
        m1.arr[:] = ccMapObjOne.matrix[:smallest_shape, :
                                       smallest_shape] == ccMapObjOne.minvalue
        m2.arr[:] = ccMapObjTwo.matrix[:smallest_shape, :
                                       smallest_shape] == ccMapObjTwo.minvalue

        if cutoffPercentile is not None:
            percentileOne = np.percentile(
                ma.array(ccMapObjOne.matrix[:smallest_shape, :smallest_shape],
                         mask=m1.arr).compressed(), cutoffPercentile)
            percentileTwo = np.percentile(
                ma.array(ccMapObjTwo.matrix[:smallest_shape, :smallest_shape],
                         mask=m2.arr).compressed(), cutoffPercentile)
            m1.arr[:] = ccMapObjOne.matrix[:smallest_shape, :
                                           smallest_shape] <= percentileOne
            m2.arr[:] = ccMapObjTwo.matrix[:smallest_shape, :
                                           smallest_shape] <= percentileTwo

        mask.arr[:] = (m1.arr | m2.arr)

        for b in range(smallest_shape):
            ccMapObjOneMa = ma.array(ccMapObjOne.matrix[b], mask=mask.arr[b])
            ccMapObjTwoMa = ma.array(ccMapObjTwo.matrix[b], mask=mask.arr[b])

            tcorr = 0
            if ccMapObjOneMa[~ccMapObjOneMa.mask].shape[0] > 10:
                if corrType == 'pearson':
                    tcorr, _ = stats.pearsonr(ccMapObjOneMa.compressed(),
                                              ccMapObjTwoMa.compressed())
                elif corrType == 'kendall-tau':
                    tcorr, _ = stats.kendelltau(ccMapObjOneMa.compressed(),
                                                ccMapObjTwoMa.compressed())
                else:
                    tcorr, _ = stats.spearmanr(ccMapObjOneMa.compressed(),
                                               ccMapObjTwoMa.compressed())

            if tcorr is ma.masked:
                corr.append(0.0)
            else:
                corr.append(float(tcorr))
            centers.append(b * ccMapObjOne.binsize)

        del m1
        del m2
        del mask
    except (KeyboardInterrupt, SystemExit) as e:
        del m1
        del m2
        del mask
        raise e

    if logHandler is not None:
        logger.removeHandler(logHandler)

    if outFile is not None:
        fout = open(outFile, 'w')
        for i in range(len(corr)):
            if corr[i] != 0:
                fout.write('{0}\t{1}\n'.format(int(centers[i]), corr[i]))
        fout.close()

    return corr, centers
Example #2
0
def correlateCMaps(ccMapObjOne,
                   ccMapObjTwo,
                   ignore_triangular=True,
                   diagonal_offset=1,
                   corrType='pearson',
                   blockSize=None,
                   slideStepSize=1,
                   cutoffPercentile=None,
                   workDir=None,
                   outFile=None,
                   logHandler=None):
    """To calculate correlation between two Hi-C maps

    This function can be used to calculate either Pearson or Spearman rank-order correlation
    between two Hi-C maps. It also ignore lower-triangular matrix with diagnonal offset to avoid duplicate and large values.

    Parameters
    ----------
    ccMapObjOne : :class:`gcMapExplorer.lib.ccmap.CCMAP`
        First :class:`gcMapExplorer.lib.ccmap.CCMAP` instance containing Hi-C data

    ccMapObjTwo : :class:`gcMapExplorer.lib.ccmap.CCMAP`
        Second :class:`gcMapExplorer.lib.ccmap.CCMAP` instance containing Hi-C data

    ignore_triangular : bool
        Whether entire matrix is considered or only one half triangular region of matrixis considered.

    diagonal_offset : int
        If ``ignore_triangular=True``, it is used to determine how much bins are ignored from the diagonal in one half triangular region of matrix.
        ``diagonal_offset = 0`` is the main diagonal, ``diagonal_offset > 0`` means ignore this many bins from the diagonal.

    corrType : str
        Correlation type. For Pearson and Spearman rank-order correlation, use ``pearson`` and ``spearman``, respectively.

    blockSize : str
        To calculate block-wise correlations by sliding block of given size along diagonals. It should be in resolution.
        For example, ``1mb``, ``500kb``, ``5mb``, ``2.5mb`` etc.
        If ``None``, correlation of whole map is calculated. Sliding step of block depends on ``slideStepSize``.

    slideStepSize : int
        Step-size in bins by which blocks will be shifted for block-wise correlation. If slideStepSize is large then blocks might not be overlapped.

    cutoffPercentile : float
        Cutoff percentile to discard values during correlation calculation. If a cutoff percentile is given, values less than this
        percentile value will not be considered during correlation calculation.

    workDir : str
        Name of working directory, where temporary files will be kept.If ``workDir = None``, file will be generated in OS based temporary directory.

    outFile : str
        Name of output file. Only written for block-wise correlation.


    Returns
    -------
    corr : float or list
        Correlation coefficient

    pvalue/centers : float or list
        If ``blockSize=None`` 2-tailed p-value is returned. For block-wise correlation, list of block-center is returned.


    .. seealso::
        * `scipy.stats.pearsonr <http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html#scipy.stats.pearsonr>`_ for Pearson correlation.
        * `scipy.stats.spearmanr <http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html#scipy.stats.spearmanr>`_ for Spearman rank-order correlation.


    """
    # logger
    logger = logging.getLogger('correlateCMaps')
    if logHandler is not None:
        logger.propagate = False
        logger.addHandler(logHandler)
    logger.setLevel(logging.INFO)

    # To check, user have given correct keywords
    if not (corrType == 'pearson' or corrType == 'spearman'
            or corrType == 'kendell-tau'):
        raise NotImplementedError(
            '{0} not implemented. Use "pearson" or "spearman".'.format(
                corrType))

    # To check if resolution of maps are same
    if ccMapObjOne.binsize != ccMapObjTwo.binsize:
        raise AssertionError(
            "Resolution of first Hi-C map does not match with Second Hi-C map."
        )

    # Determine smallest shape between two maps
    if ccMapObjOne.shape[0] >= ccMapObjTwo.shape[0]:
        smallest_shape = ccMapObjTwo.shape[0]
    else:
        smallest_shape = ccMapObjOne.shape[0]

    # Calculation for whole map
    if blockSize is not None:
        logger.info(
            ' Block-wise correlation with [{0}] block-size'.format(blockSize))

        # checking if block-size is larger than smallest shape
        size = int(util.resolutionToBinsize(blockSize) / ccMapObjOne.binsize)
        if size >= smallest_shape:
            raise AssertionError(
                "Size of input block [{0}] is larger than smallest size [{1}] of Hi-C map"
                .format(size, smallest_shape))
        else:
            blockSize = size

        # Print some information about blocks
        logger.info(' Number of Blocks: {0} '.format(
            int(smallest_shape / blockSize)))
        logger.info(' Size of each Block in bins: {0} '.format(blockSize))
        if size - slideStepSize < 0:
            logger.info(
                ' Blocks are not overlapping. Distance between two adjacent blocks in bins : {0}'
                .format(slideStepSize - size))
        else:
            logger.info(
                ' Number of Overlapping bins between sliding blocks:  {0}'.
                format(size - slideStepSize))

    corr, pvalue, centers = None, None, None

    try:

        # generate boolean arrays to store mask
        m1 = cmh.MemoryMappedArray(shape=(smallest_shape, smallest_shape),
                                   dtype=np.bool,
                                   workDir=workDir)
        m2 = cmh.MemoryMappedArray(shape=(smallest_shape, smallest_shape),
                                   dtype=np.bool,
                                   workDir=workDir)
        mask = cmh.MemoryMappedArray(shape=(smallest_shape, smallest_shape),
                                     dtype=np.bool,
                                     workDir=workDir)

        # Determine masks for two maps and combine it
        m1.arr[:] = ccMapObjOne.matrix[:smallest_shape, :
                                       smallest_shape] < ccMapObjOne.minvalue
        m2.arr[:] = ccMapObjTwo.matrix[:smallest_shape, :
                                       smallest_shape] < ccMapObjTwo.minvalue

        if cutoffPercentile is not None:
            percentileOne = np.percentile(
                ma.array(ccMapObjOne.matrix[:smallest_shape, :smallest_shape],
                         mask=m1.arr).compressed(), cutoffPercentile)
            percentileTwo = np.percentile(
                ma.array(ccMapObjTwo.matrix[:smallest_shape, :smallest_shape],
                         mask=m2.arr).compressed(), cutoffPercentile)
            m1.arr[:] = ccMapObjOne.matrix[:smallest_shape, :
                                           smallest_shape] > percentileOne
            m2.arr[:] = ccMapObjTwo.matrix[:smallest_shape, :
                                           smallest_shape] > percentileTwo

            mask.arr[:] = ~(m1.arr | m2.arr)
            mask.arr[np.nonzero(
                ccMapObjOne.matrix[:smallest_shape, :smallest_shape] ==
                0.0)] = True
            mask.arr[np.nonzero(
                ccMapObjTwo.matrix[:smallest_shape, :smallest_shape] ==
                0.0)] = True
        else:
            mask.arr[:] = (m1.arr & m2.arr)

        # Mask lower diagonal with diagonal_offset
        if ignore_triangular:
            mask.arr[np.tril_indices_from(mask.arr, k=diagonal_offset)] = True

        if blockSize is None:
            # This section for correlation between whole maps
            ccMapObjOneMa = ma.array(
                ccMapObjOne.matrix[:smallest_shape, :smallest_shape],
                mask=mask.arr)
            ccMapObjTwoMa = ma.array(
                ccMapObjTwo.matrix[:smallest_shape, :smallest_shape],
                mask=mask.arr)

            if ccMapObjOneMa[~ccMapObjOneMa.mask].shape[0] > 10:
                if corrType == 'pearson':
                    corr, pvalue = stats.pearsonr(ccMapObjOneMa.compressed(),
                                                  ccMapObjTwoMa.compressed())
                elif corrType == 'kendall-tau':
                    corr, pvalue = stats.kendelltau(ccMapObjOneMa.compressed(),
                                                    ccMapObjTwoMa.compressed())
                else:
                    corr, pvalue = stats.spearmanr(ccMapObjOneMa.compressed(),
                                                   ccMapObjTwoMa.compressed())

            else:
                corr = 0
                pvalue = 0

        else:
            # This section for block-wise correlation between maps
            corr, centers = [], []
            csp = 0  # Current start position
            cep = blockSize  # Current end position
            while (cep < smallest_shape):

                ccMapObjOneMa = ma.array(ccMapObjOne.matrix[csp:cep, csp:cep],
                                         mask=mask.arr[csp:cep, csp:cep])
                ccMapObjTwoMa = ma.array(ccMapObjTwo.matrix[csp:cep, csp:cep],
                                         mask=mask.arr[csp:cep, csp:cep])

                # At least 10 valid values are present
                if ccMapObjOneMa[~ccMapObjOneMa.mask].shape[0] > 10:
                    if corrType == 'pearson':
                        tcorr, tpvalue = stats.pearsonr(
                            ccMapObjOneMa.compressed(),
                            ccMapObjTwoMa.compressed())
                    elif corrType == 'kendall-tau':
                        corr, pvalue = stats.kendelltau(
                            ccMapObjOneMa.compressed(),
                            ccMapObjTwoMa.compressed())
                    else:
                        tcorr, tpvalue = stats.spearmanr(
                            ccMapObjOneMa.compressed(),
                            ccMapObjTwoMa.compressed())
                else:
                    tcorr = 0

                if tcorr is ma.masked:
                    corr.append(0.0)
                else:
                    corr.append(float(tcorr))

                centers.append((csp + (blockSize / 2)) * ccMapObjOne.binsize)

                csp = csp + slideStepSize
                cep = csp + blockSize

        del m1
        del m2
        del mask
    except (KeyboardInterrupt, SystemExit) as e:
        del m1
        del m2
        del mask
        raise e

    if logHandler is not None:
        logger.removeHandler(logHandler)

    if outFile is not None and blockSize is not None:
        fout = open(outFile, 'w')
        for i in range(len(corr)):
            if corr[i] != 0:
                fout.write('{0}\t{1}\n'.format(int(centers[i]), corr[i]))
        fout.close()

    if blockSize is None:
        return corr, pvalue
    else:
        return corr, centers