Beispiel #1
0
    def test_nan_fast_cov_just_x(self):
        """Compare fast_cov.nan_fast_cov(x) with numpy.cov on a NaN-containing x.

        Entries that numpy.cov reports as finite must match directly.  Column 1
        / row 1 of the result is additionally checked against covariances
        recomputed with numpy.cov after dropping the first row of x
        (presumably the row holding the NaN in column 1 -- confirm against
        build_nan_containing_x_y).
        """
        logger.debug("*************happy path just x")
        x, _ = TestFastCov.build_nan_containing_x_y()

        reference = numpy.cov(x, rowvar=False)
        logger.debug(
            "expected with nan's - ex_with_nan:\n{}".format(reference))

        result = fast_cov.nan_fast_cov(x)
        logger.debug("r:\n{}".format(result))

        # Only positions where the plain numpy covariance is defined can be
        # compared one-to-one.
        defined = numpy.logical_not(numpy.isnan(reference))
        self.assertTrue(
            numpy.allclose(reference[defined], result[defined]))

        # Recompute the covariances involving column 1 from rows 1 onward.
        column_1 = x[1:, 1]
        check_nominal_nans = [
            numpy.cov(x[1:, col], column_1, bias=False)[0, 1]
            for col in range(3)
        ]
        logger.debug(
            "calculate entries that would be nan - check_nominal_nans:  {}".
            format(check_nominal_nans))

        # The same values are expected along both column 1 and row 1.
        self.assertTrue(numpy.allclose(check_nominal_nans, result[:, 1]))
        self.assertTrue(numpy.allclose(check_nominal_nans, result[1, :]))
Beispiel #2
0
    def test_nan_fast_cov_all_nan(self):
        """An all-NaN single-column input must yield exactly one NaN entry."""
        # 3 observations of a single variable, every value NaN.
        x = numpy.full((3, 1), numpy.nan)
        logger.debug("x:\n{}".format(x))

        r = fast_cov.nan_fast_cov(x)
        logger.debug("r:\n{}".format(r))

        nan_flags = numpy.isnan(r)
        nan_total = numpy.sum(nan_flags)
        self.assertEqual(1, nan_total)
Beispiel #3
0
    def test_nan_fast_cov_x_and_y(self):
        """Compare fast_cov.nan_fast_cov(x, y) with numpy.cov on NaN-containing x, y.

        The expected values are the x-versus-y block of numpy.cov computed on
        the horizontally stacked [x, y].  Entries that block reports as NaN
        are recomputed with numpy.cov on row subsets (rows 1 onward for
        x[:, 1]; the first two rows for y[:, 0]) and compared to the matching
        entries of the result.  Finally r[1, 0] itself is expected to be NaN.

        NOTE(review): the row subsets presumably skip the rows where the
        fixture holds NaN -- confirm against build_nan_containing_x_y.
        """
        logger.debug("*************happy path x and y")
        x, y = TestFastCov.build_nan_containing_x_y()

        combined = numpy.hstack([x, y])
        logger.debug("combined:\n{}".format(combined))
        logger.debug("combined.shape:  {}".format(combined.shape))

        # The x-versus-y block of the full covariance matrix begins at column
        # x.shape[1].  Halving the combined width only gives the same index
        # when x and y happen to have equally many columns.
        off_diag_ind = x.shape[1]

        raw_ex = numpy.cov(combined, rowvar=False)
        logger.debug(
            "raw expected produced from numpy.cov on full combined - raw_ex:\n{}"
            .format(raw_ex))
        ex = raw_ex[:off_diag_ind, off_diag_ind:]
        logger.debug("expected ex:\n{}".format(ex))

        r = fast_cov.nan_fast_cov(x, y)
        logger.debug("r:\n{}".format(r))

        # Only positions where the plain numpy covariance is defined can be
        # compared one-to-one.
        non_nan_locs = ~numpy.isnan(ex)
        logger.debug("ex[non_nan_locs]:  {}".format(ex[non_nan_locs]))
        logger.debug("r[non_nan_locs]:  {}".format(r[non_nan_locs]))
        self.assertTrue(numpy.allclose(ex[non_nan_locs], r[non_nan_locs]))

        # Row 1 of the result: x[:, 1] against y[:, 1] and y[:, 2], using rows
        # 1 onward of both.
        t = x[1:, 1]
        check_nominal_nans = [numpy.cov(t, y[1:, i])[0, 1] for i in [1, 2]]
        logger.debug(
            "calculate entries that would be nan - check_nominal_nans:  {}".
            format(check_nominal_nans))
        logger.debug("r values to compare to - r[1, 1:]:  {}".format(r[1, 1:]))
        self.assertTrue(numpy.allclose(check_nominal_nans, r[1, 1:]))

        # Column 0 of the result: x[:, 0] and x[:, 2] against y[:, 0], using
        # the first two rows of both.
        u = y[:2, 0]
        check_nominal_nans = [numpy.cov(x[:2, i], u)[0, 1] for i in [0, 2]]
        logger.debug(
            "calculate entries that would be nan - check_nominal_nans:  {}".
            format(check_nominal_nans))
        logger.debug("r values to compare to - r[[0,2], 0]:  {}".format(
            r[[0, 2], 0]))
        self.assertTrue(numpy.allclose(check_nominal_nans, r[[0, 2], 0]))

        self.assertTrue(
            numpy.isnan(r[1, 0]),
            """expect this entry to be nan b/c for the intersection of x[:,1] and y[:,0] 
            there is only one entry in common, therefore covariance is undefined"""
        )
Beispiel #4
0
def nan_fast_corr(x, y=None, destination=None):
    """Calculate the Pearson correlation matrix, ignoring NaN values, for the
    columns of x (with dimensions MxN), or optionally the Pearson correlation
    matrix between the columns of x and the columns of y (with dimensions OxP).
    In the language of statistics the columns are the variables and the rows
    are the observations.

    NaN entries of x and y are masked, the covariance is obtained from
    fast_cov.nan_fast_cov, and it is then divided in place by the standard
    deviations of the columns of x and of y, each computed with the other
    input's mask also applied.

    Args:
        x (numpy array-like) MxN in shape
        y (optional, numpy array-like) OxP in shape.  M (# rows in x) must
            equal O (# rows in y).  When omitted, x is correlated with itself.
        destination (numpy array-like) optional location where to store the
            results as they are calculated (e.g. a numpy memmap of a file);
            forwarded unchanged to fast_cov.nan_fast_cov

        returns (numpy array-like) array of the correlation values
            for defaults (y=None), shape is NxN
            if y is provided, shape is NxP
    """
    x_masked = numpy.ma.array(x, mask=numpy.isnan(x))
    y_masked = (x_masked if y is None
                else numpy.ma.array(y, mask=numpy.isnan(y)))

    corr = fast_cov.nan_fast_cov(x_masked, y_masked, destination=destination)

    # Standard deviation of the columns of each matrix, honouring the mask of
    # the other matrix as well as its own.
    _, _, x_variance = calculate_moments_with_additional_mask(
        x_masked, y_masked.mask)
    x_std = numpy.sqrt(x_variance)

    _, _, y_variance = calculate_moments_with_additional_mask(
        y_masked, x_masked.mask)
    y_std = numpy.sqrt(y_variance)

    # Normalise the covariance in place: first by the (transposed) x standard
    # deviations, then by the y standard deviations.
    for divisor in (x_std.T, y_std):
        numpy.divide(corr, divisor, out=corr)

    return corr