Example #1
 def __init__(self, store_precision=True, assume_centered=False, h=None,
              correction=None):
     EmpiricalCovariance.__init__(
         self, store_precision=store_precision,
         assume_centered=assume_centered)
     self.h = h
     self.correction = correction
def test_suffstat_sk_tied():
    # use equation Nk * Sk / N = S_tied
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]

    covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    covars_pred_full = np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full,
                              0) / n_samples

    covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0)

    ecov = EmpiricalCovariance()
    ecov.covariance_ = covars_pred_full
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, 'tied')
    precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T)
    precs_est = linalg.inv(covars_pred_tied)
    assert_array_almost_equal(precs_est, precs_pred)
def test_suffstat_sk_diag():
    # test against 'full' case
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]
    precs_pred_full = _estimate_gaussian_precisions_cholesky_full(resp, X,
                                                                  nk, xk, 0)
    covars_pred_full = [linalg.inv(np.dot(precision_chol, precision_chol.T))
                        for precision_chol in precs_pred_full]

    precs_pred_diag = _estimate_gaussian_precisions_cholesky_diag(resp, X,
                                                                  nk, xk, 0)
    covars_pred_diag = np.array([np.diag(1. / d) ** 2
                                 for d in precs_pred_diag])

    ecov = EmpiricalCovariance()
    for (cov_full, cov_diag) in zip(covars_pred_full, covars_pred_diag):
        ecov.covariance_ = np.diag(np.diag(cov_full))
        assert_almost_equal(ecov.error_norm(cov_diag, norm='frobenius'), 0)
        assert_almost_equal(ecov.error_norm(cov_diag, norm='spectral'), 0)
Example #4
class CovEmbedding(BaseEstimator, TransformerMixin):
    """ Transformer that returns the coefficients on a flat space in
    which to perform the analysis.
    """

    def __init__(self, base_estimator=None, kind='tangent'):
        self.base_estimator = base_estimator
        self.kind = kind
#        if self.base_estimator == None:
#            self.base_estimator_ = ...
#        else:
#            self.base_estimator_ = clone(base_estimator)

    def fit(self, X, y=None):
        if self.base_estimator is None:
            self.base_estimator_ = EmpiricalCovariance(
                assume_centered=True)
        else:
            self.base_estimator_ = clone(self.base_estimator)

        if self.kind == 'tangent':
            # self.mean_cov = mean_cov = spd_manifold.log_mean(covs)
            # Euclidean mean as an approximation to the geodesic
            covs = [self.base_estimator_.fit(x).covariance_ for x in X]
            covs = my_stack(covs)
            mean_cov = np.mean(covs, axis=0)
            self.whitening_ = inv_sqrtm(mean_cov)
        return self

    def transform(self, X):
        """Apply transform to covariances

        Parameters
        ----------
        covs: list of array
            list of covariance matrices, shape (n_rois, n_rois)

        Returns
        -------
        list of array, transformed covariance matrices,
        shape (n_rois * (n_rois+1)/2,)
        """
        covs = [self.base_estimator_.fit(x).covariance_ for x in X]
        covs = my_stack(covs)
        p = covs.shape[-1]
        if self.kind == 'tangent':
            id_ = np.identity(p)
            covs = [self.whitening_.dot(c.dot(self.whitening_)) - id_
                    for c in covs]
        elif self.kind == 'partial correlation':
            covs = [cov_to_corr(inv(g)) for g in covs]
        elif self.kind == 'correlation':
            covs = [cov_to_corr(g) for g in covs]
        return np.array([sym_to_vec(c) for c in covs])
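A minimal sketch of the tangent-space step used in fit and transform above, assuming scipy is available; inv_sqrtm_demo reimplements the missing inv_sqrtm helper via an eigendecomposition, and all variable names are illustrative.

import numpy as np
from scipy import linalg
from sklearn.covariance import EmpiricalCovariance

def inv_sqrtm_demo(mat):
    # inverse matrix square root of a symmetric positive definite matrix
    vals, vecs = linalg.eigh(mat)
    return vecs.dot(np.diag(1. / np.sqrt(vals))).dot(vecs.T)

rng = np.random.RandomState(0)
subjects = [rng.randn(200, 5) for _ in range(3)]   # 3 subjects, 5 ROIs each
est = EmpiricalCovariance(assume_centered=True)
covs = np.array([est.fit(x).covariance_ for x in subjects])
mean_cov = covs.mean(axis=0)                       # Euclidean mean, as above
whitening = inv_sqrtm_demo(mean_cov)
tangent = [whitening.dot(c).dot(whitening) - np.eye(5) for c in covs]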
Example #5
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator. Uses a covariance estimate
    to compute the Mahalanobis distance of the observations
    from the model.

    Parameters
    ----------
    robust : boolean, whether to use a robust estimator
        based on the Minimum Covariance Determinant computation
    """
    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator #
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator #
        self.model = CovarianceEstimator()
        self.cov = None
    def fit(self, X, y=None, **params):
        """Fits the covariance model according to the given training
        data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self
    def score(self, X, y=None):
        """Computes the mahalanobis distances of given observations.

        The provided observations are assumed to be centered. One may want to
        center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
          The observations whose Mahalanobis distances we compute.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """

        #return self.model.score(X,assume_centered=True)
        return - self.model.mahalanobis(X-self.model.location_) ** 0.33
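Hypothetical usage of the Mahalanobis estimator above, assuming the class and its omitted imports (BaseEstimator and friends) are in scope; the score is more negative for observations far from the fitted location.

import numpy as np

X = np.random.RandomState(0).randn(100, 3)
est = Mahalanobis(robust=False).fit(X)
print(est.score(X[:5]))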
 def __init__(self, store_precision=True, assume_centered=False,
              h=None, contamination=0.1, pvalue_correction="fwer",
              no_fit=False):
     """
     """
     EmpiricalCovariance.__init__(
         self, store_precision=store_precision,
         assume_centered=assume_centered)
     CovarianceOutlierDetectionMixin.__init__(
         self, contamination=contamination,
         pvalue_correction=pvalue_correction)
     self.no_fit = no_fit
def printSciKitCovarianceMatrixs():
      #does not work, ValueError: setting an array element with a sequence.
      xMaker = RSTCovarianceMatrixMaker()
      nums, data, ilabels = getLabeledRSTData(False)
      for i,d in enumerate(data):
          d['ratio'] = ilabels[i]
      xMaker.setInstanceNums(nums)
      xMaker.fit(data)
      X = xMaker.transform(data)
      correlator = EmpiricalCovariance()
      correlator.fit(X)

      print(correlator.covariance_)
Example #8
class CovEmbedding(BaseEstimator, TransformerMixin):
    """ Transformer that returns the coefficients on a flat space in
    which to perform the analysis.
    """

    def __init__(self, cov_estimator=None, kind='tangent'):
        self.cov_estimator = cov_estimator
        self.kind = kind

    def fit(self, X, y=None):
        if self.cov_estimator is None:
            self.cov_estimator_ = EmpiricalCovariance(
                assume_centered=True)
        else:
            self.cov_estimator_ = clone(self.cov_estimator)

        if self.kind == 'tangent':
            covs = [self.cov_estimator_.fit(x).covariance_ for x in X]
            self.mean_cov_ = spd_mfd.frechet_mean(covs, max_iter=30, tol=1e-7)
            self.whitening_ = spd_mfd.inv_sqrtm(self.mean_cov_)
        return self

    def transform(self, X):
        """Apply transform to covariances

        Parameters
        ----------
        covs: list of array
            list of covariance matrices, shape (n_rois, n_rois)

        Returns
        -------
        list of array, transformed covariance matrices,
        shape (n_rois * (n_rois+1)/2,)
        """
        covs = [self.cov_estimator_.fit(x).covariance_ for x in X]
        covs = spd_mfd.my_stack(covs)
        if self.kind == 'tangent':
            covs = [spd_mfd.logm(self.whitening_.dot(c).dot(self.whitening_))
                    for c in covs]
        elif self.kind == 'precision':
            covs = [spd_mfd.inv(g) for g in covs]
        elif self.kind == 'partial correlation':
            covs = [prec_to_partial(spd_mfd.inv(g)) for g in covs]
        elif self.kind == 'correlation':
            covs = [cov_to_corr(g) for g in covs]
        else:
            raise ValueError("Unknown connectivity measure.")

        return np.array([sym_to_vec(c) for c in covs])
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    See http://scikit-learn.org/0.13/modules/outlier_detection.html#\
        fitting-an-elliptic-envelop

    for details.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    #-----------------------------------------------------------------------------
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)
    #-----------------------------------------------------------------------------
    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    #-----------------------------------------------------------------------------
    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    ax1.set_title(country_code[ctry])
    #-----------------------------------------------------------------------------
    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(ax1.get_xlim()[0], ax1.get_xlim()[1],
                                     100),
                         np.linspace(ax1.get_ylim()[0], ax1.get_ylim()[1],
                                     100))
    zz = np.c_[xx.ravel(), yy.ravel()]
    #-----------------------------------------------------------------------------
    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')
    #-----------------------------------------------------------------------------
    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
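The country-specific helpers above (load_res, get_inliers, country_code) are not shown; a self-contained sketch of the same plot on synthetic 2-D data, drawing the square-root Mahalanobis contours of both estimators:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.covariance import EmpiricalCovariance, MinCovDet

X = np.random.RandomState(0).randn(300, 2)
X[:15] += 6.                                   # a few obvious outliers
emp_cov = EmpiricalCovariance().fit(X)
robust_cov = MinCovDet(random_state=0).fit(X)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], s=8, color='k', alpha=.5)
xx, yy = np.meshgrid(np.linspace(*ax.get_xlim(), 100),
                     np.linspace(*ax.get_ylim(), 100))
zz = np.c_[xx.ravel(), yy.ravel()]
ax.contour(xx, yy, np.sqrt(emp_cov.mahalanobis(zz)).reshape(xx.shape),
           cmap=plt.cm.PuBu_r, linestyles='dashed')
ax.contour(xx, yy, np.sqrt(robust_cov.mahalanobis(zz)).reshape(xx.shape),
           cmap=plt.cm.YlOrBr_r, linestyles='dotted')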
Example #10
def outlier_rejection(feat, prob):
    '''Keep observations whose squared Mahalanobis distance from the
    feature-wise median is below the chi-squared quantile for `prob`.
    '''
    
    from sklearn.covariance import EmpiricalCovariance #MinCovDet
    
    #real_cov
    #linalg.inv(real_cov)
    
    #robust_cov = MinCovDet().fit(feat)
    robust_cov = EmpiricalCovariance().fit(feat)
    dist = robust_cov.mahalanobis(feat - numpy.median(feat, 0))
    
    cut = scipy.stats.chi2.ppf(prob, feat.shape[1])
    return dist < cut
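A hedged usage sketch for outlier_rejection above, assuming the function and the numpy/scipy imports it relies on live in the same module; the shifted points should fall outside the chi-squared cut.

import numpy
import scipy.stats

rng = numpy.random.RandomState(0)
feat = rng.randn(200, 2)
feat[:5] += 8.                          # a handful of obvious outliers
keep = outlier_rejection(feat, prob=0.975)
print(keep.sum(), 'of', len(keep), 'kept')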
Example #11
    def fit(self, data):
        nu = 0.01
        n_sample  = data.shape[0]
        n_feature = data.shape[1]
        
        exclude = set()
        for d in range(n_feature):
            feature = data[:, d]
            s_feature = feature.copy()
            s_feature.sort()
            low = s_feature[int(n_sample*nu/2)]
            upp = s_feature[n_sample-int(n_sample*nu/2)]

            exld = numpy.nonzero(numpy.logical_or((feature > upp),(feature < low)))[0]
            [exclude.add(e) for e in exld]
            
        use = numpy.array([f for f in range(n_sample) if f not in exclude])
        
        data_ = data[use, :]
            
        self.cov = EmpiricalCovariance().fit(data_)
        
        dist = self.cov.mahalanobis(data)
        
        self.cutoff = numpy.percentile(dist, self.perc_keep)
        print(self.cutoff)
Example #12
 def __init__(self, robust=False):
     if not robust:
         from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator #
     else:
         from sklearn.covariance import MinCovDet as CovarianceEstimator #
     self.model = CovarianceEstimator()
     self.cov = None
def test_gaussian_mixture_fit():
    # recover the ground truth
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    n_features = rand_data.n_features
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        g = GaussianMixture(n_components=n_components, n_init=20,
                            reg_covar=0, random_state=rng,
                            covariance_type=covar_type)
        g.fit(X)

        # needs more data to pass the test with rtol=1e-7
        assert_allclose(np.sort(g.weights_), np.sort(rand_data.weights),
                        rtol=0.1, atol=1e-2)

        arg_idx1 = g.means_[:, 0].argsort()
        arg_idx2 = rand_data.means[:, 0].argsort()
        assert_allclose(g.means_[arg_idx1], rand_data.means[arg_idx2],
                        rtol=0.1, atol=1e-2)

        if covar_type == 'full':
            prec_pred = g.precisions_
            prec_test = rand_data.precisions['full']
        elif covar_type == 'tied':
            prec_pred = np.array([g.precisions_] * n_components)
            prec_test = np.array([rand_data.precisions['tied']] * n_components)
        elif covar_type == 'spherical':
            prec_pred = np.array([np.eye(n_features) * c
                                 for c in g.precisions_])
            prec_test = np.array([np.eye(n_features) * c for c in
                                 rand_data.precisions['spherical']])
        elif covar_type == 'diag':
            prec_pred = np.array([np.diag(d) for d in g.precisions_])
            prec_test = np.array([np.diag(d) for d in
                                 rand_data.precisions['diag']])

        arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort()
        arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort()
        for k, h in zip(arg_idx1, arg_idx2):
            ecov = EmpiricalCovariance()
            ecov.covariance_ = prec_test[h]
            # the accuracy depends on the number of data and randomness, rng
            assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.1)
Example #14
class OneClassMahalanobis(BaseClassifier):
    _fit_params = ['perc_keep']
    _predict_params = []
    def __init__(self,*args, **kwargs):
#         BaseClassifier.__init__(self, *args, **kwargs)
        self.perc_keep = kwargs["perc_keep"]
    
    def fit(self, data):
        nu = 0.01
        n_sample  = data.shape[0]
        n_feature = data.shape[1]
        
        exclude = set()
        for d in range(n_feature):
            feature = data[:, d]
            s_feature = feature.copy()
            s_feature.sort()
            low = s_feature[int(n_sample*nu/2)]
            upp = s_feature[n_sample-int(n_sample*nu/2)]

            exld = numpy.nonzero(numpy.logical_or((feature > upp),(feature < low)))[0]
            [exclude.add(e) for e in exld]
            
        use = numpy.array([f for f in range(n_sample) if f not in exclude])
        
        data_ = data[use, :]
            
        self.cov = EmpiricalCovariance().fit(data_)
        
        dist = self.cov.mahalanobis(data)
        
        self.cutoff = numpy.percentile(dist, self.perc_keep)
        print(self.cutoff)
    

    
    def predict(self, data):
        mahal_dist = self.cov.mahalanobis(data)
        self.mahal_dist = mahal_dist
        print(mahal_dist.min(), mahal_dist.max(), self.cutoff, (mahal_dist > self.cutoff).sum(), "of", len(mahal_dist))
        
        return (mahal_dist > self.cutoff).astype(numpy.uint8)*-2+1
    
    def decision_function(self, data=None):
        return self.mahal_dist
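Hedged usage of the OneClassMahalanobis classifier above, assuming the class, its BaseClassifier parent and the numpy / EmpiricalCovariance imports it relies on are in scope; perc_keep is the percentile of training distances kept as inliers.

import numpy
from sklearn.covariance import EmpiricalCovariance

rng = numpy.random.RandomState(0)
train = rng.randn(500, 4)
clf = OneClassMahalanobis(perc_keep=95)
clf.fit(train)                              # prints the learned distance cutoff
labels = clf.predict(rng.randn(20, 4) + 2)  # +1 for inliers, -1 for outliers
print(labels)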
def test_suffstat_sk_tied():
    # use equation Nk * Sk / N = S_tied
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]
    covars_pred_full = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    covars_pred_full = np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full,
                              0) / n_samples

    covars_pred_tied = _estimate_gaussian_covariance_tied(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance()
    ecov.covariance_ = covars_pred_full
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='spectral'), 0)
Example #16
def launch_mcd_on_dataset(n_samples, n_features, n_outliers,
                          tol_loc, tol_cov, tol_support, correction):
    """Check MCD estimates against those learnt from the inlier subset
    on simulated data contaminated with n_outliers shifted points.
    """
    data = np.random.randn(n_samples, n_features)
    # add some outliers
    outliers_index = np.random.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    # compute MCD directly
    T, S, H = fast_mcd(data, correction=correction)
    # compare with the estimates learnt from the inliers
    pure_data = data[inliers_mask]
    error_location = np.sum((pure_data.mean(0) - T) ** 2)
    assert(error_location < tol_loc)
    emp_cov = EmpiricalCovariance().fit(pure_data)
    #print emp_cov.error_norm(S)
    assert(emp_cov.error_norm(S) < tol_cov)
    assert(np.sum(H) > tol_support)
    # check improvement
    if (n_outliers / float(n_samples) > 0.1) and (n_features > 1):
        error_bad_location = np.sum((data.mean(0) - T) ** 2)
        assert(error_bad_location > error_location)
        bad_emp_cov = EmpiricalCovariance().fit(data)
        assert(emp_cov.error_norm(S) < bad_emp_cov.error_norm(S))

    # compute MCD by fitting an object
    mcd_fit = MCD().fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.sum((pure_data.mean(0) - T) ** 2)
    assert(error_location < tol_loc)
    assert(emp_cov.error_norm(S) < tol_cov)
    assert(np.sum(H) > tol_support)
    # check improvement
    if (n_outliers / float(n_samples) > 0.1) and (n_features > 1):
        error_bad_location = np.sum((data.mean(0) - T) ** 2)
        assert(error_bad_location > error_location)
        bad_emp_cov = EmpiricalCovariance().fit(data)
        assert(emp_cov.error_norm(S) < bad_emp_cov.error_norm(S))
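The same improvement check can be reproduced with public estimators only; a sketch using MinCovDet in place of the internal fast_mcd/MCD helpers:

import numpy as np
from sklearn.covariance import EmpiricalCovariance, MinCovDet

rng = np.random.RandomState(42)
data = rng.randn(500, 2)
data[:50] += 10.                                     # 10% of points are outliers
pure_cov = EmpiricalCovariance().fit(data[50:])      # fit on the inliers only
mcd_cov = MinCovDet(random_state=42).fit(data).covariance_
full_cov = EmpiricalCovariance().fit(data).covariance_
print(pure_cov.error_norm(mcd_cov))                  # robust: expected small
print(pure_cov.error_norm(full_cov))                 # contaminated MLE: larger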
Example #17
    def fit(self, X, y=None):
        if self.cov_estimator is None:
            self.cov_estimator_ = EmpiricalCovariance(
                assume_centered=True)
        else:
            self.cov_estimator_ = clone(self.cov_estimator)

        if self.kind == 'tangent':
            covs = [self.cov_estimator_.fit(x).covariance_ for x in X]
            self.mean_cov_ = spd_mfd.frechet_mean(covs, max_iter=30, tol=1e-7)
            self.whitening_ = spd_mfd.inv_sqrtm(self.mean_cov_)
        return self
def test_suffstat_sk_full():
    # compare the precision matrix computed from the
    # EmpiricalCovariance.covariance fitted on X*sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)
def test_suffstat_sk_diag():
    # test against 'full' case
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]
    covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    covars_pred_diag = _estimate_gaussian_covariances_diag(resp, X, nk, xk, 0)

    ecov = EmpiricalCovariance()
    for (cov_full, cov_diag) in zip(covars_pred_full, covars_pred_diag):
        ecov.covariance_ = np.diag(np.diag(cov_full))
        cov_diag = np.diag(cov_diag)
        assert_almost_equal(ecov.error_norm(cov_diag, norm='frobenius'), 0)
        assert_almost_equal(ecov.error_norm(cov_diag, norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred_diag, 'diag')
    assert_almost_equal(covars_pred_diag, 1. / precs_chol_pred ** 2)
Example #20
    def fit(self, X, y=None):
        if self.base_estimator is None:
            self.base_estimator_ = EmpiricalCovariance(
                assume_centered=True)
        else:
            self.base_estimator_ = clone(self.base_estimator)

        if self.kind == 'tangent':
            # self.mean_cov = mean_cov = spd_manifold.log_mean(covs)
            # Euclidean mean as an approximation to the geodesic
            covs = [self.base_estimator_.fit(x).covariance_ for x in X]
            covs = my_stack(covs)
            mean_cov = np.mean(covs, axis=0)
            self.whitening_ = inv_sqrtm(mean_cov)
        return self
Example #21
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation
    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)
    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_
    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)
    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)
    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
Example #22
class OneClassMahalanobis(BaseClassifier):
    _fit_params = []
    def __init__(self, *args, **kwargs):
        pass
    
    def fit(self, data):
        #self.cov = MinCovDet().fit(data)
        self.cov = EmpiricalCovariance().fit(data)
    
    def predict(self, data):
        mahal_emp_cov = self.cov.mahalanobis(data)
        d = data.shape[1]
        thres = scipy.stats.chi2.ppf(0.95, d)
        
        self.mahal_emp_cov = mahal_emp_cov
        
        return (mahal_emp_cov > thres).astype(numpy.int32)*-2+1
    
    def decision_function(self, data):
        return self.mahal_emp_cov
Example #23
import pickle
import matplotlib.pyplot as plt

import numpy as np
from sklearn.covariance import EmpiricalCovariance
from sklearn.datasets import make_gaussian_quantiles
real_cov = np.array([[.8, .3], [.3, .4]])
rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)

print(X)
print(X.shape)
cov = EmpiricalCovariance().fit(X)
print(cov.covariance_.shape)
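With 500 samples the fitted covariance_ should already be close to real_cov; a quick follow-up check using the variables from the snippet above:

print(cov.covariance_)
print(np.abs(cov.covariance_ - real_cov).max())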
Example #24
    def calc_suggestions(self):
        """ Used to normalize """

        suggs = []
        preds = []
        ents = []
        """ Quick fix until I come up with an idea to approximate the manifold. """
        inv_cov = np.linalg.inv(
            EmpiricalCovariance(
                assume_centered=True).fit(self.N_samples, self.M_samples)
            .covariance_ + np.eye(self.N) * 1e-6)

        # Exploitation
        for p in range(self.samples):
            gradients, inv_hess = self.S.run(
                (self.IN_grads, self.IN_hessian),
                feed_dict={
                    self.IN: [self.N_samples[p]]
                })

            suggestion = self.N_samples[p].reshape(-1) - (
                inv_hess @ gradients).reshape(-1)

            if self.minis and self.maxis:
                suggestion = np.clip(suggestion, self.minis, self.maxis)

            pred = self.predict([suggestion])

            ent = 1000000
            for point in self.N_samples:
                ent = min(
                    ent,
                    np.sqrt((point - suggestion).reshape(1, -1) @ inv_cov @ (
                        point - suggestion).reshape(-1, 1)))

            # print("Start", self.N_samples[p], "Score", self.M_samples[p])
            # print("Recommendation", suggestion, "Pred", pred, "Ent", ent)
            # print()
            # print("Score", self.M_samples[p])
            # print("Pred", pred, "Ent", ent)
            # print()

            suggs.append(suggestion)
            preds.append(pred)
            ents.append(ent)

        # Exploration..
        for p in range(10):
            suggestion = self.S.run(self.N_gen).reshape(-1)
            pred = self.predict([suggestion])

            ent = 1000000
            for point in self.N_samples:
                ent = min(
                    ent,
                    np.sqrt((point - suggestion).reshape(1, -1) @ inv_cov @ (
                        point - suggestion).reshape(-1, 1)))

            # print("Recommendation", suggestion, "Pred", pred, "Ent", ent)
            # print()

            suggs.append(suggestion)
            preds.append(pred)
            ents.append(ent)

        suggs = np.array(suggs)
        preds = np.array(preds).reshape(-1)
        ents = np.array(ents).reshape(-1)

        mean_pred = np.mean(preds)
        stdev_pred = np.std(preds)

        mean_ent = np.mean(ents)
        stdev_ent = np.std(ents)

        # print("MEANENT VARENT", np.mean(ents), np.var(ents))
        preds = (preds - mean_pred) / stdev_pred
        ents = (ents - mean_ent) / stdev_ent

        return suggs, preds, ents
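The regularized inverse-covariance metric used for the ent term above, in isolation (a sketch; the feature dimension stands in for self.N): a small diagonal ridge keeps the inverse well defined even when the empirical estimate is nearly singular.

import numpy as np
from sklearn.covariance import EmpiricalCovariance

samples = np.random.RandomState(0).randn(50, 8)
cov = EmpiricalCovariance(assume_centered=True).fit(samples).covariance_
inv_cov = np.linalg.inv(cov + np.eye(samples.shape[1]) * 1e-6)
diff = samples[0] - samples[1]
dist = float(np.sqrt(diff.reshape(1, -1) @ inv_cov @ diff.reshape(-1, 1)))
print(dist)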
Example #25
  def plot_contours(self, ax, show=False):
    COV = self.emp_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = np.array([ COV.location_[0], COV.location_[1] ])
    COV_slice.covariance_ = np.array([ COV.covariance_[ 0,0 ], COV.covariance_[ 0,1 ],
                                       COV.covariance_[ 1,0 ], COV.covariance_[ 1,1 ] ])
    COV_slice.covariance_ = COV_slice.covariance_.reshape((2,2))
    COV_slice.precision_ = np.array([ COV.precision_[ 0,0 ], COV.precision_[ 0,1 ],
                                      COV.precision_[ 1,0 ], COV.precision_[ 1,1 ] ])
    COV_slice.precision_ = COV_slice.precision_.reshape((2,2))

    # Show contours of the distance functions
    xx, yy = np.meshgrid(
          np.linspace(COV_slice.location_[0]-5*math.sqrt(COV_slice.covariance_[0,0]), COV_slice.location_[0]+5*math.sqrt(COV_slice.covariance_[0,0]), 100),
          np.linspace(COV_slice.location_[1]-5*math.sqrt(COV_slice.covariance_[1,1]), COV_slice.location_[1]+5*math.sqrt(COV_slice.covariance_[1,1]), 100),
    )
    zz = np.c_[xx.ravel(), yy.ravel()]

    # Empirical fit is not so good.  Don't plot this
    if False: # keep for debugging
      mahal_emp_cov = COV_slice.mahalanobis(zz)
      mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
      emp_cov_contour = ax.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  levels=[1.,2.,3.,4.,5.],
                                  #cmap=plt.cm.PuBu_r,
                                  cmap=plt.cm.cool_r,
                                  linestyles='dashed')

    COV = self.rob_cov
    COV_slice = EmpiricalCovariance()
    COV_slice.location_ = np.array([ COV.location_[0], COV.location_[1] ])
    COV_slice.covariance_ = np.array([ COV.covariance_[ 0,0 ], COV.covariance_[ 0,1 ],
                                       COV.covariance_[ 1,0 ], COV.covariance_[ 1,1 ] ])
    COV_slice.covariance_ = COV_slice.covariance_.reshape((2,2))
    COV_slice.precision_ = np.array([ COV.precision_[ 0,0 ], COV.precision_[ 0,1 ],
                                      COV.precision_[ 1,0 ], COV.precision_[ 1,1 ] ])
    COV_slice.precision_ = COV_slice.precision_.reshape((2,2))
    self.robust_model_XY = COV_slice

    # robust is better
    if show:
      mahal_robust_cov = COV_slice.mahalanobis(zz)
      mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
      robust_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 levels=[1.,2.,3.,4.,5.],
                                 #cmap=plt.cm.YlOrBr_r,
                                 cmap=plt.cm.spring_r,
                                 linestyles='dotted')
Example #26
    def __init__(self,
                 lab_coords_x,
                 lab_coords_y,
                 data,
                 i_panel,
                 delta_scalar,
                 params,
                 verbose=True):
        training_data = []

        mean_x = flex.mean(lab_coords_x)
        mean_y = flex.mean(lab_coords_y)
        limit = delta_scalar * 10

        for ix in range(len(data)):
            if abs(lab_coords_x[ix] - mean_x) > limit: continue
            if abs(lab_coords_y[ix] - mean_y) > limit: continue
            if abs(data[ix]) > 1: continue
            training_data.append(
                (lab_coords_x[ix], lab_coords_y[ix], data[ix]))
        if verbose:
            print("Training data is less",
                  len(lab_coords_x) - len(training_data),
                  end=" ")
        colorcode_set = []
        for ix in range(len(data)):
            colorcode_set.append(
                (lab_coords_x[ix], lab_coords_y[ix], data[ix]))

        from sklearn.covariance import EmpiricalCovariance, MinCovDet
        # compare estimators learnt from the full data set with true parameters
        emp_cov = EmpiricalCovariance(
            assume_centered=False, store_precision=True).fit(X=training_data)
        # fit a Minimum Covariance Determinant (MCD) robust estimator to data
        robust_cov = MinCovDet(assume_centered=False,
                               store_precision=True).fit(X=training_data)

        features = ["Δx", "Δy", "ΔΨ(deg)"]
        if verbose:
            print("%3d" % i_panel, end=" ")
            print("%4d items " % (len(training_data), ), end=" ")
        for idx_report in range(len(features)):
            feature = features[idx_report]
            diag_elem = math.sqrt(emp_cov.covariance_[idx_report, idx_report])
            if verbose:
                print("%s=%7.2f±%6.2f" %
                      (feature, emp_cov.location_[idx_report], diag_elem),
                      end=" ")

        if verbose:
            print("%4d items:" % (flex.bool(robust_cov.support_).count(True)),
                  end=" ")
        for idx_report in range(len(features)):
            feature = features[idx_report]
            diag_elem = math.sqrt(robust_cov.covariance_[idx_report,
                                                         idx_report])
            if verbose:
                print("%s=%7.2f±%6.2f" %
                      (feature, robust_cov.location_[idx_report], diag_elem),
                      end=" ")

        disc = flex.double(robust_cov.mahalanobis(
            X=colorcode_set))  # this metric represents mahalanobis ** 2
        disc_select = disc < (
            params.residuals.mcd_filter.mahalanobis_distance)**2
        if params.residuals.mcd_filter.keep == "outliers":
            disc_select = (disc_select == False)
        if verbose:
            print("OK %4.1f%%" %
                  (100 * (disc_select.count(True)) / len(training_data)))
        self.lab_coords_x = lab_coords_x.select(disc_select)
        self.lab_coords_y = lab_coords_y.select(disc_select)
        self.data = data.select(disc_select)
        self.n_input = len(lab_coords_x)
        self.n_output = len(self.lab_coords_x)
        self.emp_cov = emp_cov
        self.rob_cov = robust_cov
Example #27
###### Likelihood Computation ######
# Fold the angles in params into the proper range, such that
# they are centered at the mean.
N_CYCLE_FOLD_ANGLE = 10
for j in range(N_CYCLE_FOLD_ANGLE):
    mean = np.mean(params, axis=0)
    for i in range(3, 6):  # index 3,4,5 are angles, others are distances
        params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi
        params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi
        if PARAMS_TLR[i] > mean[i] + np.pi:
            PARAMS_TLR[i] += 2 * np.pi
        if PARAMS_TLR[i] < mean[i] - np.pi:
            PARAMS_TLR[i] -= 2 * np.pi

est = EmpiricalCovariance(store_precision=True, assume_centered=False)
est.fit(params)
log_likelihood = est.score(PARAMS_TLR[None, :])
KT = 0.59
free_e = -log_likelihood * KT

print('Log likelihood score:', log_likelihood)
print('Free energy:', free_e)
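EmpiricalCovariance.score returns the mean Gaussian log-likelihood of the supplied observations under the fitted model, which is what gets converted into a free energy above; a self-contained sketch with toy data in place of the TLR parameters:

import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
samples = rng.randn(1000, 6)                 # stand-in for the sampled params
query = rng.randn(1, 6)                      # stand-in for PARAMS_TLR[None, :]
est = EmpiricalCovariance(store_precision=True, assume_centered=False)
est.fit(samples)
log_likelihood = est.score(query)            # mean Gaussian log-likelihood
print(-log_likelihood * 0.59)                # same KT scaling as above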


###### Output the best conformer to pdb ######
def generate_bp_par_file(params, bps, out_name):
    assert(len(params) == len(bps))
    n_bp = len(params)
    # convert from radians to degrees
    params[:, 3:] = np.degrees(params[:, 3:])
Example #28
    def __init__(
        self,
        propensity_transform=None,
        caliper=None,
        with_replacement=True,
        n_neighbors=1,
        matching_mode="both",
        metric="mahalanobis",
        knn_backend="sklearn",
        estimate_observed_outcome=False,
    ):
        """Match treatment and control samples with similar covariates.

        Args:
            propensity_transform (causallib.transformers.PropensityTransformer):
                an object for data preprocessing which adds the propensity
                score as a feature (default: None)
            caliper (float) : maximal distance for a match to be accepted. If
                not defined, all matches will be accepted. If defined, some
                samples may not be matched and their outcomes will not be
                estimated. (default: None)
            with_replacement (bool): whether samples can be used multiple times
                for matching. If set to False, the matching process will optimize
                the linear sum of distances between pairs of treatment and
                control samples and only `min(N_treatment, N_control)` samples
                will be estimated. Matching with no replacement does not make
                use of the `fit` data and is therefore not implemented for
                out-of-sample data (default: True)
            n_neighbors (int) : number of nearest neighbors to include in match.
                Must be 1 if `with_replacement` is `False.` If larger than 1, the
                estimate is calculated using the `regress_agg_function` or 
                `classify_agg_function` across the `n_neighbors`. Note that when
                the `caliper` variable is set, some samples will have fewer than
                `n_neighbors` matches. (default: 1).
            matching_mode (str) : Direction of matching: `treatment_to_control`,
                `control_to_treatment` or `both` to indicate which set should
                be matched to which. All sets are cross-matched in `match`
                and when `with_replacement` is `False` all matching modes 
                coincide. With replacement there is a difference.
            metric (str) : Distance metric string for calculating distance
                between samples. Note: if an external built `knn_backend`
                object with a different metric is supplied, `metric` needs to
                be changed to reflect that, because `Matching` will set its 
                inverse covariance matrix if "mahalanobis" is set. (default: 
                "mahalanobis", also supported: "euclidean")
            knn_backend (str or callable) : Backend to use for nearest neighbor
                search. Options are "sklearn"  or a callable  which returns an 
                object implementing `fit`, `kneighbors` and `set_params` 
                like the sklearn `NearestNeighbors` object. (default: "sklearn"). 
            estimate_observed_outcome (bool) : Whether to allow a match of a
                sample to a sample other than itself when looking within its own
                treatment value. If True, the estimated potential outcome for the
                observed outcome may differ from the true observed outcome.
                (default: False)

        Attributes:
            classify_agg_function (callable) : Aggregating function for outcome
                estimation when classifying. (default: majority_rule)
                Usage is determined by type of `y` during `fit`
            regress_agg_function (callable) : Aggregating function for outcome
                estimation when regressing or predicting prob_a. (default: np.mean)
                Usage is determined by type of `y` during `fit`
            treatments_ (pd.DataFrame) : DataFrame of treatments (created after `fit`)
            outcomes_ (pd.DataFrame) : DataFrame of outcomes (created after `fit`)
            match_df_ (pd.DataFrame) : Dataframe of most recently calculated
                matches. For details, see `match`. (created after `match`)
            samples_used_ (pd.Series) : Series with count of samples used
                during most recent match. Series includes a count for each
                treatment value. (created after `match`)
        """
        self.propensity_transform = propensity_transform
        self.covariance_conditioner = EmpiricalCovariance()
        self.caliper = caliper
        self.with_replacement = with_replacement
        self.n_neighbors = n_neighbors
        self.matching_mode = matching_mode
        self.metric = metric
        # if classify task, default aggregation function is majority
        self.classify_agg_function = majority_rule
        # if regress task,  default aggregation function is mean
        self.regress_agg_function = np.mean
        self.knn_backend = knn_backend
        self.estimate_observed_outcome = estimate_observed_outcome
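One way the "mahalanobis" metric described in the docstring above can be wired into scikit-learn's nearest-neighbor search (a sketch, not the causallib internals): fit the covariance conditioner on the covariates and pass its precision matrix as the metric's VI parameter.

import numpy as np
from sklearn.covariance import EmpiricalCovariance
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X_cov = rng.randn(100, 3)                                  # sample covariates
conditioner = EmpiricalCovariance().fit(X_cov)
nn = NearestNeighbors(n_neighbors=1, metric='mahalanobis', algorithm='brute',
                      metric_params={'VI': conditioner.precision_})
nn.fit(X_cov)
dist, idx = nn.kneighbors(X_cov[:5])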
n_features = 2

# generate data
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# add some outliers
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = pl.figure()
pl.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
Example #30
    def fit(self, X, y=None):
        """Fits the GraphLasso covariance model to X.
        
        Closely follows sklearn.covariance.graph_lasso.GraphLassoCV.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Data from which to compute the covariance estimate
        """
        # initialize
        X = check_array(X, ensure_min_features=2, estimator=self)
        X = as_float_array(X, copy=False, force_all_finite=False)
        cv = check_cv(self.cv, X, y, classifier=False)
        self.init_coefs(X)

        # get path
        if isinstance(self.lams, collections.Sequence):
            path = self.lams
            n_refinements = 1
        else:
            n_refinements = self.n_refinements
            lam_1 = self.lam_scale_
            lam_0 = 1e-2 * lam_1
            path = np.logspace(np.log10(lam_0), np.log10(lam_1), self.lams)[::-1]

        # run this thing a bunch
        results = list()
        t0 = time.time()
        for rr in range(n_refinements):
            # parallel version
            this_result = Parallel(
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            )(
                delayed(_quic_path)(
                    X[train],
                    path,
                    X_test=X[test],
                    lam=self.lam, tol=self.tol, max_iter=self.max_iter, 
                    Theta0=self.Theta0, Sigma0=self.Sigma0, method=self.method,
                    verbose=self.verbose, score_metric=self.score_metric,
                    init_method=self.init_method)
                for train, test in cv)

            # Little dance to transform the list into what we need
            covs, _, scores = zip(*this_result)
            covs = zip(*covs)
            scores = zip(*scores)
            results.extend(zip(path, scores, covs))
            results = sorted(results, key=operator.itemgetter(0), reverse=True)

            # Find the maximum (avoid using the built-in 'max' function to
            # have a fully-reproducible selection of the smallest alpha
            # in case of equality)
            best_score = -np.inf
            last_finite_idx = 0
            for index, (lam, scores, _) in enumerate(results):
                # sometimes we get -np.inf in the result (in kl-loss)
                scores = [s for s in scores if not np.isinf(s)]
                if len(scores) == 0:
                    this_score = -np.inf 
                else:
                    this_score = np.mean(scores)

                if this_score >= .1 / np.finfo(np.float64).eps:
                    this_score = np.nan

                if np.isfinite(this_score):
                    last_finite_idx = index
                
                if this_score >= best_score:
                    best_score = this_score
                    best_index = index

            # Refine the grid
            if best_index == 0:
                # We do not need to go back: we have chosen
                # the highest value of lambda for which there are
                # non-zero coefficients
                lam_1 = results[0][0]
                lam_0 = results[1][0]
            
            elif (best_index == last_finite_idx
                    and not best_index == len(results) - 1):
                # We have non-converged models on the upper bound of the
                # grid, we need to refine the grid there
                lam_1 = results[best_index][0]
                lam_0 = results[best_index + 1][0]
            
            elif best_index == len(results) - 1:
                lam_1 = results[best_index][0]
                lam_0 = 0.01 * results[best_index][0]
            
            else:
                lam_1 = results[best_index - 1][0]
                lam_0 = results[best_index + 1][0]

            if not isinstance(self.lams, collections.Sequence):
                path = np.logspace(np.log10(lam_1), np.log10(lam_0),
                                     self.lams + 2)
                path = path[1:-1]

            if self.verbose and n_refinements > 1:
                print('[GraphLassoCV] Done refinement % 2i out of %i: % 3is'
                      % (rr + 1, n_refinements, time.time() - t0))

        results = list(zip(*results))
        grid_scores = list(results[1])
        lams = list(results[0])
        
        # Finally, compute the score with lambda = 0
        lams.append(0)
        grid_scores.append(cross_val_score(EmpiricalCovariance(), X,
                                           cv=cv, n_jobs=self.n_jobs))
        self.grid_scores = np.array(grid_scores)
        self.lam_ = self.lam * lams[best_index]
        self.cv_lams_ = [self.lam * l for l in lams]

        # Finally fit the model with the selected lambda
        if self.method == 'quic':
            (self.precision_, self.covariance_, self.opt_, self.cputime_, 
            self.iters_, self.duality_gap_) = quic(self.sample_covariance_,
                                                self.lam_,
                                                mode='default',
                                                tol=self.tol,
                                                max_iter=self.max_iter,
                                                Theta0=self.Theta0,
                                                Sigma0=self.Sigma0,
                                                path=None,
                                                msg=self.verbose)
        else:
            raise NotImplementedError(
                "Only method='quic' has been implemented.")

        self.is_fitted = True
        return self
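The lambda = 0 entry appended to the grid above simply cross-validates the unregularized maximum-likelihood estimate; in isolation that baseline looks like this (a sketch on synthetic Gaussian data):

import numpy as np
from sklearn.covariance import EmpiricalCovariance
from sklearn.model_selection import cross_val_score

X = np.random.RandomState(0).randn(300, 5)
baseline = cross_val_score(EmpiricalCovariance(), X, cv=5)   # log-likelihood per fold
print(baseline.mean())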
        outliers_offset = 10. * \
            (np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
        X[outliers_index] += outliers_offset
        inliers_mask = np.ones(n_samples).astype(bool)
        inliers_mask[outliers_index] = False

        # fit a Minimum Covariance Determinant (MCD) robust estimator to data
        mcd = MinCovDet().fit(X)
        # compare raw robust estimates with the true location and covariance
        err_loc_mcd[i, j] = np.sum(mcd.location_**2)
        err_cov_mcd[i, j] = mcd.error_norm(np.eye(n_features))

        # compare estimators learned from the full data set with true
        # parameters
        err_loc_emp_full[i, j] = np.sum(X.mean(0)**2)
        err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm(
            np.eye(n_features))

        # compare with an empirical covariance learned from a pure data set
        # (i.e. "perfect" mcd)
        pure_X = X[inliers_mask]
        pure_location = pure_X.mean(0)
        pure_emp_cov = EmpiricalCovariance().fit(pure_X)
        err_loc_emp_pure[i, j] = np.sum(pure_location**2)
        err_cov_emp_pure[i, j] = pure_emp_cov.error_norm(np.eye(n_features))

# Display results
font_prop = matplotlib.font_manager.FontProperties(size=11)
plt.subplot(2, 1, 1)
lw = 2
plt.errorbar(range_n_outliers,
             err_loc_mcd.mean(1),
Example #32
def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert (np.amin(mahal_dist) > 0)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # FIXME I don't know what this test does
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    assert_warns(UserWarning, cov.fit, X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
Example #33
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert(np.amax(mahal_dist) < 250)
    assert(np.amin(mahal_dist) > 50)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
Example #34
 def _sample_covariance(self, X):
     return EmpiricalCovariance().fit(X).covariance_
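A hedged sketch of the ConnectivityMeasure API exercised by the tests below, assuming nilearn is installed (the import path may differ across versions):

import numpy as np
from nilearn.connectome import ConnectivityMeasure
from sklearn.covariance import EmpiricalCovariance

signals = [np.random.RandomState(k).randn(100, 10) for k in range(4)]
conn = ConnectivityMeasure(kind='partial correlation',
                           cov_estimator=EmpiricalCovariance())
matrices = conn.fit_transform(signals)      # shape (4, 10, 10)
print(matrices.shape)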
def test_connectivity_measure_outputs():
    n_subjects = 10
    n_features = 49
    n_samples = 200

    # Generate signals and compute covariances
    emp_covs = []
    ledoit_covs = []
    signals = []
    random_state = check_random_state(0)
    ledoit_estimator = LedoitWolf()
    for k in range(n_subjects):
        signal = random_state.randn(n_samples, n_features)
        signals.append(signal)
        signal -= signal.mean(axis=0)
        emp_covs.append((signal.T).dot(signal) / n_samples)
        ledoit_covs.append(ledoit_estimator.fit(signal).covariance_)

    kinds = ["correlation", "tangent", "precision", "partial correlation"]

    # Check outputs properties
    for cov_estimator, covs in zip(
        [EmpiricalCovariance(), LedoitWolf()], [emp_covs, ledoit_covs]):
        input_covs = copy.copy(covs)
        for kind in kinds:
            conn_measure = ConnectivityMeasure(kind=kind,
                                               cov_estimator=cov_estimator)
            connectivities = conn_measure.fit_transform(signals)

            # Generic
            assert_true(isinstance(connectivities, np.ndarray))
            assert_equal(len(connectivities), len(covs))

            for k, cov_new in enumerate(connectivities):
                assert_array_equal(input_covs[k], covs[k])
                assert (is_spd(covs[k], decimal=7))

                # Positive definiteness if expected and output value checks
                if kind == "tangent":
                    assert_array_almost_equal(cov_new, cov_new.T)
                    gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_)
                    assert (is_spd(gmean_sqrt, decimal=7))
                    assert (is_spd(conn_measure.whitening_, decimal=7))
                    assert_array_almost_equal(
                        conn_measure.whitening_.dot(gmean_sqrt),
                        np.eye(n_features))
                    assert_array_almost_equal(
                        gmean_sqrt.dot(_map_eigenvalues(
                            np.exp, cov_new)).dot(gmean_sqrt), covs[k])
                elif kind == "precision":
                    assert (is_spd(cov_new, decimal=7))
                    assert_array_almost_equal(cov_new.dot(covs[k]),
                                              np.eye(n_features))
                elif kind == "correlation":
                    assert (is_spd(cov_new, decimal=7))
                    d = np.sqrt(np.diag(np.diag(covs[k])))
                    if cov_estimator == EmpiricalCovariance():
                        assert_array_almost_equal(
                            d.dot(cov_new).dot(d), covs[k])
                    assert_array_almost_equal(np.diag(cov_new),
                                              np.ones((n_features)))
                elif kind == "partial correlation":
                    prec = linalg.inv(covs[k])
                    d = np.sqrt(np.diag(np.diag(prec)))
                    assert_array_almost_equal(
                        d.dot(cov_new).dot(d),
                        -prec + 2 * np.diag(np.diag(prec)))
def test_connectivity_measure_outputs():
    n_subjects = 10
    n_features = 49

    # Generate signals and compute covariances
    emp_covs = []
    ledoit_covs = []
    signals = []
    ledoit_estimator = LedoitWolf()
    for k in range(n_subjects):
        n_samples = 200 + k
        signal, _, _ = generate_signals(n_features=n_features,
                                        n_confounds=5,
                                        length=n_samples,
                                        same_variance=False)
        signals.append(signal)
        signal -= signal.mean(axis=0)
        emp_covs.append((signal.T).dot(signal) / n_samples)
        ledoit_covs.append(ledoit_estimator.fit(signal).covariance_)

    kinds = [
        "covariance", "correlation", "tangent", "precision",
        "partial correlation"
    ]

    # Check outputs properties
    for cov_estimator, covs in zip(
        [EmpiricalCovariance(), LedoitWolf()], [emp_covs, ledoit_covs]):
        input_covs = copy.copy(covs)
        for kind in kinds:
            conn_measure = ConnectivityMeasure(kind=kind,
                                               cov_estimator=cov_estimator)
            connectivities = conn_measure.fit_transform(signals)

            # Generic
            assert_true(isinstance(connectivities, np.ndarray))
            assert_equal(len(connectivities), len(covs))

            for k, cov_new in enumerate(connectivities):
                assert_array_equal(input_covs[k], covs[k])
                assert (is_spd(covs[k], decimal=7))

                # Positive definiteness if expected and output value checks
                if kind == "tangent":
                    assert_array_almost_equal(cov_new, cov_new.T)
                    gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_)
                    assert (is_spd(gmean_sqrt, decimal=7))
                    assert (is_spd(conn_measure.whitening_, decimal=7))
                    assert_array_almost_equal(
                        conn_measure.whitening_.dot(gmean_sqrt),
                        np.eye(n_features))
                    assert_array_almost_equal(
                        gmean_sqrt.dot(_map_eigenvalues(
                            np.exp, cov_new)).dot(gmean_sqrt), covs[k])
                elif kind == "precision":
                    assert (is_spd(cov_new, decimal=7))
                    assert_array_almost_equal(cov_new.dot(covs[k]),
                                              np.eye(n_features))
                elif kind == "correlation":
                    assert (is_spd(cov_new, decimal=7))
                    d = np.sqrt(np.diag(np.diag(covs[k])))
                    if type(cov_estimator) is EmpiricalCovariance:
                        assert_array_almost_equal(
                            d.dot(cov_new).dot(d), covs[k])
                    assert_array_almost_equal(np.diag(cov_new),
                                              np.ones((n_features)))
                elif kind == "partial correlation":
                    prec = linalg.inv(covs[k])
                    d = np.sqrt(np.diag(np.diag(prec)))
                    assert_array_almost_equal(
                        d.dot(cov_new).dot(d),
                        -prec + 2 * np.diag(np.diag(prec)))

    # Check the mean_
    for kind in kinds:
        conn_measure = ConnectivityMeasure(kind=kind)
        conn_measure.fit_transform(signals)
        assert_equal((conn_measure.mean_).shape, (n_features, n_features))
        if kind != 'tangent':
            assert_array_almost_equal(
                conn_measure.mean_,
                np.mean(conn_measure.transform(signals), axis=0))

    # Check that the mean isn't modified in transform
    conn_measure = ConnectivityMeasure(kind='covariance')
    conn_measure.fit(signals[:1])
    mean = conn_measure.mean_
    conn_measure.transform(signals[1:])
    assert_array_equal(mean, conn_measure.mean_)

    # Check vectorization option
    for kind in kinds:
        conn_measure = ConnectivityMeasure(kind=kind)
        connectivities = conn_measure.fit_transform(signals)
        conn_measure = ConnectivityMeasure(vectorize=True, kind=kind)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(vectorized_connectivities,
                                  sym_matrix_to_vec(connectivities))

    # Check not fitted error
    assert_raises_regex(ValueError, 'has not been fitted. ',
                        ConnectivityMeasure().inverse_transform,
                        vectorized_connectivities)

    # Check inverse transformation
    kinds.remove('tangent')
    for kind in kinds:
        # without vectorization: input matrices are returned with no change
        conn_measure = ConnectivityMeasure(kind=kind)
        connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(connectivities), connectivities)

        # with vectorization: input vectors are reshaped into matrices
        # if diagonal has not been discarded
        conn_measure = ConnectivityMeasure(kind=kind, vectorize=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(vectorized_connectivities),
            connectivities)

    # with vectorization if diagonal has been discarded
    for kind in ['correlation', 'partial correlation']:
        connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals)
        conn_measure = ConnectivityMeasure(kind=kind,
                                           vectorize=True,
                                           discard_diagonal=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(vectorized_connectivities),
            connectivities)

    for kind in ['covariance', 'precision']:
        connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals)
        conn_measure = ConnectivityMeasure(kind=kind,
                                           vectorize=True,
                                           discard_diagonal=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        diagonal = np.array(
            [np.diagonal(conn) / sqrt(2) for conn in connectivities])
        inverse_transformed = conn_measure.inverse_transform(
            vectorized_connectivities, diagonal=diagonal)
        assert_array_almost_equal(inverse_transformed, connectivities)
        assert_raises_regex(ValueError,
                            'can not reconstruct connectivity matrices',
                            conn_measure.inverse_transform,
                            vectorized_connectivities)

    # for 'tangent' kind, covariance matrices are reconstructed
    # without vectorization
    tangent_measure = ConnectivityMeasure(kind='tangent')
    displacements = tangent_measure.fit_transform(signals)
    covariances = ConnectivityMeasure(kind='covariance').fit_transform(signals)
    assert_array_almost_equal(tangent_measure.inverse_transform(displacements),
                              covariances)

    # with vectorization
    # when diagonal has not been discarded
    tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True)
    vectorized_displacements = tangent_measure.fit_transform(signals)
    assert_array_almost_equal(
        tangent_measure.inverse_transform(vectorized_displacements),
        covariances)

    # when diagonal has been discarded
    tangent_measure = ConnectivityMeasure(kind='tangent',
                                          vectorize=True,
                                          discard_diagonal=True)
    vectorized_displacements = tangent_measure.fit_transform(signals)
    diagonal = np.array(
        [np.diagonal(matrix) / sqrt(2) for matrix in displacements])
    inverse_transformed = tangent_measure.inverse_transform(
        vectorized_displacements, diagonal=diagonal)
    assert_array_almost_equal(inverse_transformed, covariances)
    assert_raises_regex(ValueError,
                        'can not reconstruct connectivity matrices',
                        tangent_measure.inverse_transform,
                        vectorized_displacements)
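For context, a minimal sketch of the vectorization round trip the test above exercises, assuming nilearn's public ConnectivityMeasure API; the random signals are placeholders.

import numpy as np
from nilearn.connectome import ConnectivityMeasure, sym_matrix_to_vec

rng = np.random.RandomState(0)
signals = [rng.randn(100, 5) for _ in range(3)]     # 3 subjects, 100 samples, 5 regions

conn = ConnectivityMeasure(kind='correlation', vectorize=True)
vec = conn.fit_transform(signals)                   # shape (3, 5 * 6 / 2) = (3, 15)
matrices = conn.inverse_transform(vec)              # back to shape (3, 5, 5)

# vectorize=True matches sym_matrix_to_vec applied to the full matrices
full = ConnectivityMeasure(kind='correlation').fit_transform(signals)
assert np.allclose(vec, sym_matrix_to_vec(full))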
Example #37
0
hPurity_disc.GetYaxis().SetRangeUser(0, 1.3)
hPurity_disc.Divide(hPurity_discDen)
hPurity_disc.Draw()
c.Print("purity_disc.png")

hMVAdisc_pt.Draw("colz")
c.Print("discriminator_vs_candPt.png")

from sklearn.covariance import EmpiricalCovariance

npRocInput = numpy.array(rocInput)
npRocAnswers = numpy.array(rocScore)
slimNpData0 = npRocInput[npRocAnswers == 0]
slimNpData1 = npRocInput[npRocAnswers == 1]

ecv = EmpiricalCovariance()
ecv.fit(slimNpData0)

from scipy.linalg import fractional_matrix_power


def diagElements(m):
    size = m.shape[0]
    return numpy.matrix(numpy.diag([m[i, i] for i in range(size)]))


def corrMat(m):
    sqrt_diag = fractional_matrix_power(diagElements(m), -0.5)
    return numpy.array(sqrt_diag * m * sqrt_diag)
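
A small usage sketch (hedged): the helpers above rely on numpy.matrix semantics for their matrix products, so the fitted covariance is wrapped accordingly before converting it to a correlation matrix for the class-0 candidates.

cov0 = numpy.matrix(ecv.covariance_)   # covariance fitted on slimNpData0 above
corr0 = corrMat(cov0)                  # rescaled to pairwise correlations (unit diagonal)
print(corr0.shape)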

Example #38
0
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert(np.amax(mahal_dist) < 250)
    assert(np.amin(mahal_dist) > 50)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
Example #39
0
 def __init__(self, cov_estimator=EmpiricalCovariance(assume_centered=True),
              kind='covariance'):
     self.cov_estimator = cov_estimator
     self.kind = kind
Example #40
0
class Matching(IndividualOutcomeEstimator):
    def __init__(
        self,
        propensity_transform=None,
        caliper=None,
        with_replacement=True,
        n_neighbors=1,
        matching_mode="both",
        metric="mahalanobis",
        knn_backend="sklearn",
        estimate_observed_outcome=False,
    ):
        """Match treatment and control samples with similar covariates.

        Args:
            propensity_transform (causallib.transformers.PropensityTransformer):
                an object for data preprocessing which adds the propensity
                score as a feature (default: None)
            caliper (float) : maximal distance for a match to be accepted. If
                not defined, all matches will be accepted. If defined, some
                samples may not be matched and their outcomes will not be
                estimated. (default: None)
            with_replacement (bool): whether samples can be used multiple times
                for matching. If set to False, the matching process will optimize
                the linear sum of distances between pairs of treatment and
                control samples and only `min(N_treatment, N_control)` samples
                will be estimated. Matching with no replacement does not make
                use of the `fit` data and is therefore not implemented for
                out-of-sample data (default: True)
            n_neighbors (int) : number of nearest neighbors to include in match.
                Must be 1 if `with_replacement` is `False`. If larger than 1, the
                estimate is calculated using the `regress_agg_function` or 
                `classify_agg_function` across the `n_neighbors`. Note that when
                the `caliper` variable is set, some samples will have fewer than
                `n_neighbors` matches. (default: 1).
            matching_mode (str) : Direction of matching: `treatment_to_control`,
                `control_to_treatment` or `both` to indicate which set should
                be matched to which. All sets are cross-matched in `match`
                and when `with_replacement` is `False` all matching modes 
                coincide. With replacement there is a difference.
            metric (str) : Distance metric string for calculating distance
                between samples. Note: if an external built `knn_backend`
                object with a different metric is supplied, `metric` needs to
                be changed to reflect that, because `Matching` will set its 
                inverse covariance matrix if "mahalanobis" is set. (default: 
                "mahalanobis", also supported: "euclidean")
            knn_backend (str or callable) : Backend to use for nearest neighbor
                search. Options are "sklearn"  or a callable  which returns an 
                object implementing `fit`, `kneighbors` and `set_params` 
                like the sklearn `NearestNeighbors` object. (default: "sklearn"). 
            estimate_observed_outcome (bool) : Whether to allow a match of a
                sample to a sample other than itself when looking within its own
                treatment value. If True, the estimated potential outcome for the
                observed outcome may differ from the true observed outcome.
                (default: False)

        Attributes:
            classify_agg_function (callable) : Aggregating function for outcome
                estimation when classifying. (default: majority_rule)
                Usage is determined by type of `y` during `fit`
            regress_agg_function (callable) : Aggregating function for outcome
                estimation when regressing or predicting prob_a. (default: np.mean)
                Usage is determined by type of `y` during `fit`
            treatments_ (pd.DataFrame) : DataFrame of treatments (created after `fit`)
            outcomes_ (pd.DataFrame) : DataFrame of outcomes (created after `fit`)
            match_df_ (pd.DataFrame) : Dataframe of most recently calculated
                matches. For details, see `match`. (created after `match`)
            samples_used_ (pd.Series) : Series with count of samples used
                during most recent match. Series includes a count for each
                treatment value. (created after `match`)
        """
        self.propensity_transform = propensity_transform
        self.covariance_conditioner = EmpiricalCovariance()
        self.caliper = caliper
        self.with_replacement = with_replacement
        self.n_neighbors = n_neighbors
        self.matching_mode = matching_mode
        self.metric = metric
        # if classify task, default aggregation function is majority
        self.classify_agg_function = majority_rule
        # if regress task,  default aggregation function is mean
        self.regress_agg_function = np.mean
        self.knn_backend = knn_backend
        self.estimate_observed_outcome = estimate_observed_outcome

    def fit(self, X, a, y, sample_weight=None):
        """Load the treatments and outcomes and fit search trees.

        Applies transform to covariates X, initializes search trees for each
        treatment value for performing nearest neighbor searches.
        Note: Running `fit` a second time overwrites any information from
        previous `fit` or `match` and re-fits the propensity_transform object.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcomes for
                the n samples.
            sample_weight: IGNORED In signature for compatibility with other
                estimators.


        Note: `X`, `a` and `y` must share the same index.

        Returns:
            self (Matching) the fitted object
        """
        self._clear_post_fit_variables()
        self.outcome_ = y.copy()
        self.treatments_ = a.copy()

        if self.propensity_transform:
            self.propensity_transform.fit(X, a)
            X = self.propensity_transform.transform(X)

        self.conditioned_covariance_ = self._calculate_covariance(X)

        self.treatment_knns_ = {}
        for a in self.treatments_.unique():
            haystack = X[self.treatments_ == a]
            self.treatment_knns_[a] = self._fit_sknn(haystack)

        return self

    def _execute_matching(self, X, a):
        """Execute matching of samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control 
        samples from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        This function ignores the existing `match_df_` and will overwrite it.
        It is thus useful if you have changed the settings and need to
        rematch the samples. For most applications, the `match` function is
        more convenient.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              ` match_df.loc[treatment_value, sample_id]` has columns `matches`
               and `distances` containing lists of indices to samples and the
               respective distances for the matches discovered for `sample_id`
               from within the fitted samples with the given `treatment_value`.
               The indices in the `matches` column are from the fitted data,
               not the X argument in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n * len(a.unique()), 2).

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        if self.n_neighbors != 1 and not self.with_replacement:
            raise NotImplementedError(
                "Matching more than one neighbor is only implemented for"
                "no-replacement")

        if self.propensity_transform:
            X = self.propensity_transform.transform(X)
        if self.with_replacement:
            self.match_df_ = self._withreplacement_match(X, a)
        else:
            self.match_df_ = self._noreplacement_match(X, a)
        sample_id_name = X.index.name if X.index.name is not None else "sample_id"
        self.match_df_.index.set_names(["match_to_treatment", sample_id_name],
                                       inplace=True)
        # we record the number of samples that were successfully matched of
        # each treatment value
        self.samples_used_ = self._count_samples_used_by_treatment_value(a)

        return self.match_df_

    def estimate_individual_outcome(self,
                                    X,
                                    a,
                                    y=None,
                                    treatment_values=None,
                                    predict_proba=True,
                                    dropna=True):
        """
        Calculate the potential outcome for each sample and treatment value.

        Execute match and calculate, for each treatment value and each sample,
        the expected outcome. 

        Note: Out of sample estimation for matching without replacement requires
        passing a `y` vector here. If no 'y' is passed here, the values received
        by `fit` are used, and if the estimation indices are not a subset of the 
        fitted indices, the estimation will fail.

        If the attribute `estimate_observed_outcome` is 
        `True`, estimates will be calculated for the observed outcomes as well.
        If not, then the observed outcome will be passed through from the 
        corresponding element of `y` passed to `fit`.


        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcome values for
                n samples. This is only used when `with_replacement=False`.
                Otherwise, the outcome values passed to `fit` are used.
            predict_proba (bool) : whether to output classifications or
                probabilties for a classification task. If set to False and
                data is non-integer, a warning is issued. (default True)
            dropna (bool) : For samples that were unmatched due to caliper
                restrictions, drop from outcome_df leading to a potentially
                smaller sized output, or include them as NaN. (default: True)
            treatment_values : IGNORED

        Note: The args are assumed to share the same index.

        Returns:
            outcome_df (pd.DataFrame)
        """
        match_df = self.match(X, a, use_cached_result=True)

        outcome_df = self._aggregate_match_df_to_generate_outcome_df(
            match_df, a, predict_proba)
        outcome_df = self._filter_outcome_df_by_matching_mode(outcome_df, a)
        if outcome_df.isna().all(axis=None):
            raise ValueError("Matching was not successful and no outcomes can"
                             "be estimated. Check caliper value.")
        if dropna:
            outcome_df = outcome_df.dropna()

        return outcome_df

    def match(self,
              X,
              a,
              use_cached_result=True,
              successful_matches_only=False):
        """Matching the samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control 
        samples from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            use_cached_result (bool): Whether or not to return the `match_df` 
                from the most recent matching operation. The cached result will
                only be used if the sample indices of `X` and those of `match_df`
                are identical, otherwise it will rematch.
            successful_matches_only (bool): Whether or not to filter the matches
                to those which matched successfully. If set to `False`, the
                resulting DataFrame will have shape (n* len(a.unique()), 2 ),
                otherwise it may have a smaller shape due to unsuccessful matches.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              ` match_df.loc[treatment_value, sample_id]` has columns `matches`
               and `distances` containing lists of indices to samples and the
               respective distances for the matches discovered for `sample_id`
               from within the fitted samples with the given `treatment_value`.
               The indices in the `matches` column are from the fitted data,
               not the X argument in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n * len(a.unique()), 2), if
               `successful_matches_only` is set to `False`.

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        cached_result_available = (hasattr(self, "match_df_") and
                                   X.index.equals(self.match_df_.loc[0].index))
        if not (use_cached_result and cached_result_available):
            self._execute_matching(X, a)

        return self._get_match_df(
            successful_matches_only=successful_matches_only)

    def matches_to_weights(self, match_df=None):
        """Calculate weights based on a given set of matches.

        For each matching from one treatment value to another, a weight vector
        is generated. The weights are calculated as the number of times a
        sample was selected in a matching, with each occurrence weighted
        according to the number of other samples in that matching. The weights
        can be used to estimate outcomes or to check covariate balancing. The 
        function can only be called after `match` has been run.

        Args:
            match_df (pd.DataFrame) : a DataFrame of matches returned from
                `match`. If not supplied, will use the `match_df_` attribute if
                available, else raises ValueError. Will not execute `match` to
                generate a `match_df`.

        Returns:
            weights_df (pd.DataFrame): DataFrame of shape (n,M) where M is the
                number of permutations of `a.unique()`.
        """
        if match_df is None:
            match_df = self._get_match_df(successful_matches_only=False)

        match_permutations = sorted(permutations(self.treatments_.unique()))
        weights_df = pd.DataFrame([
            self._matches_to_weights_single_matching(s, t, match_df)
            for s, t in match_permutations
        ], ).T

        return weights_df

    def get_covariates_of_matches(self, s, t, covariates):
        """
        Look up covariates of closest matches for a given matching.

        Using `self.match_df_` and the supplied `covariates`, look up
        the covariates of the last match. The function can only be called after
        `match` has been run.

            Args:
                s (int) : source treatment value
                t (int) : target treatment value
                covariates (pd.DataFrame) : The same covariates which were
                   passed to `fit`.

            Returns:
                covariate_df (pd.DataFrame) : a DataFrame of size
                (n_matched_samples, n_covariates * 3 + 2) with the covariate
                values of the sample, covariates of its match, calculated
                distance and number of neighbors found within the given
                caliper (with no caliper this will equal self.n_neighbors )

        """
        match_df = self._get_match_df()
        subdf = match_df.loc[s][self.treatments_ == t]
        sample_id_name = subdf.index.name

        def get_covariate_difference_from_nearest_match(source_row_index):
            j = subdf.loc[source_row_index].matches[0]
            delta_series = pd.Series(covariates.loc[source_row_index] -
                                     covariates.loc[j])
            source_row = covariates.loc[j].copy()
            source_row.at[sample_id_name] = j
            target_row = covariates.loc[source_row_index].copy()
            target_row = target_row
            covariate_differences = pd.concat({
                t:
                target_row,
                s:
                source_row,
                "delta":
                delta_series,
                "outcomes":
                pd.Series({
                    t: self.outcome_.loc[source_row_index],
                    s: self.outcome_.loc[j]
                }),
                "match":
                pd.Series(
                    dict(
                        n_neighbors=len(subdf.loc[source_row_index].matches),
                        distance=subdf.loc[source_row_index].distances[0],
                    )),
            })
            return covariate_differences

        covdf = pd.DataFrame(data=[
            get_covariate_difference_from_nearest_match(i) for i in subdf.index
        ],
                             index=subdf.index)
        covdf = covdf.reset_index()
        cols = covdf.columns
        covdf.columns = pd.MultiIndex.from_tuples([(t, sample_id_name)] +
                                                  list(cols[1:]))
        return covdf

    def _clear_post_fit_variables(self):
        for var in list(vars(self)):
            if var[-1] == "_":
                self.__delattr__(var)

    def _calculate_covariance(self, X):
        if len(X.shape) > 1 and X.shape[1] > 1:
            V_list = []
            for a in self.treatments_.unique():
                X_at_a = X[self.treatments_ == a].copy()
                current_V = self.covariance_conditioner.fit(X_at_a).covariance_
                V_list.append(current_V)
            # following Imbens&Rubin, we average across treatment groups
            V = np.mean(V_list, axis=0)
        else:
            # for 1d data revert to euclidean metric
            V = np.array(1).reshape(1, 1)
        return V

    def _aggregate_match_df_to_generate_outcome_df(self, match_df, a,
                                                   predict_proba):
        agg_function = self._get_agg_function(predict_proba)

        def outcome_from_matches_by_idx(x):
            return agg_function(self.outcome_.loc[x])

        outcomes = {}
        for i in sorted(a.unique()):
            outcomes[i] = match_df.loc[i].matches.apply(
                outcome_from_matches_by_idx)
        outcome_df = pd.DataFrame(outcomes)
        return outcome_df

    def _get_match_df(self, successful_matches_only=True):
        if not hasattr(self, "match_df_") or self.match_df_ is None:
            raise NotFittedError("You need to run `match` first")
        match_df = self.match_df_.copy()
        if successful_matches_only:
            match_df = match_df[match_df.matches.apply(bool)]
        if match_df.empty:
            raise ValueError(
                "Matching was not successful and no outcomes can be "
                "estimated. Check caliper value.")
        return match_df

    def _filter_outcome_df_by_matching_mode(self, outcome_df, a):
        if self.matching_mode == "treatment_to_control":
            outcome_df = outcome_df[a == 1]
        elif self.matching_mode == "control_to_treatment":
            outcome_df = outcome_df[a == 0]
        elif self.matching_mode == "both":
            pass
        else:
            raise NotImplementedError(
                "Matching mode {} is not implemented. Please select one of "
                "'treatment_to_control', 'control_to_treatment, "
                "or 'both'.".format(self.matching_mode))
        return outcome_df

    def _get_agg_function(self, predict_proba):
        if predict_proba:
            agg_function = self.regress_agg_function
        else:
            agg_function = self.classify_agg_function
            try:
                isoutputinteger = np.allclose(self.outcome_.apply(int),
                                              self.outcome_)
                if not isoutputinteger:
                    warnings.warn("Classifying non-categorical outcomes. "
                                  "This is probably a mistake.")
            except:
                warnings.warn(
                    "Unable to detect whether outcome is integer-like. ")
        return agg_function

    def _instantiate_nearest_neighbors_object(self):
        backend = self.knn_backend
        if backend == "sklearn":
            backend_instance = NearestNeighbors(algorithm="auto")
        elif callable(backend):
            backend_instance = backend()
            self.metric = backend_instance.metric
        elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"):
            backend_instance = sk_clone(backend)
            self.metric = backend_instance.metric
        else:
            raise NotImplementedError(
                "`knn_backend` must be either an NearestNeighbors-like object,"
                " a callable returning such an object, or the string \"sklearn\""
            )
        backend_instance.set_params(**self._get_metric_dict())
        return backend_instance

    def _fit_sknn(self, target_df):
        """
        Fit scikit-learn NearestNeighbors object with samples in target_df.

        Fits the object, adds metric parameters and returns a namedtuple which
        also includes the DataFrame index so that identities can be looked up.

        Args:
            target_df (pd.DataFrame) : DataFrame of covariates to fit

        Returns:
            KNN (namedtuple) : Namedtuple with members `learner` and `index`
            containing the fitted sklearn object and an index lookup vector,
            respectively.
        """
        target_array = target_df.values

        sknn = self._instantiate_nearest_neighbors_object()

        target_array = self._ensure_array_columnlike(target_array)

        sknn.fit(target_array)
        return KNN(sknn, target_df.index)

    @staticmethod
    def _ensure_array_columnlike(target_array):
        if len(target_array.shape) < 2 or target_array.shape[1] == 1:
            target_array = target_array.reshape(-1, 1)
        return target_array

    def _get_metric_dict(
        self,
        VI_in_metric_params=True,
    ):
        metric_dict = dict(metric=self.metric)
        if self.metric == "mahalanobis":
            VI = np.linalg.inv(self.conditioned_covariance_)
            if VI_in_metric_params:
                metric_dict["metric_params"] = {"VI": VI}
            else:
                metric_dict["VI"] = VI

        return metric_dict

    def _kneighbors(self, knn, source_df):
        """Lookup neighbors in knn object.

        Args:
           knn (namedtuple) : knn named tuple to look for neighbors in. The
               object has `learner` and `index` attributes to reference the
               original df index.
           source_df (pd.DataFrame) : a DataFrame of source data points to use
               as "needles" for the knn "haystack."

        Returns:
            match_df (pd.DataFrame) : a DataFrame of matches
        """
        source_array = source_df.values
        # 1d data must be in shape (-1, 1) for sklearn.knn
        source_array = self._ensure_array_columnlike(source_array)

        distances, neighbor_array_indices = knn.learner.kneighbors(
            source_array, n_neighbors=self.n_neighbors)

        return self._generate_match_df(source_df, knn.index, distances,
                                       neighbor_array_indices)

    def _generate_match_df(self, source_df, target_df_index, distances,
                           neighbor_array_indices):
        """
        Take results of matching and build into match_df DataFrame.

        For clarity we'll call the samples that are being matched "needles" and
        the set of samples that they looked for matches in the "haystack".

        Args:
            source_df (pd.DataFrame) : Covariate dataframe of N "needles"
            target_df_index (np.array) : An array of M indices of the haystack
                samples in their original dataframe.
            distances (np.array) : An array of N arrays of floats of length K
                where K is `self.n_neighbors`.
            neighbor_array_indices (np.array) : An array of N arrays of ints of
                length K where K is `self.n_neighbors`.
        """
        # target is the haystack, source is the needle(s)
        # translate array indices back to original indices
        matches_dict = {}
        for source_idx, distance_row, neighbor_array_index_row in zip(
                source_df.index, distances, neighbor_array_indices):
            neighbor_df_indices = \
                target_df_index[neighbor_array_index_row.flatten()]
            if self.caliper is not None:
                neighbor_df_indices = [
                    n for i, n in enumerate(neighbor_df_indices)
                    if distance_row[i] < self.caliper
                ]
                distance_row = [d for d in distance_row if d < self.caliper]
            matches_dict[source_idx] = dict(matches=list(neighbor_df_indices),
                                            distances=list(distance_row))
        # convert dict of dicts like { 1: {'matches':[], 'distances':[]}} to df
        return pd.DataFrame(matches_dict).T

    def _matches_to_weights_single_matching(self, s, t, match_df):
        """
        For a given match, calculate the resulting weight vector.

        The weight vector adds a count each time a sample is used, weighted by
        the number of other neighbors when it was used. This is necessary to
        make the weighted sum return the correct effect estimate.
        """
        weights = pd.Series(self.treatments_.copy() * 0)
        name = {0: "control", 1: "treatment"}
        weights.name = "{s}_to_{t}".format(s=name[s], t=name[t])
        s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches
        for source_idx, matches_list in s_to_t_matches.items():
            if matches_list:
                weights.loc[source_idx] += 1
            for match in matches_list:
                weights.loc[match] += 1 / len(matches_list)
        return weights

    def _get_distance_matrix(self, source_df, target_df):
        """
        Create distance matrix for no replacement match.

        Combines metric, caliper and source/target data into a
        precalculated distance matrix which can be passed to
        scipy.optimize.linear_sum_assignment.
        """

        cdist_args = dict(
            XA=self._ensure_array_columnlike(source_df.values),
            XB=self._ensure_array_columnlike(target_df.values),
        )
        cdist_args.update(self._get_metric_dict(False))
        distance_matrix = distance.cdist(**cdist_args)

        if self.caliper is not None:
            distance_matrix[distance_matrix > self.caliper] = VERY_LARGE_NUMBER
        return distance_matrix

    def _withreplacement_match(self, X, a):
        matches = {}  # maps treatment value to list of matches TO that value

        for treatment_value, knn in self.treatment_knns_.items():
            matches[treatment_value] = self._kneighbors(knn, X)
            # when producing potential outcomes we may want to force the
            # value of the observed outcome to be the actual observed
            # outcome, and not an average of the k nearest samples.
            if not self.estimate_observed_outcome:

                def limit_within_treatment_matches_to_self_only(row):
                    if (a.loc[row.name] == treatment_value
                            and row.name in row.matches):
                        row.matches = [row.name]
                        row.distances = [0]
                    return row

                matches[treatment_value] = matches[treatment_value].apply(
                    limit_within_treatment_matches_to_self_only, axis=1)

        return pd.concat(matches, sort=True)

    def _noreplacement_match(self, X, a):

        match_combinations = sorted(combinations(a.unique(), 2))
        matches = {}

        for s, t in match_combinations:
            distance_matrix = self._get_distance_matrix(X[a == s], X[a == t])
            source_array, neighbor_array_indices, distances = \
                self._optimally_match_distance_matrix(distance_matrix)
            source_df = X[a == s].iloc[np.array(source_array)]
            target_df = X[a == t].iloc[np.array(neighbor_array_indices)]
            if t in matches or s in matches:
                warnings.warn("No-replacement matching for more than "
                              "2 treatment values is not supported")

            matches[t] = self._create_match_df_for_no_replacement(
                a, source_df, target_df, distances)
            matches[s] = self._create_match_df_for_no_replacement(
                a, target_df, source_df, distances)

        match_df = pd.concat(matches, sort=True)
        return match_df

    def _optimally_match_distance_matrix(self, distance_matrix):
        source_array, neighbor_array_indices = linear_sum_assignment(
            distance_matrix)
        distances = [[
            distance_matrix[s_idx, t_idx]
        ] for s_idx, t_idx in zip(source_array, neighbor_array_indices)]
        source_array, neighbor_array_indices, distances = \
            self._filter_noreplacement_matches_using_caliper(
                source_array, neighbor_array_indices, distances)
        return source_array, neighbor_array_indices, distances

    def _filter_noreplacement_matches_using_caliper(self, source_array,
                                                    neighbor_array_indices,
                                                    distances):
        if self.caliper is None:
            return source_array, neighbor_array_indices, distances
        keep_indices = [
            i for i, d in enumerate(distances) if d[0] <= self.caliper
        ]
        source_array = source_array[keep_indices]
        neighbor_array_indices = neighbor_array_indices[keep_indices]
        distances = [distances[i] for i in keep_indices]
        if not keep_indices:
            warnings.warn("No matches found, check caliper."
                          "No estimation possible.")
        return source_array, neighbor_array_indices, distances

    @staticmethod
    def _create_match_df_for_no_replacement(base_series, source_df, target_df,
                                            distances):
        match_sub_df = pd.DataFrame(
            index=base_series.index,
            columns=[
                "matches",
                "distances",
            ],
            data=base_series.apply(lambda x: pd.Series([[], []])).values,
            dtype="object",
        )

        # matching from source to target: read distances
        match_sub_df.loc[source_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=distances,
            ),
            index=source_df.index,
        )

        # matching from target to target: fill with zeros
        match_sub_df.loc[target_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=[[0]] * len(distances),
            ),
            index=target_df.index,
        )
        return match_sub_df

    def _count_samples_used_by_treatment_value(self, a):
        # we record the number of samples that were successfully matched of
        # each treatment value
        samples_used = {
            treatment_value: self.match_df_.loc[treatment_value][
                a != treatment_value].matches.apply(bool).sum()
            for treatment_value in sorted(a.unique(), reverse=True)
        }

        return pd.Series(samples_used)
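A minimal usage sketch for the Matching estimator above (hedged: synthetic pandas data, default settings, and it assumes the causallib dependencies the class imports, such as IndividualOutcomeEstimator and majority_rule, are available as in the original module).

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 200
X = pd.DataFrame(rng.randn(n, 3), columns=["x1", "x2", "x3"])
a = pd.Series(rng.binomial(1, 0.5, size=n), index=X.index, name="treatment")
y = pd.Series(X["x1"] + a + rng.randn(n), index=X.index, name="outcome")

matcher = Matching(metric="mahalanobis", with_replacement=True)
matcher.fit(X, a, y)
potential_outcomes = matcher.estimate_individual_outcome(X, a)  # one column per treatment value
att = (potential_outcomes[1] - potential_outcomes[0])[a == 1].mean()
weights = matcher.matches_to_weights()                          # one weight vector per matching direction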
Example #41
0
def clipped(X, alpha=None, return_covariance=False):
    """Clips the eigenvalues of an empirical correlation matrix E 
       in order to provide a cleaned estimator E_clipped of the 
       underlying correlation matrix.
       Proceeds by keeping the [N * alpha] top eigenvalues and shrinking
       the remaining ones by a trace-preserving constant 
       (i.e. Tr(E_clipped) = Tr(E)).

       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).

       alpha: type float or derived from numbers.Real (default: None)
           Parameter between 0 and 1, inclusive, determining the fraction
           to keep of the top eigenvalues of an empirical correlation matrix.

           If left unspecified, alpha is chosen so as to keep all the
           empirical eigenvalues greater than the upper edge of
           the support of the Marcenko-Pastur spectrum. Indeed, such
           eigenvalues can be considered as associated with some signal,
           whereas the ones falling inside the Marcenko-Pastur range
           should be considered as corrupted with noise and indistinguishable
           from the spectrum of the correlation of a random matrix.

           This ignores finite-size effects that make it possible
           for the eigenvalues to exceed the upper and lower edges
           defined by the Marcenko-Pastur spectrum (cf. a set of results
           revolving around the Tracy-Widom distribution)
           
       return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.

       Returns
       -------
       E_clipped: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C underlying
           a noisy, in-sample estimate E (empirical correlation matrix
           estimated from X). This cleaned estimator proceeds through
           a simple eigenvalue clipping procedure (cf. reference below).
           
           If return_covariance=True, E_clipped corresponds to a cleaned 
           variance-covariance matrix.

       Reference
       ---------
       "Financial Applications of Random Matrix Theory: a short review",
       J.-P. Bouchaud and M. Potters
       arXiv: 0910.1205 [q-fin.ST]
    """

    if alpha is not None:
        assert isinstance(alpha, Real) and 0 <= alpha <= 1

    assert isinstance(return_covariance, bool)
    
    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T
        
    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_
    
    if return_covariance:
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T

    if alpha is None:
        (lambda_min, lambda_max), _ = marcenkoPastur(X)
        xi_clipped = np.where(eigvals >= lambda_max, eigvals, np.nan)
    else:
        xi_clipped = np.full(N, np.nan)
        threshold = int(ceil(alpha * N))
        if threshold > 0:
            xi_clipped[-threshold:] = eigvals[-threshold:]

    gamma = float(E.trace() - np.nansum(xi_clipped))
    gamma /= np.isnan(xi_clipped).sum()
    xi_clipped = np.where(np.isnan(xi_clipped), gamma, xi_clipped)

    E_clipped = np.zeros((N, N), dtype=float)
    for xi, eigvec in zip(xi_clipped, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_clipped += xi * eigvec.dot(eigvec.T)
        
    tmp = 1./np.sqrt(np.diag(E_clipped))
    E_clipped *= tmp
    E_clipped *= tmp.reshape(-1, 1)
    
    if return_covariance:
        std = 1./inverse_std
        E_clipped *= std
        E_clipped *= std.reshape(-1, 1)

    return E_clipped
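A minimal usage sketch (hedged): it assumes the module-level helpers referenced above (checkDesignMatrix, marcenkoPastur, StandardScaler) are importable as in the original module, and uses a purely random design matrix as a placeholder.

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(1000, 50)              # T=1000 samples, N=50 features
E_clipped = clipped(X_demo, alpha=None)   # keep eigenvalues above the Marcenko-Pastur edge
print(E_clipped.shape)                    # (50, 50), unit diagonal by construction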
 def __init__(self):
     super().__init__()
     self.estimator = EmpiricalCovariance()
def test_suffstat_sk_full():
    # compare the precision matrix compute from the
    # EmpiricalCovariance.covariance fitted on X*sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)
Example #44
0
    def correlation(self, estimator="maximum_likelihood", assume_centered=False):

        if estimator=="maximum_likelihood":
            correlation_measure = ConnectivityMeasure(kind="correlation", cov_estimator=EmpiricalCovariance(assume_centered=assume_centered))
        elif estimator=="ledoit_wolf":
            correlation_measure = ConnectivityMeasure(kind="correlation", cov_estimator=LedoitWolf(assume_centered=assume_centered))
        else:
            raise ValueError("Estimator should be 'maximum_likelihood' or 'ledoit_wolf'")

        R = np.nan_to_num(correlation_measure.fit_transform(self.ts))

        return R
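For reference, a standalone sketch of what the method above wraps: nilearn's ConnectivityMeasure with either the maximum-likelihood or the Ledoit-Wolf covariance estimator. `ts_list` is a hypothetical list of (n_samples, n_regions) time-series arrays standing in for self.ts.

import numpy as np
from nilearn.connectome import ConnectivityMeasure
from sklearn.covariance import LedoitWolf

measure = ConnectivityMeasure(kind="correlation",
                              cov_estimator=LedoitWolf(assume_centered=False))
# R = np.nan_to_num(measure.fit_transform(ts_list))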
Example #45
0
# Fixing the positive definiteness of the precision matrix
gam = 1
c = np.array([4,-1,0,0,-1,0,0,0,0,0,-1,0,0,0,-1])
c = c + gam/len(c)
C = circulant(c)
Q = np.dot(C.T,C) + gam*np.ones(len(c))    
mu = np.zeros((d,))
theta = sampler_circulant(a=np.reshape(Q[:,0],(len(c),1)),
                  M=1,
                  N=len(c),
                  mu=mu,
                  mode="precision",
                  size=10000)
mu_hat = np.mean(theta,axis=1)
Q_hat = EmpiricalCovariance().fit(theta.T).precision_
np.linalg.norm(mu-mu_hat)
np.linalg.norm(Q-Q_hat)/np.linalg.norm(Q)


mu = np.array([0., 1.])
Sigma = np.array([[ 1. , -0.9], [-0.9,  1]])
Q = np.linalg.inv(Sigma)
def matvec_fun(x):
    return Q.dot(x)
lam_l = 0
lam_u = np.max(np.sum(np.abs(Q),0))
tol = 1e-3
[theta,K] = sampler_squareRootApprox(mu,matvec_fun,lam_l,lam_u,tol,
                               K=100,mode="precision",
                               size=1,info=True)
Example #46
0
def optimalShrinkage(X, return_covariance=False, method='rie'):
    """This function computes a cleaned, optimal shrinkage, 
       rotationally-invariant estimator (RIE) of the true correlation 
       matrix C underlying the noisy, in-sample estimate 
       E = 1/T X * transpose(X)
       associated to a design matrix X of shape (T, N) (T measurements 
       and N features).

       One approach to getting a cleaned estimator that predates the
       optimal shrinkage, RIE estimator consists in inverting the 
       Marcenko-Pastur equation so as to replace the eigenvalues
       from the spectrum of E by an estimation of the true ones.

       This approach is known to be numerically-unstable, in addition
       to failing to account for the overlap between the sample eigenvectors
       and the true eigenvectors. How to compute such overlaps was first
       explained by Ledoit and Peche (cf. reference below). Their procedure
       was extended by Bun, Bouchaud and Potters, who also correct
       for a systematic downward bias in small eigenvalues.
       
       It is this debiased, optimal shrinkage, rotationally-invariant
       estimator that the function at hand implements.
       
       In addition to the above method, this function also provides access to:
       - The finite-N regularization of the optimal RIE for small eigenvalues
         as provided in section 8.1 of [3], a.k.a. the inverse Wishart (IW)
         regularization.
       - The direct kernel method of O. Ledoit and M. Wolf in their 2017 paper [4].
         This is a direct port of their Matlab code.
        
         
       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).
           
       return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.
       
       method: type string, optional (default="rie")
           - If "rie" : optimal shrinkage in the manner of Bun & al.
            with no regularisation  
           - If "iw" : optimal shrinkage in the manner of Bun & al.
            with the so called Inverse Wishart regularization
           - If 'kernel': Direct kernel method of Ledoit  Wolf.

       Returns
       -------
       E_RIE: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C. A sample
           estimator of C is the empirical covariance matrix E 
           estimated from X. E is corrupted by in-sample noise.
           E_RIE is the optimal shrinkage, rotationally-invariant estimator 
           (RIE) of C computed following the procedure of Joel Bun 
           and colleagues (cf. references below).
           
           If return_covariance=True, E_clipped corresponds to a cleaned
           variance-covariance matrix.

       References
       ----------
       1 "Eigenvectors of some large sample covariance matrix ensembles",
         O. Ledoit and S. Peche
         Probability Theory and Related Fields, Vol. 151 (1), pp 233-264
       2 "Rotational invariant estimator for general noisy matrices",
         J. Bun, R. Allez, J.-P. Bouchaud and M. Potters
         arXiv: 1502.06736 [cond-mat.stat-mech]
       3 "Cleaning large Correlation Matrices: tools from Random Matrix Theory",
         J. Bun, J.-P. Bouchaud and M. Potters
         arXiv: 1610.08104 [cond-mat.stat-mech]
       4 "Direct Nonlinear Shrinkage Estimation of Large-Dimensional Covariance Matrices (September 2017)", 
         O. Ledoit and M. Wolf https://ssrn.com/abstract=3047302 or http://dx.doi.org/10.2139/ssrn.3047302
    """
    
    assert isinstance(return_covariance, bool)

    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T
        
    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_
    
    if return_covariance:
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T

    q = N / float(T)
    lambda_N = eigvals[0]  # The smallest empirical eigenvalue,
                           # given that the function used to compute
                           # the spectrum of a Hermitian or symmetric
                           # matrix - namely np.linalg.eigh - returns
                           # the eigenvalues in ascending order.
    lambda_hats = None
    
    if method != 'kernel':
        use_inverse_wishart = (method == 'iw')
        xis = map(lambda x: xiHelper(x, q, E), eigvals)
        Gammas = map(lambda x: gammaHelper(x, q, N, lambda_N, inverse_wishart=use_inverse_wishart), eigvals)
        xi_hats = map(lambda a, b: a * b if b > 1 else a, xis, Gammas)
        lambda_hats = xi_hats
    else:
        lambda_hats = directKernel(q, T, N, eigvals)
        
    E_RIE = np.zeros((N, N), dtype=float)
    for lambda_hat, eigvec in zip(lambda_hats, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_RIE += lambda_hat * eigvec.dot(eigvec.T)

    tmp = 1./np.sqrt(np.diag(E_RIE))
    E_RIE *= tmp
    E_RIE *= tmp.reshape(-1, 1)
    
    if return_covariance:
        std = 1./inverse_std
        E_RIE *= std
        E_RIE *= std.reshape(-1, 1)

    return E_RIE
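A companion sketch for optimalShrinkage(), under the same assumptions as the clipped() example above (the module helpers xiHelper, gammaHelper, directKernel and checkDesignMatrix are available; the data is a random placeholder).

import numpy as np

rng = np.random.RandomState(1)
X_demo = rng.randn(1000, 50)
E_rie = optimalShrinkage(X_demo, method='iw')              # cleaned correlation matrix
Sigma_rie = optimalShrinkage(X_demo, method='iw',
                             return_covariance=True)       # cleaned covariance matrix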
Example #47
0
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError,
                  cov.error_norm, emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert(np.amin(mahal_dist) > 0)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_equal(cov.location_, np.zeros(X.shape[1]))
Example #48
0
def rand_pts_overall_cov_init(X,
                              n_components,
                              cov_est_method='LW',
                              covariance_type='full',
                              random_state=None):
    """
    Sets the means to randomly selected points. Sets the covariances to the overall covariance matrix.

    Parameters
    ----------
    X: (n_samples, n_features)

    n_components: int

    cov_est_method: str
        Must be one of ['empirical', 'LW', 'OAS'] for the empirical
        covariance matrix estimate, the Ledoit-Wolf estimator, and the
        Oracle Approximating Shrinkage estimator, respectively.
        See sklearn.covariance for details.

    covariance_type: str
        Must be one of ['full', 'diag', 'tied', 'spherical'].

    random_state: None, int, random seed
        Random seed.

    """
    assert cov_est_method in ['empirical', 'LW', 'OAS']
    assert covariance_type in ['full', 'diag', 'tied', 'spherical']
    n_samples = X.shape[0]

    # randomly select data points to start cluster centers from
    rng = check_random_state(random_state)

    # estimate global covariance
    if cov_est_method == 'empirical':
        cov_estimator = EmpiricalCovariance(store_precision=False)
    elif cov_est_method == 'LW':
        cov_estimator = LedoitWolf(store_precision=False)
    elif cov_est_method == 'OAS':
        cov_estimator = OAS(store_precision=False)
    cov_estimator.fit(X)
    cov_est = cov_estimator.covariance_

    # set covariance matrix for each cluster
    if covariance_type == 'tied':
        covs = cov_est

    elif covariance_type == 'full':
        covs = np.stack([cov_est for _ in range(n_components)])

    elif covariance_type == 'diag':
        # each component gets the diagonal of the estimated covariance matrix
        covs = np.diag(cov_est)
        covs = np.repeat(covs.reshape(1, -1), repeats=n_components, axis=0)

    elif covariance_type == 'spherical':
        # each component gets the average of the variances
        covs = np.diag(cov_est).mean()
        covs = np.repeat(covs, repeats=n_components)

    # set means to random data points
    rand_idxs = rng.choice(range(n_samples), replace=False, size=n_components)

    means = [X[pt_idx, ] for pt_idx in rand_idxs]
    means = np.array(means)

    return means, covs
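A quick usage sketch for the initializer above, on synthetic data. It assumes the example's own imports (check_random_state, LedoitWolf, OAS, EmpiricalCovariance) are already in scope; the shapes noted in the comments follow from covariance_type='full'.

import numpy as np

X_demo = np.random.RandomState(0).randn(200, 4)
means, covs = rand_pts_overall_cov_init(X_demo,
                                        n_components=3,
                                        cov_est_method='LW',
                                        covariance_type='full',
                                        random_state=0)
print(means.shape)   # (3, 4): one randomly chosen data point per component
print(covs.shape)    # (3, 4, 4): the shared Ledoit-Wolf estimate, repeated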
        inliers_mask[outliers_index] = False

        # fit a Minimum Covariance Determinant (MCD) robust estimator to data
        S = MinCovDet().fit(X)
        # compare raw robust estimates with the true location and covariance
        err_loc_mcd[i, j] = np.sum(S.location_ ** 2)
        err_cov_mcd[i, j] = S.error_norm(np.eye(n_features))
        # compare estimators learnt from the full data set with true parameters
        err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2)
        err_cov_emp_full[i, j] = EmpiricalCovariance().fit(X).error_norm(
            np.eye(n_features))
        # compare with an empirical covariance learnt from a pure data set
        # (i.e. "perfect" MCD)
        pure_X = X[inliers_mask]
        pure_location = pure_X.mean(0)
        pure_emp_cov = EmpiricalCovariance().fit(pure_X)
        err_loc_emp_pure[i, j] = np.sum(pure_location ** 2)
        err_cov_emp_pure[i, j] = pure_emp_cov.error_norm(np.eye(n_features))

# Display results
font_prop = matplotlib.font_manager.FontProperties(size=11)
pl.subplot(2, 1, 1)
pl.errorbar(range_n_outliers, err_loc_mcd.mean(1),
            yerr=err_loc_mcd.std(1) / np.sqrt(repeat),
            label="Robust location", color='m')
pl.errorbar(range_n_outliers, err_loc_emp_full.mean(1),
            yerr=err_loc_emp_full.std(1) / np.sqrt(repeat),
            label="Full data set mean", color='green')
pl.errorbar(range_n_outliers, err_loc_emp_pure.mean(1),
            yerr=err_loc_emp_pure.std(1) / np.sqrt(repeat),
            label="Pure data set mean", color='black')
Example #50
0
def main(out_data: str = 'chexpert'):
    models = wsl_model_dir.glob('*')
    # all_configs = []

    for idx, path in enumerate(models):
        if 'debug' in str(path):  # Debugging model
            continue
        elif not (path / 'configs.json').exists():  # Model not completed
            continue
        else:
            with open(path / 'configs.json') as f:
                configs = json.load(f)
                # print(configs)
        print(f'Model {idx} : {path}')

        # ------------------------------------------------------
        train_dataset = Loader(data=configs['data'],
                               split='train',
                               extension=configs['extension'],
                               classes=configs['classes'],
                               column=configs['column'],
                               regression=configs['regression'])
        train_loader = DataLoader(  # type: ignore
            train_dataset, batch_size=configs['batchsize'], num_workers=4,
            pin_memory=True, shuffle=True)

        valid_dataset = Loader(data=configs['data'],
                               split='valid',
                               extension=configs['extension'],
                               classes=configs['classes'],
                               column=configs['column'],
                               regression=configs['regression'])
        valid_loader = DataLoader(  # type: ignore
            valid_dataset, batch_size=configs['batchsize'], num_workers=4,
            pin_memory=True, shuffle=True)

        out_dataset = Loader(data=out_data,
                             split='valid',
                             extension=configs['extension'],
                             classes=configs['classes'],
                             column=configs['column'],
                             regression=configs['regression'])
        out_loader = DataLoader(  # type: ignore
            out_dataset, batch_size=configs['batchsize'], num_workers=4,
            pin_memory=True, shuffle=True)

        checkpoint = torch.load(path / 'best.pt', map_location='cuda:0' if torch.cuda.is_available() else 'cpu')
        checkpoint['model'] = checkpoint['model'].module
        checkpoint['model'].network = configs['network']
        checkpoint['model'].get_map = False
        checkpoint['model'].eval()
        # sigmoid = torch.nn.Sigmoid()
        group_lasso = EmpiricalCovariance(assume_centered=False)
        layer_names = {}

        # ------------------------------------------------------
        def get_mean_precision(loader):

            print('building hook function...')
            features = {}

            def hook(layer, inp, out):
                name = layer_names[layer]
                if name not in features:
                    features[name] = out.detach().data.view(out.size(0), out.size(1), -1).mean(dim=-1)
                else:
                    features[name] = torch.cat((features[name], out.detach().data.view(out.size(0), out.size(1), -1).mean(dim=-1)), dim=0)
            handles = checkpoint['model'].register_forward_hooks(checkpoint['model'], hook, layer_names)

            start = time.time()
            with torch.set_grad_enabled(False):
                for idx, data in enumerate(loader):
                    imgs = data[0].cuda().float()
                    _ = data[1]
                    _ = checkpoint['model'](imgs)
                    speed = configs['batchsize'] * idx // (time.time() - start)
                    print('Iter:', idx, 'Speed:', int(speed), 'img/s', end='\r', flush=True)
                    if idx > 20:
                        break
            print('Total time:', time.time() - start, 'secs')

            print('calculating sample mean...')
            mean = {}
            precision = {}
            for key, value in features.items():
                mean[key] = value.mean(dim=0)
                features[key] -= mean[key]
                group_lasso.fit(features[key].cpu().numpy())
                precision[key] = torch.from_numpy(group_lasso.precision_).float().cuda()

            for handle in handles:
                handle.remove()
            return mean, precision

        train_mean, train_precision = get_mean_precision(train_loader)

        # ------------------------------------------------------
        def get_mahalanobis_score(loader: Any, features: Any, magnitude: float):

            scores = {}
            gaussian = {}
            for layer, name in layer_names.items():
                checkpoint['optimizer'].zero_grad()

                def hook(layer, inp, out):
                    zero_feat = out.view(out.size(0), out.size(1), -1).mean(dim=-1) - train_mean[name]
                    gaussian[name] = -0.5 * torch.mm(torch.mm(zero_feat, train_precision[name]), zero_feat.t()).diag()

                handle = layer.register_forward_hook(hook)

                start = time.time()
                for idx, data in enumerate(loader):
                    with torch.set_grad_enabled(True):
                        imgs = data[0].cuda().float()
                        _ = data[1]
                        imgs.requires_grad = True
                        _ = checkpoint['model'](imgs)

                        loss = gaussian[name].mean()
                        loss.backward()
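                        # Backprop gives d(score)/d(input); its sign is taken
                        # below and used to perturb the images by `magnitude`
                        # (the input pre-processing step of Mahalanobis-based
                        # out-of-distribution scoring).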

                        gradient = torch.ge(imgs.grad.data, 0)
                        gradient = (gradient.float() - 0.5) * 2

                    with torch.set_grad_enabled(False):
                        noisy_imgs = torch.add(imgs.data, gradient, alpha=-magnitude)
                        _ = checkpoint['model'](noisy_imgs)
                        if name not in scores:
                            scores[name] = gaussian[name].detach().data
                        else:
                            scores[name] = torch.cat((scores[name], gaussian[name].detach().data), dim=0)
                        print(scores[name].mean())

                    checkpoint['optimizer'].zero_grad()
                    speed = configs['batchsize'] * idx // (time.time() - start)
                    print(name, 'Iter:', idx, 'Speed:', int(speed), 'img/s', end='\r', flush=True)

                handle.remove()
                print()
            return scores

        print('get mahalanobis scores...')
        magnitudes = [0.0, 0.01, 0.005, 0.002, 0.0014, 0.001, 0.0005]
        maha_valid_scores = {}
        maha_out_scores = {}
        for magnitude in magnitudes:
            print('Noise:', magnitude)
            print('Data - Assumed negative class:', configs['data'])
            maha_valid_scores[magnitude] = get_mahalanobis_score(valid_loader, layer_names, magnitude)
            print('Data - Assumed positive class:', out_data)
            maha_out_scores[magnitude] = get_mahalanobis_score(out_loader, layer_names, magnitude)
            print()

        print('merge mahalanobis scores...')
def test_suffstat_sk_full():
    # compare the EmpiricalCovariance.covariance fitted on X*sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)
Example #52
0
 def extract_vector(self):
     if self.level == 2:
         self.df_cifti_load = pd.DataFrame(
             self.fmri_data_np_arr.mean(axis=2))
     if type(self.seed_ROI_name) == list and len(self.seed_ROI_name) > 1:
         if self.seed_analysis_output == 'parcellated':
             self.df_cifti_load = pd.DataFrame(
                 self.parcellated_cifti_load.get_fdata())
             self.df_cifti_load.columns = self.parcel_labels
             self.df_cifti_load['avg'] = self.df_cifti_load[
                 self.seed_ROI_name].mean(axis=1)
             self.parcel_labels = self.df_cifti_load.columns.to_list()
         else:
             self.df_cifti_load = pd.DataFrame(self.cifti_load.get_fdata())
             df_parcellated_cifti_load = pd.DataFrame(
                 self.parcellated_cifti_load.get_fdata())
             df_parcellated_cifti_load.columns = self.parcel_labels
             self.df_cifti_load['avg'] = df_parcellated_cifti_load[
                 self.seed_ROI_name].mean(axis=1)
         self.seed_ROI_name = 'avg'
     else:
         if self.seed_analysis_output == 'dense':
             self.df_cifti_load = pd.DataFrame(self.cifti_load.get_fdata())
             df_parcellated_cifti_load = pd.DataFrame(
                 self.parcellated_cifti_load.get_fdata())
             df_parcellated_cifti_load.columns = self.parcel_labels
             self.df_cifti_load[
                 self.seed_ROI_name] = df_parcellated_cifti_load[
                     self.seed_ROI_name]
         else:
             self.df_cifti_load = pd.DataFrame(
                 self.parcellated_cifti_load.get_fdata())
     cifti_np_array = self.df_cifti_load.to_numpy()
     if self.method == 'correlation':
          # Pearson correlation coefficients with Ledoit-Wolf covariance estimator:
          # measure = ConnectivityMeasure(kind='correlation', cov_estimator=LedoitWolf())
          # Pearson correlation coefficients based on empirical covariance (i.e. standard)
         measure = ConnectivityMeasure(kind='correlation',
                                       cov_estimator=EmpiricalCovariance())
     elif self.method == 'covariance':
         #LedoitWolf estimator
         measure = ConnectivityMeasure(kind='covariance')
     elif self.method == 'partial_correlation':
         # Partial correlation with LedoitWolf covariance estimator
         measure = ConnectivityMeasure(kind='partial correlation')
     elif self.method == 'precision':
         measure = ConnectivityMeasure(kind='precision')
     elif 'sparse' in self.method:
         measure = GraphicalLassoCV()
     if 'sparse' in self.method:
         measure.fit(cifti_np_array)
         if 'covariance' in self.method:
             network_matrix = measure.covariance_
         elif 'precision' in self.method:
             network_matrix = measure.precision_
     else:
         network_matrix = measure.fit_transform([cifti_np_array])[0]
     df_network_matrix = pd.DataFrame(network_matrix)
     df_network_matrix.columns = self.parcel_labels
     if self.seed_ROI_name == 'avg':
         # take everything except last element, i.e. avg. Need to do this because downstream this object must match grayordinate_file
         self.r_functional_vector = df_network_matrix[
             self.seed_ROI_name][:-1].to_numpy()
     else:
         self.r_functional_vector = np.squeeze(
             df_network_matrix[self.seed_ROI_name].to_numpy())
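      # Fisher r-to-z transform: z = arctanh(r) = 0.5 * ln((1 + r) / (1 - r))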
     self.z_functional_vector = 0.5 * (
         np.log(1 + self.r_functional_vector) -
         np.log(1 - self.r_functional_vector))
Example #53
0
    def fit(self, X):
        '''
        Copula fit using a Gaussian copula with marginals evaluated by Gaussian KDE.
        The precision matrix is estimated with the specified method, defaulting to graphical LASSO.
        :param X: input dataset
        :return: estimated precision matrix rho
        '''

        N, d = X.shape
        if self.scaler is not None:
            X_scale = self.scaler.fit_transform(X)
        else:
            X_scale = X
        if len(self.vertexes) == 0:
            self.vertexes = [str(id) for id in range(d)]

        self.theta = 1.0 / N
        cum_marginals = np.zeros_like(X)
        inv_norm_cdf = np.zeros_like(X)
        # inv_norm_cdf_scaled = np.zeros_like(X)
        self.kernels = list([])
        # TODO: complexity O(Nd) is high
        if self.verbose:
            print(colored('>> Computing marginals', color='blue'))
        for j in range(cum_marginals.shape[1]):
            self.kernels.append(gaussian_kde(X_scale[:, j]))
            cum_pdf_overall = self.kernels[-1].integrate_box_1d(
                X_scale[:, j].min(), X_scale[:, j].max())
            for i in range(cum_marginals.shape[0]):
                cum_marginals[i, j] = self.kernels[-1].integrate_box_1d(
                    X_scale[:, j].min(), X_scale[i, j]) / cum_pdf_overall
                # truncate cumulative marginals
                if cum_marginals[i, j] < self.theta:
                    cum_marginals[i, j] = self.theta
                elif cum_marginals[i, j] > 1 - self.theta:
                    cum_marginals[i, j] = 1 - self.theta
                # inverse of normal CDF: \Phi(F_j(x))^{-1}
                inv_norm_cdf[i, j] = norm.ppf(cum_marginals[i, j])
                # scaled to preserve mean and variance: u_j + \sigma_j*\Phi(F_j(x))^{-1}
                # inv_norm_cdf_scaled[i, j] = X_scale[:, j].mean() + X_scale[:, j].std() * inv_norm_cdf[i, j]

        if self.method == 'mle':
            # maximum-likelihood estimator
            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            if self.verbose:
                print(colored('>> Running MLE to estimate precision matrix',
                              color='blue'))

            self.est_cov = empirical_cov.covariance_
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = inv(empirical_cov.covariance_)

        if self.method == 'glasso':
            if self.verbose:
                print(colored('>> Running glasso to estimate precision matrix',
                              color='blue'))

            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            # shrink the covariance to avoid numerical instability
            shrunk_cov = shrunk_covariance(empirical_cov.covariance_,
                                           shrinkage=0.8)
            self.est_cov, self.precision_ = graph_lasso(emp_cov=shrunk_cov,
                                                        alpha=self.penalty,
                                                        verbose=self.verbose,
                                                        max_iter=self.max_iter)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'ledoit_wolf':
            if self.verbose:
                print(colored(
                    '>> Running ledoit_wolf to estimate precision matrix',
                    color='blue'))

            self.est_cov, _ = ledoit_wolf(inv_norm_cdf)
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = linalg.inv(self.est_cov)

        if self.method == 'spectral':
            '''L2 method; see the paper "Inverse covariance estimation for
            high dimensional data in linear time and space", eq. (8).
            '''
            if self.verbose:
                print(colored(
                    '>> Running Riccati to estimate precision matrix',
                    color='blue'))

            # TODO: note estimated cov is sample cov
            self.est_cov, self.precision_ = spectral(inv_norm_cdf,
                                                     rho=2 * self.penalty,
                                                     assume_centered=False)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'pc':
            clf = pgmlearner.PGMLearner()
            data_list = list([])
            for row_id in range(X_scale.shape[0]):
                instance = dict()
                for i, n in enumerate(self.vertexes):
                    instance[n] = X_scale[row_id, i]
                data_list.append(instance)
            graph = clf.lg_constraint_estimatestruct(data=data_list,
                                                     pvalparam=self.pval,
                                                     bins=self.bins)
            dag = np.zeros(shape=(len(graph.V), len(graph.V)))
            for e in graph.E:
                dag[self.vertexes.index(e[0]), self.vertexes.index(e[1])] = 1
            self.conditional_independences_ = dag

        if self.method == 'ic':
            df = dict()
            variable_types = dict()
            for j in range(X_scale.shape[1]):
                df[self.vertexes[j]] = X_scale[:, j]
                variable_types[self.vertexes[j]] = 'c'
            data = pd.DataFrame(df)
            # run the search
            ic_algorithm = IC(RobustRegressionTest,
                              data,
                              variable_types,
                              alpha=self.pval)
            graph = ic_algorithm.search()
            dag = np.zeros(shape=(X_scale.shape[1], X_scale.shape[1]))
            for e in graph.edges(data=True):
                i = self.vertexes.index(e[0])
                j = self.vertexes.index(e[1])
                dag[i, j] = 1
                dag[j, i] = 1
                arrows = set(e[2]['arrows'])
                head_len = len(arrows)
                if head_len > 0:
                    head = arrows.pop()
                    if head_len == 1 and head == e[0]:
                        dag[i, j] = 0
                    if head_len == 1 and head == e[1]:
                        dag[j, i] = 0
            self.conditional_independences_ = dag

        # finally we fit the structure
        self.fit_structure(self.precision_)
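The core of the copula fit above is the marginal Gaussianization: each column is pushed through its KDE-based CDF (truncated at theta) and then through the inverse normal CDF, after which any covariance/precision estimator can be applied. The following is a compact, self-contained sketch of just that transform on made-up data, using only the MLE branch; the graphical-LASSO, spectral, and structure-learning branches are omitted.

import numpy as np
from scipy.stats import gaussian_kde, norm
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X_demo = rng.randn(300, 3) ** 3                 # deliberately non-Gaussian marginals
theta = 1.0 / X_demo.shape[0]                   # tail truncation level, as above

Z = np.zeros_like(X_demo)
for j in range(X_demo.shape[1]):
    kde = gaussian_kde(X_demo[:, j])
    lo, hi = X_demo[:, j].min(), X_demo[:, j].max()
    total = kde.integrate_box_1d(lo, hi)
    cdf = np.array([kde.integrate_box_1d(lo, x) for x in X_demo[:, j]]) / total
    cdf = np.clip(cdf, theta, 1 - theta)        # truncate the cumulative marginals
    Z[:, j] = norm.ppf(cdf)                     # inverse normal CDF

emp = EmpiricalCovariance().fit(Z)              # MLE estimate on the Gaussianized data
precision = np.linalg.inv(emp.covariance_)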
# ---------------------
#
# Below, we fit MCD and MLE based covariance estimators to our data and print
# the estimated covariance matrices. Note that the estimated variance of
# feature 2 is much higher with the MLE based estimator (7.5) than
# that of the MCD robust estimator (1.2). This shows that the MCD based
# robust estimator is much more resistant to the outlier samples, which were
# designed to have a much larger variance in feature 2.

import matplotlib.pyplot as plt
from sklearn.covariance import EmpiricalCovariance, MinCovDet

# fit a MCD robust estimator to data
robust_cov = MinCovDet().fit(X)
# fit a MLE estimator to data
emp_cov = EmpiricalCovariance().fit(X)
print('Estimated covariance matrix:\n'
      'MCD (Robust):\n{}\n'
      'MLE:\n{}'.format(robust_cov.covariance_, emp_cov.covariance_))

# %%
# To better visualize the difference, we plot contours of the
# Mahalanobis distances calculated by both methods. Notice that the robust
# MCD based Mahalanobis distances fit the inlier black points much better,
# whereas the MLE based distances are more influenced by the outlier
# red points.

fig, ax = plt.subplots(figsize=(10, 5))
# Plot data set
inlier_plot = ax.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
outlier_plot = ax.scatter(X[:, 0][-n_outliers:],
def _get_cov(X):
    # calculates cov matrix
    from sklearn.covariance import EmpiricalCovariance
    cov = EmpiricalCovariance().fit(X)
    return cov
Example #56
0
def main():
    parser = argparse.ArgumentParser(
        description='Plot outlier-like distances for a 2-dimensional dataset')
    parser.add_argument(
        'dataset', type=argparse.FileType('r'),
        help='a CSV file containing the dataset')
    parser.add_argument(
        '--plot', type=str, choices=['train', 'grid'], default='grid',
        help='plot the dataset or a grid evenly distributed over its span')
    parser.add_argument(
        '--plotdims', type=int, choices=[2, 3], default=2,
        help='the number of dimensions to plot')

    args = parser.parse_args()

    X = np.loadtxt(args.dataset, delimiter=',')
    fig = plt.figure()

    xformer = NullTransformer()

    if X.shape[1] > 2:
        xformer = PCA(n_components=2)
        X = xformer.fit_transform(X)

    if args.plotdims == 2:
        plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
    else:
        plt.scatter(X[:, 0], X[:, 1])
    plt.show(block=False)

    path_to_script = os.path.realpath(__file__)
    dir_of_script = os.path.dirname(path_to_script)
    dataset_path = dir_of_script + '/outliers.npy'
    np.save(dataset_path, X)
    
    ###########################################################################
    # Train autoencoder with the n samples until convergence.  Run
    # evenly distributed samples through the autoencoder and compute
    # their reconstruction error.
    ###########################################################################

    maxseq_orig = np.max(X)
    minseq_orig = np.min(X)
    seqrange = np.abs(maxseq_orig - minseq_orig)
    maxseq = maxseq_orig + 0.5 * seqrange
    minseq = minseq_orig - 0.5 * seqrange
    print("minseq", minseq, "maxseq", maxseq)
    if args.plot == 'grid':
        seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
        Xplot = np.array([_ for _ in product(seq, seq)])
    else:
        Xplot = X

    robust_cov = MinCovDet().fit(X)
    robust_md = robust_cov.mahalanobis(Xplot)

    empirical_cov = EmpiricalCovariance().fit(X)
    empirical_md = empirical_cov.mahalanobis(Xplot)

    # Embed in 2-D (via t-SNE) for plotting when the data has more than two features.
    if Xplot.shape[1] > 2:
        Xplot2d = bh_sne(Xplot)
    else:
        Xplot2d = Xplot

    robust_md01 = robust_md - np.nanmin(robust_md)
    robust_md01 = robust_md01 / np.nanmax(robust_md01)

    empirical_md01 = empirical_md - np.nanmin(empirical_md)
    empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], 
            cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
            cmap=plt.cm.jet, color=robust_md01)
        ax.set_zlabel('Mahalanobis distance')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (robust covariance)')

    fig = plt.figure()
    if args.plotdims == 2:
        ax = fig.add_subplot(1, 1, 1)
        ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], 
            cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0')
    else:
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
            cmap=plt.cm.jet, color=empirical_md01)
        ax.set_zlabel('Mahalanobis distance')

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Mahalanobis distance (empirical covariance)')
    
    enc_dec = [
        # tanh encoder, linear decoder
        ['tanh', 'linear'],
        # sigmoid encoder, linear decoder
        ['sigmoid', 'linear'],
        #######################################################################
        # The reconstruction error of the autoencoders trained with the
        # remaining commented-out pairs don't seem to match Mahalanobis
        # distance very well.  Feel free to uncomment them to see for
        # yourself.
        # linear encoder, linear decoder
        # ['linear', 'linear'],
        # tanh encoder, tanh decoder
        # ['tanh', 'tanh'],
        # tanh encoder, sigmoid decoder
        # ['tanh', 'sigmoid'],
        # sigmoid encoder, tanh decoder
        # ['sigmoid', 'tanh'],
        # sigmoid encoder, sigmoid decoder
        # ['sigmoid', 'sigmoid']
        #######################################################################
    ]
    
    for i, act in enumerate(enc_dec):
        enc, dec = act
        if dec == 'linear':
            dec = None
        model = train_autoencoder(dataset_path,
            act_enc=enc, act_dec=dec, nvis=X.shape[1], nhid=16)
        
        Xshared = theano.shared(
            np.asarray(Xplot, dtype=theano.config.floatX), borrow=True)
        f = theano.function([], outputs=model.reconstruct(Xshared))
        fit = f()
        error = reconstruction_error(Xplot, fit)

        error01 = error - np.nanmin(error)
        error01 = error01 / np.nanmax(error01)
        
        fig = plt.figure()
        if args.plotdims == 2:
            ax = fig.add_subplot(1, 1, 1)
            ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
                cmap=plt.cm.jet, c=error, s=60, linewidth='0')
        else:
            ax = fig.add_subplot(1, 1, 1, projection='3d')
            ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], error,
                cmap=plt.cm.jet, color=error01)
            ax.set_zlabel('Reconstruction error')

        ax.set_xlabel('x')
        ax.set_ylabel('y')
        encdec_type = ', '.join(act) 
        ax.set_title('Reconstruction error (' + encdec_type + ')')

        print("Correlation of robust MD and reconstruction error (" +
            str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
        print("Correlation of empirical MD and reconstruction error (" +
            str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))

    print("Correlation of robust MD and empirical MD " +
        str(pearsonr(robust_md, empirical_md)))

    os.remove(dataset_path)
    os.remove('outliers.pkl')

    plt.show(block=True)
Example #57
0
        bins=[i / 2 for i in range(0, 12, 1)],
        rwidth=1,
        color='#b1dbd0',
        edgecolor='black',
        align='left')

fig.tight_layout(pad=5.0)
xa.locator_params(axis='y', nbins=7)

for tick in ax.get_xticklabels():
    tick.set_fontname("Arial")
for tick in ax.get_yticklabels():
    tick.set_fontname("Arial")
plt.show()

A = EmpiricalCovariance().fit(np.array((x, dx))).covariance_
A = (A > 0) * A

A = A - np.diag(np.diag(A))
X = networkx.from_numpy_array(A)

F = networkx.Graph()
ps = networkx.spring_layout(X, scale=5, k=1 / len(A)**(1 / 40000))

labels = (z[:, 0].astype(str))
l = {i: labels[i] for i in range(len(labels))}

networkx.draw_networkx_nodes(F,
                             ps,
                             nodelist=X.nodes,
                             node_color='maroon',
n_features = 2

# generate data
# gen_cov = np.eye(n_features)
# gen_cov[0, 0] = 2.
# X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# # add some outliers
# outliers_cov = np.eye(n_features)
# outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
# X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = plt.figure()
plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = plt.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
outlier_plot = subfig1.scatter(X[:, 0][-n_outliers:], X[:, 1][-n_outliers:],
                               color='red', label='outliers')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
Example #59
0
def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    with pytest.raises(NotImplementedError):
        cov.error_norm(emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # Create X with 1 sample and 5 features
    X_1sample = np.arange(5).reshape(1, 5)
    cov = EmpiricalCovariance()
    warn_msg = (
        "Only one sample available. You may want to reshape your data array")
    with pytest.warns(UserWarning, match=warn_msg):
        cov.fit(X_1sample)

    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))