Example 1
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use the Mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self
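The same scoring can be reproduced without the detector wrapper. A minimal standalone sketch, assuming only NumPy and scikit-learn (the synthetic data and the 6-unit shift are illustrative):

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(42)
X = rng.randn(200, 3)
X[:5] += 6  # inject a few obvious outliers

det = MinCovDet(random_state=42).fit(X)
scores = det.dist_  # squared Mahalanobis distances to the robust location
print(scores[:5])   # the shifted rows receive much larger scores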
Example 2
def robust_mahalanobis_method(x=None, data=None):
    #Minimum covariance determinant method
    rng = np.random.RandomState(0)
    real_cov = np.cov(data.values.T)
    X = rng.multivariate_normal(mean=np.mean(data, axis=0),
                                cov=real_cov,
                                size=506)
    cov = MinCovDet(random_state=0).fit(X)
    mcd = cov.covariance_  # robust covariance matrix
    robust_mean = cov.location_  # robust mean
    inv_covmat = sp.linalg.inv(mcd)  # inverse covariance matrix

    #Calculate MD with minimum covariance determinant method
    x_minus_mu = x - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    md = mahal.diagonal()

    #Compare rMD with threshold and flag as outlier
    outlier = []
    C = chi2.ppf((1 - 0.001),
                 df=x.shape[1])  #degrees of freedom = number of variables
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
    return outlier, md
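A hedged usage sketch for the function above; the DataFrame shape and column names are illustrative (the hard-coded size=506 hints at the Boston housing data), and the imports mirror what the snippet assumes (np, sp, chi2, MinCovDet):

import numpy as np
import pandas as pd
import scipy as sp
import scipy.linalg  # ensure sp.linalg is available
from scipy.stats import chi2
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(1)
data = pd.DataFrame(rng.randn(506, 4), columns=['a', 'b', 'c', 'd'])
outliers, md = robust_mahalanobis_method(x=data.values, data=data)
print(len(outliers))  # indices whose squared distance exceeds the chi2 cutoff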
Example 3
def MCD_Score(train_a, test_a, test_b):
    mcd = MinCovDet()
    mcd.fit(train_a)
    mcd_anoscore = mcd.mahalanobis(test_a)
    mcd_normalscore = mcd.mahalanobis(test_b)
    print("mcd ano score {} mcd normal score {}".format(
        mcd_anoscore, mcd_normalscore))
Example 4
def outliers_finder(data_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Finding and removing outliers
    :param data_frame:
    :return:
    """
    (df_X, df_y) = splitting_dataset(data_frame)
    # Define the PCA object
    pca = PCA()

    # Run PCA on scaled data and obtain the scores array
    T = pca.fit_transform(StandardScaler().fit_transform(df_X.values))

    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(T[:, :5])

    # Get the Mahalanobis distance
    m = robust_cov.mahalanobis(T[:, :5])

    data_frame['mahalanobis'] = m

    # calculate the p-value for each Mahalanobis distance
    # (degrees of freedom should match the 5 components used in the MCD fit)
    data_frame['p'] = 1 - chi2.cdf(data_frame['mahalanobis'], 5)
    data_frame = data_frame.sort_values('p', ascending=False)
    data_frame['Drops'] = (data_frame['p'] <= 0.001)

    indexNames = data_frame[data_frame['Drops']].index
    print(indexNames.size)
    data_frame.drop(indexNames, inplace=True)

    return data_frame
Example 5
def robust_mahalanobis_method(df):
    #Minimum covariance determinant
    rng = np.random.RandomState(0)
    real_cov = np.cov(df.values.T)
    X = rng.multivariate_normal(mean=np.mean(df, axis=0),
                                cov=real_cov,
                                size=506)
    cov = MinCovDet(random_state=0).fit(X)
    mcd = cov.covariance_  # robust covariance matrix
    robust_mean = cov.location_  # robust mean
    inv_covmat = sp.linalg.inv(mcd)  # inverse covariance matrix

    #Robust M-Distance
    x_minus_mu = df - robust_mean
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    md = np.sqrt(mahal.diagonal())

    #Flag as outlier
    outlier = []
    C = np.sqrt(chi2.ppf(
        (1 - 0.001),
        df=df.shape[1]))  #degrees of freedom = number of variables
    for index, value in enumerate(md):
        if value > C:
            outlier.append(index)
    return outlier, md
Example 6
    def reject_outliers(self, nstd=10.):
        """ Update the list of inliers. """
        from sklearn.covariance import MinCovDet
        X = np.concatenate((self.qobs, self.qpred * self.wav[:, None]),
                           axis=-1)
        # dist_ holds squared Mahalanobis distances, hence the nstd**2 cutoff
        dist = MinCovDet().fit(X).dist_
        self.set_inliers(dist <= nstd**2.)
Example 7
def interaction_matrix(X,
                       interaction_type='causal',
                       prior_knowledge=None,
                       measure='pwling',
                       estimator='ML',
                       file_name=''):
    if (interaction_type == 'causal'):
        lingam = DirectLiNGAM(prior_knowledge=prior_knowledge,
                              measure=measure).fit(X)
        B = lingam.adjacency_matrix_
        C = np.zeros([X.shape[1], X.shape[1]])
        for d in range(1, X.shape[1]):
            C += np.linalg.matrix_power(B, d)
        return B, C
    elif (interaction_type == 'correlation'):
        return np.corrcoef(X.T) - np.eye(X.shape[1])
    elif (interaction_type == 'covariance'):
        if (estimator == 'ML'):
            est = EmpiricalCovariance(store_precision=True,
                                      assume_centered=False).fit(X)
        elif (estimator == 'MCD'):
            est = MinCovDet(store_precision=True,
                            assume_centered=False,
                            support_fraction=None).fit(X)
        cov = est.covariance_
        if (np.linalg.matrix_rank(cov) != X.shape[1]):
            cov += 1e-6 * np.eye(X.shape[1])
        l_, P_ = np.linalg.eig(np.linalg.inv(cov))
        l = np.diag(np.sqrt(l_))
        P = P_.T
        U = P.T.dot(l).T
        return cov, U
    elif (interaction_type == 'precomputed'):
        df = pd.read_csv(file_name)
        return df.values
Example 8
def compute_MCD_weft(weftsPickled, target_path):

    weft_points_list = floatPointList()
    for pickled_path in weftsPickled:
        with open(pickled_path, "rb") as f:
            weft_points_list.extend(pickle.load(f))

    x_vals = [fp.x for fp in weft_points_list]
    y_vals = [fp.y for fp in weft_points_list]

    mean_hor_dist = weft_points_list.getMedianWeftDist()

    min_x = min(x_vals) + 1.5 * mean_hor_dist
    max_x = max(x_vals) - 1.5 * mean_hor_dist
    min_y = min(y_vals) + 1.5 * mean_hor_dist
    max_y = max(y_vals) - 1.5 * mean_hor_dist

    inner_points = floatPointList()
    for pt in weft_points_list:
        if min_x < pt.x < max_x and min_y < pt.y < max_y:
            inner_points.append(pt)

    X = np.zeros([len(inner_points), 3])

    for idx, pt in enumerate(inner_points):
        X[idx, 0] = pt.area
        X[idx, 1] = pt.right_dist
        X[idx, 2] = pt.left_dist

    # keep only rows where all features are strictly positive
    Y = X[~(X <= 0).any(axis=1)]

    robust_cov = MinCovDet(support_fraction=0.8).fit(Y)
    pickle.dump(robust_cov, open(target_path, "wb"))
Example 9
    def _naiveMCD(self, dataset, thresh=3):

        types = LoLTypeInference().getDataTypes(dataset)
        qdataset = [[d[i] for i, t in enumerate(types) if t == 'numerical']
                    for d in dataset]

        X = featurize(qdataset, [t for t in types if t == 'numerical'])
        xshape = np.shape(X)

        # add small jitter to avoid conditioning problems with the estimate
        Xsamp = X + 0.01 * np.random.randn(xshape[0], xshape[1])

        m = MinCovDet()
        m.fit(Xsamp)
        sigma = np.linalg.inv(m.covariance_)
        mu = np.mean(X, axis=0)

        results = []
        for i in range(0, xshape[0]):
            # squared Mahalanobis distance of row i under the robust estimate
            diff = X[i, :] - mu
            val = float(diff.dot(sigma).dot(diff.T))
            results.append([str(val)])

        e = ErrorDetector(results,
                          modules=[QuantitativeErrorModule],
                          config=[{
                              'thresh': thresh
                          }])
        e.fit()

        return set([error['cell'][0] for error in e])
Example 10
def test_mcd_issue3367():
    # Check that MCD completes when the covariance matrix is singular,
    # i.e. one of the rows and columns is all zeros
    rand_gen = np.random.RandomState(0)

    # Think of these as the values for X and Y -> 10 values between -5 and 5
    data_values = np.linspace(-5, 5, 10).tolist()
    # Get the cartesian product of all possible coordinate pairs from above set
    data = np.array(list(itertools.product(data_values, data_values)))

    # Add a third column that's all zeros to make our data a set of point
    # within a plane, which means that the covariance matrix will be singular
    data = np.hstack((data, np.zeros((data.shape[0], 1))))

    # The below line of code should raise an exception if the covariance matrix
    # is singular. As a further test, since we have points in XYZ, the
    # principal components (Eigenvectors) of these directly relate to the
    # geometry of the points. Since it's a plane, we should be able to test
    # that the Eigenvector that corresponds to the smallest Eigenvalue is the
    # plane normal, specifically [0, 0, 1], since everything is in the XY plane
    # (as I've set it up above). To do this one would start by:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # After which we need to assert that our `normal` is equal to [0, 0, 1].
    # Do note that there is floating point error associated with this, so it's
    # best to subtract the two and then compare some small tolerance (e.g.
    # 1e-12).
    MinCovDet(random_state=rand_gen).fit(data)
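The eigenvector check sketched in the comment can be completed as below. This is an illustrative sketch rather than part of the original test; np.allclose with atol=1e-12 stands in for the suggested subtract-and-compare step:

import itertools
import numpy as np
from sklearn.covariance import MinCovDet

data_values = np.linspace(-5, 5, 10).tolist()
data = np.array(list(itertools.product(data_values, data_values)))
data = np.hstack((data, np.zeros((data.shape[0], 1))))

mcd_fit = MinCovDet(random_state=np.random.RandomState(0)).fit(data)
evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
normal = evecs[:, np.argmin(evals)]
# the eigenvector of the smallest eigenvalue should be the plane normal
assert np.allclose(np.abs(normal), [0.0, 0.0, 1.0], atol=1e-12)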
Example 11
def wmean(x, w=None, robust=False):
    '''Weighted mean 
    
    Calculate the mean of x using weights w.
    
    Args:
        x : array of values to be averaged
        w      : array of weights for each element of x; can be omitted if robust=True
        robust : (boolean) robust weights will be internally calculated using FastMCD;
                 only used if robust=True and w is empty
        
    Returns:
        scalar : weighted mean    
    '''
    if w is not None:
        assert len(w) == len(x), 'w must be the same length as x'

    # Use FastMCD to calculate weights; another method could be used here
    if robust and w is None:
        w = MinCovDet().fit(np.array([x, x]).T).support_

    if w is None or len(w) == 0:
        raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == len(x), 'w must be the same length as x'

    return np.sum(x * w) / np.sum(w)
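For the explicit-weights path, the return value matches NumPy's built-in weighted average, which can serve as a quick cross-check:

import numpy as np

x = np.array([1.0, 2.0, 3.0])
w = np.array([0.5, 0.3, 0.2])
assert np.isclose(np.sum(x * w) / np.sum(w), np.average(x, weights=w))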
Example 12
def wcov(x, y, w=None, ddof=1, robust=False):
    '''Weighted covariance 
    
    Calculate the covariance of x and y using weights w. If ddof=1 (default),
    then the result is the unbiased (sample) covariance when w=1.
    
    Implements weighted covariance as defined by NIST Dataplot (https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf)
    
    Args:
        x,y    : array of values 
        w      : array of weights for each element of x; can be omitted if robust=True
        ddof   : scalar differential degrees of freedom (Default ddof=1)
        robust : (boolean) robust weights will be internally calculated using FastMCD;
                 only used if robust=True and w is empty
        
    Returns:
        scalar : weighted covariance   
    '''
    n = len(x)
    assert len(y) == n, 'y must be the same length as x'

    # Use FastMCD to calculate weights; another method could be used here
    if robust and w is None:
        w = MinCovDet().fit(np.array([x, y]).T).support_

    if w is None or len(w) == 0:
        raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == n, 'w must be the same length as x and y'

    w = wscale(w)
    nw = np.count_nonzero(w)

    return np.sum( ( x - wmean(x,w) ) * ( y - wmean(y,w) ) * w ) / \
        ( np.sum(w) / nw * (nw - ddof) )
Example 13
def wcorr(x, y, w=None, robust=False):
    '''Weighted correlation coefficient
    
    Calculate the Pearson linear correlation coefficient of x and y using weights w. 
    This is derived from the weighted covariance and weighted variance.
    
    Args:
        x,y    : array of values 
        w      : array of weights for each element of x
        robust : (boolean) robust weights will be internally calculated using FastMCD;
                 only used if robust=True and w is empty
        
    Returns:
        scalar : weighted correlation coefficient
    '''

    n = len(x)
    assert len(y) == n, 'y must be the same length as x'

    # Use FastMCD to calculate weights; another method could be used here
    if robust and w is None:
        w = MinCovDet().fit(np.array([x, y]).T).support_

    if w is None or len(w) == 0:
        raise SystemExit('must specify weights w or select robust=True')
    assert len(w) == n, 'w must be the same length as x and y'
    w = wscale(w)
    return wcov(x, y, w) / np.sqrt(wvar(x, w) * wvar(y, w))
Example 14
def find_outliers_mahalanobis(featMatProjected,
                              extremeness=2.,
                              figsize=[8, 8],
                              saveto=None):
    """ A function to determine to return a list of outlier indices using the
        Mahalanobis distance. 
        Outlier threshold = std(Mahalanobis distance) * extremeness degree 
        [extreme_values=2, very_extreme_values=3 --> according to 68-95-99.7 rule]
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from pathlib import Path
    from sklearn.covariance import MinCovDet
    from matplotlib import pyplot as plt

    # NB: Euclidean distance puts more weight than it should on correlated variables.
    # Chicken-and-egg situation: we can't know which points are outliers until we
    # calculate the stats of the distribution, but those stats are skewed by outliers!
    # Mahalanobis gets around this by using a robust estimate of the covariance matrix.

    # Fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(
        featMatProjected[:, :10])  # Use the first 10 principal components

    # Get the Mahalanobis distance
    MahalanobisDist = robust_cov.mahalanobis(featMatProjected[:, :10])

    projectedTable = pd.DataFrame(featMatProjected[:,:10],\
                      columns=['PC' + str(n+1) for n in range(10)])

    plt.ioff() if saveto else plt.ion()
    plt.close('all')
    plt.style.use(CUSTOM_STYLE)
    sns.set_style('ticks')
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_facecolor('#F7FFFF')
    plt.scatter(np.array(projectedTable['PC1']),
                np.array(projectedTable['PC2']),
                c=MahalanobisDist)  # colour PCA by Mahalanobis distance
    plt.title('Mahalanobis Distance for Outlier Detection', fontsize=20)
    plt.colorbar()
    ax.grid(False)

    if saveto:
        saveto.parent.mkdir(exist_ok=True, parents=True)
        suffix = Path(saveto).suffix.strip('.')
        plt.savefig(saveto, format=suffix, dpi=300)
    else:
        plt.show()

    k = np.std(MahalanobisDist) * extremeness
    upper_t = np.mean(MahalanobisDist) + k
    outliers = []
    for i in range(len(MahalanobisDist)):
        if (MahalanobisDist[i] >= upper_t):
            outliers.append(i)
    print("Outliers found: %d" % len(outliers))

    return np.array(outliers)
Example 15
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use the Mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self
Example 16
    def obtenerOutliersMinCovarianza(self, datosOriginales, datosATestear):
        # NOTE: sklearn's MinCovDet has no predict(); an outlier-prediction
        # estimator such as EllipticEnvelope is needed for this call to work
        clf = MinCovDet().fit(datosOriginales)
        resultadoValoresATestear = clf.predict(datosATestear)

        listaOutliers, listaInliers = self.getListasOutliersInliers(
            resultadoValoresATestear, datosATestear)
        return listaOutliers, listaInliers
Example 17
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):
    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
                      (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert (error_location < tol_loc)
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert (error_cov < tol_cov)
    assert (np.sum(H) >= tol_support)
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
Example 18
def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)
Example 19
def _h_getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=np.zeros(0)):
    '''Calculate the Mahalanobis distance from the sample vector.'''
    if good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)

    try:
        dat2fit = dat[good_rows]
        assert not np.any(np.isnan(dat2fit))

        robust_cov = MinCovDet().fit(dat2fit)
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # this step will fail if the covariance matrix is singular. This happens if the data is not
        # a unimodal symmetric distribution, for example when there are too many small noisy
        # particles. Therefore I will take the safe option and return zeros for the
        # Mahalanobis distance if this is the case.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # critical value of the Mahalanobis distance using the chi-square distribution
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = chi2.ppf(1 - critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
Example 20
def test_mcd_increasing_det_warning():
    # Check that a warning is raised if we observe increasing determinants
    # during the c_step. In theory the sequence of determinants should be
    # decreasing. Increasing determinants are likely due to ill-conditioned
    # covariance matrices that result in poor precision matrices.

    X = [[5.1, 3.5, 1.4, 0.2],
         [4.9, 3.0, 1.4, 0.2],
         [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.1, 1.5, 0.2],
         [5.0, 3.6, 1.4, 0.2],
         [4.6, 3.4, 1.4, 0.3],
         [5.0, 3.4, 1.5, 0.2],
         [4.4, 2.9, 1.4, 0.2],
         [4.9, 3.1, 1.5, 0.1],
         [5.4, 3.7, 1.5, 0.2],
         [4.8, 3.4, 1.6, 0.2],
         [4.8, 3.0, 1.4, 0.1],
         [4.3, 3.0, 1.1, 0.1],
         [5.1, 3.5, 1.4, 0.3],
         [5.7, 3.8, 1.7, 0.3],
         [5.4, 3.4, 1.7, 0.2],
         [4.6, 3.6, 1.0, 0.2],
         [5.0, 3.0, 1.6, 0.2],
         [5.2, 3.5, 1.5, 0.2]]

    mcd = MinCovDet(random_state=1)
    warn_msg = "Determinant has increased"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        mcd.fit(X)
Example 21
    def __init__(self):
        # Config for loading no action spectrum (noise data)
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(rospack.get_path(
            'decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)
        # extract about 100 samples from no_action_data
        divide = max(1, len(no_action_data) // 100)
        no_action_data = no_action_data[::divide]
        # Detect whether an action is occurring via the Mahalanobis distance
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')

        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)
Example 22
def portfolio_covariance(r, method='normal'):
    if method == 'normal':
        r_cov = r.cov() * period_adjustment
    elif method == 'mcd':
        r_cov = MinCovDet(random_state=0).fit(r).covariance_ * period_adjustment
    elif method == 'mest':
        # NOTE: despite the name, this branch uses the non-robust empirical estimator
        r_cov = EmpiricalCovariance().fit(r).covariance_ * period_adjustment
    return r_cov
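`r` and `period_adjustment` come from the enclosing module and are not shown; a usage sketch under the assumption of daily returns annualized with a factor of 252 (the column names are hypothetical):

import numpy as np
import pandas as pd
from sklearn.covariance import MinCovDet, EmpiricalCovariance

period_adjustment = 252  # assumed: trading days per year for daily returns
rng = np.random.RandomState(0)
r = pd.DataFrame(rng.randn(500, 3) * 0.01,
                 columns=['asset_a', 'asset_b', 'asset_c'])

cov_normal = portfolio_covariance(r, method='normal')
cov_robust = portfolio_covariance(r, method='mcd')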
Example 23
def MCD_ano_score():
    print("Mahalanobis distance (per-class MCD) anomaly score")
    mcd = MinCovDet()
    mcd.fit(train_normal)
    # score anomalous and normal test sets against the model fit on normal data
    mcd_anoscore = mcd.mahalanobis(test_ano)
    mcd_normalscore = mcd.mahalanobis(test_normal)
    print("mcd ano score {} mcd normal score {}".format(
        mcd_anoscore, mcd_normalscore))
Example 24
def mahalanobis_calculate(data, num_pcs):
    pca = PCA(num_pcs)
    T = pca.fit_transform(data)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(T)
    # Get the Mahalanobis distance
    m = robust_cov.mahalanobis(T)
    return m
Example 25
def as7262_outliers(data, scatter_correction=None):
    data_columns = data[as7262_wavelengths]
    print(data_columns)
    # data_columns.T.plot()
    # plt.plot(data_columns.T)
    plt.show()
    if scatter_correction == "SNV":
        data_columns = processing.snv(data_columns)
    elif scatter_correction == "MSC":
        data_columns, _ = processing.msc(data_columns)

    # svm = OneClassSVM().fit_predict(snv_data)
    # print(svm)
    robust_cov = MinCovDet().fit(data_columns)
    mahal_dist = robust_cov.mahalanobis(data_columns)
    # mahal_dist = MahalanobisDist(np.array(data_columns), verbose=True)
    print(mahal_dist)


    zscore(data_columns)  # NOTE: return value is unused
    print('+++++')
    mean = np.mean(mahal_dist)
    std = 3*np.std(mahal_dist)
    print(mean, std)
    print(mean - std, mean + std)
    zscore_mahal = (mahal_dist - mean) / np.std(mahal_dist)
    # print(zscore_mahal)
    # print(zscore_mahal.max(), zscore_mahal.argmax(), data_columns.loc[zscore_mahal.argmax()])
    print('pppp')
    print(data_columns)
    print(zscore_mahal.argmax())
    # keep only the single most extreme point (the z-score > 3 set is unused)
    outliers = data_columns.iloc[zscore_mahal.argmax()].name
    # print(data_columns.loc[zscore_mahal > 3].index)
    rows = data_columns.loc[outliers]
    # print(data_columns.loc[zscore_mahal.argmax()].name)
    print(data_columns.shape)
    print(rows)

    # print((mahal_dist-mahal_dist.mean()).std())
    # print(mahal_dist.std())
    # print(mahal_dist.mean() + 3*mahal_dist.std())
    # mahal_dist2 = MahalanobisDist(np.array(data_columns), verbose=True)
    n, bins, _ = plt.hist(zscore_mahal, bins=40)
    plt.show()

    # x_hist = np.linspace(min(mahal_dist), max(mahal_dist), 100)
    #
    # popt, pcov = curve_fit(gauss_function, bins[:len(n)], n, maxfev=100000, p0=[300, 0, 20])
    # new_fit = gauss_function(x_hist, *popt)
    # plt.plot(x_hist, new_fit, 'r--')
    # color = data_columns.shape[0] * ["#000000"]
    # color[data_columns.loc[zscore_mahal.argmax()].name] = "#FF0000"
    plt.plot(data_columns.T, c="black")
    plt.plot(rows.T, c="red")
    plt.plot(data_columns.mean(), c="blue", lw=4)
    # snv_data.T.plot(color=color)
    plt.show()
Example 26
def detect(train_data: np.ndarray, test_data: np.ndarray) -> list:
    estimated_covariance = MinCovDet().fit(train_data)
    train_dist = estimated_covariance.mahalanobis(train_data)
    np_max = np.max(train_dist)

    # flag test points whose Mahalanobis distance exceeds the training maximum
    return [
        0 if data <= np_max else 1
        for data in estimated_covariance.mahalanobis(test_data)
    ]
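A minimal usage sketch, assuming NumPy and scikit-learn imports as above; the synthetic shift of +8 is illustrative:

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
train = rng.randn(300, 2)
test = np.vstack([rng.randn(10, 2), rng.randn(3, 2) + 8])  # last 3 rows anomalous
print(detect(train, test))  # expected: mostly 0s, then 1s for the shifted rows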
Example 27
    def __init__(self, cov_estimator=MinCovDet(), threshold=None):
        if not isinstance(cov_estimator, EmpiricalCovariance):
            raise TypeError(
                "cov_estimator must be an instance of "
                "sklearn.covariance.EmpiricalCovariance (or a subclass)"
            )

        self.cov_estimator = cov_estimator
        self.threshold = threshold

        self.attr_to_check = ["mahal_dist_"]
Example 28
def calc_robust_covariance_matrix(data_row,
                                  weights=None,
                                  centered=True,
                                  random_state=None):
    if weights is not None:
        data_row = inflate_data_using_weights(data_row, weights)

    C = MinCovDet(assume_centered=centered,
                  random_state=random_state).fit(data_row).covariance_
    return C
Example 29
    def __init__(self, method='robust', estimator_kw_args={}):
        if method == 'robust':
            self.covariance_estimator_ = MinCovDet(**estimator_kw_args)
        elif method == 'empirical':
            self.covariance_estimator_ = EmpiricalCovariance(
                **estimator_kw_args)
        else:
            raise ValueError(
                "{} is not a valid method. Must be one of 'robust' or 'empirical'"
                .format(method))
Example 30
def mahal_plot(e):
    first_half = e[1:len(e) - 1]
    second_half = e[2:len(e)]
    X = np.array([first_half, second_half])
    X = np.transpose(X)
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    robust_cov = MinCovDet().fit(X)

    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)

    fig = plt.figure()

    # Show data set
    subfig1 = plt.subplot(1, 1, 1)
    inlier_plot = subfig1.scatter(first_half,
                                  second_half,
                                  color='black',
                                  label='daily diff in homes passed')

    subfig1.set_title("Mahalanobis distances of the iid invariants:")

    # Show contours of the distance functions
    xx, yy = np.meshgrid(np.linspace(plt.xlim()[0],
                                     plt.xlim()[1], 800),
                         np.linspace(plt.ylim()[0],
                                     plt.ylim()[1], 100))

    zz = np.c_[xx.ravel(), yy.ravel()]

    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = subfig1.contour(xx,
                                      yy,
                                      np.sqrt(mahal_emp_cov),
                                      cmap=plt.cm.PuBu_r,
                                      linestyles='dashed')

    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = subfig1.contour(xx,
                                     yy,
                                     np.sqrt(mahal_robust_cov),
                                     cmap=plt.cm.YlOrBr_r,
                                     linewidths=3)

    subfig1.legend([
        emp_cov_contour.collections[1], robust_contour.collections[1],
        inlier_plot
    ], ['MLE dist', 'robust dist', 'kpis'],
                   loc="upper right",
                   borderaxespad=0)
    print(np.corrcoef(first_half, second_half))
    return (robust_cov, emp_cov)