Example 1
import numpy as np
from sklearn.covariance import MinCovDet


def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)
Example 2
from sklearn.covariance import MinCovDet


def MCD_Score(train_a, test_a, test_b):
    mcd = MinCovDet()
    mcd.fit(train_a)
    mcd_anoscore = mcd.mahalanobis(test_a)
    mcd_normalscore = mcd.mahalanobis(test_b)
    print("mcd ano score {} mcd normal score {}".format(
        mcd_anoscore, mcd_normalscore))
Example 3
import pytest
from sklearn.covariance import MinCovDet


def test_mcd_increasing_det_warning():
    # Check that a warning is raised if we observe increasing determinants
    # during the c_step. In theory the sequence of determinants should be
    # decreasing. Increasing determinants are likely due to ill-conditioned
    # covariance matrices that result in poor precision matrices.

    X = [[5.1, 3.5, 1.4, 0.2],
         [4.9, 3.0, 1.4, 0.2],
         [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.1, 1.5, 0.2],
         [5.0, 3.6, 1.4, 0.2],
         [4.6, 3.4, 1.4, 0.3],
         [5.0, 3.4, 1.5, 0.2],
         [4.4, 2.9, 1.4, 0.2],
         [4.9, 3.1, 1.5, 0.1],
         [5.4, 3.7, 1.5, 0.2],
         [4.8, 3.4, 1.6, 0.2],
         [4.8, 3.0, 1.4, 0.1],
         [4.3, 3.0, 1.1, 0.1],
         [5.1, 3.5, 1.4, 0.3],
         [5.7, 3.8, 1.7, 0.3],
         [5.4, 3.4, 1.7, 0.2],
         [4.6, 3.6, 1.0, 0.2],
         [5.0, 3.0, 1.6, 0.2],
         [5.2, 3.5, 1.5, 0.2]]

    mcd = MinCovDet(random_state=1)
    warn_msg = "Determinant has increased"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        mcd.fit(X)
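Outside a test suite, the same warning can be handled with the standard-library warnings machinery; a minimal sketch, assuming an X like the one in the test above:

import warnings

from sklearn.covariance import MinCovDet

# Escalate the increasing-determinant RuntimeWarning to an error so that
# ill-conditioned fits fail loudly instead of passing silently.
with warnings.catch_warnings():
    warnings.simplefilter("error", RuntimeWarning)
    try:
        MinCovDet(random_state=1).fit(X)
    except RuntimeWarning as w:
        print("ill-conditioned MCD fit: {}".format(w))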
Example 4
    def _naiveMCD(self, dataset, thresh=3):

        types = LoLTypeInference().getDataTypes(dataset)
        qdataset = [[d[i] for i, t in enumerate(types) if t == 'numerical']
                    for d in dataset]

        X = featurize(qdataset, [t for t in types if t == 'numerical'])
        xshape = np.shape(X)

        # add small noise to avoid conditioning problems with the estimate
        Xsamp = X + 0.01 * np.random.randn(xshape[0], xshape[1])

        m = MinCovDet()
        m.fit(Xsamp)
        sigma = np.linalg.inv(m.covariance_)
        mu = np.mean(X, axis=0)

        results = []
        for i in range(0, xshape[0]):
            # quadratic form (x - mu)^T Sigma^{-1} (x - mu), i.e. the squared
            # Mahalanobis distance of row i
            val = float((X[i, :] - mu) @ sigma @ (X[i, :] - mu))
            results.append([str(val)])

        e = ErrorDetector(results,
                          modules=[QuantitativeErrorModule],
                          config=[{
                              'thresh': thresh
                          }])
        e.fit()

        return set([error['cell'][0] for error in e])
Example 6
class ActionDetector(object):
    """
    Publish to a rostopic whether the robot is in action or not, by the MT method.

    NOTE
    Some waiting time is required before action detection can start;
    this is the preparation time needed to calculate the Mahalanobis distance.
    Reaction speed for action detection is somewhat slow because the spectrum
    is the mean of the spectrogram, not its right edge.
    """

    def __init__(self):
        # Config for loading no action spectrum (noise data)
        rospack = rospkg.RosPack()
        self.train_dir = osp.join(rospack.get_path(
            'decopin_hand'), 'train_data')
        if not osp.exists(self.train_dir):
            makedirs(self.train_dir)
        self.noise_data_path = osp.join(self.train_dir, 'noise.npy')
        if not osp.exists(self.noise_data_path):
            rospy.logerr('{} is not found. Exit.'.format(self.noise_data_path))
            exit()
        no_action_data = np.load(self.noise_data_path)
        # extract about 100 samples from no_action_data
        divide = max(1, len(no_action_data) // 100)  # integer step for slicing
        no_action_data = no_action_data[::divide]
        # Detect in action or not by mahalanobis distance
        self.anormal_threshold = rospy.get_param('~anormal_threshold')
        self.mcd = MinCovDet()
        self.mcd.fit(no_action_data)
        rospy.loginfo('Calc covariance matrix for Mahalanobis distance')

        # ROS
        self.bridge = CvBridge()
        self.pub = rospy.Publisher('~in_action', Bool, queue_size=1)
        self.sub = rospy.Subscriber('~raw_spectrogram', Image, self.cb)

    def cb(self, msg):
        """
        Main process of NoiseSaver class
        Publish whether the robot is in action or not
        """

        # spectrogram.shape is (height, width) = (spectrum, time)
        spectrogram = self.bridge.imgmsg_to_cv2(msg)
        self.current_spectrum = np.average(spectrogram, axis=1)
        # Check whether current spectrogram is in action or not
        spectrum = self.current_spectrum[None]
        dist = self.mcd.mahalanobis(spectrum)[0]
        info_message = '(mahalanobis distance, threshold) = ({}, {})'.format(
            dist, self.anormal_threshold)
        if dist < self.anormal_threshold:
            self.in_action = False
            rospy.loginfo('No action\n' + info_message + '\n')
        else:
            self.in_action = True
            rospy.loginfo('### In action ###\n' + info_message + '\n')
        pub_msg = Bool(data=self.in_action)
        self.pub.publish(pub_msg)
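Stripped of the ROS plumbing, the detection logic is just a threshold on the squared Mahalanobis distance of a time-averaged spectrum. A minimal standalone sketch of the same idea, with synthetic arrays standing in for noise.npy and the incoming spectrogram:

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
no_action_data = rng.normal(size=(100, 8))        # stand-in for noise.npy
mcd = MinCovDet().fit(no_action_data)

spectrogram = rng.normal(size=(8, 32))            # (spectrum, time)
spectrum = np.average(spectrogram, axis=1)[None]  # mean over time, shape (1, 8)
dist = mcd.mahalanobis(spectrum)[0]               # squared Mahalanobis distance
in_action = dist >= 1.0                           # threshold is application-specific
print(in_action, dist)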
Example 7
def MCD_ano_score():
    print("Mahalanobis distance (each MCD) ano score")
    mcd = MinCovDet()
    mcd.fit(train_normal)
    # score the anomalous split against the model fitted on normal data
    mcd_anoscore = mcd.mahalanobis(test_ano)
    mcd_normalscore = mcd.mahalanobis(test_normal)
    print("mcd ano score {} mcd normal score {}".format(
        mcd_anoscore, mcd_normalscore))
Example 8
    def leverage(self, X):
        mcd = MinCovDet()
        mcd.fit(X)
        loc, cov = mcd.location_, mcd.covariance_
        inversed_cov = np.linalg.inv(cov)
        result = np.zeros(X.shape[0])
        for i, element in enumerate(X):
            h = np.sqrt(
                np.transpose(element - loc) @ inversed_cov @ (element - loc))
            result[i] = h
        return result
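Since MinCovDet.mahalanobis already returns squared Mahalanobis distances to the fitted robust location, the loop above can be replaced by a vectorized one-liner; a sketch of the equivalent computation (equal up to numerical precision for a non-singular covariance):

import numpy as np
from sklearn.covariance import MinCovDet

def leverage_vectorized(X):
    mcd = MinCovDet().fit(X)
    # mahalanobis() returns squared distances, so take the square root
    return np.sqrt(mcd.mahalanobis(X))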
Example 9
def RejectOutliers(data, threshold=3):
    """
    Rejects nodal outliers based on :threshold: away from the mean based on the
    mahalanobis distance
    """
    from sklearn.covariance import MinCovDet
    clf = MinCovDet()
    clf.fit(data)
    distances = clf.mahalanobis(data)

    outliers = np.where(distances >= threshold)[0]
    inliers = np.where(distances < threshold)[0]
    return inliers, outliers
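A quick usage sketch on synthetic data with a few planted outliers. Since the distances are squared, the threshold below is a squared-distance value; in practice a chi-squared quantile is a more principled cutoff than a raw constant:

import numpy as np

rng = np.random.RandomState(0)
data = rng.normal(size=(100, 2))
data[:5] += 8                       # plant five gross outliers
inliers, outliers = RejectOutliers(data, threshold=9)  # squared-distance cutoff
print(len(inliers), len(outliers))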
Example 10
    def resif(self, X):
        r"""
        Computes the robust empirical influence function (RESIF). Chooses to
        use :math:`\Omega_2 = (\theta, \hat{\Sigma})` as the estimator of the
        central model.

        :param X: ndarray, shape(n_samples, n_features)
                Training data

        :return: ndarray, shape(n_samples,)
                RESIF of each sample
        """
        mcd = MinCovDet()
        mcd.fit(X=X)
        loc, cov = mcd.location_, mcd.covariance_
        inversed_cov = np.linalg.inv(cov)
        result = np.zeros(len(X))
        for i, element in enumerate(X):
            h = np.sqrt(
                np.transpose(element - loc) @ inversed_cov @ (element - loc))
            result[i] = h

        return result
Example 11
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import comb
from sklearn.covariance import MinCovDet


def robust_hurst(ts, lags=100, robust_cov=True, plot=False):

    minCovDet = MinCovDet(assume_centered=True)
    n = ts.shape[0]

    # calculate lagged variances
    var_lags = np.zeros(lags - 1)

    for lag in range(1, lags):
        lagged_series = ts[lag:] - ts[:-lag]

        if robust_cov:
            minCovDet.fit(lagged_series.reshape(-1, 1))
            # .item() extracts the scalar (np.asscalar was removed from NumPy)
            var_lags[lag - 1] = minCovDet.covariance_.item()
        else:
            var_lags[lag - 1] = np.dot(lagged_series,
                                       lagged_series) / (n - lag - 1)

    # calculate log-log slopes
    slopes = np.zeros(int(comb(lags - 2, 2)))
    cntr = 0
    for i in range(1, lags - 1):
        for j in range(i + 1, lags - 1):
            slopes[cntr] = np.log(
                var_lags[j] / var_lags[i]) / (2 * np.log(float(j) / i))
            cntr += 1

    H_est = np.median(slopes)

    # plot
    if plot:
        plt.figure()
        plt.hist(slopes)

        plt.figure()
        plt.plot(np.log(range(1, lags)), np.log(var_lags))
        plt.plot(np.log(range(1, lags)), np.log(range(1, lags)) * H_est)

    return H_est
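As a sanity check, ordinary Brownian motion has a true Hurst exponent of 0.5, so the estimator should land near that value; a usage sketch on a toy random walk:

import numpy as np

rng = np.random.RandomState(0)
ts = np.cumsum(rng.normal(size=2000))   # random walk, true H = 0.5
print(robust_hurst(ts, lags=50))        # expect a value near 0.5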
Example 12
import numpy as np
from scipy.spatial.distance import mahalanobis
from sklearn.covariance import EmpiricalCovariance, MinCovDet


class myMahalanobisDistance():
    def __init__(self, estimator='ML', tol=1e-6):
        if (estimator == 'ML'):
            self.estimator_ = EmpiricalCovariance(store_precision=True,
                                                  assume_centered=False)
        elif (estimator == 'MCD'):
            self.estimator_ = MinCovDet(store_precision=True,
                                        assume_centered=False,
                                        support_fraction=None,
                                        random_state=0)
        else:
            self.estimator_ = None
        self.tol_ = tol

    def fit(self, X_tr):
        self.D_ = X_tr.shape[1]
        if (self.estimator_ is None):
            self.cov_ = np.cov(X_tr.T)
            if (np.linalg.matrix_rank(self.cov_) != self.D_):
                self.cov_ += self.tol_ * np.eye(self.D_)
        else:
            self.estimator_ = self.estimator_.fit(X_tr)
            self.cov_ = self.estimator_.covariance_
            if (np.linalg.matrix_rank(self.cov_) != self.D_):
                self.cov_ += self.tol_ * np.eye(self.D_)
            # self.inv_ = self.estimator_.precision_
        self.inv_ = np.linalg.inv(self.cov_)
        self = self.__setEig()
        return self

    def __setEig(self):
        # eigh suits the symmetric precision matrix and guarantees real
        # eigenvalues/eigenvectors (eig can return complex values here)
        self.lams_, self.U_ = np.linalg.eigh(self.inv_)
        self.U_ = self.U_.T
        self.Lam_ = np.diag(np.sqrt(self.lams_))
        self.L_ = self.U_.T.dot(self.Lam_).T
        # self.M_ = self.U_.T.dot(self.Lam_, self.U_)
        return self

    def mahalanobis_dist(self, x, y, p=2):
        if (p == 1):
            return np.linalg.norm(self.L_.dot(x - y), ord=1)
        else:
            return mahalanobis(x, y, self.inv_)
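A brief usage sketch contrasting the ML and MCD estimators on contaminated toy data (illustrative only; the contamination inflates the ML covariance, so the MCD-based distance differs):

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
X[:10] += 10                        # contamination inflates the ML covariance

for est in ('ML', 'MCD'):
    md = myMahalanobisDistance(estimator=est).fit(X)
    print(est, md.mahalanobis_dist(X[0], X[1]))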
Example 13
class OutlierMahalanobis(TransformerMixin):

    def __init__(self, support_fraction=0.95, verbose=False, chi2_percentile=0.995, qqplot=True):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision = True, support_fraction = support_fraction)
        self.chi2_percentile = chi2_percentile
        self.qqplot=qqplot

    def get_params(self):
        return {"support_fraction": self.support_fraction, "chi2_percentile": self.chi2_percentile}

    def set_params(self, **parameters):
        for key, value in parameters.items():
            setattr(self, key, value)
        return self

    def fit(self, X, y=None):
        """Prints some summary stats (if verbose is on) and flags the observations it considers extreme"""
        self.mcd.fit(X)
        d = np.array([distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_ ) for p in X])
        self.d2 = d**2 #MD squared
        n, self.degrees_of_freedom_ = X.shape
        self.iextreme_values = (self.d2 > self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_) )
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, "%(self.iextreme_values.sum()/float(n), self.chi2_percentile))
            print("with support fraction %.2f."%self.support_fraction)
            pvalue=stats.kstest(self.d2, lambda x : stats.chi2.cdf(x,df=self.degrees_of_freedom_))[1]
            if pvalue <= 0.01:
                print('Warning: very strong evidence against the null hypothesis, p-value: '+str(pvalue))
            elif pvalue <= 0.05:
                print('Warning: strong evidence against the null hypothesis, p-value: '+str(pvalue))
            elif pvalue <= 0.1:
                print('Weak evidence against the null hypothesis, p-value: '+str(pvalue))
            else:
                print('No evidence against the null hypothesis. p-value: '+str(pvalue))
            if self.qqplot==True :
                plt.figure(figsize=(10,10))
                stats.probplot(self.d2,dist=stats.chi2(df=self.degrees_of_freedom_), plot=plt)
                plt.title('QQ plot between Mahalanobis distance quantiles and Chi2 quantiles')
                plt.show()

        return self

    def transform(self,X):

        return X[~self.iextreme_values]

    def plot(self,log=False, sort = False ):
        """
        Cause plotting is always fun.

        log: transform the distance-sq to a log ( distance-sq )
        sort: sort the data according to distnace before plotting
        ifollow: a set if indices to mark with yellow, useful for seeing where data lies across views.

        """
        n = self.d2.shape[0]
        fig = plt.figure(figsize=(10,10))

        x = np.arange( n )
        ax = fig.add_subplot(111)


        transform = (lambda x: x ) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)

        chi_line = transform( chi_line )
        d2 = transform( self.d2 )
        if sort:
            isort = np.argsort( d2 )
            ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' )
            plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" )


        else:
            ax.scatter(x, d2 )
            extreme_values = d2[ self.iextreme_values ]
            ax.scatter( x[self.iextreme_values], extreme_values, color="r" )

        ax.hlines( chi_line, 0, n,
                        label ="%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" )

        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])


        plt.show()
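Usage sketch: fit on contaminated data, then transform to drop the flagged rows. This assumes the class's imports are in place (numpy, scipy.stats as stats, matplotlib.pyplot as plt, sklearn's TransformerMixin, scipy.spatial.distance, and MinCovDet imported under the alias MCD):

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 3))
X[:10] += 6                                    # plant outliers

detector = OutlierMahalanobis(support_fraction=0.95, verbose=True, qqplot=False)
X_clean = detector.fit(X).transform(X)         # rows above the chi2 cutoff removed
print(X.shape, '->', X_clean.shape)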
Example 14
class Outlier_detection(object):

    def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision = True, support_fraction = support_fraction)
        self.chi2_percentile = chi2_percentile
        
    def fit(self, X):
        """Prints some summary stats (if verbose is on) and flags the observations it considers extreme"""
        self.mcd.fit(X)
        mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_)
        d = np.array(list(map(mahalanobis, X)))  # Mahalanobis distance values
        self.d2 = d ** 2  # MD squared
        n, self.degrees_of_freedom_ = X.shape
        self.iextreme_values = (self.d2 > self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_))
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, " % (self.iextreme_values.sum() / float(n), self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self,log=False, sort = False ):
        """
        Cause plotting is always fun.
        
        log: transform the distance-sq to a log ( distance-sq )
        sort: sort the data according to distnace before plotting
        ifollow: a set if indices to mark with yellow, useful for seeing where data lies across views.
        
        """
        n = self.d2.shape[0]
        fig = plt.figure()
        
        x = np.arange( n )
        ax = fig.add_subplot(111)
 
 
        transform = (lambda x: x ) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)     
        
        chi_line = transform( chi_line )
        d2 = transform( self.d2 )
        if sort:
            isort = np.argsort( d2 )    
            ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' )
            plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" )
            
        
        else:
            ax.scatter(x, d2 )
            extreme_values = d2[ self.iextreme_values ]
            ax.scatter( x[self.iextreme_values], extreme_values, color="r" )
            
        ax.hlines( chi_line, 0, n, 
                        label ="%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" )

        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])


        plt.show()
Example 15
class Outlier_detection(object):
    def __init__(self,
                 support_fraction=0.95,
                 verbose=True,
                 chi2_percentile=0.995):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision=True, support_fraction=support_fraction)
        self.chi2_percentile = chi2_percentile

    def fit(self, X):
        """Prints some summary stats (if verbose is on) and flags the observations it considers extreme"""
        self.mcd.fit(X)
        mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_,
                                                     self.mcd.precision_)
        d = np.array(list(map(mahalanobis, X)))  # Mahalanobis distance values
        self.d2 = d**2  # MD squared
        n, self.degrees_of_freedom_ = X.shape
        self.iextreme_values = (self.d2 > self.chi2.ppf(
            self.chi2_percentile, self.degrees_of_freedom_))
        if self.verbose:
            print("%.3f proportion of outliers at %.3f%% chi2 percentile, " % (
                self.iextreme_values.sum() / float(n), self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self, log=False, sort=False):
        """
        Cause plotting is always fun.
        
        log: transform the distance-sq to a log ( distance-sq )
        sort: sort the data according to distnace before plotting
        ifollow: a set if indices to mark with yellow, useful for seeing where data lies across views.
        
        """
        n = self.d2.shape[0]
        fig = plt.figure()

        x = np.arange(n)
        ax = fig.add_subplot(111)

        transform = (lambda x: x) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile,
                                 self.degrees_of_freedom_)

        chi_line = transform(chi_line)
        d2 = transform(self.d2)
        if sort:
            isort = np.argsort(d2)
            ax.scatter(x, d2[isort], alpha=0.7, facecolors='none')
            plt.plot(x,
                     transform(
                         self.chi2.ppf(np.linspace(0, 1, n),
                                       self.degrees_of_freedom_)),
                     c="r",
                     label="distribution assuming normal")

        else:
            ax.scatter(x, d2)
            extreme_values = d2[self.iextreme_values]
            ax.scatter(x[self.iextreme_values], extreme_values, color="r")

        ax.hlines(chi_line,
                  0,
                  n,
                  label="%.1f%% $\chi^2$ quantile" %
                  (100 * self.chi2_percentile),
                  linestyles="dotted")

        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])

        plt.show()
Example 16
import numpy as np
import pytest
from sklearn.covariance import MinCovDet


def test_mcd_class_on_invalid_input():
    X = np.arange(100)
    mcd = MinCovDet()
    msg = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=msg):
        mcd.fit(X)
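The fix on the caller's side is the reshape that the scikit-learn error message suggests; a minimal sketch:

import numpy as np
from sklearn.covariance import MinCovDet

X = np.arange(100)
MinCovDet().fit(X.reshape(-1, 1))   # one feature per sample: shape (100, 1)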
Example 17
def remove_drugs_with_low_effect_multivariate(
        feat, meta, signif_level=0.05,
        cov_estimator = 'EmpiricalCov',
        drugname_column = 'drug_type',
        dose_column = 'drug_dose',
        keep_names = ['DMSO', 'NoCompound'],
        return_nonsignificant = False
        ):
    """
    Remove drugs when all the doses of the drug are very close to DMSO.
    Whether a dose is very close to DMSO is checked using the Mahalanobis
    distance (MD) calculated based on the robust covariance estimate of the
    DMSO observations and assuming that the MD^2 of DMSO points follow a chi2
    distribution with n_feat degrees of freedom.
    param:
        feat : dataframe
            feature dataframe
        meta : dataframe
            dataframe with sample identification data
        signif_level : float
            Defines the significance level for the p-value of the hypothesis
            test for each drug dose based on the MD^2 distribution.
        cov_estimator : 'RobustCov' or 'EmpiricalCov'
            Specifies the method to estimate the covariance matrix.
        return_nonsignificant : bool, optional
            return the names of the drugs that are removed from the
            dataset
    return:
        feat : dataframe
            feature dataframe with low-potency drugs removed
        meta : dataframe
            metadata dataframe with low-potency drugs removed
    """
    from sklearn.covariance import MinCovDet, EmpiricalCovariance
    from scipy.stats import chi2
    from time import time

    if cov_estimator == 'RobustCov':
        estimator = MinCovDet()
    elif cov_estimator == 'EmpiricalCov':
        estimator = EmpiricalCovariance()

    print('Estimating covariance matrix...'); st_time=time()
    estimator.fit(feat[meta[drugname_column].isin(['DMSO'])])
    print('Done in {:.2f}.'.format(time()-st_time))

    drug_names = meta[drugname_column].unique()

    mah_dist = {}
    signif_effect_drugs = []
    for idr, drug in enumerate(drug_names):
        if drug in keep_names:
            continue

        print('Checking compound {} ({}/{})...'.format(
            drug, idr+1, drug_names.shape[0]))

        X = feat[meta[drugname_column].isin([drug])]
        X.insert(0, 'dose',
                 meta.loc[meta[drugname_column].isin([drug]), dose_column])

        X = X.groupby(by='dose').mean()

        md2 = estimator.mahalanobis(X)
        mah_dist[drug] = md2

        nft = feat.shape[1]

        # Compute the P-Values
        p_vals = 1 - chi2.cdf(md2, nft)

        # Extreme values with a significance level of p_value
        if any(p_vals < signif_level):
            signif_effect_drugs.append(drug)

    signif_effect_drugs.extend(keep_names)

    feat = feat[meta[drugname_column].isin(signif_effect_drugs)]
    meta = meta[meta[drugname_column].isin(signif_effect_drugs)]

    if return_nonsignificant:
        return feat, meta, list(
            set(drug_names).difference(set(signif_effect_drugs))
            ), mah_dist
    else:
        return feat, meta
Example 18
fig = plt.figure()
fig.suptitle(
    'Parallel Coordinates Plot of Potential Outliers in Wisconsin Breast Cancer Data'
)
parallel_coordinates(d.iloc[possible_outliers, :],
                     class_column='diagnosis',
                     cols=d.columns[3:],
                     color=('#0158FE', '#FE0101'))
plt.show()

#-------------------------------------------------------------------------------------------------#
#----------------------------------------Robust Covariance----------------------------------------#
#-------------------------------------------------------------------------------------------------#

robust_cov = MinCovDet(assume_centered=False, random_state=14)
robust_cov.fit(d.iloc[:, 3:12])

#View covariance matrix before and after reweighting
sns.heatmap(robust_cov.raw_covariance_, annot=True)
plt.title('Raw Covariance Matrix')
plt.show()
sns.heatmap(robust_cov.covariance_, annot=True)
plt.title('Robust Covariance Matrix')
plt.show()

#View the Mahalanobis distances on the PCA plot
pca_model = PCA(n_components=None, whiten=False, random_state=14)
pca_dim = pca_model.fit_transform(d.iloc[:, 3:12])

plt.figure(figsize=(10, 5))
plt.xlabel('Latent Variable 1 (explains most variance)')
Example 19
A = matrix(np.ones((1, N)))
b = matrix(1.0)

results_df = pd.DataFrame()

for train_window in range(150, 1000 + 1, 10):
    for train_data_start in range(0, returns.shape[0] - train_window + 1, 50):
        train_data_end = train_data_start + train_window

        R_train = np.asarray(returns.iloc[train_data_start:train_data_end, :])

        n = R_train.shape[0]
        N = R_train.shape[1]

        mcd = MinCovDet()
        mcd.fit(R_train)
        S = mcd.covariance_
        """ Markowitz long-only """
        Q = matrix(2 * S)

        sol = qp(Q, p, G, h, A, b)
        w_opt_M = np.reshape(sol["x"], N)
        """ Calculate statistics """
        # Train
        cum_returns_train = np.asarray(
            (prices.iloc[train_data_start:train_data_end, :] -
             prices.iloc[train_data_start, :])[1:])
        train_returns = np.dot(cum_returns_train, w_opt_M)

        H_train = robust_hurst(train_returns)
        lr.fit(
Example 20
lm2 = ols('word_diff ~ Age + C(Centre_ID)',
         data=clean_st,subset=subset).fit()

print(lm2.summary())

# <markdowncell>

# # Snippets. Might come back to this later:

# <codecell>

from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet

# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[:,['norm_diff','Interview_Suggested_Ranking_numerical_']]
clean = clean.dropna(axis=0)

# calculate robust covariance estimate, calculate what's too far away
mcd = MinCovDet()
mcd.fit(clean)

pearsonr(clean.iloc[:,0],clean.iloc[:,1])

# <codecell>

d = mcd.mahalanobis(clean)
d.sort()
d

Example 21
from sklearn.covariance import MinCovDet
from sklearn.utils import check_array


class MCD():
    """
    Minimum Covariance Determinant (MCD) based anomaly detection relies on
    Mahalanobis-type distances in which the shape matrix is derived from a
    consistent, high-breakdown, robust multivariate location and scale
    estimator; these distances can be used to find anomalous points.
    The Minimum Covariance Determinant covariance estimator is to be applied
    on Gaussian-distributed data, but could still be relevant on data drawn
    from a unimodal, symmetric distribution.

    Parameters
    ----------
    store_precision : bool
        Specify if the estimated precision is stored.
    assume_centered : bool
        If True, the support of the robust location and the covariance
        estimates is computed, and a covariance estimate is recomputed from
        it, without centering the data.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.
    support_fraction : float, 0 < support_fraction < 1
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is None, which implies that the minimum
        value of support_fraction will be used within the algorithm:
        [n_sample + n_features + 1] / 2
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """

    def __init__(self, store_precision=True, assume_centered=False,
                 support_fraction=None, random_state=None):
        self.store_precision = store_precision
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.random_state = random_state

    def fit(self, X):
        """Fit detector.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        """
        self.X_train = check_array(X)
        self.mcd = MinCovDet(store_precision=self.store_precision,
                             assume_centered=self.assume_centered,
                             support_fraction=self.support_fraction,
                             random_state=self.random_state)
        self.mcd.fit(self.X_train)
        return self

    def decision(self, X):
        """Predict the anomaly score of each element.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        ll : array, shape (n_samples,)
            Mahalanobis distance of each sample under the current model,
            which is the anomaly score of each element.
        """
        return self.mcd.mahalanobis(X)
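Usage sketch for the wrapper above (toy data; decision returns squared Mahalanobis distances, so larger means more anomalous):

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.normal(size=(200, 2))
X_test = np.vstack([rng.normal(size=(5, 2)), [[6.0, 6.0]]])

det = MCD()
det.fit(X_train)
print(det.decision(X_test))         # the last score should dominate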
Example 22
class MCD(BaseDetector):
    """Detecting outliers in a Gaussian distributed dataset using
    Minimum Covariance Determinant (MCD): robust estimator of covariance.

    The Minimum Covariance Determinant covariance estimator is to be applied
    on Gaussian-distributed data, but could still be relevant on data
    drawn from a unimodal, symmetric distribution. It is not meant to be used
    with multi-modal data (the algorithm used to fit a MinCovDet object is
    likely to fail in such a case).
    One should consider projection pursuit methods to deal with multi-modal
    datasets.

    First fit a minimum covariance determinant model and then compute the
    Mahalanobis distance as the outlier degree of the data

    See :cite:`rousseeuw1999fast,hardin2004outlier` for details.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    store_precision : bool
        Specify if the estimated precision is stored.

    assume_centered : Boolean
        If True, the support of the robust location and the covariance
        estimates is computed, and a covariance estimate is recomputed from
        it, without centering the data.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, 0 < support_fraction < 1
        The proportion of points to be included in the support of the raw
        MCD estimate. Default is None, which implies that the minimum
        value of support_fraction will be used within the algorithm:
        [n_sample + n_features + 1] / 2

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    raw_location_ : array-like, shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : array-like, shape (n_features, n_features)
        The raw robust estimated covariance before correction and re-weighting.

    raw_support_ : array-like, shape (n_samples,)
        A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

    location_ : array-like, shape (n_features,)
        Estimated robust location

    covariance_ : array-like, shape (n_features, n_features)
        Estimated robust covariance matrix

    precision_ : array-like, shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : array-like, shape (n_samples,)
        A mask of the observations that have been used to compute
        the robust estimates of location and shape.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted. Mahalanobis distances of the training set (on which
        :meth:`fit` is called) observations.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, store_precision=True,
                 assume_centered=False, support_fraction=None,
                 random_state=None):
        super(MCD, self).__init__(contamination=contamination)
        self.store_precision = store_precision
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.random_state = random_state

    # noinspection PyIncorrectDocstring
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        # Compute mahalanobis distance of the samples
        return self.detector_.mahalanobis(X)

    @property
    def raw_location_(self):
        """The raw robust estimated location before correction and
        re-weighting.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.raw_location_

    @property
    def raw_covariance_(self):
        """The raw robust estimated location before correction and
        re-weighting.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.raw_covariance_

    @property
    def raw_support_(self):
        """A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.raw_support_

    @property
    def location_(self):
        """Estimated robust location.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.location_

    @property
    def covariance_(self):
        """Estimated robust covariance matrix.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.covariance_

    @property
    def precision_(self):
        """ Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.precision_

    @property
    def support_(self):
        """A mask of the observations that have been used to compute
        the robust estimates of location and shape.

        Decorator for scikit-learn MinCovDet attributes.
        """
        return self.detector_.support_
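This is the pyod-style detector interface; a usage sketch assuming a compatible BaseDetector base class (as in pyod), where fit populates decision_scores_, threshold_, and labels_:

import numpy as np

rng = np.random.RandomState(42)
X_train = np.vstack([rng.normal(size=(95, 2)),
                     rng.normal(loc=6, size=(5, 2))])   # 5% contamination

clf = MCD(contamination=0.05, random_state=42)
clf.fit(X_train)
print(clf.labels_.sum())                 # roughly 5 points flagged as outliers
print(clf.decision_function(X_train[:3]))  # squared Mahalanobis distances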
Example 23
class Outlier_detection(object):
    def __init__(self,
                 support_fraction=0.95,
                 verbose=True,
                 chi2_percentile=0.995):
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision=True, support_fraction=support_fraction)
        self.chi2_percentile = chi2_percentile

    def fit(self, X):
        """Prints some summary stats (if verbose is one) and returns the indices of what it consider to be extreme"""
        self.mcd.fit(X)
        mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_,
                                                     self.mcd.precision_)
        d = np.array(list(map(mahalanobis, X)))  #Mahalanobis distance values
        self.d2 = d**2  #MD squared  # <--- l2 norm only option?!
        n, self.degrees_of_freedom_ = X.shape
        self.iextreme_values = (
            self.d2 > self.chi2.ppf(0.995, self.degrees_of_freedom_)
        )  # boolean array showing outliers
        self.outlier_inds = np.nonzero(od.iextreme_values)[0]  #
        if self.verbose:
            print(
                "%.3f proportion of outliers at %.3f%% chi2 percentile, " %
                (self.iextreme_values.sum() / float(n), self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self, log=False, sort=False):
        """
        log: transform the distance-sq to a log
        sort: sort the data according to distance before plotting
        """
        n = self.d2.shape[0]
        fig = plt.figure()

        x = np.arange(n)
        ax = fig.add_subplot(111)

        transform = (lambda x: x) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile,
                                 self.degrees_of_freedom_)
        chi_line = transform(chi_line)
        d2 = transform(self.d2)
        if sort:
            isort = np.argsort(d2)
            ax.scatter(x, d2[isort], alpha=0.7, facecolors='none')
            plt.plot(x,
                     transform(
                         self.chi2.ppf(np.linspace(0, 1, n),
                                       self.degrees_of_freedom_)),
                     c="r",
                     label="distribution assuming normal")
        else:
            ax.scatter(x, d2)
            extreme_values = d2[self.iextreme_values]
            ax.scatter(x[self.iextreme_values], extreme_values, color="r")

        ax.hlines(chi_line,
                  0,
                  n,
                  label="%.1f%% $\chi^2$ quantile" %
                  (100 * self.chi2_percentile),
                  linestyles="dotted")
        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])
        plt.show()

#        if plot_2d:
#            if self.degrees_of_freedom_!=2:
#                print('Dataset dimensions do not allow 2D plot.')
#            else:

# =============================================================================
#     # TO DO:   ADD 3D VERSION (SHOULD HAVE THIS SOMEWHERE)
# =============================================================================

# =============================================================================
#     # TO DO:   ADD ROBUST DEMO FROM SKLEARN (AND ADAPT), SEE BELOW:
# =============================================================================

    """ Robust Mahalanobis distance
        Sources:
            https://en.wikipedia.org/wiki/Mahalanobis_distance
            http://scikit-learn.org/stable/auto_examples/covariance/plot_mahalanobis_distances.html#sphx-glr-auto-examples-covariance-plot-mahalanobis-distances-py
            ^^ latter uses robust estimates of mu and covariance!
    """
    # fit a Minimum Covariance Determinant (MCD) robust estimator to data
    Mahal = pd.DataFrame(data=np.random.randn(50, 2))  # add
    X = Mahal.values  #MID_overview[['recentSales', 'CB_perc', 'HRW_perc']].values
    robust_cov = MinCovDet().fit(
        X
    )  # NB. robuster than standard covariance estimator EmpiricalCovariance().fit(X)
    # NB: recent scikit-learn versions already center on location_ inside
    # mahalanobis(), so there X would be passed directly
    Mahal['mahal_dist'] = robust_cov.mahalanobis(
        X - robust_cov.location_)  #** (0.33)
    Mahal['rank_mahal'] = Mahal.mahal_dist.rank(ascending=True).astype(int)
Example 24
    choosen = [k for k in rm if k[0] == fname]
    choosen.sort(
        key=lambda x: datetime.datetime.strptime(x[1], "%Y%m%d-%H%M%S"))

    for k1 in choosen:
        a1, a2, a3 = extractStats(k1[2], fr)
        a1.extend(a2)
        a1.extend(a3)
        ab.append(a1)

    rr = np.array(ab)

    #print(dataNameList_f)
    if dataNameList_f:
        print('fitting ' + str(len(dataNameList_f)) + ' new data')
        mcd.fit(rr[:-1 * nrAnalysis - 1, :])
    else:
        print('no new data')

    arn = mcd.mahalanobis(rr[-1 * nrAnalysis - 1:-1, :] -
                          mcd.location_)**(0.33)
    aro = mcd.mahalanobis(rr[:-1 * nrAnalysis - 1, :] - mcd.location_)**(0.33)

    print(np.median(aro[mcd.support_]))

    ax1.clear()
    ax1.scatter(rr[:-1 * nrAnalysis - 1, [0]],
                rr[:-1 * nrAnalysis - 1, [3]],
                marker='+')
    ax1.scatter(rr[-1 * nrAnalysis - 1:-1, [0]],
                rr[-1 * nrAnalysis - 1:-1, [3]],
Example 25
def reduce_cnts_based_on_coeffs(coeffs: list,
                                cnts: list,
                                percentile=75,
                                plot=True) -> tuple:
    avgcoeffs = spatial_efd.AverageCoefficients(coeffs)
    SDcoeffs = spatial_efd.AverageSD(coeffs, avgcoeffs)

    if plot:
        median = np.median(np.array(coeffs), axis=0)

        x_med, y_med = spatial_efd.inverse_transform(median, harmonic=10)
        x_avg, y_avg = spatial_efd.inverse_transform(avgcoeffs, harmonic=10)
        x_sd, y_sd = spatial_efd.inverse_transform(SDcoeffs, harmonic=10)

        ax = spatial_efd.InitPlot()
        spatial_efd.PlotEllipse(ax, x_avg, y_avg, color="w", width=2.0)
        spatial_efd.PlotEllipse(ax, x_med, y_med, color="b", width=2.0)

        # Plot avg +/- 1 SD error ellipses
        spatial_efd.PlotEllipse(ax,
                                x_avg + x_sd,
                                y_avg + y_sd,
                                color="r",
                                width=1.0)
        spatial_efd.PlotEllipse(ax,
                                x_avg - x_sd,
                                y_avg - y_sd,
                                color="r",
                                width=1.0)

        plt.close("all")

    arr = np.array(coeffs)
    reshaped = np.reshape(arr, [arr.shape[0], -1])
    MCD = MinCovDet()
    MCD.fit(reshaped)

    a = MCD.mahalanobis(reshaped)

    if plot:
        plt.boxplot(a)
        plt.show()
        plt.close("all")

    percentile = np.percentile(a, percentile)

    reduced = list(np.array(coeffs)[a < percentile])

    avgcoeffs = spatial_efd.AverageCoefficients(reduced)
    SDcoeffs = spatial_efd.AverageSD(reduced, avgcoeffs)

    median = np.median(np.array(reduced), axis=0)

    x_med, y_med = spatial_efd.inverse_transform(median, harmonic=10)
    x_avg, y_avg = spatial_efd.inverse_transform(avgcoeffs, harmonic=10)
    x_sd, y_sd = spatial_efd.inverse_transform(0.1 * SDcoeffs, harmonic=10)

    if plot:
        ax = spatial_efd.InitPlot()
        spatial_efd.PlotEllipse(ax, x_avg, y_avg, color="w", width=2.0)
        spatial_efd.PlotEllipse(ax, x_med, y_med, color="b", width=2.0)

        # Plot avg +/- 1 SD error ellipses
        spatial_efd.PlotEllipse(ax,
                                x_avg + x_sd,
                                y_avg + y_sd,
                                color="r",
                                width=1.0)
        spatial_efd.PlotEllipse(ax,
                                x_avg - x_sd,
                                y_avg - y_sd,
                                color="r",
                                width=1.0)

        i = 10
        plt.figure()
        ax = plt.gca()
        spatial_efd.plotComparison(
            ax,
            coeffs[i],
            10,
            cnts[i][:, 0],
            cnts[i][:, 1],
            color1="w",
            rotation=rots[i],
        )
        plt.show()
        plt.close("all")

    reduced_cnts = np.array(cnts)[a < percentile]

    return reduced_cnts, a < percentile
Example 26
    raw_slopes_ok_subjs = slopes_df[ok_subjects]

    control_data = raw_slopes_ok_subjs[raw_slopes_ok_subjs['group'] ==
                                       'control']
    control_slopes = control_data[task_names]
    preHD_data = raw_slopes_ok_subjs[raw_slopes_ok_subjs['group'] == 'preHD']
    preHD_slopes = preHD_data[task_names]
    '''
    PCA Representation of raw slopes
    '''

    all_slopes = raw_slopes_ok_subjs[task_names]
    rs = RobustScaler()
    scaled_all_slopes = (1.34896) * rs.fit_transform(all_slopes)
    mcd = MinCovDet()  #random_state=1982)
    mcd.fit(scaled_all_slopes)
    all_slopes_corr = corr_from_cov(mcd.covariance_)
    plot_corr_matrix(all_slopes_corr,
                     col_names=task_names,
                     title='All corr, raw')

    all_pcs, all_var_explained = PCs_from_cov(mcd.covariance_,
                                              task_names,
                                              n_components=n_components,
                                              convert_2_corr=True)

    # Properly centered DataFrame of scaled slopes
    stds = pd.Series(dict(zip(task_names,
                              np.sqrt(mcd.covariance_.diagonal()))))
    rn_slopes = pd.DataFrame(dict(zip(task_names, scaled_all_slopes.T))).\
        set_index(all_slopes.index) / stds -\
Example 27
def DetectOutliers(sc, cluster_label, red_dim = 2, outlier_prob_thres = 10**-4):
    """
    This function implements the outlier detection scheme of FEATS.


    Parameters
    ----------

    sc : SingleCell              
        The SingleCell object which contains the data and metadata of genes and cells

    cluster_label : str
        The name of the column in celldata assay of sc which stores the cluster labels of the cells

    red_dim : int, optional
        The reduced dimensionality in which the outliers are computed. Default 2.

    outlier_prob_thres : float
        The probability threshold for samples to be classified as outliers. Default 10^-4. 
        

    Returns
    -------

    SingleCell
        The single cell object containing the outlier analysis information in the celldata assay. It 
        contains the following columns in the celldata assay with the outlier information: 
        'FEATS_Outliers' - A column with the value True if the respective cell is an outlier, False otherwise.
        'FEATS_Msd' - The computed Mahalanobis squared distance for the respective cells. 
        'FEATS_Outlier_Score' - The outlier score for the respective cells.
        'FEATS_Oos' -  A column with the value True if the respective cell was not used by the Minimum
        Covariance Determinant (MCD) algorithm in computing the robust mean and covariance matrix. 

    """
    
    # Store outlier probability in sc object 
    sc.addCellData(col_data = -np.log10(np.ones(sc.dim[1]) * outlier_prob_thres), col_name = 'Outlier_Thres')

    # First check if clustering has been performed
    if (sc.checkCellData(cluster_label) == False):
        raise ValueError("Clustering has not been done. Perform clustering first! ")

    else:
        print("Computing outliers . . .")
        # Get cluster labels
        labels = sc.getCellData(cluster_label)
        cluster_ids = np.unique(labels)
        X = sc.getCounts()
        _, n_samples = X.shape

        # Sort according to F scores
        scores = sc.getGeneData('FEATS_F_Score')
        idx = np.argsort(scores, kind='mergesort')
        idx = idx[::-1] # Sort descending
        # X = X[idx[0:100], :]

        # PCA
        pc = PCA(n_components=red_dim)
        X_red = pc.fit_transform(X.T)
        X_red = X_red.T

        mcd = MinCovDet(assume_centered=True)
        #mcd = []
        #for i in range(n_clusters):
        #    mcd.append(MinCovDet(assume_centered=True))   # mcd object, to compute min cov determinant 

        oos = np.zeros(n_samples, dtype=bool)   # Out of sample estimates (bool), True if sample is not included 
                                                # in MCD computation

        squared_md = np.zeros(n_samples)        # Squared Mahalanobis Distance

        # For each cluster reduce the data and estimate the robust covariance
        for i in cluster_ids:
            mask = (labels == i)

            # If number of samples is less than number of features in reduced data squared. 
            if (np.sum(mask) < red_dim**2):

                print("Number of samples is less than number of features squared.")
                print("Not performing outlier detection on cluster ", i)
                oos[mask] = False               # Set the samples as not an outlier
                squared_md[mask] = 0.0      # Set the mahalanobis distance as zero.

            else:
 
                cluster = X_red[:, mask]
                mcd.fit(cluster.T)          # Fit a minimum covariance determinant estimator
                # cluster_mu = mcd.location_
                # cluster_cov = mcd.covariance_
                squared_md[mask] = mcd.mahalanobis(cluster.T)
                oos[mask] = ~mcd.support_  # samples outside the MCD support

        outlier_score = -np.log10(chi2.sf(squared_md, red_dim))
        outliers = outlier_score > -np.log10(outlier_prob_thres)

        print ("Number of outliers = ", np.sum(outliers))
        print ("Number of points in out of sample = ", np.sum(oos))

        print("Saving outlier information in Single Cell object . . .")

        sc.addCellData(col_data = outliers, col_name = "FEATS_Outliers")
        sc.addCellData(col_data = squared_md, col_name = "FEATS_Msd")
        sc.addCellData(col_data = outlier_score, col_name = "FEATS_Outlier_Score")
        sc.addCellData(col_data = oos, col_name = "FEATS_Oos")

    return sc
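The outlier score used above is -log10 of the chi-squared survival probability of the squared Mahalanobis distance; a worked example of the thresholding (distance values are illustrative):

import numpy as np
from scipy.stats import chi2

red_dim = 2
outlier_prob_thres = 1e-4

squared_md = np.array([1.0, 5.0, 25.0])
outlier_score = -np.log10(chi2.sf(squared_md, red_dim))
print(outlier_score)                                   # grows with distance
print(outlier_score > -np.log10(outlier_prob_thres))   # only the extreme point is flagged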