def calculate_distance_metrics(df, time_delta=pd.Timedelta(7, "D")):
    """Attach distribution-distance columns comparing each date's LoB mix
    to the mix observed ``time_delta`` earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per date; every column is a line-of-business (LoB)
        fraction. The merge below joins on ``'date_'``, so the index is
        assumed to be datetimes named ``'date_'`` — TODO confirm against
        the caller.
    time_delta : pandas.Timedelta, optional
        Offset between the "current" and "previous" rows compared.
        Defaults to 7 days.

    Returns
    -------
    pandas.DataFrame
        Merged frame with the raw LoB columns dropped and
        ``ks_distance``, ``wasserstein``, ``energy_dist`` and
        ``mahalanobis`` columns added.
    """
    lobs = df.columns.tolist()
    # Self-join LoB fractions, offset by the time specified in the time_delta kwarg.
    # Shifting the copy's index forward by time_delta makes row t line up
    # with row t - time_delta; the shifted copy's columns get '_prev'.
    df = df.merge(df.set_index(df.index.to_series() + time_delta),
                  on='date_',
                  suffixes=('', '_prev'))

    # For each cumulative distribution fraction based metric, compare current to previous distribution:
    # Kolmogorov-Smirnov Distance ([0] is the statistic; [1] would be the p-value).
    df['ks_distance'] = df.apply(lambda row: scipy.stats.ks_2samp(
        row[lobs], row[[x + "_prev" for x in lobs]])[0],
                                 axis=1)
    #df['ks_distance_pvalue'] = df.apply(lambda row: scipy.stats.ks_2samp(row[lobs], row[[x+"_prev" for x in lobs]])[1], axis=1)
    # Wasserstein Distance (aka Earth-Mover)
    df["wasserstein"] = df.apply(lambda row: scipy.stats.wasserstein_distance(
        row[lobs], row[[x + "_prev" for x in lobs]]),
                                 axis=1)
    # Cramer-von Mises Distance (aka energy distance)
    df["energy_dist"] = df.apply(lambda row: scipy.stats.energy_distance(
        row[lobs], row[[x + "_prev" for x in lobs]]),
                                 axis=1)
    # The '_prev' columns are only needed for the row-wise comparisons above.
    df.drop(columns=[x + "_prev" for x in lobs], inplace=True)

    # Calculate the Mahalanobis distance (based on the multivariate equivalent of the standard deviation)
    # Standardize, project onto at most the first 10 principal components,
    # then measure each row's robust (MinCovDet) Mahalanobis distance.
    pca = decomposition.PCA().fit_transform(
        preprocessing.StandardScaler().fit_transform(df[lobs]))[:, :10]
    mahalanobis = covariance.MinCovDet().fit(pca).mahalanobis(pca)
    df['mahalanobis'] = mahalanobis
    return df.drop(columns=lobs)
Beispiel #2
0
def robust_mahalanobis_with_chi2(feat, prob_reject, ret_dist=False):
    '''Reject outliers using one-class classification based on the mahalanobis distance
    estimate from a robust covariance as calculated by minimum covariance determinant.
    
    :Parameters:
        
        feat : array
               2D array where each row is a feature and each column a factor
        prob_reject : float
                      Probability threshold for rejecting outliers
        ret_dist : bool
                   If True, also return the Mahalanobis distances
    
    :Returns:
        
        sel : array
              Boolean selection array for each feature
        dist : array
               Mahalanobis distance per row (only when ``ret_dist`` is True)
    '''
    # Center on the column-wise median (robust to outliers). Work on a
    # copy instead of mutating the caller's array in place.
    feat = feat - numpy.median(feat, axis=0)
    try:
        robust_cov = skcov.MinCovDet().fit(feat)
    except Exception:
        # MinCovDet can fail (e.g. too few rows); fall back to the
        # non-robust empirical estimate rather than crashing.
        robust_cov = skcov.EmpiricalCovariance().fit(feat)
    dist = robust_cov.mahalanobis(feat)
    # Squared Mahalanobis distances of Gaussian data follow a chi-squared
    # distribution with dof = number of factors; cut at prob_reject mass.
    cut = scipy.stats.chi2.ppf(prob_reject, feat.shape[1])
    sel = dist < cut
    return (sel, dist) if ret_dist else sel
Beispiel #3
0
def AddMahalanobis(df):
    """Return a copy of ``df`` with one ``<type>_md`` column per distinct
    ``df.type`` value: the robust (MinCovDet) Mahalanobis distance of every
    row's numeric features to the covariance fitted on that type's subset."""
    result = df.copy()
    all_features = df[NUMERIC_FEATURES]
    for type_value in set(df.type):
        subset = df.loc[df.type == type_value, NUMERIC_FEATURES]
        estimator = covariance.MinCovDet().fit(subset)
        result[type_value + '_md'] = estimator.mahalanobis(all_features)
    return result
Beispiel #4
0
def robustcovest(df, covtype):
    """Estimate a covariance matrix for the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations in rows, variables in columns.
    covtype : str
        One of ``'sample'`` (unbiased sample covariance),
        ``'LedoitWolf'`` (shrinkage estimator) or ``'MinDet'``
        (minimum covariance determinant, robust to outliers).

    Returns
    -------
    pandas.DataFrame
        Covariance matrix indexed and labelled by ``df``'s columns.

    Raises
    ------
    ValueError
        If ``covtype`` is not one of the supported names.
    """
    if covtype == 'sample':
        estimate = np.cov(df, rowvar=False, ddof=1)
    elif covtype == 'LedoitWolf':
        # np.matrix is deprecated; sklearn accepts a plain ndarray.
        estimate = skc.LedoitWolf().fit(np.asarray(df)).covariance_
    elif covtype == 'MinDet':
        estimate = skc.MinCovDet().fit(df).covariance_
    else:
        # The original silently returned None here; fail loudly instead.
        raise ValueError('Unknown covtype: {}'.format(covtype))
    return pd.DataFrame(estimate, index=df.columns, columns=df.columns)
Beispiel #5
0
 def __init__(self, dim, estimator='OAS', **kwargs):
     """Initialise Gaussian parameters backed by a scikit-learn
     covariance estimator.

     Parameters
     ----------
     dim : int
         Dimensionality, forwarded to the parent constructor.
     estimator : str, optional
         Name of the sklearn covariance estimator to use: one of
         'EmpiricalCovariance', 'LedoitWolf', 'MinCovDet', 'OAS'
         (default) or 'ShrunkCovariance'.
     **kwargs
         Extra keyword arguments forwarded to the parent constructor.

     Raises
     ------
     ValueError
         If ``estimator`` is not a recognised name.
     """
     super(SKGaussianParams, self).__init__(dim, **kwargs)
     # Dispatch table keeps the supported names in one place.
     estimators = {
         'EmpiricalCovariance': covariance.EmpiricalCovariance,
         'LedoitWolf': covariance.LedoitWolf,
         'MinCovDet': covariance.MinCovDet,
         'OAS': covariance.OAS,
         'ShrunkCovariance': covariance.ShrunkCovariance,
     }
     if estimator not in estimators:
         raise ValueError('Unknown estimator: {}'.format(estimator))
     self._estimator = estimators[estimator](assume_centered=True)
Beispiel #6
0
def get_covariance(var_iter, shrinkage=0.1):
    """Build a shrunk, robustified covariance matrix from a variant stream.

    Parameters
    ----------
    var_iter : iterable of (samples, genos, quals, variant)
        Variant records; only ``genos`` (the per-sample genotype vector,
        presumably 1-D -- TODO confirm against caller) is used.
    shrinkage : float, optional
        Shrinkage coefficient for ``covariance.shrunk_covariance``.

    Returns
    -------
    numpy.ndarray
        Covariance estimate; robustified with MinCovDet when that fit
        succeeds, otherwise the shrunk estimate as-is.
    """
    rows = []
    for samples, genos, quals, variant in var_iter:
        # Skip missing, incomplete, or monomorphic variants.
        if genos is None:
            continue
        # .any() works for arrays of any dimensionality, unlike the
        # original any(np.isnan(genos)) which only handled 1-D input.
        if np.isnan(genos).any():
            continue
        if len(np.unique(genos)) == 1:
            continue
        rows.append(genos)
    cov = np.cov(np.array(rows, dtype='f').T)
    # Force a unit diagonal before shrinking.
    cov[np.diag_indices_from(cov)] = 1
    # shrunk
    cov = covariance.shrunk_covariance(cov, shrinkage=shrinkage)
    #cov, _ = covariance.ledoit_wolf(cov)
    #cov, _ = covariance.oas(cov)
    # robust: best-effort -- MinCovDet can raise on degenerate input, in
    # which case the shrunk estimate above is returned unchanged.
    try:
        cov = covariance.MinCovDet().fit(cov).covariance_
    except ValueError:
        pass
    return cov
cur.execute(sql)

colnames = [desc[0] for desc in cur.description]

raw_data = cur.fetchall()

df = pd.DataFrame(raw_data, columns=colnames)
# Drop the identifier and diagnostic columns in one call. pandas 2.x made
# everything after `labels` keyword-only, so the old positional
# `.drop('userid', 1)` form raises TypeError; use columns= explicitly.
dependents = df.drop(columns=['userid',
                              'distance_pct_path_error',
                              'classification_confidence',
                              'suspension_fit_error']).fillna(0)
# Min-max scale every column to [0, 1].
dependents = dependents.apply(lambda x: (x - np.min(x)) /
                              (np.max(x) - np.min(x)))

# Robust location/covariance via minimum covariance determinant.
mcd = covariance.MinCovDet()
mcd.fit(dependents)
# sklearn's mahalanobis() returns squared distances, hence the sqrt.
# NOTE(review): mahalanobis() already centers on location_, so subtracting
# mcd.location_ again double-centers the data -- confirm this is intended.
distances = mcd.mahalanobis(dependents - mcd.location_)**(.5)
# Materialize the zip: on Python 3 it is a single-use iterator.
distances_with_idx = list(zip(range(len(distances)), distances))
#pctile_cutoff = np.percentile(distances_with_idx, 90)

pctile_cutoff = 0
filtered_distances = [i for i in distances_with_idx if i[1] > pctile_cutoff]
# Keep the 21 largest distances.
filtered_distances.sort(key=lambda x: -x[1])
filtered_distances = filtered_distances[:21]

# De-normalize the robust center back to original units for reporting.
max_trips = max(df['trip_cnt'])
trip_center = mcd.location_[0] * max_trips

max_minutes = max(df['avg_minutes'])
minutes_center = mcd.location_[1] * max_minutes