Example #1
def sliced_wasserstein_no_histogram(p, q, iters=20):
    '''
        Utility function for the 1-sliced Wasserstein distance.
        Tries to fight the curse of dimensionality by only considering the
        "bins" around the sampled values.
        If the sampled values are entirely wrong, this method will not work,
        so run a first-moment test first.
        p = sampled values
        q = density function at the sampled values
    '''
    if any(np.isnan(x) for x in p.flatten()):
        return float('inf')
    if len(set(tuple(x) for x in p)) / len(p) <= 0.1:
        return float('inf')

    dim = p.shape[1]
    if dim == 1:
        return wasserstein_distance(p.flatten(), p.flatten(), np.ones(p.shape[0]) + EPS, q + EPS)

    dist = 0
    for _ in range(iters):
        proj_vec = normal(size=dim)
        proj_vec = proj_vec / norm(proj_vec) # sample randomly from dim-1 sphere
        bins = [np.dot(proj_vec, pt) for pt in p]
        dist += wasserstein_distance(bins, bins, np.ones(p.shape[0]) + EPS, q + EPS)
    return dist/iters
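For reference, here is a minimal standalone sketch of the sliced-Wasserstein idea used above, written for two point clouds rather than a sample/density pair; the function name and two-sample setup are illustrative and not part of the original helper.

# Sliced Wasserstein between point clouds p of shape (n, dim) and q of shape (m, dim):
# project both onto random unit directions and average the 1-D Wasserstein distances.
import numpy as np
from scipy.stats import wasserstein_distance

def sliced_wasserstein_example(p, q, iters=20, seed=0):
    rng = np.random.default_rng(seed)
    dim = p.shape[1]
    total = 0.0
    for _ in range(iters):
        v = rng.normal(size=dim)
        v /= np.linalg.norm(v)                        # random direction on the unit sphere
        total += wasserstein_distance(p @ v, q @ v)   # 1-D distance of the projections
    return total / iters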
Example #2
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed
    """

    preprocessor = get_preprocessor()
    preprocessor = preprocessor.fit(X)
    X_pp = preprocessor.transform(X)

    xpipe = Pipeline(steps=[('pca', PCA(2)),
                            ('clf', EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X_pp)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X_pp[subset_indices, :]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X_pp.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(
        0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(
        0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(
        0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "preprocessor": preprocessor,
        "clf_X": xpipe,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return (to_return)
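As a hypothetical follow-up (not part of the original source), the thresholds returned above could be applied to a fresh, already-preprocessed batch like this; the function name and argument names are assumptions.

# Flag drift when the distance from the source data exceeds the bootstrap threshold.
def check_drift(monitoring, X_new_pp, y_new):
    x_dist = wasserstein_distance(monitoring["X_source"].flatten(), X_new_pp.flatten())
    y_dist = wasserstein_distance(monitoring["y_source"], y_new)
    return {"X_drift": x_dist > monitoring["wasserstein_X"],
            "y_drift": y_dist > monitoring["wasserstein_y"]}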
Example #3
    def calc_c_hist(self):
        """
        Compare the histograms of all of the eigenvectors (eigenfunctions) for each of the
        meshes, but only up to the number of features we are interested in. The paper says to do
        this for the number of features of interest (i.e. 5 or 6); however, this implementation
        does it for more than the requested number of features to ensure that we don't get the wrong ones.
        So, we'll get more spectral coordinates, order them, and then select the appropriate number from
        these re-ordered coordinates.

        :return:
        """

        # Initially tried using straight values (not log), but eigenvector 1 got accentuated too much (in the weighting).
        # Then tried just log, but because there are negative values it creates errors.
        # Need to add 0.5 plus a small value to ensure there are no zero values passed to the log.
        # wasserstein_distance is the same as earth movers distance, and is the minimum "work" needed to
        # transform u (first entry) into v (second entry).
        eps = np.finfo(float).eps
        for i in range(self.n_features):
            for j in range(self.n_features):
                self.c_hist[i, j] = wasserstein_distance(
                    np.log(self.rand_target_eig_vecs[:, i] + 0.5 + eps),
                    np.log(self.rand_source_eig_vecs[:, j] + 0.5 + eps))
                self.c_hist_f[i, j] = wasserstein_distance(
                    np.log(self.rand_target_eig_vecs[:, i] + 0.5 + eps),
                    np.log(-self.rand_source_eig_vecs[:, j] + 0.5 + eps))
Example #4
def estimate_noise_model(img_segment):
    mean, standard_deviation, hist, bins = estimate_noise_parameters(
        img_segment, return_histogram=True)
    pdf, bins = histogram_processing.compute_image_pdf(img_segment)
    #standard_deviation = np.sqrt(np.var(pdf.values()))

    gaussian_pdf = generate_gaussian_pdf(mean, standard_deviation)
    rayleigh_pdf = generate_rayleigh_pdf(mean, standard_deviation)
    erlang_pdf = generate_erlang_pdf(mean, standard_deviation)
    exponential_pdf = generate_exponential_pdf(mean, standard_deviation)
    uniform_pdf = generate_uniform_pdf(mean, standard_deviation)

    gaussian_distance = wasserstein_distance(
        pdf, np.array(list(gaussian_pdf.values())))
    rayleigh_distance = wasserstein_distance(
        pdf, np.array(list(rayleigh_pdf.values())))
    erlang_distance = wasserstein_distance(pdf,
                                           np.array(list(erlang_pdf.values())))
    exponential_distance = wasserstein_distance(
        pdf, np.array(list(exponential_pdf.values())))
    uniform_distance = wasserstein_distance(
        pdf, np.array(list(uniform_pdf.values())))

    distances = {
        'gaussian': gaussian_distance,
        'rayleigh': rayleigh_distance,
        'erlang': erlang_distance,
        'exponential': exponential_distance,
        'uniform': uniform_distance
    }

    min_distance = tuple((d[0], d[1]) for d in distances.items()
                         if d[1] == min(distances.values()))

    return distances, min_distance
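The selection logic above boils down to: build an empirical pdf, build candidate analytic pdfs, and keep the candidate with the smallest 1-D Wasserstein distance. A toy, self-contained version of that pattern follows; the generate_*_pdf helpers are not reproduced, so scipy's norm and rayleigh stand in, and the data are synthetic.

import numpy as np
from scipy.stats import wasserstein_distance, norm, rayleigh

samples = np.random.normal(loc=0.5, scale=0.1, size=10000)
pdf, edges = np.histogram(samples, bins=64, range=(0, 1), density=True)
centers = 0.5 * (edges[:-1] + edges[1:])
candidates = {
    "gaussian": norm.pdf(centers, loc=samples.mean(), scale=samples.std()),
    "rayleigh": rayleigh.pdf(centers, scale=samples.std()),
}
# Like the original, this compares the pdf value arrays themselves as 1-D samples.
best = min(candidates, key=lambda name: wasserstein_distance(pdf, candidates[name]))
print(best)  # expected: "gaussian"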
Example #5
def compare_drift(X_src, y_src, X_new, y_new):
    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)

    clf_X.fit(X_src)
    clf_y.fit(y_src.reshape(y_src.size, 1))

    test_X = clf_X.predict(X_new)

    test_y = clf_y.predict(y_new.reshape(-1, 1))

    X_distance = wasserstein_distance(X_src.values.flatten(),
                                      X_new.values.flatten())

    y_distance = wasserstein_distance(y_src.flatten(), y_new.flatten())

    X_outlier = len(test_X[test_X == -1]) / len(test_X)

    y_outlier = len(test_y[test_y == -1]) / len(test_y)

    results = {
        'X_wasserstein_distance': X_distance,
        'y_wasserstein_distance': y_distance,
        'X_outlier_percentage': X_outlier,
        'y_outlier_percentage': y_outlier
    }

    return results
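An illustrative call with synthetic data (made up here, and assuming numpy, pandas, EllipticEnvelope and wasserstein_distance are imported at module level as in the other examples) shows the expected input types: source frame/array versus new frame/array.

rng = np.random.default_rng(0)
X_src = pd.DataFrame(rng.normal(size=(500, 3)))
X_new = pd.DataFrame(rng.normal(loc=0.3, size=(200, 3)))
y_src = rng.normal(size=500)
y_new = rng.normal(loc=0.3, size=200)
print(compare_drift(X_src, y_src, X_new, y_new))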
Example #6
def metric_max(n_classes,Mtype):
    Pref=np.ones(n_classes)/n_classes #Reference attribute
    Pep=np.zeros(n_classes)
    Pep[0]=1
    if Mtype=="L1":
        fair_d = abs(Pep - Pref).sum()
    elif Mtype=="L2":
        fair_d = np.sqrt(((Pep - Pref)**2).sum())
    elif Mtype=="Is":
        #L1
        l1_fair_d = abs(Pep - Pref).sum()
        #Specificity
        rank=np.linspace(1,n_classes-1,n_classes-1)
        rank[::-1].sort() #Descending order
        perc=np.array([i/np.sum(rank) for i in rank])
        
        alpha=Pep[1:]
        specificity=abs(Pep[0]-np.sum(alpha*perc))
        fair_d=(l1_fair_d+specificity)/2
    elif Mtype=="Wd":
        fair_d=wasserstein_distance(Pep,Pref)
    elif Mtype=="Wds":
        #Specificity
        rank=np.linspace(1,n_classes-1,n_classes-1)
        rank[::-1].sort() #Descending order
        perc=np.array([i/np.sum(rank) for i in rank])
        alpha=Pep[1:]
        specificity=abs(Pep[0]-np.sum(alpha*perc))
        # Wasserstein distance
        ws=wasserstein_distance(Pep,Pref)
        
        fair_d=(ws+specificity)/2    
    else:
        fair_d=0
    return fair_d
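An illustrative call (not from the original source): the worst-case distance for a one-hot prediction over a 4-class attribute, under each metric type.

for m in ("L1", "L2", "Is", "Wd", "Wds"):
    print(m, metric_max(4, m))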
Example #7
def compute_statistics_MLP(y_A, y_nodes, output_A, output_nodes, y_seq_len, output_seq_len):
    r"""
    Compute statistics for the current data point, based on the one-shot output from the MLP decoder.

    :param output_A: predicted A
    :param output_nodes: predicted X
    :param output_seq_len: predicted |V|
    :param y_A: target A
    :param y_nodes: target X
    :param y_seq_len: target |V|
    :return: streetmover, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
    """
    output_graph = nx.from_numpy_matrix(output_A)
    y_graph = nx.from_numpy_matrix(y_A)
    
    output_degree = get_degree_hist(output_graph)
    y_degree = get_degree_hist(y_graph)
    dist_degree = wasserstein_distance(output_degree, y_degree)
    
    output_diam = get_diameters(output_graph)
    y_diam = get_diameters(y_graph)
    dist_diam = wasserstein_distance(output_diam, y_diam) if len(output_diam) > 0 else 1
    
    delta_n_nodes = int(output_seq_len - y_seq_len)
    delta_n_edges = output_A.sum() - y_A.sum()
    
    acc_A = get_accuracy_A(output_A, y_A)
    
    (y_pc, output_pc), (streetmover, P, C) = streetmover_distance(y_A, y_nodes, output_A, output_nodes, n_points=100)
    # print("Streetmover distance: {:.3f}".format(streetmover.item()))
    
    return streetmover.item(), acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
Example #8
def wasserstein(dataset_path):
    """
    This needs to be the full datasets to make sense.
    Since it is the full datasets, this is not included
    when running all tests.
    """
    from scipy.stats import wasserstein_distance
    for dataset_cls in all_datasets():
        print('Loading dataset:', dataset_cls.__name__)
        train = dataset_cls(dataset_path, split=TRAIN)
        valid = dataset_cls(dataset_path, split=VALIDATION)
        test = dataset_cls(dataset_path, split=TEST)
        splits = [train, valid, test]

        ws_max = 0.0
        ws_ave = 0.0
        for i in range(train.num_features):
            for j, split in enumerate(splits):
                if j == 0:
                    hist, bins = np.histogram(split.x[:, i], density=True)
                    us = split.x[:, i]
                else:
                    hist, _ = np.histogram(split.x[:, i], density=True, bins=bins)
                    vs = split.x[:, i]
                    ws = wasserstein_distance(us, vs)
                    ws_max = max(ws_max, ws)
                    ws_ave += ws / (train.num_features * 2)
        print('Max wasserstein:', ws_max)
        print('Average wasserstein:', ws_ave)
        print()
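A compact version of the per-feature split comparison above, on synthetic arrays standing in for split.x (illustrative only, not one of the real datasets):

import numpy as np
from scipy.stats import wasserstein_distance

train_x = np.random.normal(size=(1000, 5))
valid_x = np.random.normal(loc=0.1, size=(500, 5))
per_feature = [wasserstein_distance(train_x[:, i], valid_x[:, i])
               for i in range(train_x.shape[1])]
print('Max wasserstein:', max(per_feature))
print('Average wasserstein:', float(np.mean(per_feature)))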
Example #9
def f_dist(histogram1, histogram2):
    if histogram1 != [0] and histogram2 != [0]:
        return wasserstein_distance(histogram1[0], histogram2[0])
    elif histogram1 != [0] and histogram2 == [0]:
        return wasserstein_distance(histogram1[0], np.zeros((1, )))
    elif histogram1 == [0] and histogram2 != [0]:
        return wasserstein_distance(histogram2[0], np.zeros((1, )))
    # both histograms are the empty placeholder [0]; the distance is zero
    return 0.0
Example #10
def report_linear_onestep(metric_function):
    
    error_list = []
    
    time_points, test_data = linear.simulate_default()

    feature_list = [] # here we do not have external time-series or control variables
    target_list = ['x_component', 'y_component', 'z_component']
    df_test = create_data(test_data, time_points, cols=target_list, num=2)


    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(1, 4, 1)[0]
        yi = np.random.uniform(0, 4, 1)[0]
        zi = np.random.uniform(1, 2, 1)[0]
        time_points, cubic_data = linear.simulate_custom(xinit=xi, yinit=yi, zinit=zi)

        df = create_data(cubic_data, time_points, cols=target_list, num=0)
        df_train = train_onestep.generate_dataset(df, [0],feature_list, target_list, n_dim=2500)
        
        rf_model = RandomForestRegressor(n_estimators=20)
        figure_path = './plots/'
        rf_dict, score_dict = train_onestep.train_classic(df_train, rf_model, plot=False,model_type='random_forest', figure_path=figure_path)
        time_points, predictions = predict_onestep.predict_integrate(df_test, df, rf_dict, target_list, feature_list, title='test', plot=False,model_type='random_forest', subplots=(3,1), bio=False)
        predictions = predictions.to_numpy()
        
        if metric_function == "wasserstein":
            e1 = wasserstein_distance(predictions[:,0], test_data[0,:,0])
            e2 = wasserstein_distance(predictions[:,1], test_data[0,:,1])
            e3 = wasserstein_distance(predictions[:,2], test_data[0,:,2])
            
        elif metric_function == "dtw":
            e1, _ = fastdtw(predictions[:,0], test_data[0,:,0], dist=euclidean)
            e2, _ = fastdtw(predictions[:,1], test_data[0,:,1], dist=euclidean)
            e3, _ = fastdtw(predictions[:,2], test_data[0,:,2], dist=euclidean)
            e1 /= np.linalg.norm(test_data[0,:,0], 2)**2
            e2 /= np.linalg.norm(test_data[0,:,1], 2)**2
            e3 /= np.linalg.norm(test_data[0,:,2], 2)**2
            
        elif metric_function == "mse":
            e1 = predict_lmmNet.compute_MSE(predictions, test_data[0], 0)
            e2 = predict_lmmNet.compute_MSE(predictions, test_data[0], 1)
            e3 = predict_lmmNet.compute_MSE(predictions, test_data[0], 2)
        error_list.append((e1, e2, e3))
        
        # plot
        plt.figure(figsize=(20, 10))
        plt.plot(time_points, test_data[0,:,0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0,:,1], 'y.', label='x_2')
        plt.plot(time_points, test_data[0,:,2], 'g.', label='x_3')
        plt.plot(time_points, predictions[:,0], 'b--', label='predicted dynamics')
        plt.plot(time_points, predictions[:,1], 'b--')
        plt.plot(time_points, predictions[:,2], 'b--')
        plt.title(str(xi) + " " + str(yi) + " " + str(zi))
        plt.legend()
        plt.show()
        
    return error_list
Example #11
    def get_univariate_quant_metrics(self,
                                     dataset=TRAIN,
                                     transformed=False,
                                     verbose=True,
                                     thin_model=None,
                                     thin_true=None,
                                     seed=None,
                                     n=None):
        """
        Calculates quantitative metrics for the difference between p(t) and
        p_model(t) and the difference between p(y) and p_model(y)

        :param dataset: dataset subset to evaluate on (train, val, or test)
        :param transformed: If True, use transformed version of data.
            If False, use original (non-transformed) version of data.
        :param thin_model: thinning interval for the model data
        :param thin_true: thinning interval for the real data
        :param seed: seed for sample from generative model
        :return: {
            't_ks_pval': KS p-value with null that t_model and t_true are from the same distribution
            'y_ks_pval': KS p-value with null that y_model and y_true are from the same distribution
            't_es_pval': Epps-Singleton p-value with null that t_model and t_true are from the same distribution
            'y_es_pval': Epps-Singleton p-value with null that y_model and y_true are from the same distribution
            't_wasserstein1_dist': Wasserstein-1 distance between t_true and t_model
            'y_wasserstein1_dist': Wasserstein-1 distance between y_true and y_model
        }
        """
        _, t_model, y_model = to_np_vectors(self.sample(
            seed=seed, untransform=(not transformed)),
                                            thin_interval=thin_model)

        _, t_true, y_true = self.get_data(transformed=transformed,
                                          dataset=dataset,
                                          verbose=verbose)
        t_true, y_true = to_np_vectors((t_true, y_true),
                                       thin_interval=thin_true)

        # jitter for numerical stability
        t_true = t_true.copy() + np.random.rand(*t_true.shape) * 1e-6
        t_model = t_model.copy() + np.random.rand(*t_model.shape) * 1e-6

        ks_label = "_ks_pval"
        es_label = "_es_pval"
        wasserstein_label = "_wasserstein1_dist"
        metrics = {
            T + ks_label:
            float(stats.ks_2samp(t_model, t_true).pvalue),
            Y + ks_label:
            float(stats.ks_2samp(y_model, y_true).pvalue),
            T + es_label:
            float(stats.epps_singleton_2samp(t_model, t_true).pvalue),
            Y + es_label:
            float(stats.epps_singleton_2samp(y_model, y_true).pvalue),
            T + wasserstein_label:
            float(stats.wasserstein_distance(t_model, t_true)),
            Y + wasserstein_label:
            float(stats.wasserstein_distance(y_model, y_true)),
        }

        return metrics
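The core two-sample checks used above, shown in isolation on synthetic draws (the T/Y labelling, thinning and Epps-Singleton test are omitted for brevity):

import numpy as np
from scipy import stats

t_true = np.random.normal(0.0, 1.0, 500)
t_model = np.random.normal(0.2, 1.0, 500)
print('ks p-value:', stats.ks_2samp(t_model, t_true).pvalue)
print('wasserstein-1:', stats.wasserstein_distance(t_model, t_true))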
Example #12
def clustering(init_rec_field1, fin_rec_field1, dst, mode='RGB', show=True):
    """
        Dendrograms with earth mover's distance.
    """    
    
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy as scp
    
    from sklearn.cluster import AgglomerativeClustering
    from scipy.stats import wasserstein_distance
    from scipy.cluster.hierarchy import dendrogram, linkage
    
    if mode == 'RGB':
        channels = 3
    else:
        channels = 4
    
    # first gen: generate clusters and dendrograms with earth mover distance
    dist = np.zeros(shape=(16,16))
    for i in range(16):
        for j in range(16):
            dist[i,j] = wasserstein_distance(init_rec_field1[:,:,:channels,i].flatten(), 
                                             init_rec_field1[:,:,:channels,j].flatten())
            
    cluster = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage="average")
    clusters = cluster.fit_predict(dist)
    
    # plot dendrogram
    sq_dist = scp.spatial.distance.squareform(dist)
    linkage_matrix = linkage(sq_dist, "average")
    dendrogram(linkage_matrix)
    plt.title("[DENDROGRAM RECEPTIVE FIELDS, FIRST GENERATION]: earth mover distance, linkage 'avg'.")

    if show:
        plt.show()
    # last gen: generate clusters and dendrograms with earth mover distance
    dist = np.zeros(shape=(16,16))
    for i in range(16):
        for j in range(16):
            dist[i,j] = wasserstein_distance(fin_rec_field1[:,:,:channels,i].flatten(), 
                                             fin_rec_field1[:,:,:channels,j].flatten())
            
    cluster = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage="average")
    clusters = cluster.fit_predict(dist)
    
    # plot dendrogram
    sq_dist = scp.spatial.distance.squareform(dist)
    linkage_matrix = linkage(sq_dist, "average")
    dendrogram(linkage_matrix)
    plt.title("[DENDROGRAM RECEPTIVE FIELDS, LAST GENERATION]: earth mover distance, linkage 'avg'.")

    if show:
        plt.show()
Example #13
def calc_distance(features: dict, shape_features: dict) -> dict:
    """
    Determines the closest shape to the query shape by computing a custom distance function
    between the features of each dataset shape and the features of the query shape.
    ----------------------------
    Args:
        features (obj: 'dict'): The dictionary containing the feature metrics of the dataset's shapes
        shape_features (obj: 'dict'): The dictionary containing the feature metrics of the query shape

    Returns:
        similarities (obj: 'dict'): The dictionary containing the closest shapes (key) and the respective
                                    distance to the query shape (value)
    """
    similarities = {}
    weights = np.load(s.SAVED_DATA + "distance_weights.npy",
                      allow_pickle=True).item()

    for id, featuresList in features.items():
        # Distance is the square root of the sum of squared differences
        dist_v = distance.euclidean(featuresList['volume'],
                                    shape_features.get('volume'))
        dist_a = distance.euclidean(featuresList['area'],
                                    shape_features.get('area'))
        dist_c = distance.euclidean(featuresList['compactness'],
                                    shape_features.get('compactness'))
        dist_bb = distance.euclidean(featuresList['bbox_volume'],
                                     shape_features.get('bbox_volume'))
        dist_d = distance.euclidean(featuresList['diameter'],
                                    shape_features.get('diameter'))
        dist_e = distance.euclidean(featuresList['eccentricity'],
                                    shape_features.get('eccentricity'))

        dist_A3 = wasserstein_distance(featuresList['A3'][0],
                                       shape_features.get('A3')[0])
        dist_D1 = wasserstein_distance(featuresList['D1'][0],
                                       shape_features.get('D1')[0])
        dist_D2 = wasserstein_distance(featuresList['D2'][0],
                                       shape_features.get('D2')[0])
        dist_D3 = wasserstein_distance(featuresList['D3'][0],
                                       shape_features.get('D3')[0])
        dist_D4 = wasserstein_distance(featuresList['D4'][0],
                                       shape_features.get('D4')[0])

        similarity = weights["w_v"]*dist_v + \
            weights["w_a"]*dist_a + \
            weights["w_c"]*dist_c + \
            weights["w_bb"]*dist_bb + \
            weights["w_d"]*dist_d + \
            weights["w_e"]*dist_e + \
            weights["w_A3"]*dist_A3 + \
            weights["w_D1"]*dist_D1 + \
            weights["w_D2"]*dist_D2 + \
            weights["w_D3"]*dist_D3 + \
            weights["w_D4"]*dist_D4

        similarities[id] = similarity

    return similarities
Example #14
def behavior_comparison():
    networks = (
        ('Caveman-50-10', fio.read_network('networks/cavemen-50-10.txt')),
        ('Elitist-500', fio.read_network('networks/elitist-500.txt')),
        ('CGG-500', fio.read_network('networks/cgg-500.txt'))
    )

    num_sims = 50
    num_behaviors = 3
    distributions = []
    averages = np.zeros((len(networks), num_behaviors))
    loop = tqdm(total=len(networks) * num_behaviors * num_sims)
    for i, (n_name, net) in enumerate(networks):

        behaviors = (
            ('No Mitigations',
             behavior.NoMitigation()),
            ('Generic Pressure R=1',
             behavior.SimplePressureBehavior(net, rng=RNG, radius=1)),
            ('Edge Pressure R=1',
             behavior.SimpleEdgePressureBehavior(net, rng=RNG, radius=1))
            # ('All Edges Sequential Flicker 1/4',
            #  StaticFlickerBehavior(net.M, net.edges, (True, False, False, False))),
            # ('All Edges Random Flicker 0.25',
            #  RandomFlickerBehavior(net.M, net.edges, 0.25)),
            # ('Collected Pressure Flicker 0.25, R=1',
            #  UnifiedPressureFlickerBehavior(net, 1, RNG)),
            # ('Generic Pressure Radius 3',
            #  SimplePressureBehavior(net, 3)),
            # ('Pressure Decay Radius 3',
            #  PressureDecayBehavior(net, 3)),
            # ('Pressure Flicker Radius 3',
            #  PressureFlickerBehavior(net, 3))
        )

        for j, (b_name, behavior_update) in enumerate(behaviors):  # avoid shadowing the behavior module used above
            s_scores = []
            for _ in range(num_sims):
                loop.set_description(f'{n_name}, {b_name}')
                end_sir = simulate(net.M, sir0=make_starting_sir(net.N, 1, rng=RNG),
                                   disease=Disease(4, 0.3),
                                   update_connections=behavior_update,
                                   max_steps=200,
                                   rng=RNG)[-1]
                s_scores.append(np.sum(end_sir[0, :] > 0)/net.N)
                loop.update()
            # plt.title(f'{n_name}, {b_name}, Avg: {sum(s_scores)/len(s_scores)}')
            # plt.hist(s_scores)
            # plt.figure()
            averages[i, j] = sum(s_scores)/len(s_scores)
            distributions.append(s_scores)
    print(wasserstein_distance(distributions[1], distributions[2]))
    print(wasserstein_distance(distributions[4], distributions[5]))
    print(wasserstein_distance(distributions[7], distributions[8]))
    # plt.show()
    np.set_printoptions(precision=3, suppress=True)
    print(averages)
Example #15
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances

    """
    X_pre = X
    xpipe = Pipeline(steps=[('scaler', StandardScaler()),
                            ('rf', RandomForestRegressor())])
    xpipe.fit(X_pre, y)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        n_samples = int(np.round(0.8 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y_bs = y[mask]
        X_bs = X[mask]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X_pre.values.flatten(),
                                                X_bs.values.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(
        0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(
        0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(
        0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "pipe_X": xpipe,
        "X_source": X_pre,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return (to_return)
Example #16
    def detect(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> Dict[str, Union[float, np.ndarray]]:
        """
        determine outlier and distance thresholds
        return thresholds, outlier model(s) and source distributions for distances
        NOTE: for classification the outlier detection on y is not needed
        """

        if isinstance(X, pd.DataFrame):
            X = X.values

        if isinstance(y, pd.Series):
            y = y.values


        contamination: float = 0.01

        xpipe = Pipeline(steps=[('pca', PCA(2, random_state=self.seed)),
                                ('clf', EllipticEnvelope(random_state=self.seed, contamination=contamination))])
        xpipe.fit(X)

        bs_samples: int = 1000
        outliers_X: np.ndarray = np.zeros(bs_samples)
        wasserstein_X: np.ndarray = np.zeros(bs_samples)
        wasserstein_y: np.ndarray = np.zeros(bs_samples)

        for b in range(bs_samples):
            # set random seed
            rng = np.random.default_rng(self.seed + b)

            n_samples = int(np.round(0.80 * X.shape[0]))
            subset_indices = rng.choice(np.arange(X.shape[0]), n_samples, replace=True).astype(int)
            y_bs = y[subset_indices]
            X_bs = X[subset_indices, :]

            test1 = xpipe.predict(X_bs)
            wasserstein_X[b] = wasserstein_distance(X.flatten(), X_bs.flatten())
            wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
            outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

        ## determine thresholds as a function of the confidence intervals
        outliers_X.sort()
        outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(0.025 * bs_samples)]

        wasserstein_X.sort()
        wasserstein_X_threshold = wasserstein_X[int(0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

        wasserstein_y.sort()
        wasserstein_y_threshold = wasserstein_y[int(0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

        result = {
                    "outlier_X": np.round(outlier_X_threshold, 2),
                    "wasserstein_X": np.round(wasserstein_X_threshold, 2),
                    "wasserstein_y": np.round(wasserstein_y_threshold, 2)
                  }
        return result
Example #17
def get_monitoring_tools(df):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed

    """
    X, y, dates = engineer_features(df)
    X1 = X.to_numpy()
    xpipe = Pipeline(steps=[('pca', PCA(2)),
                            ('clf', EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X1)
    bs_samples = 549
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)
    for b in range(bs_samples):
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X1.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X1[subset_indices, :]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X1.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(
        0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(
        0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(
        0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "clf_X": xpipe,
        "X_source": X1,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return (to_return)
Example #18
def report_harmonic_onestep(metric_function):
    
    error_list = []
    
    time_points, test_data = harmonic.simulate_custom(xinit=1, yinit=0)

    feature_list = [] # here we do not have external time-series or control variables
    target_list = ['x_component', 'y_component']
    df_test = create_data(test_data, time_points, cols=target_list, num=2)

    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(0, 4, 2)
        yi = np.random.uniform(0, 4, 2)
        time_points, data1 = harmonic.simulate_custom(xinit=xi[0], yinit=yi[0])    
        time_points, data2 = harmonic.simulate_custom(xinit=xi[1], yinit=yi[1])

        # now generate and augment the training dataset
        df1 = create_data(data1, time_points, cols=target_list, num=0)
        df2 = create_data(data2, time_points, cols=target_list, num=1)
        df = pd.concat([df1, df2])
        df_train = train_onestep.generate_dataset(df, [0,1],feature_list, target_list, n_dim=30000)

        rf_model = RandomForestRegressor(n_estimators=20)
        figure_path = './plots/'
        rf_dict, score_dict = train_onestep.train_classic(df_train, rf_model, plot=False,model_type='random_forest', figure_path=figure_path)
        time_points, predictions = predict_onestep.predict_integrate(df_test, df, rf_dict, target_list, feature_list, title='test', plot=False,model_type='random_forest', subplots=(2,1), bio=False)
        predictions = predictions.to_numpy()

        if metric_function == "wasserstein":
            e1 = wasserstein_distance(predictions[:,0], test_data[0,:,0])
            e2 = wasserstein_distance(predictions[:,1], test_data[0,:,1])
            
        elif metric_function == "dtw":
            e1, _ = fastdtw(predictions[:,0], test_data[0,:,0], dist=euclidean)
            e2, _ = fastdtw(predictions[:,1], test_data[0,:,1], dist=euclidean)
            e1 /= np.linalg.norm(test_data[0,:,0], 2)**2
            e2 /= np.linalg.norm(test_data[0,:,1], 2)**2
            
        elif metric_function == "mse":
            e1 = predict_lmmNet.compute_MSE(predictions, test_data[0], 0)
            e2 = predict_lmmNet.compute_MSE(predictions, test_data[0], 1)
        error_list.append((e1, e2))
        
        # plot
        plt.plot(time_points, test_data[0,:,0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0,:,1], 'y.', label='x_2')
        plt.plot(time_points, predictions[:,0], 'b--', label='predicted x_1')
        plt.plot(time_points, predictions[:,1], 'b--', label='predicted x_2')
        plt.title(str(xi) + " " + str(yi))
        plt.legend()
        plt.show()
        
    return error_list
Example #19
    def predict(self, seg_list, **kwargs):
        # from sklearn.cross_decomposition import CCA
        # X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]]
        # Y = [[0.1], [0.9], [6.2], [11.9]]
        # cca = CCA(n_components=1)
        # cca.fit(X, Y)
        # X_c, Y_c = cca.transform(X, Y)
        # print(X_c, Y_c)

        for idx in range(len(seg_list) - 1):
            seg_a = seg_list[idx]
            seg_b = seg_list[idx + 1]
            w, h = original_shape = tuple(seg_a.shape)
            seg_a = np.reshape(
                np.array(seg_a, dtype=np.float32) / np.max(seg_a), (w * h))
            seg_b = np.reshape(
                np.array(seg_b, dtype=np.float32) / np.max(seg_b), (w * h))

            # seg_as = seg_list[idx]
            # seg_bs = seg_list[idx+1]
            # seg_a = self.add_position_array(imgtools.expand_image_dim(seg_as))
            # seg_b = self.add_position_array(imgtools.expand_image_dim(seg_bs))
            # w, h, _ = original_shape = tuple(seg_a.shape)

            # seg_a = np.reshape(np.array(seg_a, dtype=np.float32), (w * h, 3))
            # seg_b = np.reshape(np.array(seg_b, dtype=np.float32), (w * h, 3))
            # # print(seg_a.shape)
            # seg_a = scale(seg_a)
            # seg_b = scale(seg_b)

            # seg_a = np.squeeze(np.reshape(np.array(seg_a, dtype=np.float32), (w * h * 3)))
            # seg_b = np.squeeze(np.reshape(np.array(seg_b, dtype=np.float32), (w * h * 3)))

            # print("A->B:{}".format(cv2.EMD(seg_a, seg_b, cv2.DIST_L2)))
            print("A->B:{}".format(wasserstein_distance(seg_a, seg_b)))
            for a_idx in np.unique(seg_a):
                seg_a_label = seg_a == a_idx
                # seg_a_label = self.add_position_array(imgtools.expand_image_dim(seg_as == a_idx))
                # seg_a_label = np.reshape(np.array(seg_a_label, dtype=np.float32), (w * h , 3))

                # seg_a_label = scale(seg_a_label)
                # seg_a_label = np.squeeze(np.reshape(np.array(seg_a_label, dtype=np.float32), (w * h * 3)))
                for b_idx in np.unique(seg_b):
                    seg_b_label = seg_b == b_idx
                    # seg_b_label = self.add_position_array(imgtools.expand_image_dim(seg_bs == b_idx))
                    # seg_b_label = np.reshape(np.array(seg_b_label, dtype=np.float32), (w * h , 3))

                    # seg_b_label = scale(seg_b_label)
                    # seg_b_label = np.squeeze(np.reshape(np.array(seg_b_label,   dtype=np.float32), (w * h * 3)))
                    # print("A({})->B({}):{}".format(seg_a_label, seg_b_label, cv2.EMD(seg_a_label, seg_b_label)))
                    print("A({})->B({}):{}".format(
                        a_idx, b_idx,
                        wasserstein_distance(seg_a_label, seg_b_label)))
Example #20
def compute_statistics(output_adj, output_coord, output_seq_len, y_adj, y_coord, y_seq_len, lamb=0.5):
    r"""
    Compute statistics for the current data point.
    
    :param output_adj: predicted A
    :param output_coord: predicted X
    :param output_seq_len: predicted |V|
    :param y_adj: target A
    :param y_coord: target X
    :param y_seq_len: target |V|
    :param lamb: lambda parameter for the loss in this experiment
    :return: streetmover, loss, loss_adj, loss_coord, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
    """
    output_A = decode_adj(output_adj[0, :output_seq_len - 2].cpu().numpy())  # not including the last entry
    y_A = decode_adj(y_adj[0, :y_seq_len - 2].cpu().numpy())
    output_nodes = output_coord[0, :output_seq_len - 2]
    y_nodes = y_coord[0, :y_seq_len - 2]
    output_graph = nx.from_numpy_matrix(output_A)
    y_graph = nx.from_numpy_matrix(y_A)
    
    assert output_A.shape[0] == output_nodes.shape[0] == output_seq_len - 2
    assert y_A.shape[0] == y_nodes.shape[0] == y_seq_len - 2
    
    output_n_edges = output_adj.reshape(-1).sum()
    y_n_edges = y_adj.reshape(-1).sum()
    
    output_degree = get_degree_hist(output_graph)
    y_degree = get_degree_hist(y_graph)
    dist_degree = wasserstein_distance(output_degree, y_degree)
    
    output_diam = get_diameters(output_graph)
    y_diam = get_diameters(y_graph)
    dist_diam = wasserstein_distance(output_diam, y_diam) if len(output_diam) > 0 else 1
    
    delta_n_nodes = int(output_seq_len - y_seq_len)
    delta_n_edges = (output_n_edges - y_n_edges).item()
    
    acc_A = get_accuracy_A(output_A, y_A)
    
    loss_adj = get_BCE_adj(output_adj[0], y_adj[0])
    loss_coord = get_MSE_coord(output_nodes, y_nodes)
    loss = lamb * loss_adj + (1 - lamb) * loss_coord
    
    (y_pc, output_pc), (streetmover, P, C) = streetmover_distance(y_A, y_nodes, output_A, output_nodes, n_points=100)
    # print("Streetmover distance: {:.3f}".format(streetmover.item()))
    
    # possibly, plot assignments and/or point clouds
    # show_assignments(y_pc, output_pc, P, title=str(streetmover.item())[:8])
    # plot_point_cloud(y_adj[0], y_coord[0], y_pc)
    # plot_point_cloud(output_adj[0], output_coord[0], output_pc)
    
    return streetmover.item(), loss, loss_adj, loss_coord, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
Example #21
def model_monitor(country="all", dev=DEV, training=True):
    """
    performance monitoring
    """
    print("Monitor Model")
    
    ## import data
    #datasets = engineer_features(training=training, dev=dev)
    datasets = engineer_features(training=training)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)
    
    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)
    
    ## monitor RMSE
    samples = [10, 20, 30, 50, 60]

    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        queries = [(str(d.year), str(d.month), str(d.day), country) for d in dates_new]
        y_pred = [model_predict(year=query[0], month=query[1], day=query[2], country=query[3],verbose=False, dev=dev)["y_pred"][0].round(2) for query in queries]
        rmse = np.sqrt(mean_squared_error(y_new.tolist(),y_pred))
        print("sample size: {}, RSME: {}".format(n, rmse.round(2)))
        
    ## monitor performance
    ## scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    samples = [25, 50, 75, 90]

    clf_y = EllipticEnvelope(random_state=0,contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0,contamination=0.01)

    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size,1))

    results = defaultdict(list)
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n,X,y, dates)
        results["sample_size"].append(n)
        results['wasserstein_X'].append(np.round(wasserstein_distance(X.flatten(),X_new.flatten()),2))
        results['wasserstein_y'].append(np.round(wasserstein_distance(y,y_new),2))
        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size,1))
        results["outlier_percent_X"].append(np.round(1.0 - (test1[test1==1].size / test1.size),2))
        results["outlier_percent_y"].append(np.round(1.0 - (test2[test2==1].size / test2.size),2))
    
    return pd.DataFrame(results)
Example #22
def cal_emd_lose(out_float_list, out_quant_list, out_len):
    """caculate earch move distance"""
    emd_sum = 0
    if out_len >= 3:
        for index in range(len(out_float_list)):
            emd_sum += wasserstein_distance(out_float_list[index],
                                            out_quant_list[index])
    else:
        out_float = np.concatenate(out_float_list)
        out_quant = np.concatenate(out_quant_list)
        emd_sum += wasserstein_distance(out_float, out_quant)
    emd_sum /= float(len(out_float_list))
    return emd_sum
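An illustrative call with synthetic per-layer outputs (names and shapes are made up; np and wasserstein_distance are assumed to be imported as in the example):

float_outs = [np.random.randn(64) for _ in range(4)]
quant_outs = [x + np.random.normal(scale=0.01, size=64) for x in float_outs]
print(cal_emd_lose(float_outs, quant_outs, out_len=4))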
Example #23
def report_linear_lmmnet(metric_function):
    
    error_list = []
    
    time_points, test_data = linear.simulate_default()


    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(1, 4, 1)[0]
        yi = np.random.uniform(0, 4, 1)[0]
        zi = np.random.uniform(1, 2, 1)[0]
        time_points, cubic_data = linear.simulate_custom(xinit=xi, yinit=yi, zinit=zi)

        model = train_lmmNet.train_easy(time_points, cubic_data)
        x0 = test_data[0,0,:] # initial conditions
        predicted_traj = odeint(lambda x, t: predict_lmmNet.predict_fn(x, t, model), x0, time_points)

        predictions = predicted_traj
        if metric_function == "wasserstein":
            e1 = wasserstein_distance(predictions[:,0], test_data[0,:,0])
            e2 = wasserstein_distance(predictions[:,1], test_data[0,:,1])
            e3 = wasserstein_distance(predictions[:,2], test_data[0,:,2])
            
        elif metric_function == "dtw":
            e1, _ = fastdtw(predictions[:,0], test_data[0,:,0], dist=euclidean)
            e2, _ = fastdtw(predictions[:,1], test_data[0,:,1], dist=euclidean)
            e3, _ = fastdtw(predictions[:,2], test_data[0,:,2], dist=euclidean)
            e1 /= np.linalg.norm(test_data[0,:,0], 2)**2
            e2 /= np.linalg.norm(test_data[0,:,1], 2)**2
            e3 /= np.linalg.norm(test_data[0,:,2], 2)**2
            
        elif metric_function == "mse":
            e1 = predict_lmmNet.compute_MSE(predictions, test_data[0], 0)
            e2 = predict_lmmNet.compute_MSE(predictions, test_data[0], 1)
            e3 = predict_lmmNet.compute_MSE(predictions, test_data[0], 2)
        error_list.append((e1, e2, e3))
        
        # plot
        plt.figure(figsize=(20, 10))
        plt.plot(time_points, test_data[0,:,0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0,:,1], 'y.', label='x_2')
        plt.plot(time_points, test_data[0,:,2], 'g.', label='x_3')
        plt.plot(time_points, predictions[:,0], 'b--', label='predicted dynamics')
        plt.plot(time_points, predictions[:,1], 'b--')
        plt.plot(time_points, predictions[:,2], 'b--')
        plt.title(str(xi) + " " + str(yi) + " " + str(zi))
        plt.legend()
        plt.show()
        
    return error_list
Example #24
def calc_distance_matrix(X, method):
    if method in ['chebyshev', 'euclidean', 'l1', 'l2']:
        DM = DistanceMetric.get_metric(method).pairwise(X)
    elif method in ['cosine']:
        DM = pairwise.cosine_distances(X)
    elif method in [
            'correlation', 'cityblock', 'braycurtis', 'canberra', 'hamming',
            'jaccard', 'kulsinski'
    ]:
        DM = squareform(pdist(X, method))
    elif method in ['minkowski3']:
        DM = squareform(pdist(X, 'minkowski', 3))
    elif method in ['dot']:
        DM = squareform(pdist(X, lambda u, v: np.dot(u, v)))
    elif method in ['emd']:
        from scipy.stats import wasserstein_distance
        l = len(X)
        DM = np.zeros((l, l))
        for x in range(l):
            for y in range(l):
                DM[x, y] = wasserstein_distance(X[x], X[y])
    else:
        return None

    return DM
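An illustrative call (not from the original source) of the 'emd' branch on three small 1-D samples; note that this branch fills the full l x l matrix even though it is symmetric with a zero diagonal.

X = [np.array([0.0, 1.0, 2.0]),
     np.array([1.0, 2.0, 3.0]),
     np.array([5.0, 6.0, 7.0])]
print(calc_distance_matrix(X, 'emd'))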
Example #25
    def fit(self, X, y, w):
        labels = np.unique(y)
        assert 0 < len(labels) <= 2

        super(BinaryWPCA, self).fit(X, y, w)
        assert self.R is not None

        sel = (y == labels[0])
        X1 = X[sel, :] @ self.R
        w1 = w[sel]
        w2 = w[~sel]
        X2 = X[~sel, :] @ self.R

        assert X1.shape == X2.shape
        _, n = X1.shape
        distances = [
            wasserstein_distance(u_values=X1[:, i],
                                 u_weights=w1,
                                 v_values=X2[:, i],
                                 v_weights=w2) for i in range(n)
        ]

        idx = np.flip(np.argsort(distances))
        self.R = self.R[:, idx]
        self.is_fitted = True
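The weighted call signature used in fit above, shown in isolation with made-up numbers; the weights do not need to be normalised, since scipy normalises them internally.

from scipy.stats import wasserstein_distance
d = wasserstein_distance(u_values=[0.0, 1.0, 3.0], u_weights=[1.0, 1.0, 2.0],
                         v_values=[0.5, 2.0], v_weights=[3.0, 1.0])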
Example #26
def get_dist(outf, layer, result_j, predicted, sample_size, criteria):
    final_tr = np.load(outf + "emp_bnn_train_"+str(layer) + ".npy")
    predicted_tr = np.load(outf + "labels_bnn_train.npy")
    #print(layer)
    if layer < 44:
        pca_model = pk.load(open(outf + "pca_bnn"+str(layer)+".pkl","rb"))
        #final_tr = pca_model.transform(final_tr)
        #print(result_j.cpu().detach().numpy().shape)
        final_adv = pca_model.transform(result_j.cpu().detach().numpy())
    else:
        final_adv = result_j.cpu().detach().numpy()
        
    distance = np.zeros(final_adv.shape[0])
    for i in range(final_adv.shape[0]):
        data_train_sample = final_tr[predicted_tr == int(predicted[i])]
        #print(predicted[i], data_train_sample.shape)
        ind = np.random.choice(data_train_sample.shape[0],min(sample_size, data_train_sample.shape[0]),replace=False)
        data_train_sample_i = data_train_sample[ind,]
        dist = np.zeros(data_train_sample_i.shape[0])
        for k in range(data_train_sample_i.shape[0]):
            dist[k] = wasserstein_distance(final_adv[i,:], data_train_sample_i[k,:])
        #print(dist)
        if len(dist) == 0:   
            dis_adv = 0
        elif criteria == 'mean':
            dis_adv = dist.mean()
        elif criteria == 'min':
            dis_adv = dist.min()
        else:
            dis_adv = np.median(dist)
        distance[i] = dis_adv
    return distance
Example #27
def triplet_analysis(df, categories=CATEGORIES):
    df2 = df.drop_duplicates()
    for category in categories:
        groupby_cols = categories[:]
        groupby_cols.remove(category)

        fname = 'triplets_%s.csv' % category
        fname = fname.replace(' ', '_')
        with open(fname, 'w') as f:
            for name, group in df2.groupby(groupby_cols):
                unique_vals = group[category].unique()
                group_items = ['--------------------', ', '.join(name), '--------------------']
                write_items = []
                for val1, val2 in combinations(group[category].unique(), 2):
                    sample1 = group['alpha'][group[category] == val1]
                    sample2 = group['alpha'][group[category] == val2]
                    n1 = len(sample1)
                    n2 = len(sample2)
                    if n1 > 25 and n2 > 25:
                        dist = wasserstein_distance(sample1, sample2)
                        write_items.append((dist, n1, n2, val1, val2))
                        #write_items.append('%s (%d), %s (%d), %f' % (val1, n1, val2, n2, dist))
                if len(write_items) > 0:
                    f.write('-----------------------\n')
                    f.write(', '.join(name) + '\n')
                    f.write('-----------------------\n')
                    write_items = reversed(sorted(write_items))
                    for dist, n1, n2, val1, val2 in write_items:
                        f.write('%s (%d), %s (%d), %f\n' % (val1, n1, val2, n2, dist))
Example #28
def date_dist_scores(ref_timeline, ground_truth, p_val=.05):
    '''
    Scores predicted distribution of timeline event dates using 2 sample
        Kolmogorov-Smirnov statistic and the first Wasserstein distance
        (earth mover's distance)

    Returns dict with KS statistic, if time distributions are statistically
        significantly different, and the Wasserstein distance
    '''
    gt_dates = [time.mktime(d.timetuple()) for d in ground_truth.get_dates()]
    ref_dates = [time.mktime(d.timetuple()) for d in ref_timeline.get_dates()]

    scaler = MinMaxScaler()

    gt_scaled = scaler.fit_transform(np.array(gt_dates).reshape(-1, 1)).T[0]
    ref_scaled = scaler.transform(np.array(ref_dates).reshape(-1, 1)).T[0]

    ks_test = stats.ks_2samp(gt_scaled, ref_scaled)
    emd = stats.wasserstein_distance(gt_scaled, ref_scaled)

    # ks_signif is 1 when the difference in date distribution between ground
    # truth and generated timelines is statistically significant
    return {
        'ks_stat': ks_test.statistic,
        'ks_signif': int(ks_test.pvalue < p_val),
        'earth_movers_distance': emd
    }
Example #29
def wasserstein_test(u_values, v_values, bootstraps=999, use_gamma_model=True):
    # permutation test of wasserstein distance
    # based on the one outlined in https://github.com/cdowd/twosamples
    wass_dist, wass_dir = wasserstein_distance_and_direction(
        u_values, v_values)

    # under null hypothesis the samples are drawn from the same distribution
    # so we can make expected wasserstein values by permuting values between
    # the two samples
    pool = np.concatenate([u_values, v_values])
    n = len(u_values)
    exp = []
    for _ in range(bootstraps):
        np.random.shuffle(pool)
        exp.append(stats.wasserstein_distance(pool[:n], pool[n:]))
    exp = np.array(exp)

    if not use_gamma_model:
        # bootstrap p value with pseudocount
        p_val = ((exp >= wass_dist).sum() + 1) / (bootstraps + 1)
    else:
        # fit a gamma distribution to the expected distances
        g = stats.gamma(*stats.gamma.fit(exp))
        # compute p value using survival function
        p_val = g.sf(wass_dist)
    return wass_dist, wass_dir, p_val
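A compressed, self-contained sketch of the same permutation idea (plain bootstrap p-value only; wasserstein_distance_and_direction from the original is not reproduced):

import numpy as np
from scipy import stats

def wasserstein_perm_pval(u, v, n_perm=999, seed=0):
    rng = np.random.default_rng(seed)
    observed = stats.wasserstein_distance(u, v)
    pool = np.concatenate([u, v])
    n = len(u)
    exp = np.empty(n_perm)
    for b in range(n_perm):
        perm = rng.permutation(pool)
        exp[b] = stats.wasserstein_distance(perm[:n], perm[n:])
    # p-value with pseudocount, as in the non-gamma branch above
    return ((exp >= observed).sum() + 1) / (n_perm + 1)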
Example #30
def get_decision(layer, result_j, predicted, thrd, sample_size, criteria):
    final_tr = torch.load("./hidden_output/emp_bnn_train_"+str(layer)).cpu().detach().numpy()
    predicted_tr = np.load("./data/predicts_bnn_train.npy")
    if layer < 43:
        pca_model = pk.load(open("./data/pca_bnn"+str(layer)+".pkl","rb"))
        final_tr = pca_model.transform(final_tr)
        final_adv = pca_model.transform(result_j.cpu().detach().numpy())
    else:
        final_adv = result_j.cpu().detach().numpy()
        
    decision = np.zeros(final_adv.shape[0])
    for i in range(final_adv.shape[0]):
        data_train_sample = final_tr[predicted_tr == int(predicted[i])] 
        ind = np.random.choice(data_train_sample.shape[0],min(sample_size, data_train_sample.shape[0]),replace=False)
        data_train_sample_i = data_train_sample[ind,]
        dist = np.zeros(data_train_sample_i.shape[0])
        for k in range(data_train_sample_i.shape[0]):
            dist[k] = wasserstein_distance(final_adv[i,:], data_train_sample_i[k,:])
        if criteria == 'mean':
            dis_adv = dist.mean()
        elif criteria == 'min':
            dis_adv = dist.min()
        else:
            dis_adv = np.median(dist)
        if dis_adv > thrd[int(predicted[i])]:
            decision[i] = 1
    return decision