Example #1
def sliced_wasserstein_no_histogram(p, q, iters=20):
    '''
        Utility function for the 1-sliced Wasserstein distance.
        Tries to fight the curse of dimensionality by only considering the
        "bins" around the sampled values.
        If the sampled values are entirely wrong, this method will not work,
        so run a first-moment test first.
        p = sampled values
        q = density function at the sampled values
    '''
    if any(np.isnan(x) for x in p.flatten()):
        return float('inf')
    if len(set(tuple(x) for x in p)) / len(p) <= 0.1:
        return float('inf')

    dim = p.shape[1]
    if dim == 1:
        return wasserstein_distance(p.flatten(), p.flatten(), np.ones(p.shape[0]) + EPS, q + EPS)

    dist = 0
    for _ in range(iters):
        proj_vec = normal(size=dim)
        proj_vec = proj_vec / norm(proj_vec) # sample randomly from dim-1 sphere
        bins = [np.dot(proj_vec, pt) for pt in p]
        dist += wasserstein_distance(bins, bins, np.ones(p.shape[0]) + EPS, q + EPS)
    return dist/iters
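For reference, here is a minimal standalone sketch of the sliced-Wasserstein idea used above, written for two point clouds rather than a sample/density pair; the function name and two-sample setup are illustrative and not part of the original helper.

# Sliced Wasserstein between point clouds p of shape (n, dim) and q of shape (m, dim):
# project both onto random unit directions and average the 1-D Wasserstein distances.
import numpy as np
from scipy.stats import wasserstein_distance

def sliced_wasserstein_example(p, q, iters=20, seed=0):
    rng = np.random.default_rng(seed)
    dim = p.shape[1]
    total = 0.0
    for _ in range(iters):
        v = rng.normal(size=dim)
        v /= np.linalg.norm(v)                        # random direction on the unit sphere
        total += wasserstein_distance(p @ v, q @ v)   # 1-D distance of the projections
    return total / iters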
Example #2
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed
    """

    preprocessor = get_preprocessor()
    preprocessor = preprocessor.fit(X)
    X_pp = preprocessor.transform(X)

    xpipe = Pipeline(steps=[('pca', PCA(2)),
                            ('clf', EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X_pp)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X_pp[subset_indices, :]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X_pp.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(
        0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(
        0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(
        0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "preprocessor": preprocessor,
        "clf_X": xpipe,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return (to_return)
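As a hypothetical follow-up (not part of the original source), the thresholds returned above could be applied to a fresh, already-preprocessed batch like this; the function name and argument names are assumptions.

# Flag drift when the distance from the source data exceeds the bootstrap threshold.
def check_drift(monitoring, X_new_pp, y_new):
    x_dist = wasserstein_distance(monitoring["X_source"].flatten(), X_new_pp.flatten())
    y_dist = wasserstein_distance(monitoring["y_source"], y_new)
    return {"X_drift": x_dist > monitoring["wasserstein_X"],
            "y_drift": y_dist > monitoring["wasserstein_y"]}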
Example #3
    def calc_c_hist(self):
        """
        Compare the histograms of all of the eigenvectors (eigenfunctions) for each of the
        meshes, but only up to the number of features we are interested in. The paper says to do
        this for the number of features of interest (i.e. 5 or 6); however, this implementation
        does it for more than the requested number of features to ensure that we don't get the wrong ones.
        So, we'll get more spectral coordinates, order them, and then select the appropriate number from
        these re-ordered coordinates.

        :return:
        """

        # Initially tried using straight values (not log), but eigenvector 1 got accentuated too much (in the weighting).
        # Then tried just log, but because there are negative values it creates errors.
        # Need to add 0.5 plus a small value to ensure there are no zero values passed to the log.
        # wasserstein_distance is the same as earth movers distance, and is the minimum "work" needed to
        # transform u (first entry) into v (second entry).
        eps = np.finfo(float).eps
        for i in range(self.n_features):
            for j in range(self.n_features):
                self.c_hist[i, j] = wasserstein_distance(
                    np.log(self.rand_target_eig_vecs[:, i] + 0.5 + eps),
                    np.log(self.rand_source_eig_vecs[:, j] + 0.5 + eps))
                self.c_hist_f[i, j] = wasserstein_distance(
                    np.log(self.rand_target_eig_vecs[:, i] + 0.5 + eps),
                    np.log(-self.rand_source_eig_vecs[:, j] + 0.5 + eps))
Example #4
def estimate_noise_model(img_segment):
    mean, standard_deviation, hist, bins = estimate_noise_parameters(
        img_segment, return_histogram=True)
    pdf, bins = histogram_processing.compute_image_pdf(img_segment)
    #standard_deviation = np.sqrt(np.var(pdf.values()))

    gaussian_pdf = generate_gaussian_pdf(mean, standard_deviation)
    rayleigh_pdf = generate_rayleigh_pdf(mean, standard_deviation)
    erlang_pdf = generate_erlang_pdf(mean, standard_deviation)
    exponential_pdf = generate_exponential_pdf(mean, standard_deviation)
    uniform_pdf = generate_uniform_pdf(mean, standard_deviation)

    gaussian_distance = wasserstein_distance(
        pdf, np.array(list(gaussian_pdf.values())))
    rayleigh_distance = wasserstein_distance(
        pdf, np.array(list(rayleigh_pdf.values())))
    erlang_distance = wasserstein_distance(pdf,
                                           np.array(list(erlang_pdf.values())))
    exponential_distance = wasserstein_distance(
        pdf, np.array(list(exponential_pdf.values())))
    uniform_distance = wasserstein_distance(
        pdf, np.array(list(uniform_pdf.values())))

    distances = {
        'gaussian': gaussian_distance,
        'rayleigh': rayleigh_distance,
        'erlang': erlang_distance,
        'exponential': exponential_distance,
        'uniform': uniform_distance
    }

    min_distance = tuple((d[0], d[1]) for d in distances.items()
                         if d[1] == min(distances.values()))

    return distances, min_distance
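The selection logic above boils down to: build an empirical pdf, build candidate analytic pdfs, and keep the candidate with the smallest 1-D Wasserstein distance. A toy, self-contained version of that pattern follows; the generate_*_pdf helpers are not reproduced, so scipy's norm and rayleigh stand in, and the data are synthetic.

import numpy as np
from scipy.stats import wasserstein_distance, norm, rayleigh

samples = np.random.normal(loc=0.5, scale=0.1, size=10000)
pdf, edges = np.histogram(samples, bins=64, range=(0, 1), density=True)
centers = 0.5 * (edges[:-1] + edges[1:])
candidates = {
    "gaussian": norm.pdf(centers, loc=samples.mean(), scale=samples.std()),
    "rayleigh": rayleigh.pdf(centers, scale=samples.std()),
}
# Like the original, this compares the pdf value arrays themselves as 1-D samples.
best = min(candidates, key=lambda name: wasserstein_distance(pdf, candidates[name]))
print(best)  # expected: "gaussian"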
Example #5
def compare_drift(X_src, y_src, X_new, y_new):
    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)

    clf_X.fit(X_src)
    clf_y.fit(y_src.reshape(y_src.size, 1))

    test_X = clf_X.predict(X_new)

    test_y = clf_y.predict(y_new.reshape(-1, 1))

    X_distance = wasserstein_distance(X_src.values.flatten(),
                                      X_new.values.flatten())

    y_distance = wasserstein_distance(y_src.flatten(), y_new.flatten())

    X_outlier = len(test_X[test_X == -1]) / len(test_X)

    y_outlier = len(test_y[test_y == -1]) / len(test_y)

    results = {
        'X_wasserstein_distance': X_distance,
        'y_wasserstein_distance': y_distance,
        'X_outlier_percentage': X_outlier,
        'y_outlier_percentage': y_outlier
    }

    return results
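An illustrative call with synthetic data (made up here, and assuming numpy, pandas, EllipticEnvelope and wasserstein_distance are imported at module level as in the other examples) shows the expected input types: source frame/array versus new frame/array.

rng = np.random.default_rng(0)
X_src = pd.DataFrame(rng.normal(size=(500, 3)))
X_new = pd.DataFrame(rng.normal(loc=0.3, size=(200, 3)))
y_src = rng.normal(size=500)
y_new = rng.normal(loc=0.3, size=200)
print(compare_drift(X_src, y_src, X_new, y_new))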
Example #6
def metric_max(n_classes,Mtype):
    Pref=np.ones(n_classes)/n_classes #Reference attribute
    Pep=np.zeros(n_classes)
    Pep[0]=1
    if Mtype=="L1":
        fair_d = abs(Pep - Pref).sum()
    elif Mtype=="L2":
        fair_d = np.sqrt(((Pep - Pref)**2).sum())
    elif Mtype=="Is":
        #L1
        l1_fair_d = abs(Pep - Pref).sum()
        #Specificity
        rank=np.linspace(1,n_classes-1,n_classes-1)
        rank[::-1].sort() #Descending order
        perc=np.array([i/np.sum(rank) for i in rank])
        
        alpha=Pep[1:]
        specificity=abs(Pep[0]-np.sum(alpha*perc))
        fair_d=(l1_fair_d+specificity)/2
    elif Mtype=="Wd":
        fair_d=wasserstein_distance(Pep,Pref)
    elif Mtype=="Wds":
        #Specificity
        rank=np.linspace(1,n_classes-1,n_classes-1)
        rank[::-1].sort() #Descending order
        perc=np.array([i/np.sum(rank) for i in rank])
        alpha=Pep[1:]
        specificity=abs(Pep[0]-np.sum(alpha*perc))
        # Wasserstein distance
        ws=wasserstein_distance(Pep,Pref)
        
        fair_d=(ws+specificity)/2    
    else:
        fair_d=0
    return fair_d
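An illustrative call (not from the original source): the worst-case distance for a one-hot prediction over a 4-class attribute, under each metric type.

for m in ("L1", "L2", "Is", "Wd", "Wds"):
    print(m, metric_max(4, m))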
Example #7
def compute_statistics_MLP(y_A, y_nodes, output_A, output_nodes, y_seq_len, output_seq_len):
    r"""
    Compute statistics for the current data point, based on the one-shot output from the MLP decoder.

    :param output_A: predicted A
    :param output_nodes: predicted X
    :param output_seq_len: predicted |V|
    :param y_A: target A
    :param y_nodes: target X
    :param y_seq_len: target |V|
    :return: streetmover, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
    """
    output_graph = nx.from_numpy_matrix(output_A)
    y_graph = nx.from_numpy_matrix(y_A)
    
    output_degree = get_degree_hist(output_graph)
    y_degree = get_degree_hist(y_graph)
    dist_degree = wasserstein_distance(output_degree, y_degree)
    
    output_diam = get_diameters(output_graph)
    y_diam = get_diameters(y_graph)
    dist_diam = wasserstein_distance(output_diam, y_diam) if len(output_diam) > 0 else 1
    
    delta_n_nodes = int(output_seq_len - y_seq_len)
    delta_n_edges = output_A.sum() - y_A.sum()
    
    acc_A = get_accuracy_A(output_A, y_A)
    
    (y_pc, output_pc), (streetmover, P, C) = streetmover_distance(y_A, y_nodes, output_A, output_nodes, n_points=100)
    # print("Streetmover distance: {:.3f}".format(streetmover.item()))
    
    return streetmover.item(), acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
Example #8
def wasserstein(dataset_path):
    """
    This needs to be the full datasets to make sense.
    Since it is the full datasets, this is not included
    when running all tests.
    """
    from scipy.stats import wasserstein_distance
    for dataset_cls in all_datasets():
        print('Loading dataset:', dataset_cls.__name__)
        train = dataset_cls(dataset_path, split=TRAIN)
        valid = dataset_cls(dataset_path, split=VALIDATION)
        test = dataset_cls(dataset_path, split=TEST)
        splits = [train, valid, test]

        ws_max = 0.0
        ws_ave = 0.0
        for i in range(train.num_features):
            for j, split in enumerate(splits):
                if j == 0:
                    hist, bins = np.histogram(split.x[:, i], density=True)
                    us = split.x[:, i]
                else:
                    hist, _ = np.histogram(split.x[:, i], density=True, bins=bins)
                    vs = split.x[:, i]
                    ws = wasserstein_distance(us, vs)
                    ws_max = max(ws_max, ws)
                    ws_ave += ws / (train.num_features * 2)
        print('Max wasserstein:', ws_max)
        print('Average wasserstein:', ws_ave)
        print()
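A compact version of the per-feature split comparison above, on synthetic arrays standing in for split.x (illustrative only, not one of the real datasets):

import numpy as np
from scipy.stats import wasserstein_distance

train_x = np.random.normal(size=(1000, 5))
valid_x = np.random.normal(loc=0.1, size=(500, 5))
per_feature = [wasserstein_distance(train_x[:, i], valid_x[:, i])
               for i in range(train_x.shape[1])]
print('Max wasserstein:', max(per_feature))
print('Average wasserstein:', float(np.mean(per_feature)))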
Example #9
def f_dist(histogram1, histogram2):
    if histogram1 != [0] and histogram2 != [0]:
        return wasserstein_distance(histogram1[0], histogram2[0])
    elif histogram1 != [0] and histogram2 == [0]:
        return wasserstein_distance(histogram1[0], np.zeros((1, )))
    elif histogram1 == [0] and histogram2 != [0]:
        return wasserstein_distance(histogram2[0], np.zeros((1, )))
    # both histograms are the empty placeholder [0]; the distance is zero
    return 0.0
Example #10
def report_linear_onestep(metric_function):
    
    error_list = []
    
    time_points, test_data = linear.simulate_default()

    feature_list = [] # here we do not have external time-series or control variables
    target_list = ['x_component', 'y_component', 'z_component']
    df_test = create_data(test_data, time_points, cols=target_list, num=2)


    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(1, 4, 1)[0]
        yi = np.random.uniform(0, 4, 1)[0]
        zi = np.random.uniform(1, 2, 1)[0]
        time_points, cubic_data = linear.simulate_custom(xinit=xi, yinit=yi, zinit=zi)

        df = create_data(cubic_data, time_points, cols=target_list, num=0)
        df_train = train_onestep.generate_dataset(df, [0],feature_list, target_list, n_dim=2500)
        
        rf_model = RandomForestRegressor(n_estimators=20)
        figure_path = './plots/'
        rf_dict, score_dict = train_onestep.train_classic(df_train, rf_model, plot=False,model_type='random_forest', figure_path=figure_path)
        time_points, predictions = predict_onestep.predict_integrate(df_test, df, rf_dict, target_list, feature_list, title='test', plot=False,model_type='random_forest', subplots=(3,1), bio=False)
        predictions = predictions.to_numpy()
        
        if metric_function == "wasserstein":
            e1 = wasserstein_distance(predictions[:,0], test_data[0,:,0])
            e2 = wasserstein_distance(predictions[:,1], test_data[0,:,1])
            e3 = wasserstein_distance(predictions[:,2], test_data[0,:,2])
            
        elif metric_function == "dtw":
            e1, _ = fastdtw(predictions[:,0], test_data[0,:,0], dist=euclidean)
            e2, _ = fastdtw(predictions[:,1], test_data[0,:,1], dist=euclidean)
            e3, _ = fastdtw(predictions[:,2], test_data[0,:,2], dist=euclidean)
            e1 /= np.linalg.norm(test_data[0,:,0], 2)**2
            e2 /= np.linalg.norm(test_data[0,:,1], 2)**2
            e3 /= np.linalg.norm(test_data[0,:,2], 2)**2
            
        elif metric_function == "mse":
            e1 = predict_lmmNet.compute_MSE(predictions, test_data[0], 0)
            e2 = predict_lmmNet.compute_MSE(predictions, test_data[0], 1)
            e3 = predict_lmmNet.compute_MSE(predictions, test_data[0], 2)
        error_list.append((e1, e2, e3))
        
        # plot
        plt.figure(figsize=(20, 10))
        plt.plot(time_points, test_data[0,:,0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0,:,1], 'y.', label='x_2')
        plt.plot(time_points, test_data[0,:,2], 'g.', label='x_3')
        plt.plot(time_points, predictions[:,0], 'b--', label='predicted dynamics')
        plt.plot(time_points, predictions[:,1], 'b--')
        plt.plot(time_points, predictions[:,2], 'b--')
        plt.title(str(xi) + " " + str(yi) + " " + str(zi))
        plt.legend()
        plt.show()
        
    return error_list
Example #11
    def get_univariate_quant_metrics(self,
                                     dataset=TRAIN,
                                     transformed=False,
                                     verbose=True,
                                     thin_model=None,
                                     thin_true=None,
                                     seed=None,
                                     n=None):
        """
        Calculates quantitative metrics for the difference between p(t) and
        p_model(t) and the difference between p(y) and p_model(y)

        :param dataset: dataset subset to evaluate on (train, val, or test)
        :param transformed: If True, use transformed version of data.
            If False, use original (non-transformed) version of data.
        :param thin_model: thinning interval for the model data
        :param thin_true: thinning interval for the real data
        :param seed: seed for sample from generative model
        :return: {
            't_ks_pval': KS p-value with null that t_model and t_true are from the same distribution
            'y_ks_pval': KS p-value with null that y_model and y_true are from the same distribution
            't_es_pval': Epps-Singleton p-value with null that t_model and t_true are from the same distribution
            'y_es_pval': Epps-Singleton p-value with null that y_model and y_true are from the same distribution
            't_wasserstein1_dist': Wasserstein-1 distance between t_true and t_model
            'y_wasserstein1_dist': Wasserstein-1 distance between y_true and y_model
        }
        """
        _, t_model, y_model = to_np_vectors(self.sample(
            seed=seed, untransform=(not transformed)),
                                            thin_interval=thin_model)

        _, t_true, y_true = self.get_data(transformed=transformed,
                                          dataset=dataset,
                                          verbose=verbose)
        t_true, y_true = to_np_vectors((t_true, y_true),
                                       thin_interval=thin_true)

        # jitter for numerical stability
        t_true = t_true.copy() + np.random.rand(*t_true.shape) * 1e-6
        t_model = t_model.copy() + np.random.rand(*t_model.shape) * 1e-6

        ks_label = "_ks_pval"
        es_label = "_es_pval"
        wasserstein_label = "_wasserstein1_dist"
        metrics = {
            T + ks_label:
            float(stats.ks_2samp(t_model, t_true).pvalue),
            Y + ks_label:
            float(stats.ks_2samp(y_model, y_true).pvalue),
            T + es_label:
            float(stats.epps_singleton_2samp(t_model, t_true).pvalue),
            Y + es_label:
            float(stats.epps_singleton_2samp(y_model, y_true).pvalue),
            T + wasserstein_label:
            float(stats.wasserstein_distance(t_model, t_true)),
            Y + wasserstein_label:
            float(stats.wasserstein_distance(y_model, y_true)),
        }

        return metrics
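The core two-sample checks used above, shown in isolation on synthetic draws (the T/Y labelling, thinning and Epps-Singleton test are omitted for brevity):

import numpy as np
from scipy import stats

t_true = np.random.normal(0.0, 1.0, 500)
t_model = np.random.normal(0.2, 1.0, 500)
print('ks p-value:', stats.ks_2samp(t_model, t_true).pvalue)
print('wasserstein-1:', stats.wasserstein_distance(t_model, t_true))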
Example #12
def clustering(init_rec_field1, fin_rec_field1, dst, mode='RGB', show=True):
    """
        Dendrograms with earth mover's distance.
    """    
    
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy as scp
    
    from sklearn.cluster import AgglomerativeClustering
    from scipy.stats import wasserstein_distance
    from scipy.cluster.hierarchy import dendrogram, linkage
    
    if mode == 'RGB':
        channels = 3
    else:
        channels = 4
    
    # first gen: generate clusters and dendrograms with earth mover distance
    dist = np.zeros(shape=(16,16))
    for i in range(16):
        for j in range(16):
            dist[i,j] = wasserstein_distance(init_rec_field1[:,:,:channels,i].flatten(), 
                                             init_rec_field1[:,:,:channels,j].flatten())
            
    cluster = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage="average")
    clusters = cluster.fit_predict(dist)
    
    # plot dendrogram
    sq_dist = scp.spatial.distance.squareform(dist)
    linkage_matrix = linkage(sq_dist, "average")
    dendrogram(linkage_matrix)
    plt.title("[DENDROGRAM RECEPTIVE FIELDS, FIRST GENERATION]: earth mover distance, linkage 'avg'.")

    if show:
        plt.show()
    # last gen: generate clusters and dendrograms with earth mover distance
    dist = np.zeros(shape=(16,16))
    for i in range(16):
        for j in range(16):
            dist[i,j] = wasserstein_distance(fin_rec_field1[:,:,:channels,i].flatten(), 
                                             fin_rec_field1[:,:,:channels,j].flatten())
            
    cluster = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage="average")
    clusters = cluster.fit_predict(dist)
    
    # plot dendrogram
    sq_dist = scp.spatial.distance.squareform(dist)
    linkage_matrix = linkage(sq_dist, "average")
    dendrogram(linkage_matrix)
    plt.title("[DENDROGRAM RECEPTIVE FIELDS, LAST GENERATION]: earth mover distance, linkage 'avg'.")

    if show:
        plt.show()
Example #13
def calc_distance(features: dict, shape_features: dict) -> dict:
    """
    Determines the closest shape to the query shape by computing a custom distance function
    between the features of each dataset shape and the features of the query shape.
    ----------------------------
    Args:
        features (obj: 'dict'): The dictionary containing the feature metrics of the dataset's shapes
        shape_features (obj: 'dict'): The dictionary containing the feature metrics of the query shape

    Returns:
        similarities (obj: 'dict'): The dictionary containing the closest shapes (key) and the respective
                                    distance to the query shape (value)
    """
    similarities = {}
    weights = np.load(s.SAVED_DATA + "distance_weights.npy",
                      allow_pickle=True).item()

    for id, featuresList in features.items():
        # Distance is the square root of the sum of squared differences
        dist_v = distance.euclidean(featuresList['volume'],
                                    shape_features.get('volume'))
        dist_a = distance.euclidean(featuresList['area'],
                                    shape_features.get('area'))
        dist_c = distance.euclidean(featuresList['compactness'],
                                    shape_features.get('compactness'))
        dist_bb = distance.euclidean(featuresList['bbox_volume'],
                                     shape_features.get('bbox_volume'))
        dist_d = distance.euclidean(featuresList['diameter'],
                                    shape_features.get('diameter'))
        dist_e = distance.euclidean(featuresList['eccentricity'],
                                    shape_features.get('eccentricity'))

        dist_A3 = wasserstein_distance(featuresList['A3'][0],
                                       shape_features.get('A3')[0])
        dist_D1 = wasserstein_distance(featuresList['D1'][0],
                                       shape_features.get('D1')[0])
        dist_D2 = wasserstein_distance(featuresList['D2'][0],
                                       shape_features.get('D2')[0])
        dist_D3 = wasserstein_distance(featuresList['D3'][0],
                                       shape_features.get('D3')[0])
        dist_D4 = wasserstein_distance(featuresList['D4'][0],
                                       shape_features.get('D4')[0])

        similarity = weights["w_v"]*dist_v + \
            weights["w_a"]*dist_a + \
            weights["w_c"]*dist_c + \
            weights["w_bb"]*dist_bb + \
            weights["w_d"]*dist_d + \
            weights["w_e"]*dist_e + \
            weights["w_A3"]*dist_A3 + \
            weights["w_D1"]*dist_D1 + \
            weights["w_D2"]*dist_D2 + \
            weights["w_D3"]*dist_D3 + \
            weights["w_D4"]*dist_D4

        similarities[id] = similarity

    return similarities
Example #14
def behavior_comparison():
    networks = (
        ('Caveman-50-10', fio.read_network('networks/cavemen-50-10.txt')),
        ('Elitist-500', fio.read_network('networks/elitist-500.txt')),
        ('CGG-500', fio.read_network('networks/cgg-500.txt'))
    )

    num_sims = 50
    num_behaviors = 3
    distributions = []
    averages = np.zeros((len(networks), num_behaviors))
    loop = tqdm(total=len(networks) * num_behaviors * num_sims)
    for i, (n_name, net) in enumerate(networks):

        behaviors = (
            ('No Mitigations',
             behavior.NoMitigation()),
            ('Generic Pressure R=1',
             behavior.SimplePressureBehavior(net, rng=RNG, radius=1)),
            ('Edge Pressure R=1',
             behavior.SimpleEdgePressureBehavior(net, rng=RNG, radius=1))
            # ('All Edges Sequential Flicker 1/4',
            #  StaticFlickerBehavior(net.M, net.edges, (True, False, False, False))),
            # ('All Edges Random Flicker 0.25',
            #  RandomFlickerBehavior(net.M, net.edges, 0.25)),
            # ('Collected Pressure Flicker 0.25, R=1',
            #  UnifiedPressureFlickerBehavior(net, 1, RNG)),
            # ('Generic Pressure Radius 3',
            #  SimplePressureBehavior(net, 3)),
            # ('Pressure Decay Radius 3',
            #  PressureDecayBehavior(net, 3)),
            # ('Pressure Flicker Radius 3',
            #  PressureFlickerBehavior(net, 3))
        )

        for j, (b_name, behavior_update) in enumerate(behaviors):  # avoid shadowing the behavior module used above
            s_scores = []
            for _ in range(num_sims):
                loop.set_description(f'{n_name}, {b_name}')
                end_sir = simulate(net.M, sir0=make_starting_sir(net.N, 1, rng=RNG),
                                   disease=Disease(4, 0.3),
                                   update_connections=behavior_update,
                                   max_steps=200,
                                   rng=RNG)[-1]
                s_scores.append(np.sum(end_sir[0, :] > 0)/net.N)
                loop.update()
            # plt.title(f'{n_name}, {b_name}, Avg: {sum(s_scores)/len(s_scores)}')
            # plt.hist(s_scores)
            # plt.figure()
            averages[i, j] = sum(s_scores)/len(s_scores)
            distributions.append(s_scores)
    print(wasserstein_distance(distributions[1], distributions[2]))
    print(wasserstein_distance(distributions[4], distributions[5]))
    print(wasserstein_distance(distributions[7], distributions[8]))
    # plt.show()
    np.set_printoptions(precision=3, suppress=True)
    print(averages)
Example #15
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances

    """
    X_pre = X
    xpipe = Pipeline(steps=[('scaler', StandardScaler()),
                            ('rf', RandomForestRegressor())])
    xpipe.fit(X_pre, y)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        n_samples = int(np.round(0.8 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y_bs = y[mask]
        X_bs = X[mask]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X_pre.values.flatten(),
                                                X_bs.values.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(
        0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(
        0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(
        0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "pipe_X": xpipe,
        "X_source": X_pre,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return (to_return)
Example #16
    def detect(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]) -> Dict[str, Union[float, np.ndarray]]:
        """
        determine outlier and distance thresholds
        return thresholds, outlier model(s) and source distributions for distances
        NOTE: for classification the outlier detection on y is not needed
        """

        if isinstance(X, pd.DataFrame):
            X = X.values

        if isinstance(y, pd.Series):
            y = y.values


        contamination: float = 0.01

        xpipe = Pipeline(steps=[('pca', PCA(2, random_state=self.seed)),
                                ('clf', EllipticEnvelope(random_state=self.seed, contamination=contamination))])
        xpipe.fit(X)

        bs_samples: int = 1000
        outliers_X: np.ndarray = np.zeros(bs_samples)
        wasserstein_X: np.ndarray = np.zeros(bs_samples)
        wasserstein_y: np.ndarray = np.zeros(bs_samples)

        for b in range(bs_samples):
            # set random seed
            rng = np.random.default_rng(self.seed + b)

            n_samples = int(np.round(0.80 * X.shape[0]))
            subset_indices = rng.choice(np.arange(X.shape[0]), n_samples, replace=True).astype(int)
            y_bs = y[subset_indices]
            X_bs = X[subset_indices, :]

            test1 = xpipe.predict(X_bs)
            wasserstein_X[b] = wasserstein_distance(X.flatten(), X_bs.flatten())
            wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
            outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

        ## determine thresholds as a function of the confidence intervals
        outliers_X.sort()
        outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(0.025 * bs_samples)]

        wasserstein_X.sort()
        wasserstein_X_threshold = wasserstein_X[int(0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

        wasserstein_y.sort()
        wasserstein_y_threshold = wasserstein_y[int(0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

        result = {
                    "outlier_X": np.round(outlier_X_threshold, 2),
                    "wasserstein_X": np.round(wasserstein_X_threshold, 2),
                    "wasserstein_y": np.round(wasserstein_y_threshold, 2)
                  }
        return result
Example #17
def get_monitoring_tools(df):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed

    """
    X, y, dates = engineer_features(df)
    X1 = X.to_numpy()
    xpipe = Pipeline(steps=[('pca', PCA(2)),
                            ('clf', EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X1)
    bs_samples = 549
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)
    for b in range(bs_samples):
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X1.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X1[subset_indices, :]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X1.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(
        0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(
        0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(
        0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "clf_X": xpipe,
        "X_source": X1,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return (to_return)
Example #18
def report_harmonic_onestep(metric_function):
    
    error_list = []
    
    time_points, test_data = harmonic.simulate_custom(xinit=1, yinit=0)

    feature_list = [] # here we do not have external time-series or control variables
    target_list = ['x_component', 'y_component']
    df_test = create_data(test_data, time_points, cols=target_list, num=2)

    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(0, 4, 2)
        yi = np.random.uniform(0, 4, 2)
        time_points, data1 = harmonic.simulate_custom(xinit=xi[0], yinit=yi[0])    
        time_points, data2 = harmonic.simulate_custom(xinit=xi[1], yinit=yi[1])

        # now generate and augment the training dataset
        df1 = create_data(data1, time_points, cols=target_list, num=0)
        df2 = create_data(data2, time_points, cols=target_list, num=1)
        df = pd.concat([df1, df2])
        df_train = train_onestep.generate_dataset(df, [0,1],feature_list, target_list, n_dim=30000)

        rf_model = RandomForestRegressor(n_estimators=20)
        figure_path = './plots/'
        rf_dict, score_dict = train_onestep.train_classic(df_train, rf_model, plot=False,model_type='random_forest', figure_path=figure_path)
        time_points, predictions = predict_onestep.predict_integrate(df_test, df, rf_dict, target_list, feature_list, title='test', plot=False,model_type='random_forest', subplots=(2,1), bio=False)
        predictions = predictions.to_numpy()

        if metric_function == "wasserstein":
            e1 = wasserstein_distance(predictions[:,0], test_data[0,:,0])
            e2 = wasserstein_distance(predictions[:,1], test_data[0,:,1])
            
        elif metric_function == "dtw":
            e1, _ = fastdtw(predictions[:,0], test_data[0,:,0], dist=euclidean)
            e2, _ = fastdtw(predictions[:,1], test_data[0,:,1], dist=euclidean)
            e1 /= np.linalg.norm(test_data[0,:,0], 2)**2
            e2 /= np.linalg.norm(test_data[0,:,1], 2)**2
            
        elif metric_function == "mse":
            e1 = predict_lmmNet.compute_MSE(predictions, test_data[0], 0)
            e2 = predict_lmmNet.compute_MSE(predictions, test_data[0], 1)
        error_list.append((e1, e2))
        
        # plot
        plt.plot(time_points, test_data[0,:,0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0,:,1], 'y.', label='x_2')
        plt.plot(time_points, predictions[:,0], 'b--', label='predicted x_1')
        plt.plot(time_points, predictions[:,1], 'b--', label='predicted x_2')
        plt.title(str(xi) + " " + str(yi))
        plt.legend()
        plt.show()
        
    return error_list
Example #19
    def predict(self, seg_list, **kwargs):
        # from sklearn.cross_decomposition import CCA
        # X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]]
        # Y = [[0.1], [0.9], [6.2], [11.9]]
        # cca = CCA(n_components=1)
        # cca.fit(X, Y)
        # X_c, Y_c = cca.transform(X, Y)
        # print(X_c, Y_c)

        for idx in range(len(seg_list) - 1):
            seg_a = seg_list[idx]
            seg_b = seg_list[idx + 1]
            w, h = original_shape = tuple(seg_a.shape)
            seg_a = np.reshape(
                np.array(seg_a, dtype=np.float32) / np.max(seg_a), (w * h))
            seg_b = np.reshape(
                np.array(seg_b, dtype=np.float32) / np.max(seg_b), (w * h))

            # seg_as = seg_list[idx]
            # seg_bs = seg_list[idx+1]
            # seg_a = self.add_position_array(imgtools.expand_image_dim(seg_as))
            # seg_b = self.add_position_array(imgtools.expand_image_dim(seg_bs))
            # w, h, _ = original_shape = tuple(seg_a.shape)

            # seg_a = np.reshape(np.array(seg_a, dtype=np.float32), (w * h, 3))
            # seg_b = np.reshape(np.array(seg_b, dtype=np.float32), (w * h, 3))
            # # print(seg_a.shape)
            # seg_a = scale(seg_a)
            # seg_b = scale(seg_b)

            # seg_a = np.squeeze(np.reshape(np.array(seg_a, dtype=np.float32), (w * h * 3)))
            # seg_b = np.squeeze(np.reshape(np.array(seg_b, dtype=np.float32), (w * h * 3)))

            # print("A->B:{}".format(cv2.EMD(seg_a, seg_b, cv2.DIST_L2)))
            print("A->B:{}".format(wasserstein_distance(seg_a, seg_b)))
            for a_idx in np.unique(seg_a):
                seg_a_label = seg_a == a_idx
                # seg_a_label = self.add_position_array(imgtools.expand_image_dim(seg_as == a_idx))
                # seg_a_label = np.reshape(np.array(seg_a_label, dtype=np.float32), (w * h , 3))

                # seg_a_label = scale(seg_a_label)
                # seg_a_label = np.squeeze(np.reshape(np.array(seg_a_label, dtype=np.float32), (w * h * 3)))
                for b_idx in np.unique(seg_b):
                    seg_b_label = seg_b == b_idx
                    # seg_b_label = self.add_position_array(imgtools.expand_image_dim(seg_bs == b_idx))
                    # seg_b_label = np.reshape(np.array(seg_b_label, dtype=np.float32), (w * h , 3))

                    # seg_b_label = scale(seg_b_label)
                    # seg_b_label = np.squeeze(np.reshape(np.array(seg_b_label,   dtype=np.float32), (w * h * 3)))
                    # print("A({})->B({}):{}".format(seg_a_label, seg_b_label, cv2.EMD(seg_a_label, seg_b_label)))
                    print("A({})->B({}):{}".format(
                        a_idx, b_idx,
                        wasserstein_distance(seg_a_label, seg_b_label)))
Example #20
def compute_statistics(output_adj, output_coord, output_seq_len, y_adj, y_coord, y_seq_len, lamb=0.5):
    r"""
    Compute statistics for the current data point.
    
    :param output_adj: predicted A
    :param output_coord: predicted X
    :param output_seq_len: predicted |V|
    :param y_adj: target A
    :param y_coord: target X
    :param y_seq_len: target |V|
    :param lamb: lambda parameter for the loss in this experiment
    :return: streetmover, loss, loss_adj, loss_coord, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
    """
    output_A = decode_adj(output_adj[0, :output_seq_len - 2].cpu().numpy())  # not including the last entry
    y_A = decode_adj(y_adj[0, :y_seq_len - 2].cpu().numpy())
    output_nodes = output_coord[0, :output_seq_len - 2]
    y_nodes = y_coord[0, :y_seq_len - 2]
    output_graph = nx.from_numpy_matrix(output_A)
    y_graph = nx.from_numpy_matrix(y_A)
    
    assert output_A.shape[0] == output_nodes.shape[0] == output_seq_len - 2
    assert y_A.shape[0] == y_nodes.shape[0] == y_seq_len - 2
    
    output_n_edges = output_adj.reshape(-1).sum()
    y_n_edges = y_adj.reshape(-1).sum()
    
    output_degree = get_degree_hist(output_graph)
    y_degree = get_degree_hist(y_graph)
    dist_degree = wasserstein_distance(output_degree, y_degree)
    
    output_diam = get_diameters(output_graph)
    y_diam = get_diameters(y_graph)
    dist_diam = wasserstein_distance(output_diam, y_diam) if len(output_diam) > 0 else 1
    
    delta_n_nodes = int(output_seq_len - y_seq_len)
    delta_n_edges = (output_n_edges - y_n_edges).item()
    
    acc_A = get_accuracy_A(output_A, y_A)
    
    loss_adj = get_BCE_adj(output_adj[0], y_adj[0])
    loss_coord = get_MSE_coord(output_nodes, y_nodes)
    loss = lamb * loss_adj + (1 - lamb) * loss_coord
    
    (y_pc, output_pc), (streetmover, P, C) = streetmover_distance(y_A, y_nodes, output_A, output_nodes, n_points=100)
    # print("Streetmover distance: {:.3f}".format(streetmover.item()))
    
    # possibly, plot assignments and/or point clouds
    # show_assignments(y_pc, output_pc, P, title=str(streetmover.item())[:8])
    # plot_point_cloud(y_adj[0], y_coord[0], y_pc)
    # plot_point_cloud(output_adj[0], output_coord[0], output_pc)
    
    return streetmover.item(), loss, loss_adj, loss_coord, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
Example #21
def model_monitor(country="all", dev=DEV, training=True):
    """
    performance monitoring
    """
    print("Monitor Model")
    
    ## import data
    #datasets = engineer_features(training=training, dev=dev)
    datasets = engineer_features(training=training)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)
    
    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)
    
    ## monitor RMSE
    samples = [10, 20, 30, 50, 60]

    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        queries = [(str(d.year), str(d.month), str(d.day), country) for d in dates_new]
        y_pred = [model_predict(year=query[0], month=query[1], day=query[2], country=query[3],verbose=False, dev=dev)["y_pred"][0].round(2) for query in queries]
        rmse = np.sqrt(mean_squared_error(y_new.tolist(),y_pred))
        print("sample size: {}, RSME: {}".format(n, rmse.round(2)))
        
    ## monitor performance
    ## scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    samples = [25, 50, 75, 90]

    clf_y = EllipticEnvelope(random_state=0,contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0,contamination=0.01)

    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size,1))

    results = defaultdict(list)
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n,X,y, dates)
        results["sample_size"].append(n)
        results['wasserstein_X'].append(np.round(wasserstein_distance(X.flatten(),X_new.flatten()),2))
        results['wasserstein_y'].append(np.round(wasserstein_distance(y,y_new),2))
        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size,1))
        results["outlier_percent_X"].append(np.round(1.0 - (test1[test1==1].size / test1.size),2))
        results["outlier_percent_y"].append(np.round(1.0 - (test2[test2==1].size / test2.size),2))
    
    return pd.DataFrame(results)
Example #22
def cal_emd_lose(out_float_list, out_quant_list, out_len):
    """caculate earch move distance"""
    emd_sum = 0
    if out_len >= 3:
        for index in range(len(out_float_list)):
            emd_sum += wasserstein_distance(out_float_list[index],
                                            out_quant_list[index])
    else:
        out_float = np.concatenate(out_float_list)
        out_quant = np.concatenate(out_quant_list)
        emd_sum += wasserstein_distance(out_float, out_quant)
    emd_sum /= float(len(out_float_list))
    return emd_sum
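An illustrative call with synthetic per-layer outputs (names and shapes are made up; np and wasserstein_distance are assumed to be imported as in the example):

float_outs = [np.random.randn(64) for _ in range(4)]
quant_outs = [x + np.random.normal(scale=0.01, size=64) for x in float_outs]
print(cal_emd_lose(float_outs, quant_outs, out_len=4))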
Example #23
def report_linear_lmmnet(metric_function):
    
    error_list = []
    
    time_points, test_data = linear.simulate_default()


    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(1, 4, 1)[0]
        yi = np.random.uniform(0, 4, 1)[0]
        zi = np.random.uniform(1, 2, 1)[0]
        time_points, cubic_data = linear.simulate_custom(xinit=xi, yinit=yi, zinit=zi)

        model = train_lmmNet.train_easy(time_points, cubic_data)
        x0 = test_data[0,0,:] # initial conditions
        predicted_traj = odeint(lambda x, t: predict_lmmNet.predict_fn(x, t, model), x0, time_points)

        predictions = predicted_traj
        if metric_function == "wasserstein":
            e1 = wasserstein_distance(predictions[:,0], test_data[0,:,0])
            e2 = wasserstein_distance(predictions[:,1], test_data[0,:,1])
            e3 = wasserstein_distance(predictions[:,2], test_data[0,:,2])
            
        elif metric_function == "dtw":
            e1, _ = fastdtw(predictions[:,0], test_data[0,:,0], dist=euclidean)
            e2, _ = fastdtw(predictions[:,1], test_data[0,:,1], dist=euclidean)
            e3, _ = fastdtw(predictions[:,2], test_data[0,:,2], dist=euclidean)
            e1 /= np.linalg.norm(test_data[0,:,0], 2)**2
            e2 /= np.linalg.norm(test_data[0,:,1], 2)**2
            e3 /= np.linalg.norm(test_data[0,:,2], 2)**2
            
        elif metric_function == "mse":
            e1 = predict_lmmNet.compute_MSE(predictions, test_data[0], 0)
            e2 = predict_lmmNet.compute_MSE(predictions, test_data[0], 1)
            e3 = predict_lmmNet.compute_MSE(predictions, test_data[0], 2)
        error_list.append((e1, e2, e3))
        
        # plot
        plt.figure(figsize=(20, 10))
        plt.plot(time_points, test_data[0,:,0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0,:,1], 'y.', label='x_2')
        plt.plot(time_points, test_data[0,:,2], 'g.', label='x_3')
        plt.plot(time_points, predictions[:,0], 'b--', label='predicted dynamics')
        plt.plot(time_points, predictions[:,1], 'b--')
        plt.plot(time_points, predictions[:,2], 'b--')
        plt.title(str(xi) + " " + str(yi) + " " + str(zi))
        plt.legend()
        plt.show()
        
    return error_list
Example #24
def calc_distance_matrix(X, method):
    if method in ['chebyshev', 'euclidean', 'l1', 'l2']:
        DM = DistanceMetric.get_metric(method).pairwise(X)
    elif method in ['cosine']:
        DM = pairwise.cosine_distances(X)
    elif method in [
            'correlation', 'cityblock', 'braycurtis', 'canberra', 'hamming',
            'jaccard', 'kulsinski'
    ]:
        DM = squareform(pdist(X, method))
    elif method in ['minkowski3']:
        DM = squareform(pdist(X, 'minkowski', 3))
    elif method in ['dot']:
        DM = squareform(pdist(X, lambda u, v: np.dot(u, v)))
    elif method in ['emd']:
        from scipy.stats import wasserstein_distance
        l = len(X)
        DM = np.zeros((l, l))
        for x in range(l):
            for y in range(l):
                DM[x, y] = wasserstein_distance(X[x], X[y])
    else:
        return None

    return DM
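An illustrative call (not from the original source) of the 'emd' branch on three small 1-D samples; note that this branch fills the full l x l matrix even though it is symmetric with a zero diagonal.

X = [np.array([0.0, 1.0, 2.0]),
     np.array([1.0, 2.0, 3.0]),
     np.array([5.0, 6.0, 7.0])]
print(calc_distance_matrix(X, 'emd'))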
Example #25
    def fit(self, X, y, w):
        labels = np.unique(y)
        assert 0 < len(labels) <= 2

        super(BinaryWPCA, self).fit(X, y, w)
        assert self.R is not None

        sel = (y == labels[0])
        X1 = X[sel, :] @ self.R
        w1 = w[sel]
        w2 = w[~sel]
        X2 = X[~sel, :] @ self.R

        assert X1.shape == X2.shape
        _, n = X1.shape
        distances = [
            wasserstein_distance(u_values=X1[:, i],
                                 u_weights=w1,
                                 v_values=X2[:, i],
                                 v_weights=w2) for i in range(n)
        ]

        idx = np.flip(np.argsort(distances))
        self.R = self.R[:, idx]
        self.is_fitted = True
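The weighted call signature used in fit above, shown in isolation with made-up numbers; the weights do not need to be normalised, since scipy normalises them internally.

from scipy.stats import wasserstein_distance
d = wasserstein_distance(u_values=[0.0, 1.0, 3.0], u_weights=[1.0, 1.0, 2.0],
                         v_values=[0.5, 2.0], v_weights=[3.0, 1.0])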
Example #26
def get_dist(outf, layer, result_j, predicted, sample_size, criteria):
    final_tr = np.load(outf + "emp_bnn_train_"+str(layer) + ".npy")
    predicted_tr = np.load(outf + "labels_bnn_train.npy")
    #print(layer)
    if layer < 44:
        pca_model = pk.load(open(outf + "pca_bnn"+str(layer)+".pkl","rb"))
        #final_tr = pca_model.transform(final_tr)
        #print(result_j.cpu().detach().numpy().shape)
        final_adv = pca_model.transform(result_j.cpu().detach().numpy())
    else:
        final_adv = result_j.cpu().detach().numpy()
        
    distance = np.zeros(final_adv.shape[0])
    for i in range(final_adv.shape[0]):
        data_train_sample = final_tr[predicted_tr == int(predicted[i])]
        #print(predicted[i], data_train_sample.shape)
        ind = np.random.choice(data_train_sample.shape[0],min(sample_size, data_train_sample.shape[0]),replace=False)
        data_train_sample_i = data_train_sample[ind,]
        dist = np.zeros(data_train_sample_i.shape[0])
        for k in range(data_train_sample_i.shape[0]):
            dist[k] = wasserstein_distance(final_adv[i,:], data_train_sample_i[k,:])
        #print(dist)
        if len(dist) == 0:   
            dis_adv = 0
        elif criteria == 'mean':
            dis_adv = dist.mean()
        elif criteria == 'min':
            dis_adv = dist.min()
        else:
            dis_adv = np.median(dist)
        distance[i] = dis_adv
    return distance
Example #27
def triplet_analysis(df, categories=CATEGORIES):
    df2 = df.drop_duplicates()
    for category in categories:
        groupby_cols = categories[:]
        groupby_cols.remove(category)

        fname = 'triplets_%s.csv' % category
        fname = fname.replace(' ', '_')
        with open(fname, 'w') as f:
            for name, group in df2.groupby(groupby_cols):
                unique_vals = group[category].unique()
                group_items = ['--------------------', ', '.join(name), '--------------------']
                write_items = []
                for val1, val2 in combinations(group[category].unique(), 2):
                    sample1 = group['alpha'][group[category] == val1]
                    sample2 = group['alpha'][group[category] == val2]
                    n1 = len(sample1)
                    n2 = len(sample2)
                    if n1 > 25 and n2 > 25:
                        dist = wasserstein_distance(sample1, sample2)
                        write_items.append((dist, n1, n2, val1, val2))
                        #write_items.append('%s (%d), %s (%d), %f' % (val1, n1, val2, n2, dist))
                if len(write_items) > 0:
                    f.write('-----------------------\n')
                    f.write(', '.join(name) + '\n')
                    f.write('-----------------------\n')
                    write_items = reversed(sorted(write_items))
                    for dist, n1, n2, val1, val2 in write_items:
                        f.write('%s (%d), %s (%d), %f\n' % (val1, n1, val2, n2, dist))
Example #28
def date_dist_scores(ref_timeline, ground_truth, p_val=.05):
    '''
    Scores predicted distribution of timeline event dates using 2 sample
        Kolmogorov-Smirnov statistic and the first Wasserstein distance
        (earth mover's distance)

    Returns dict with KS statistic, if time distributions are statistically
        significantly different, and the Wasserstein distance
    '''
    gt_dates = [time.mktime(d.timetuple()) for d in ground_truth.get_dates()]
    ref_dates = [time.mktime(d.timetuple()) for d in ref_timeline.get_dates()]

    scaler = MinMaxScaler()

    gt_scaled = scaler.fit_transform(np.array(gt_dates).reshape(-1, 1)).T[0]
    ref_scaled = scaler.transform(np.array(ref_dates).reshape(-1, 1)).T[0]

    ks_test = stats.ks_2samp(gt_scaled, ref_scaled)
    emd = stats.wasserstein_distance(gt_scaled, ref_scaled)

    # ks_signif is 1 when the difference in date distribution between ground
    # truth and generated timelines is statistically significant
    return {
        'ks_stat': ks_test.statistic,
        'ks_signif': int(ks_test.pvalue < p_val),
        'earth_movers_distance': emd
    }
Example #29
def wasserstein_test(u_values, v_values, bootstraps=999, use_gamma_model=True):
    # permutation test of wasserstein distance
    # based on the one outlined in https://github.com/cdowd/twosamples
    wass_dist, wass_dir = wasserstein_distance_and_direction(
        u_values, v_values)

    # under null hypothesis the samples are drawn from the same distribution
    # so we can make expected wasserstein values by permuting values between
    # the two samples
    pool = np.concatenate([u_values, v_values])
    n = len(u_values)
    exp = []
    for _ in range(bootstraps):
        np.random.shuffle(pool)
        exp.append(stats.wasserstein_distance(pool[:n], pool[n:]))
    exp = np.array(exp)

    if not use_gamma_model:
        # bootstrap p value with pseudocount
        p_val = ((exp >= wass_dist).sum() + 1) / (bootstraps + 1)
    else:
        # fit a gamma distribution to the expected distances
        g = stats.gamma(*stats.gamma.fit(exp))
        # compute p value using survival function
        p_val = g.sf(wass_dist)
    return wass_dist, wass_dir, p_val
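A compressed, self-contained sketch of the same permutation idea (plain bootstrap p-value only; wasserstein_distance_and_direction from the original is not reproduced):

import numpy as np
from scipy import stats

def wasserstein_perm_pval(u, v, n_perm=999, seed=0):
    rng = np.random.default_rng(seed)
    observed = stats.wasserstein_distance(u, v)
    pool = np.concatenate([u, v])
    n = len(u)
    exp = np.empty(n_perm)
    for b in range(n_perm):
        perm = rng.permutation(pool)
        exp[b] = stats.wasserstein_distance(perm[:n], perm[n:])
    # p-value with pseudocount, as in the non-gamma branch above
    return ((exp >= observed).sum() + 1) / (n_perm + 1)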
Example #30
def get_decision(layer, result_j, predicted, thrd, sample_size, criteria):
    final_tr = torch.load("./hidden_output/emp_bnn_train_"+str(layer)).cpu().detach().numpy()
    predicted_tr = np.load("./data/predicts_bnn_train.npy")
    if layer < 43:
        pca_model = pk.load(open("./data/pca_bnn"+str(layer)+".pkl","rb"))
        final_tr = pca_model.transform(final_tr)
        final_adv = pca_model.transform(result_j.cpu().detach().numpy())
    else:
        final_adv = result_j.cpu().detach().numpy()
        
    decision = np.zeros(final_adv.shape[0])
    for i in range(final_adv.shape[0]):
        data_train_sample = final_tr[predicted_tr == int(predicted[i])] 
        ind = np.random.choice(data_train_sample.shape[0],min(sample_size, data_train_sample.shape[0]),replace=False)
        data_train_sample_i = data_train_sample[ind,]
        dist = np.zeros(data_train_sample_i.shape[0])
        for k in range(data_train_sample_i.shape[0]):
            dist[k] = wasserstein_distance(final_adv[i,:], data_train_sample_i[k,:])
        if criteria == 'mean':
            dis_adv = dist.mean()
        elif criteria == 'min':
            dis_adv = dist.min()
        else:
            dis_adv = np.median(dist)
        if dis_adv > thrd[int(predicted[i])]:
            decision[i] = 1
    return decision