def sliced_wasserstein_no_histogram(p, q, iters=20):
    '''
    Utility function for the 1-Sliced Wasserstein distance.

    Tries to fight the curse of dimensionality by only considering the
    "bins" around the sampled values: the sampled points are used as the
    support of both distributions, once with (near-)uniform weights and once
    weighted by ``q``. If the sampled values are entirely wrong, this method
    will not work >> use first moment test first.

    p = sampled values (2-D array, one row per sample)
    q = density function at the sampled values
    iters = number of random projection directions that are averaged

    Returns ``inf`` when ``p`` contains a NaN or when at most 10% of its rows
    are distinct; otherwise the (averaged) Wasserstein distance.
    '''
    # Vectorized NaN test instead of a Python-level loop over every element.
    if np.isnan(p).any():
        return float('inf')
    # Degenerate sample: (almost) all rows are duplicates of each other.
    if len(set(tuple(x) for x in p)) / len(p) <= 0.1:
        return float('inf')

    # The weights do not depend on the projection, so build them once.
    uniform_weights = np.ones(p.shape[0]) + EPS
    density_weights = q + EPS

    dim = p.shape[1]
    if dim == 1:
        values = p.flatten()
        return wasserstein_distance(values, values,
                                    uniform_weights, density_weights)

    dist = 0
    for _ in range(iters):
        proj_vec = normal(size=dim)
        proj_vec = proj_vec / norm(proj_vec)  # sample randomly from dim-1 sphere
        # Project every sample onto the direction with one matrix-vector product.
        bins = np.dot(p, proj_vec)
        dist += wasserstein_distance(bins, bins,
                                     uniform_weights, density_weights)
    return dist / iters
def get_monitoring_tools(X, y):
    """
    Derive outlier and distance thresholds by bootstrapping the source data.

    The features are run through the project preprocessor, a PCA +
    EllipticEnvelope pipeline is fitted on the result, and 1000 bootstrap
    resamples (80% of the rows, drawn with replacement) are compared with the
    full data.

    Returns a dict with the three thresholds, the fitted preprocessor and
    outlier pipeline, and the source / latest data.

    NOTE: for classification the outlier detection on y is not needed
    """
    preprocessor = get_preprocessor().fit(X)
    X_pp = preprocessor.transform(X)

    outlier_steps = [
        ('pca', PCA(2)),
        ('clf', EllipticEnvelope(random_state=0, contamination=0.01)),
    ]
    xpipe = Pipeline(steps=outlier_steps)
    xpipe.fit(X_pp)

    bs_samples = 1000
    n_rows = X.shape[0]
    n_samples = int(np.round(0.80 * n_rows))
    source_values = X_pp.flatten()

    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        subset_indices = np.random.choice(
            np.arange(n_rows), n_samples, replace=True).astype(int)
        X_bs = X_pp[subset_indices, :]
        y_bs = y[subset_indices]

        flags = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(source_values, X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        # percentage of resampled rows not labelled 1 by the pipeline
        outliers_X[b] = 100 * (1.0 - (flags[flags == 1].size / flags.size))

    lower = int(0.025 * bs_samples)
    upper = int(0.975 * bs_samples)

    def _threshold(values):
        # threshold = sum of the 97.5% and 2.5% order statistics
        values.sort()
        return values[upper] + values[lower]

    outlier_X_threshold = _threshold(outliers_X)
    wasserstein_X_threshold = _threshold(wasserstein_X)
    wasserstein_y_threshold = _threshold(wasserstein_y)

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "preprocessor": preprocessor,
        "clf_X": xpipe,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y,
    }
    return (to_return)
def calc_c_hist(self):
    """
    Compare the histograms of all of the eigenvectors (eigenfunctions) for each of the meshes.

    Fills ``self.c_hist[i, j]`` with the Wasserstein distance between target
    eigenvector ``i`` and source eigenvector ``j``, and ``self.c_hist_f[i, j]``
    with the distance to the sign-flipped source eigenvector, for the first
    ``self.n_features`` eigenvectors.

    The paper says to do this for the number of features we are interested in
    (i.e. 5 or 6); this implementation does it for more than the requested
    number of features so that the coordinates can be re-ordered and the
    appropriate number selected afterwards.

    :return: None (results are written to ``self.c_hist`` / ``self.c_hist_f``)
    """
    # Initially tried using straight values (not log) but eig vec 1 got accentuated too much (in the weighting).
    # Then tried just log, but because there are negative values it creates errors.
    # Need to add .5 + a small value to ensure there are no 0 values entered to log.
    # wasserstein_distance is the same as earth movers distance, and is the minimum "work" needed to
    # transform u (first entry) into v (second entry).
    eps = np.finfo(float).eps
    n = self.n_features

    # Take the logs once per column instead of once per (i, j) pair.
    target = self.rand_target_eig_vecs[:, :n]
    source = self.rand_source_eig_vecs[:, :n]
    log_target = np.log(target + 0.5 + eps)
    log_source = np.log(source + 0.5 + eps)
    log_source_flipped = np.log(-source + 0.5 + eps)

    for i in range(n):
        for j in range(n):
            self.c_hist[i, j] = wasserstein_distance(
                log_target[:, i], log_source[:, j])
            self.c_hist_f[i, j] = wasserstein_distance(
                log_target[:, i], log_source_flipped[:, j])
def estimate_noise_model(img_segment):
    """
    Score how well five candidate noise models match an image segment.

    A PDF is generated for each candidate (gaussian, rayleigh, erlang,
    exponential, uniform) from the estimated mean and standard deviation of
    the segment and compared with the segment's PDF via the Wasserstein
    distance.

    Returns ``(distances, min_distance)``: a dict mapping model name to its
    distance, and a tuple of ``(name, distance)`` pairs for the model(s) that
    attain the smallest distance.
    """
    mean, standard_deviation, _hist, _bins = estimate_noise_parameters(
        img_segment, return_histogram=True)
    pdf, _pdf_bins = histogram_processing.compute_image_pdf(img_segment)

    generators = (
        ('gaussian', generate_gaussian_pdf),
        ('rayleigh', generate_rayleigh_pdf),
        ('erlang', generate_erlang_pdf),
        ('exponential', generate_exponential_pdf),
        ('uniform', generate_uniform_pdf),
    )
    # Generate every model PDF first, then score them in the same order.
    model_pdfs = [(name, make_pdf(mean, standard_deviation))
                  for name, make_pdf in generators]

    distances = {}
    for name, model_pdf in model_pdfs:
        distances[name] = wasserstein_distance(
            pdf, np.array(list(model_pdf.values())))

    smallest = min(distances.values())
    min_distance = tuple((name, value)
                         for name, value in distances.items()
                         if value == smallest)
    return distances, min_distance
def compare_drift(X_src, y_src, X_new, y_new):
    """
    Compare new data against source data with distances and outlier rates.

    Two EllipticEnvelope models are fitted on the source features and the
    source target; the new data is then scored by them. ``X_src`` / ``X_new``
    are accessed through ``.values`` and ``y_src`` / ``y_new`` through
    ``reshape`` / ``flatten``.

    Returns a dict with the Wasserstein distances for X and y and the
    fraction of new rows labelled -1 (outlier) for X and y.
    """
    def _outlier_fraction(flags):
        # share of predictions equal to -1
        return len(flags[flags == -1]) / len(flags)

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X.fit(X_src)
    clf_y.fit(y_src.reshape(y_src.size, 1))

    test_X = clf_X.predict(X_new)
    test_y = clf_y.predict(y_new.reshape(-1, 1))

    X_distance = wasserstein_distance(X_src.values.flatten(),
                                      X_new.values.flatten())
    y_distance = wasserstein_distance(y_src.flatten(), y_new.flatten())

    results = {
        'X_wasserstein_distance': X_distance,
        'y_wasserstein_distance': y_distance,
        'X_outlier_percentage': _outlier_fraction(test_X),
        'y_outlier_percentage': _outlier_fraction(test_y),
    }
    return results
def _rank_weighted_specificity(distribution):
    """Return |p[0] - sum(p[1:] * w)| where w are descending rank weights that sum to 1."""
    # Ranks n-1, n-2, ..., 1 for the classes after the first one.
    rank = np.arange(distribution.size - 1, 0, -1, dtype=float)
    perc = rank / rank.sum()
    return abs(distribution[0] - np.sum(distribution[1:] * perc))


def metric_max(n_classes, Mtype):
    """
    Return the largest value the chosen metric can take for ``n_classes`` classes.

    The maximum is evaluated between the uniform reference distribution and
    the most extreme distribution, which puts all the mass on the first class.

    :param n_classes: number of classes
    :param Mtype: metric name, one of "L1", "L2", "Is" (mean of L1 and
        specificity), "Wd" (Wasserstein distance) or "Wds" (mean of
        Wasserstein distance and specificity)
    :return: the metric value; 0 for any other ``Mtype``
    """
    Pref = np.ones(n_classes) / n_classes  # Reference attribute
    Pep = np.zeros(n_classes)
    Pep[0] = 1

    if Mtype == "L1":
        return abs(Pep - Pref).sum()
    if Mtype == "L2":
        return np.sqrt(((Pep - Pref) ** 2).sum())
    if Mtype == "Is":
        l1_fair_d = abs(Pep - Pref).sum()
        return (l1_fair_d + _rank_weighted_specificity(Pep)) / 2
    if Mtype == "Wd":
        return wasserstein_distance(Pep, Pref)
    if Mtype == "Wds":
        ws = wasserstein_distance(Pep, Pref)
        return (ws + _rank_weighted_specificity(Pep)) / 2
    # Unknown metric names keep yielding 0, as before.
    return 0
def compute_statistics_MLP(y_A, y_nodes, output_A, output_nodes, y_seq_len, output_seq_len):
    r"""
    Compute statistics for the current data point, based on the one-shot output from the MLP decoder.

    :param y_A: target adjacency matrix A
    :param y_nodes: target nodes X (forwarded to ``streetmover_distance``)
    :param output_A: predicted adjacency matrix A
    :param output_nodes: predicted nodes X (forwarded to ``streetmover_distance``)
    :param y_seq_len: target |V|
    :param output_seq_len: predicted |V|
    :return: streetmover, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
    """
    # nx.from_numpy_matrix was removed in NetworkX 3.0; prefer from_numpy_array
    # and only fall back to the old name on releases that lack it.
    graph_from_matrix = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
    output_graph = graph_from_matrix(output_A)
    y_graph = graph_from_matrix(y_A)

    output_degree = get_degree_hist(output_graph)
    y_degree = get_degree_hist(y_graph)
    dist_degree = wasserstein_distance(output_degree, y_degree)

    output_diam = get_diameters(output_graph)
    y_diam = get_diameters(y_graph)
    # With no predicted diameters the distance is fixed to 1.
    dist_diam = wasserstein_distance(output_diam, y_diam) if len(output_diam) > 0 else 1

    delta_n_nodes = int(output_seq_len - y_seq_len)
    delta_n_edges = output_A.sum() - y_A.sum()
    acc_A = get_accuracy_A(output_A, y_A)

    (y_pc, output_pc), (streetmover, P, C) = streetmover_distance(
        y_A, y_nodes, output_A, output_nodes, n_points=100)
    # print("Streetmover distance: {:.3f}".format(streetmover.item()))

    return streetmover.item(), acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
def wasserstein(dataset_path):
    """
    Print the max and average per-feature Wasserstein distance between splits.

    For every dataset class, each feature column of the training split is
    compared with the same column of the validation and test splits.

    This needs to be the full datasets to make sense. Since it is the full
    datasets, this is not included when running all tests.

    :param dataset_path: path handed to every dataset class constructor
    """
    from scipy.stats import wasserstein_distance

    for dataset_cls in all_datasets():
        print('Loading dataset:', dataset_cls.__name__)
        train = dataset_cls(dataset_path, split=TRAIN)
        valid = dataset_cls(dataset_path, split=VALIDATION)
        test = dataset_cls(dataset_path, split=TEST)

        ws_max = 0.0
        ws_ave = 0.0
        # Two comparisons (validation, test) per feature.
        n_comparisons = train.num_features * 2
        for i in range(train.num_features):
            us = train.x[:, i]
            for other in (valid, test):
                # Compute the distance once and reuse it for max and average.
                ws = wasserstein_distance(us, other.x[:, i])
                ws_max = max(ws_max, ws)
                ws_ave += ws / n_comparisons

        print('Max wasserstein:', ws_max)
        print('Average wasserstein:', ws_ave)
        print()
def f_dist(histogram1, histogram2):
    """
    Return the Wasserstein distance between two histograms.

    A histogram is either the sentinel ``[0]`` (no data) or a sequence whose
    first element holds the values to compare. A missing histogram is treated
    as a single zero; when both are missing the distance is 0.0 (previously
    the function fell through and returned ``None``).
    """
    missing1 = histogram1 == [0]
    missing2 = histogram2 == [0]
    if missing1 and missing2:
        return 0.0
    if missing2:
        return wasserstein_distance(histogram1[0], np.zeros((1, )))
    if missing1:
        return wasserstein_distance(histogram2[0], np.zeros((1, )))
    return wasserstein_distance(histogram1[0], histogram2[0])
def report_linear_onestep(metric_function):
    """
    Train one-step random-forest models on the linear system and report test errors.

    Ten training trajectories are simulated from random initial conditions;
    for each one a model is trained, integrated on the default test
    trajectory, scored per component, and the result is plotted.

    :param metric_function: "wasserstein", "dtw" or "mse"
    :return: list of ``(e1, e2, e3)`` tuples, one per run
    :raises ValueError: if ``metric_function`` is not a supported metric
        (checked before any simulation or training is done)
    """
    supported = ("wasserstein", "dtw", "mse")
    if metric_function not in supported:
        raise ValueError(
            "metric_function must be one of {}, got {!r}".format(supported, metric_function))

    error_list = []
    time_points, test_data = linear.simulate_default()

    feature_list = []  # here we do not have external time-series or control variables
    target_list = ['x_component', 'y_component', 'z_component']
    df_test = create_data(test_data, time_points, cols=target_list, num=2)

    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(1, 4, 1)[0]
        yi = np.random.uniform(0, 4, 1)[0]
        zi = np.random.uniform(1, 2, 1)[0]

        time_points, cubic_data = linear.simulate_custom(xinit=xi, yinit=yi, zinit=zi)
        df = create_data(cubic_data, time_points, cols=target_list, num=0)
        df_train = train_onestep.generate_dataset(df, [0], feature_list, target_list, n_dim=2500)

        rf_model = RandomForestRegressor(n_estimators=20)
        figure_path = './plots/'
        rf_dict, score_dict = train_onestep.train_classic(
            df_train, rf_model, plot=False, model_type='random_forest', figure_path=figure_path)
        time_points, predictions = predict_onestep.predict_integrate(
            df_test, df, rf_dict, target_list, feature_list, title='test', plot=False,
            model_type='random_forest', subplots=(3, 1), bio=False)
        predictions = predictions.to_numpy()

        # One error per state component, all with the same metric.
        errors = []
        for k in range(len(target_list)):
            predicted = predictions[:, k]
            reference = test_data[0, :, k]
            if metric_function == "wasserstein":
                err = wasserstein_distance(predicted, reference)
            elif metric_function == "dtw":
                err, _path = fastdtw(predicted, reference, dist=euclidean)
                # normalise by the squared L2 norm of the reference signal
                err /= np.linalg.norm(reference, 2) ** 2
            else:  # "mse"
                err = predict_lmmNet.compute_MSE(predictions, test_data[0], k)
            errors.append(err)
        error_list.append(tuple(errors))

        # plot
        plt.figure(figsize=(20, 10))
        plt.plot(time_points, test_data[0, :, 0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0, :, 1], 'y.', label='x_2')
        plt.plot(time_points, test_data[0, :, 2], 'g.', label='x_3')
        plt.plot(time_points, predictions[:, 0], 'b--', label='predicted dynamics')
        plt.plot(time_points, predictions[:, 1], 'b--')
        plt.plot(time_points, predictions[:, 2], 'b--')
        plt.title(str(xi) + " " + str(yi) + " " + str(zi))
        plt.legend()
        plt.show()

    return error_list
def get_univariate_quant_metrics(self, dataset=TRAIN, transformed=False, verbose=True,
                                 thin_model=None, thin_true=None, seed=None, n=None):
    """
    Calculates quantitative metrics for the difference between p(t) and
    p_model(t) and the difference between p(y) and p_model(y)

    :param dataset: dataset subset to evaluate on (train, val, or test)
    :param transformed: If True, use transformed version of data. If False,
        use original (non-transformed) version of data.
    :param verbose: forwarded to ``self.get_data``
    :param thin_model: thinning interval for the model data
    :param thin_true: thinning interval for the real data
    :param seed: seed for sample from generative model
    :param n: not used by this method
    :return: dict with, for both t and y, the Kolmogorov-Smirnov p-value
        (``*_ks_pval``), the Epps-Singleton p-value (``*_es_pval``) and the
        Wasserstein-1 distance (``*_wasserstein1_dist``) between the model
        sample and the true data
    """
    model_sample = self.sample(seed=seed, untransform=(not transformed))
    _, t_model, y_model = to_np_vectors(model_sample, thin_interval=thin_model)

    _, t_true, y_true = self.get_data(transformed=transformed, dataset=dataset,
                                      verbose=verbose)
    t_true, y_true = to_np_vectors((t_true, y_true), thin_interval=thin_true)

    # jitter for numerical stability
    t_true = t_true.copy() + np.random.rand(*t_true.shape) * 1e-6
    t_model = t_model.copy() + np.random.rand(*t_model.shape) * 1e-6

    comparisons = (
        ("_ks_pval", lambda a, b: stats.ks_2samp(a, b).pvalue),
        ("_es_pval", lambda a, b: stats.epps_singleton_2samp(a, b).pvalue),
        ("_wasserstein1_dist", stats.wasserstein_distance),
    )

    metrics = {}
    for label, compare in comparisons:
        metrics[T + label] = float(compare(t_model, t_true))
        metrics[Y + label] = float(compare(y_model, y_true))
    return metrics
def clustering(init_rec_field1, fin_rec_field1, dst, mode='RGB', show=True):
    """
    Dendrograms with earth mover's distance.

    For the first-generation and the last-generation receptive fields, the
    pairwise earth mover distance between the 16 fields is computed, the
    fields are clustered agglomeratively (5 clusters, average linkage) and a
    dendrogram is drawn.

    :param init_rec_field1: first-generation receptive fields, indexed as
        ``[:, :, channel, field]``
    :param fin_rec_field1: last-generation receptive fields, same layout
    :param dst: not used by this function
    :param mode: 'RGB' uses the first 3 channels, anything else the first 4
    :param show: when equal to True each dendrogram is shown with ``plt.show()``
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy as scp
    from sklearn.cluster import AgglomerativeClustering
    from scipy.stats import wasserstein_distance
    from scipy.cluster.hierarchy import dendrogram, linkage

    n_fields = 16
    channels = 3 if mode == 'RGB' else 4

    def _cluster_and_plot(rec_field, title):
        # Flatten every field once instead of once per pair.
        flat = [rec_field[:, :, :channels, k].flatten() for k in range(n_fields)]

        # The distance is symmetric with a zero diagonal, so only the upper
        # triangle is computed and then mirrored.
        dist = np.zeros(shape=(n_fields, n_fields))
        for i in range(n_fields):
            for j in range(i + 1, n_fields):
                dist[i, j] = dist[j, i] = wasserstein_distance(flat[i], flat[j])

        # The cluster labels themselves are not used further down.
        cluster = AgglomerativeClustering(n_clusters=5, affinity='precomputed', linkage="average")
        cluster.fit_predict(dist)

        # plot dendrogram
        sq_dist = scp.spatial.distance.squareform(dist)
        linkage_matrix = linkage(sq_dist, "average")
        dendrogram(linkage_matrix)
        plt.title(title)
        if show == True:
            plt.show()

    # first gen: generate clusters and dendrograms with earth mover distance
    _cluster_and_plot(
        init_rec_field1,
        "[DENDOGRAM RECEPTIVE FIELDS, FIRST GENERATION]: earth mover distance, linkage 'avg'.")

    # last gen: generate clusters and dendrograms with earth mover distance
    _cluster_and_plot(
        fin_rec_field1,
        "[DENDOGRAM RECEPTIVE FIELDS, LAST GENERATION]: earth mover distance, linkage 'avg'.")
def calc_distance(features: {}, shape_features: {}) -> {}:
    """
    It determines the closest shape to the query shape by computing a custom
    distance function between the features of the dataset's shapes and the
    features of the query shape.

    Scalar features are compared with the Euclidean distance, histogram
    features (first element of the stored entry) with the Wasserstein
    distance; the per-feature distances are combined as a weighted sum.
    ----------------------------
    Args:
        features (obj: 'dict'): The dictionary containing the feature metrics of the shapes
        shape_features (obj: 'dict'): The dictionary containing the feature metrics of the query shape

    Returns:
        similarities (obj: 'dict'): The dictionary mapping each shape id (key)
                                    to its distance to the query shape (value)
    """
    # (weight key, feature key) pairs, in the order the terms are summed
    scalar_terms = (
        ("w_v", "volume"),
        ("w_a", "area"),
        ("w_c", "compactness"),
        ("w_bb", "bbox_volume"),
        ("w_d", "diameter"),
        ("w_e", "eccentricity"),
    )
    histogram_terms = (
        ("w_A3", "A3"),
        ("w_D1", "D1"),
        ("w_D2", "D2"),
        ("w_D3", "D3"),
        ("w_D4", "D4"),
    )

    similarities = {}
    # NOTE(review): allow_pickle=True unpickles the file; only load trusted data.
    weights = np.load(s.SAVED_DATA + "distance_weights.npy", allow_pickle=True).item()

    for shape_id, shape_entry in features.items():
        # Distance is the square root of the sum of squared differences
        scalar_dists = [
            distance.euclidean(shape_entry[key], shape_features.get(key))
            for _, key in scalar_terms
        ]
        histogram_dists = [
            wasserstein_distance(shape_entry[key][0], shape_features.get(key)[0])
            for _, key in histogram_terms
        ]

        weight_keys = [w for w, _ in scalar_terms + histogram_terms]
        all_dists = scalar_dists + histogram_dists
        similarity = sum(weights[w] * d for w, d in zip(weight_keys, all_dists))
        similarities[shape_id] = similarity

    return similarities
def behavior_comparison():
    """
    Simulate three mitigation behaviors on three networks and compare outcomes.

    For every (network, behavior) pair 50 simulations are run; the share of
    nodes whose first SIR row is still positive at the end is recorded per
    run. The Wasserstein distance between the score distributions of the
    second and third behavior is printed for each network, followed by the
    matrix of average scores.
    """
    networks = (
        ('Caveman-50-10', fio.read_network('networks/cavemen-50-10.txt')),
        ('Elitist-500', fio.read_network('networks/elitist-500.txt')),
        ('CGG-500', fio.read_network('networks/cgg-500.txt'))
    )
    num_sims = 50
    num_behaviors = 3
    distributions = []
    averages = np.zeros((len(networks), num_behaviors))
    loop = tqdm(total=len(networks) * num_behaviors * num_sims)

    for i, (n_name, net) in enumerate(networks):
        behaviors = (
            ('No Mitigations', behavior.NoMitigation()),
            ('Generic Pressure R=1', behavior.SimplePressureBehavior(net, rng=RNG, radius=1)),
            ('Edge Pressure R=1', behavior.SimpleEdgePressureBehavior(net, rng=RNG, radius=1))
        )
        # The loop variable must not be called `behavior`: that would make the
        # name local to this function and the `behavior.NoMitigation()` lookup
        # above would raise UnboundLocalError.
        for j, (b_name, update_behavior) in enumerate(behaviors):
            s_scores = []
            for _ in range(num_sims):
                loop.set_description(f'{n_name}, {b_name}')
                end_sir = simulate(net.M,
                                   sir0=make_starting_sir(net.N, 1, rng=RNG),
                                   disease=Disease(4, 0.3),
                                   update_connections=update_behavior,
                                   max_steps=200,
                                   rng=RNG)[-1]
                s_scores.append(np.sum(end_sir[0, :] > 0)/net.N)
                loop.update()
            averages[i, j] = sum(s_scores)/len(s_scores)
            distributions.append(s_scores)
    loop.close()

    # distributions is ordered network-major: indices 3*i + 1 and 3*i + 2 are
    # the two pressure behaviors of network i.
    print(wasserstein_distance(distributions[1], distributions[2]))
    print(wasserstein_distance(distributions[4], distributions[5]))
    print(wasserstein_distance(distributions[7], distributions[8]))

    np.set_printoptions(precision=3, suppress=True)
    print(averages)
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances

    A StandardScaler + RandomForestRegressor pipeline is fitted on (X, y) and
    1000 subsamples (80% of the rows, without replacement) are compared with
    the full data. ``X`` is accessed through ``.values`` and a boolean row
    mask; ``y`` through ``.size`` and a boolean mask.
    """
    X_pre = X
    xpipe = Pipeline(steps=[('scaler', StandardScaler()),
                            ('rf', RandomForestRegressor())])
    xpipe.fit(X_pre, y)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    # Loop-invariant values are computed once.
    n_rows = X.shape[0]
    n_samples = int(np.round(0.8 * n_rows))
    row_ids = np.arange(y.size)
    source_values = X_pre.values.flatten()

    for b in range(bs_samples):
        subset_indices = np.random.choice(np.arange(n_rows), n_samples,
                                          replace=False).astype(int)
        # np.isin replaces the deprecated np.in1d (same result for 1-D input).
        mask = np.isin(row_ids, subset_indices)
        y_bs = y[mask]
        X_bs = X[mask]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(source_values, X_bs.values.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        # percentage of predictions that are not exactly 1
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    upper = int(0.975 * bs_samples)
    lower = int(0.025 * bs_samples)

    outliers_X.sort()
    outlier_X_threshold = outliers_X[upper] + outliers_X[lower]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[upper] + wasserstein_X[lower]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[upper] + wasserstein_y[lower]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "pipe_X": xpipe,
        "X_source": X_pre,
        "y_source": y,
        "latest_X": X,
        "latest_y": y,
    }
    return (to_return)
def detect(self,
           X: Union[np.ndarray, pd.DataFrame],
           y: Union[np.ndarray, pd.Series]) -> Dict[str, Union[float, np.ndarray]]:
    """
    Determine outlier and distance thresholds from bootstrap resamples.

    A PCA + EllipticEnvelope pipeline (seeded with ``self.seed``) is fitted on
    X; 1000 resamples of 80% of the rows, drawn with replacement from a
    generator seeded with ``self.seed + b``, are compared with the full data.

    Returns a dict with the rounded thresholds ``outlier_X``,
    ``wasserstein_X`` and ``wasserstein_y``.

    NOTE: for classification the outlier detection on y is not needed
    """
    # Work on plain arrays regardless of the pandas wrapper.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values

    contamination: float = 0.01
    pca_step = ('pca', PCA(2, random_state=self.seed))
    envelope_step = ('clf', EllipticEnvelope(random_state=self.seed,
                                             contamination=contamination))
    xpipe = Pipeline(steps=[pca_step, envelope_step])
    xpipe.fit(X)

    bs_samples: int = 1000
    n_rows = X.shape[0]
    n_samples = int(np.round(0.80 * n_rows))
    source_values = X.flatten()

    outliers_X: np.ndarray = np.zeros(bs_samples)
    wasserstein_X: np.ndarray = np.zeros(bs_samples)
    wasserstein_y: np.ndarray = np.zeros(bs_samples)

    for b in range(bs_samples):
        # set random seed
        rng = np.random.default_rng(self.seed + b)
        subset_indices = rng.choice(np.arange(n_rows), n_samples,
                                    replace=True).astype(int)
        X_bs = X[subset_indices, :]
        y_bs = y[subset_indices]

        flags = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(source_values, X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (flags[flags == 1].size / flags.size))

    ## determine thresholds as a function of the confidence intervals
    upper = int(0.975 * bs_samples)
    lower = int(0.025 * bs_samples)

    def _threshold(values: np.ndarray) -> float:
        ordered = np.sort(values)
        return ordered[upper] + ordered[lower]

    result = {
        "outlier_X": np.round(_threshold(outliers_X), 2),
        "wasserstein_X": np.round(_threshold(wasserstein_X), 2),
        "wasserstein_y": np.round(_threshold(wasserstein_y), 2),
    }
    return result
def get_monitoring_tools(df):
    """
    Determine outlier and distance thresholds from the engineered features.

    Features are produced by ``engineer_features(df)``; a PCA +
    EllipticEnvelope pipeline is fitted on them and 549 bootstrap resamples
    (80% of the rows, with replacement) are compared with the full data.

    Returns a dict with the thresholds, the fitted outlier pipeline and the
    source / latest data.

    NOTE: for classification the outlier detection on y is not needed
    """
    X, y, _dates = engineer_features(df)
    X1 = X.to_numpy()

    xpipe = Pipeline(steps=[
        ('pca', PCA(2)),
        ('clf', EllipticEnvelope(random_state=0, contamination=0.01)),
    ])
    xpipe.fit(X1)

    bs_samples = 549
    n_samples = int(np.round(0.80 * X.shape[0]))
    row_ids = np.arange(X1.shape[0])
    source_values = X1.flatten()

    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        subset_indices = np.random.choice(row_ids, n_samples,
                                          replace=True).astype(int)
        X_bs = X1[subset_indices, :]
        y_bs = y[subset_indices]

        flags = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(source_values, X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (flags[flags == 1].size / flags.size))

    upper = int(0.975 * bs_samples)
    lower = int(0.025 * bs_samples)

    def _threshold(values):
        # sum of the 97.5% and 2.5% order statistics
        ordered = np.sort(values)
        return ordered[upper] + ordered[lower]

    to_return = {
        "outlier_X": np.round(_threshold(outliers_X), 1),
        "wasserstein_X": np.round(_threshold(wasserstein_X), 2),
        "wasserstein_y": np.round(_threshold(wasserstein_y), 2),
        "clf_X": xpipe,
        "X_source": X1,
        "y_source": y,
        "latest_X": X,
        "latest_y": y,
    }
    return (to_return)
def report_harmonic_onestep(metric_function):
    """
    Train one-step random-forest models on the harmonic system and report test errors.

    Ten times, two training trajectories are simulated from random initial
    conditions and combined; a model is trained, integrated on the test
    trajectory (xinit=1, yinit=0), scored per component, and plotted.

    :param metric_function: "wasserstein", "dtw" or "mse"
    :return: list of ``(e1, e2)`` tuples, one per run
    :raises ValueError: if ``metric_function`` is not a supported metric
        (checked before any simulation or training is done)
    """
    supported = ("wasserstein", "dtw", "mse")
    if metric_function not in supported:
        raise ValueError(
            "metric_function must be one of {}, got {!r}".format(supported, metric_function))

    error_list = []
    time_points, test_data = harmonic.simulate_custom(xinit=1, yinit=0)

    feature_list = []  # here we do not have external time-series or control variables
    target_list = ['x_component', 'y_component']
    df_test = create_data(test_data, time_points, cols=target_list, num=2)

    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(0, 4, 2)
        yi = np.random.uniform(0, 4, 2)

        time_points, data1 = harmonic.simulate_custom(xinit=xi[0], yinit=yi[0])
        time_points, data2 = harmonic.simulate_custom(xinit=xi[1], yinit=yi[1])

        # now generate and augment the training dataset
        df1 = create_data(data1, time_points, cols=target_list, num=0)
        df2 = create_data(data2, time_points, cols=target_list, num=1)
        df = pd.concat([df1, df2])
        df_train = train_onestep.generate_dataset(df, [0, 1], feature_list, target_list, n_dim=30000)

        rf_model = RandomForestRegressor(n_estimators=20)
        figure_path = './plots/'
        rf_dict, score_dict = train_onestep.train_classic(
            df_train, rf_model, plot=False, model_type='random_forest', figure_path=figure_path)
        time_points, predictions = predict_onestep.predict_integrate(
            df_test, df, rf_dict, target_list, feature_list, title='test', plot=False,
            model_type='random_forest', subplots=(2, 1), bio=False)
        predictions = predictions.to_numpy()

        # One error per state component, all with the same metric.
        errors = []
        for k in range(len(target_list)):
            predicted = predictions[:, k]
            reference = test_data[0, :, k]
            if metric_function == "wasserstein":
                err = wasserstein_distance(predicted, reference)
            elif metric_function == "dtw":
                err, _path = fastdtw(predicted, reference, dist=euclidean)
                # normalise by the squared L2 norm of the reference signal
                err /= np.linalg.norm(reference, 2) ** 2
            else:  # "mse"
                err = predict_lmmNet.compute_MSE(predictions, test_data[0], k)
            errors.append(err)
        error_list.append(tuple(errors))

        # plot
        plt.plot(time_points, test_data[0, :, 0], 'r.', label='x_1')
        plt.plot(time_points, test_data[0, :, 1], 'y.', label='x_2')
        plt.plot(time_points, predictions[:, 0], 'b--', label='predicted x_1')
        plt.plot(time_points, predictions[:, 1], 'b--', label='predicted x_2')
        plt.title(str(xi) + " " + str(yi))
        plt.legend()
        plt.show()

    return error_list
def predict(self, seg_list, **kwargs):
    """
    Print earth mover distances between consecutive segmentations.

    For every pair of neighbouring entries in ``seg_list`` the two 2-D
    segmentations are scaled by their own maximum, flattened and compared
    with the Wasserstein distance. Then, for every distinct value in the
    first and every distinct value in the second, the distance between the
    two boolean label masks is printed.

    :param seg_list: sequence of 2-D segmentation arrays
    :param kwargs: accepted but not used
    :return: None (results are printed)
    """
    for idx in range(len(seg_list) - 1):
        seg_a = seg_list[idx]
        seg_b = seg_list[idx + 1]
        # Unpacking into two names also requires the segmentation to be 2-D.
        w, h = tuple(seg_a.shape)
        seg_a = np.reshape(
            np.array(seg_a, dtype=np.float32) / np.max(seg_a), (w * h))
        seg_b = np.reshape(
            np.array(seg_b, dtype=np.float32) / np.max(seg_b), (w * h))

        print("A->B:{}".format(wasserstein_distance(seg_a, seg_b)))

        # The labels of B do not depend on the label of A: find them once.
        b_labels = np.unique(seg_b)
        for a_idx in np.unique(seg_a):
            seg_a_label = seg_a == a_idx
            for b_idx in b_labels:
                seg_b_label = seg_b == b_idx
                print("A({})->B({}):{}".format(
                    a_idx, b_idx,
                    wasserstein_distance(seg_a_label, seg_b_label)))
def compute_statistics(output_adj, output_coord, output_seq_len, y_adj, y_coord, y_seq_len, lamb=0.5):
    r"""
    Compute statistics for the current data point.

    :param output_adj: predicted A
    :param output_coord: predicted X
    :param output_seq_len: predicted |V|
    :param y_adj: target A
    :param y_coord: target X
    :param y_seq_len: target |V|
    :param lamb: lambda parameter for the loss in this experiment
    :return: streetmover, loss, loss_adj, loss_coord, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
    """
    # nx.from_numpy_matrix was removed in NetworkX 3.0; prefer from_numpy_array
    # and only fall back to the old name on releases that lack it.
    graph_from_matrix = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix

    output_A = decode_adj(output_adj[0, :output_seq_len - 2].cpu().numpy())  # not include the last 1)
    y_A = decode_adj(y_adj[0, :y_seq_len - 2].cpu().numpy())
    output_nodes = output_coord[0, :output_seq_len - 2]
    y_nodes = y_coord[0, :y_seq_len - 2]
    output_graph = graph_from_matrix(output_A)
    y_graph = graph_from_matrix(y_A)

    # Decoded adjacency and node slices must agree on the number of nodes.
    assert output_A.shape[0] == output_nodes.shape[0] == output_seq_len - 2
    assert y_A.shape[0] == y_nodes.shape[0] == y_seq_len - 2

    output_n_edges = output_adj.reshape(-1).sum()
    y_n_edges = y_adj.reshape(-1).sum()

    output_degree = get_degree_hist(output_graph)
    y_degree = get_degree_hist(y_graph)
    dist_degree = wasserstein_distance(output_degree, y_degree)

    output_diam = get_diameters(output_graph)
    y_diam = get_diameters(y_graph)
    # With no predicted diameters the distance is fixed to 1.
    dist_diam = wasserstein_distance(output_diam, y_diam) if len(output_diam) > 0 else 1

    delta_n_nodes = int(output_seq_len - y_seq_len)
    delta_n_edges = (output_n_edges - y_n_edges).item()
    acc_A = get_accuracy_A(output_A, y_A)

    loss_adj = get_BCE_adj(output_adj[0], y_adj[0])
    loss_coord = get_MSE_coord(output_nodes, y_nodes)
    loss = lamb * loss_adj + (1 - lamb) * loss_coord

    (y_pc, output_pc), (streetmover, P, C) = streetmover_distance(
        y_A, y_nodes, output_A, output_nodes, n_points=100)
    # print("Streetmover distance: {:.3f}".format(streetmover.item()))

    # possibly, plot assignments and/or point clouds
    # show_assignments(y_pc, output_pc, P, title=str(streetmover.item())[:8])
    # plot_point_cloud(y_adj[0], y_coord[0], y_pc)
    # plot_point_cloud(output_adj[0], output_coord[0], output_pc)

    return streetmover.item(), loss, loss_adj, loss_coord, acc_A, delta_n_edges, delta_n_nodes, dist_degree, dist_diam
def model_monitor(country="all", dev=DEV, training=True):
    """
    Performance monitoring.

    Optionally retrains the model for ``country``, prints the RMSE of the
    model predictions on simulated samples of several sizes, and then
    reports drift statistics (Wasserstein distances and outlier shares for X
    and y) on further simulated samples of the standard-scaled data.

    :param country: key into the datasets returned by ``engineer_features``
    :param dev: forwarded to ``_model_train`` and ``model_predict``
    :param training: forwarded to ``engineer_features``; when true the model
        is (re)trained first
    :return: DataFrame with one row per simulated sample size
    """
    print("Monitor Model")

    ## import data
    datasets = engineer_features(training=training)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)

    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)

    ## monitor RMSE
    for n in [10, 20, 30, 50, 60]:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        queries = [(str(d.year), str(d.month), str(d.day), country)
                   for d in dates_new]
        y_pred = []
        for year, month, day, query_country in queries:
            prediction = model_predict(year=year, month=month, day=day,
                                       country=query_country, verbose=False,
                                       dev=dev)
            y_pred.append(prediction["y_pred"][0].round(2))
        rmse = np.sqrt(mean_squared_error(y_new.tolist(), y_pred))
        print("sample size: {}, RSME: {}".format(n, rmse.round(2)))

    ## monitor performance
    ## scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    def _outlier_share(flags):
        # share of predictions that are not labelled 1, rounded to 2 places
        return np.round(1.0 - (flags[flags == 1].size / flags.size), 2)

    results = defaultdict(list)
    for n in [25, 50, 75, 90]:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        results["sample_size"].append(n)

        dist_X = wasserstein_distance(X.flatten(), X_new.flatten())
        results['wasserstein_X'].append(np.round(dist_X, 2))
        dist_y = wasserstein_distance(y, y_new)
        results['wasserstein_y'].append(np.round(dist_y, 2))

        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size, 1))
        results["outlier_percent_X"].append(_outlier_share(test1))
        results["outlier_percent_y"].append(_outlier_share(test2))

    return pd.DataFrame(results)
def cal_emd_lose(out_float_list, out_quant_list, out_len):
    """Calculate the earth mover's distance between float and quantized outputs.

    When ``out_len`` is at least 3, each float output is compared with the
    quantized output at the same index and the distances are summed.
    Otherwise all outputs are concatenated and compared in a single call.
    The result is divided by the number of float outputs.
    """
    if out_len >= 3:
        total = sum(
            wasserstein_distance(float_out, out_quant_list[position])
            for position, float_out in enumerate(out_float_list)
        )
    else:
        merged_float = np.concatenate(out_float_list)
        merged_quant = np.concatenate(out_quant_list)
        total = wasserstein_distance(merged_float, merged_quant)
    return total / float(len(out_float_list))
def report_linear_lmmnet(metric_function):
    """Train lmmNet on randomly initialised linear systems and score each fit.

    Ten simulations are generated from random initial conditions. For each,
    a model is trained, integrated from the initial state of the default
    test trajectory, compared with that test trajectory per state variable,
    and plotted.

    :param metric_function: one of "wasserstein", "dtw" or "mse"
    :return: list of (e1, e2, e3) error tuples, one per simulation
    :raises ValueError: if ``metric_function`` is not a supported metric
    """
    supported_metrics = ("wasserstein", "dtw", "mse")
    # Reject unknown metrics before any simulation or training runs; otherwise
    # the failure only surfaces as an UnboundLocalError after the first model
    # has been trained and integrated.
    if metric_function not in supported_metrics:
        raise ValueError(
            "metric_function must be one of {}, got {!r}".format(
                supported_metrics, metric_function))

    error_list = []
    time_points, test_data = linear.simulate_default()
    reference = test_data[0]

    for _ in range(10):
        # generate data with random initial conditions
        xi = np.random.uniform(1, 4, 1)[0]
        yi = np.random.uniform(0, 4, 1)[0]
        zi = np.random.uniform(1, 2, 1)[0]

        time_points, train_data = linear.simulate_custom(xinit=xi, yinit=yi, zinit=zi)
        model = train_lmmNet.train_easy(time_points, train_data)

        x0 = test_data[0, 0, :]  # initial conditions
        predictions = odeint(
            lambda x, t: predict_lmmNet.predict_fn(x, t, model), x0, time_points)

        if metric_function == "wasserstein":
            errors = tuple(
                wasserstein_distance(predictions[:, k], reference[:, k])
                for k in range(3)
            )
        elif metric_function == "dtw":
            # DTW cost is normalised by the squared L2 norm of the reference.
            errors = tuple(
                fastdtw(predictions[:, k], reference[:, k], dist=euclidean)[0]
                / np.linalg.norm(reference[:, k], 2) ** 2
                for k in range(3)
            )
        else:  # "mse"
            errors = tuple(
                predict_lmmNet.compute_MSE(predictions, reference, k)
                for k in range(3)
            )
        error_list.append(errors)

        # plot
        plt.figure(figsize=(20, 10))
        plt.plot(time_points, reference[:, 0], 'r.', label='x_1')
        plt.plot(time_points, reference[:, 1], 'y.', label='x_2')
        plt.plot(time_points, reference[:, 2], 'g.', label='x_3')
        plt.plot(time_points, predictions[:, 0], 'b--', label='predicted dynamics')
        plt.plot(time_points, predictions[:, 1], 'b--')
        plt.plot(time_points, predictions[:, 2], 'b--')
        plt.title(str(xi) + " " + str(yi) + " " + str(zi))
        plt.legend()
        plt.show()

    return error_list
def calc_distance_matrix(X, method):
    """Return the square pairwise distance matrix of the rows of ``X``.

    :param X: 2-D array-like with one sample per row
    :param method: name of the distance; see the branches below for the
        supported values
    :return: (n, n) distance matrix, or ``None`` for an unsupported method
    """
    if method in ('chebyshev', 'euclidean', 'l1', 'l2'):
        return DistanceMetric.get_metric(method).pairwise(X)

    if method == 'cosine':
        return pairwise.cosine_distances(X)

    # NOTE(review): 'kulsinski' may be unavailable in recent SciPy releases;
    # confirm against the installed version.
    if method in ('correlation', 'cityblock', 'braycurtis', 'canberra',
                  'hamming', 'jaccard', 'kulsinski'):
        return squareform(pdist(X, method))

    if method == 'minkowski3':
        # The order must be passed by keyword: pdist accepts no positional
        # arguments after the metric name.
        return squareform(pdist(X, 'minkowski', p=3))

    if method == 'dot':
        return squareform(pdist(X, lambda u, v: np.dot(u, v)))

    if method == 'emd':
        from scipy.stats import wasserstein_distance

        n_rows = len(X)
        DM = np.zeros((n_rows, n_rows))
        # The distance is symmetric, so each pair is evaluated once and
        # written to both halves of the matrix.
        for i in range(n_rows):
            for j in range(i, n_rows):
                DM[i, j] = DM[j, i] = wasserstein_distance(X[i], X[j])
        return DM

    return None
def fit(self, X, y, w):
    """Fit the parent model, then order the columns of ``self.R`` by class separation.

    The samples are split by the first unique label in ``y``, projected
    with ``self.R``, and for every projected column the weighted Wasserstein
    distance between the two groups is computed. The columns of ``self.R``
    are then rearranged from largest to smallest distance.
    """
    classes = np.unique(y)
    assert 0 < len(classes) <= 2

    super(BinaryWPCA, self).fit(X, y, w)
    assert self.R is not None

    in_first = (y == classes[0])
    in_rest = ~in_first

    proj_first = X[in_first, :] @ self.R
    proj_rest = X[in_rest, :] @ self.R
    weights_first = w[in_first]
    weights_rest = w[in_rest]
    assert proj_first.shape == proj_rest.shape

    _, n_columns = proj_first.shape
    distances = []
    for col in range(n_columns):
        distances.append(
            wasserstein_distance(
                u_values=proj_first[:, col],
                u_weights=weights_first,
                v_values=proj_rest[:, col],
                v_weights=weights_rest,
            )
        )

    # Descending order: most separating column first.
    order = np.argsort(distances)[::-1]
    self.R = self.R[:, order]
    self.is_fitted = True
def get_dist(outf, layer, result_j, predicted, sample_size, criteria):
    """Score each query row by its Wasserstein distance to sampled training rows.

    :param outf: path prefix of the stored ``.npy`` / ``.pkl`` artefacts
    :param layer: layer index; selects the stored training array and, for
        ``layer < 44``, the stored PCA model applied to ``result_j``
    :param result_j: tensor of query rows (converted with ``.cpu().detach().numpy()``)
    :param predicted: predicted label per query row
    :param sample_size: maximum number of training rows sampled per query
    :param criteria: ``'mean'``, ``'min'``, or anything else for the median
    :return: 1-D array with one aggregated distance per query row
        (0 when no training row carries the predicted label)
    """
    final_tr = np.load(outf + "emp_bnn_train_" + str(layer) + ".npy")
    predicted_tr = np.load(outf + "labels_bnn_train.npy")

    if layer < 44:
        # NOTE: unpickling runs arbitrary code; only load trusted model files.
        # The context manager closes the file handle, which was leaked before.
        with open(outf + "pca_bnn" + str(layer) + ".pkl", "rb") as pca_file:
            pca_model = pk.load(pca_file)
        # Only the query rows are projected here; the stored training array
        # is used as loaded.
        final_adv = pca_model.transform(result_j.cpu().detach().numpy())
    else:
        final_adv = result_j.cpu().detach().numpy()

    n_queries = final_adv.shape[0]
    distance = np.zeros(n_queries)
    for i in range(n_queries):
        # Training rows that share the label predicted for this query.
        class_rows = final_tr[predicted_tr == int(predicted[i])]
        n_available = class_rows.shape[0]
        picked = np.random.choice(
            n_available, min(sample_size, n_available), replace=False)
        references = class_rows[picked,]

        dist = np.zeros(references.shape[0])
        for k, reference in enumerate(references):
            dist[k] = wasserstein_distance(final_adv[i, :], reference)

        if len(dist) == 0:
            dis_adv = 0
        elif criteria == 'mean':
            dis_adv = dist.mean()
        elif criteria == 'min':
            dis_adv = dist.min()
        else:
            dis_adv = np.median(dist)
        distance[i] = dis_adv
    return distance
def triplet_analysis(df, categories=CATEGORIES):
    """Write, per category, the pairwise 'alpha' distances within each group.

    For every ``category`` the de-duplicated frame is grouped by all the
    other categories. Within each group, every pair of distinct values of
    ``category`` whose 'alpha' samples both have more than 25 entries is
    compared with the Wasserstein distance. Groups with at least one such
    pair are written to ``triplets_<category>.csv`` (spaces replaced by
    underscores), with pairs listed in descending order.

    :param df: DataFrame with the category columns and an 'alpha' column
    :param categories: sequence of category column names
    """
    df2 = df.drop_duplicates()
    for category in categories:
        # Copy into a list so tuples work too and the caller's sequence is
        # never mutated.
        groupby_cols = list(categories)
        groupby_cols.remove(category)

        fname = ('triplets_%s.csv' % category).replace(' ', '_')
        with open(fname, 'w') as f:
            for name, group in df2.groupby(groupby_cols):
                values = group[category]
                alphas = group['alpha']

                write_items = []
                for val1, val2 in combinations(values.unique(), 2):
                    sample1 = alphas[values == val1]
                    sample2 = alphas[values == val2]
                    n1 = len(sample1)
                    n2 = len(sample2)
                    if n1 > 25 and n2 > 25:
                        dist = wasserstein_distance(sample1, sample2)
                        write_items.append((dist, n1, n2, val1, val2))

                if not write_items:
                    continue

                # The group key is a scalar or a tuple depending on pandas
                # version and number of grouping columns, and may hold
                # non-string values; normalise before joining.
                key_parts = name if isinstance(name, tuple) else (name,)
                header = ', '.join(str(part) for part in key_parts)

                f.write('-----------------------\n')
                f.write(header + '\n')
                f.write('-----------------------\n')
                for dist, n1, n2, val1, val2 in sorted(write_items, reverse=True):
                    f.write('%s (%d), %s (%d), %f\n' % (val1, n1, val2, n2, dist))
def date_dist_scores(ref_timeline, ground_truth, p_val=.05):
    '''
    Compare the date distribution of a generated timeline with the ground
    truth using the two-sample Kolmogorov-Smirnov test and the first
    Wasserstein distance (earth mover's distance).

    Dates are converted to timestamps and min-max scaled with a scaler
    fitted on the ground-truth dates only.

    Returns a dict with the KS statistic, a 0/1 flag telling whether the
    KS p-value is below p_val, and the Wasserstein distance
    '''
    def _timestamp_column(timeline):
        # One timestamp per date, shaped as a single-feature column.
        stamps = [time.mktime(day.timetuple()) for day in timeline.get_dates()]
        return np.array(stamps).reshape(-1, 1)

    gt_column = _timestamp_column(ground_truth)
    ref_column = _timestamp_column(ref_timeline)

    scaler = MinMaxScaler()
    gt_scaled = scaler.fit_transform(gt_column)[:, 0]
    ref_scaled = scaler.transform(ref_column)[:, 0]

    ks_test = stats.ks_2samp(gt_scaled, ref_scaled)
    emd = stats.wasserstein_distance(gt_scaled, ref_scaled)

    # ks_signif is 1 when the difference in date distribution between the
    # ground truth and the generated timeline is statistically significant
    significant = int(ks_test.pvalue < p_val)
    return {
        'ks_stat': ks_test.statistic,
        'ks_signif': significant,
        'earth_movers_distance': emd
    }
def wasserstein_test(u_values, v_values, bootstraps=999, use_gamma_model=True):
    """Permutation test on the Wasserstein distance between two samples.

    Based on the test outlined in https://github.com/cdowd/twosamples.

    The observed distance is compared with distances obtained by repeatedly
    shuffling the pooled values and re-splitting them at the size of
    ``u_values``. The p-value is either the pseudocount-corrected share of
    shuffled distances at least as large as the observed one, or the
    survival function of a gamma distribution fitted to them.

    :return: (observed distance, direction, p-value)
    """
    observed, direction = wasserstein_distance_and_direction(
        u_values, v_values)

    # Under the null hypothesis both samples come from one distribution, so
    # reshuffled splits of the pooled values give the expected distances.
    pooled = np.concatenate([u_values, v_values])
    split_at = len(u_values)

    def _shuffled_distance():
        np.random.shuffle(pooled)
        return stats.wasserstein_distance(pooled[:split_at], pooled[split_at:])

    null_distances = np.array([_shuffled_distance() for _ in range(bootstraps)])

    if use_gamma_model:
        # fit a gamma distribution to the expected distances and read the
        # p-value off its survival function
        fitted = stats.gamma(*stats.gamma.fit(null_distances))
        p_val = fitted.sf(observed)
    else:
        # empirical p-value with pseudocount
        n_extreme = (null_distances >= observed).sum()
        p_val = (n_extreme + 1) / (bootstraps + 1)

    return observed, direction, p_val
def get_decision(layer, result_j, predicted, thrd, sample_size, criteria):
    """Flag query rows whose distance to same-label training rows exceeds a threshold.

    :param layer: layer index; selects the stored training activations and,
        for ``layer < 43``, the stored PCA model applied to both the
        training rows and ``result_j``
    :param result_j: tensor of query rows (converted with ``.cpu().detach().numpy()``)
    :param predicted: predicted label per query row
    :param thrd: per-label thresholds, indexed by the integer label
    :param sample_size: maximum number of training rows sampled per query
    :param criteria: ``'mean'``, ``'min'``, or anything else for the median
    :return: 1-D array with 1 where the aggregated distance exceeds the
        label's threshold and 0 elsewhere (including rows whose label has
        no training rows)
    """
    final_tr = torch.load("./hidden_output/emp_bnn_train_" + str(layer)).cpu().detach().numpy()
    predicted_tr = np.load("./data/predicts_bnn_train.npy")

    if layer < 43:
        # NOTE: unpickling runs arbitrary code; only load trusted model files.
        # The context manager closes the file handle, which was leaked before.
        with open("./data/pca_bnn" + str(layer) + ".pkl", "rb") as pca_file:
            pca_model = pk.load(pca_file)
        final_tr = pca_model.transform(final_tr)
        final_adv = pca_model.transform(result_j.cpu().detach().numpy())
    else:
        final_adv = result_j.cpu().detach().numpy()

    n_queries = final_adv.shape[0]
    decision = np.zeros(n_queries)
    for i in range(n_queries):
        label = int(predicted[i])
        # Training rows that share the label predicted for this query.
        class_rows = final_tr[predicted_tr == label]
        n_available = class_rows.shape[0]
        picked = np.random.choice(
            n_available, min(sample_size, n_available), replace=False)
        references = class_rows[picked,]

        dist = np.zeros(references.shape[0])
        for k, reference in enumerate(references):
            dist[k] = wasserstein_distance(final_adv[i, :], reference)

        if dist.size == 0:
            # No reference rows for this label: nothing to compare against,
            # so leave the row unflagged instead of failing on an empty
            # reduction (``min`` raised; mean/median produced NaN).
            continue

        if criteria == 'mean':
            dis_adv = dist.mean()
        elif criteria == 'min':
            dis_adv = dist.min()
        else:
            dis_adv = np.median(dist)

        if dis_adv > thrd[label]:
            decision[i] = 1
    return decision