def test_load_data(dataset_name, perp=30, early_stop=""):
    import numpy as np

    from run_plots import _simple_scatter, _simple_loss

    _, _, y = dataset.load_dataset(dataset_name)
    embedding_dir = f"{dir_path}/tmp/{dataset_name}"
    out_name = f"{embedding_dir}/{perp}{early_stop}"
    print("\nTest loading saved data from ", out_name)

    loaded = joblib.load(filename=f"{out_name}.z")
    _simple_scatter(Z=loaded["embedding"], out_name=f"{out_name}_scatter.png", labels=y)

    losses = loaded["progress_errors"]
    if losses is not None:
        losses = losses[np.where(losses > 0.0)]
        _simple_loss(loss=losses, out_name=f"{out_name}_loss.png", figsize=(6, 3))

    print(loaded.keys())
    for k, v in loaded.items():
        if k not in ["embedding", "error_per_point", "progress_errors"]:
            print(k, v)
def gen_constraints(dataset_name, n_sim, n_dis):
    # use the function argument, not the global `args`, and the
    # `gen_similar_links` spelling used elsewhere in the repo
    _, _, labels = dataset.load_dataset(dataset_name)
    sim_links = gen_similar_links(labels, n_sim)
    dis_links = gen_dissimilar_links(labels, n_dis)
    out_name = f"{LINK_DIR}/auto_{dataset_name}_{n_sim}sim_{n_dis}dis.pkl"
    joblib.dump(sim_links + dis_links, out_name)
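# A minimal sketch of the link generators used above (the `_sketch_` name is
# hypothetical; the repo's real gen_similar_links/gen_dissimilar_links live
# elsewhere). It assumes the contract checked in test_auto_generated_constraints:
# each link is [p0, p1, link_type] with link_type = +1 for same-label pairs;
# the dissimilar variant is analogous with labels[p0] != labels[p1] and
# link_type = -1. A plain list is returned so `sim_links + dis_links`
# concatenates; the `sim[:, 0]` indexing in extract_qij_for_some_pairs
# suggests the real version may return an ndarray instead.
def _sketch_gen_similar_links(labels, n_links, include_link_type=True, rng=None):
    import numpy as np

    rng = rng or np.random.default_rng()
    links = []
    while len(links) < n_links:
        # draw a random pair of distinct point indices
        p0, p1 = rng.choice(len(labels), size=2, replace=False)
        if labels[p0] == labels[p1]:  # keep only same-label (similar) pairs
            links.append([p0, p1, +1] if include_link_type else [p0, p1])
    return links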
def simple_run(dataset_name, n_iters, lr):
    mlflow.log_param('dataset_name', dataset_name)
    mlflow.log_param('n_iters', n_iters)
    mlflow.log_param('lr', lr)

    plot_args = {
        'prefix': f'{dataset_name}-{lr}-{n_iters}',
        'suffix': '.png',
        'save_to_file': SAVE_FIGURES,
        'track_with_mlflow': TRACK_FLOW,
    }
    X_original, X, y = dataset.load_dataset(dataset_name)
    run_ppca(X_original, X, y, lr, n_iters, plot_args)
def extract_qij_for_some_pairs(
    dataset_name, normalized=False, use_log=False, base_perp=None, list_n_constraints=[]
):
    _, X, labels = dataset.load_dataset(dataset_name)
    perps = range(1, X.shape[0] // 3)

    # store all Q for different numbers of constraints
    Q_sim_all = defaultdict(list)
    Q_dis_all = defaultdict(list)

    embedding_type = "normal" if base_perp is None else "chain"

    # for each embedding, pick different pairs with different numbers of constraints
    for perp in perps:
        file_name = f"{perp}.z" if base_perp is None else f"{base_perp}_to_{perp}.z"
        in_file = os.path.join(dir_path, embedding_type, dataset_name, file_name)
        print(in_file)
        data = joblib.load(in_file)

        Q = compute_Q(data["embedding"])
        if use_log:
            Q = -np.log(Q)
        if normalized:
            Q /= Q.max()
        Q = squareform(Q)

        # store the q_ij for this `perp` for each number of constraints
        # in Q_*_all[n_constraints]
        for n_constraints in list_n_constraints:
            sim = gen_similar_links(labels, n_constraints, include_link_type=False)
            dis = gen_dissimilar_links(labels, n_constraints, include_link_type=False)
            Q_sim = Q[sim[:, 0], sim[:, 1]]
            Q_dis = Q[dis[:, 0], dis[:, 1]]
            Q_sim_all[n_constraints].append(Q_sim)
            Q_dis_all[n_constraints].append(Q_dis)
        del Q

    # store all q_ij for both sim and dis links,
    # together with the list of all calculated perplexities
    out_name = (
        f"./q_ij/{dataset_name}"
        f"_{embedding_type}{base_perp if base_perp else ''}"
        f"{'_log' if use_log else ''}"
        f"{'_normalized' if normalized else ''}"
    )
    joblib.dump([perps, Q_sim_all, Q_dis_all], out_name)
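# A minimal sketch of compute_Q (the `_sketch_` name is hypothetical),
# assuming, as the squareform() call above suggests, that it returns the
# condensed vector of t-SNE low-dimensional affinities q_ij: a Student-t
# kernel over pairwise distances in the embedding, normalized to sum to one.
# The repo's real compute_Q may differ in normalization details.
def _sketch_compute_Q(Z):
    import numpy as np
    from scipy.spatial.distance import pdist

    dist2 = pdist(Z, "sqeuclidean")  # condensed pairwise squared distances
    W = 1.0 / (1.0 + dist2)          # Student-t kernel (one degree of freedom)
    return W / W.sum()               # normalize so all q_ij sum to 1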
def examine_qij(dataset_name, writer=None, base_perp=None):
    _, X, _ = dataset.load_dataset(dataset_name)
    embedding_type = "normal" if base_perp is None else "chain"
    for perp in range(1, X.shape[0] // 3):
        file_name = f"{perp}.z" if base_perp is None else f"{base_perp}_to_{perp}.z"
        in_file = os.path.join(dir_path, embedding_type, dataset_name, file_name)
        data = joblib.load(in_file)

        Q = compute_Q(data["embedding"])
        nlogQ = -np.log(Q)
        nlogQ_normalized = nlogQ / np.max(nlogQ)
        if writer:
            print(f"Adding histogram for perp={perp} with {len(Q)} values")
            writer.add_histogram("qij", Q, global_step=perp)
            writer.add_histogram("qij_normalized", Q / Q.max(), global_step=perp)
            writer.add_histogram("-logqij", nlogQ, global_step=perp)
            writer.add_histogram("-logqij_normalized", nlogQ_normalized, global_step=perp)
        else:
            print("No tensorboardX debug info stored")
def plot_embeddings(run_range=None, base_perp=None, force_rewrite=False):
    _, X, y = dataset.load_dataset(dataset_name)
    for perp in range(1, X.shape[0] // 3) if run_range is None else run_range:
        for earlystop in ["", "_earlystop"]:
            if base_perp is None:
                embedding_dir = f"{dir_path}/normal/{dataset_name}"
                file_name = f"{embedding_dir}/{perp}{earlystop}"
            else:
                embedding_dir = f"{dir_path}/chain/{dataset_name}"
                file_name = f"{embedding_dir}/{base_perp}_to_{perp}{earlystop}"
            if os.path.exists(f"{file_name}_all.png") and not force_rewrite:
                continue

            print("Plotting ", file_name)
            data = joblib.load(f"{file_name}.z")
            try:
                error_per_point = data["error_per_point"]
                error_as_point_size = (
                    MinMaxScaler(feature_range=(25, 150))
                    .fit_transform(error_per_point.reshape(-1, 1))
                    .reshape(1, -1)
                )
                progress_errors = data["progress_errors"]
                progress_errors = progress_errors[np.where(progress_errors > 0)]
                _scatter_with_loss(
                    Z=data["embedding"],
                    loss=progress_errors,
                    out_name=f"{file_name}_all.png",
                    point_sizes=error_as_point_size,
                    labels=y,
                    loss_title=(
                        f"final_loss={data['kl_divergence']:.3f},"
                        + f" n_iter={data['n_iter']+1}"
                    ),
                )
            except KeyError:
                print("`error_per_point` or `progress_errors` are not available.")
def calculate_metrics(dataset_name, metric_names, base_perp=None, earlystop="_earlystop"):
    _, X, _ = dataset.load_dataset(dataset_name)
    N = X.shape[0]
    all_perps = range(1, N // 3)

    if base_perp is None:
        embedding_dir = f"{dir_path}/normal/{dataset_name}"
        in_name_prefix = ""
        out_name_suffix = ""
    else:
        embedding_dir = f"{dir_path}/chain/{dataset_name}"
        in_name_prefix = f"{base_perp}_to_"
        out_name_suffix = f"_base{base_perp}"

    result = []
    for perp in all_perps:
        in_name = f"{embedding_dir}/{in_name_prefix}{perp}{earlystop}.z"
        data = joblib.load(in_name)
        print("Running metric for ", in_name)

        record = {
            "perplexity": perp,
            "kl_divergence": data["kl_divergence"],
            "bic": 2 * data["kl_divergence"] + np.log(N) * perp / N,
        }
        drMetric = DRMetric(X, data["embedding"])
        for metric_name in metric_names:
            print(metric_name, end=" ")
            metric_method = getattr(drMetric, metric_name)
            record[metric_name] = metric_method()
        print("Done for perp ", perp)
        result.append(record)

    df = pd.DataFrame(result).set_index("perplexity")
    df.to_csv(f"metrics/{dataset_name}{out_name_suffix}{earlystop}.csv")
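# Note on the "bic" field above: 2 * kl_divergence + log(N) * perp / N reads
# like a BIC-style criterion that treats perplexity as the effective model
# complexity (analogous to BIC = -2 * log-likelihood + k * log(N)); that
# reading is an interpretation, not something stated in this repo. For
# example, with N = 1000, perp = 30, kl_divergence = 1.2:
# bic = 2 * 1.2 + ln(1000) * 30 / 1000 ≈ 2.4 + 0.207 ≈ 2.607.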
def plot_metamap(run_range, base_perp=None, earlystop=""):
    print(f"Generate meta-plot for {dataset_name} with params: ", locals())
    if base_perp is None:
        embedding_dir = f"{dir_path}/normal/{dataset_name}"
        in_name_prefix = ""
        out_name_suffix = ""
    else:
        embedding_dir = f"{dir_path}/chain/{dataset_name}"
        in_name_prefix = f"{base_perp}_to_"
        out_name_suffix = f"_base{base_perp}"

    _, X, _ = dataset.load_dataset(dataset_name)
    run_range = run_range or range(3, X.shape[0] // 3)

    # prepare all pre-calculated embeddings
    all_Z = []
    all_perps = []
    all_losses = []
    for perp in run_range:
        in_name = f"{embedding_dir}/{in_name_prefix}{perp}{earlystop}.z"
        data = joblib.load(in_name)
        all_perps.append(perp)
        all_Z.append(data["embedding"].ravel())
        all_losses.append(data["kl_divergence"])

    # use the flattened embeddings in all_Z as features for a meta-tSNE
    # all_Z = StandardScaler().fit_transform(all_Z)
    meta_Z = TSNE(perplexity=10).fit_transform(all_Z)

    out_name = f"{dataset_name}{out_name_suffix}{earlystop}"
    plt.figure(figsize=(6, 6))
    plt.scatter(meta_Z[:, 0], meta_Z[:, 1], c=all_perps)
    plt.title(out_name)
    cbar = plt.colorbar()
    cbar.ax.set_title("Perplexity")
    plt.tight_layout()
    plt.savefig(f"./plot_chain/metamap/{out_name}.png")
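# Note on the meta-map above: each precomputed 2D embedding (shape N x 2) is
# flattened into one feature vector of length 2N, and the collection of these
# vectors across perplexities is itself projected with t-SNE (perplexity 10).
# Nearby points in the meta-map are embeddings with similar raw coordinates;
# this comparison is not invariant to rotations or reflections of the
# individual embeddings.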
def nested_run(n_iters, learning_rates, datasets, preprocessing_method):
    for dataset_name in datasets:
        with mlflow.start_run():
            mlflow.log_param('dataset_name', f"{dataset_name}_{preprocessing_method}")
            mlflow.log_param('n_iters', n_iters)
            X_original, X, y = dataset.load_dataset(dataset_name, preprocessing_method)
            for lr in learning_rates:
                plot_args = {
                    'prefix': f'{dataset_name}-{lr}-{n_iters}',
                    'suffix': '.png',
                    'save_to_file': SAVE_FIGURES,
                    'track_with_mlflow': TRACK_FLOW,
                }
                with mlflow.start_run(nested=True):
                    mlflow.log_param('lr', lr)
                    run_ppca(X_original, X, y, learning_rate=lr,
                             n_iters=n_iters, plot_args=plot_args)
def test_auto_generated_constraints(dataset_name, n_sim, n_dis):
    _, _, labels = dataset.load_dataset(dataset_name)
    in_name = f"{LINK_DIR}/auto_{dataset_name}_{n_sim}sim_{n_dis}dis.pkl"
    links = joblib.load(in_name)

    assert len(links) == n_sim + n_dis, (
        "Number of generated links is not correct. "
        f"Expected {n_sim} + {n_dis} = {n_sim + n_dis}, got {len(links)}")
    if len(links) > 0:
        assert len(links[0]) == 3, (
            "Expect 3 elements in a link [p0, p1, link_type], "
            f"but got {len(links[0])} for the first link")

    for p0, p1, link_type in links:
        if link_type == SIM_LINK_TYPE:
            assert labels[p0] == labels[p1], "Labels of sim-link must be the same"
        elif link_type == DIS_LINK_TYPE:
            assert labels[p0] != labels[p1], "Labels of dis-link must not be the same"
        else:
            raise ValueError("`link_type` of auto-generated link must be +1 or -1")
        embedding_figures=html_table(tbody=_gen_tbody(run_range, base_perp)),
    )
    with open(out_name, "w") as out_file:
        out_file.write(rendered)
    print(f"Write to {out_name}")


if __name__ == "__main__":
    import argparse

    # set defaults in add_argument: with argparse, missing flags produce None,
    # so `args.get(key, default)` would never fall back to the default;
    # test_perp also needs type=int so that test_perp +/- 1 works
    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset_name", default="FASHION200")
    ap.add_argument("-x", "--dev", action="store_true")
    ap.add_argument("-p", "--test_perp", default=30, type=int)
    args = vars(ap.parse_args())

    dataset_name = args["dataset_name"]
    test_perp = args["test_perp"]
    DEV = args["dev"]

    _, X, _ = dataset.load_dataset(dataset_name)
    run_range = (
        [test_perp - 1, test_perp, test_perp + 1] if DEV else range(2, X.shape[0] // 3)
    )
    template_name = "view_chain2.template"
    for base_perp in hyper_params[dataset_name]["base_perps"]:
        out_name = f"html/{dataset_name}_base{base_perp}_v2.html"
        gen_page(template_name, out_name, run_range=run_range, base_perp=base_perp)
help="Number of hidden units, defaults to 0 to simply do Z@W", ) ap.add_argument("-ad", "--add_noise", action="store_true") args = ap.parse_args() time_str = time.strftime("%b%d/%H:%M:%S", time.localtime()) log_dir = ( f"runs{args.run_id}/{args.dataset_name}/{time_str}_" + f"lr{args.learning_rate}_h{args.hidden_dim}" ) print(log_dir) writer = SummaryWriter(log_dir=log_dir) X_original, X, y = dataset.load_dataset( args.dataset_name, preprocessing_method=args.scale_data ) if args.add_noise: X_original = generate_noise("s&p", X_original) X = X_original / 255.0 # run_with_sklearn(X, y) writer.add_text(f"Params", str(args)) z2d_loc, z2d_scale = trainVI( data=X, hidden_dim=args.hidden_dim, learning_rate=args.learning_rate, n_iters=5 if args.dev else args.n_iters, trace_embeddings_interval=200, writer=writer,
def run_dataset(dataset_name, n_iter_without_progress=1000,
                min_grad_norm=1e-32, early_stop=""):
    _, X, y = dataset.load_dataset(dataset_name)

    embedding_dir = f"{dir_path}/embeddings/{dataset_name}"
    if not os.path.exists(embedding_dir):
        os.mkdir(embedding_dir)

    base_min_grad_norm = min_grad_norm
    for perp in [test_perp] if DEV else range(1, X.shape[0] // 3):
        if early_stop != "":  # using early_stop
            adaptive_min_grad_norm = base_min_grad_norm * (10 ** (-(perp // 30)))
            print(f"perp={perp} ({perp//30}) adaptive_min_grad_norm={adaptive_min_grad_norm}")
        else:
            adaptive_min_grad_norm = min_grad_norm

        start_time = time()
        if USE_MULTICORE:
            tsne = MulticoreTSNE(
                verbose=1 if DEV else 0,
                random_state=fixed_seed,
                perplexity=perp,
                n_iter_without_progress=n_iter_without_progress,
                min_grad_norm=adaptive_min_grad_norm,
                n_jobs=n_cpu_using,
                eval_interval=50,
                n_iter=1000,
            )
        else:
            tsne = TSNE(random_state=fixed_seed, perplexity=perp)
        tsne.fit_transform(X)
        running_time = time() - start_time
        print(f"{dataset_name}, {perp}, time: {running_time}s")

        error_per_point = None
        progress_errors = None
        try:
            error_per_point = tsne.error_per_point_
            progress_errors = tsne.progress_errors_
        except AttributeError:
            print("`error_per_point_` or `progress_errors_` are not available.")

        result = dict(
            perplexity=perp,
            running_time=running_time,
            embedding=tsne.embedding_,
            kl_divergence=tsne.kl_divergence_,
            n_jobs=tsne.n_jobs if USE_MULTICORE else 1,
            n_iter=tsne.n_iter_,
            learning_rate=tsne.learning_rate,
            random_state=tsne.random_state,
            progress_errors=progress_errors,
            error_per_point=error_per_point,
        )

        out_name = f"{dir_path}/tmp/{dataset_name}" if DEV else embedding_dir
        out_name += f"/{perp}{early_stop}"
        joblib.dump(value=result, filename=f"{out_name}.z")
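# Note on the early-stop schedule above: 10 ** (-(perp // 30)) shrinks the
# stopping threshold by one order of magnitude for every 30 units of
# perplexity. With base_min_grad_norm = 1e-32: perp in [1, 29] keeps 1e-32,
# perp in [30, 59] uses 1e-33, perp in [60, 89] uses 1e-34, and so on.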
ap.add_argument("-cp", "--constraint_proportion", default=1.0, type=float, help="target_function = cp * user_constraint + (1-cp)* John's metric") ap.add_argument("-u", "--utility_function", default="ucb", help="in ['ucb', 'ei', 'poi']") ap.add_argument("-k", "--kappa", default=5.0, type=float, help="For UCB, small ->exploitation, large ->exploration, default 5.0") ap.add_argument("-x", "--xi", default=0.025, type=float, help="For EI/POI, small ->exploitation, large ->exploration, default 0.025") args = ap.parse_args() mlflow.set_experiment('BO-with-Constraint-Scores') for arg_key, arg_value in vars(args).items(): mlflow.log_param(arg_key, arg_value) dataset.set_data_home("./data") dataset_name = args.dataset_name _, X, labels = dataset.load_dataset(dataset_name) print(X.shape, labels.shape) method_name = args.method_name score_name = args.score_name n_constraints = args.n_constraints rnd_seed = args.random_seed constraint_proportion = args.constraint_proportion constraints = generate_constraints(score_name, n_constraints) target_function_wrapper = partial(target_function, method_name, score_name, constraints) best_result = run_bo(target_func=target_function_wrapper) print(best_result)
def run_dataset(
    dataset_name,
    base_perp=30,
    run_range=range(31, 101),
    n_iter_without_progress=1000,
    min_grad_norm=1e-32,
    early_stop=False,
):
    _, X, y = dataset.load_dataset(dataset_name)

    chain_dir = f"{dir_path}/chain/{dataset_name}"
    if not os.path.exists(chain_dir):
        os.mkdir(chain_dir)

    # load init from perp=base_perp
    user_early_stop = "_earlystop" if early_stop else ""
    in_name = f"{dir_path}/normal/{dataset_name}/{base_perp}{user_early_stop}.z"
    base_data = joblib.load(in_name)
    Z_base = base_data["embedding"]

    for perp in run_range:
        if perp == base_perp:
            continue

        start_time = time()
        tsne = MulticoreTSNE(
            random_state=fixed_seed,
            perplexity=perp,
            init=Z_base,
            n_iter_without_progress=n_iter_without_progress,
            min_grad_norm=min_grad_norm,
            n_jobs=n_cpu_using,
        )
        tsne.fit_transform(X)
        running_time = time() - start_time
        print(f"{dataset_name}, perp={perp}, {user_early_stop}, t={running_time:.3}s")

        error_per_point = None
        progress_errors = None
        try:
            error_per_point = tsne.error_per_point_
            progress_errors = tsne.progress_errors_
        except AttributeError:
            print("`error_per_point_` or `progress_errors_` are not available.")

        result = dict(
            perplexity=perp,
            running_time=running_time,
            embedding=tsne.embedding_,
            kl_divergence=tsne.kl_divergence_,
            n_jobs=tsne.n_jobs,
            n_iter=tsne.n_iter_,
            learning_rate=tsne.learning_rate,
            random_state=tsne.random_state,
            progress_errors=progress_errors,
            error_per_point=error_per_point,
        )
        out_name = f"{chain_dir}/{base_perp}_to_{perp}{user_early_stop}.z"
        joblib.dump(value=result, filename=out_name)
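# Note: this is the "chain" variant referenced elsewhere in this repo. Each
# run at perplexity `perp` is initialized from the precomputed embedding at
# `base_perp` (via the `init` argument of MulticoreTSNE) rather than from a
# fresh random initialization, which presumably keeps the embeddings along
# the chain directly comparable.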
def compare_kl_normal_chain(dataset_name, base_perp, run_range=range(1, 100), plot=True):
    _, X, y = dataset.load_dataset(dataset_name)
    chain_dir = f"{dir_path}/chain/{dataset_name}"
    normal_dir = f"{dir_path}/embeddings/{dataset_name}"

    # get min, max of the base embedding corresponding to base_perp
    base_data = joblib.load(f"{normal_dir}/{base_perp}.z")
    Z_base = base_data["embedding"]
    x_min, x_max = Z_base[:, 0].min(), Z_base[:, 0].max()
    y_min, y_max = Z_base[:, 1].min(), Z_base[:, 1].max()
    fig_limit = (x_min, x_max, y_min, y_max)

    result = []
    for perp in run_range:
        normal_file = f"{normal_dir}/{perp}.z"
        chain_file = f"{chain_dir}/{base_perp}_to_{perp}.z"
        if not os.path.exists(normal_file) or not os.path.exists(chain_file):
            print(
                "One of the following files was not found:\n",
                "\n".join([normal_file, chain_file]),
            )
            continue

        normal_data = joblib.load(normal_file)
        chain_data = joblib.load(chain_file)
        Z_normal = normal_data["embedding"]
        Z_chain = chain_data["embedding"]
        Q_normal = normal_data.get("Q", compute_Q(Z_normal))
        Q_chain = chain_data.get("Q", compute_Q(Z_chain))

        aucrnx_normal = DRMetric(X, Z_normal).auc_rnx()
        aucrnx_chain = DRMetric(X, Z_chain).auc_rnx()
        kl_normal_chain = klpq(Q_normal, Q_chain)
        kl_chain_normal = klpq(Q_chain, Q_normal)
        kl_sum = 0.5 * (kl_normal_chain + kl_chain_normal)

        record = {
            "perplexity": perp,
            "kl_normal_chain": kl_normal_chain,
            "kl_chain_normal": kl_chain_normal,
            "kl_sum": kl_sum,
            "aucrnx_normal": aucrnx_normal,
            "aucrnx_chain": aucrnx_chain,
            "running_time_normal": normal_data["running_time"],
            "running_time_chain": chain_data["running_time"],
        }
        result.append(record)
        print("perp", perp, "diff_aucrnx", abs(aucrnx_normal - aucrnx_chain),
              "\tkl", kl_sum)

        if plot:
            fig_name = f"{base_perp}_to_{perp}"
            _scatter(Z_normal, f"{normal_dir}/{fig_name}", y=y)
            _scatter(Z_normal, f"{normal_dir}/{fig_name}", *fig_limit, y=y)
            _scatter(Z_chain, f"{chain_dir}/{fig_name}", y=y)
            _scatter(Z_chain, f"{chain_dir}/{fig_name}", *fig_limit, y=y)

    df = pd.DataFrame(result).set_index("perplexity")
    df.to_csv(f"plot_chain/{dataset_name}_{base_perp}.csv")
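# A minimal sketch of klpq (the `_sketch_` name is hypothetical), assuming P
# and Q are condensed vectors of t-SNE affinities that each sum to one, so
# the discrete KL divergence KL(P || Q) = sum_i p_i * log(p_i / q_i) applies
# directly. A small epsilon guards against log(0) and division by zero; the
# repo's real klpq may differ.
def _sketch_klpq(P, Q, eps=1e-12):
    import numpy as np

    P = np.asarray(P, dtype=float) + eps
    Q = np.asarray(Q, dtype=float) + eps
    return float(np.sum(P * np.log(P / Q)))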