Example #1
def test_load_data(dataset_name, perp=30, early_stop=""):
    import numpy as np
    from run_plots import _simple_scatter, _simple_loss

    _, _, y = dataset.load_dataset(dataset_name)
    embedding_dir = f"{dir_path}/tmp/{dataset_name}"
    out_name = f"{embedding_dir}/{perp}{early_stop}"
    print("\nTest loading saved data from ", out_name)

    loaded = joblib.load(filename=f"{out_name}.z")
    _simple_scatter(Z=loaded["embedding"],
                    out_name=f"{out_name}_scatter.png",
                    labels=y)

    losses = loaded["progress_errors"]
    if losses is not None:
        losses = losses[np.where(losses > 0.0)]
        _simple_loss(loss=losses,
                     out_name=f"{out_name}_loss.png",
                     figsize=(6, 3))

    print(loaded.keys())
    for k, v in loaded.items():
        if k not in ["embedding", "error_per_point", "progress_errors"]:
            print(k, v)
Example #2
def gen_constraints(dataset_name, n_sim, n_dis):
    _, _, labels = dataset.load_dataset(dataset_name)

    sim_links = gen_similar_links(labels, n_sim)
    dis_links = gen_dissimilar_links(labels, n_dis)

    out_name = f"{LINK_DIR}/auto_{dataset_name}_{n_sim}sim_{n_dis}dis.pkl"
    joblib.dump(sim_links + dis_links, out_name)
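The link-generation helpers are not shown in these excerpts. Below is a minimal sketch, assuming each link is a plain [p0, p1, link_type] list with link types +1/-1 (the format asserted in Example #10 further down), and that include_link_type=False drops the third element; Example #4 indexes the result column-wise, so it would wrap these in np.asarray there.

import numpy as np

SIM_LINK_TYPE, DIS_LINK_TYPE = 1, -1  # assumed constants; Example #10 only checks +1 / -1


def _gen_links(labels, n_links, want_same_label, link_type, include_link_type):
    """Sample random point pairs whose labels are equal (similar) or different (dissimilar)."""
    labels = np.asarray(labels)
    rng = np.random.RandomState()
    links = []
    while len(links) < n_links:  # assumes both same-label and different-label pairs exist
        p0, p1 = rng.choice(len(labels), size=2, replace=False)
        if (labels[p0] == labels[p1]) == want_same_label:
            links.append([int(p0), int(p1), link_type] if include_link_type
                         else [int(p0), int(p1)])
    return links


def gen_similar_links(labels, n_links, include_link_type=True):
    return _gen_links(labels, n_links, True, SIM_LINK_TYPE, include_link_type)


def gen_dissimilar_links(labels, n_links, include_link_type=True):
    return _gen_links(labels, n_links, False, DIS_LINK_TYPE, include_link_type)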
Example #3
def simple_run(dataset_name, n_iters, lr):
    mlflow.log_param('dataset_name', dataset_name)
    mlflow.log_param('n_iters', n_iters)
    mlflow.log_param('lr', lr)
    plot_args = {
        'prefix': f'{dataset_name}-{lr}-{n_iters}',
        'suffix': '.png',
        'save_to_file': SAVE_FIGURES,
        'track_with_mlflow': TRACK_FLOW,
    }

    X_original, X, y = dataset.load_dataset(dataset_name)
    run_ppca(X_original, X, y, lr, n_iters, plot_args)
Example #4
def extract_qij_for_some_pairs(
    dataset_name, normalized=False, use_log=False, base_perp=None, list_n_constraints=[]
):
    _, X, labels = dataset.load_dataset(dataset_name)
    perps = range(1, X.shape[0] // 3)

    # store all Q for different number of constraints
    Q_sim_all = defaultdict(list)
    Q_dis_all = defaultdict(list)

    embedding_type = "normal" if base_perp is None else "chain"

    # for each embedding, pick different pairs with different number of constraints
    for perp in perps:
        file_name = f"{perp}.z" if base_perp is None else f"{base_perp}_to_{perp}.z"
        in_file = os.path.join(dir_path, embedding_type, dataset_name, file_name)
        print(in_file)

        data = joblib.load(in_file)
        Q = compute_Q(data["embedding"])

        if use_log:
            Q = -np.log(Q)
        if normalized:
            Q /= Q.max()
        Q = squareform(Q)

        # store the q_ij for this `perp` for each num of constraint in Q_*_all[n_constraints]
        for n_constraints in list_n_constraints:
            sim = gen_similar_links(labels, n_constraints, include_link_type=False)
            dis = gen_dissimilar_links(labels, n_constraints, include_link_type=False)

            Q_sim = Q[sim[:, 0], sim[:, 1]]
            Q_dis = Q[dis[:, 0], dis[:, 1]]

            Q_sim_all[n_constraints].append(Q_sim)
            Q_dis_all[n_constraints].append(Q_dis)

        del Q

    # store all q_ij for both sim and dis links, together with list of all calculated perplexities
    out_name = (
        f"./q_ij/{dataset_name}"
        f"_{embedding_type}{base_perp if base_perp else ''}"
        f"{'_log' if use_log else ''}"
        f"{'_normalized' if normalized else ''}"
    )
    joblib.dump([perps, Q_sim_all, Q_dis_all], out_name)
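compute_Q is also assumed here and in the later examples. A minimal sketch consistent with how it is used (a strictly positive condensed pairwise vector, so -np.log is safe and squareform can expand it) would be the standard t-SNE low-dimensional similarity:

import numpy as np
from scipy.spatial.distance import pdist

MACHINE_EPSILON = 1e-12  # assumed floor to keep -log(Q) finite


def compute_Q(Z):
    """q_ij of t-SNE in condensed (pdist) layout: Student-t kernel, normalized over all pairs."""
    dist_sq = pdist(Z, "sqeuclidean")   # squared pairwise distances in the embedding
    kernel = 1.0 / (1.0 + dist_sq)      # Student-t kernel with one degree of freedom
    Q = kernel / (2.0 * kernel.sum())   # each unordered pair appears once, hence the factor 2
    return np.maximum(Q, MACHINE_EPSILON)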
Example #5
def examine_qij(dataset_name, writer=None, base_perp=None):
    _, X, _ = dataset.load_dataset(dataset_name)

    embedding_type = "normal" if base_perp is None else "chain"
    for perp in range(1, X.shape[0] // 3):
        file_name = f"{perp}.z" if base_perp is None else f"{base_perp}_to_{perp}.z"
        in_file = os.path.join(dir_path, embedding_type, dataset_name, file_name)
        data = joblib.load(in_file)
        Q = compute_Q(data["embedding"])
        nlogQ = -np.log(Q)
        nlogQ_normalized = nlogQ / np.max(nlogQ)

        if writer:
            print(f"Adding histogram for perp={perp} with {len(Q)} values")
            writer.add_histogram(f"qij", Q, global_step=perp)
            writer.add_histogram(f"qij_normalized", Q / Q.max(), global_step=perp)
            writer.add_histogram(f"-logqij", nlogQ, global_step=perp)
            writer.add_histogram(f"-logqij_normalized", nlogQ_normalized, perp)
        else:
            print("No tensorboardX debug info stored")
Example #6
def plot_embeddings(run_range=None, base_perp=None, force_rewrite=False):
    _, X, y = dataset.load_dataset(dataset_name)

    for perp in range(1, X.shape[0] // 3) if run_range is None else run_range:
        for earlystop in ["", "_earlystop"]:
            if base_perp is None:
                embedding_dir = f"{dir_path}/normal/{dataset_name}"
                file_name = f"{embedding_dir}/{perp}{earlystop}"
            else:
                embedding_dir = f"{dir_path}/chain/{dataset_name}"
                file_name = f"{embedding_dir}/{base_perp}_to_{perp}{earlystop}"

            if os.path.exists(f"{file_name}_all.png") and not force_rewrite:
                continue

            print("Plotting ", file_name)
            data = joblib.load(f"{file_name}.z")

            try:
                error_per_point = data["error_per_point"]
                error_as_point_size = (MinMaxScaler(
                    feature_range=(25, 150)).fit_transform(
                        error_per_point.reshape(-1, 1)).reshape(1, -1))

                progress_errors = data["progress_errors"]
                progress_errors = progress_errors[np.where(
                    progress_errors > 0)]

                _scatter_with_loss(
                    Z=data["embedding"],
                    loss=progress_errors,
                    out_name=f"{file_name}_all.png",
                    point_sizes=error_as_point_size,
                    labels=y,
                    loss_title=(f"final_loss={data['kl_divergence']:.3f}," +
                                f"  n_iter={data['n_iter']+1}"),
                )
            except KeyError:
                print(
                    "`error_per_point` or `progress_errors` are not available."
                )
Example #7
def calculate_metrics(dataset_name,
                      metric_names,
                      base_perp=None,
                      earlystop="_earlystop"):
    _, X, _ = dataset.load_dataset(dataset_name)
    N = X.shape[0]
    all_perps = range(1, N // 3)

    if base_perp is None:
        embedding_dir = f"{dir_path}/normal/{dataset_name}"
        in_name_prefix = ""
        out_name_sufix = ""
    else:
        embedding_dir = f"{dir_path}/chain/{dataset_name}"
        in_name_prefix = f"{base_perp}_to_"
        out_name_sufix = f"_base{base_perp}"

    result = []
    for perp in all_perps:
        in_name = f"{embedding_dir}/{in_name_prefix}{perp}{earlystop}.z"
        data = joblib.load(in_name)
        print("Runing metric for ", in_name)

        record = {
            "perplexity": perp,
            "kl_divergence": data["kl_divergence"],
            "bic": 2 * data["kl_divergence"] + np.log(N) * perp / N,
        }

        drMetric = DRMetric(X, data["embedding"])
        for metric_name in metric_names:
            print(metric_name, end="   ")
            metric_method = getattr(drMetric, metric_name)
            record[metric_name] = metric_method()

        print("Done for perp ", perp)
        result.append(record)

    df = pd.DataFrame(result).set_index("perplexity")
    df.to_csv(f"metrics/{dataset_name}{out_name_sufix}{earlystop}.csv")
Example #8
def plot_metamap(run_range, base_perp=None, earlystop=""):
    print(f"Generate meta-plot for {dataset_name} with params: ", locals())

    if base_perp is None:
        embedding_dir = f"{dir_path}/normal/{dataset_name}"
        in_name_prefix = ""
        out_name_sufix = ""
    else:
        embedding_dir = f"{dir_path}/chain/{dataset_name}"
        in_name_prefix = f"{base_perp}_to_"
        out_name_sufix = f"_base{base_perp}"

    _, X, _ = dataset.load_dataset(dataset_name)
    run_range = run_range or range(3, X.shape[0] // 3)

    # prepare all pre-calculated embeddings
    all_Z = []
    all_perps = []
    all_losses = []
    for perp in run_range:
        in_name = f"{embedding_dir}/{in_name_prefix}{perp}{earlystop}.z"
        data = joblib.load(in_name)
        all_perps.append(perp)
        all_Z.append(data["embedding"].ravel())
        all_losses.append(data["kl_divergence"])

    # using all_Z as features for meta-tSNE
    # all_Z = StandardScaler().fit_transform(all_Z)
    meta_Z = TSNE(perplexity=10).fit_transform(all_Z)

    out_name = f"{dataset_name}{out_name_sufix}{earlystop}"
    plt.figure(figsize=(6, 6))
    plt.scatter(meta_Z[:, 0], meta_Z[:, 1], c=all_perps)
    plt.title(out_name)
    cbar = plt.colorbar()
    cbar.ax.set_title("Perplexity")
    plt.tight_layout()
    plt.savefig(f"./plot_chain/metamap/{out_name}.png")
Example #9
def nested_run(n_iters, learning_rates, datasets, preprocessing_method):
    for dataset_name in datasets:
        with mlflow.start_run():
            mlflow.log_param('dataset_name',
                             f"{dataset_name}_{preprocessing_method}")
            mlflow.log_param('n_iters', n_iters)

            X_original, X, y = dataset.load_dataset(dataset_name,
                                                    preprocessing_method)
            for lr in learning_rates:
                plot_args = {
                    'prefix': f'{dataset_name}-{lr}-{n_iters}',
                    'suffix': '.png',
                    'save_to_file': SAVE_FIGURES,
                    'track_with_mlflow': TRACK_FLOW
                }
                with mlflow.start_run(nested=True):
                    mlflow.log_param('lr', lr)
                    run_ppca(X_original,
                             X,
                             y,
                             learning_rate=lr,
                             n_iters=n_iters,
                             plot_args=plot_args)
Example #10
def test_auto_generated_constraints(dataset_name, n_sim, n_dis):
    _, _, labels = dataset.load_dataset(dataset_name)

    in_name = f"{LINK_DIR}/auto_{dataset_name}_{n_sim}sim_{n_dis}dis.pkl"
    links = joblib.load(in_name)
    assert len(links) == n_sim + n_dis, (
        "Number of generated links is not correct. "
        f"Expected {n_sim} + {n_dis} = {n_sim + n_dis}, got {len(links)}")

    if len(links) > 0:
        assert len(links[0]) == 3, (
            "Expected 3 elements in a link [p0, p1, link_type], "
            f"but got {len(links[0])} for the first link")

    for p0, p1, link_type in links:
        if link_type == SIM_LINK_TYPE:
            assert labels[p0] == labels[
                p1], "Labels of sim-link must be the same"
        elif link_type == DIS_LINK_TYPE:
            assert labels[p0] != labels[
                p1], "Labels of dis-link must not be the same"
        else:
            raise ValueError(
                "`link_type` of auto-generated link must be +1 or -1")
Example #11
            embedding_figures=html_table(tbody=_gen_tbody(run_range, base_perp)),
        )

    with open(out_name, "w") as out_file:
        out_file.write(rendered)
    print(f"Write to {out_name}")


if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset_name")
    ap.add_argument("-x", "--dev", action="store_true")
    ap.add_argument("-p", "--test_perp")
    args = vars(ap.parse_args())

    dataset_name = args.get("dataset_name", "FASHION200")
    test_perp = args.get("test_perp", 30)
    DEV = args.get("dev", False)

    _, X, _ = dataset.load_dataset(dataset_name)
    run_range = (
        [test_perp - 1, test_perp, test_perp + 1] if DEV else range(2, X.shape[0] // 3)
    )
    template_name = "view_chain2.template"

    for base_perp in hyper_params[dataset_name]["base_perps"]:
        out_name = f"html/{dataset_name}_base{base_perp}_v2.html"
        gen_page(template_name, out_name, run_range=run_range, base_perp=base_perp)
Example #12
        help="Number of hidden units, defaults to 0 to simply do Z@W",
    )
    ap.add_argument("-ad", "--add_noise", action="store_true")

    args = ap.parse_args()

    time_str = time.strftime("%b%d/%H:%M:%S", time.localtime())
    log_dir = (
        f"runs{args.run_id}/{args.dataset_name}/{time_str}_"
        + f"lr{args.learning_rate}_h{args.hidden_dim}"
    )
    print(log_dir)

    writer = SummaryWriter(log_dir=log_dir)
    X_original, X, y = dataset.load_dataset(
        args.dataset_name, preprocessing_method=args.scale_data
    )
    if args.add_noise:
        X_original = generate_noise("s&p", X_original)
        X = X_original / 255.0

    # run_with_sklearn(X, y)

    writer.add_text(f"Params", str(args))
    z2d_loc, z2d_scale = trainVI(
        data=X,
        hidden_dim=args.hidden_dim,
        learning_rate=args.learning_rate,
        n_iters=5 if args.dev else args.n_iters,
        trace_embeddings_interval=200,
        writer=writer,
Example #13
def run_dataset(dataset_name,
                n_iter_without_progress=1000,
                min_grad_norm=1e-32,
                early_stop=""):
    _, X, y = dataset.load_dataset(dataset_name)
    embedding_dir = f"{dir_path}/embeddings/{dataset_name}"
    if not os.path.exists(embedding_dir):
        os.mkdir(embedding_dir)

    base_min_grad_norm = min_grad_norm

    for perp in [test_perp] if DEV else range(1, X.shape[0] // 3):
        if early_stop != "":  # using early_stop
            adaptive_min_grad_norm = base_min_grad_norm * (10**(-(perp // 30)))
            print(
                f"perp={perp} ({perp//30}) adaptive_min_grad_norm={adaptive_min_grad_norm}"
            )
        else:
            adaptive_min_grad_norm = min_grad_norm

        start_time = time()
        if USE_MULTICORE:
            tsne = MulticoreTSNE(
                verbose=1 if DEV else 0,
                random_state=fixed_seed,
                perplexity=perp,
                n_iter_without_progress=n_iter_without_progress,
                min_grad_norm=adaptive_min_grad_norm,
                n_jobs=n_cpu_using,
                eval_interval=50,
                n_iter=1000,
            )
        else:
            tsne = TSNE(random_state=fixed_seed, perplexity=perp)
        tsne.fit_transform(X)
        running_time = time() - start_time
        print(f"{dataset_name}, {perp}, time: {running_time}s")

        error_per_point = None
        progress_errors = None
        try:
            error_per_point = tsne.error_per_point_
            progress_errors = tsne.progress_errors_
        except AttributeError:
            print(
                "`error_per_point_` or `progress_errors_` are not available.")

        result = dict(
            perplexity=perp,
            running_time=running_time,
            embedding=tsne.embedding_,
            kl_divergence=tsne.kl_divergence_,
            n_jobs=tsne.n_jobs if USE_MULTICORE else 1,
            n_iter=tsne.n_iter_,
            learning_rate=tsne.learning_rate,
            random_state=tsne.random_state,
            progress_errors=progress_errors,
            error_per_point=error_per_point,
        )

        out_name = f"{dir_path}/tmp/{dataset_name}" if DEV else embedding_dir
        out_name += f"/{perp}{early_stop}"
        joblib.dump(value=result, filename=f"{out_name}.z")
Example #14
    ap.add_argument("-cp", "--constraint_proportion", default=1.0, type=float,
                    help="target_function = cp * user_constraint + (1-cp)* John's metric")
    ap.add_argument("-u", "--utility_function", default="ucb",
                    help="in ['ucb', 'ei', 'poi']")
    ap.add_argument("-k", "--kappa", default=5.0, type=float,
                    help="For UCB, small ->exploitation, large ->exploration, default 5.0")
    ap.add_argument("-x", "--xi", default=0.025, type=float,
                    help="For EI/POI, small ->exploitation, large ->exploration, default 0.025")
    args = ap.parse_args()

    mlflow.set_experiment('BO-with-Constraint-Scores')
    for arg_key, arg_value in vars(args).items():
        mlflow.log_param(arg_key, arg_value)

    dataset.set_data_home("./data")
    dataset_name = args.dataset_name
    _, X, labels = dataset.load_dataset(dataset_name)
    print(X.shape, labels.shape)

    method_name = args.method_name
    score_name = args.score_name
    n_constraints = args.n_constraints
    rnd_seed = args.random_seed
    constraint_proportion = args.constraint_proportion

    constraints = generate_constraints(score_name, n_constraints)
    target_function_wrapper = partial(target_function,
                                      method_name, score_name, constraints)
    best_result = run_bo(target_func=target_function_wrapper)
    print(best_result)
Example #15
def run_dataset(
    dataset_name,
    base_perp=30,
    run_range=range(31, 101),
    n_iter_without_progress=1000,
    min_grad_norm=1e-32,
    early_stop=False,
):
    _, X, y = dataset.load_dataset(dataset_name)
    chain_dir = f"{dir_path}/chain/{dataset_name}"
    if not os.path.exists(chain_dir):
        os.mkdir(chain_dir)

    # load init from perp=base_perp
    user_early_stop = "_earlystop" if early_stop else ""
    in_name = f"{dir_path}/normal/{dataset_name}/{base_perp}{user_early_stop}.z"
    base_data = joblib.load(in_name)
    Z_base = base_data["embedding"]

    for perp in run_range:
        if perp == base_perp:
            continue

        start_time = time()
        tsne = MulticoreTSNE(
            random_state=fixed_seed,
            perplexity=perp,
            init=Z_base,
            n_iter_without_progress=n_iter_without_progress,
            min_grad_norm=min_grad_norm,
            n_jobs=n_cpu_using,
        )
        tsne.fit_transform(X)
        running_time = time() - start_time
        print(
            f"{dataset_name}, perp={perp}, {user_early_stop}, t={running_time:.3}s"
        )

        error_per_point = None
        progress_errors = None
        try:
            error_per_point = tsne.error_per_point_
            progress_errors = tsne.progress_errors_
        except AttributeError:
            print(
                "`error_per_point_` or `progress_errors_` are not available.")

        result = dict(
            perplexity=perp,
            running_time=running_time,
            embedding=tsne.embedding_,
            kl_divergence=tsne.kl_divergence_,
            n_jobs=tsne.n_jobs,
            n_iter=tsne.n_iter_,
            learning_rate=tsne.learning_rate,
            random_state=tsne.random_state,
            progress_errors=progress_errors,
            error_per_point=error_per_point,
        )
        out_name = f"{chain_dir}/{base_perp}_to_{perp}{user_early_stop}.z"
        joblib.dump(value=result, filename=out_name)
Example #16
def compare_kl_normal_chain(dataset_name,
                            base_perp,
                            run_range=range(1, 100),
                            plot=True):
    _, X, y = dataset.load_dataset(dataset_name)
    chain_dir = f"{dir_path}/chain/{dataset_name}"
    normal_dir = f"{dir_path}/embeddings/{dataset_name}"

    # get min, max of base embedding corresponding to base_perp
    base_data = joblib.load(f"{normal_dir}/{base_perp}.z")
    Z_base = base_data["embedding"]
    x_min, x_max = Z_base[:, 0].min(), Z_base[:, 0].max()
    y_min, y_max = Z_base[:, 1].min(), Z_base[:, 1].max()
    fig_limit = (x_min, x_max, y_min, y_max)

    result = []
    for perp in run_range:
        normal_file = f"{normal_dir}/{perp}.z"
        chain_file = f"{chain_dir}/{base_perp}_to_{perp}.z"
        if not os.path.exists(normal_file) or not os.path.exists(chain_file):
            print(
                "One of following files not found: \n",
                "\n".join([normal_file, chain_file]),
            )
            continue

        normal_data = joblib.load(normal_file)
        chain_data = joblib.load(chain_file)
        Z_normal = normal_data["embedding"]
        Z_chain = chain_data["embedding"]

        Q_normal = normal_data.get("Q", compute_Q(Z_normal))
        Q_chain = chain_data.get("Q", compute_Q(Z_chain))

        aucrnx_normal = DRMetric(X, Z_normal).auc_rnx()
        aucrnx_chain = DRMetric(X, Z_chain).auc_rnx()

        kl_normal_chain = klpq(Q_normal, Q_chain)
        kl_chain_normal = klpq(Q_chain, Q_normal)
        kl_sum = 0.5 * (kl_normal_chain + kl_chain_normal)

        record = {
            "perplexity": perp,
            "kl_normal_chain": kl_normal_chain,
            "kl_chain_normal": kl_chain_normal,
            "kl_sum": kl_sum,
            "aucrnx_normal": aucrnx_normal,
            "aucrnx_chain": aucrnx_chain,
            "running_time_normal": normal_data["running_time"],
            "running_time_chain": chain_data["running_time"],
        }
        result.append(record)
        print(
            "perp",
            perp,
            "diff_aucrnx",
            abs(aucrnx_normal - aucrnx_chain),
            "\tkl",
            kl_sum,
        )

        if plot:
            fig_name = f"{base_perp}_to_{perp}"
            _scatter(Z_normal, f"{normal_dir}/{fig_name}", y=y)
            _scatter(Z_normal, f"{normal_dir}/{fig_name}", *fig_limit, y=y)

            _scatter(Z_chain, f"{chain_dir}/{fig_name}", y=y)
            _scatter(Z_chain, f"{chain_dir}/{fig_name}", *fig_limit, y=y)
    df = pd.DataFrame(result).set_index("perplexity")
    df.to_csv(f"plot_chain/{dataset_name}_{base_perp}.csv")