def get_sim_over_time_multi(names: List, filepath: str) -> List:
    """Gather similarity-over-time points for every scientist in *names*.

    Scans *filepath* for data files whose stem (filename minus the 4-char
    extension) appears in *names*, computes each scientist's
    similarity-over-time curve, and tags every point with that stem.

    Returns a flat list of (timestep, similarity, scientist-name) tuples.
    """
    collected = []
    for fname in os.listdir(filepath):
        stem = fname[:-4]
        if stem not in names:
            continue
        full_path = os.path.join(filepath, fname)
        order = get_emergence_order(full_path)
        vecs = get_attested_order(full_path)
        for point in get_sim_over_time(order, vecs):
            collected.append((point[0], point[1], stem))
    return collected
# Example #2
# 0
def test_s_vals(domain: str, field: str) -> None:
    """Grid-search exemplar s values on a 50/50 train split and merge the
    per-scientist log-likelihood differences into previously pickled files.

    NOTE(review): contains deliberate debugging stops (`assert False`) and
    a dead branch with unresolved names — see inline notes before enabling
    any of the disabled paths.

    Args:
        domain: "turing" selects the Turing-award data paths; any other
            value selects the Nobel paths for *field*.
        field: Nobel field sub-directory (ignored when domain == "turing").
    """
    if domain == "turing":
        out_path = "data/turing_winners/s-vals"
        vecs_path = "data/turing_winners/sbert-abstracts-ordered"
        tuning_path = "data/turing_winners/pickled-tuning/"
        tuning_path_2 = "data/turing_winners/pickled-tuning-2/"
    else:
        out_path = f"data/nobel_winners/{field}/s-vals"
        vecs_path = f"data/nobel_winners/{field}/abstracts-ordered"
        tuning_path = f"data/nobel_winners/{field}/pickled-tuning/"
        tuning_path_2 = f"data/nobel_winners/{field}/pickled-tuning-2/"

    # Emergence order is read from the same files as the vectors.
    order_path = vecs_path
    from_previous_inds = False

    #s_vals = [0.001 * i for i in range(1, 101)]
    # Candidate s grid: 0.01, 0.02, ..., 1.00.
    s_vals = [0.01 * i for i in range(1, 101)]
    # s value -> {scientist stem: (LL diff vs. random, #papers)}
    models = {val: {} for val in s_vals}
    num_samples = len(
        [name for name in os.listdir(vecs_path) if name.endswith(".csv")])

    if not from_previous_inds:
        # Fresh 50/50 split over the sample indices.
        train_inds, test_inds = train_test_split_inds(num_samples, 0.5)
    else:
        # NOTE(review): dead branch as written (from_previous_inds is
        # hard-coded False above), and it would crash if enabled: neither
        # `inds_path` nor `i` is defined at this point, and `test_inds`
        # (used in the loop below) is never assigned on this path.
        train_inds = []
        with open(inds_path + f"train_inds_{i}.txt", "r") as f:
            for row in f.readlines():
                train_inds.append(int(row))

        selected = [
            name[:-2] + ".csv" for name in os.listdir(tuning_path)
            if name.endswith(".p")
        ]
        all_names = list(os.listdir(vecs_path))
        # NOTE(review): this overwrites the train_inds just read from file
        # — presumably only one of the two derivations is intended.
        train_inds = [
            all_names.index(item) for item in selected if item.endswith(".csv")
        ]

    for i, filename in enumerate(os.listdir(vecs_path)):
        if filename.endswith(".csv"):
            print(i, filename)
            if i in test_inds:
                # Held-out test sample: skip tuning on it.
                continue
            vecs_filename = os.path.join(vecs_path, filename)
            order_filename = os.path.join(order_path, filename)
            all_vecs = get_attested_order(vecs_filename,
                                          vecs_col=2,
                                          multicols=True)
            emergence_order = get_emergence_order(order_filename,
                                                  vecs_col=2,
                                                  multicols=True)

            # One exemplar-ranking model per candidate s value.
            name_to_model = {val: make_rank_on_exemplar(val) for val in s_vals}

            res = {}
            ll_rand = get_probability_rand(emergence_order)
            print("RANDOM SCORE: ", ll_rand)

            # for name in name_to_model:
            #     print(name)
            #     ranking_type = "local" if name == "Local" else "global"
            #     ll_model, _, _ = get_probability_score(emergence_order, all_vecs, name_to_model[name], ranking_type=ranking_type)
            #     print("MODEL SCORE:", ll_model)
            #     diff = ll_model - ll_rand

            #     print(f"LL ratio {name}: ", diff)
            #     models[name][filename[:-4]] = (diff, len(all_vecs))
            #     res[name] = (diff, len(all_vecs))

            # Score all s values in one call; every model is ranked globally.
            ranking_types = ["global"] * len(s_vals)
            ll_models, model_order = get_probability_score_multi(
                emergence_order, all_vecs, name_to_model, ranking_types)

            for name, ll in zip(model_order, ll_models):
                print(" === MODEL ===")
                print(f"{name}: {ll}")
                # Log-likelihood difference vs. the random baseline.
                diff = ll - ll_rand
                res[name] = (diff, len(all_vecs))
                models[name][filename[:-4]] = (diff, len(all_vecs))

            # Merge this scientist's new results into the previously pickled
            # tuning file, then write the merged dict back.
            with open(f"{tuning_path_2}{filename[:-4]}.p", "rb") as p_file:
                old_info = pickle.load(p_file)
                old_info.update(res)
                print(old_info)

            with open(f"{tuning_path_2}{filename[:-4]}.p", "wb") as p_file:
                pickle.dump(old_info, p_file)

            # NOTE(review): debugging stop — aborts after the first
            # processed file, so the aggregate write below never runs.
            assert False

    with open(f"{out_path}/exemplar-grid.p", "rb") as p_file:
        old_model = pickle.load(p_file)
        old_model.update(models)
        print(old_model)
        # NOTE(review): second debugging stop; the dump below is
        # unreachable, and if ever reached it would write `models`
        # rather than the merged `old_model` — confirm intent.
        assert False
    with open(f"{out_path}/exemplar-grid.p", "wb") as p_file:
        pickle.dump(models, p_file)

    print("train inds")
    print(train_inds)
# Example #3
# 0
def run_cv_s_vals(domain: str, field: str, cv: int) -> None:
    """Grid-search exemplar s values with *cv*-fold cross-validation.

    For each fold, every training-set ``.csv`` of ordered abstract vectors
    is scored against a random baseline for each candidate s, and the
    per-scientist log-likelihood differences are pickled both per-file and
    per-fold.

    Args:
        domain: "turing" selects the Turing-award data paths; any other
            value selects the Nobel paths for *field*.
        field: Nobel field sub-directory (ignored when domain == "turing").
        cv: number of cross-validation folds.
    """
    if domain == "turing":
        out_path = "data/turing_winners/s-vals"
        vecs_path = "data/turing_winners/sbert-abstracts-ordered"
        tuning_path = "data/turing_winners/pickled-tuning/"
    else:
        out_path = f"data/nobel_winners/{field}/s-vals"
        vecs_path = f"data/nobel_winners/{field}/abstracts-ordered"
        tuning_path = f"data/nobel_winners/{field}/pickled-tuning/"

    # Emergence order is read from the same files as the vectors.
    order_path = vecs_path

    # Candidate s grid (intentionally tiny: 0.001 and 0.002).
    s_vals = [0.001 * i for i in range(1, 3)]
    num_samples = len(
        [name for name in os.listdir(vecs_path) if name.endswith(".csv")])
    cv_folds = cv_split_inds(num_samples, cv)

    # Record the fold membership so runs are reproducible.
    with open(f"{out_path}/folds.txt", "w") as inds_f:
        for fold in cv_folds:
            inds_f.write(str(fold) + "\n")

    for fold in range(len(cv_folds)):
        print(f"Fold {fold}")
        # s value -> {scientist stem: mean LL diff vs. random}
        models = {val: {} for val in s_vals}
        test_inds = cv_folds[fold]

        for i, filename in enumerate(os.listdir(vecs_path)):
            # BUG FIX: previously only the print() was guarded by the
            # extension check, so non-.csv directory entries fell through
            # into the CSV readers below (the sibling tuning routine
            # guards its whole loop body the same way this now does).
            if not filename.endswith(".csv"):
                continue
            print(i, filename)
            if i in test_inds:
                # Held-out fold: skip tuning on it.
                continue

            ll_diff = {}
            res = {}

            vecs_filename = os.path.join(vecs_path, filename)
            order_filename = os.path.join(order_path, filename)
            all_vecs = get_attested_order(vecs_filename,
                                          vecs_col=2,
                                          multicols=True)
            emergence_order = get_emergence_order(order_filename,
                                                  vecs_col=2,
                                                  multicols=True)

            # One exemplar-ranking model per candidate s value.
            name_to_model = {val: make_rank_on_exemplar(val) for val in s_vals}

            ll_rand = get_probability_rand(emergence_order)
            print("RANDOM SCORE: ", ll_rand)

            # Score all s values in one call; every model is ranked globally.
            ranking_types = ["global"] * len(s_vals)
            ll_models, model_order = get_probability_score_multi(
                emergence_order, all_vecs, name_to_model, ranking_types)

            for name, ll in zip(model_order, ll_models):
                print(" === MODEL ===")
                print(f"{name}: {ll}")
                # setdefault replaces the manual key-presence check.
                ll_diff.setdefault(name, []).append(ll - ll_rand)

            for name in model_order:
                res[name] = sum(ll_diff[name]) / len(ll_diff[name])
                # Each (fold, filename) pair is visited exactly once, so the
                # old "append if key exists" branch (which would have called
                # .append on a float) was dead code; plain assignment
                # preserves the observed behavior.
                models[name][filename[:-4]] = res[name]

            # Checkpoint the running fold results after every scientist.
            with open(f"{tuning_path}{filename[:-4]}_{fold}.p",
                      "wb") as p_file:
                pickle.dump(models, p_file)

        with open(f"{out_path}/exemplar-grid-fold_{fold}.p", "wb") as p_file:
            pickle.dump(models, p_file)
    # NOTE(review): orphaned tail of a plotting routine (presumably
    # plot_similarity_over_time) whose `def` header was lost in extraction —
    # `time`, `sim`, `names`, and `filename` must be parameters/locals
    # defined in that missing header; confirm against the original source.
    #seaborn.scatterplot(time, sim, hue=names)
    # One regression line per scientist name.
    data = pd.DataFrame(data=np.column_stack((time, sim, names)), columns=["timestep", "sim", "name"])
    fg = seaborn.FacetGrid(data.astype({"timestep": int, "sim": float, "name": str}), hue="name")
    fg.map(seaborn.regplot, "timestep", "sim")
    plt.ylabel("Similarity of emerging papers\nto $S_{0}$", fontsize=30)
    plt.xlabel("Timestep $t$", fontsize=30)
    # x ticks every 5 timesteps, starting at 5.
    plt.xticks([i for i in range(5, max(time), 5)], fontsize=20)
    plt.yticks(fontsize=20)
    plt.legend(fontsize=20)

    # Save both raster and vector versions before showing interactively.
    plt.savefig(filename + ".png")
    plt.savefig(filename + ".eps")  
    plt.show()
    

if __name__ == "__main__":
    # Hand-picked scientist groupings for the similarity plots. Only
    # best_pairwise is consumed below; the others are kept around so a
    # different group can be swapped in quickly.
    best_names = ["Leslie-Lamport", "Donald_E=-Knuth", "Robert_W=-Floyd", "Silvio-Micali", "John_E=-Hopcroft"]
    best_pairwise = ["Geoffrey_E=-Hinton", "Leslie-Lamport", "David_A=-Patterson", "Richard_M=-Karp", "Adi-Shamir"]
    worst_names = ["Adi-Shamir", "Amir-Pnueli", "Tony-Hoare", "Joseph-Sifakis", "David_A=-Patterson"]
    worst_pairwise = ["Juris-Hartmanis", "Robert_E=-Kahn", "John-McCarthy", "Edgar_F=-Codd", "John-Cocke"]
    random_names = ["Donald_E=-Knuth", "Amir-Pnueli", "Yoshua-Bengio", "Ken-Thompson", "John_W=-Backus"]

    # Single-scientist data (loaded but not plotted in this configuration).
    hartmanis_csv = "data/turing_winners/vecs-abstracts-ordered/Juris-Hartmanis.csv"
    emergence_order = get_emergence_order(hartmanis_csv)
    all_vecs = get_attested_order(hartmanis_csv)

    # Plot the similarity curves for the best-pairwise group.
    sim_over_t = get_sim_over_time_multi(best_pairwise, "data/turing_winners/vecs-abstracts-ordered")
    plot_similarity_over_time(sim_over_t, filename="best-p-scientists-sim")


# Example #5
# 0
        if i % 2 == 0:
            texts.append(plt.text(tsne_res[i, 0], tsne_res[i, 1], title, fontsize=5))
    adjust_text(texts)"""
    # NOTE(review): orphaned fragment — the enclosing `def` and the code
    # defining `sc`, `all_vecs`, and `tsne_res` were lost in extraction;
    # confirm against the original source.
    plt.colorbar(sc, pad=0.3)
    plt.axis("off")
    plt.savefig("einstein-vis.png", dpi=500)
    plt.savefig("einstein-vis.eps", dpi=500)

    # NOTE(review): debugging stop — everything below is unreachable as
    # written.
    assert False

    labels = get_attested_order(
        "data/turing_winners/abstracts/geoff/Geoff-Hinton-abstract-vecs-final.csv",
        vecs_col=1,
        label=True)
    emergence_order = get_emergence_order(
        "data/turing_winners/abstracts/geoff/Geoff-Hinton-abstract-vecs-final-ordered.csv",
        vecs_col=2)
    # NOTE(review): this call passes `labels` where other call sites in the
    # file pass only (order, vecs, model) — verify the signature of
    # get_probability_score supports this form.
    _, ranks, tails = get_probability_score(emergence_order,
                                            all_vecs,
                                            labels,
                                            rank_on_1NN,
                                            ranking_type="global",
                                            carry_error=True)
    # Map each ranked vector back to its index in the attested order.
    inds = [all_vecs.index(list(vec)) for vec in ranks]
    plot_links(tsne_res, np.asarray(inds), np.asarray(tails), "1NN-geoff",
               "1NN-geoff")
    # NOTE(review): second debugging stop; the synthetic-data experiment
    # below is also unreachable.
    assert False
    data = gen_random_2d_data(10)
    #data = get_attested_order("data/turing_winners/abstracts/geoff/Geoff-Hinton-abstract-vecs-final.csv", vecs_col=2)

    links_nn, tails_nn = link_with_model(data, rank_on_1NN, "1NN")
        # NOTE(review): orphaned fragment — this line is indented one level
        # deeper than the lines that follow (an IndentationError as written),
        # and `vecs_path`/`individual_path` used below are defined in the
        # missing enclosing scope. Also note results_path ends in ".p" yet is
        # used as a directory in the f-string at the bottom — confirm against
        # the original source.
        results_path = "results/summary/s_opt/cs.p"

    best_s_vals = {}
    for i, filename in enumerate(os.listdir(vecs_path)):
        if filename.endswith(".csv"):
            print(i, filename)

        # NOTE(review): debugging guard — aborts after 21 directory entries.
        if i > 20:
            assert False

        vecs_filename = os.path.join(vecs_path, filename)
        # Order is read from the same file as the vectors.
        order_filename = os.path.join(vecs_path, filename)
        individual_filename = os.path.join(individual_path, filename)

        all_vecs = get_attested_order(vecs_filename, vecs_col=2, multicols=True)
        emergence_order = get_emergence_order(order_filename, vecs_col=2, multicols=True)

        # Skip scientists with too few papers to optimize over.
        if len(all_vecs) < 5:
            continue

        best = optimize_one_scientist(all_vecs, emergence_order)
        print(best)
        best_s_vals[filename[:-4]] = best

        # Per-scientist checkpoint of the best s value.
        with open(f"{individual_path}/{filename[:-4]}.p", "wb") as ind_file:
            pickle.dump(best, ind_file)

    
    # NOTE(review): `filename` here leaks from the loop above, so the
    # aggregate dump is named after whichever file was processed last.
    with open(f"{results_path}/{filename[:-4]}.p", "wb") as res_file:
        pickle.dump(best_s_vals, res_file)