def get_sim_over_time_multi(names: List, filepath: str) -> List:
    all_sim = []
    for filename in os.listdir(filepath):
        if filename[:-4] in names:
            emergence_order = get_emergence_order(os.path.join(filepath, filename))
            all_vecs = get_attested_order(os.path.join(filepath, filename))
            sim_over_t = get_sim_over_time(emergence_order, all_vecs)
            all_sim.extend([(item[0], item[1], filename[:-4]) for item in sim_over_t])
    return all_sim
def test_s_vals(domain: str, field: str) -> None:
    if domain == "turing":
        out_path = "data/turing_winners/s-vals"
        vecs_path = "data/turing_winners/sbert-abstracts-ordered"
        tuning_path = "data/turing_winners/pickled-tuning/"
        tuning_path_2 = "data/turing_winners/pickled-tuning-2/"
    else:
        out_path = f"data/nobel_winners/{field}/s-vals"
        vecs_path = f"data/nobel_winners/{field}/abstracts-ordered"
        tuning_path = f"data/nobel_winners/{field}/pickled-tuning/"
        tuning_path_2 = f"data/nobel_winners/{field}/pickled-tuning-2/"
    order_path = vecs_path
    from_previous_inds = False

    # Grid of sensitivity values for the exemplar model.
    #s_vals = [0.001 * i for i in range(1, 101)]
    s_vals = [0.01 * i for i in range(1, 101)]
    models = {val: {} for val in s_vals}
    num_samples = len(
        [name for name in os.listdir(vecs_path) if name.endswith(".csv")])

    if not from_previous_inds:
        train_inds, test_inds = train_test_split_inds(num_samples, 0.5)
    else:
        # NOTE: this branch expects `inds_path` and a fold index `i` to be
        # supplied by the caller; as written they are unbound.
        train_inds = []
        with open(inds_path + f"train_inds_{i}.txt", "r") as f:
            for row in f.readlines():
                train_inds.append(int(row))

    # Recover the train split from the scientists already pickled during
    # tuning; this overrides the random split above.
    selected = [
        name[:-2] + ".csv" for name in os.listdir(tuning_path)
        if name.endswith(".p")
    ]
    all_names = list(os.listdir(vecs_path))
    train_inds = [
        all_names.index(item) for item in selected if item.endswith(".csv")
    ]

    for i, filename in enumerate(os.listdir(vecs_path)):
        if not filename.endswith(".csv"):
            continue
        print(i, filename)
        if i in test_inds:
            continue
        vecs_filename = os.path.join(vecs_path, filename)
        order_filename = os.path.join(order_path, filename)
        all_vecs = get_attested_order(vecs_filename, vecs_col=2, multicols=True)
        emergence_order = get_emergence_order(order_filename, vecs_col=2, multicols=True)
        name_to_model = {val: make_rank_on_exemplar(val) for val in s_vals}
        res = {}

        ll_rand = get_probability_rand(emergence_order)
        print("RANDOM SCORE: ", ll_rand)

        # Score all s values in one pass instead of looping model by model.
        ranking_types = ["global"] * len(s_vals)
        ll_models, model_order = get_probability_score_multi(
            emergence_order, all_vecs, name_to_model, ranking_types)
        for name, ll in zip(model_order, ll_models):
            print(" === MODEL ===")
            print(f"{name}: {ll}")
            diff = ll - ll_rand
            res[name] = (diff, len(all_vecs))
            models[name][filename[:-4]] = (diff, len(all_vecs))

        # Merge this scientist's results into their existing tuning pickle.
        with open(f"{tuning_path_2}{filename[:-4]}.p", "rb") as p_file:
            old_info = pickle.load(p_file)
        old_info.update(res)
        print(old_info)
        with open(f"{tuning_path_2}{filename[:-4]}.p", "wb") as p_file:
            pickle.dump(old_info, p_file)

    # Merge the per-s-value results into the saved grid (the original had
    # `assert False` debug stops here that left this code dead, and dumped the
    # un-merged `models` dict instead of the merged one).
    with open(f"{out_path}/exemplar-grid.p", "rb") as p_file:
        old_model = pickle.load(p_file)
    old_model.update(models)
    print(old_model)
    with open(f"{out_path}/exemplar-grid.p", "wb") as p_file:
        pickle.dump(old_model, p_file)

    print("train inds")
    print(train_inds)
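# --- Reading aid --------------------------------------------------------------
# `train_test_split_inds` is defined elsewhere in this repo. A minimal sketch of
# the semantics `test_s_vals` appears to assume (a random, disjoint train/test
# partition by proportion). The underscored name and `seed` parameter are
# hypothetical; this is an illustration, not the repo's implementation.
import random

def _train_test_split_inds_sketch(num_samples, train_frac, seed=None):
    rng = random.Random(seed)
    inds = list(range(num_samples))
    rng.shuffle(inds)                       # random permutation of indices
    cutoff = int(train_frac * num_samples)  # e.g. 0.5 -> half train, half test
    return inds[:cutoff], inds[cutoff:]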
def run_cv_s_vals(domain: str, field: str, cv: int) -> None:
    if domain == "turing":
        out_path = "data/turing_winners/s-vals"
        vecs_path = "data/turing_winners/sbert-abstracts-ordered"
        tuning_path = "data/turing_winners/pickled-tuning/"
        tuning_path_2 = "data/turing_winners/pickled-tuning-2/"
    else:
        out_path = f"data/nobel_winners/{field}/s-vals"
        vecs_path = f"data/nobel_winners/{field}/abstracts-ordered"
        tuning_path = f"data/nobel_winners/{field}/pickled-tuning/"
        tuning_path_2 = f"data/nobel_winners/{field}/pickled-tuning-2/"
    order_path = vecs_path

    # Full grids used in earlier runs:
    #s_vals = [0.001 * i for i in range(1, 101)]
    #s_vals = [0.01 * i for i in range(1, 101)]
    s_vals = [0.001 * i for i in range(1, 3)]

    num_samples = len(
        [name for name in os.listdir(vecs_path) if name.endswith(".csv")])
    cv_folds = cv_split_inds(num_samples, cv)
    with open(f"{out_path}/folds.txt", "w") as inds_f:
        for fold in cv_folds:
            inds_f.write(str(fold) + "\n")

    for fold in range(len(cv_folds)):
        print(f"Fold {fold}")
        models = {val: {} for val in s_vals}
        test_inds = cv_folds[fold]
        for i, filename in enumerate(os.listdir(vecs_path)):
            if not filename.endswith(".csv"):
                continue
            print(i, filename)
            if i in test_inds:
                continue
            ll_diff = {}
            res = {}
            vecs_filename = os.path.join(vecs_path, filename)
            order_filename = os.path.join(order_path, filename)
            all_vecs = get_attested_order(vecs_filename, vecs_col=2, multicols=True)
            emergence_order = get_emergence_order(order_filename, vecs_col=2, multicols=True)
            name_to_model = {val: make_rank_on_exemplar(val) for val in s_vals}

            ll_rand = get_probability_rand(emergence_order)
            print("RANDOM SCORE: ", ll_rand)

            ranking_types = ["global"] * len(s_vals)
            ll_models, model_order = get_probability_score_multi(
                emergence_order, all_vecs, name_to_model, ranking_types)
            for name, ll in zip(model_order, ll_models):
                print(" === MODEL ===")
                print(f"{name}: {ll}")
                diff = ll - ll_rand
                ll_diff.setdefault(name, []).append(diff)

            for name in model_order:
                res[name] = sum(ll_diff[name]) / len(ll_diff[name])
                # Each scientist appears once per fold, so plain assignment
                # suffices (the original appended to a scalar on repeats,
                # which would have raised AttributeError).
                models[name][filename[:-4]] = res[name]

            # Checkpoint the fold's running results after each scientist.
            with open(f"{tuning_path}{filename[:-4]}_{fold}.p", "wb") as p_file:
                pickle.dump(models, p_file)

        with open(f"{out_path}/exemplar-grid-fold_{fold}.p", "wb") as p_file:
            pickle.dump(models, p_file)
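# `cv_split_inds` is likewise defined elsewhere; `run_cv_s_vals` treats its
# return value as a list of `cv` folds, each a list of held-out test indices.
# A minimal sketch under that assumption (hypothetical name, illustration only):
def _cv_split_inds_sketch(num_samples, cv, seed=None):
    rng = random.Random(seed)
    inds = list(range(num_samples))
    rng.shuffle(inds)
    # Stride through the shuffled indices to form `cv` roughly equal folds.
    return [inds[k::cv] for k in range(cv)]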
    #seaborn.scatterplot(time, sim, hue=names)
    data = pd.DataFrame(data=np.column_stack((time, sim, names)),
                        columns=["timestep", "sim", "name"])
    fg = seaborn.FacetGrid(data.astype({"timestep": int, "sim": float, "name": str}),
                           hue="name")
    fg.map(seaborn.regplot, "timestep", "sim")
    plt.ylabel("Similarity of emerging papers\nto $S_{0}$", fontsize=30)
    plt.xlabel("Timestep $t$", fontsize=30)
    plt.xticks([i for i in range(5, max(time), 5)], fontsize=20)
    plt.yticks(fontsize=20)
    plt.legend(fontsize=20)
    plt.savefig(filename + ".png")
    plt.savefig(filename + ".eps")
    plt.show()


if __name__ == "__main__":
    best_names = ["Leslie-Lamport", "Donald_E=-Knuth", "Robert_W=-Floyd", "Silvio-Micali", "John_E=-Hopcroft"]
    best_pairwise = ["Geoffrey_E=-Hinton", "Leslie-Lamport", "David_A=-Patterson", "Richard_M=-Karp", "Adi-Shamir"]
    worst_names = ["Adi-Shamir", "Amir-Pnueli", "Tony-Hoare", "Joseph-Sifakis", "David_A=-Patterson"]
    worst_pairwise = ["Juris-Hartmanis", "Robert_E=-Kahn", "John-McCarthy", "Edgar_F=-Codd", "John-Cocke"]
    random_names = ["Donald_E=-Knuth", "Amir-Pnueli", "Yoshua-Bengio", "Ken-Thompson", "John_W=-Backus"]

    emergence_order = get_emergence_order("data/turing_winners/vecs-abstracts-ordered/Juris-Hartmanis.csv")
    all_vecs = get_attested_order("data/turing_winners/vecs-abstracts-ordered/Juris-Hartmanis.csv")

    #sim_over_t = get_sim_over_time(emergence_order, all_vecs)
    #plot_similarity_over_time(sim_over_t)
    sim_over_t = get_sim_over_time_multi(best_pairwise, "data/turing_winners/vecs-abstracts-ordered")
    plot_similarity_over_time(sim_over_t, filename="best-p-scientists-sim")
    if i % 2 == 0:
        texts.append(plt.text(tsne_res[i, 0], tsne_res[i, 1], title, fontsize=5))
adjust_text(texts)"""

plt.colorbar(sc, pad=0.3)
plt.axis("off")
plt.savefig("einstein-vis.png", dpi=500)
plt.savefig("einstein-vis.eps", dpi=500)
raise SystemExit  # debug stop: only the Einstein visualization runs past here

labels = get_attested_order(
    "data/turing_winners/abstracts/geoff/Geoff-Hinton-abstract-vecs-final.csv",
    vecs_col=1,
    label=True)
emergence_order = get_emergence_order(
    "data/turing_winners/abstracts/geoff/Geoff-Hinton-abstract-vecs-final-ordered.csv",
    vecs_col=2)
_, ranks, tails = get_probability_score(
    emergence_order, all_vecs, labels, rank_on_1NN,
    ranking_type="global", carry_error=True)
inds = [all_vecs.index(list(vec)) for vec in ranks]
plot_links(tsne_res, np.asarray(inds), np.asarray(tails), "1NN-geoff", "1NN-geoff")
raise SystemExit  # debug stop left in from the original flow

data = gen_random_2d_data(10)
#data = get_attested_order("data/turing_winners/abstracts/geoff/Geoff-Hinton-abstract-vecs-final.csv", vecs_col=2)
links_nn, tails_nn = link_with_model(data, rank_on_1NN, "1NN")
results_path = "results/summary/s_opt/cs.p" best_s_vals = {} for i, filename in enumerate(os.listdir(vecs_path)): if filename.endswith(".csv"): print(i, filename) if i > 20: assert False vecs_filename = os.path.join(vecs_path, filename) order_filename = os.path.join(vecs_path, filename) individual_filename = os.path.join(individual_path, filename) all_vecs = get_attested_order(vecs_filename, vecs_col=2, multicols=True) emergence_order = get_emergence_order(order_filename, vecs_col=2, multicols=True) if len(all_vecs) < 5: continue best = optimize_one_scientist(all_vecs, emergence_order) print(best) best_s_vals[filename[:-4]] = best with open(f"{individual_path}/{filename[:-4]}.p", "wb") as ind_file: pickle.dump(best, ind_file) with open(f"{results_path}/{filename[:-4]}.p", "wb") as res_file: pickle.dump(best_s_vals, res_file)