def test_adj_and_phi():
    """
    Check that learning pi_vector, preprocessing it, and flipping it reproduce what the R code produced.

    Input adjacency matrix: 'ng_aa_data1/data15_adj_mat.mtx.gz'.
    Reference files (all under 'ng_aa_data1/', prefix 'data15'):
        .dataphi.txt.gz, .dataphipreproc.txt.gz, .dataphiflipped.txt.gz, .adj_mat_flipped.mtx.gz

    Raises AssertionError as soon as any comparison fails.
    """

    def largest_abs_gap(ours, reference):
        # Largest element-wise absolute difference between two pi vectors.
        return max(abs(ours - reference))

    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector ***\n")

    # The "data15_*" example files hold the contents of the expt data file alt.atheism/data1.Rdata.
    # Note: "ng_aa_data1/data15_phi.txt.gz" (also from data1.Rdata) is the phi of the whole, larger data set;
    # it is deliberately not loaded or compared here.
    # The reference vectors below were constructed by hand, using the existing R code in experimentRunner.R.
    learned_from_R = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphi.txt.gz")
    preproc_from_R = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphipreproc.txt.gz")

    adj_mat = score_data.load_adj_mat("ng_aa_data1/data15_adj_mat.mtx.gz")
    learned = score_data.learn_pi_vector(adj_mat)
    preproc, preproc_adj = expts_labeled_data.adjust_pi_vector(learned, adj_mat)

    # Why the 1e-07 tolerance: R saved the floating point data with 7 digits of precision (see
    # getOptions("digits") and format()), so the phi files would need re-converting at higher precision before
    # a stricter comparison could work. Raising the precision also exposed imprecision in the R code itself:
    # its Bernoulli <-> multinomial conversion does not keep the exact probabilities (a possible bug -- the
    # other place the code does this conversion explicitly fixes that).

    # Learned vector: one entry per column of adj_mat, and equal to the R version.
    assert learned.shape[0] == adj_mat.shape[1]
    assert largest_abs_gap(learned, learned_from_R) < 1e-07

    # Preprocessed vector: one entry per column of the preprocessed matrix, and equal to the R version.
    assert preproc.shape[0] == preproc_adj.shape[1]
    assert largest_abs_gap(preproc, preproc_from_R) < 1e-07

    # Flipping: both the flipped vector and the flipped matrix must agree with the R output.
    flipped_from_R = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphiflipped.txt.gz")
    flipped_adj_from_R = score_data.load_adj_mat(
        "ng_aa_data1/data15.adj_mat_flipped.mtx.gz")
    flipped, flipped_adj = expts_labeled_data.adjust_pi_vector(
        learned, adj_mat, flip_high_ps=True)

    assert flipped.shape == preproc.shape
    assert largest_abs_gap(flipped, flipped_from_R) < 1e-07
    assert flipped_adj_from_R.shape == flipped_adj.shape
    assert abs(flipped_adj_from_R - flipped_adj).max() < 1e-07
def test_score_wc_faiss():
    """
    Smoke-run the faiss-based scorers on 'reality_appweek_50/data50_adjMat.mtx.gz' and print a sample of each
    result. Nothing is asserted; the test passes if every call completes.
    """

    def show_sample(frame):
        # Print the first few rows of a scores data frame.
        print('scores look like (sample):\n' + str(frame.head()))

    raw_adj_mat = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    learned_pi = score_data.learn_pi_vector(raw_adj_mat)
    pi_vector, adj_mat = expts_labeled_data.adjust_pi_vector(learned_pi, raw_adj_mat)

    wc_scores = scoring_with_faiss.score_pairs_faiss(
        adj_mat,
        which_methods=['weighted_corr_faiss'],
        how_many_neighbors=-1,
        print_timing=True,
        pi_vector=pi_vector)
    show_sample(wc_scores)
    # Note for later: wc_scores.reset_index() makes it save item1 & item2 as regular columns, and defaults
    # back to an index of row numbers.

    print("calling adamic-adar")
    exact_scores = scoring_with_faiss.score_pairs_faiss_all_exact(
        adj_mat, 'adamic_adar_faiss', pi_vector=pi_vector, num_docs=adj_mat.shape[0])
    show_sample(exact_scores)

    print("calling pearson")
    exact_scores = scoring_with_faiss.score_pairs_faiss_all_exact(adj_mat, 'pearson_faiss')
    show_sample(exact_scores)

    # Same method again, this time handing over the matrix converted with .toarray().
    print("(dense input)")
    exact_scores = scoring_with_faiss.score_pairs_faiss_all_exact(adj_mat.toarray(), 'pearson_faiss')
    show_sample(exact_scores)
def case1_no_bdry_nodes(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 1: score the adjacency matrix as loaded, leaving run_and_eval's boundary-handling options at their
    defaults, then compare the resulting evaluations to a reference file.

    :param adj_mat_infile: path of the adjacency matrix to load
    :param results_dir: directory in which "evals-case1.txt" is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 1\n")
    graph = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case1.txt"

    eval_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
    )
    score_data.run_and_eval(graph, **eval_options)

    compare_auc_files(evals_path, aucs_file_to_match)
def test_faiss_plus_normal():
    """
    Run score_data.run_and_eval on 'reality_appweek_50/data50_adjMat.mtx.gz' with one regular method
    ('weighted_corr') and one faiss method ('weighted_corr_faiss') in the same call.

    Writes evaluations to 'reality_appweek_50/python-out/evals-test.txt' and pair scores to
    'reality_appweek_50/tmp.scoredPairs.csv.gz'. Nothing is asserted here.
    """
    graph = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")

    eval_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        # method_spec="all" is the alternative to the explicit pair of methods used here
        method_spec=['weighted_corr', 'weighted_corr_faiss'],
        evals_outfile="reality_appweek_50/python-out/evals-test.txt",
        pair_scores_outfile='reality_appweek_50/tmp.scoredPairs.csv.gz',
        print_timing=True,
    )
    score_data.run_and_eval(graph, **eval_options)
def case4_0item_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 4: keep the natural all-0 item, but tell the program to remove boundary affils.

    :param adj_mat_infile: path of the adjacency matrix to load
    :param results_dir: directory in which "evals-case4.txt" is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 4\n")
    graph = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case4.txt"

    eval_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,  # the all-0 item stays in
        remove_boundary_affils=True,
    )
    score_data.run_and_eval(graph, **eval_options)

    compare_auc_files(evals_path, aucs_file_to_match)
def case5_0item_keep_0affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 5: keep boundary items and boundary affils, with the affils restricted to the all-0 kind.

    :param adj_mat_infile: path of the adjacency matrix to load
    :param results_dir: directory in which "evals-case5.txt" is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 5\n")
    graph = score_data.load_adj_mat(adj_mat_infile)

    # Only all-0 affils are wanted, so column 115 is overwritten with 0's.
    graph[:, 115] = 0

    evals_path = results_dir + "/evals-case5.txt"
    eval_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(graph, **eval_options)

    compare_auc_files(evals_path, aucs_file_to_match)
def case3_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 3: keep the natural all-0 and all-1 affils, while item 26 still stays out.

    :param adj_mat_infile: path of the adjacency matrix to load
    :param results_dir: directory in which "evals-case3.txt" is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 3\n")
    graph = score_data.load_adj_mat(adj_mat_infile)

    # Item 26 should stay out, so its whole row is zeroed. remove_boundary_items is not passed below,
    # so run_and_eval's default applies to it.
    graph[26, :] = 0

    evals_path = results_dir + "/evals-case3.txt"
    eval_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(graph, **eval_options)

    compare_auc_files(evals_path, aucs_file_to_match)
def case6_0item_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 6: score the matrix exactly the way it comes -- with all-0 and all-1 affils, and an item that is
    all-0 once the all-1 affil is gone.

    Note: that all-0 item (an induced boundary node) can't be handled quite correctly by the exp model. It
    works out well enough, because the item ends up with a parameter very close to zero.

    :param adj_mat_infile: path of the adjacency matrix to load
    :param results_dir: directory in which "evals-case6.txt" is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 6\n")
    graph = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case6.txt"

    eval_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(graph, **eval_options)

    compare_auc_files(evals_path, aucs_file_to_match)
def demo_run_and_eval(adj_mat_infile, pair_scores_outfile, evals_outfile, prefer_faiss=False):
    """
    Load an adjacency matrix and pass it to score_data.run_and_eval with method_spec="all".

    :param adj_mat_infile: path of the adjacency matrix to load
    :param pair_scores_outfile: forwarded to run_and_eval as pair_scores_outfile
    :param evals_outfile: forwarded to run_and_eval as evals_outfile
    :param prefer_faiss: forwarded to run_and_eval unchanged
    """
    graph = score_data.load_adj_mat(adj_mat_infile)

    eval_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec="all",
        evals_outfile=evals_outfile,
        pair_scores_outfile=pair_scores_outfile,
        print_timing=True,
        prefer_faiss=prefer_faiss,
    )
    score_data.run_and_eval(graph, **eval_options)
def case8_01items_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 8: append an almost-all-1 item to the matrix, keep boundary items, remove boundary affils.

    The extra item is added as one new last row, set to 1 in every column whose original column sum is > 0.
    Columns that were all-0 therefore stay all-0.

    The new shape and row index are derived from the loaded matrix rather than hard-coded. For the
    75x206 matrix this case was written for, the result is the same 76x206 matrix with the new item in row 75.

    :param adj_mat_infile: path of the adjacency matrix to load
    :param results_dir: directory in which "evals-case8.txt" is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 8\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    # Column sums must be taken before the resize, while the matrix still holds only the original items.
    affil_degrees = np.asarray(adj_mat.sum(axis=0)).squeeze()

    num_items, num_affils = adj_mat.shape
    adj_mat.resize((num_items + 1, num_affils))  # grow by exactly one row, whatever the input size
    adj_mat[num_items, affil_degrees > 0] = 1  # new almost-all-1 item (preserves orig all-0 affils)

    new_evals_file = results_dir + "/evals-case8.txt"
    score_data.run_and_eval(adj_mat,
                            true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
                            method_spec=['weighted_corr', 'weighted_corr_exp'],
                            evals_outfile=new_evals_file,
                            pair_scores_outfile=None,
                            print_timing=True,
                            remove_boundary_items=False,
                            remove_boundary_affils=True)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def case9_01items_keep_all(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 9: append an almost-all-1 item to the matrix and keep all boundary items and boundary affils.

    The extra item is added as one new last row, set to 1 in every column whose original column sum is > 0.
    Columns that were all-0 therefore stay all-0.

    The new shape and row index are derived from the loaded matrix rather than hard-coded. For the
    75x206 matrix this case was written for, the result is the same 76x206 matrix with the new item in row 75.

    Note: similar to case 6, the all-1 item (induced boundary node) can't be handled by the exp model. There
    is no max likelihood solution for this graph. In practice the algorithm times out -- but even if it ran
    longer, there's no good solution to converge to. The parameter for that item needs to be near-infinity,
    but not infinity.

    :param adj_mat_infile: path of the adjacency matrix to load
    :param results_dir: directory in which "evals-case9.txt" is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 9\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    # Column sums must be taken before the resize, while the matrix still holds only the original items.
    affil_degrees = np.asarray(adj_mat.sum(axis=0)).squeeze()

    num_items, num_affils = adj_mat.shape
    adj_mat.resize((num_items + 1, num_affils))  # grow by exactly one row, whatever the input size
    adj_mat[num_items, affil_degrees > 0] = 1  # new almost-all-1 item (preserves orig all-0 affils)

    new_evals_file = results_dir + "/evals-case9.txt"
    score_data.run_and_eval(adj_mat,
                            true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
                            method_spec=['weighted_corr', 'weighted_corr_exp'],
                            evals_outfile=new_evals_file,
                            pair_scores_outfile=None,
                            print_timing=True,
                            remove_boundary_items=False,
                            remove_boundary_affils=False)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def test_only_wc(adj_mat_infile, scored_pairs_file_R, scored_pairs_file_new):
    """
    Like test_pair_scores_against_R(), but checks the scores & timing of simple_only_weighted_corr().
    (This was the first scoring method implemented using a transform of the adj_matrix.)

    :param adj_mat_infile: local path ending in .mtx.gz
    :param scored_pairs_file_R: local path ending in .csv.gz; reference scores, read from its
                                "pearsonWeighted" column
    :param scored_pairs_file_new: path the newly computed scores are written to (gzipped csv) and then read
                                  back from
    """
    print(
        "\n*** Checking simple_only_weighted_corr against scores from R ***\n")

    # Read adj data and prep pi_vector
    raw_adj = score_data.load_adj_mat(adj_mat_infile)
    learned_pi = score_data.learn_pi_vector(raw_adj)
    pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(learned_pi, raw_adj)

    def item_pair_ids():
        # Generator factory handed to to_csv_gz().
        return score_data.ij_gen(adj_mat_preproc.shape[0])

    # Score every pair into a freshly created "weighted_corr" array, then dump the storage to disk.
    storage = magic_dictionary.make_me_a_dict(adj_mat_preproc.shape[0])
    wc_array = storage.create_and_store_array("weighted_corr")
    scoring_methods.extra_implementations.simple_only_weighted_corr(
        score_data.gen_all_pairs,
        adj_mat_preproc,
        wc_array,
        pi_vector_preproc,
        print_timing=True)
    storage.to_csv_gz(scored_pairs_file_new, item_pair_ids)

    # Read both score files back and compare the two columns row by row.
    with gzip.open(scored_pairs_file_new, 'r') as new_handle:
        new_frame = pd.read_csv(new_handle)
    with gzip.open(scored_pairs_file_R, 'r') as ref_handle:
        ref_frame = pd.read_csv(ref_handle)

    abs_gap = abs(new_frame["weighted_corr"] - ref_frame["pearsonWeighted"])
    print("max diff: " + str(abs_gap.max()))
    assert max(abs_gap) < 1e-05
def test_adj_and_phi2():
    """
    Read an adjacency matrix and check that pi_vector can be learned for a second data set.

    Files used: "reality_appweek_50/data50_adjMat.mtx.gz" (input) and
    "reality_appweek_50/data50-inference-allto6.phi.csv.gz" (reference phi).

    Raises AssertionError if the preprocessed vector is not within 1e-07 of the reference.
    """
    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector (2) ***\n"
    )

    # Deliberately something other than newsgroups: those are too complicated because they were run early.
    # Goal: learn phi from the adjacency matrix and end up with the version stored in the inference file.
    reference_pi = expts_labeled_data.load_pi_from_file(
        "reality_appweek_50/data50-inference-allto6.phi.csv.gz")

    adj_mat = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    learned_pi = score_data.learn_pi_vector(adj_mat)
    preproc_pi, _preproc_adj = expts_labeled_data.adjust_pi_vector(learned_pi, adj_mat)

    # The preprocessed vector is the one expected to match the reference.
    largest_gap = max(abs(preproc_pi - reference_pi))
    assert largest_gap < 1e-07
def test_faiss_basic_calls():
    """
    Smoke test of the basic faiss calls (build an inner-product index, add rows, search) on
    'reality_appweek_50/data50_adjMat.mtx.gz'. Nothing is asserted; success means every call ran.
    """
    raw_adj = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    learned_pi = score_data.learn_pi_vector(raw_adj)
    _pi_vector, adj_mat = expts_labeled_data.adjust_pi_vector(learned_pi, raw_adj)

    # A dot product on the plain adj matrix just computes sharedSize.
    num_cols = adj_mat.shape[1]
    index = faiss.IndexFlatIP(num_cols)  # takes numCols as arg

    # adj_mat is sparse, but faiss wants dense -- and, apparently, wants float32.
    # (The tutorial example this mimics added np.random.random((100, num_cols)).astype('float32') instead.)
    dense_rows = adj_mat.toarray().astype('float32')
    index.add(dense_rows)
    print("index.is_trained: " + str(index.is_trained) + ", index.total: " +
          str(index.ntotal))

    # First the 10 nearest neighbors of each input row, then as many neighbors as there are rows (all pairs).
    _distances10, _neighbors10 = index.search(dense_rows, 10)
    _distances_all, _neighbors_all = index.search(dense_rows, dense_rows.shape[0])

    print('basic calls ran')
def test_pair_scores_against_R(adj_mat_infile,
                               scored_pairs_file_R,
                               scored_pairs_file_new,
                               make_dense=False,
                               flip_high_ps=False,
                               run_all=0,
                               prefer_faiss=False):
    """
    Starting from an adj matrix, score pairs (using the current implementation) and compare to a reference
    file run from R. Similar contents to score_data.run_and_eval().

    :param adj_mat_infile: path of the adjacency matrix to load
    :param scored_pairs_file_R: gzipped csv of reference scores produced by R
    :param scored_pairs_file_new: path the new scores are written to (gzipped csv) and then read back from
    :param make_dense: if true, the preprocessed matrix is converted with .toarray() before scoring
    :param flip_high_ps: forwarded to expts_labeled_data.adjust_pi_vector()
    :param run_all: set to 2 (or 1) to run and time all (or more) implementations. However, we only look at
                    the scores of the last one.
    :param prefer_faiss: forwarded to scoring_methods.score_pairs(); also loosens the comparison tolerance
                         to 1e-04 for every method
    :return: data frame read from scored_pairs_file_new, with a 'label' column added
    """
    print("\n*** Testing scores computed for pairs ***\n")
    print("Adj matrix infile: " + adj_mat_infile +
          "; scored pairs reference file: " + scored_pairs_file_R)

    # Read adj data and prep pi_vector
    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    learned_pi = score_data.learn_pi_vector(adj_mat)
    pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        learned_pi, adj_mat, flip_high_ps=flip_high_ps)

    methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'weighted_corr', 'shared_weight11', 'shared_weight1100', 'adamic_adar',
        'newman', 'mixed_pairs'
    ]
    mixed_pairs_sims = [.01, .001]

    # Timed section: optional densify, scoring, reading the scores back, and labeling.
    start = timer()
    if make_dense:
        adj_mat_preproc = adj_mat_preproc.toarray()
    scoring_methods.score_pairs(
        score_data.gen_all_pairs,
        adj_mat_preproc,
        which_methods=methods_to_run,
        outfile_csv_gz=scored_pairs_file_new,
        pi_vector=pi_vector_preproc,
        back_compat=True,
        num_docs=adj_mat_preproc.shape[0],
        mixed_pairs_sims=mixed_pairs_sims,
        print_timing=True,
        run_all_implementations=run_all,
        prefer_faiss=prefer_faiss)
    with gzip.open(scored_pairs_file_new, 'r') as new_handle:
        scores_data_frame = pd.read_csv(new_handle)
    scores_data_frame['label'] = expts_labeled_data.get_true_labels_expt_data(
        num_true_pairs=5,
        pairs_generator=score_data.gen_all_pairs(adj_mat_preproc))
    end = timer()

    # 'mixed_pairs' is counted once per entry of mixed_pairs_sims, i.e. (len - 1) on top of the list length.
    num_methods = len(methods_to_run)
    if 'mixed_pairs' in methods_to_run:
        num_methods += len(mixed_pairs_sims) - 1
    variants_note = "(plus variants) " if run_all else ""
    # Pair count is taken from the row count of the matrix as loaded (adj_mat), not the preprocessed one.
    num_pairs = adj_mat.shape[0] * (adj_mat.shape[0] - 1) / float(2)
    print("ran " + str(num_methods) + " methods " + variants_note + "on " +
          str(num_pairs) + " pairs")
    print("num seconds: " + str(end - start))

    # Read scores from R and compare
    with gzip.open(scored_pairs_file_R, 'r') as ref_handle:
        scores_data_frame_R = pd.read_csv(ref_handle)

    our_columns = list(scores_data_frame)
    for R_method, our_method in mapping_from_R_methods.items():
        if our_method not in our_columns:
            continue

        print("Checking " + our_method)
        # R data doesn't have item numbers, but is in the same all-pairs order as ours
        abs_gap = abs(scores_data_frame[our_method] -
                      scores_data_frame_R[R_method])
        print("max diff: " + str(abs_gap.max()))

        # Sadly, the p_i vectors are off by a smidgen, so anything that uses them can differ too.
        # sharedWeight11 vals differed by > 1e-06, and that was with only 65 affils.
        if prefer_faiss:
            tolerance = 1e-04
        elif our_method in our_pi_methods:
            tolerance = 1e-05
        else:
            tolerance = 1e-10
        assert max(abs_gap) < tolerance

    return scores_data_frame