def write_item_likelihoods(adj_mat_file, loglik_out_csv, flip_high_ps=False, row_labels=None):
    """
    Reads matrix, fits both graph models, and writes each item's log likelihood
    under each model to loglik_out_csv.
    """
    adj_mat = load_adj_mat(adj_mat_file)
    pi_vector, adj_mat, row_labels = remove_boundary_nodes(adj_mat, flip_high_ps=flip_high_ps,
                                                           orig_row_labels=row_labels)
    graph_models = learn_graph_models(adj_mat, bernoulli=True, pi_vector=pi_vector, exponential=True)

    (loglik_bern, aic_bern, item_LLs_bern) = graph_models['bernoulli'].likelihoods(adj_mat)
    (loglik_exp, aic_exp, item_LLs_exp) = graph_models['exponential'].likelihoods(adj_mat)
    print("bernoulli model: loglikelihood " + str(loglik_bern) + ", aic " + str(aic_bern))
    print("exponential model: loglikelihood " + str(loglik_exp) + ", aic " + str(aic_exp))

    with open(loglik_out_csv, 'w') as fout:
        fout.write("item,loglik_bernoulli,loglik_exponential\n")
        if row_labels is None:
            row_labels = range(adj_mat.shape[0])
        for i, ll_bern in enumerate(item_LLs_bern):
            fout.write(str(row_labels[i]) + "," + str(ll_bern) + "," + str(item_LLs_exp[i]) + "\n")

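
# Usage sketch for write_item_likelihoods (added for illustration; both paths
# below are hypothetical placeholders, not files shipped with this code):
def _demo_write_item_likelihoods():
    # fits both models and writes one CSV row of log likelihoods per item
    write_item_likelihoods("data/bipartite_graph.mtx.gz", "item_logliks.csv")
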
def get_item_likelihoods(adj_mat_file, exponential_model=True, row_labels=None, adj_mat_ready=None):
    """
    Reads matrix, learns a model of the graph, and returns a vector of log likelihoods for items.
    :param adj_mat_file: in matrix market format, optionally compressed (*.mtx.gz)
    :param exponential_model: True for exponential model, False for Bernoulli model
    :param row_labels: optional vector of strings or numbers (item names)
    :param adj_mat_ready: can pass in a sparse adjacency matrix instead of a file path -- see below
    :return: vector of per-item log likelihoods, and the (possibly filtered) row labels
    """
    # allows input in the form of a file path OR an unweighted adj_mat
    if adj_mat_file == "" and adj_mat_ready is not None:
        adj_mat = adj_mat_ready
    else:
        adj_mat = load_adj_mat(adj_mat_file)

    pi_vector, adj_mat, row_labels = remove_boundary_nodes(adj_mat, orig_row_labels=row_labels)
    one_graph_model = learn_graph_models(adj_mat, bernoulli=(not exponential_model), pi_vector=pi_vector,
                                         exponential=exponential_model)

    (tot_loglik, aic, item_LLs) = list(one_graph_model.values())[0].likelihoods(adj_mat)
    print("learned " + list(one_graph_model.keys())[0] + " model. total loglikelihood " + str(tot_loglik) +
          ", aic " + str(aic))
    return item_LLs, row_labels

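
# Usage sketch for get_item_likelihoods (added for illustration). It exercises
# the adj_mat_ready input mode with a tiny hand-built 0/1 matrix; the matrix
# values here are arbitrary.
def _demo_get_item_likelihoods():
    import numpy as np
    from scipy import sparse

    toy = sparse.csc_matrix(np.array([[1, 0, 1, 1],
                                      [1, 1, 0, 0],
                                      [0, 1, 1, 0]]))
    # pass "" as the file argument to use the in-memory matrix instead
    item_LLs, labels = get_item_likelihoods("", adj_mat_ready=toy, exponential_model=False)
    if labels is None:
        labels = range(len(item_LLs))
    for label, ll in zip(labels, item_LLs):
        print(str(label) + ": loglik " + str(ll))
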
def run_and_eval(adj_mat, true_labels_func, method_spec, evals_outfile,
                 pair_scores_outfile=None, mixed_pairs_sims='standard', add_exp_model=False,
                 make_dense=True, prefer_faiss=False, print_timing=False, row_labels=None,
                 pi_vector_to_use=None, flip_high_ps=False,
                 remove_boundary_items=True, remove_boundary_affils=True):
    """
    :param adj_mat: adjacency matrix (sparse, as returned by load_adj_mat)
    :param true_labels_func: identifies the true pairs, given a pairs_generator
    :param method_spec: list of method names OR the string 'all'
    :param evals_outfile: output path; evaluation measures are written as tab-separated (measure, value) lines
    :param pair_scores_outfile: optional output path (*.csv.gz); if given, per-pair scores are saved there
    :param mixed_pairs_sims: 'standard' or a list of similarity values for the mixed_pairs method
    :param add_exp_model: fit the exponential model even if no requested method needs it
    :param make_dense: if False, keep matrix in sparse format; uses less RAM, but far slower
    :param prefer_faiss: when the FAISS library is installed, use it (for the methods implemented in it)
    :param print_timing: print timing information
    :param row_labels: used when adj_mat's indices differ from original row numbers
    :param pi_vector_to_use: precomputed pi_vector, used instead of learning one from adj_mat
    :param flip_high_ps:
    :param remove_boundary_items: passed through to remove_boundary_nodes
    :param remove_boundary_affils: passed through to remove_boundary_nodes
    :return: (no return value; evals are written to evals_outfile)
    """
    # Note on sparse matrices: adj_mat is initially read in as "coo" format (coordinates of entries).
    # The next few operations work by column, so load_adj_mat returns it as "csc" (compressed sparse
    # column). Then adjust_pi_vector converts it to "csr" to make pair generation (row slicing) fast.
    pi_vector, adj_mat, row_labels = remove_boundary_nodes(adj_mat, pi_vector=pi_vector_to_use,
                                                           flip_high_ps=flip_high_ps, orig_row_labels=row_labels,
                                                           remove_boundary_items=remove_boundary_items,
                                                           remove_boundary_affils=remove_boundary_affils)
    if make_dense:
        adj_mat = adj_mat.toarray()

    # score pairs
    # (sending in all special args any methods might need)
    want_exp_model = add_exp_model or ('weighted_corr_exp' in method_spec) or \
                     ('weighted_corr_exp_faiss' in method_spec) or ('all' in method_spec)
    graph_models = learn_graph_models(adj_mat, bernoulli=True, pi_vector=pi_vector, exponential=want_exp_model,
                                      verbose=print_timing, max_iter_biment=50000)

    # In the future (anticipated for scaling up): first, run any methods that return a subset of pairs.
    # scores_subset =
    # Then make pairs_generator use the pairs in scores_subset.

    # Pairs generators. We need:
    # 1. A full version that accesses matrix rows, plus a cheap/efficient version that just gives row indices.
    # 2. To be able to call each of them multiple times (once per method).
    # 3. The full version must be able to take different adj_matrix arguments (transformed matrices).
    # 4. But the row_labels arg should be wrapped into both, right here.
    # functools.partial lets us construct generator functions that are automatically reset w/orig args
    # each time they're called.
    pairs_generator = partial(gen_all_pairs, row_labels=row_labels)
    # ^ a generator function: call it with an adj matrix arg to get a generator object
    pairs_gen_for_labels = partial(ij_gen, adj_mat.shape[0], row_labels)
    # ^ also a generator function: call it w/o args to get a generator object

    # Equivalent to the first partial, but less elegant:
    # def my_pairs_gen(adj_mat):
    #     return gen_all_pairs(adj_mat, row_labels)
    # pairs_generator = my_pairs_gen
    # outfile: even if the caller didn't ask for one, we need a temporary one
    if pair_scores_outfile is None:
        tf = tempfile.NamedTemporaryFile(delete=False, suffix=".csv.gz")
        initial_pairs_outfile = tf.name
        tf.close()
    else:
        initial_pairs_outfile = pair_scores_outfile

    scoring_methods.score_pairs(pairs_generator, adj_mat, method_spec, outfile_csv_gz=initial_pairs_outfile,
                                indices_gen=pairs_gen_for_labels,
                                pi_vector=graph_models['bernoulli'].affil_params,
                                exp_model=graph_models.get('exponential', None),
                                num_docs=adj_mat.shape[0], mixed_pairs_sims=mixed_pairs_sims,
                                print_timing=print_timing, prefer_faiss=prefer_faiss)

    # if scores_subset is not None:
    #     scores_data_frame = pd.merge(scores_subset, scores_data_frame, on=['item1', 'item2'])

    with gzip.open(initial_pairs_outfile, 'r') as fpin:
        scores_data_frame = pd.read_csv(fpin)
    method_names = set(scores_data_frame.columns.tolist()) - {'item1', 'item2'}
    scores_data_frame['label'] = list(map(int, true_labels_func(pairs_gen_for_labels())))

    # round pair scores at 15th decimal place so we don't get spurious diffs in AUCs when replicating
    scores_data_frame = scores_data_frame.round(decimals={method: 15 for method in method_names})

    # save pair scores if desired
    if pair_scores_outfile is not None:
        scores_data_frame = scores_data_frame.reindex(columns=['item1', 'item2', 'label'] +
                                                              sorted(method_names), copy=False)
        scores_data_frame.to_csv(pair_scores_outfile, index=False, compression="gzip")
    else:
        os.remove(initial_pairs_outfile)

    # compute evals and save
    evals = {}
    for method in method_names:
        evals["auc_" + method] = roc_auc_score(y_true=scores_data_frame['label'],
                                               y_score=scores_data_frame[method])

    for model_type, graph_model in graph_models.items():
        (loglik, aic, item_LLs) = graph_model.likelihoods(adj_mat, print_timing=print_timing)
        evals["loglikelihood_" + model_type] = loglik
        evals["akaike_" + model_type] = aic

    evals['constructAllPairsFromMDocs'] = adj_mat.shape[0]   # only correct when we're using all pairs
    evals['numPositives'] = scores_data_frame['label'].sum()
    evals['numAffils'] = adj_mat.shape[1]
    if want_exp_model:
        evals['expModelConverged'] = int(graph_models['exponential'].exp_model_converged)

    with open(evals_outfile, 'w') as fpout:
        print("Saving results to " + evals_outfile)
        for (measure, val) in sorted(evals.items()):
            fpout.write(measure + '\t' + str(val) + '\n')

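
# End-to-end usage sketch for run_and_eval (added for illustration; the input
# path, output paths, and labeling rule are all hypothetical placeholders).
# true_labels_func receives the indices generator and must return one truth
# value per generated pair, in order.
def _demo_run_and_eval():
    adj_mat = load_adj_mat("data/bipartite_graph.mtx.gz")   # hypothetical path

    def toy_labels(pairs_gen):
        # placeholder rule: alternate positive/negative labels; a real function
        # would inspect each generated pair to decide whether it's a true pair
        return [(i % 2 == 0) for i, _ in enumerate(pairs_gen)]

    run_and_eval(adj_mat, true_labels_func=toy_labels,
                 method_spec=['cosine', 'weighted_corr'],
                 evals_outfile="evals.txt",
                 pair_scores_outfile="pair_scores.csv.gz")
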
def score_only(adj_mat_file, method_spec, pair_scores_outfile, flip_high_ps=False, make_dense=True,
               row_labels=None, print_timing=False, learn_exp_model=False, prefer_faiss=True,
               integer_ham_ssize=False):
    """
    :param adj_mat_file: expects a file in matrix market format, optionally gzipped
    :param method_spec: list of method names (see scoring_methods.all_defined_methods for all choices)
    :param pair_scores_outfile: output path, should end in .csv.gz. Each line will contain one pair
           and all their scores.
    :param flip_high_ps:
    :param make_dense: if False, keep matrix in sparse format; uses less RAM, but far slower
    :param row_labels: array of labels, in case 0:(num_rows(adj_mat)-1) isn't their usual naming/numbering
    :param print_timing: print timing information
    :param learn_exp_model: fit and compute likelihoods for the exponential graph model even if not
           using it for scoring
    :param integer_ham_ssize: return hamming (distance) and shared_size as integers (saves space and is
           easier to interpret); by default, both are converted to similarities between 0 and 1
    :param prefer_faiss: when the FAISS library is installed, use it (for the methods implemented in it)
    :return: (no return value; instead, scores are saved to pair_scores_outfile)
    """
    adj_mat = load_adj_mat(adj_mat_file)
    _, adj_mat, row_labels = remove_boundary_nodes(adj_mat, flip_high_ps=flip_high_ps,
                                                   orig_row_labels=row_labels)
    if make_dense:
        adj_mat = adj_mat.toarray()

    want_exp_model = learn_exp_model or ('weighted_corr_exp' in method_spec) or \
                     ('weighted_corr_exp_faiss' in method_spec) or ('all' in method_spec)
    graph_models = learn_graph_models(adj_mat, bernoulli=True, exponential=want_exp_model)
    for model_type, graph_model in graph_models.items():
        (loglik, aic, item_LLs) = graph_model.likelihoods(adj_mat)
        print("loglikelihood " + model_type + ": " + str(loglik))
        print("akaike " + model_type + ": " + str(aic))

    pairs_generator = partial(gen_all_pairs, row_labels=row_labels)
    # ^ a generator function: call it with an adj matrix arg to get a generator object
    indices_generator = partial(ij_gen, adj_mat.shape[0], row_labels)
    # ^ also a generator function: call it w/o args to get a generator object

    scoring_methods.score_pairs(pairs_generator, adj_mat, method_spec, outfile_csv_gz=pair_scores_outfile,
                                pi_vector=graph_models['bernoulli'].affil_params,
                                indices_gen=indices_generator,
                                exp_model=graph_models.get('exponential', None),
                                num_docs=adj_mat.shape[0], mixed_pairs_sims='standard',
                                print_timing=print_timing, prefer_faiss=prefer_faiss,
                                back_compat=integer_ham_ssize)
    print('scored pairs saved to ' + pair_scores_outfile)

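
# Usage sketch for score_only (added for illustration; paths are hypothetical).
# This is the entry point when you only want pair scores, with no evaluation
# against true labels.
def _demo_score_only():
    score_only("data/bipartite_graph.mtx.gz",
               method_spec=['cosine', 'jaccard', 'weighted_corr'],
               pair_scores_outfile="pair_scores.csv.gz",
               integer_ham_ssize=True)   # keep hamming & shared_size as raw integers
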
def resources_test(run_all_implementations=True, use_faiss=False):
    # Read in portions of a big matrix at increasing sizes, and for each size, score all pairs
    # (both sparse and dense). This lets us see how things scale and where memory limits come in.
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # Sizes tried previously: (100, 1000, 10000, 100000). At 10000 rows, no run finished
    # in the length of time I was willing to wait.
    num_nodes = (100, 500, 1000, 5000)
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(pi_vector_learned, adj_mat)

        # (order given here doesn't matter)
        methods_to_run = ['cosine', 'cosineIDF',
                          # use fast "transform"
                          'shared_size', 'adamic_adar', 'newman', 'shared_weight11',
                          # medium
                          'hamming', 'pearson', 'jaccard',
                          # WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
                          'weighted_corr', 'weighted_corr_exp',
                          # only have slow "terms" method
                          'shared_weight1100', 'mixed_pairs']

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\ndense version takes up " + str(sys.getsizeof(adj_mat_preproc_dense)) + " bytes")

        want_exp_model = ('weighted_corr_exp' in methods_to_run) or \
                         ('weighted_corr_exp_faiss' in methods_to_run) or ('all' in methods_to_run)
        start = timer()
        graph_models = bipartite_fitting.learn_graph_models(adj_mat, bernoulli=False, pi_vector=None,
                                                            exponential=want_exp_model)
        end = timer()
        if want_exp_model:
            print("time for learning exponential model: " + str(end - start) + " seconds")

        start = timer()
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs, adj_mat_preproc_dense,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc, back_compat=True,
                                               num_docs=adj_mat_preproc.shape[0],
                                               mixed_pairs_sims=[.01], print_timing=True,
                                               exp_model=graph_models.get('exponential', None),
                                               run_all_implementations=run_all_implementations,
                                               prefer_faiss=use_faiss)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " +
              str(adj_mat_preproc.shape[1]) + " affils, ")
        print("ran all methods using dense matrix in " + str(end - start) + " seconds")

        print("\nsparse adj_matrix takes up " + str(asizeof.asizeof(adj_mat_preproc)) + " bytes;")
        start = timer()
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs, adj_mat_preproc,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc, back_compat=True,
                                               num_docs=adj_mat_preproc.shape[0],
                                               mixed_pairs_sims=[.01], print_timing=True,
                                               exp_model=graph_models.get('exponential', None),
                                               run_all_implementations=run_all_implementations)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " +
              str(adj_mat_preproc.shape[1]) + " affils, ")
        print("ran all methods using sparse matrix in " + str(end - start) + " seconds")
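

if __name__ == '__main__':
    # minimal way to run the benchmark (a sketch; assumes the Brightkite file
    # path hard-coded in resources_test() exists on this machine)
    resources_test(run_all_implementations=False, use_faiss=False)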