Example #1
def write_item_likelihoods(adj_mat_file,
                           loglik_out_csv,
                           flip_high_ps=False,
                           row_labels=None):
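    """
    Reads matrix, learns Bernoulli and exponential models of the graph, writes per-item log likelihoods to CSV.
    :param adj_mat_file: in matrix market format, optionally compressed (*.mtx.gz)
    :param loglik_out_csv: output path; one line per item with its log likelihood under each model
    :param flip_high_ps:
    :param row_labels: optional vector of strings or numbers (item names)
    """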
    adj_mat = load_adj_mat(adj_mat_file)
    pi_vector, adj_mat, row_labels = remove_boundary_nodes(
        adj_mat, flip_high_ps=flip_high_ps, orig_row_labels=row_labels)

    graph_models = learn_graph_models(adj_mat,
                                      bernoulli=True,
                                      pi_vector=pi_vector,
                                      exponential=True)

    (loglik_bern, aic_bern,
     item_LLs_bern) = graph_models['bernoulli'].likelihoods(adj_mat)
    (loglik_exp, aic_exp,
     item_LLs_exp) = graph_models['exponential'].likelihoods(adj_mat)
    print("bernoulli model: loglikelihood " + str(loglik_bern) + ", aic " +
          str(aic_bern))
    print("exponential model: loglikelihood " + str(loglik_exp) + ", aic " +
          str(aic_exp))

    with open(loglik_out_csv, 'w') as fout:
        fout.write("item,loglik_bernoulli,loglik_exponential\n")
        if row_labels is None:
            row_labels = range(adj_mat.shape[0])
        for i, (ll_bern, ll_exp) in enumerate(zip(item_LLs_bern,
                                                  item_LLs_exp)):
            fout.write(str(row_labels[i]) + "," + str(ll_bern) + "," +
                       str(ll_exp) + "\n")
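
A minimal usage sketch (the file names below are placeholders, not from the source):

write_item_likelihoods(adj_mat_file="my_graph.mtx.gz",
                       loglik_out_csv="item_logliks.csv")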
Example #2
def get_item_likelihoods(adj_mat_file,
                         exponential_model=True,
                         row_labels=None,
                         adj_mat_ready=None):
    """
    Reads matrix, learns model of graph, returns vector of log likelihoods for items.
    :param adj_mat_file: in matrix market format, optionally compressed (*.mtx.gz)
    :param exponential_model: True for exponential model, False for Bernoulli model
    :param row_labels: optional vector of strings or numbers (item names)
    :param adj_mat_ready: Can pass in a sparse adjacency matrix instead of a file path -- see below.
    :return: (item_LLs, row_labels) -- per-item log likelihoods plus the labels kept after boundary-node removal
    """
    # allows input in the form of a file path OR an unweighted adj_mat
    if adj_mat_file == "" and adj_mat_ready is not None:
        adj_mat = adj_mat_ready
    else:
        adj_mat = load_adj_mat(adj_mat_file)

    pi_vector, adj_mat, row_labels = remove_boundary_nodes(
        adj_mat, orig_row_labels=row_labels)

    one_graph_model = learn_graph_models(adj_mat,
                                         bernoulli=(not exponential_model),
                                         pi_vector=pi_vector,
                                         exponential=exponential_model)
    (tot_loglik, aic,
     item_LLs) = list(one_graph_model.values())[0].likelihoods(adj_mat)
    print("learned " + list(one_graph_model.keys())[0] +
          " model. total loglikelihood " + str(tot_loglik) + ", aic " +
          str(aic))

    return item_LLs, row_labels
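
Both calling conventions from the docstring, as a hedged sketch (the path is a placeholder and the random matrix just stands in for real data):

# from a file on disk
item_LLs, labels = get_item_likelihoods("my_graph.mtx.gz")

# from an already-loaded sparse matrix, passing "" for the file path
import scipy.sparse
adj = scipy.sparse.random(20, 50, density=0.1, format='csr')
adj.data[:] = 1  # make it a binary (unweighted) adjacency matrix
item_LLs, labels = get_item_likelihoods("", adj_mat_ready=adj)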
Example #3
def run_and_eval(adj_mat,
                 true_labels_func,
                 method_spec,
                 evals_outfile,
                 pair_scores_outfile=None,
                 mixed_pairs_sims='standard',
                 add_exp_model=False,
                 make_dense=True,
                 prefer_faiss=False,
                 print_timing=False,
                 row_labels=None,
                 pi_vector_to_use=None,
                 flip_high_ps=False,
                 remove_boundary_items=True,
                 remove_boundary_affils=True):
    """
    :param adj_mat:
    :param true_labels_func: identifies the true pairs, given a pairs_generator
    :param method_spec: list of method names OR the string 'all'
    :param evals_outfile:
    :param pair_scores_outfile:
    :param mixed_pairs_sims:
    :param add_exp_model:
    :param make_dense:
    :param prefer_faiss:
    :param print_timing:
    :param row_labels: used when adj_mat's indices differ from original row numbers
    :param pi_vector_to_use:
    :param flip_high_ps:
    :param remove_boundary_affils:
    :param remove_boundary_items:
    :return:
    """

    # note on sparse matrices: adj_mat is initially read in as "coo" format (coordinates of entries). Next few operations
    # will be by column, so it's returned from load_adj_mat as "csc" (compressed sparse column). Then, converted to
    # "csr" in adjust_pi_vector to make pair generation (row slicing) fast.

    pi_vector, adj_mat, row_labels = remove_boundary_nodes(
        adj_mat,
        pi_vector=pi_vector_to_use,
        flip_high_ps=flip_high_ps,
        orig_row_labels=row_labels,
        remove_boundary_items=remove_boundary_items,
        remove_boundary_affils=remove_boundary_affils)
    if make_dense:
        adj_mat = adj_mat.toarray()

    # score pairs
    # (sending in all special args any methods might need)

    want_exp_model = add_exp_model or ('weighted_corr_exp' in method_spec) or \
                     ('weighted_corr_exp_faiss' in method_spec) or ('all' in method_spec)
    graph_models = learn_graph_models(adj_mat,
                                      bernoulli=True,
                                      pi_vector=pi_vector,
                                      exponential=want_exp_model,
                                      verbose=print_timing,
                                      max_iter_biment=50000)

    # In the future (anticipated for scaling up): first, run any methods that return a subset of pairs.
    # scores_subset =
    # Then make pairs_generator use the pairs in scores_subset.

    # Pairs generators. We need:
    # 1. Full version that accesses matrix rows, and cheap/efficient version that just gives row indices.
    # 2. To be able to call each of them multiple times (once per method).
    # 3. Full version must be able to take different adj_matrix arguments (transformed matrices).
    # 4. But row_labels arg should be wrapped into both, right here.

    # functools.partial lets us construct generators that are automatically reset w/orig args when called again.
    pairs_generator = partial(
        gen_all_pairs, row_labels=row_labels
    )  # this is a generator function. Call it with an arg to get generator object.
    pairs_gen_for_labels = partial(
        ij_gen, adj_mat.shape[0], row_labels
    )  # this too is a generator function. Call it w/o args to get generator object.
    # equivalent, but less elegant than functools.partial
    # def my_pairs_gen(adj_mat):
    #     return gen_all_pairs(adj_mat, row_labels)
    # pairs_generator = my_pairs_gen      # this is a generator function. Call it with an arg to get generator object.

    # outfile: even if caller didn't ask for one, we need a temporary one
    if pair_scores_outfile is None:
        tf = tempfile.NamedTemporaryFile(delete=False, suffix=".csv.gz")
        initial_pairs_outfile = tf.name
        tf.close()
    else:
        initial_pairs_outfile = pair_scores_outfile

    scoring_methods.score_pairs(
        pairs_generator,
        adj_mat,
        method_spec,
        outfile_csv_gz=initial_pairs_outfile,
        indices_gen=pairs_gen_for_labels,
        pi_vector=graph_models['bernoulli'].affil_params,
        exp_model=graph_models.get('exponential', None),
        num_docs=adj_mat.shape[0],
        mixed_pairs_sims=mixed_pairs_sims,
        print_timing=print_timing,
        prefer_faiss=prefer_faiss)
    # if scores_subset is not None:
    #     scores_data_frame = pd.merge(scores_subset, scores_data_frame, on=['item1', 'item2'])

    with gzip.open(initial_pairs_outfile, 'r') as fpin:
        scores_data_frame = pd.read_csv(fpin)

    method_names = set(scores_data_frame.columns.tolist()) - {'item1', 'item2'}
    scores_data_frame['label'] = list(
        map(int, true_labels_func(pairs_gen_for_labels())))

    # round pair scores at 15th decimal place so we don't get spurious diffs in AUCs when replicating
    scores_data_frame = scores_data_frame.round(
        decimals={method: 15
                  for method in method_names})

    # save pair scores if desired
    if pair_scores_outfile is not None:
        scores_data_frame = scores_data_frame.reindex(
            columns=['item1', 'item2', 'label'] + sorted(method_names),
            copy=False)
        scores_data_frame.to_csv(pair_scores_outfile,
                                 index=False,
                                 compression="gzip")
    else:
        os.remove(initial_pairs_outfile)

    # compute evals and save
    evals = {}
    for method in method_names:
        evals["auc_" + method] = roc_auc_score(
            y_true=scores_data_frame['label'],
            y_score=scores_data_frame[method])

    for model_type, graph_model in list(graph_models.items()):
        (loglik, aic,
         item_LLs) = graph_model.likelihoods(adj_mat,
                                             print_timing=print_timing)
        evals["loglikelihood_" + model_type] = loglik
        evals["akaike_" + model_type] = aic

    # only correct when we're using all pairs
    evals['constructAllPairsFromMDocs'] = adj_mat.shape[0]
    evals['numPositives'] = scores_data_frame['label'].sum()
    evals['numAffils'] = adj_mat.shape[1]
    if want_exp_model:
        evals['expModelConverged'] = int(
            graph_models['exponential'].exp_model_converged)

    with open(evals_outfile, 'w') as fpout:
        print("Saving results to " + evals_outfile)
        for (measure, val) in sorted(evals.items()):
            fpout.write(measure + '\t' + str(val) + '\n')
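
A hedged usage sketch; the path is a placeholder, and the toy labeling function exists only to exercise the calling convention (true_labels_func receives the indices generator and must return one truth value per pair):

def toy_labels(pairs_gen):
    # alternate True/False so both classes exist for the AUC computation
    return [k % 2 == 0 for k, _ in enumerate(pairs_gen)]

adj_mat = load_adj_mat("my_graph.mtx.gz")  # placeholder path
run_and_eval(adj_mat,
             true_labels_func=toy_labels,
             method_spec=['cosine', 'jaccard'],
             evals_outfile="evals.txt",
             pair_scores_outfile="pair_scores.csv.gz")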
Example #4
def score_only(adj_mat_file,
               method_spec,
               pair_scores_outfile,
               flip_high_ps=False,
               make_dense=True,
               row_labels=None,
               print_timing=False,
               learn_exp_model=False,
               prefer_faiss=True,
               integer_ham_ssize=False):
    """

    :param adj_mat_file: function expects a file in matrix market format, optionally gzipped
    :param method_spec: list of method names (see scoring_methods.all_defined_methods for all choices)
    :param pair_scores_outfile: output path, should end in .csv.gz. Each line will contain one pair and all their scores.
    :param flip_high_ps:
    :param make_dense: If false, keep matrix in sparse format. Uses less RAM, but far slower.
    :param row_labels: Array of labels, in case 0:(num_rows(adj_mat)-1) isn't their usual naming/numbering
    :param print_timing:
    :param learn_exp_model: fit and compute likelihoods for exponential graph model even if not using it for scoring
    :param prefer_faiss: when the FAISS library is installed, use it (for the methods implemented in it)
    :param integer_ham_ssize: hamming (distance) and shared_size are returned as integers (saves space and easier to
                              interpret). The default changes them both to similarities between 0 and 1.
    :return: (no return value. instead, scores are saved to pair_scores_outfile.)
    """

    adj_mat = load_adj_mat(adj_mat_file)
    _, adj_mat, row_labels = remove_boundary_nodes(adj_mat,
                                                   flip_high_ps=flip_high_ps,
                                                   orig_row_labels=row_labels)
    if make_dense:
        adj_mat = adj_mat.toarray()

    want_exp_model = learn_exp_model or ('weighted_corr_exp' in method_spec) or \
                     ('weighted_corr_exp_faiss' in method_spec) or ('all' in method_spec)
    graph_models = learn_graph_models(adj_mat,
                                      bernoulli=True,
                                      exponential=want_exp_model)

    for model_type, graph_model in list(graph_models.items()):
        (loglik, aic, item_LLs) = graph_model.likelihoods(adj_mat)
        print("loglikelihood " + model_type + ": " + str(loglik))
        print("akaike " + model_type + ": " + str(aic))

    pairs_generator = partial(
        gen_all_pairs, row_labels=row_labels
    )  # this is a generator function. Call it with an arg to get generator object.
    indices_generator = partial(
        ij_gen, adj_mat.shape[0], row_labels
    )  # this too is a generator function. Call it w/o args to get generator object.

    scoring_methods.score_pairs(
        pairs_generator,
        adj_mat,
        method_spec,
        outfile_csv_gz=pair_scores_outfile,
        pi_vector=graph_models['bernoulli'].affil_params,
        indices_gen=indices_generator,
        exp_model=graph_models.get('exponential', None),
        num_docs=adj_mat.shape[0],
        mixed_pairs_sims='standard',
        print_timing=print_timing,
        prefer_faiss=prefer_faiss,
        back_compat=integer_ham_ssize)
    print('scored pairs saved to ' + pair_scores_outfile)
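
A hedged usage sketch of score_only (the input path is a placeholder):

score_only("my_graph.mtx.gz",
           method_spec=['cosine', 'jaccard', 'weighted_corr'],
           pair_scores_outfile="pair_scores.csv.gz")
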
def resources_test(run_all_implementations=True, use_faiss=False):
    # Let's read in portions of a big matrix in increasing size, and for each size, score all pairs (both sparse and dense).
    # This will let us see how things scale and where memory limits will come in.
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # num_nodes = (100, 1000, 10000, 100000)
    # num_nodes = [10000]  # this size: no run finished in the length of time I was willing to wait
    num_nodes = (100, 500, 1000, 5000)
    # num_nodes = [5000]
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
            pi_vector_learned, adj_mat)

        # (order given here doesn't matter)
        methods_to_run = [
            'cosine',
            'cosineIDF',
            # use fast "transform"
            'shared_size',
            'adamic_adar',
            'newman',
            'shared_weight11',
            # medium
            'hamming',
            'pearson',
            'jaccard',
            # WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
            'weighted_corr',
            'weighted_corr_exp',
            # only have slow "terms" method
            'shared_weight1100',
            'mixed_pairs'
        ]

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\ndense version takes up " +
              str(sys.getsizeof(adj_mat_preproc_dense)) + " bytes")

        want_exp_model = ('weighted_corr_exp' in methods_to_run) or \
                         ('weighted_corr_exp_faiss' in methods_to_run) or ('all' in methods_to_run)
        start = timer()
        graph_models = bipartite_fitting.learn_graph_models(
            adj_mat,
            bernoulli=False,
            pi_vector=None,
            exponential=want_exp_model)
        end = timer()
        print("time for learning exponential model: " + str(end - start) +
              " seconds" if want_exp_model else "")

        start = timer()
        score_data.scoring_methods.score_pairs(
            score_data.gen_all_pairs,
            adj_mat_preproc_dense,
            which_methods=methods_to_run,
            pi_vector=pi_vector_preproc,
            back_compat=True,
            num_docs=adj_mat_preproc.shape[0],
            mixed_pairs_sims=[.01],
            print_timing=True,
            exp_model=graph_models.get('exponential', None),
            run_all_implementations=run_all_implementations,
            prefer_faiss=use_faiss)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " + str(adj_mat_preproc.shape[1]) \
              + " affils, ")
        print("ran all methods using dense matrix in " + str(end - start) +
              " seconds")

        print("\nsparse adj_matrix takes up " +
              str(asizeof.asizeof(adj_mat_preproc)) + " bytes;")

        start = timer()
        score_data.scoring_methods.score_pairs(
            score_data.gen_all_pairs,
            adj_mat_preproc,
            which_methods=methods_to_run,
            pi_vector=pi_vector_preproc,
            back_compat=True,
            num_docs=adj_mat_preproc.shape[0],
            mixed_pairs_sims=[.01],
            print_timing=True,
            exp_model=graph_models.get('exponential', None),
            run_all_implementations=run_all_implementations)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " + str(adj_mat_preproc.shape[1]) \
              + " affils, ")
        print("ran all methods using sparse matrix in " + str(end - start) +
              " seconds")