Beispiel #1
0
def test_score_wc_faiss():
    """
    Smoke-test the faiss scoring entry points on the "reality_appweek_50" example data.

    Calls score_pairs_faiss() for 'weighted_corr_faiss', then score_pairs_faiss_all_exact()
    for 'adamic_adar_faiss' and for 'pearson_faiss' (sparse input first, then dense input),
    printing .head() of each result. Makes no assertions: it passes as long as no call raises.
    """
    sample_label = 'scores look like (sample):\n'

    # Load the example adjacency matrix and derive the preprocessed pi_vector / matrix pair.
    raw_adj_mat = score_data.load_adj_mat(
        "reality_appweek_50/data50_adjMat.mtx.gz")
    learned_pi = score_data.learn_pi_vector(raw_adj_mat)
    pi_vector, adj_mat = expts_labeled_data.adjust_pi_vector(
        learned_pi, raw_adj_mat)

    wc_scores = scoring_with_faiss.score_pairs_faiss(
        adj_mat,
        which_methods=['weighted_corr_faiss'],
        how_many_neighbors=-1,
        print_timing=True,
        pi_vector=pi_vector)
    print(sample_label + str(wc_scores.head()))
    # Author's note for later: calling reset_index() on the result saves item1 & item2 as
    # regular columns and falls back to an index of row numbers.

    print("calling adamic-adar")
    aa_scores = scoring_with_faiss.score_pairs_faiss_all_exact(
        adj_mat,
        'adamic_adar_faiss',
        pi_vector=pi_vector,
        num_docs=adj_mat.shape[0])
    print(sample_label + str(aa_scores.head()))

    print("calling pearson")
    pearson_sparse = scoring_with_faiss.score_pairs_faiss_all_exact(
        adj_mat, 'pearson_faiss')
    print(sample_label + str(pearson_sparse.head()))

    # Same method again, this time handing over the matrix converted with toarray().
    print("(dense input)")
    pearson_dense = scoring_with_faiss.score_pairs_faiss_all_exact(
        adj_mat.toarray(), 'pearson_faiss')
    print(sample_label + str(pearson_dense.head()))
Beispiel #2
0
def test_adj_and_phi():
    """
    Check that learning, preprocessing and flipping a pi_vector match the reference files from the R code.

    Reads 'ng_aa_data1/data15_adj_mat.mtx.gz', learns a pi_vector from it, runs adjust_pi_vector() on it
    (once with defaults, once with flip_high_ps=True) and compares against these files under 'ng_aa_data1/':
    data15.dataphi.txt.gz, data15.dataphipreproc.txt.gz, data15.dataphiflipped.txt.gz and
    data15.adj_mat_flipped.mtx.gz.

    Raises AssertionError if a shape does not match or any value is off by 1e-07 or more.
    """
    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector ***\n")
    # The "data15_*" example files hold the contents of the author's expt data file
    # alt.atheism/data1.Rdata. The phi of the whole (larger) data set,
    # "ng_aa_data1/data15_phi.txt.gz", is deliberately not used here.

    # R saved floating point data with 7 digits of precision (see getOptions("digits") and
    # format()), so anyone wanting to reuse those phi files should re-convert them with higher
    # precision. For now a difference below 1e-07 is accepted.
    #
    # Author's note: upping the precision only revealed imprecision in the R code itself. The
    # Bernoulli <-> multinomial conversion there does not keep the exact probabilities, which
    # is a possible bug; the other place the code does this, it explicitly fixes that.
    tolerance = 1e-07

    # Reference vectors, constructed manually with the existing R code in experimentRunner.R.
    expected_learned = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphi.txt.gz")
    expected_preproc = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphipreproc.txt.gz")

    adj_mat = score_data.load_adj_mat("ng_aa_data1/data15_adj_mat.mtx.gz")
    pi_vector_learned = score_data.learn_pi_vector(adj_mat)
    pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        pi_vector_learned, adj_mat)

    # Learned vector: one entry per column of adj_mat, values equal to the R version.
    assert pi_vector_learned.shape[0] == adj_mat.shape[1]
    assert max(abs(pi_vector_learned - expected_learned)) < tolerance

    # Preprocessed vector: one entry per column of adj_mat_preproc, values equal to the R version.
    assert pi_vector_preproc.shape[0] == adj_mat_preproc.shape[1]
    assert max(abs(pi_vector_preproc - expected_preproc)) < tolerance

    # Flipping: both the vector and the matrix must agree with their R counterparts.
    expected_flipped_pi = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphiflipped.txt.gz")
    expected_flipped_adj = score_data.load_adj_mat(
        "ng_aa_data1/data15.adj_mat_flipped.mtx.gz")
    pi_vector_flipped, adj_mat_flipped = expts_labeled_data.adjust_pi_vector(
        pi_vector_learned, adj_mat, flip_high_ps=True)

    assert pi_vector_flipped.shape == pi_vector_preproc.shape
    assert max(abs(pi_vector_flipped - expected_flipped_pi)) < tolerance
    assert expected_flipped_adj.shape == adj_mat_flipped.shape
    assert abs(expected_flipped_adj - adj_mat_flipped).max() < tolerance
Beispiel #3
0
def test_only_wc(adj_mat_infile, scored_pairs_file_R, scored_pairs_file_new):
    """
    Like test_pair_scores_against_R(), but checks scores & timing of simple_only_weighted_corr().
    (Per the author, this was the first scoring method implemented using a transform of the adj_matrix.)

    The scores are written to scored_pairs_file_new, read back, and their "weighted_corr" column is
    compared with the "pearsonWeighted" column of the R reference file.

    :param adj_mat_infile: local path ending in .mtx.gz
    :param scored_pairs_file_R: local path ending in .csv.gz (reference scores from R)
    :param scored_pairs_file_new: path the freshly computed scores are written to (gzipped csv)
    :raises AssertionError: if the two columns differ by 1e-05 or more
    """

    def read_gz_csv(path):
        # Open the gzip file ourselves and let pandas parse the stream.
        with gzip.open(path, 'r') as fpin:
            return pd.read_csv(fpin)

    print(
        "\n*** Checking simple_only_weighted_corr against scores from R ***\n")

    # Read adj data and prep pi_vector
    raw_adj_mat = score_data.load_adj_mat(adj_mat_infile)
    pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(raw_adj_mat), raw_adj_mat)
    num_items = adj_mat_preproc.shape[0]

    # Score into a storage object sized by the number of rows, then dump it to disk.
    scores_storage = magic_dictionary.make_me_a_dict(num_items)
    scoring_methods.extra_implementations.simple_only_weighted_corr(
        score_data.gen_all_pairs,
        adj_mat_preproc,
        scores_storage.create_and_store_array("weighted_corr"),
        pi_vector_preproc,
        print_timing=True)
    scores_storage.to_csv_gz(scored_pairs_file_new,
                             lambda: score_data.ij_gen(num_items))

    wc_frame = read_gz_csv(scored_pairs_file_new)
    scores_data_frame_R = read_gz_csv(scored_pairs_file_R)

    abs_diff = abs(wc_frame["weighted_corr"] -
                   scores_data_frame_R["pearsonWeighted"])
    print("max diff: " + str(abs_diff.max()))
    # The check itself deliberately keeps using the builtin max(), as before.
    assert max(abs_diff) < 1e-05
Beispiel #4
0
def test_adj_and_phi2():
    """
    Check, on a second data set, that the pi_vector learned from the adjacency matrix and then
    preprocessed matches the one stored in the inference file.

    Using files: "reality_appweek_50/data50_adjMat.mtx.gz",
                 "reality_appweek_50/data50-inference-allto6.phi.csv.gz"

    Raises AssertionError if any entry differs by 1e-07 or more.
    """
    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector (2) ***\n"
    )
    # Author's note: deliberately not the newsgroups data -- those files are too complicated
    # because they were run early.

    expected_pi = expts_labeled_data.load_pi_from_file(
        "reality_appweek_50/data50-inference-allto6.phi.csv.gz")

    adj_mat = score_data.load_adj_mat(
        "reality_appweek_50/data50_adjMat.mtx.gz")
    learned_pi = score_data.learn_pi_vector(adj_mat)
    # Only the adjusted vector is checked; the adjusted matrix is not needed here.
    pi_vector_preproc, _ = expts_labeled_data.adjust_pi_vector(
        learned_pi, adj_mat)

    largest_gap = max(abs(pi_vector_preproc - expected_pi))
    assert largest_gap < 1e-07
Beispiel #5
0
def test_faiss_basic_calls():
    """
    Exercise the basic faiss calls (IndexFlatIP construction, add, search) on the preprocessed
    "reality_appweek_50" adjacency matrix.

    Makes no assertions; it prints the index status and 'basic calls ran' once every call has returned.
    """
    raw_adj_mat = score_data.load_adj_mat(
        "reality_appweek_50/data50_adjMat.mtx.gz")
    _, adj_mat = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(raw_adj_mat), raw_adj_mat)

    # A dot product on the plain adj matrix just computes sharedSize (author's note).
    # IndexFlatIP takes the number of columns as its argument.
    num_cols = adj_mat.shape[1]
    index = faiss.IndexFlatIP(num_cols)
    # (The tutorial example this mimics added a random float32 matrix with num_cols columns.)

    # adj_mat is sparse, but faiss wants dense. and, apparently, wants float32.
    dense_rows = adj_mat.toarray()
    adj_for_faiss = dense_rows.astype('float32')
    index.add(adj_for_faiss)
    print("index.is_trained: " + str(index.is_trained) + ", index.total: " +
          str(index.ntotal))

    # 10 nearest neighbors of each input; results are not inspected.
    _distances10, _neighbors10 = index.search(adj_for_faiss, 10)

    # All pairs: ask for as many neighbors as there are rows; results are not inspected.
    num_rows = adj_for_faiss.shape[0]
    _distances, _neighbors = index.search(adj_for_faiss, num_rows)
    print('basic calls ran')
Beispiel #6
0
def resources_test():
    """
    Time scoring of all pairs with 'weighted_corr' and 'weighted_corr_faiss' on dense matrices
    built from the first 100, 1000 and 5000 rows of the brightkite adjacency file.

    Prints the size of each dense matrix, its dimensions and the elapsed seconds. Returns nothing.
    The input path is hard-coded to the author's machine.
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # Author's note: the OS kills the process at 10000 rows (due to memory).
    for num_to_try in (100, 1000, 5000):
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
            score_data.learn_pi_vector(adj_mat), adj_mat)
        num_items, num_affils = adj_mat_preproc.shape[0], adj_mat_preproc.shape[1]

        # plain WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
        methods_to_run = ['weighted_corr', 'weighted_corr_faiss']

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        dense_bytes = sys.getsizeof(adj_mat_preproc_dense)
        print("\ndense version takes up " + str(dense_bytes) + " bytes")

        # Alternative left out by the author: scoring_with_faiss.score_pairs_faiss(adj_mat,
        # methods_to_run, print_timing=True, pi_vector=pi_vector_preproc)
        start = timer()
        score_data.scoring_methods.score_pairs(
            score_data.gen_all_pairs,
            adj_mat_preproc_dense,
            which_methods=methods_to_run,
            pi_vector=pi_vector_preproc,
            back_compat=True,
            num_docs=num_items,
            mixed_pairs_sims=[.01],
            print_timing=True)
        elapsed = timer() - start

        print("for matrix with " + str(num_items) + " items, " +
              str(num_affils) + " affils, ")
        print("ran all methods using dense matrix in " + str(elapsed) +
              " seconds")
Beispiel #7
0
def test_pair_scores_against_R(adj_mat_infile,
                               scored_pairs_file_R,
                               scored_pairs_file_new,
                               make_dense=False,
                               flip_high_ps=False,
                               run_all=0,
                               prefer_faiss=False):
    """
    Starting from an adj matrix, score pairs (using current implementation) and compare to reference file run from R.
    Similar contents to score_data.run_and_eval().

    :param adj_mat_infile: path of the adjacency matrix to load
    :param scored_pairs_file_R: gzipped csv of reference scores computed in R
    :param scored_pairs_file_new: path handed to score_pairs() as outfile_csv_gz and then read back
    :param make_dense: if true, convert the preprocessed matrix with toarray() before scoring
    :param flip_high_ps: forwarded to adjust_pi_vector()
    :param run_all: set to 2 (or 1) to run and time all (or more) implementations.
                    However, we only look at the scores of the last one.
    :param prefer_faiss: forwarded to score_pairs(); also loosens the comparison tolerance to 1e-04
    :return: the data frame of new scores, with a 'label' column added
    :raises AssertionError: if a compared method differs from R by its tolerance or more
    """

    def read_gz_csv(path):
        # Open the gzip file ourselves and let pandas parse the stream.
        with gzip.open(path, 'r') as fpin:
            return pd.read_csv(fpin)

    print("\n*** Testing scores computed for pairs ***\n")
    print("Adj matrix infile: " + adj_mat_infile +
          "; scored pairs reference file: " + scored_pairs_file_R)

    # Read adj data and prep pi_vector
    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(adj_mat), adj_mat, flip_high_ps=flip_high_ps)

    methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'weighted_corr', 'shared_weight11', 'shared_weight1100', 'adamic_adar',
        'newman', 'mixed_pairs'
    ]
    mixed_pairs_sims = [.01, .001]

    # The timed section covers densifying, scoring, reading the output back and labeling.
    start = timer()
    if make_dense:
        adj_mat_preproc = adj_mat_preproc.toarray()
    scoring_methods.score_pairs(score_data.gen_all_pairs,
                                adj_mat_preproc,
                                which_methods=methods_to_run,
                                outfile_csv_gz=scored_pairs_file_new,
                                pi_vector=pi_vector_preproc,
                                back_compat=True,
                                num_docs=adj_mat_preproc.shape[0],
                                mixed_pairs_sims=mixed_pairs_sims,
                                print_timing=True,
                                run_all_implementations=run_all,
                                prefer_faiss=prefer_faiss)
    scores_data_frame = read_gz_csv(scored_pairs_file_new)
    scores_data_frame['label'] = expts_labeled_data.get_true_labels_expt_data(
        num_true_pairs=5,
        pairs_generator=score_data.gen_all_pairs(adj_mat_preproc))
    elapsed = timer() - start

    # 'mixed_pairs' is counted once per entry of mixed_pairs_sims rather than once overall.
    num_methods = len(methods_to_run)
    if 'mixed_pairs' in methods_to_run:
        num_methods += len(mixed_pairs_sims) - 1
    variants_note = "(plus variants) " if run_all else ""
    num_items = adj_mat.shape[0]
    num_pairs = num_items * (num_items - 1) / float(2)
    print("ran " + str(num_methods) + " methods " + variants_note + "on " +
          str(num_pairs) + " pairs")
    print("num seconds: " + str(elapsed))

    # Read scores from R and compare
    scores_data_frame_R = read_gz_csv(scored_pairs_file_R)
    scored_columns = list(scores_data_frame)

    for R_method, our_method in list(mapping_from_R_methods.items()):
        if our_method not in scored_columns:
            continue

        print("Checking " + our_method)
        # R data doesn't have item numbers, but is in the same all-pairs order as ours
        abs_diff = abs(scores_data_frame[our_method] -
                       scores_data_frame_R[R_method])
        print("max diff: " + str(abs_diff.max()))

        # Sadly, the p_i vectors are off by a smidgen, so anything that uses them can differ
        # too. sharedWeight11 vals differed by > 1e-06, and that was with only 65 affils.
        if prefer_faiss:
            tolerance = 1e-04
        elif our_method in our_pi_methods:
            tolerance = 1e-05
        else:
            tolerance = 1e-10
        assert max(abs_diff) < tolerance

    return scores_data_frame
def resources_test(run_all_implementations=True, use_faiss=False):
    """
    Read increasingly large portions of a big matrix and, for each size, score all pairs twice:
    once on the dense matrix and once on the sparse one. Prints sizes and timings so we can see
    how things scale and where memory limits come in.

    :param run_all_implementations: forwarded to score_pairs() as run_all_implementations (both runs)
    :param use_faiss: forwarded to score_pairs() as prefer_faiss (dense run only)
    :return: None; all results are printed

    The input path is hard-coded to the author's machine.
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # Sizes previously tried: (100, 1000, 10000, 100000). At 10000 no run finished in the length
    # of time the author was willing to wait, so only the smaller sizes are used.
    num_nodes = (100, 500, 1000, 5000)
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
            pi_vector_learned, adj_mat)

        # (order given here doesn't matter)
        methods_to_run = [
            'cosine',
            'cosineIDF',
            # use fast "transform"
            'shared_size',
            'adamic_adar',
            'newman',
            'shared_weight11',
            # medium
            'hamming',
            'pearson',
            'jaccard',
            # WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
            'weighted_corr',
            'weighted_corr_exp',
            # only have slow "terms" method
            'shared_weight1100',
            'mixed_pairs'
        ]

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\ndense version takes up " +
              str(sys.getsizeof(adj_mat_preproc_dense)) + " bytes")

        want_exp_model = ('weighted_corr_exp' in methods_to_run) or \
                         ('weighted_corr_exp_faiss' in methods_to_run) or ('all' in methods_to_run)
        start = timer()
        # NOTE(review): the model is learned from adj_mat, while scoring below uses
        # adj_mat_preproc -- confirm this is intended.
        graph_models = bipartite_fitting.learn_graph_models(
            adj_mat,
            bernoulli=False,
            pi_vector=None,
            exponential=want_exp_model)
        end = timer()
        # Report the timing only when an exponential model was requested; otherwise emit the
        # same empty line the original one-line conditional expression printed.
        if want_exp_model:
            print("time for learning exponential model: " + str(end - start) +
                  " seconds")
        else:
            print("")

        # --- dense run ---
        start = timer()
        score_data.scoring_methods.score_pairs(
            score_data.gen_all_pairs,
            adj_mat_preproc_dense,
            which_methods=methods_to_run,
            pi_vector=pi_vector_preproc,
            back_compat=True,
            num_docs=adj_mat_preproc.shape[0],
            mixed_pairs_sims=[.01],
            print_timing=True,
            exp_model=graph_models.get('exponential', None),
            run_all_implementations=run_all_implementations,
            prefer_faiss=use_faiss)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " + str(adj_mat_preproc.shape[1]) \
              + " affils, ")
        print("ran all methods using dense matrix in " + str(end - start) +
              " seconds")

        print("\nsparse adj_matrix takes up " +
              str(asizeof.asizeof(adj_mat_preproc)) + " bytes;")

        # --- sparse run ---
        # NOTE(review): prefer_faiss is not passed here, unlike the dense run -- confirm intended.
        start = timer()
        score_data.scoring_methods.score_pairs(
            score_data.gen_all_pairs,
            adj_mat_preproc,
            which_methods=methods_to_run,
            pi_vector=pi_vector_preproc,
            back_compat=True,
            num_docs=adj_mat_preproc.shape[0],
            mixed_pairs_sims=[.01],
            print_timing=True,
            exp_model=graph_models.get('exponential', None),
            run_all_implementations=run_all_implementations)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " + str(adj_mat_preproc.shape[1]) \
              + " affils, ")
        print("ran all methods using sparse matrix in " + str(end - start) +
              " seconds")
def test_cosine_versions():
    """
    Compare running time and process memory of several ways to compute all-pairs cosine scores.

    For the first 1000 and then 2000 rows of the brightkite adjacency file, runs in turn:
    simple_only_cosine() on the sparse matrix with use_package=True, sklearn's cosine_similarity()
    on a csr matrix, simple_only_cosine() on the dense matrix with use_package=False,
    cosine_similarity_n_space() on the dense matrix, the faiss 'cosine_faiss' scorer, and
    simple_only_cosine() on the dense matrix with use_package=True.

    Prints durations and get_process_memory() after each step. Makes no assertions and returns
    nothing. The input path is hard-coded to the author's machine.
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # Sizes previously tried: (100, 500, 1000, 5000).
    num_nodes = [1000, 2000]
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
            pi_vector_learned, adj_mat)
        print("\nmatrix has " + str(adj_mat_preproc.shape[0]) + " items, " + str(adj_mat_preproc.shape[1]) \
              + " affils ")
        print("process memory: ")
        print(get_process_memory())

        print("\n** sklearn sparse cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs,
                                                adj_mat_preproc,
                                                print_timing=True,
                                                use_package=True)
        print(get_process_memory())

        # NOTE(review): the message below says "sparse.csc" and "dense 'transformed' matrix",
        # but the code calls sparse.csr_matrix on adj_mat_preproc -- message left unchanged.
        print(
            "\n** sklearn, but called on sparse.csc of dense 'transformed' matrix **"
        )
        start = timer()
        # The list is never read afterwards; it is filled so that the timing includes pulling
        # one score per pair out of the all-pairs result.
        cos = []
        all_pairs_scores = cosine_similarity(
            sparse.csr_matrix(adj_mat_preproc))
        for (row_idx1, row_idx2, _, _, _,
             _) in score_data.gen_all_pairs(adj_mat_preproc):
            score = all_pairs_scores[row_idx1, row_idx2]
            cos.append(score if not np.isnan(score) else 0)  # NaN scores are stored as 0
        end = timer()
        print("duration: " + str(end - start) + " seconds")
        print(get_process_memory())

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\nmade matrix dense")
        print(get_process_memory())

        print("\n** home-grown dense cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs,
                                                adj_mat_preproc_dense,
                                                print_timing=True,
                                                use_package=False)
        print(get_process_memory())

        print("\n** sklearn dense, using batches **")
        start = timer()
        # Same per-pair extraction as above, again only for the sake of the timing.
        cos = []
        all_pairs_scores = scoring_methods_fast.cosine_similarity_n_space(
            adj_mat_preproc_dense, adj_mat_preproc_dense, verbose=True)
        for (row_idx1, row_idx2, _, _, _,
             _) in score_data.gen_all_pairs(adj_mat_preproc_dense):
            score = all_pairs_scores[row_idx1, row_idx2]
            cos.append(score if not np.isnan(score) else 0)  # NaN scores are stored as 0
        end = timer()
        print("duration: " + str(end - start) + " seconds")
        print(get_process_memory())

        print("\n** faiss (dense) ** ")
        scoring_with_faiss.score_pairs_faiss_all_exact(adj_mat_preproc_dense,
                                                       ['cosine_faiss'],
                                                       print_timing=True)
        print(get_process_memory())

        print("\n** sklearn dense cosine **")
        # NOTE(review): the dense matrix is rebuilt here although one already exists; kept
        # as-is in case an earlier step modified it in place -- confirm whether it is needed.
        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs,
                                                adj_mat_preproc_dense,
                                                print_timing=True,
                                                use_package=True)
        print(get_process_memory())