def check_degree_corr(adj_mat_infile, edges_infile):
    """
    Intended to print and return the correlation between the vector of people's affiliation
    degrees and the vector of people's number-of-friends degrees.

    Unfinished: at present it only loads the two inputs and returns None.
    :param adj_mat_infile: file handed to read_loc_adj_mat()
    :param edges_infile: file handed to load_edge_matrix()
    :return: None (the correlation is not computed yet)
    """
    affil_matrix, person_labels, _ = read_loc_adj_mat(adj_mat_infile)
    friend_matrix, _ = load_edge_matrix(edges_infile)
    # TODO: compute the correlation from the two matrices (not finished, not needed now)
    return None
def loc_compute_corrs(adj_mat_infile=None, edges_infile=None, dataset_desc="gowalla original"):
    """
    Print and return the correlation between each item's row sum in the bipartite adjacency
    matrix (its number of affiliations) and its row sum in the edge matrix (its number of friends).

    Called with no arguments, it behaves as before: it reads the original gowalla files on the
    local machine and labels the output "gowalla original". The stratified 6-friends files used
    previously live under
    /home/lfriedl/ASOUND-bipartite/data-prep/loc_data/interim/gowalla/stratified/
    (bipartite_adj_6friends.txt.gz and loc-edges_6friends.txt.gz) and can now be passed in
    explicitly instead of editing this function.

    :param adj_mat_infile: bipartite adjacency file for read_loc_adj_mat(); None = original gowalla file
    :param edges_infile: edge file for load_edge_matrix(); None = original gowalla edges file
    :param dataset_desc: text appended to the printed message to identify the data set
    :return: the off-diagonal entry of the 2x2 correlation matrix
    """
    if adj_mat_infile is None:
        adj_mat_infile = '/Users/lfriedl/Documents/dissertation/real-data/gowalla/bipartite_adj.txt'
    if edges_infile is None:
        edges_infile = '/Users/lfriedl/Documents/dissertation/real-data/gowalla/loc-gowalla_edges.txt'

    adj_mat, row_names, _ = read_loc_adj_mat(adj_mat_infile)
    # row_names is passed along so the edge matrix rows line up with adj_mat's rows
    # (presumably -- confirm against load_edge_matrix)
    edge_matrix, _ = load_edge_matrix(edges_infile, row_names)

    affils_per_item = adj_mat.sum(axis=1)       # row sums of the bipartite matrix
    friends_per_item = edge_matrix.sum(axis=1)  # row sums of the item-item edge matrix

    # Each input is treated as one variable (rowvar=False), giving a 2x2 correlation matrix.
    correlation = np.corrcoef(affils_per_item, friends_per_item, rowvar=False)[0, 1]
    print("correlation: " + str(correlation) + " for " + dataset_desc)
    return correlation
def read_sample_save(adj_mat_infile, edges_infile, num_nodes, rows_outfile):
    """
    Read an adjacency matrix, keep a random sample of num_nodes of its rows, and record which
    rows were chosen.

    :param adj_mat_infile: file handed to read_loc_adj_mat()
    :param edges_infile: file handed to get_label_generator_from_edgefile()
    :param num_nodes: number of rows to sample (without replacement)
    :param rows_outfile: the sampled row indices are written here, space-separated on one line
    :return: (sampled sub-matrix, names of the sampled rows, label generator for those names)
    """
    full_mat, all_names, _ = read_loc_adj_mat(adj_mat_infile)

    # Sampled values are positions within full_mat; sorting once keeps rows and names aligned.
    sampled_rows = sorted(random.sample(range(full_mat.shape[0]), num_nodes))
    adj_mat_to_keep = full_mat[sampled_rows,]

    # The sub-matrix does not carry the original (semantically meaningful) row labels, so collect
    # them separately; they are what gets sent on to the pair generators.
    item_names_to_keep = []
    for row_idx in sampled_rows:
        item_names_to_keep.append(all_names[row_idx])

    print("Sampled " + str(num_nodes) + " nodes")
    with open(rows_outfile, 'wt') as fp:
        fp.write(" ".join(str(row_idx) for row_idx in sampled_rows))

    # edges can be stored efficiently in another sparse matrix
    label_generator = get_label_generator_from_edgefile(edges_infile, item_names_to_keep)

    return adj_mat_to_keep, item_names_to_keep, label_generator
def get_loc_expt_data(adj_mat_infile, edges_infile, row_ids_infile):
    """
    Load the data for one experiment: the subgraph induced by a stored set of row indices.

    :param adj_mat_infile: bipartite graph. Each line = 1 item (name), followed by pipe-separated affils.
    :param edges_infile: true pairs. Each line = 2 item names (tab-separated).
    :param row_ids_infile: the items to use, stored as row indices (not names), all on the first
                           line, space-separated.
    :return: (sub-matrix for the chosen rows, their row labels, label generator for those labels)
    """
    full_mat, all_labels, _ = read_loc_adj_mat(adj_mat_infile)

    # Only the first line of the file is read.
    with open(row_ids_infile, 'r') as fin:
        id_tokens = fin.readline().split()
    chosen_rows = sorted(int(token) for token in id_tokens)

    adj_mat_to_keep = full_mat[chosen_rows,]
    row_labels_to_keep = []
    for row_idx in chosen_rows:
        row_labels_to_keep.append(all_labels[row_idx])

    label_generator = get_label_generator_from_edgefile(edges_infile, row_labels_to_keep)
    return adj_mat_to_keep, row_labels_to_keep, label_generator
def run_all_pairs_loc_data(adj_mat_infile, edges_infile, outdir, tag):
    """
    Score and evaluate a full data set (no row subsetting) with every method listed below.

    Output goes to <outdir>/results_<tag>.txt and <outdir>/scoredPairs_<tag>.csv.gz.
    :param adj_mat_infile: file handed to read_loc_adj_mat()
    :param edges_infile: file handed to get_label_generator_from_edgefile()
    :param outdir: directory for the two output files
    :param tag: string embedded in both output file names
    :return: None
    """
    # Same loading steps as get_loc_expt_data(), minus the restriction to a subset of rows.
    full_adj_mat, item_names, _ = read_loc_adj_mat(adj_mat_infile)
    true_pair_labels = get_label_generator_from_edgefile(edges_infile, item_names)

    results_path = outdir + '/results_' + tag + '.txt'
    pair_scores_path = outdir + '/scoredPairs_' + tag + '.csv.gz'

    # (for gowalla, just ran the standard methods before)
    methods = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman', 'weighted_corr',
        'mixed_pairs',
    ]
    sim_levels = [.001, .005, .01, .05, .1, .2, .3, .4, .5]

    run_options = dict(
        true_labels_func=true_pair_labels,
        method_spec=methods,
        evals_outfile=results_path,
        mixed_pairs_sims=sim_levels,
        pair_scores_outfile=pair_scores_path,
        row_labels=item_names,
        remove_boundary_items=False,
        print_timing=True,
        prefer_faiss=True,
    )
    score_data.run_and_eval(full_adj_mat, **run_options)
def affil_subsets_loc_data(adj_mat_infile, edges_infile, outdir, tag, affil_subset_fraction, affil_subset_type):
    """
    Score and evaluate a data set after restricting it to a subset of its affiliations (columns).

    The columns to keep are chosen by affil_subsets.compute_affil_subsets(), which is given each
    column's sum divided by the number of rows, plus the requested fraction and subset type.
    Output goes to <outdir>/results_<tag>.txt and <outdir>/scoredPairs_<tag>.csv.gz.
    :param adj_mat_infile: file handed to read_loc_adj_mat()
    :param edges_infile: file handed to get_label_generator_from_edgefile()
    :param outdir: directory for the two output files
    :param tag: string embedded in both output file names
    :param affil_subset_fraction: forwarded to compute_affil_subsets()
    :param affil_subset_type: forwarded to compute_affil_subsets()
    :return: None
    """
    full_adj_mat, item_names, _ = read_loc_adj_mat(adj_mat_infile)
    true_pair_labels = get_label_generator_from_edgefile(edges_infile, item_names)

    # Per-affiliation frequency, computed on the full matrix before any columns are dropped.
    column_totals = np.asarray(full_adj_mat.sum(axis=0)).squeeze()
    num_items = float(full_adj_mat.shape[0])
    affil_frequencies = column_totals / num_items
    kept_columns = affil_subsets.compute_affil_subsets(affil_frequencies, affil_subset_fraction,
                                                       affil_subset_type)
    reduced_adj_mat = full_adj_mat[:, kept_columns]

    results_path = outdir + '/results_' + tag + '.txt'
    pair_scores_path = outdir + '/scoredPairs_' + tag + '.csv.gz'
    methods = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman', 'weighted_corr',
        'mixed_pairs',
    ]

    run_options = dict(
        true_labels_func=true_pair_labels,
        method_spec=methods,
        evals_outfile=results_path,
        pair_scores_outfile=pair_scores_path,
        row_labels=item_names,
        remove_boundary_items=False,
        print_timing=True,
        prefer_faiss=True,
    )
    score_data.run_and_eval(reduced_adj_mat, **run_options)
# Example #7
# 0
def stratify_by_num_edges(adj_mat_infile, edges_infile, outdir, min_edges, max_edges):
    """
    Given a data set, calculate the number of true pairs each item participates in --> its degree.
    Then partition the data set by item degree.
    Write out a separate adj_mat and edges_file for items of each degree, from min_edges to max_edges.

    For each degree d, two gzipped text files are written in outdir:
      bipartite_adj_<d>friends.txt.gz  -- header "V1,checkins", then one "label,loc|loc|..." line per kept item
      loc-edges_<d>friends.txt.gz      -- one "label<tab>label" line per nonzero edge-matrix entry whose
                                          endpoints are both kept
    :param adj_mat_infile: The full bipartite graph for a data set.
    :param edges_infile: Full set of true pairs.
    :param outdir: directory the output files are written to
    :param min_edges: smallest degree to write files for (inclusive)
    :param max_edges: largest degree to write files for (inclusive)
    :return: None
    """
    # people are numbered 0 through max in orig file
    edge_matrix, edge_row_labels_map = prelim_loc_data_expts.load_edge_matrix(edges_infile)
    # simply reversed, var[index] = label. (.items() rather than .iteritems(), which Python 3 lacks.)
    edge_row_indices_to_labels = {v: k for k, v in edge_row_labels_map.items()}

    # missing some people; row_labels is what matches edge_matrix
    adj_mat, row_labels, loc_labels = loc_data.read_loc_adj_mat(adj_mat_infile)
    # ravel() (not squeeze()) so the result stays 1-D even if there is only a single person
    num_friends = np.asarray(edge_matrix.sum(axis=0)).ravel()

    # The nonzero coordinates don't depend on the degree being processed, so look them up once.
    edge_rows, edge_cols = edge_matrix.nonzero()

    for edge_count in range(min_edges, max_edges + 1):
        # filter both matrices to people with edge_count
        people_to_keep_edgemat_index = np.nonzero(num_friends == edge_count)[0]
        print("found " + str(len(people_to_keep_edgemat_index)) + " people having " + str(edge_count) + " friends each")
        # Look labels up by index directly instead of scanning the whole index->label dict;
        # indices without a label are skipped, as before.
        people_labels_to_keep = {edge_row_indices_to_labels[ind] for ind in people_to_keep_edgemat_index
                                 if ind in edge_row_indices_to_labels}

        # save files
        adj_mat_file = outdir + "/bipartite_adj_" + str(edge_count) + "friends.txt.gz"
        edge_mat_file = outdir + "/loc-edges_" + str(edge_count) + "friends.txt.gz"

        # Text mode ('wt'): we write str, which a binary-mode gzip stream rejects on Python 3.
        with gzip.open(adj_mat_file, 'wt') as fout:
            # match formatting of orig file: row_id,loc|loc|loc|...
            fout.write("V1,checkins\n")
            for i in range(adj_mat.shape[0]):
                if row_labels[i] in people_labels_to_keep:
                    locs_visited = [loc_labels[j] for j in np.nonzero(adj_mat[i,].toarray()[0])[0]]
                    fout.write(str(row_labels[i]) + "," + "|".join(locs_visited) + "\n")

        # orig format: i<tab>j. (Was symmetric, and we'll store it as such.)
        with gzip.open(edge_mat_file, 'wt') as fout:
            for (i, j) in zip(edge_rows, edge_cols):
                label_i = edge_row_indices_to_labels[i]
                label_j = edge_row_indices_to_labels[j]
                if label_i in people_labels_to_keep and label_j in people_labels_to_keep:
                    fout.write(str(label_i) + "\t" + str(label_j) + "\n")
# Example #8
# 0
def compare_timings_faiss_normal(adj_mat_infile, evals_outfile,
                                 scored_pairs_outfile):
    """
    Time the faiss scoring methods against their non-faiss counterparts on growing prefixes
    of a data set.

    For each size in (100, 1000, 5000), reads that many rows, runs every method in
    scoring_with_faiss.all_faiss_methods through score_data.run_and_eval(), then runs the same
    methods with the "_faiss" suffix removed (and make_dense=True), printing the elapsed time
    of each run.

    :param adj_mat_infile: bipartite adjacency file to read. Previously this argument was ignored;
                           a falsy value (None or "") still falls back to the brightkite file on
                           the local machine.
    :param evals_outfile: passed to run_and_eval() for both runs
    :param scored_pairs_outfile: passed to run_and_eval() for both runs
    :return: None
    """
    # NOTE(review): both runs, at every size, are given the same two output paths; whether later
    # runs overwrite or append depends on run_and_eval -- confirm before relying on the files.
    default_infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    infile = adj_mat_infile if adj_mat_infile else default_infile
    faiss_suffix = "_faiss"

    def _run_and_time(adj_mat, method_names, **extra_options):
        # Run one batch of methods and report how long the whole batch took.
        start = timer()
        score_data.run_and_eval(adj_mat,
                                true_labels_func=expts_labeled_data.
                                true_labels_for_expts_with_5pairs,
                                method_spec=method_names,
                                evals_outfile=evals_outfile,
                                pair_scores_outfile=scored_pairs_outfile,
                                print_timing=True,
                                **extra_options)
        end = timer()
        print("ran all " + str(len(method_names)) + " methods in " +
              str(end - start) + " seconds")

    num_nodes = (100, 1000, 5000)  # my OS kills it at 10000 (due to memory)
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        print("\n*** Running all faiss methods ***\n")
        print("(asked for " + str(num_to_try) + " nodes)")

        methods_to_run = scoring_with_faiss.all_faiss_methods
        _run_and_time(adj_mat, methods_to_run)

        print("Now running normal versions for comparison")
        # Strip the suffix only where it is actually present, rather than chopping 6 characters
        # off every name.
        normal_versions = [name[:-len(faiss_suffix)] if name.endswith(faiss_suffix) else name
                           for name in methods_to_run]
        _run_and_time(adj_mat, normal_versions, make_dense=True)
# Example #9
# 0
def resources_test():
    """
    Time dense-matrix scoring of 'weighted_corr' and 'weighted_corr_faiss' on growing prefixes
    (100, 1000, 5000 rows) of the brightkite data, printing the dense matrix's size and the
    elapsed time for each.

    NOTE(review): if another function with this same name is defined later in the same module,
    that later definition replaces this one.
    :return: None
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # my OS kills it at 10000 (due to memory)
    for sample_size in (100, 1000, 5000):
        raw_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=sample_size)

        learned_pi = score_data.learn_pi_vector(raw_mat)
        pi_vector_preproc, preproc_mat = expts_labeled_data.adjust_pi_vector(
            learned_pi, raw_mat)

        # plain WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
        methods_to_run = ['weighted_corr', 'weighted_corr_faiss']

        dense_mat = preproc_mat.toarray()
        print("\ndense version takes up " + str(sys.getsizeof(dense_mat)) + " bytes")

        # (An alternative that was tried here: scoring_with_faiss.score_pairs_faiss on the raw matrix.)
        t_start = timer()
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs,
                                               dense_mat,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc,
                                               back_compat=True,
                                               num_docs=preproc_mat.shape[0],
                                               mixed_pairs_sims=[.01],
                                               print_timing=True)
        t_end = timer()

        print("for matrix with " + str(preproc_mat.shape[0]) + " items, "
              + str(preproc_mat.shape[1]) + " affils, ")
        print("ran all methods using dense matrix in " + str(t_end - t_start) + " seconds")
def resources_test(run_all_implementations=True, use_faiss=False):
    """
    Read increasingly large portions of a big matrix and, for each size, score all pairs with
    both the dense and the sparse representation. Shows how things scale and where memory limits
    come in.

    :param run_all_implementations: forwarded to score_pairs() for both the dense and sparse runs
    :param use_faiss: forwarded as prefer_faiss to the dense run only
    :return: None
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # Larger sizes (10000, 100000) were tried earlier: at 10000 no run finished in the length of
    # time I was willing to wait.
    sizes_to_try = (100, 500, 1000, 5000)
    exp_method_names = ('weighted_corr_exp', 'weighted_corr_exp_faiss', 'all')

    for sample_size in sizes_to_try:
        raw_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=sample_size)

        learned_pi = score_data.learn_pi_vector(raw_mat)
        pi_vector_preproc, preproc_mat = expts_labeled_data.adjust_pi_vector(
            learned_pi, raw_mat)

        # (order given here doesn't matter)
        methods_to_run = [
            'cosine', 'cosineIDF',
            # use fast "transform"
            'shared_size', 'adamic_adar', 'newman', 'shared_weight11',
            # medium
            'hamming', 'pearson', 'jaccard',
            # WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
            'weighted_corr', 'weighted_corr_exp',
            # only have slow "terms" method
            'shared_weight1100', 'mixed_pairs',
        ]

        dense_mat = preproc_mat.toarray()
        print("\ndense version takes up " + str(sys.getsizeof(dense_mat)) + " bytes")

        want_exp_model = any(name in methods_to_run for name in exp_method_names)

        # The graph models are fit on the matrix as read, not on the preprocessed one.
        fit_start = timer()
        graph_models = bipartite_fitting.learn_graph_models(raw_mat,
                                                            bernoulli=False,
                                                            pi_vector=None,
                                                            exponential=want_exp_model)
        fit_end = timer()
        if want_exp_model:
            print("time for learning exponential model: " + str(fit_end - fit_start) + " seconds")
        else:
            print("")
        exp_model = graph_models.get('exponential', None)

        num_items, num_affils = preproc_mat.shape

        # --- dense run ---
        dense_start = timer()
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs,
                                               dense_mat,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc,
                                               back_compat=True,
                                               num_docs=num_items,
                                               mixed_pairs_sims=[.01],
                                               print_timing=True,
                                               exp_model=exp_model,
                                               run_all_implementations=run_all_implementations,
                                               prefer_faiss=use_faiss)
        dense_end = timer()
        print("for matrix with " + str(num_items) + " items, " + str(num_affils) + " affils, ")
        print("ran all methods using dense matrix in " + str(dense_end - dense_start) + " seconds")

        # --- sparse run (prefer_faiss is not passed here) ---
        print("\nsparse adj_matrix takes up " + str(asizeof.asizeof(preproc_mat)) + " bytes;")

        sparse_start = timer()
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs,
                                               preproc_mat,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc,
                                               back_compat=True,
                                               num_docs=num_items,
                                               mixed_pairs_sims=[.01],
                                               print_timing=True,
                                               exp_model=exp_model,
                                               run_all_implementations=run_all_implementations)
        sparse_end = timer()
        print("for matrix with " + str(num_items) + " items, " + str(num_affils) + " affils, ")
        print("ran all methods using sparse matrix in " + str(sparse_end - sparse_start) + " seconds")
def test_cosine_versions():
    """
    Compare several ways of computing all-pairs cosine similarity on prefixes (1000 and 2000
    rows) of the brightkite data, printing timing and process memory after each variant.

    Variants, in order: sklearn on the sparse matrix, sklearn on an explicit csr_matrix, the
    home-grown dense version, batched dense sklearn, faiss (dense), and sklearn on a dense matrix.
    :return: None
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    # (an earlier run used sizes 100, 500, 1000, 5000)
    for sample_size in [1000, 2000]:
        raw_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=sample_size)

        learned_pi = score_data.learn_pi_vector(raw_mat)
        pi_vector_preproc, preproc_mat = expts_labeled_data.adjust_pi_vector(
            learned_pi, raw_mat)
        print("\nmatrix has " + str(preproc_mat.shape[0]) + " items, "
              + str(preproc_mat.shape[1]) + " affils ")
        print("process memory: ")
        print(get_process_memory())

        print("\n** sklearn sparse cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs, preproc_mat,
                                                print_timing=True, use_package=True)
        print(get_process_memory())

        print(
            "\n** sklearn, but called on sparse.csc of dense 'transformed' matrix **"
        )
        t_start = timer()
        score_table = cosine_similarity(sparse.csr_matrix(preproc_mat))
        # Pull out the score for each generated pair, replacing NaN with 0. The list itself is
        # not used afterwards; building it is part of what is being timed.
        cos = [0 if np.isnan(val) else val
               for val in (score_table[a, b]
                           for (a, b, _, _, _, _) in score_data.gen_all_pairs(preproc_mat))]
        t_end = timer()
        print("duration: " + str(t_end - t_start) + " seconds")
        print(get_process_memory())

        dense_mat = preproc_mat.toarray()
        print("\nmade matrix dense")
        print(get_process_memory())

        print("\n** home-grown dense cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs, dense_mat,
                                                print_timing=True, use_package=False)
        print(get_process_memory())

        print("\n** sklearn dense, using batches **")
        t_start = timer()
        score_table = scoring_methods_fast.cosine_similarity_n_space(dense_mat, dense_mat,
                                                                     verbose=True)
        cos = [0 if np.isnan(val) else val
               for val in (score_table[a, b]
                           for (a, b, _, _, _, _) in score_data.gen_all_pairs(dense_mat))]
        t_end = timer()
        print("duration: " + str(t_end - t_start) + " seconds")
        print(get_process_memory())

        print("\n** faiss (dense) ** ")
        scoring_with_faiss.score_pairs_faiss_all_exact(dense_mat, ['cosine_faiss'],
                                                       print_timing=True)
        print(get_process_memory())

        print("\n** sklearn dense cosine **")
        # A fresh dense copy is made for this last variant.
        dense_mat = preproc_mat.toarray()
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs, dense_mat,
                                                print_timing=True, use_package=True)
        print(get_process_memory())