def compare_components_vs_topic_distance():
    """
    See the relationship between number of SVD components
    and average topic distance of suggested posters.

    For every entry of a fixed list of SVD component counts, the module-level
    ``tfidf_matrix`` is projected with ``scc.svd_vectorizer`` and a
    nearest-neighbor model is built on the result. For each of ``N_trials``
    randomly drawn poster indices, the suggestions returned by
    ``scc.get_schedule_rocchio`` are compared with the drawn poster using
    ``compute_node_distance`` on the ``tree`` column of ``poster_df``, and the
    mean over the ``n_suggest`` suggestions is recorded.

    Returns
    -------
    result_df : pandas.DataFrame
        One row per (component count, trial) with columns
        'poster_number', 'distance' and 'n_components'.
    """
    # Read the module-level ``poster_vect`` here. The per-model vectors below
    # deliberately use a different local name: assigning to ``poster_vect``
    # anywhere in this function would make it local and turn this line into
    # an UnboundLocalError.
    N = len(poster_vect)  # total number of posters
    N_trials = 1000
    n_suggest = 10
    n_posters = np.random.randint(N, size=N_trials)  # pre-drawn poster indices
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # training to get poster vectors, one matrix per component count
    poster_vect_comp = [
        scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        for n_c in n_components_list
    ]

    # loop through the models; every model is scored on the same poster draws
    result = []
    for n_components, model_vect in zip(n_components_list, poster_vect_comp):
        nbrs = scc.build_nearest_neighbors(model_vect)
        for poster_idx in n_posters:
            # NOTE(review): only the first element (the drawn poster itself)
            # is passed as a liked poster below; the same-topic posters are
            # computed but unused. Call kept as in the original -- confirm
            # whether it can be dropped.
            poster_idx_same_topic = get_poster_same_topic(
                poster_idx, poster_df, n_posters=5)
            poster_likes = [poster_idx] + poster_idx_same_topic

            _, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs, model_vect, like_posters=poster_likes[0:1])
            # skip the first returned index, keep the next n_suggest
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]

            seed_node = poster_df.tree.iloc[poster_idx]
            avg_distance = np.array([
                compute_node_distance(seed_node, poster_df.tree.iloc[idx])
                for idx in poster_list
            ]).mean()
            result.append([poster_idx, avg_distance, n_components])

    result_df = pd.DataFrame(
        result, columns=['poster_number', 'distance', 'n_components'])
    return result_df
def compare_components_vs_topic_distance():
    """
    See the relationship between number of SVD components
    and average topic distance of suggested posters.

    For every entry of a fixed list of SVD component counts, the module-level
    ``tfidf_matrix`` is projected with ``scc.svd_vectorizer`` and a
    nearest-neighbor model is built on the result. For each of ``N_trials``
    randomly drawn poster indices, the suggestions returned by
    ``scc.get_schedule_rocchio`` are compared with the drawn poster using
    ``compute_node_distance`` on the ``tree`` column of ``poster_df``, and the
    mean over the ``n_suggest`` suggestions is recorded.

    Returns
    -------
    result_df : pandas.DataFrame
        One row per (component count, trial) with columns
        'poster_number', 'distance' and 'n_components'.
    """
    # Read the module-level ``poster_vect`` here. The per-model vectors below
    # deliberately use a different local name: assigning to ``poster_vect``
    # anywhere in this function would make it local and turn this line into
    # an UnboundLocalError.
    N = len(poster_vect)  # total number of posters
    N_trials = 1000
    n_suggest = 10
    n_posters = np.random.randint(N, size=N_trials)  # pre-drawn poster indices
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # training to get poster vectors, one matrix per component count
    poster_vect_comp = [
        scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        for n_c in n_components_list
    ]

    # loop through the models; every model is scored on the same poster draws
    result = []
    for n_components, model_vect in zip(n_components_list, poster_vect_comp):
        nbrs = scc.build_nearest_neighbors(model_vect)
        for poster_idx in n_posters:
            # NOTE(review): only the first element (the drawn poster itself)
            # is passed as a liked poster below; the same-topic posters are
            # computed but unused. Call kept as in the original -- confirm
            # whether it can be dropped.
            poster_idx_same_topic = get_poster_same_topic(
                poster_idx, poster_df, n_posters=5)
            poster_likes = [poster_idx] + poster_idx_same_topic

            _, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs, model_vect, like_posters=poster_likes[0:1])
            # skip the first returned index, keep the next n_suggest
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]

            seed_node = poster_df.tree.iloc[poster_idx]
            avg_distance = np.array([
                compute_node_distance(seed_node, poster_df.tree.iloc[idx])
                for idx in poster_list
            ]).mean()
            result.append([poster_idx, avg_distance, n_components])

    result_df = pd.DataFrame(
        result, columns=['poster_number', 'distance', 'n_components'])
    return result_df
# from http://www.sfn.org/
import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# Build a real list rather than a lazy ``map`` object: on Python 3 a map
# iterator is exhausted after one pass, which would leave this module-level
# name empty for any later reuse.
abstracts_preprocess = [scc.preprocess(abstract) for abstract in abstracts]

# poster vector or abstract vector
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# keywords vector
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """
    Compute distance between two string nodes in format 'F.01.r'
    """
    # NOTE(review): only the comparison of the first dot-separated component
    # is visible in this excerpt; the definition may continue beyond it.
    node_1 = node_1.split('.')
    node_2 = node_2.split('.')
    if node_1[0] != node_2[0]:
        return 3
# note that we use data provided by SfN, which you can request through the society
# from http://www.sfn.org/
import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# Build a real list rather than a lazy ``map`` object: on Python 3 a map
# iterator is exhausted after one pass, which would leave this module-level
# name empty for any later reuse.
abstracts_preprocess = [scc.preprocess(abstract) for abstract in abstracts]

# poster vector or abstract vector
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# keywords vector
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """
    Compute distance between two string nodes in format 'F.01.r'
    """
    # NOTE(review): only the comparison of the first dot-separated component
    # is visible in this excerpt; the definition may continue beyond it.
    node_1 = node_1.split('.')
    node_2 = node_2.split('.')
    if node_1[0] != node_2[0]:
        return 3