Example #1
0
def compare_node_distance():
    """
    Compare the average node distance of posters suggested from abstracts,
    from keywords, and by random choice.

    Works on the module-level ``poster_df`` dataframe, which is described as
    having the following columns (only ``tree`` is accessed here):
        - abstract: column contains abstract of all posters
        - tree: human curated topic such as 'F.01.r', sometimes called node
        - keywords: string of keywords given from the conference

    Also relies on the module-level ``nbrs_model`` / ``poster_vect``
    (abstract model) and ``nbrs_model_kw`` / ``keywords_vect`` (keyword
    model), plus the helpers ``get_poster_same_topic``,
    ``compute_node_distance`` and ``scc.get_schedule_rocchio``.

    Returns a list with one row per (trial, number of liked posters):
    ``[poster_idx, abstract_mean, keywords_mean, random_mean, n_liked]``.
    """
    total_posters = len(poster_df)
    trial_count = 1000
    n_suggest = 10  # suggestions scored per method
    n_posters = 5  # same-topic posters requested for each query poster

    rows = []
    for _ in range(trial_count):
        # Draw the query poster, then posters sharing its topic.
        query_idx = np.random.randint(total_posters)
        same_topic = get_poster_same_topic(query_idx,
                                           poster_df,
                                           n_posters=n_posters)
        liked = [query_idx] + same_topic

        # Grow the set of liked posters one at a time: 1 .. n_posters - 1.
        for n_liked in range(1, n_posters):
            _, by_abstract = scc.get_schedule_rocchio(
                nbrs_model, poster_vect, like_posters=liked[0:n_liked])
            _, by_keywords = scc.get_schedule_rocchio(
                nbrs_model_kw, keywords_vect, like_posters=liked[0:n_liked])
            by_chance = np.random.randint(total_posters, size=n_suggest)

            # Column 0 of each model result is skipped; the next n_suggest
            # columns are kept. Row order: abstract, keywords, random.
            model_picks = np.vstack(
                (by_abstract.flatten(), by_keywords.flatten()))
            candidates = np.vstack(
                (model_picks[:, 1:1 + n_suggest], by_chance))

            distance_table = [[
                compute_node_distance(poster_df.tree.iloc[query_idx],
                                      poster_df.tree.iloc[candidate])
                for candidate in candidate_row
            ] for candidate_row in candidates]
            method_means = np.array(distance_table).mean(axis=1)

            record = [query_idx]
            record.extend(method_means)
            record.append(n_liked)
            rows.append(record)

    return rows
def compare_components_vs_topic_distance():
    """
    See the relationship between number of SVD components and
    average topic distance of suggested posters.

    For each SVD size, poster vectors are built from the module-level
    ``tfidf_matrix`` with ``scc.svd_vectorizer`` and a nearest-neighbour
    model is fitted on them. The same randomly drawn query posters are then
    scored against every model.

    Relies on module-level ``poster_df`` (its ``tree`` column holds the topic
    node), ``tfidf_matrix``, ``scc``, ``get_poster_same_topic`` and
    ``compute_node_distance``.

    Returns a ``pandas.DataFrame`` with columns ``poster_number``,
    ``distance`` (mean node distance of the suggestions) and
    ``n_components``; one row per (model, query poster).
    """
    # Size the population from poster_df, the frame the sampled indices are
    # used to index below. The previous code read len(poster_vect) while also
    # assigning poster_vect later in this function, which made the name local
    # and raised UnboundLocalError before any work was done.
    N = len(poster_df)  # total number of posters
    N_trials = 1000
    n_suggest = 10
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # Query posters are drawn once so every model sees the same queries.
    query_posters = np.random.randint(N, size=N_trials)

    # training to get poster vectors, one set per SVD size
    poster_vect_comp = [
        scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        for n_c in n_components_list
    ]

    # loop through the models
    result = []
    for n_c, vectors in zip(n_components_list, poster_vect_comp):
        nbrs_model = scc.build_nearest_neighbors(vectors)
        for poster_idx in query_posters:
            # NOTE(review): only the query poster itself (poster_likes[0:1])
            # is passed on, so the same-topic posters do not influence the
            # suggestions here. The call is kept in case the helper consumes
            # random state that later draws depend on -- confirm and drop.
            poster_idx_same_topic = get_poster_same_topic(poster_idx, poster_df, n_posters=5)
            poster_likes = [poster_idx] + poster_idx_same_topic
            _, poster_idx_abs = scc.get_schedule_rocchio(nbrs_model, vectors, like_posters=poster_likes[0:1])
            # Skip the first returned index, keep the next n_suggest.
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]
            avg_distance = np.array([
                compute_node_distance(poster_df.tree.iloc[poster_idx], poster_df.tree.iloc[idx])
                for idx in poster_list
            ]).mean()
            result.append([poster_idx, avg_distance, n_c])

    result_df = pd.DataFrame(result, columns=['poster_number', 'distance', 'n_components'])

    return result_df
def compare_node_distance():
    """
    Measure how close suggested posters are, in topic-node distance, to a
    randomly chosen query poster for three suggestion methods: the abstract
    model, the keyword model, and uniformly random picks.

    Uses the module-level ``poster_df`` dataframe, described as having:
        - abstract: column contains abstract of all posters
        - tree: human curated topic such as 'F.01.r', sometimes called node
        - keywords: string of keywords given from the conference
    Only ``tree`` is read in this function. The fitted models and vectors
    (``nbrs_model``, ``poster_vect``, ``nbrs_model_kw``, ``keywords_vect``)
    are likewise taken from module scope.

    Returns a list of rows
    ``[poster_idx, abstract_mean, keywords_mean, random_mean, n_liked]``,
    one per trial and per number of liked posters.
    """
    population = len(poster_df)  # total number of posters
    n_trials = 1000  # number of trials
    n_suggest = 10  # number of suggested posters in experiment
    n_posters = 5  # same-topic posters requested per query poster

    records = []
    for _trial in range(n_trials):
        target = np.random.randint(population)  # randomly select one poster
        companions = get_poster_same_topic(target, poster_df, n_posters=n_posters)
        likes = [target] + companions  # query poster first, then same-topic ones

        for n_liked in range(1, n_posters):
            _, abstract_hits = scc.get_schedule_rocchio(nbrs_model, poster_vect, like_posters=likes[0:n_liked])
            _, keyword_hits = scc.get_schedule_rocchio(nbrs_model_kw, keywords_vect, like_posters=likes[0:n_liked])
            random_hits = np.random.randint(population, size=n_suggest)  # random baseline

            # Stack both model outputs, drop column 0 and keep n_suggest
            # columns, then append the random baseline as a third row.
            stacked_models = np.vstack((abstract_hits.flatten(), keyword_hits.flatten()))
            suggestions = np.vstack((stacked_models[:, 1:1 + n_suggest], random_hits))

            per_method = []
            for method_row in suggestions:
                row_distances = []
                for suggested in method_row:
                    row_distances.append(
                        compute_node_distance(poster_df.tree.iloc[target], poster_df.tree.iloc[suggested]))
                per_method.append(row_distances)

            averages = list(np.array(per_method).mean(axis=1))
            records.append([target] + averages + [n_liked])

    return records
Example #4
0
def compare_components_vs_topic_distance():
    """
    See the relationship between number of SVD components and
    average topic distance of suggested posters.

    For each SVD size, poster vectors are built from the module-level
    ``tfidf_matrix`` with ``scc.svd_vectorizer`` and a nearest-neighbour
    model is fitted on them. The same randomly drawn query posters are then
    scored against every model.

    Relies on module-level ``poster_df`` (its ``tree`` column holds the topic
    node), ``tfidf_matrix``, ``scc``, ``get_poster_same_topic`` and
    ``compute_node_distance``.

    Returns a ``pandas.DataFrame`` with columns ``poster_number``,
    ``distance`` (mean node distance of the suggestions) and
    ``n_components``; one row per (model, query poster).
    """
    # Size the population from poster_df, the frame the sampled indices are
    # used to index below. The previous code read len(poster_vect) while also
    # assigning poster_vect later in this function, which made the name local
    # and raised UnboundLocalError before any work was done.
    N = len(poster_df)  # total number of posters
    N_trials = 1000
    n_suggest = 10
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # Query posters are drawn once so every model sees the same queries.
    query_posters = np.random.randint(N, size=N_trials)

    # training to get poster vectors, one set per SVD size
    poster_vect_comp = [
        scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        for n_c in n_components_list
    ]

    # loop through the models
    result = []
    for n_c, vectors in zip(n_components_list, poster_vect_comp):
        nbrs_model = scc.build_nearest_neighbors(vectors)
        for poster_idx in query_posters:
            # NOTE(review): only the query poster itself (poster_likes[0:1])
            # is passed on, so the same-topic posters do not influence the
            # suggestions here. The call is kept in case the helper consumes
            # random state that later draws depend on -- confirm and drop.
            poster_idx_same_topic = get_poster_same_topic(poster_idx,
                                                          poster_df,
                                                          n_posters=5)
            poster_likes = [poster_idx] + poster_idx_same_topic
            _, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs_model, vectors, like_posters=poster_likes[0:1])
            # Skip the first returned index, keep the next n_suggest.
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]
            avg_distance = np.array([
                compute_node_distance(poster_df.tree.iloc[poster_idx],
                                      poster_df.tree.iloc[idx])
                for idx in poster_list
            ]).mean()
            result.append([poster_idx, avg_distance, n_c])

    result_df = pd.DataFrame(
        result, columns=['poster_number', 'distance', 'n_components'])

    return result_df