Example no. 1
import gc
from pathlib import Path

import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse  # noqa: F401 -- exposes sp.sparse
# mapk/average_precision are assumed to come from the ml_metrics package,
# consistent with the average_precision.mapk calls further below.
from ml_metrics import average_precision, mapk
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity


def evaluate_prediction(query_path, file_query_names, train_path,
                        file_train_names, index_similarity, k):
    # correlation_images maps a query id to its list of relevant train ids
    # (defined elsewhere in the project).
    query_list = []
    predicted_list = []

    for x in file_query_names:
        x = int(x[4:-4])  # 'ima_000042.jpg' -> 42
        subpredicted_list = []
        query_list.append(correlation_images[x])

        # min() guards against an index_similarity row shorter than k
        for y in range(min(k, len(index_similarity[x]))):
            subpredicted_list.append(index_similarity[x][y])
        predicted_list.append(subpredicted_list)

    return mapk(query_list, predicted_list, k)
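

# Hedged note on the filename convention assumed above: query files are
# named like 'ima_000042.jpg', so name[4:-4] strips the 'ima_' prefix and
# the '.jpg' suffix, leaving the zero-padded integer id.
_name = 'ima_{:06d}.jpg'.format(42)
assert int(_name[4:-4]) == 42
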
# Second variant: compares file names on both sides instead of raw ids.
def evaluate_prediction(query_path, file_query_names, train_path,
                        file_train_names, index_similarity, k):
    query_list = []
    predicted_list = []

    for x in file_query_names:
        subpredicted_list = []
        # [-1] marks a query with no relevant train image
        if correlation_images[x] != [-1]:
            query_list.append([
                'ima_{:06d}.jpg'.format(correlation_images[x][i])
                for i in range(len(correlation_images[x]))
            ])
        else:
            query_list.append([])

        # min() guards against an index_similarity row shorter than k
        for y in range(min(k, len(index_similarity[x]))):
            subpredicted_list.append(file_train_names[index_similarity[x][y]])
        predicted_list.append(subpredicted_list)

    return mapk(query_list, predicted_list, k)
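

# Hedged sanity check of the mapk contract both versions rely on
# (ml_metrics-style mapk(actual, predicted, k) over lists of lists):
# query 1 hits at rank 1 (AP = 1.0), query 2 at rank 2 (AP = 0.5).
_actual = [['ima_000003.jpg'], ['ima_000007.jpg']]
_predicted = [['ima_000003.jpg', 'ima_000001.jpg'],
              ['ima_000001.jpg', 'ima_000007.jpg']]
assert abs(mapk(_actual, _predicted, k=2) - 0.75) < 1e-9
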
def main():
    # Data directory
    data_dir = Path('../Data/final/')

    # Anime Dataset
    dataset = AnimeDataset(data_dir=data_dir)

    # Get anime and rating dataframes
    df_anime, df_rating = dataset.get_whole_dataframe()

    # Replace the placeholder rating -1 with NaN
    df_rating['rating'] = df_rating['rating'].replace(-1, np.nan)

    # Merge the anime and rating dataframes on anime_id
    df_merged = df_rating.merge(df_anime,
                                on='anime_id',
                                suffixes=['_user', ''])
    df_merged.rename(columns={'rating_user': 'user_rating'}, inplace=True)

    # Pick the wanted columns
    df_merged = df_merged[['user_id', 'name', 'user_rating']]

    # Restrict to users with ID <= 40000
    df_merged_sub = df_merged[df_merged['user_id'] <= 40000]

    # Get the normalized pivot table
    piv = df_merged_sub.pivot_table(index=['user_id'],
                                    columns=['name'],
                                    values='user_rating')
    piv_norm = piv.apply(lambda x: (x - x.mean()) / (x.max() - x.min()),
                         axis='columns')
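    # For example (illustrative values): a user's ratings [2, 6, 10] have
    # mean 6 and range 8, so they normalize to [-0.5, 0.0, 0.5]; rows a user
    # left entirely unrated stay NaN and are zero-filled below.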

    # Fill the NaNs with 0 and transpose so rows are anime, columns are users
    piv_norm.fillna(0, inplace=True)
    piv_norm = piv_norm.T

    # Drop users with no ratings (all-zero columns)
    piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

    # Create the sparse matrix
    piv_sparse = sp.sparse.csr_matrix(piv_norm.values)

    # Calculate the cosine similarities and convert them to DataFrames
    anime_similarity = cosine_similarity(piv_sparse)
    user_similarity = cosine_similarity(piv_sparse.T)
    df_anime_sim = pd.DataFrame(anime_similarity,
                                index=piv_norm.index,
                                columns=piv_norm.index)
    df_user_sim = pd.DataFrame(user_similarity,
                               index=piv_norm.columns,
                               columns=piv_norm.columns)
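    # Orientation note: after the transpose, piv_norm is anime x user, so
    # df_anime_sim is anime-by-anime and df_user_sim is user-by-user.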

    # Predicting
    top_k_animes(df_anime_sim, 'Hunter x Hunter')
    top_k_users(df_user_sim, piv_norm, 3)

    print('\nWatched list')
    for anime in df_merged_sub[df_merged_sub['user_id'] == 3]['name']:
        print(anime)

    print('\nRecommended list')
    for anime in similar_k_user_recs(df_merged_sub, df_user_sim, piv_norm, 3):
        print(anime)

    hit_rates = []
    for user in piv_norm.columns.to_list()[:1000]:
        print(user)
        recommended_list = similar_k_user_recs(df_merged_sub, df_user_sim,
                                               piv_norm, user)[:10]
        watched_list = df_merged[df_merged['user_id'] == user]['name'].unique()

        # Hit rate
        hit_rate = HitRate(recommended_list, watched_list)
        print("\nHit Rate: ", hit_rate)
        # mapk expects one relevant-items list and one ranked list per query
        mean_average_precision = mapk([list(watched_list)],
                                      [recommended_list], k=10)
        print("MAP@10: ", mean_average_precision)
        hit_rates.append(hit_rate)

    print("\nAverage Hit Rate: ", sum(hit_rates) / len(hit_rates))


def compute_average_novelty(X_test, predicted_top_k):
    # Reconstructed header (the original was lost in this excerpt); the
    # per-row overlap of recent history and predictions is assumed.
    overlap_items = [np.intersect1d(X_test[row], predicted_top_k[row])
                     for row in range(len(X_test))]
    overlap_sum = np.sum(
        [len(overlap_items[row]) for row in range(len(overlap_items))])
    average_novelty = 1 - (overlap_sum / (len(X_test) * X_test.shape[1])
                           )  # new items are the ones that do not overlap
    return average_novelty
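

# Hedged sanity check for the reconstructed compute_average_novelty on
# assumed toy arrays: one overlap per row -> novelty 1 - 2/(2*2) = 0.5.
_hist = np.array([[1, 2], [3, 4]])
_novelty_preds = np.array([[2, 9], [5, 3]])
assert compute_average_novelty(_hist, _novelty_preds) == 0.5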


print("\n     Performance metrics on test set:")
y_pred = np.vstack(
    predicted_sequences[:, 0])  # top 1 recommendation (predicted next click)
gc.collect()

# TODO: this ml_metrics and vstack bookkeeping could be implemented faster
y_test = np.vstack(y_test)  # stack before scoring so shapes match y_pred
accuracy = np.round(accuracy_score(y_test, y_pred), 4)
map2 = np.round(average_precision.mapk(y_test, predicted_sequences, k=2), 4)
map4 = np.round(average_precision.mapk(y_test, predicted_sequences, k=4), 4)
map6 = np.round(average_precision.mapk(y_test, predicted_sequences, k=6), 4)
map12 = np.round(average_precision.mapk(y_test, predicted_sequences, k=12), 4)
coverage = np.round(
    len(np.unique(predicted_sequences[:, :4])) / len(np.unique(X_train)), 4)
novelty = np.round(
    compute_average_novelty(X_test[:, -4:], predicted_sequences[:, :4]), 4)

print("\n    Embedding GRU-RNN Product Type Ranker:")
print("     Accuracy @ 1   {:.4}%".format(accuracy * 100))
print("     MAP @ 2        {:.4}%".format(map2 * 100))
print("     MAP @ 4        {:.4}%".format(map4 * 100))
print("     MAP @ 6        {:.4}%".format(map6 * 100))
print("     MAP @ 12       {:.4}%".format(map12 * 100))
print("     Coverage       {:.4}%".format(coverage * 100))