Example #1
from collections import defaultdict

from surprise import Dataset, SVD
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k for each user."""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))
Example #2
 def _Call_Movielens_for_Evaluation(self):
     # Load the built-in MovieLens 100k dataset for evaluation.
     data = Dataset.load_builtin('ml-100k')
     return data
Example #3
def make_predictions(user_id):
    performance = []
    algorithms = ['SVD', 'KNN', 'ALS']

    # First train an SVD algorithm on the movielens dataset.
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    algo_SVD = SVD()
    algo_SVD.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # Note: build_anti_testset() fills the unknown true ratings with the train-set
    # mean, so the RMSE values below compare predictions against that fill value
    # rather than against held-out ratings.
    # SVD algorithm
    testset = trainset.build_anti_testset()
    predictions_SVD = algo_SVD.test(testset)

    rmse_SVD = accuracy.rmse(predictions_SVD)
    performance.append(rmse_SVD)

    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)

    predictions_KNN = algo_KNN.test(testset)

    rmse_KNN = accuracy.rmse(predictions_KNN)
    performance.append(rmse_KNN)

    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    predictions_ALS = algo_ALS.test(testset)

    rmse_ALS = accuracy.rmse(predictions_ALS)
    performance.append(rmse_ALS)

    # comparing algorithms by performance
    best_performance_index = performance.index(min(performance))
    best_algorithm = algorithms[best_performance_index]

    if best_algorithm == 'SVD':
        top_n = get_top_n(predictions_SVD, n=10)
    elif best_algorithm == 'KNN':
        top_n = get_top_n(predictions_KNN, n=10)
    elif best_algorithm == 'ALS':
        top_n = get_top_n(predictions_ALS, n=10)

    i_cols = [
        'movie_id', 'movie_title', 'release_date', 'video_release_date',
        'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]

    items = pd.read_csv('../../ml-100k/u.item',
                        sep='|',
                        names=i_cols,
                        encoding='latin-1')

    predictions = []
    # Print the recommended items for the user
    for uid, user_ratings in top_n.items():
        if int(uid) == int(user_id):
            # print(uid, [iid for (iid, _) in user_ratings])
            for (iid, _) in user_ratings:
                # Raw ml-100k item ids match the movie_id column in u.item.
                title = items[items['movie_id'] == int(iid)]['movie_title']
                title_t = str(title)
                title_split = title_t.split()
                print(title_split)
                # print(title_split(1))
                # print(title_split(2))
                # print(title_t)
                predictions.append(title_t)

    return predictions
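A minimal way to call the function above, assuming the surrounding module also defines the get_top_n helper it relies on and that the ml-100k files sit at the path used inside the function; the user id is only an example:

if __name__ == '__main__':
    # Print up to ten recommended titles for raw user id "196".
    for title in make_predictions("196"):
        print(title)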
Example #4
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import Dataset
from surprise import accuracy
from surprise import SlopeOne
from surprise.model_selection import train_test_split

# Load the movielens-1m dataset  UserID::MovieID::Rating::Timestamp
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Configure the algorithm. K = number of neighbours. Name = type of similarity measure. User based = user-based or item-based filtering.

print("Usando o algoritmo SlopeOne")
algoritmo = SlopeOne()

algoritmo.fit(trainset)

# Select the user and the movie to analyse
# User 49: between 18 and 24 years old, a programmer living in Houston, Texas
uid = str(49)  # raw user id
# Movie seen and rated: Negotiator, The (1998)::Action|Thriller. Rating: 4
iid = str(2058)  # raw item id

# get a prediction for specific users and items.
print("Predição de avaliação: ")
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
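# A possible continuation of the step described above (not part of the original excerpt):
predictions = algoritmo.test(testset)
accuracy.rmse(predictions, verbose=True)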
Example #5
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.predictions import PredictionImpossible
from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise.prediction_algorithms.knns import SymmetricAlgo
import copy
from collections import defaultdict
import os
import time
from itertools import starmap

import pandas as pd
import matplotlib
import plotly

file_path_save_data = 'data/processed/'  # don't forget to create this folder before running the script
datasetname = 'ml-100k'  # valid dataset names are 'ml-100k', 'ml-1m', and 'jester'
data1 = Dataset.load_builtin(datasetname)
       
path = '../ml-100k/u.item'
df = pd.read_csv(path, sep="|", encoding="iso-8859-1",
                 names=['id', 'name', 'date', 'space', 'url'] +
                       ['cat%d' % i for i in range(1, 20)])
list_of_cats = {}

df1 = df[['id'] + ['cat%d' % i for i in range(1, 20)]]
for row in df.itertuples(index=True, name='Pandas'):
    id = str(getattr(row, "id"))
    cate_x = [getattr(row, "cat%d" % i) for i in range(1, 20)]
    list_of_cats[id] = cate_x

def into_rate(cate, rate):
    # Replace each genre flag (1) with the given rating value.
    for index in range(len(cate)):
        if cate[index] == 1:
            cate[index] = rate
    return cate


def create_data_subsets(data, name_prefix, data_subsetting_params):
    # Signature assumed from the call further below; the start of this helper is
    # missing from the excerpt, which resumes at the subsetting step.
    start_t = time.time()

    datasets_list = list(starmap(subset_dataset, data_subsetting_params))

    end_t = time.time()
    print("\n%.2f seconds elapsed \n" % (end_t - start_t))

    subsetted_datasets = {}

    for dataset in datasets_list:
        subsetted_datasets[name_prefix + dataset_desc_str(dataset)] = dataset

    return subsetted_datasets


# %%
jester = Dataset.load_builtin('jester')
print("loaded jester")
jester = rescale_dataset(jester)

# parameters are: (dataset, min_ratings_per_user, max_ratings_per_user, total_ratings)
jester_data_subsetting_params = [(jester, 127, 150, 50000),
                                 (jester, 116, 126, 50000),
                                 (jester, 35, 40, 50000),
                                 (jester, 26, 34, 20000),
                                 (jester, 16, 25, 20000),
                                 (jester, 1, 15, 20000)]

jester_datasets = create_data_subsets(jester, "jester",
                                      jester_data_subsetting_params)

del jester
 def __init__(self):
     self.data = Dataset.load_builtin('ml-1m')
Example #8
import numpy as np
import csv

from surprise import Dataset, KNNBasic, SVD, SVDpp, BaselineOnly
from surprise.model_selection import KFold, cross_validate
from cf_models import EbcrMsdKNN, EbcrCosKNN, EbcrNormPccKNN, NormPcc, SW_Norm_PccKNN, SW_MSD_KNN, SW_COS_KNN, LS_MSD_KNN, LS_COS_KNN, LS_Norm_PccKNN

__author__ = "Yu DU"

# Datasets initialisation
ml_100k = Dataset.load_builtin('ml-100k')
ml_1m = Dataset.load_builtin('ml-1m')
jester = Dataset.load_builtin('jester')

# Split train and test set
kf = KFold(random_state=0, n_splits=5)

list_k = [5, 10, 20, 40, 60, 80, 100, 200]
list_k2 = [5, 10, 15, 20, 25, 30, 35, 40]

# The Ml-100k Dataset
with open('results_ml100k_all.csv', mode='w') as result_file:
    fieldnames = ['k', 'algo', 'MAE', 'RMSE']
    writer = csv.DictWriter(result_file, fieldnames=fieldnames)
    writer.writeheader()

    # SVD algo
    svd = SVD()
    out_svd = cross_validate(svd,
                             ml_100k, ['rmse', 'mae'],
                             kf,
Example #9
    def execute(self, params, **kwargs):
        # Load the movielens-100k dataset (download it if needed),
        data = Dataset.load_builtin('ml-100k')

        self.marvin_initial_dataset = {"data": data}
Example #10
import os
import urllib.request
import zipfile

from azureml.core import Run  # Azure ML SDK (assumed import for Run)
from surprise import Dataset

run = Run.get_submitted_run()

# manually downloading the file, as it requires a prompt otherwise
url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
DATASETS_DIR = os.path.expanduser('~') + '/.surprise_data/'

print("Starting")

name = 'ml-100k'
os.makedirs(DATASETS_DIR, exist_ok=True)
urllib.request.urlretrieve(url, DATASETS_DIR + 'tmp.zip')

with zipfile.ZipFile(DATASETS_DIR + 'tmp.zip', 'r') as tmp_zip:
    tmp_zip.extractall(DATASETS_DIR + name)

data = Dataset.load_builtin(name)
trainingSet = data.build_full_trainset()

#############################################################################################################################
modelVariations = {
    "model1.pkl": {
        'name': 'cosine',
        'user_based': False
    },
    "model2.pkl": {
        'name': 'cosine',
        'user_based': True
    },
    "model3.pkl": {
        'name': 'msd',
        'user_based': True
Example #11
# -- encoding:utf-8 --
import warnings
from surprise import Dataset
from surprise import evaluate
from surprise import KNNBaseline, KNNBasic

warnings.filterwarnings('ignore')

# 1. Load the data
# Option 1: use the built-in API provided by the surprise framework to load the movielens data
# name: which dataset to load; valid values: 'ml-100k', 'ml-1m', and 'jester'
# By default this API downloads the data to the path "~/.surprise_data"
data = Dataset.load_builtin(name='ml-100k')

# 2. Split the data for cross-validation
data.split(5)

# 3. Build the model object
bsl_options = {
    'method': 'als',  # solver to use; valid values: als and sgd
    'n_epochs': 10,  # number of iterations
    'reg_i': 20,  # regularization coefficient for items
    'reg_u': 10  # regularization coefficient for users
}
"""
k=40: number of neighbour samples to use when predicting
min_k=1: minimum number of neighbouring users/items required to produce a prediction
sim_options={}: how the similarity matrix is computed
"""
sim_options = {
    'name':
Example #12
Y_train = np.loadtxt('data/train.txt', delimiter='\t') - np.array([1, 1, 0])
Y_test = np.loadtxt('data/test.txt', delimiter='\t') - np.array([1, 1, 0])

mu = np.mean(Y_train[:, 2])
epochs = 100
lamb = 1
U_ub, V_ub, _, _ = matrix_factorization(Y_train, 943, 1682, k, lamb, 0.03,
                                        epochs)
U_b, V_b, A_b, B_b = matrix_factorization(Y_train, 943, 1682, k, lamb, 0.03,
                                          epochs, True)
err_unbiased = score(Y_test, U_ub, V_ub)
err_biased = score(Y_test, U_b, V_b, True, A_b, B_b, mu)
print('Test error (biased):', err_biased)
print('Test error (unbiased):', err_unbiased)

data_surprise = Dataset.load_builtin('ml-100k')
data_train, data_test = train_test_split(data_surprise, test_size=0.1)
model = SVD(n_factors=k)
model.fit(data_train)
rmse = accuracy.rmse(model.test(data_test))
print('Test error (SVD):', rmse**2 / 2)
model = SVD(n_factors=k)
data_full = data_surprise.build_full_trainset()
model.fit(data_full)
V = model.qi.T

best, most_popular = get_best_and_popular()
movie_selection = {'Best Movies': best, 'Most Popular Movies': most_popular}


def scatterplot(x, y, color, selection, indices, title):
Example #13
        'NormalPredictor': '[{}]({})'.format('Random',
                                             stable +
                                             'basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor'),
        'ml-100k': '[{}]({})'.format('Movielens 100k',
                                     'http://grouplens.org/datasets/movielens/100k'),
        'ml-1m': '[{}]({})'.format('Movielens 1M',
                                   'http://grouplens.org/datasets/movielens/1m'),
        }


# set RNG
np.random.seed(0)
random.seed(0)

dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms.

table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)
Example #14
import surprise
from surprise import Dataset


def built_in() -> surprise.dataset.DatasetAutoFolds:
    data = Dataset.load_builtin(name='ml-100k', prompt=True)
    return data
Example #15
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
from surprise import SVD, Dataset, NMF
from surprise.model_selection import train_test_split
from surprise.model_selection.search import GridSearchCV
from surprise.model_selection.validation import cross_validate

def custom_rmse_cv():
    pass

if __name__ == '__main__':
    df = pd.read_csv('data/ml-latest-small/ratings.csv')
    
    # ratings = pd.pivot_table(data=df, values='rating', index='userId', columns='movieId')
    ratings = Dataset.load_builtin('ml-100k')
    # train, test = train_test_split(ratings)

    # algo = SVD(n_factors=50)
    # SVD.fit(train)

    # param_grid = {'n_epochs': [20], 'lr_all': [.01], 'n_factors': [183],
    #             'reg_all': [.1], 'verbose': [True]}
    # gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
    # gs.fit(ratings)
    # print(gs.best_params)

    # for e in range(20,31):
    #     print(e)
    #     algo = SVD(n_factors=183, reg_all=.1, lr_all=.01, n_epochs=e)
    #     results = cross_validate(algo, ratings, measures=['RMSE'])
Example #16
from surprise import SVD
from surprise import Dataset, Reader
from surprise.dump import dump, load
import os

if os.path.exists("/pfs/out/model"):
    # If we have model saved by previous training, load it
    _, algo = load("/pfs/out/model")
    reader = Reader(line_format='user item rating timestamp', sep=' ')

    # Train model with each new committed train data
    for dirpath, dirs, files in os.walk("/pfs/training"):
        for filename in files:
            filepath = os.path.join(dirpath, filename)
            data = Dataset.load_from_file(
                filepath, reader=reader).build_full_trainset()
            algo.fit(data)
else:
    # If it's initial run, train with existing dataset

    # Load the movielens-100k dataset (download it if needed),
    data = Dataset.load_builtin('ml-100k').build_full_trainset()

    # We'll use the famous SVD algorithm.
    algo = SVD()
    algo.fit(data)

# In both cases, save the trained model
dump("/pfs/out/model", algo=algo)
Example #17
import numpy as np
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import KNNBasic

# Load dataset
dataset = 'ml-100k'
data = Dataset.load_builtin(dataset)

# Set KFold validation and ensure random state remains consistent between all models
kf = KFold(n_splits=10, random_state = 0)

# Use RMS error and mean abs error as metrics
out = cross_validate(KNNBasic(), data, ['rmse', 'mae'], kf)

# Format outputs from surprise CV method out
meanTestRMSE = '{:.3f}'.format(np.mean(out['test_rmse']))
meanTestMAE = '{:.3f}'.format(np.mean(out['test_mae']))

# Print results
print("\nKNN Basic Recommender Model has achieved:")
print("    RMSE: " + meanTestRMSE)
print("    MAE : " + meanTestMAE)






Example #18
import pandas as pd
from surprise import Dataset


def load_movies_data():
    # Convert raw user/item ids to 0-based integers and drop the timestamp column.
    data = pd.DataFrame(Dataset.load_builtin("ml-100k").raw_ratings)
    data[0] = pd.to_numeric(data[0]) - 1
    data[1] = pd.to_numeric(data[1]) - 1
    del data[3]
    return data.values
Example #19
# Import the surprise package and the built-in data
from surprise import Dataset
from surprise import KNNBasic
from collections import defaultdict

# loading data
dataset = Dataset.load_builtin("ml-100k")
trainingSet = dataset.build_full_trainset()
trainingSet

# cosine similarity between 2 vectors
sim_options = {'name': 'cosine', 'user_based': False}
knn = KNNBasic(sim_options=sim_options)

# training the model (train() was replaced by fit() in newer Surprise releases)
knn.fit(trainingSet)

# movie recommendations for users
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

# top five movie recommendations for each user


def get_top5_recommendations(predictions, topN=5):

    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))

    for uid, user_ratings in top_recs.items():
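        # The excerpt ends here; a typical completion (mirroring get_top_n in
        # Example #28) sorts each user's predictions and keeps the top N:
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]

    return top_recs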
Example #20
    def __init__(self,
                 embedding_dimension=20,
                 n_items_to_recommend=4,
                 seed=0,
                 n_users=40,
                 n_items=500,
                 normalize_reward=False):
        """
        Environment that models a sequential recommendation process using the MovieLens dataset.
        PMF (Probabilistic Matrix Factorization) is performed to obtain user/item embeddings.

        :param embedding_dimension: size of the user/item embeddings
        :param n_items_to_recommend: number of items to recommend; each action is a list of that size
        :param seed:
        :param n_users: number of users
        :param n_items: number of items
        :param normalize_reward: normalize [1,5] ratings to [-1,1] rewards
        """
        self.normalize_reward = normalize_reward
        self.embedding_dimension = embedding_dimension
        self.n_rec = n_items_to_recommend
        self.seed(seed)
        # Load the movielens-100k dataset (download it if needed),
        data = Dataset.load_builtin('ml-100k')

        # sample random trainset and testset
        # test set is made of 25% of the ratings.
        self.trainset, self.testset = train_test_split(data, test_size=.25)

        self.algo = SVD(n_factors=self.embedding_dimension, biased=False)
        self.algo.fit(self.trainset)

        self.users = self.algo.pu[:n_users]
        self.items = self.algo.qi[:n_items]

        self.n_users = len(self.users)
        self.n_items = len(self.items)

        if self.n_users < n_users:
            warnings.warn("Only %d users are available in dataset" %
                          self.n_users)
        if self.n_items < n_items:
            warnings.warn("Only %d items are available in dataset" %
                          self.n_items)

        self.Users = {}
        for i in range(self.n_users):
            user = User(id=i, embedding=self.users[i])
            self.Users[user.id] = user

        self.Items = {}
        for j in range(self.n_items):
            item = Item(id=j, embedding=self.items[j], use_until=np.inf)
            self.Items[item.id] = item

        self.active_uid = self.np_random.choice(range(self.n_users))
        self.bought_items = defaultdict(set)
        # logs
        self.steps_count = 0
        self.info = {}

        # TODO: make action and observation space. checkout robotics envs + FlattenDictWrapper
        # https://github.com/openai/gym/tree/5404b39d06f72012f562ec41f60734bd4b5ceb4b/gym/envs/robotics
        self.action_space = None
        self.observation_space = None
Example #21
 def load_ratings_from_surprise(self) -> DatasetAutoFolds:
     ratings = Dataset.load_builtin('ml-100k')
     return ratings
Example #22
def main():
    # Load the movielens-100k dataset  UserID::MovieID::Rating::Timestamp
    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.15)
    user_collaborative_filtering(trainset, testset)
Example #23
import pandas as pd
from surprise import Dataset
from surprise import Reader
# This is the same data that was plotted for similarity earlier
# with one new user "E" who has rated only movie 1
#Load_data
ratings_dict = {
    "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
    "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
    "rating": [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3],
}
df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))
# Load the pandas DataFrame (columns must be given in user, item, rating order)
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
#Load builtin movie lens dataset
movielens = Dataset.load_builtin('ml-100k')

#Recommender.py
from surprise import KNNWithMeans
#To use item based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False  #compute similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)
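The excerpt stops after constructing the estimator; fitting it on the small DataFrame and predicting user E's rating of movie 2 (exactly what the later copy of this example does) looks like this:

training_set = data.build_full_trainset()
algo.fit(training_set)

# Predict how user 'E' would rate movie 2.
prediction = algo.predict('E', 2)
print(prediction.est)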
Example #24
from surprise import KNNWithMeans, KNNWithZScore
from surprise import Dataset
from surprise.model_selection import GridSearchCV, RandomizedSearchCV, KFold
import time

training_data = Dataset.load_builtin("ml-100k")
sim_options = {
    "name": ["pearson", "msd", "cosine"],
    "min_support": [2, 4, 5],
    "user_based": [False, True],
}

print('\nRUNNING RANDOMIZED SEARCH')
print('   "name": ["pearson", "msd", "cosine"]')
print('   "min_support": [2, 4, 5]')
print('   "user_based": [False, True]\n')

param_grid = {"sim_options": sim_options}
start_time = time.time()

# RandomizedSearchCV samples parameter combinations rather than trying them all
gs = RandomizedSearchCV(KNNWithZScore,
                        param_grid,
                        measures=["rmse", "mae"],
                        cv=3)
gs.fit(training_data)

print()
print("RMSE:", gs.best_score["rmse"])
print(gs.best_params["rmse"])
print()
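The snippet starts a timer and requests MAE as well, but the excerpt ends after the RMSE report; a small follow-up using only the names defined above might be:

print("MAE :", gs.best_score["mae"])
print(gs.best_params["mae"])
print("Search took %.1f seconds" % (time.time() - start_time))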
Example #25
def experiments(config_file):
    args = get_args_parser().parse_args(['@' + config_file])

    # Set seed
    np.random.seed(int(args.seed))

    # Construct output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    outdir = args.outdir + str(args.dataset) + "/" + timestamp + '/'

    # Create results directory
    outdir_path = Path(outdir)
    if not outdir_path.is_dir():
        os.makedirs(outdir)

    # Logging
    logfile = outdir + 'log.txt'
    log(logfile, "Directory " + outdir + " created.")

    # Set dataset
    if str(args.dataset) == 'ml-100k':
        dataset_name = 'MovieLens 100K'
    else:
        dataset_name = 'MovieLens 1M'

    # Load the MovieLens dataset (download it if needed),
    data = Dataset.load_builtin(str(args.dataset))

    # 80-20 split
    train_dataset, test_dataset = train_test_split(data,
                                                   test_size=.20,
                                                   random_state=int(args.seed))

    # Run Autoencoder
    [a_mse, a_runtime] = autoencoder(str(args.dataset), logfile,
                                     int(args.seed))

    # Set algorithms
    user_based_msd_sim_options = {'name': 'msd', 'user_based': True}
    user_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': True
    }
    user_based_msd_algo = KNNBasic(sim_options=user_based_msd_sim_options)
    user_based_pearson_baseline_algo = KNNBasic(
        sim_options=user_based_pearson_baseline_sim_options)

    item_based_sim_options = {'name': 'msd', 'user_based': False}
    item_based_pearson_baseline_sim_options = {
        'name': 'pearson_baseline',
        'user_based': False
    }
    item_based_msd_algo = KNNBasic(sim_options=item_based_sim_options)
    item_based_pearson_baseline_algo = KNNBasic(
        sim_options=item_based_pearson_baseline_sim_options)

    algorithms = (
        ("User MSD", user_based_msd_algo),
        ("User Pearson Baseline", user_based_pearson_baseline_algo),
        ("Item MSD", item_based_msd_algo),
        ("Item Pearson Baseline", item_based_pearson_baseline_algo),
    )

    # Plotting
    plt.style.use('dark_background')
    fig, ax = plt.subplots()

    # Autoencoder results
    runtimes = [a_runtime]
    mses = [a_mse]
    # ax.annotate("Autoencoder", (runtimes[0] + .001, mses[0] + .001))

    # Running
    for name, algorithm in algorithms:
        log(logfile, dataset_name + ", " + name)

        # Train
        time_start = time.time()
        algorithm.fit(train_dataset)
        time_stop = time.time()
        log(
            logfile,
            'Train time: {0:f}'.format(round(time_stop - time_start,
                                             2)).strip('0'))

        # Test
        time_start = time.time()
        predictions = algorithm.test(test_dataset)
        time_stop = time.time()
        runtime = round(time_stop - time_start, 2)
        runtimes += [runtime]
        log(logfile, 'Test time: {0:f}'.format(runtime).strip('0'))

        # MSE metric
        mse = accuracy.mse(predictions, verbose=False)
        mses += [mse]
        log(logfile, 'MSE: {0:1.4f}\n'.format(mse))

    # Draw scatter plot
    ax.scatter(runtimes[1:], mses[1:], marker='x', color='red')
    # ax.scatter(runtimes, mses, marker='x', color='red')

    # Annotate scatter plot, i=0 is for Autoencoder
    for i, (name, _) in enumerate(algorithms):
        ax.annotate(name, (runtimes[i + 1] + .001, mses[i + 1] + .001))

    # Set plot settings
    plt.title("{}".format(dataset_name), size=15)
    plt.xlabel('Runtime (s)')
    plt.ylabel('MSE')

    # Save plot
    plt.savefig(outdir + 'plot.png', bbox_inches='tight')
Example #26
import pandas as pd
from surprise import Dataset
from surprise import Reader

rating_dict = {
    "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
    "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
    "rating": [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3]
}

df = pd.DataFrame(rating_dict)
reader = Reader(rating_scale=(1, 5))

# Load the dataframe
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
# builtin Movielens-100k data
movielens = Dataset.load_builtin('ml-100k')

# Configure KNNWithMeans

from surprise import KNNWithMeans
sim_options = {"name": "cosine", "user_based": False}

algo = KNNWithMeans(sim_options=sim_options)

# Predict user E's rating of movie 2
trainSet = data.build_full_trainset()
algo.fit(trainSet)

prediction = algo.predict('E', 2)
print(prediction.est)
Example #27
def main(args=None):
    _file, noise, epochs, pca_com = process_args(args)
    data = Dataset.load_builtin('ml-1m')
    rc = RatingCollection(data.raw_ratings)

    print("Constructing network...")

    d_losses = []
    g_losses = []
    print("Starting run...")
    distances = []
    for i in range(len(rc.folds)):
        print("Fold {}...".format(i + 1))
        training_data = {}
        for idx, value in enumerate(rc.folds):
            #if idx != i:
            training_data = {**training_data, **rc._get_matrix(value)}
        print('Calculating principal components...')
        pca = PCA(pca_com)
        pca.fit(get_sample(training_data, len(training_data.keys())))
        dis_arch = [MOVIES_COUNT, 300, 1]
        gen_arch = [noise, 300, MOVIES_COUNT]
        tf.reset_default_graph()
        network = gan(dis_arch, gen_arch, pca_com, 50)
        session = tf.Session()
        session.run(tf.global_variables_initializer())
        for it in range(epochs):
            users = get_sample(training_data, 50)
            _sample = sample_Z(50, noise)
            users_p = get_perturbed_batch(users)
            users_pca = pca.transform(users)
            # _, D_loss_curr = session.run([network.discriminator_optimizer, network.discriminator_loss],
            # feed_dict={network.discriminator_input: users, network.generator_input: _sample,
            # network.generator_condition: users_pca, network.pert: users_p, network.keep_prob: 0.5})
            _, D_loss_curr = session.run(
                [network.discriminator_optimizer, network.discriminator_loss],
                feed_dict={
                    network.discriminator_input: users,
                    network.generator_input: _sample,
                    network.generator_condition: users_pca,
                    network.keep_prob: 0.5
                })
            _, G_loss_curr = session.run(
                [network.generator_optimizer, network.generator_loss],
                feed_dict={
                    network.generator_input: _sample,
                    network.generator_condition: users_pca,
                    network.keep_prob: 0.5
                })

            if it % 100 == 0:
                d_losses.append(D_loss_curr)
                g_losses.append(G_loss_curr)
                print('Iteration {} of {} ---------------'.format(it, epochs))
                print('D loss: {:.4}'.format(D_loss_curr))
                print('G_loss: {:.4}'.format(G_loss_curr))

                # Get the classification distances
                test_fold = rc._get_matrix(rc.folds[i])
                sample_size = len(test_fold)
                users = get_sample(test_fold, sample_size).astype(np.float32)
                _sample = sample_Z(sample_size, noise)
                users_pca = pca.transform(users)
                generated_images = session.run(network.generator.prob,
                                               feed_dict={
                                                   network.generator_input:
                                                   _sample,
                                                   network.generator_condition:
                                                   users_pca
                                               })

                feed_users = get_sample(test_fold,
                                        sample_size).astype(np.float32)
                feed_users = tf.convert_to_tensor(feed_users, dtype=tf.float32)
                generated_images = tf.convert_to_tensor(generated_images,
                                                        dtype=tf.float32)
                result = tf.contrib.gan.eval.frechet_classifier_distance_from_activations(
                    feed_users, generated_images)
                result = session.run(result)
                distances.append(result)

        write_output(d_losses, g_losses, distances, _file)
        break
Example #28

def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


movie_train = Dataset.load_builtin('ml-100k')
print(movie_train.raw_ratings)

knn_estimator = KNNBasic
knn_grid = {
    'k': [10, 20],
    'sim_options': {
        'name': ['cosine', 'msd'],
        'min_support': [1, 5],
        'user_based': [True, False]
    }
}
gs = model_selection.GridSearchCV(knn_estimator,
                                  knn_grid,
                                  measures=['rmse'],
                                  cv=3)
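The excerpt stops after constructing the search; a hedged continuation using only the names defined above could fit the search and feed the best configuration into get_top_n (the refit-and-recommend step is an assumption, not part of the original snippet):

gs.fit(movie_train)
print(gs.best_score['rmse'], gs.best_params['rmse'])

# Refit the best configuration on the full trainset and build top-10 lists.
best_knn = gs.best_estimator['rmse']
full_trainset = movie_train.build_full_trainset()
best_knn.fit(full_trainset)
predictions = best_knn.test(full_trainset.build_anti_testset())
top_n = get_top_n(predictions, n=10)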
Example #29
File: cf.py Project: cnzmeca/tufts
import random
import pandas as pd

from surprise.prediction_algorithms.knns import KNNBasic as KNN
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin('jester')

raw_ratings = data.raw_ratings

random.shuffle(raw_ratings)

threshold = int(.9 * len(raw_ratings))

train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = train_raw_ratings

print('Grid Search...')
# Surprise's GridSearchCV takes nested options as a sub-dict, e.g.:
# param_grid = {'k': [1, 10, 100, 1000, 10000, 100000, 1000000], 'sim_options': {'name': ['MSD', 'cosine', 'pearson'], 'user_based': [True, False]}}
param_grid = {
    'k': [100],
    'sim_options': {'name': ['MSD'], 'user_based': [True]},
}
grid_search = GridSearchCV(KNN, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)
Example #30
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 28 10:07:52 2020

@author: lucas
"""

import pandas as pd
from surprise import Dataset
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import SVD

data = Dataset.load_builtin("ml-100k")  #Dataset.load_from_df()

#%% memory-based approach

sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)
#
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])
Example #31
"""
This module describes how to use the GridSearchCV() class for finding the best
parameter combination of a given algorithm.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Use movielens-100K
data = Dataset.load_builtin('ml-100k')

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())
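The refit estimator can then score individual user/item pairs; a minimal sketch (the raw ids '196' and '302' are sample ml-100k ids, not part of the original example):

# Raw ids in the built-in ml-100k dataset are strings.
pred = algo.predict(uid='196', iid='302', verbose=True)
print(pred.est)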
Example #32
from surprise import Dataset
from surprise.dataset import DatasetAutoFolds


def load_ratings_from_surprise(name: str) -> DatasetAutoFolds:
    ratings = Dataset.load_builtin(name)
    return ratings