Ejemplo n.º 1
0
def LoadMovieLensData():
    """Load the MovieLens ratings together with the movie popularity ranks.

    Returns:
        tuple: ``(ml, data, rankings)`` -- the ``MovieLens`` helper, the
        value returned by ``loadMovieLensLatestSmall()`` and the value
        returned by ``getPopularityRanks()``.
    """
    lens = MovieLens()

    print("Loading movie ratings...")
    ratings = lens.loadMovieLensLatestSmall()

    # Ranks are computed up front; per the message they feed a novelty
    # measure later on.
    print(
        "\nComputing movie popularity ranks so we can measure novelty later..."
    )
    popularity = lens.getPopularityRanks()

    return lens, ratings, popularity


# Seed both NumPy's and the stdlib's random generators so repeated runs of
# this script draw the same random numbers.
np.random.seed(0)
random.seed(0)

# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()

# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)

# Register the content-based KNN recommender under the label "ContentKNN".
contentKNN = ContentKNNAlgorithm()
evaluator.AddAlgorithm(contentKNN, "ContentKNN")

# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")

# NOTE(review): the meaning of the ``False`` flag is defined by
# Evaluator.Evaluate, which is not visible here -- presumably it switches
# off the top-N metrics; confirm against the Evaluator class.
evaluator.Evaluate(False)

# Show sample recommendations from each registered algorithm.
evaluator.SampleTopNRecs(ml)
        sum(rec for rec in recalls_combined.values()) / len(recalls_combined))

# Print the mean of every per-run precision/recall list, one heading line
# followed by one value line, in the order SVD, KNN, combined.
averaged_metrics = (
    ("averaged_precision for SVD algorithm:", averaged_precision_SVD),
    ("averaged_recall for SVD algorithm:", averaged_recall_SVD),
    ("averaged_precision for KNN algorithm:", averaged_precision_KNN),
    ("averaged_recall for KNN algorithm:", averaged_recall_KNN),
    ("averaged_precision for combined algorithm:",
     averaged_precision_combined),
    ("averaged_recall for combined algorithm:", averaged_recall_combined),
)
for heading, collected_values in averaged_metrics:
    print(heading)
    print(sum(collected_values) / len(collected_values))

# We compare our results with a random predictor
algo_random = NormalPredictor()
prediction_random = algo_random.fit(trainset).test(testset)

# Compare RMSE and MAE of the different algorithms: every RMSE first, then
# every MAE, each over SVD, KNN, combined and random predictions.
all_predictions = (
    predictions_SVD,
    predictions_KNN,
    predictions_combined,
    prediction_random,
)
for error_measure in (rmse, mae):
    for prediction_batch in all_predictions:
        error_measure(prediction_batch)
from surprise import accuracy
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import math
# Comparison script: compare the Random, KNN and SVD predictors.
# Training and prediction (the rating column is not used for predicting)
# both run on the full training set.

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# Surprise reads ``user_based`` only from ``sim_options``. Passed as a bare
# keyword it is swallowed by ``**kwargs`` and ignored, which silently left
# this model user-based; the item-based setting is now actually applied.
algo = KNNBasic(k=17, sim_options={'user_based': False})
algo.fit(trainset)
algo1 = SVD()
algo1.fit(trainset)
algo2 = NormalPredictor()
algo2.fit(trainset)

# NOTE(review): ``random`` rebinds the name of the stdlib module for the
# rest of the script; kept because later code may rely on this list.
random = []
knn = []
svd = []

with open("ml-100k/u.data", encoding='ISO-8859-1') as f:  #<-- use full set
    info = f.readlines()
    infoList = tqdm(info)  # progress bar over the raw lines
    for message in infoList:
        # Lines are tab-separated; only the first three fields are kept and
        # the first two are used as the raw user and item ids.
        message = message.split("\t")
        message = message[0:3]
        u = message[0]
        m = message[1]
        knnPred = algo.predict(str(u), str(m))
Ejemplo n.º 4
0
# The 'Duration' column is used as the rating, on a 0..489 scale.
reader = Reader(rating_scale=(0, 489))

# load data as Dataset for surprise library
rating_columns = ['Customer', 'Product', 'Duration']
data = Dataset.load_from_df(orig_data[rating_columns], reader)

# In[10]:

benchmark = []
# Every algorithm is instantiated up front, then cross-validated in turn.
candidate_algorithms = [
    SVD(),
    SVDpp(),
    SlopeOne(),
    NMF(),
    NormalPredictor(),
    KNNBaseline(),
    KNNBasic(),
    KNNWithMeans(),
    KNNWithZScore(),
    BaselineOnly(),
    CoClustering(),
]
for algorithm in candidate_algorithms:
    # 3-fold cross validation, RMSE only, without per-fold output
    print('Executing' + str(algorithm))
    results = cross_validate(
        algorithm, data, measures=['RMSE'], cv=3, verbose=False)
Ejemplo n.º 5
0
    def train_surprise(self,
                       model_type,
                       trainset,
                       testset,
                       k_recommend,
                       sql_db,
                       k_fold,
                       knowledge,
                       model_name,
                       result_name,
                       system_eval=False):
        """Train one Surprise model, store its predictions and evaluate it.

        Hyper-parameters are read from ``self.config``. When the model's
        ``*_params`` SQL table holds a row for ``knowledge``, the tuned
        values in that row replace the configured ones. The
        ``*_grid_search`` / ``*_grid_metric`` settings are not read here
        because training does not use them.

        Args:
            model_type: One of "svd", "SVDpp", "NMF", "NormalPredictor",
                "BaselineOnly", "KNNBasic", "KNNWithMeans",
                "KNNWithZScore", "KNNBaseline", "SlopeOne" or
                "CoClustering" (case-sensitive).
            trainset: Passed to ``algo.fit``.
            testset: Passed to ``algo.test``.
            k_recommend: Forwarded to the evaluation step.
            sql_db: Handle given to ``pd.read_sql_query`` and forwarded to
                the evaluation step.
            k_fold: Fold identifier; names the predictions sub-directory.
            knowledge: Value matched against the ``knowledge`` column of
                the tuned-parameter tables; forwarded to the evaluation.
            model_name: String used for the model directory, the log
                messages and the timing event names.
            result_name: Forwarded to the evaluation step.
            system_eval: When it equals ``False`` (default) the predictions
                are evaluated; otherwise evaluation is skipped.

        Returns:
            dict: ``{"status": bool, "result": ...}``. On failure
            ``result`` is an error message or the ``OSError`` raised while
            creating a directory.
        """
        knn_user_based = self.config['SURPRISE_KNN'].getboolean(
            'knn_user_based')
        knn_similarity = self.config['SURPRISE_KNN']['knn_similarity']
        sim_options = {'name': knn_similarity, 'user_based': knn_user_based}
        verbose_switch = self.config['DEFAULT'].getboolean('verbose_switch')

        # Select and configure the model. Algorithm classes are referenced
        # only inside the branch that needs them.
        if model_type == "svd":
            cfg = self.config['SURPRISE_SVD']
            params = {
                'n_factors': int(cfg['svd_n_factors']),
                'n_epochs': int(cfg['svd_n_epochs']),
                'biased': cfg.getboolean('svd_biased'),
                'init_mean': float(cfg['svd_init_mean']),
                'init_std_dev': float(cfg['svd_init_std_dev']),
                'lr_all': float(cfg['svd_lr_all']),
                'reg_all': float(cfg['svd_reg_all']),
            }
            params.update(
                self._surprise_tuned_overrides(
                    'svd_params', 'svd', knowledge, sql_db, 'svd_', {
                        'n_factors': int,
                        'n_epochs': int,
                        'init_std_dev': float,
                        'lr_all': float,
                        'reg_all': float,
                    }))
            algo = SVD(verbose=verbose_switch, **params)

        elif model_type == "SVDpp":
            cfg = self.config['SURPRISE_SVDPP']
            params = {
                'n_factors': int(cfg['svdpp_n_factors']),
                'n_epochs': int(cfg['svdpp_n_epochs']),
                'init_mean': float(cfg['svdpp_init_mean']),
                'init_std_dev': float(cfg['svdpp_init_std_dev']),
                'lr_all': float(cfg['svdpp_lr_all']),
                'reg_all': float(cfg['svdpp_reg_all']),
            }
            params.update(
                self._surprise_tuned_overrides(
                    'svdpp_params', 'svdpp', knowledge, sql_db, 'svdpp_', {
                        'n_factors': int,
                        'n_epochs': int,
                        'init_std_dev': float,
                        'lr_all': float,
                        'reg_all': float,
                    }))
            algo = SVDpp(verbose=verbose_switch, **params)

        elif model_type == "NMF":
            cfg = self.config['SURPRISE_NMF']
            params = {
                'n_factors': int(cfg['nmf_n_factors']),
                'n_epochs': int(cfg['nmf_n_epochs']),
                'biased': cfg.getboolean('nmf_biased'),
                'reg_pu': float(cfg['nmf_reg_pu']),
                'reg_qi': float(cfg['nmf_reg_qi']),
                'reg_bu': float(cfg['nmf_reg_bu']),
                'reg_bi': float(cfg['nmf_reg_bi']),
                'lr_bu': float(cfg['nmf_lr_bu']),
                'lr_bi': float(cfg['nmf_lr_bi']),
                'init_low': float(cfg['nmf_init_low']),
                # The configured upper bound is parsed as an integer.
                'init_high': int(cfg['nmf_init_high']),
            }
            params.update(
                self._surprise_tuned_overrides(
                    'nmf_params', 'nmf', knowledge, sql_db, 'nmf_', {
                        'n_factors': int,
                        'n_epochs': int,
                        'reg_pu': float,
                        'reg_qi': float,
                        'init_low': float,
                    }))
            algo = NMF(verbose=verbose_switch, **params)

        elif model_type == "NormalPredictor":
            algo = NormalPredictor()

        elif model_type == "BaselineOnly":
            algo = BaselineOnly(verbose=verbose_switch)

        elif model_type == "KNNBasic":
            algo = self._surprise_build_knn(KNNBasic, 'knnbasic', knowledge,
                                            sql_db, sim_options,
                                            verbose_switch)

        elif model_type == "KNNWithMeans":
            algo = self._surprise_build_knn(KNNWithMeans, 'knnwithmeans',
                                            knowledge, sql_db, sim_options,
                                            verbose_switch)

        elif model_type == "KNNWithZScore":
            algo = self._surprise_build_knn(KNNWithZScore, 'knnwithzscore',
                                            knowledge, sql_db, sim_options,
                                            verbose_switch)

        elif model_type == "KNNBaseline":
            algo = self._surprise_build_knn(KNNBaseline, 'knnbaseline',
                                            knowledge, sql_db, sim_options,
                                            verbose_switch)

        elif model_type == "SlopeOne":
            algo = SlopeOne()

        elif model_type == "CoClustering":
            cfg = self.config['SURPRISE_COCLUSTERING']
            params = {
                'n_cltr_u': int(cfg['cc_n_cltr_u']),
                'n_cltr_i': int(cfg['cc_n_cltr_i']),
                'n_epochs': int(cfg['cc_n_epochs']),
            }
            params.update(
                self._surprise_tuned_overrides(
                    'coclustering_params', 'coclustering', knowledge,
                    sql_db, 'cc_', {
                        'n_cltr_u': int,
                        'n_cltr_i': int,
                        'n_epochs': int,
                    }))
            algo = CoClustering(verbose=verbose_switch, **params)

        else:
            return {
                "status": False,
                "result": "Defined model_type does not exist"
            }

        # Train the model and record how long training took.
        st = default_timer()
        print("STARTING to train model: " + str(model_name))
        algo.fit(trainset)
        train_model_runtime = default_timer() - st
        # Store the process time in the database
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_training",
            description="Time for model to be trained on dataset")

        # Make sure the model directory exists (the model file itself is
        # currently not dumped).
        model_dir = self.models_path + model_name
        mkdir_error = self._surprise_ensure_dir(model_dir)
        if mkdir_error is not None:
            return {"status": False, "result": mkdir_error}

        # Generate predictions for the test set.
        st = default_timer()
        print("STARTING to generate predictions with the trained model: " +
              str(model_name))
        predictions = algo.test(testset)
        runtime = default_timer() - st

        print(
            "Tiempo de ejecucion total de la generacion de predicciones para Surprise Time:",
            round(runtime, 2))
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_generate_recommendations",
            description="Time for predictions to be generated using the model")

        # Persist the predictions so they can be reused for hybridisation.
        predictions_dir = model_dir + "/predictions/" + str(k_fold)
        mkdir_error = self._surprise_ensure_dir(predictions_dir)
        if mkdir_error is not None:
            return {"status": False, "result": mkdir_error}

        # Build the frame in one step: appending row by row is quadratic
        # and ``DataFrame.append`` no longer exists in pandas 2.
        prediction_rows = [(uid, iid, true_r, est)
                           for uid, iid, true_r, est, _ in predictions]
        eval_result = pd.DataFrame(
            prediction_rows, columns=['user_id', 'item_id', 'r_ui', 'est'])
        # The separator must be text; a bytes delimiter is rejected by the
        # CSV writer on Python 3.
        eval_result.to_csv(path_or_buf=predictions_dir + "/predictions.csv",
                           encoding='latin1',
                           sep=';',
                           index=False)

        # ---------------------------

        # Kept as an equality test so the accepted values do not change:
        # 0 also selects evaluation, None does not.
        if system_eval == False:
            # Process and evaluate the recommendations of the model
            st = default_timer()
            print("STARTING to evaluate recommendations with model: " +
                  str(model_name))
            process_evaluate_result = self.evaluation.surprise_process_evaluate(
                predictions,
                knowledge,
                model_name,
                result_name,
                train_model_runtime,
                k_recommend,
                sql_db,
                k_fold,
                is_surprise=True)
            # Store the process time in the database
            self.common_functions.save_process_time(
                st,
                event=str(model_name) + "_evaluate_model",
                description="Time for model to be evaluated in test dataset")
            if process_evaluate_result["status"] == True:
                return {"status": True, "result": ""}
            return {
                "status": False,
                "result":
                "no se pudo ejecutar correctamente content_explicit"
            }

        print("decide what to do")
        return {"status": True, "result": ""}

    def _surprise_tuned_overrides(self, table_name, algorithm, knowledge,
                                  sql_db, prefix, casts):
        """Return tuned constructor arguments stored in ``table_name``.

        ``casts`` maps a constructor argument name to the conversion applied
        to column ``prefix + name`` of the first row whose ``knowledge`` and
        ``algorithm`` columns match. Returns an empty dict when the table is
        reported unavailable or holds no matching row.
        """
        # ``== True`` is deliberate: the return type of the availability
        # check is not visible here, so plain truthiness could differ.
        table_available = self.common_functions.validate_available_sql_data(
            table_name, sql_db) == True
        if not table_available:
            return {}
        stored = pd.read_sql_query('select * from ' + table_name + ';',
                                   sql_db,
                                   index_col='index')
        matching = stored[(stored["knowledge"] == knowledge)
                          & (stored["algorithm"] == algorithm)]
        if matching.empty:
            return {}
        first_row = matching.iloc[0]
        return {
            name: cast(first_row[prefix + name])
            for name, cast in casts.items()
        }

    def _surprise_build_knn(self, knn_class, algorithm, knowledge, sql_db,
                            sim_options, verbose_switch):
        """Instantiate ``knn_class`` from the shared SURPRISE_KNN settings."""
        cfg = self.config['SURPRISE_KNN']
        params = {'k': int(cfg['knn_k']), 'min_k': int(cfg['knn_min_k'])}
        params.update(
            self._surprise_tuned_overrides(algorithm + '_params', algorithm,
                                           knowledge, sql_db, 'knn_', {
                                               'k': int,
                                               'min_k': int,
                                           }))
        return knn_class(sim_options=sim_options,
                         verbose=verbose_switch,
                         **params)

    def _surprise_ensure_dir(self, path):
        """Create ``path`` if missing; return the OSError on failure, else None."""
        if os.path.isdir(path):
            return None
        try:
            os.makedirs(path)
        except OSError as e:
            # An "already exists" error is tolerated, as before.
            if e.errno != errno.EEXIST:
                return e
        return None
Ejemplo n.º 6
0
"""
This module describes how to load a dataset from a pandas dataframe.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader


# Dummy algo
algo = NormalPredictor()

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# ``Dataset.split()`` / ``Dataset.folds()`` belong to the legacy fold API
# that newer Surprise releases no longer provide; the folds now come from a
# ``model_selection`` iterator instead.
from surprise.model_selection import KFold

kf = KFold(n_splits=2)  # two folds, as before

for trainset, testset in kf.split(data):
    algo.fit(trainset)
Ejemplo n.º 7
0
# * reader for the imported CSV: comma-separated, one header line skipped,
# * ratings on a 1-5 scale
reader = Reader(
    rating_scale=(1, 5),
    sep=",",
    skip_lines=1,
    line_format="user item rating timestamp",
)
# * loading the csv
ratings_csv = "../../ML_Dataset/ml-latest-small/ratings.csv"
data = Dataset.load_from_file(file_path=ratings_csv, reader=reader)
# * dividing in train and test sets (both names are rebound by the loop below)
trainset, testset = train_test_split(data, test_size=0.25)

# * define a cross-validation iterator
kf = KFold(n_splits=5)

# * Choosing Normal Predictor as algorithm
algo = NormalPredictor()

# * For every fold: train, predict the held-out ratings and report metrics
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    for report_metric in (accuracy.rmse, accuracy.mae, accuracy.mse,
                          accuracy.fcp):
        report_metric(predictions)
    print("Precision: ", sum(precisions.values()) / len(precisions))
    print("Recall: ", sum(recalls.values()) / len(recalls))

# * Only the predictions of the last fold are exported, with their absolute
# * error in an extra column.
prediction_columns = ["uid", "iid", "rui", "est", "details"]
df = pd.DataFrame(predictions, columns=prediction_columns)
df["err"] = (df["est"] - df["rui"]).abs()
df.to_csv("predictions_Normal.csv")
Ejemplo n.º 8
0
# R is indexed as R[row][column]: N rows and M columns.
N = len(R)
M = len(R[0])

# Factorise R with K latent features, starting from random N x K and M x K
# matrices; nR is the reconstructed N x M matrix.
K = 5
P = numpy.random.rand(N, K)
Q = numpy.random.rand(M, K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# Surprise reads ``user_based`` only from ``sim_options``. Passed as a bare
# keyword it is swallowed by ``**kwargs`` and ignored, which silently left
# this model user-based; the item-based setting is now actually applied.
algo = KNNBasic(k=17, sim_options={'user_based': False})
algo.fit(trainset)
algo1 = SVD()
algo1.fit(trainset)
algo2 = NormalPredictor()
algo2.fit(trainset)
mf = []
knn = []
svd = []
# NOTE(review): ``random`` rebinds the name of the stdlib module for the
# rest of the script; kept because later code may rely on this list.
random = []

with open("ml-100k/u.data", encoding='ISO-8859-1') as f:  #<-- use full set
    info = f.readlines()
    infoList = tqdm(info)  # progress bar over the raw lines
    for message in infoList:
        # Lines are tab-separated; only the first three fields are kept.
        message = message.split("\t")
        message = message[0:3]
        # The first two fields are 1-based ids, shifted to 0-based indices
        # into nR; ``int`` truncates the reconstructed value.
        mf.append(int(nR[int(message[0]) - 1][int(message[1]) - 1]))
        knnPred = algo.predict(message[0], message[1])
        svdPred = algo1.predict(message[0], message[1])
Ejemplo n.º 9
0
doTopN = True

# seed for reproducibility
np.random.seed(0)
random.seed(0)

# for expanded display in pandas: lift every display limit
for display_option in ('display.max_rows', 'display.max_columns',
                       'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# initialize the algorithms before comparison
SVD_Algorithm = SVD(random_state=10)
# SVDpp_Algorithm = SVDpp(random_state=10)
Normal_Predictor = NormalPredictor()

# creating the comparison set: (algorithm, label) pairs
algo_comparison_set = [
    (SVD_Algorithm, "SVD"),
    (Normal_Predictor, "Normal"),
]

# set data
recommenderData = RecommenderData(ratingsPath, moviesPath, verbose=True)

# set comparer
recommenderComparer = RecommenderComparer(recommenderData, algo_comparison_set)

# compare
compare_options = {
    'verbose': True,
    'sample_topN_for_userIDs': Test_userIDs,
}
comparison = recommenderComparer.Compare(doTopN, **compare_options)
# comparison["0000"] = {"sample_topn": }
Ejemplo n.º 10
0
# Load the sixth ratings file with the reader configured earlier.
data6 = Dataset.load_from_file(file_path6, reader=reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset6, testset6 = train_test_split(data6, test_size=.25)

# Choose the algo to use to compute RMSE
# NOTE: every assignment below rebinds ``algo``, so only the last one
# (NormalPredictor) is actually used; the earlier instances are discarded.
algo = SVD()
algo = BaselineOnly()
algo = KNNBasic()
algo = SlopeOne()
algo = CoClustering()
algo = SVDpp()
algo = NMF()
algo = NormalPredictor()

# Train the algorithm on the trainset, and predict ratings for the testset
# The timed span covers fitting, predicting and the RMSE computation.
start = time.time()
algo.fit(trainset6)
predictions = algo.test(testset6)
accuracy.rmse(predictions)
end = time.time()

elapsed = end - start
print(elapsed)

# Then compute RMSE
accuracy.rmse(predictions)
# The model is fitted a second time here and ``predictions`` is replaced.
predictions = algo.fit(trainset6).test(testset6)
# Bare expression: shows the value only in an interactive/notebook session.
predictions
# Figure 2: number of ratings per movie, one blue dot per entry.
f2 = plt.figure(2)
plt.plot(ratings_per_movie, 'o', color = 'blue')
plt.ylabel('Number of ratings per movie')
plt.xlabel('Item ID')

# Figure 3: number of ratings per user, one red dot per entry.
f3 = plt.figure(3)
plt.plot(ratings_per_user, 'o', color = 'red')
plt.ylabel('Number of ratings per user')
plt.xlabel('User ID')

plt.show()

# Model 1: Random
# Create model object
model_random = NormalPredictor()
print('Model creation successful!')

# Train on data using cross-validation with k=5 folds, measuring the RMSE
model_random_results = cross_validate(model_random, data, measures=['RMSE'], cv=5, verbose=True)
print('Model training successful!')

# Model 2: User-Based Collaborative Filtering
# Create model object
model_user = KNNBasic(sim_options={'user_based': True})
print('Model creation successful!')

# Train on data using cross-validation with k=5 folds, measuring the RMSE
# Note, this may have a lot of print output
# You can set verbose=False to prevent this from happening
model_user_results = cross_validate(model_user, data, measures=['RMSE'], cv=5, verbose=True)
Ejemplo n.º 12
0
# After running testFactor to derive the appropriate factor, run surpriseLab
# to obtain the 5-fold cross-validation results for the Random, KNN and SVD
# predictors. The parameters are set according to testFactor.

dataDir = ("ml-100k/")
reader = Reader('ml-100k')
train_file = dataDir + 'u%d.base'
test_file = dataDir + 'u%d.test'
# One (train, test) file pair per predefined fold: u1 .. u5.
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo3 = SVD()
# Surprise reads ``user_based`` only from ``sim_options``. Passed as a bare
# keyword it is swallowed by ``**kwargs`` and ignored, which silently left
# this model user-based; the item-based setting is now actually applied.
algo2 = KNNBasic(k=17, sim_options={'user_based': False})
algo1 = NormalPredictor()
i = 0  # 1-based fold counter used in the progress output
for trainset, testset in pkf.split(data):
    i += 1
    print("Random", i)

    # train and test algorithm.
    algo1.fit(trainset)
    predictions = algo1.test(testset)

    # Compute and print Root Mean Squared Error and Mean Absolute Error
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

    print("KNN", i)
Ejemplo n.º 13
0
'''

#algo = BaselineOnly(bsl_options=bsl_options)

#algo = BaselineOnly()
'''
Estimating biases using als...
RMSE: 0.8657
Estimating biases using als...
RMSE: 0.8662
Estimating biases using als...
RMSE: 0.8659
user: 196        item: 302        r_ui = 4.00   est = 4.19   {'was_impossible': False}
'''

# Select the random baseline predictor. The string that follows appears to
# be output recorded from an earlier run -- it has no effect at runtime.
algo = NormalPredictor()
'''
RMSE: 1.4326
RMSE: 1.4333
RMSE: 1.4316
user: 196        item: 302        r_ui = 4.00   est = 4.84   {'was_impossible': False}
'''

# Define the K-fold cross-validation iterator, K=3
'''
交叉验证(Cross Validation)为CV。
基本思想:将原始数据进行分组,一部分作为训练集,另一部分作为测试集,首先用训练集对分类器进行训练,再利用验证集来测试训练
得到的模型,以此作为评价分类器的性能指标。
Kfold:
原始数据分成K组(一般是均分),将每个子集数据分别做一次验证集,其余的k-1组子集数据作为训练集,
这样会得到k个模型,用这k个模型最终的验证集的分类准确率的平均数作为此K-CV下分类器的性能指标。
Ejemplo n.º 14
0

def load_movielens():
    """Load the MovieLens ratings together with the movie popularity ranks.

    Returns:
        tuple: ``(ml, data, rankings)`` -- the ``MovieLens`` helper, the
        value returned by ``load()`` and the value returned by
        ``get_popularity_ranks()``.
    """
    movielens = MovieLens()

    print("Loading movie ratings...")
    ratings = movielens.load()

    # Ranks are computed up front; per the message they feed a novelty
    # measure later on.
    print(
        "\nComputing movie popularity ranks so we can measure novelty later..."
    )
    popularity_ranks = movielens.get_popularity_ranks()

    return movielens, ratings, popularity_ranks


if __name__ == '__main__':
    # Shared data set and evaluator for all recommender algorithms
    ml, data, rankings = load_movielens()
    evaluator = Evaluator(data, rankings)

    # Cosine-similarity KNN, user-based flavour
    user_knn = KNNBasic(sim_options=dict(name='cosine', user_based=True))
    evaluator.add_algorithm(user_knn, "User KNN")

    # Cosine-similarity KNN, item-based flavour
    item_knn = KNNBasic(sim_options=dict(name='cosine', user_based=False))
    evaluator.add_algorithm(item_knn, "Item KNN")

    # Random recommendations as the baseline
    random_baseline = NormalPredictor()
    evaluator.add_algorithm(random_baseline, "Random")

    evaluator.evaluate(False)
    evaluator.sample_topn_recs(ml)
Ejemplo n.º 15
0
from surprise import NormalPredictor, BaselineOnly, accuracy, KNNBasic, KNNWithMeans, KNNBaseline, SVD
from surprise.model_selection import train_test_split, GridSearchCV, KFold
import random
import pandas as pd
import numpy as np
# Fixed seeds so the split and the random predictor are repeatable
np.random.seed(0)
random.seed(0)
pd.set_option('display.max_columns', 500)

data, items, ratings = GetBookData(density_filter=True)
trainset, testset = train_test_split(data, test_size=0.2)
results = {}
top_n = {}

### Normal Predictor
norm = NormalPredictor()
norm_pred = norm.fit(trainset).test(testset)
rmse = accuracy.rmse(norm_pred)
precisions, recalls = precision_recall_at_k(norm_pred, k=10, threshold=4.5)
# Mean of the values in each returned mapping
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)
metrics = dict(
    rmse=rmse,
    avg_precision=avg_precision,
    avg_recall=avg_recall,
)
results['NormalPredictor'] = metrics

top_n['NormalPredictor'] = get_top_n(norm_pred, n=10)