def main():
    f = open("Python/user_rated_movies.tsv", "r")
    user_ratings = []
    for line in f:
        fields = line.rstrip('\n').split('\t')
        # (user_id, item_id, rating, timestamp=None) matches Surprise's raw-ratings format
        user_ratings.append((fields[0], fields[1], float(fields[2]), None))
    f.close()

    # data = Dataset.load_builtin(name=u'ml-1m')
    reader = Reader(rating_scale=(1, 5))  # load_from_df only uses the rating scale; line_format/sep apply to load_from_file
    datain = pd.read_csv("ratings.tsv", sep="\t")
    data = Dataset.load_from_df(datain, reader=reader)
    for i in user_ratings:
        data.raw_ratings.append(i)

    movies = pd.read_csv("movies.tsv", sep="\t", header=None, low_memory=False)

    algo = NMF(n_factors=4, n_epochs=100, random_state=1)
    trainSet = data.build_full_trainset()
    algo.fit(trainSet)

    predictions = []
    # predict a rating for the first user against every movie (column 1 of movies.tsv is used as the item id)
    for index, row in movies.iterrows():
        pred = algo.predict(user_ratings[0][0], row[1], r_ui=4)
        predictions.append(pred)

    sortpred = sorted(predictions, key=lambda pred: pred.est)
    sortpred = sortpred[-10:]

    for i in sortpred:
        print(i.iid)
def do_nmf(data_raw, impute_params):
    data = data_raw.pivot(index="User", columns="Movie",
                          values="Prediction").to_numpy()
    reader = surprise.Reader(rating_scale=(1, 5))
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()

    algo = NMF(n_factors=impute_params["FACTORS"],
               n_epochs=impute_params["EPOCHS"],
               verbose=True)
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    predictions = pd.DataFrame(predictions)

    predictions.rename(columns={
        "uid": "User",
        "iid": "Movie",
        "est": "Prediction"
    },
                       inplace=True)
    predictions = predictions[["User", "Movie", "Prediction"]]

    data = pd.concat([data_raw, predictions], ignore_index=True)
    data = data.pivot(index="User", columns="Movie",
                      values="Prediction").to_numpy()
    return data
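A minimal sketch of how do_nmf might be invoked, assuming a long-format ratings DataFrame with the User/Movie/Prediction columns used above; the toy values and the impute_params entries below are purely illustrative:

import pandas as pd

ratings_long = pd.DataFrame({
    "User": [1, 1, 2, 2, 3],
    "Movie": [10, 20, 10, 30, 20],
    "Prediction": [4.0, 3.5, 5.0, 2.0, 3.0],  # observed ratings on the 1-5 scale
})
impute_params = {"FACTORS": 4, "EPOCHS": 50}  # illustrative hyper-parameters
dense = do_nmf(ratings_long, impute_params)   # numpy array: users x movies, with imputed ratings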
Example #3
    def recommender_nmf_baseline(self, train_file, test_file, output):

        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)
        # Use user_based true/false to switch between user-based or item-based collaborative filtering
        algo_nmf_baseline = NMF()

        algo_nmf_baseline.fit(train)

        #not_seen_elems = self.merge_train_set(train_dataset, test_dataset)

        #predictions_precision_svd = algo_svd.test(not_seen_elems, test, verbose=False, not_seen_flag=True)
        predictions_nmf_baseline = algo_nmf_baseline.test(test, verbose=False)

        #precisions, recalls = self.precision_recall_at_k(predictions_precision_svd, 10, threshold=0.0)
        # Precision and recall can then be averaged over all users
        #precision_avg = sum(prec for prec in precisions.values()) / len(precisions)
        #recall_avg = sum(rec for rec in recalls.values()) / len(recalls)
        #print('Precision: ' + str(precision_avg) + ' Recall: ' + str(recall_avg) + ' RMSE: ' + str(
        #    rmse(predictions_svd, verbose=False)) + ' MAE: ' + str(mae(predictions_svd, verbose=False)))
        print('NMF BASELINE: ' + ' RMSE ' +
              str(rmse(predictions_nmf_baseline, verbose=False)) + ' MAE ' +
              str(mae(predictions_nmf_baseline, verbose=False)))

        return algo_nmf_baseline
Example #4
def recommendation_mf(userArray, numUsers, movieIds):

	ratings_dict = {'itemID': list(df_ratings.movie_id_ml) + list(numUsers*movieIds),
					'userID': list(df_ratings.user_id) + [max(df_ratings.user_id)+1+x for x in range(numUsers) for y in range(len(userArray[0]))],
					'rating': list(df_ratings.rating) + [item for sublist in userArray for item in sublist]
				}

	df = pd.DataFrame(ratings_dict)
	reader = Reader(rating_scale=(1, 5))
	data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
	trainset = data.build_full_trainset()

	nmf = NMF()
	nmf.fit(trainset)

	userIds = [trainset.to_inner_uid(max(df_ratings.user_id)+1+x) for x in range(numUsers)]

	mat = np.dot(nmf.pu, nmf.qi.T)

	scores = hmean(mat[userIds, :], axis=0)
	best_movies = scores.argsort()
	best_movies = best_movies[-9:][::-1]
	scores = scores[best_movies]
	movie_ind = [trainset.to_raw_iid(x) for x in best_movies]

	recommendation = list(zip(list(df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)].title), 
					list(df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)].poster_url), 
					list(scores)))

	return recommendation
Example #5
def predict_NMF(userid):
    df = pd.read_csv('ratings_small.csv').drop(['timestamp'], axis=1)
    reader = Reader(rating_scale=(1, 30))

    # Read the data into a Surprise Dataset using the reader
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']],
                                reader=reader)

    # Split into train and test sets: 75% of the samples for training, 25% for testing
    trainset, testset = train_test_split(data, test_size=.25)

    # Use NMF
    algo = NMF()
    algo.fit(trainset)
    pred_nmf = algo.test(testset)
    top_nmf_n = get_top_n(pred_nmf, n=5)

    movie_titles = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])
    movie_titles = movie_titles.rename(columns={'id': 'movieId'})
    movie_titles['movieId'] = pd.to_numeric(movie_titles['movieId'],
                                            errors='coerce').fillna(0)
    movie_titles['movieId'] = movie_titles['movieId'].astype('int')
    movie_titles = movie_titles.drop_duplicates()

    for uid, user_ratings in top_nmf_n.items():
        if (uid == userid):
            #print(uid, [iid for (iid, _) in user_ratings])
            title_list = [iid for (iid, _) in user_ratings]

    titles = movie_titles[movie_titles.movieId.isin(title_list)]
    print(titles[2:])
    return titles[2:]
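get_top_n is referenced here (and in later examples) but is not defined anywhere in this listing; a minimal sketch in the style of the usual Surprise top-N helper it presumably mirrors, mapping each user to the n items with the highest estimated rating:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # group estimated ratings by user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # keep only the n best-rated items per user
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n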
def user_factorization(data_raw, user_clusters, params):
    n_factors = params["LOCAL_U_NMF_K"]
    user_df = pd.DataFrame()
    for i in range(user_clusters):
        u_i = data_raw[data_raw["user cluster"] == i]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            u_i[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        algo = NMF(n_factors=n_factors,
                   n_epochs=params["LOCAL_U_NMF_EPOCHS"],
                   verbose=True)
        algo.fit(trainset)
        testset = trainset.build_testset()
        preds = algo.test(testset)
        predictions_train = pd.DataFrame(preds)
        testset = trainset.build_anti_testset()
        preds = algo.test(testset)
        predictions_rest = pd.DataFrame(preds)
        user_df = pd.concat([user_df, predictions_train, predictions_rest],
                            ignore_index=False,
                            copy=False)
    all_u_m = get_all_u_m()
    user_df = all_u_m.merge(user_df, how="left", on=["uid", "iid"])
    user_df = user_df[["uid", "iid", "est"]]
    logging.info("return from user_factorization")
    return user_df
def item_factorization(data_raw, item_clusters, user_df, params):
    n_factors = params["LOCAL_I_NMF_K"]
    item_df = pd.DataFrame()
    for i in range(item_clusters):
        i_i = data_raw[data_raw["item cluster"] == i]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            i_i[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        algo = NMF(n_factors=n_factors,
                   n_epochs=params["LOCAL_I_NMF_EPOCHS"],
                   verbose=True)
        algo.fit(trainset)
        #i_i.rename(columns={"User":"uid","Movie":"iid","Prediction":"est"},inplace=True)
        testset = trainset.build_testset()
        preds = algo.test(testset)
        predictions_train = pd.DataFrame(preds)
        testset = trainset.build_anti_testset()
        preds = algo.test(testset)
        predictions_rest = pd.DataFrame(preds)
        item_df = pd.concat([item_df, predictions_train, predictions_rest],
                            ignore_index=False,
                            copy=False)
    item_df = user_df[["uid", "iid"]].merge(item_df,
                                            how="left",
                                            on=["uid", "iid"])
    item_df["est"].loc[item_df["est"].isnull()] = 0
    logging.info("return from item_factorization")
    return item_df
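The params dicts read by do_nmf, user_factorization, item_factorization, and get_u_v (below) are never shown in this listing; a sketch of the keys they expect, with illustrative values only:

impute_params = {"FACTORS": 4, "EPOCHS": 100}           # do_nmf
params = {
    "LOCAL_U_NMF_K": 8, "LOCAL_U_NMF_EPOCHS": 50,       # user_factorization
    "LOCAL_I_NMF_K": 8, "LOCAL_I_NMF_EPOCHS": 50,       # item_factorization
    "GLOBAL_NMF_K": 16, "GLOBAL_NMF_EPOCHS": 100,       # get_u_v
}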
Example #8
def colaborative_filtering_based_model(path, config, engine, df_valid_games):
    with open(path, 'r') as f:
        raw_strings = f.readlines()

    total_count = len(raw_strings)
    current_count = 0

    user_ratings = []
    scaler = MinMaxScaler((1, 5))

    for raw_string in raw_strings:
        user_id, user_inventory = list(json.loads(raw_string).items())[0]
        if user_inventory is not None:
            app_ids = [item['appid'] for item in user_inventory]
            app_scores = [item['playtime_forever'] for item in user_inventory]
            app_scores = scaler.fit_transform(np.log1p(app_scores).reshape(-1, 1))
            
            user_ratings_temp = [[user_id, app_ids[i], app_scores[i].item()] for i in range(len(app_ids))]
            user_ratings += user_ratings_temp

        show_work_status(1,total_count,current_count)
        current_count+=1

    user_item_ratings = pd.DataFrame(user_ratings)
    user_item_ratings.columns = ['user_id', 'item_id', 'rating']

    # Prediction part
    game_ids_set = set(df_valid_games.steam_appid)
    grouped_user_item_ratings = user_item_ratings.groupby('user_id')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(user_item_ratings[['user_id', 'item_id', 'rating']], reader)

    alg = NMF(n_factors=20)
    alg.fit(data.build_full_trainset())

    total_count = len(user_item_ratings.user_id.unique())
    current_count = 0
    dict_user_recommendations = {}
    for user in user_item_ratings.user_id.unique().tolist():
        temp = grouped_user_item_ratings.get_group(user)
        not_purchased_ids = game_ids_set - set([str(x) for x in temp.item_id])
        
        user_test_temp = [[user, not_purchased_id, 0] for not_purchased_id in not_purchased_ids]
        user_test_temp = pd.DataFrame(user_test_temp)
        user_test_temp.columns = ['user_id', 'item_id', 'rating']
        
        data = Dataset.load_from_df(user_test_temp[['user_id', 'item_id', 'rating']], reader)
        user_test = data.build_full_trainset().build_testset()
        results = alg.test(user_test)
        dict_user_recommendations.update({user: pd.DataFrame(results).sort_values('est', ascending=False).iloc[:10, 1].values.tolist()})
        
        show_work_status(1,total_count,current_count)
        current_count+=1   

    df_cf_based_results = pd.DataFrame(dict_user_recommendations).T
    df_cf_based_results.index.name = 'user_id'
    df_cf_based_results.reset_index(inplace=True)
    df_cf_based_results.to_sql(config.mysql_user_like_table, engine, if_exists='replace')
Example #9
def trainingRatings(movies, users, ratings):
    ratings_dict = {"movies": movies, "users": users, "ratings": ratings}
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[["users", "movies", "ratings"]], reader)
    trainingSet = data.build_full_trainset()
    algo = NMF(n_factors=100, n_epochs=100, reg_pu=0.01)
    algo.fit(trainingSet)
    recommendMoviesForUsers(movies, users, algo)
def get_u_v(data_raw, params):
    reader = surprise.Reader(rating_scale=(1, 5))
    # The columns must correspond to user id, item id and ratings (in that order).
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()
    algo = NMF(n_factors=params["GLOBAL_NMF_K"],
               n_epochs=params["GLOBAL_NMF_EPOCHS"],
               verbose=False)
    algo.fit(trainset)

    U_red = algo.pu
    V_red = algo.qi
    logging.info("return from get_u_v")
    return (U_red, V_red)
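Because this NMF is fit without baselines, the full matrix of estimated ratings can be rebuilt from the two factor matrices returned above; a small sketch (rows and columns are indexed by Surprise inner ids, not by the raw User/Movie ids):

import numpy as np

U_red, V_red = get_u_v(data_raw, params)  # shapes: (n_users, k) and (n_items, k)
estimates = np.dot(U_red, V_red.T)        # (n_users, n_items) matrix of predicted ratings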
def nmf(train, test, ids, Xtest, Xids):
    """
    Non Negative Matrix Factorization
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('NMF')
    algo = NMF(n_factors=20,
               n_epochs=50,
               random_state=15,
               reg_pu=0.5,
               reg_qi=0.05)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Example #12
    def nmf(self, namefile, uid, iid, rati, value_uid, value_iid):
        test_data = pd.read_csv('./container/' + namefile)
        dt = pd.DataFrame(test_data)
        # Retrieve the trainset.
        reader = Reader(rating_scale=(0, 100))
        data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
        trainset = data.build_full_trainset()
        algo = NMF()
        algo.fit(trainset)
        pred = algo.predict(int(value_uid),
                            int(value_iid),
                            r_ui=1,
                            verbose=True)
        #var_rmse = accuracy.rmse(pred)
        #return result to json
        jsondata = {}
        jsondata["uid"] = pred.uid
        jsondata["idd"] = pred.iid
        jsondata["rati"] = round(pred.est, 2)
        return jsondata
Example #13
def trainFinalModels(ratingsTrainDataset, ratingsTest, bestParamsNMF,
                     bestParamsKNN):
    ratingsTrainTrainset = ratingsTrainDataset.build_full_trainset()

    modelNMF = NMF(**bestParamsNMF)
    modelNMF.fit(ratingsTrainTrainset)
    saveModel(modelNMF, 'NMF')

    predictions = modelNMF.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('NMF', rmseValue, maeValue)

    modelKNN = KNNWithMeans(**bestParamsKNN)
    modelKNN.fit(ratingsTrainTrainset)
    saveModel(modelKNN, 'KNN')

    predictions = modelKNN.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('KNN', rmseValue, maeValue)
Example #14
class NonNegative_MF(BaseSurpriseSTLEstimator):
    """
    Nonnegative Matrix Factorization
    
    Args:
        :attr:`n_factors` (int): 
            number of latent vectors/factors for matrix factorization
        :attr:`n_epochs` (int): 
            Integer, The number of iteration of the SGD procedure. Default is 20
    
    see https://surprise.readthedocs.io/en/stable/matrix_factorization.html for more info
    """
    def __init__(self, n_factors, n_epochs=50, name='NonNegative_MF'):
        super().__init__(name, 'non_feature_based')
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.model = NMF(n_factors=self.n_factors, n_epochs=self.n_epochs)

    def _fit(self, x):
        self.model.fit(x)

    def _predict(self, x):
        return self.model.test(x)

    def get_hyper_params(self):
        hparams = {
            'n_factors': {
                'type': 'integer',
                'values': [2, 150]
            },
            'n_epochs': {
                'type': 'integer',
                'values': [2, 150]
            }
        }
        return hparams

    def set_hyper_params(self, **kwargs):
        self.n_factors = kwargs['n_factors']
        self.n_epochs = kwargs['n_epochs']
Example #15
    def nmf_from_to(self, namefile, uid, iid, rati, from_uid, to_uid, from_iid,
                    to_iid):
        test_data = pd.read_csv('./container/' + namefile)
        dt = pd.DataFrame(test_data)
        # Retrieve the trainset.
        reader = Reader(rating_scale=(0, 100))
        data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
        trainset = data.build_full_trainset()
        algo = NMF()
        algo.fit(trainset)

        arr = []
        for value_uid in range(from_uid, to_uid):
            for value_iid in range(from_iid, to_iid):
                pred = algo.predict(value_uid, value_iid, r_ui=1, verbose=True)
                tempdata = []
                tempdata.append(pred.uid)
                tempdata.append(pred.iid)
                tempdata.append(round(pred.est, 2))
                arr.append(tempdata)
        #return result to json
        return arr
Example #16
def algoFunc(train_data, test_data):
    SVD_var = SVD()
    print("Singular Value Decomposition :\n")
    SVD_var.fit(train_data)
    predict_var = SVD_var.test(test_data)
    SVD_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    SVD_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nProbabilistic Matrix Factorization :\n")
    PMF_var = SVD(biased=False)
    PMF_var.fit(train_data)
    predict_var = PMF_var.test(test_data)
    PMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    PMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nNon-negative Matrix Factorization :\n")
    NMF_var = NMF()
    NMF_var.fit(train_data)
    predict_var = NMF_var.test(test_data)
    NMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    NMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nUser based Collaborative Filtering algorithm :\n")
    UB_var = KNNBasic(sim_options={'user_based': True})
    UB_var.fit(train_data)
    predict_var = UB_var.test(test_data)
    user_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    user_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nItem based Collaborative Filtering algorithm :\n")
    IB_var = KNNBasic(sim_options={'user_based': False})
    IB_var.fit(train_data)
    predict_var = IB_var.test(test_data)
    item_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    item_MAE_var = accuracy.mae(predict_var, verbose=True)
    print("\n")

    return SVD_RMSE_var, SVD_MAE_var, PMF_RMSE_var, PMF_MAE_var, NMF_RMSE_var, NMF_MAE_var, user_RMSE_var, user_MAE_var, item_RMSE_var, item_MAE_var
def train_algo(this_data):
    """
    Fit a Non-negative Matrix Factorization algo to the data.

    Args:
        this_data - surprise.dataset; the loaded json data.

    Returns:
        predictions - surprise library object; all predictions generated by algo.  
    """
    print("Running algo...")
    trainset = this_data.build_full_trainset()

    NMF_algo = NMF(biased=False, n_epochs=50, n_factors=35)

    NMF_algo.fit(trainset)

    testset = trainset.build_anti_testset()

    predictions = NMF_algo.test(testset)

    print("Getting predictions...")

    return predictions
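train_algo returns predictions for every user-item pair not in the training data; combined with a helper like the get_top_n sketch earlier, per-user recommendation lists could be derived roughly as follows (the data loading itself is assumed):

predictions = train_algo(data)        # data: a loaded surprise Dataset, as in the other examples
top_n = get_top_n(predictions, n=10)  # get_top_n as sketched above
for uid, ranked in list(top_n.items())[:3]:
    print(uid, [iid for iid, _ in ranked])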
def surpriseNMF(mode,
                DataPath='../data/data_clean.txt',
                TrainPath='../data/train_clean.txt',
                TestPath='../data/test_clean.txt',
                n_factors=15,
                n_epochs=50,
                reg_pu=0.06,
                reg_qi=0.06,
                reg_bu=0.02,
                reg_bi=0.02,
                lr_bu=0.005,
                lr_bi=0.005,
                init_low=0,
                init_high=1,
                biased=False,
                verbose=True):

    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    if mode == 'evaluation':

        # train data processing
        train = pd.read_csv(TrainPath, sep="\t", header=None)
        train.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(train[["User Id", "Movie Id", "Rating"]],
                                    reader)
        trainset = data.build_full_trainset()

        # fit model
        algo = NMF(n_factors=n_factors,
                   n_epochs=n_epochs,
                   reg_pu=reg_pu,
                   reg_qi=reg_qi,
                   reg_bu=reg_bu,
                   reg_bi=reg_bi,
                   lr_bu=lr_bu,
                   lr_bi=lr_bi,
                   init_low=init_low,
                   init_high=init_high,
                   biased=biased,
                   verbose=verbose)
        algo.fit(trainset)

        # evaluate train error
        test = trainset.build_testset()
        predictions = algo.test(test)
        train_err = accuracy.rmse(predictions, verbose=False)

        # test data processing
        test = pd.read_csv(TestPath, sep="\t", header=None)
        test.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(test[["User Id", "Movie Id", "Rating"]],
                                    reader)
        testset = data.build_full_trainset()

        # evaluate test error
        test = testset.build_testset()
        predictions = algo.test(test)
        test_err = accuracy.rmse(predictions, verbose=False)

        # Return V (qi),  U (pu), train_err (RMSE), test_err (RMSE)
        return algo.qi, algo.pu, train_err, test_err

    elif mode == 'visualization':

        # train data processing
        alldata = pd.read_csv(DataPath, sep="\t", header=None)
        alldata.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(alldata[["User Id", "Movie Id", "Rating"]],
                                    reader)
        trainset = data.build_full_trainset()

        # fit model
        algo = NMF(n_factors=n_factors,
                   n_epochs=n_epochs,
                   reg_pu=reg_pu,
                   reg_qi=reg_qi,
                   reg_bu=reg_bu,
                   reg_bi=reg_bi,
                   lr_bu=lr_bu,
                   lr_bi=lr_bi,
                   init_low=init_low,
                   init_high=init_high,
                   biased=biased,
                   verbose=verbose)
        algo.fit(trainset)

        # evaluate train error
        test = trainset.build_testset()
        predictions = algo.test(test)
        train_err = accuracy.rmse(predictions, verbose=False)

        U = algo.pu
        V = algo.qi

        A, _, B = np.linalg.svd(V.T)
        A = A.T
        # Use the first 2 cols for work
        Asub = A[:, :2]

        Uproj = np.dot(Asub.T, U.T)
        Vproj = np.dot(Asub.T, V.T)

        # Return Vproj,  Uproj, train_err (RMSE of Y = U^T V)
        return Vproj, Uproj, train_err
Example #19
k_values = np.arange(2, 51, 2)
RMSE = list()
MAE = list()

# define a cross-validation iterator
kf = KFold(n_splits=10)

for k in k_values:
    algo = NMF(n_factors=k)
    this_k_RMSE = list()
    this_k_MAE = list()
    # use cross-validation iterator, perform CV manually
    for trainset, testset in kf.split(data):
        # fit the whole trainset
        algo.fit(trainset)

        #trim testset here
        testset_df = pd.DataFrame(testset,
                                  columns=['userId', 'movieId', 'rating'])
        testset_popular = testset_df.groupby("movieId").filter(
            lambda x: len(x) > 2).values.tolist()
        # testset_unpopular = testset_df.groupby("movieId").filter(lambda x: len(x) <= 2).values.tolist()
        # testset_highvariance =testset_df.groupby("movieId").filter(lambda x: np.var(x['rating'])>=2 and len(x)>=5 ).values.tolist()

        predictions = algo.test(testset_popular)
        this_k_RMSE.append(accuracy.rmse(predictions, verbose=True))

    RMSE.append(np.mean(this_k_RMSE))

plt.figure(figsize=[6, 5]).set_tight_layout(True)
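The snippet above stops right after creating the figure; a plausible continuation, assuming the intent was to plot the mean cross-validated RMSE against the number of factors k:

plt.plot(k_values, RMSE, marker='o')
plt.xlabel('Number of latent factors k')
plt.ylabel('Mean RMSE (10-fold CV, popular movies)')
plt.grid()
plt.show()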
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    print (score_df)
    print (svd_data.raw_ratings)
    #Try SVD
    algo_svd = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svd, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo_svd.fit(full_train_set)
    predictions = algo_svd.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    latent_usr_factor = algo_svd.pu 
    latent_item_factor = algo_svd.qi 
    user_bias = algo_svd.bu
    item_bias = algo_svd.bi
    recomendation_reportname_df_svd = pd.merge(recommendation_df_svd, df_reports_id, how = 'left', on= 'report_id')

    
    #Try SVD++
    algo_svdpp = SVDpp()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svdpp, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD++
    algo_svdpp.fit(full_train_set)
    predictions = algo_svdpp.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svdpp = get_top_n(predictions, n=5)
    latent_usr_factor_pp = algo_svdpp.pu
    latent_item_factor_pp = algo_svdpp.qi
    user_bias_pp = algo_svdpp.bu
    item_bias_pp = algo_svdpp.bi
    recomendation_reportname_df_svdpp = pd.merge(recommendation_df_svdpp, df_reports_id, how = 'left', on= 'report_id')

      #Try SVD++ with more factors as default is 20
    algo_svdpp_mod = SVDpp(n_factors =50, n_epochs = 50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svdpp_mod, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the modified SVD++
    algo_svdpp_mod.fit(full_train_set)
    predictions = algo_svdpp_mod.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    print (score)
    
    #print (recommendation_df)
    
    
    #Try the NMF
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo_nmf = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_nmf, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo_nmf.fit(full_train_set)
    predictions = algo_nmf.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    latent_usr_factor_nmf = algo_nmf.pu
    latent_item_factor_nmf = algo_nmf.qi
    user_bias_nmf = algo_nmf.bu
    item_bias_nmf = algo_nmf.bi
    recomendation_reportname_df_mmf = pd.merge(recommendation_df_nmf, df_reports_id, how = 'left', on= 'report_id')
    sidd_recmidation = recomendation_reportname_df_mmf.loc[recomendation_reportname_df_mmf['user_sso'] == 212568816]
    
        #Try the NMF without default
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo_nmf_mod = NMF(n_factors =50, n_epochs = 50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_nmf_mod, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the modified NMF
    algo_nmf_mod.fit(full_train_set)
    predictions = algo_nmf_mod.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    latent_usr_factor_nmf = algo_nmf_mod.pu
    latent_item_factor_nmf = algo_nmf_mod.qi
    user_bias_nmf = algo_nmf_mod.bu
    item_bias_nmf = algo_nmf_mod.bi
    recomendation_reportname_df_mmf = pd.merge(recommendation_df_nmf, df_reports_id, how = 'left', on= 'report_id')
    sidd_recmidation = recomendation_reportname_df_mmf.loc[recomendation_reportname_df_mmf['user_sso'] == 212568816]
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
     # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(algo_svd, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(algo_svdpp,svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(algo_nmf, svd_data, cv=5, n_jobs=5, verbose=False) 
    svdpp_cv_mod = cross_validate(algo_svdpp_mod,svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv_mod = cross_validate(algo_nmf_mod, svd_data, cv=5, n_jobs=5, verbose=False) 
Example #21
file_path = os.path.expanduser(
    '/Volumes/DATA/Downloads/ml-latest-small/ratings.csv')
data = Dataset.load_from_file(file_path,
                              reader=Reader(
                                  line_format='user item rating timestamp',
                                  sep=',',
                                  skip_lines=1))

thre_set = [2.5, 3, 3.5, 4]
subplotindex = 220
plt.figure(figsize=[12, 10]).set_tight_layout(True)

for thre in thre_set:
    trainset, testset = train_test_split(data, test_size=0.1)
    algo = NMF(n_factors=20)  # Found in Q18
    predictions = algo.fit(trainset).test(testset)
    predicted_ratings = [t[3] for t in predictions]
    ground_truth_ratings = [t[2] for t in testset]
    ground_truth_labels = [1 if t >= thre else 0 for t in ground_truth_ratings]
    # Plot ROC
    fpr, tpr, _ = roc_curve(ground_truth_labels, predicted_ratings)
    roc_auc = auc(fpr, tpr)
    lw = 2
    subplotindex = subplotindex + 1
    plt.subplot(subplotindex)
    plt.plot(fpr,
             tpr,
             color='darkorange',
             lw=lw,
             label='ROC curve (area = %0.4f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Load the MovieLens 1M dataset  UserID::MovieID::Rating::Timestamp
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Configure the algorithm. K = number of neighbors. Name = type of similarity measure. User based = user-based or item-based filtering.

algoritmo = NMF(n_epochs=5)

algoritmo.fit(trainset)

# Select the user and the movie to analyze
# User 49: between 18 and 24 years old, a programmer living in Houston, Texas
uid = str(49)
# Movie seen and rated: Negotiator, The (1998)::Action|Thriller. Rating given: 4
iid = str(2058)  # raw item id

# get a prediction for specific users and items.
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
test_pred = algoritmo.test(testset)

# Evaluate RMSE
print("RMSE evaluation: ")
accuracy.rmse(test_pred, verbose=True)
Example #23
        knn_results.append(results)

        print("KNNMean")
        knnmean = KNNWithMeans(sim_options={"name": "cosine"})
        knnmean.fit(trainset)
        knnmean_predictions = knnmean.test(testset)
        results = get_group_measures(preds_all=knnmean_predictions,
                                     U1=U1_users,
                                     U2=U2_users,
                                     U3=U3_users,
                                     U4=U4_users)
        knnmean_results.append(results)

        print("NMF")
        nmf = NMF()
        nmf.fit(trainset)
        nmf_predictions = nmf.test(testset)
        results = get_group_measures(preds_all=nmf_predictions,
                                     U1=U1_users,
                                     U2=U2_users,
                                     U3=U3_users,
                                     U4=U4_users)
        nmf_results.append(results)
        """print("TOP")
        top = TOP()
        top.fit(trainset)
        top_predictions = top.test(testset)
        results = get_group_measures(preds_all=top_predictions, U1=U1_users, U2=U2_users, U3=U3_users, U4=U4_users)
        top_results.append(results)

        print("NormalPredictor")
class NMF_Cosine_Recommender:
    """[summary]
       @author Will Jobs
    """
    def __init__(self,
                 df_users,
                 df_movies,
                 df_ratings,
                 df_movie_lens_tags,
                 biased=False):
        """[summary]

        Args:
            df_users ([type]): [description]
            df_movies ([type]): [description]
            df_ratings ([type]): [description]
            df_movie_lens_tags ([type]): [description]
            biased
        """
        self.users = df_users
        self.movies = df_movies
        self.ratings = df_ratings
        self.ml_tags = df_movie_lens_tags
        self.biased = biased
        self.trained_nmf = False
        self.preprocessed = False
        self.trained_cosine = False
        self.cv_score = None
        self.cv_fit_time = None
        self.movies_merged = pd.DataFrame()
        self.nmf_predictions = pd.DataFrame()
        self.tfidf_matrix = None
        self.algo = None
        self.W = None
        self.H = None

    def preprocess_tags(self, verbose=True):
        """[summary]

        Args:
            verbose (bool, optional): [description]. Defaults to True.
            seed ([type], optional): [description]. Defaults to None.
        """
        if self.preprocessed:  # only do this once
            return

        if verbose:
            print('Preprocessing tags and movie information...', end='')

        self.ml_tags.rename(columns={
            'userId': 'userID',
            'movieId': 'movieID'
        },
                            inplace=True)
        self.ml_tags = self.ml_tags.astype({'tag': str})

        tmp_tags = self.ml_tags.copy()
        tmp_movies = self.movies.copy()

        # replace punctuation in tags (a space), movie name (a space), and genres (no space). These will eventually be folded into the tags list
        # doing it this way to avoid altering the original tags during presentation later
        tmp_tags['new_tag'] = tmp_tags.tag.str.replace(r'[^\w\s]', ' ', regex=True)
        tmp_movies['new_name'] = tmp_movies.name.str.replace(r'[^\w\s]', ' ', regex=True)
        tmp_movies['new_genre1'] = tmp_movies.genre1.str.replace(
            r'[^\w\s]', '', regex=True)
        tmp_movies['new_genre2'] = tmp_movies.genre2.str.replace(
            r'[^\w\s]', '', regex=True)
        tmp_movies['new_genre3'] = tmp_movies.genre3.str.replace(
            r'[^\w\s]', '', regex=True)

        # aggregate all users' tags up per movie
        tags_nostrip = tmp_tags.groupby('movieID').tag.apply(
            ' '.join).reset_index()
        tags_nostrip.rename(columns={'tag': 'tags'}, inplace=True)
        tags_strip = tmp_tags.groupby('movieID').new_tag.apply(
            ' '.join).reset_index()
        tags_strip = tags_nostrip.merge(tags_strip, on='movieID')

        # merge name, genres, and tags together
        self.movies_merged = tmp_movies.merge(tags_strip,
                                              on='movieID',
                                              how='left')
        self.movies_merged['tags_strip'] = self.movies_merged.apply(
            lambda x: '{} {} {} {} {}'.format(
                x['new_name'], x['new_genre1'], x['new_genre2']
                if type(x['new_genre2']) != float else "", x['new_genre3']
                if type(x['new_genre3']) != float else "", x['new_tag']),
            axis=1)
        self.movies_merged.drop(columns=[
            'new_tag', 'new_name', 'new_genre1', 'new_genre2', 'new_genre3'
        ],
                                inplace=True)

        # merge in the combined tags (with punctuation)
        self.movies = self.movies.merge(tags_nostrip, on='movieID', how='left')

        self.preprocessed = True

        if verbose:
            print('Done')

    def train_cosine_similarity(self, seed=None, verbose=True):
        """[summary]

        Args:
            seed ([type], optional): [description]. Defaults to None.
            verbose (bool, optional): [description]. Defaults to True.

        Raises:
            RuntimeError: [description]
        """
        if not self.preprocessed:
            raise RuntimeError(
                'Cannot train cosine similarity until preprocessing is done (via preprocess_tags)'
            )

        if self.trained_cosine:  # only do this once
            return

        if seed is not None:
            random.seed(seed)

        vectorizer = TfidfVectorizer(stop_words='english', min_df=3)

        if verbose:
            print('Cosine similarity training...', end='')

        self.tfidf_matrix = vectorizer.fit_transform(
            self.movies_merged['tags_strip'])
        self.trained_cosine = True

        if verbose:
            print('Done')

    def run_nmf(self,
                n_factors=15,
                run_cross_validation=True,
                cv_metric='RMSE',
                seed=None,
                verbose=True):
        """[summary]

        Args:
            n_factors (int, optional): [description]. Defaults to 15.
            run_cross_validation (bool, optional): [description]. Defaults to True.
            cv_metric (str, optional): [description]. Defaults to 'RMSE'.
            seed ([type], optional): [description]. Defaults to None.
            verbose (bool, optional): [description]. Defaults to True.
        """

        # ratings get clipped from 1 to 5
        reader = Reader(rating_scale=(1.0, 5.0))
        data = Dataset.load_from_df(self.ratings, reader)

        # first, calculate CV on a fraction of the dataset
        if run_cross_validation:
            if verbose:
                print('Running cross-validation...', end='')

            if seed is not None:
                random.seed(seed)

            algo = NMF(n_factors=n_factors,
                       biased=self.biased,
                       random_state=seed)
            cv_results = cross_validate(algo,
                                        data,
                                        measures=['RMSE'],
                                        cv=5,
                                        verbose=False)
            avg_cv_result = pd.DataFrame.from_dict(cv_results).mean(axis=0)
            self.cv_score = avg_cv_result['test_' + cv_metric.lower()]
            self.cv_fit_time = avg_cv_result['fit_time']

            if verbose:
                print('Done')
                print('Average CV score: {}\nAverage fit time: {} seconds'.
                      format(round(self.cv_score, 4),
                             round(self.cv_fit_time, 4)))

        if seed is not None:
            random.seed(seed)

        # ratings must have 3 cols: users, items, ratings (in that order)
        train_set = data.build_full_trainset()

        self.algo = NMF(n_factors=n_factors,
                        biased=self.biased,
                        random_state=seed)

        if verbose:
            print('NMF Fitting...', end='')

        self.algo.fit(train_set)

        self.W = self.algo.pu
        self.H = np.transpose(self.algo.qi)

        # get predictions for *every* user/movie combo. These will be also compared to the actual ratings
        if verbose:
            print('Done')
            print('Generating all user-movie pairs for predictions...', end='')

        all_pairs = [(x, y, 0) for x in self.users.userID
                     for y in self.movies.movieID]

        # getting predictions for ALL user/movie combos
        # took 40 seconds on 3.4 million rows
        if verbose:
            print('Done')
            print('Calculating predictions on all user-movie pairs...', end='')

        all_preds = self.algo.test(all_pairs)
        all_preds = pd.DataFrame([{
            'userID': y.uid,
            'movieID': y.iid,
            'nmf_prediction': y.est
        } for y in all_preds])

        self.nmf_predictions = all_preds.merge(self.ratings,
                                               on=['userID', 'movieID'],
                                               how='left')
        self.nmf_predictions = self.nmf_predictions[[
            'userID', 'movieID', 'rating', 'nmf_prediction'
        ]]
        self.trained_nmf = True

        if verbose:
            print('Done')

    def train(self,
              n_factors=15,
              run_cross_validation=True,
              seed=None,
              verbose=True):
        """[summary]

        Args:
            n_factors (int, optional): [description]. Defaults to 15.
            run_cross_validation (bool, optional): [description]. Defaults to True.
            seed ([type], optional): [description]. Defaults to None.
            verbose (bool, optional): [description]. Defaults to True.
        """
        self.preprocess_tags(verbose=verbose)
        self.train_cosine_similarity(seed=seed, verbose=verbose)
        self.run_nmf(n_factors=n_factors,
                     run_cross_validation=run_cross_validation,
                     seed=seed,
                     verbose=verbose)

    def get_similar_movies(self, movieID, number_of_movies=None, verbose=True):
        """[summary]

        Args:
            movieID ([type]): [description]
            verbose (bool, optional): [description]. Defaults to True.

        Raises:
            RuntimeError: [description]

        Returns:
            [type]: [description]
        """
        if not (self.preprocessed and self.trained_cosine):
            raise RuntimeError(
                'Cannot make recommendations without training NMF, preprocessing, and training cosine first.'
            )

        # get the index of the movie
        idx = np.where(self.movies_merged.movieID == movieID)[0][0]

        if verbose:
            print('Getting similar movies to ' +
                  self.movies_merged.iloc[idx]['name'] + '...',
                  end='')

        y = cosine_similarity(self.tfidf_matrix[idx], self.tfidf_matrix)
        idx_scores = pd.DataFrame(
            [(idx, score)
             for (idx, score) in enumerate(list(y[0])) if score > 0],
            columns=['idx', 'similarity'])

        result = pd.concat([
            self.movies_merged.iloc[idx_scores.idx].reset_index(), idx_scores
        ],
                           axis=1).sort_values(by='similarity',
                                               ascending=False)

        # get rid of transformed columns from movies_merged (except tag), and get the *original* name and genres with punctuation
        result.drop(columns=[
            x for x in [*self.movies_merged.columns, 'index', 'idx']
            if x != 'movieID'
        ],
                    inplace=True)
        result = result.merge(self.movies, on='movieID', how='left')
        result = result[[
            'movieID', 'name', 'year', 'genre1', 'genre2', 'genre3', 'tags',
            'similarity'
        ]]

        if verbose:
            print('Done')

        # don't include the movie we're finding similarities for
        if number_of_movies is not None:
            return result[1:].head(number_of_movies)
        else:
            return result[1:]

    def get_recommendations(self,
                            userID,
                            number_of_recs=5,
                            seed=None,
                            show_user_likes=True,
                            verbose=True):
        """[summary]
        Algorithm:
        1. Get 20 of the users' top ratings. Start with 5s, if > 20 exist, sample 20 randomly.
        2. If fewer than 20 5s exist, sample 4s until get to 20 (or use up all 4s).
            - If there are no 5s or 4s, ignore the user's ratings, and just
              return the <number_of_recs> top predicted ratings for this user. Done.
        3. For each movie in the top list, calculate cosine similarity, and get the 10 most-similar
           movies which the user has NOT seen.
        4. Combine the 20 most-similar lists of 10 movies into a single list.
        5. Remove duplicates from this list, choosing the highest-similarity achieved
        6. For each movie, look up the predicted rating for this user.
        7. Multiply each movie's similarity times the predicted rating.
        8. Return the top <number_of_recs> predicted movies (or all if not enough). Done.

        Args:
            userID ([type]): id of the user to recommend for
            number_of_recs (int, optional): number of recommendations to return. Defaults to 5.
            seed ([type], optional): random seed for sampling the user's top ratings. Defaults to None.
            show_user_likes (bool, optional): print the user's highest-rated movies. Defaults to True.
            verbose (bool, optional): print progress messages. Defaults to True.

        Returns:
            pandas DataFrame: expected ratings. Columns: movieID, name, genres, weighted_rating
        """
        MAX_CONSIDERED_RATINGS = 20
        CONSIDER_N_SIMILAR = 10

        def combine_genres(df):
            # combine genres into a single column. Note that NaNs parse as float during apply
            df['genres'] = df.apply(lambda row: (row['genre1'] if not type(row['genre1'])==float else "") + \
                                                ("/" + row['genre2'] if not type(row['genre2'])==float else "") + \
                                                ("/" + row['genre3'] if not type(row['genre3'])==float else ""), axis=1)
            df.drop(columns=['genre1', 'genre2', 'genre3'], inplace=True)

        def get_subset_ratings():
            if verbose:
                print("Getting user's highest rated movies to start from...",
                      end='')

            all_5s = self.ratings[(self.ratings.userID == userID)
                                  & (self.ratings.rating == 5)]

            if len(all_5s) >= MAX_CONSIDERED_RATINGS:
                subset_ratings = all_5s.sample(MAX_CONSIDERED_RATINGS,
                                               random_state=seed)
            else:
                # use all 5s, and add in 4s until we have <MAX_CONSIDERED_RATINGS>
                subset_ratings = all_5s.copy()
                all_4s = self.ratings[(self.ratings.userID == userID)
                                      & (self.ratings.rating == 4)]
                count_needed = MAX_CONSIDERED_RATINGS - len(all_5s)
                subset_ratings = pd.concat([
                    subset_ratings,
                    all_4s.sample(min(count_needed, len(all_4s)),
                                  random_state=seed)
                ],
                                           ignore_index=True)

            subset_ratings = subset_ratings.merge(
                self.movies[['movieID', 'name']], on='movieID')

            if verbose:
                print('Done')

            return subset_ratings[['userID', 'movieID', 'name', 'rating']]

        def get_most_similar_movies(subset_ratings):
            if verbose:
                print("Finding similar movies to {} movies the user liked...".
                      format(len(subset_ratings)))

            seen_movies = list(
                self.ratings[(self.ratings.userID == userID)].movieID)
            similar_movies = pd.DataFrame()

            for movie in subset_ratings.movieID:
                tmp_similar = self.get_similar_movies(movie, verbose=verbose)

                # limit to movies the user hasn't seen, and limit to top <CONSIDER_N_SIMILAR>
                tmp_similar = tmp_similar[~tmp_similar['movieID'].isin(
                    seen_movies)].head(CONSIDER_N_SIMILAR)
                tmp_similar['similar_to'] = subset_ratings[
                    subset_ratings['movieID'] == movie].name.values[0]
                similar_movies = pd.concat([similar_movies, tmp_similar],
                                           ignore_index=True)

            # now remove duplicates, and get the top similarity for each movie
            similar_movies.sort_values(by='similarity',
                                       ascending=False,
                                       inplace=True)
            similar_movies.drop_duplicates(subset='movieID',
                                           keep='first',
                                           inplace=True)

            return similar_movies

        if not (self.trained_nmf and self.preprocessed
                and self.trained_cosine):
            raise RuntimeError(
                'Cannot make recommendations without training NMF, preprocessing, and training cosine first.'
            )

        if userID not in self.users.userID.values:
            raise ValueError(
                'User {} does not exist in ratings dataset. If this is a new user, create a new user using the average ratings.'
                .format(userID))

        if seed is not None:
            random.seed(seed)

        review_counts = self.ratings[self.ratings.userID ==
                                     userID].rating.value_counts()

        if review_counts.get(5, 0) + review_counts.get(4, 0) == 0:
            # ignore user's ratings, and just get the user's top <number_of_recs> ratings
            if verbose:
                print(
                    "User has no ratings >= 4. Ignoring user's ratings, returning top predicted ratings."
                )

            # get only predicted ratings for ones the user hasn't seen
            subset_ratings = self.nmf_predictions.loc[
                (self.nmf_predictions.userID == userID)
                & (self.nmf_predictions.rating.isna())].copy()
            subset_ratings = subset_ratings.merge(self.movies, on='movieID')
            combine_genres(subset_ratings)

            # add in columns that would have been calculated
            subset_ratings['similar_to'] = ""
            subset_ratings['similarity'] = np.nan
            subset_ratings['weighted_rating'] = subset_ratings[
                'nmf_prediction']

            # reorder columns
            subset_ratings = subset_ratings[[
                'movieID', 'name', 'year', 'genres', 'tags', 'similar_to',
                'similarity', 'nmf_prediction', 'weighted_rating'
            ]]
            subset_ratings.sort_values(by='nmf_prediction',
                                       ascending=False,
                                       inplace=True)

            return subset_ratings.head(number_of_recs)

        # get up to <MAX_CONSIDERED_RATINGS> 5s
        subset_ratings = get_subset_ratings()

        if show_user_likes:
            print('\n---------------\nHighest-reviewed movies for userID {}:'.
                  format(userID))
            print(subset_ratings)
            print('\n---------------\n')

        # get the similarity for each movie in subset_ratings
        similar_movies = get_most_similar_movies(subset_ratings)

        # now we have the similarity scores for the movies most like the movies the user rated highest
        # get the predicted ratings, and multiply those by the similarity scores
        if verbose:
            print(
                "Getting user's predicted ratings and calculated expected rating...",
                end='')

        user_predictions = self.nmf_predictions[self.nmf_predictions['userID']
                                                == userID]
        similar_movies = similar_movies.merge(user_predictions,
                                              on='movieID',
                                              how='inner')
        similar_movies['weighted_rating'] = similar_movies[
            'similarity'] * similar_movies['nmf_prediction']

        if verbose:
            print('Done')
            print("Finalizing output...", end='')

        # combine genres and reorder columns
        combine_genres(similar_movies)
        similar_movies = similar_movies[[
            'movieID', 'name', 'year', 'genres', 'tags', 'similar_to',
            'similarity', 'nmf_prediction', 'weighted_rating'
        ]]
        similar_movies.sort_values(by='weighted_rating',
                                   ascending=False,
                                   inplace=True)

        if verbose:
            print('Done')

        return similar_movies.head(number_of_recs)
# X_overall assumed to be the feature matrix paired with the Y_overall labels used below
X_train, X_test, Y_train, Y_test = train_test_split(X_overall,
                                                     Y_overall,
                                                     test_size=0.3)
clf = SVC(gamma="auto")
clf.fit(X_train, Y_train)
predict = clf.predict(X_test)
print(
    f"SVM Accuracy Score: {metrics.accuracy_score(Y_test,predict)*100:0.4f}%")

#NMF
data = pd.concat([df['reviewerID'], df['asin'], df['overall']], axis=1)
data2 = pd.concat([df['reviewerID'], df['asin'], df['overall']], axis=1)

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)
NMFModel = NMF()
NMFModel.fit(data.build_full_trainset())

predicted = []
for indx, row in data2.iterrows():
    pred = NMFModel.predict(uid=row['reviewerID'], iid=row['asin']).est
    predicted.append(pred)

true = df['overall'].tolist()
acc = 0
for i in range(len(true)):
    if int(round(predicted[i])) == true[i]:
        acc += 1

avg = acc / len(true)

print(f"NMF Accuracy: {avg*100:0.4f}%")
Example #26
plt.grid()
plt.title('3-Fold CV - Number of Factors')
plt.savefig('3_fold_CV_Reg_Param_NMF_n_factors.png')

# %% Best Hyper-parameters Training
alg = NMF()

alg.biased = Grid_Search_Result.best_params['rmse']['biased']
alg.n_epochs = Grid_Search_Result.best_params['rmse']['n_epochs']
alg.n_factors = Grid_Search_Result.best_params['rmse']['n_factors']
alg.reg_pu = Grid_Search_Result.best_params['rmse']['reg_pu']
alg.reg_qi = Grid_Search_Result.best_params['rmse']['reg_qi']

start = time.time()

alg.fit(data_train.build_full_trainset())

end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# %% Loading Test Data
file_path = "Data/sample_submission.csv"
data_test = utils.load_data_desired(file_path)

# %% Prediction
Predict_Test = []

for line in data_test:
    Predict_Test.append(alg.predict(str(line[1]), str(line[0])).est)
import os
import numpy as np
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, NMF
import csv

# Load the dataset. The path needs to be set to where ml-latest-small is downloaded
file_path = os.path.expanduser('/Volumes/DATA/Downloads/ml-latest-small/ratings.csv')
data = Dataset.load_from_file(file_path, reader=Reader(line_format='user item rating timestamp', sep=',', skip_lines=1))
full_trainset = data.build_full_trainset()

with open('/Volumes/DATA/Downloads/ml-latest-small/movies.csv') as f:
    movieGenres=[tuple(line) for line in csv.reader(f)]

algo = NMF(n_factors=20)
algo.fit(full_trainset)

col2exam = 5
Vcol = algo.qi[:,col2exam]
sortedCol = Vcol.argsort()
topTen = [full_trainset.to_raw_iid(r) for r in sortedCol[sortedCol.shape[0]-10:sortedCol.shape[0]]]

for movie in topTen:
    print([item[2] for item in movieGenres if item[0] == movie])
Example #28
test_raw_ratings = raw_ratings[:int(amt_test * len(raw_ratings))]

# Uses training set
data.raw_ratings = train_raw_ratings

# Finds best parameters for NMF model with bias
# Scores using MSE
params = {"biased": [True], "n_factors": np.arange(2, 12, 2)}
nmf = GridSearchCV(NMF, params, measures=["mse"], cv=3)
nmf.fit(data)

print("\nBest number of factors found:", nmf.best_params['mse']['n_factors'])

# Trains NMF using best parameters found
best_nmf = NMF(biased=True, n_factors=nmf.best_params['mse']['n_factors'])
best_nmf.fit(data.build_full_trainset())

# Tests on training set
predictions = best_nmf.test(data.build_full_trainset().build_testset())
mse = accuracy.mse(predictions, verbose=False)
print("Training Set MSE:", mse)

# Scores on test set
predictions = best_nmf.test(data.construct_testset(test_raw_ratings))
mse = accuracy.mse(predictions, verbose=False)
print("Test Set MSE:", mse)

# Checks Recommendations
# -----------------------
recs = defaultdict(list)  # List of recommendations for each user
num_recs = 5  # Number of recommendations to get for each user
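The example is cut off after initializing recs; a plausible continuation, filling recs with each user's num_recs highest-estimate items from the predictions computed above:

for pred in predictions:
    recs[pred.uid].append((pred.iid, pred.est))
for uid in recs:
    recs[uid] = sorted(recs[uid], key=lambda x: x[1], reverse=True)[:num_recs]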
def best_pred():
    review['새주소'] = review['장소'] + "*" + review['주소']
    review2 = review.drop([
        '장소', '주소', '위도', '경도', '분류', '대분류', '주소1', '주소2', '방문횟수', '년도', '월',
        '계절'
    ],
                          axis=1)
    review2 = review2[['이름', '새주소', '별점']]

    # Reduce the dimensionality of the dataset
    # Exclude places and users with only a small number of ratings
    min_ratings = 50
    filter_review = review2['새주소'].value_counts() > min_ratings
    filter_review = filter_review[filter_review].index.tolist()

    min_user_ratings = 50
    filter_users = review2['이름'].value_counts() > min_user_ratings
    filter_users = filter_users[filter_users].index.tolist()

    review_new = review2[(review2['새주소'].isin(filter_review))
                         & (review2['이름'].isin(filter_users))]

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(review_new[['이름', '새주소', '별점']], reader)

    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation with the current algorithm
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)
        trainset, testset = train_test_split(data, test_size=0.25)
        predictions = algorithm.fit(trainset).test(testset)
        # accuracy.rmse(predictions)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

    surprise_results = pd.DataFrame(benchmark).set_index(
        'Algorithm').sort_values('test_rmse')

    # Train and Predict
    # CoClustering showed the best RMSE, so use it for training and prediction,
    # together with alternating least squares (ALS)
    algo = NMF()
    cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

    # Use train_test_split() to sample a training set and a validation set
    # Use RMSE as the accuracy measure
    # Train the algorithm on the training set with fit(), then return the
    # predictions generated from the validation set with test()
    trainset, testset = train_test_split(data, test_size=0.25)
    # algo = BaselineOnly(bsl_options=bsl_options)
    algo = NMF()
    predictions = algo.fit(trainset).test(testset)

    # dump.dump('./dump_file',predictions, algo)
    # predictions, algo = dump.load('./dump_file')

    trainset = algo.trainset

    # To inspect the predictions closely, build a DataFrame of all of them

    def get_Iu(uid):
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(iid):
        try:
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:
            return 0

    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['Iu'] = df.uid.apply(get_Iu)
    df['Ui'] = df.iid.apply(get_Ui)
    df['err'] = abs(df.est - df.rui)

    predictions = df.sort_values(by='err').drop_duplicates('iid')

    best_predictions = predictions[:100]
    worst_predictions = predictions[-10:]

    # tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],index=['Algorithm']))
    best_predictions['iid'] = best_predictions.iid.str.split('*').str[0]

    sql = "insert into rec(rec_uid, rec_iid, rec_rui, rec_est) values(:rec_uid, :rec_iid, :rec_rui, :rec_est)"
    data = best_predictions[['uid', 'iid', 'rui', 'est']]
    data.columns = ['rec_uid', 'rec_iid', 'rec_rui', 'rec_est']
    cursor.close()
    conn.close()
    return data
from surprise import Reader, Dataset, SVD, evaluate, NMF
import zipfile

#Unzip the file
"""file = zipfile.ZipFile('/home/shanmukha/AnacondaProjects/Spyder_projects/Recommendation_trail/ml-100k.zip','r')
file.extractall()
file.close()
"""
#Read dataset
reader = Reader(line_format='user item rating timestamp', sep='\t')
dataset = Dataset.load_from_file(file_path='./ml-100k/u.data', reader=reader)

#Split dataset (uses the pre-1.1 Surprise API; dataset.split and evaluate were removed in later versions)
dataset.split(n_folds=5)

#Using SVD,NMF
algo1 = SVD()
algo2 = NMF()
#evaluate(algo,dataset,measures=['RMSE','MAE'])

#Training entire dataset
train_data = dataset.build_full_trainset()
algo1.fit(train_data)
algo2.fit(train_data)

#predicting
user = str(196)
item = str(302)
actual_rating = 4
print(algo1.predict(user, item, actual_rating))
print(algo2.predict(user, item, actual_rating))