Example No. 1
    def train(self):

        self.dataSetConstruct()
        print("model training...")
        self.algo = surprise.SVDpp(n_factors=1,
                                   n_epochs=300,
                                   lr_all=0.001,
                                   reg_all=0.01)
        surprise.model_selection.cross_validate(self.algo,
                                                self.data,
                                                measures=['RMSE', 'MAE'],
                                                cv=3,
                                                verbose=True)

        print("model training complete")
        print("Making predictions...")
        self.predictions = self.algo.test(self.testset)
        print("Predictions made")

        self.predictionDict = defaultdict(list)
        for uid, iid, true_r, est, _ in self.predictions:
            self.predictionDict[uid].append((iid, est))

        print("Sorting results...")
        for uid, ratings in self.predictionDict.items():
            self.predictionDict[uid] = sorted(ratings,
                                              key=lambda x: x[1],
                                              reverse=True)[:self.n]
        print("Sorting complete")

        with open(self.dictPath, 'wb') as f:
            pickle.dump(self.predictionDict, f)
        print('Dict saved')
Example No. 2
    def __init__(self, hyper_params, user_count, item_count):
        latent_size = hyper_params['latent_size']

        if hyper_params['model_type'] == 'kNN':
            self.model = surprise.prediction_algorithms.knns.KNNBasic(
                k=10, verbose=True)
        elif hyper_params['model_type'] == 'NMF':
            self.model = surprise.NMF(n_factors=latent_size,
                                      biased=False,
                                      n_epochs=50,
                                      verbose=True)
        elif hyper_params['model_type'] == 'SVD':
            self.model = surprise.SVD(n_factors=latent_size, verbose=True)
        elif hyper_params['model_type'] == 'SVD++':
            self.model = surprise.SVDpp(n_factors=latent_size, verbose=True)
        elif hyper_params['model_type'] == 'baseline':
            bsl_options = {
                'method': 'sgd',
                'n_epochs': 20,
            }
            self.model = surprise.prediction_algorithms.baseline_only.BaselineOnly(
                bsl_options=bsl_options, verbose=True)
        else:
            raise ValueError('Unknown model_type: ' +
                             str(hyper_params['model_type']))

        self.hyper_params = hyper_params
        self.user_count = user_count
        self.item_count = item_count
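
A hypothetical construction call for the wrapper above (the enclosing class is not shown in the scrape, so Model is a placeholder name):

hyper_params = {'model_type': 'SVD++', 'latent_size': 32}
recommender = Model(hyper_params, user_count=1000, item_count=500)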
Example No. 3
def surprise_SVDpp(trainset, finalset):
    "SVD++ model"
    algo = spr.SVDpp(n_factors=40, n_epochs=20, lr_all=0.001)

    algo.fit(trainset)
    predictions_final = algo.test(finalset)

    return spr_estimate_to_vect(predictions_final)
Example No. 4
def SVDpp(train, test, rate):
    """
    Run the SVD++ model from Surprise library. The number of factors is 40. The number of iterations is 20.
    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @param rate: the learning rate of all parameters.
    @return: the predictions in a numpy array.
    """
    algo = spr.SVDpp(n_factors=40, lr_all=rate, verbose=True)
    algo.fit(train)
    predictions = algo.test(test)
    return get_predictions(predictions)
Example No. 5
def algo_tester(data_object):
    '''
    Produces a dataframe displaying the RMSEs and the test & train times of
    the different Surprise algorithms.

    ---Parameters---
    data_object: variable created from the read_data_surprise function

    ---Returns---
    a dataframe in which you can compare the performance of the different
    algorithms
    '''
    benchmark = []
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_object,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name (pandas removed Series.append
        # in 2.0, so use pd.concat instead)
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = pd.concat([
            tmp,
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm'])
        ])
        benchmark.append(tmp)

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
    return benchmark
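
A hedged usage sketch (read_data_surprise is the loader named in the docstring; the file path is an assumption):

data_object = read_data_surprise('ratings.csv')
print(algo_tester(data_object).head())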
Example No. 6
    def train_model(self):
        # main training component
        def get_top_n(predictions, n=10):
            # get prediction results
            top_n = collections.defaultdict(list)
            for uid, iid, true_r, est, _ in predictions:
                top_n[uid].append((iid, est))

            for uid, user_ratings in top_n.items():
                user_ratings.sort(key=lambda x: x[1], reverse=True)
                top_n[uid] = user_ratings[:30]

            return top_n

        R = pd.read_csv('./Data/Spark_Training.csv')
        reader = surprise.Reader(rating_scale=(0.0, 4.0))
        data = surprise.Dataset.load_from_df(R, reader)
        algo = surprise.SVDpp(lr_all=0.001,
                              n_factors=100,
                              n_epochs=20,
                              reg_all=0.1)
        trainset = data.build_full_trainset()
        testset = trainset.build_anti_testset()
        print('Training started. Depending on your machine, this process '
              'may take more than an hour.')
        algo.fit(trainset)
        # cross validation
        output = surprise.model_selection.cross_validate(
            algo,
            data,
            verbose=True,
            n_jobs=-2,
            cv=3,
            measures=['rmse', 'mae', 'fcp'])
        predictions = algo.test(testset)
        dump_pred = get_top_n(predictions, n=30)
        with open('./Saved Models/test_pred.pkl', 'wb') as f:
            pickle.dump(dump_pred, f, protocol=pickle.HIGHEST_PROTOCOL)
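
To consume the dump written above, a short sketch:

import pickle

with open('./Saved Models/test_pred.pkl', 'rb') as f:
    top_n = pickle.load(f)
# top_n maps each user id to its 30 highest-scoring (item, estimate) pairs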
Example No. 7
    def nb_collaborative_filtering(self, critic, top_n=5):
        lower_rating = self.reviews_rs['rating'].min()
        upper_rating = self.reviews_rs['rating'].max()

        reader = surprise.Reader(rating_scale=(0.0, 10.0))
        data = surprise.Dataset.load_from_df(self.reviews_rs, reader)

        alg = surprise.SVDpp()
        output = alg.fit(data.build_full_trainset())

        # Get a list of all unique movies
        movies_id = self.reviews_rs['id'].unique()

        # Get the list of movie ids that this critic has rated
        critic_id = self.critic_uid[critic]
        movies_id_critic = self.reviews_rs.loc[self.reviews_rs['critic_uid'] ==
                                               critic_id, 'id']

        # Remove the movies the critic has already rated
        movies_ids_to_pred = np.setdiff1d(movies_id, movies_id_critic)

        testset = [[critic_id, movie_id, 10.0]
                   for movie_id in movies_ids_to_pred]
        predictions = alg.test(testset)

        pred_ratings = np.array([pred.est for pred in predictions])

        # Find the indices of the top_n predicted ratings
        i_max = np.argpartition(pred_ratings, -top_n)[-top_n:]

        # Use this to find the corresponding movie_id to recommend
        print('Top movies for reviewer {0}: {1}'.format(
            critic_id, self.critics[critic_id]))
        for i in i_max:
            movie_id = movies_ids_to_pred[i]
            print('movie_id: {0} with predicted rating: {1}'.format(
                movie_id, pred_ratings[i]))
Example No. 8
preprocessed_dataset = dblp.load_preprocessed_dataset()
x_train, y_train, x_test, y_test = dblp.get_fold_data(fold_counter,
                                                      preprocessed_dataset,
                                                      train_test_indices)
df_train = dblp.create_user_item(x_train, y_train)
reader = sr.Reader(rating_scale=(1, 1))
data_train = sr.Dataset.load_from_df(df_train[['userID', 'itemID', 'rating']],
                                     reader)
df_test = dblp.create_user_item(x_test, y_test)
data_test_temp = sr.Dataset.load_from_df(
    df_test[['userID', 'itemID', 'rating']], reader)

temp = data_test_temp.build_full_trainset()
data_test = temp.build_anti_testset()

algo = sr.SVDpp()
algo.fit(data_train.build_full_trainset())


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Relevant items, recommended items in top k, and their overlap
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls
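
A short usage sketch for the metric above (data_test is the anti-testset built earlier in this example; k and threshold are illustrative):

predictions = algo.test(data_test)
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=3.5)
print(sum(precisions.values()) / len(precisions),
      sum(recalls.values()) / len(recalls))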
Example No. 9
    if args.model == 'NormalPredictor':
        model = surprise.NormalPredictor()
    elif args.model == 'BaselineOnly':
        model = surprise.BaselineOnly()
    elif args.model == 'KNNBasic':
        model = surprise.KNNBasic()
    elif args.model == 'KNNWithMeans':
        model = surprise.KNNWithMeans()
    elif args.model == 'KNNWithZScore':
        model = surprise.KNNWithZScore()
    elif args.model == 'KNNBaseline':
        model = surprise.KNNBaseline()
    elif args.model == 'SVD':
        model = surprise.SVD()
    elif args.model == 'SVDpp':
        model = surprise.SVDpp(verbose=True)
    elif args.model == 'NMF':
        model = surprise.NMF()
    elif args.model == 'SlopeOne':
        model = surprise.SlopeOne()
    elif args.model == 'CoClustering':
        model = surprise.CoClustering()

    # cross_validate(model, trainset, cv=5, verbose=True)
    model.fit(trainset)

    lines = []
    test_path = path + '/Data/test_format.txt'
    with open(test_path, 'r') as test_file:
        for line in tqdm(test_file.readlines()):
            user_id, item_id, timestamp, *tags = line.strip().split(',')
            rating = model.predict(user_id, item_id).est
Example No. 10
def main(args):

    parser = argparse.ArgumentParser(description= \
        'Deploys recommendation algorithms and outputs the recommendations list',\
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--pickleLoadPath", type=str, action='store', \
        help= 'If set=> load topN recoms from pickle file')
    parser.add_argument("--pickleSavePath",
                        type=str,
                        action='store',
                        help='If set => Output .pickle file.')

    parser.add_argument("--proc", type=int, default=multiprocessing.cpu_count(), \
        action='store', \
        help= 'Number of processes to spawn for topN computation\n' +
        'default is number of processors.')
    parser.add_argument("--update_freq", type=int, default=1, action='store', \
        help= 'Number of clicks after which the model is updated')
    parser.add_argument("--topN_list", type=int, nargs="+", required=True, \
        help= 'e.g., --topN_list 5 10 50\n' \
        + 'topN=max(topN_list); the rest of the values are used for evaluation.')
    parser.add_argument("--drop_ratio", type=int, default=0, action='store', \
        help= 'Number of random events to remove from the training set;\n' + \
        'default is 0; Currently not implemented for librec.')
    parser.add_argument("--evalTrain", dest='evalTrain', action='store_true', \
        help='If set => evaluate on training set using k-fold validation.\n' \
            + 'Else => evaluate only on test set')

    parser.add_argument("--dataset", type=str, action='store', \
        help= 'Full path to the dataset.\n' + \
        'Must give --testSize and --validSize for the split')
    parser.add_argument("--testSize",
                        type=int,
                        default=0,
                        action='store',
                        help='TestSet size; default is 0 => no test set')
    parser.add_argument("--validSize", type=int, default=2000, action='store', \
        help= 'Validation Set size; default is 2000.')
    parser.add_argument("--trainSet", type=str, action='store', \
        help= 'Full path to the trainingSet.csv\n' + \
        'If given the (potential) training set split from --dataset will be overwritten')
    parser.add_argument("--validSet", type=str, action='store', \
        help= 'Full path to the validationSet.csv\n' + \
        'If given the (potential) validation set split from --dataset will be overwritten')
    parser.add_argument("--testSet", type=str, action='store', \
        help= 'Full path to the testSet.csv\n' + \
        'If given the (potential) test set split from --dataset will be overwritten')

    parser.add_argument("--librec_home", type=str, action='store', \
        help= 'Full path to the librec folder cloned from git.')
    parser.add_argument("--config", type=str, action='store', \
        help= 'Full path to the librec .properties file.\n' + \
        'Copy from: https://www.librec.net/dokuwiki/doku.php?id=AlgorithmList')
    parser.add_argument("--surprise_algo", type=str, action='store', \
        help= 'Choose algorithm from surprise lib. Available options:\n' + \
        '--surprise_algo SVD\n' + \
        '--surprise_algo SVDpp\n' + \
        '--surprise_algo PMF\n' + \
        '--surprise_algo NMF\n' + \
        '--surprise_algo KNNWithMeans\n')

    args = parser.parse_args(args)

    random.seed(42)  # reproducibility
    np.random.seed(42)

    if args.pickleLoadPath is None:
        """DATA"""
        train, valid, test = splitter.splitData(
              fullDataPath=args.dataset, validSize=args.validSize, testSize=args.testSize, \
              trainSetPath=args.trainSet, validSetPath=args.validSet, testSetPath=args.testSet)
        """RECOMMENDATIONS"""
        if args.surprise_algo == 'SVD':
            algo = surprise.SVD()
        elif args.surprise_algo == 'KNNWithMeans':
            #     sim_options = {'name': 'pearson_baseline', 'shrinkage': 2500, \
            #        'user_based': False, }
            sim_options = {'name': 'cosine', 'user_based': False}
            algo = surprise.KNNWithMeans(k=40, sim_options=sim_options)
        elif args.surprise_algo == 'PMF':
            # PMF corresponds to unbiased SVD (no baseline terms) in Surprise
            algo = surprise.SVD(n_factors=5,
                                biased=False,
                                reg_all=0.12,
                                lr_all=0.005,
                                n_epochs=400)
        elif args.surprise_algo == 'NMF':
            algo = surprise.NMF(n_factors=5, n_epochs=400)
        elif args.surprise_algo == 'SVDpp':
            algo = surprise.SVDpp()

        testList = []  # output recommendations for the last element
        if len(test) > 0:
            testList.append(test)
        if len(valid) > 0:
            testList.append(valid)

        for test in testList:
            if args.librec_home is None:
                recs = surprise_recom(train, test, algo, drop_ratio=args.drop_ratio, \
                    update_freq=args.update_freq, N_list=args.topN_list, num=args.proc, \
                    evalTrain=args.evalTrain)
            else:
                recs = librec_recom(train, test, args.librec_home, args.config, \
                    update_freq=args.update_freq, N_list=args.topN_list, num=args.proc, \
                    evalTrain=args.evalTrain)

        if args.pickleSavePath is not None:
            with open(args.pickleSavePath, 'wb') as handle:
                pickle.dump(recs, handle)

    else:
        with open(args.pickleLoadPath, 'rb') as handle:
            recs = pickle.load(handle)
Example No. 11
def main(train_df, target_df, cache_name="test", force_recompute=[]):
    """Train multiple models on train_df and predicts target_df

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded.

    By default, if a method was already computed it is not recomputed again
    (except if the method name is listed in force_recompute). cache_name
    is the name to use to read and write the cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {list} -- Name(s) of methods to recompute, whether or
        not it was already computed. Useful to only recompute single methods
        without discarding the rest. (default: {[]})

    Returns:
        Dataframe -- Dataframe with predictions for each method as columns,
        IDs as indices
    """
    global algo_in_use
    CACHED_DF_FILENAME = os.path.dirname(
        os.path.abspath(__file__)) +\
        "/cache/cached_predictions_{}.pkl".format(cache_name)
    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # try to retrieve backup dataframe
    try:
        print("Retrieving cached predictions")
        all_algos_preds_df = pd.read_pickle(CACHED_DF_FILENAME)
        print("Ensuring cached IDs match given IDs")
        assert sorted(ids_to_predict) == sorted(
            all_algos_preds_df.index.values)
        print("Indices match, continuing")
    except (FileNotFoundError, AssertionError):
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    for name in all_algos:
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue
        algo = all_algos[name]
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df,
                                      this_algo_preds_df,
                                      left_index=True,
                                      right_index=True)
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)
    print("DONE computing surprize")
    return all_algos_preds_df
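
A hypothetical call illustrating the caching behavior described in the docstring (the dataframe variables are assumptions):

preds_df = main(train_df, target_df, cache_name="submission",
                force_recompute=["SVD ++"])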
Example No. 12
    def fit(self):
        # fit model on dataset
        self.model = surprise.SVDpp().fit(self.rating_data)
Example No. 13
# defining the number of folds = 5
print("Performing splits...")
kf = sp.model_selection.KFold(n_splits=5, random_state=0)
print("Done.")

###
### PART 1.1
###
'''
Application of all the recommendation algorithms made available by the
Surprise library, using their default configurations.
'''
algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),\
              sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),\
              sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()]
for elem in algorithms:
    start_time = time.time()
    algo = elem
    sp.model_selection.cross_validate(algo, data, measures=['RMSE'], \
                                      cv=kf, n_jobs = 2, verbose=True)
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

###
### PART 1.2
###
'''
Improvement of the quality of both the KNNBaseline and SVD methods,
by performing hyper-parameter tuning over 5 folds
(Random-Search-Cross-Validation - KNN).
'''
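
# The tuning code itself was lost in extraction; below is a minimal sketch of
# what it likely looked like, using Surprise's RandomizedSearchCV on
# KNNBaseline. The parameter ranges are illustrative assumptions.
from surprise.model_selection import RandomizedSearchCV

param_dist = {
    'k': [20, 40, 60],
    'min_k': [1, 5],
    'sim_options': {
        'name': ['pearson_baseline', 'cosine'],
        'user_based': [True, False],
    },
}
random_search = RandomizedSearchCV(sp.KNNBaseline,
                                   param_dist,
                                   n_iter=10,
                                   measures=['rmse'],
                                   cv=kf,
                                   n_jobs=2)
random_search.fit(data)
print(random_search.best_score['rmse'])
print(random_search.best_params['rmse'])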
Example No. 14
dataset = pd.read_csv(
    'C:/Users/Sridhar Sanobat/Documents/Data Science Examples/filmtrust/ratings.csv',
    delimiter=',',
    names=['uid', 'iid', 'rating'])
print(dataset.head())
#
lower_rating = dataset['rating'].min()
upper_rating = dataset['rating'].max()
print('Review range: {0} to {1}'.format(lower_rating, upper_rating))
#
import sklearn
import surprise
#
reader = surprise.Reader(rating_scale=(0.5, 4))
data = surprise.Dataset.load_from_df(dataset, reader)
print("Now starting SVD calculation")
#
alg = surprise.SVDpp()
#

train = data.build_full_trainset()
output = alg.fit(train)

print("Displaying training data")
print(train)
print(output)  # Extra line added
#

pred = alg.predict(uid='50', iid='52')
score = pred.est
print(score)
##
# Get a list of all movie ids
Example No. 15
                data = surprise.Dataset.load_from_df(
                    datafile[['user_id', 'business_id', 'stars']], reader)
                A_train_dense = list([list(row) for row in A_train_dense])
                for i in range(len(A_train_dense)):
                    A_train_dense[i].append(None)
                A_train_dense = list([tuple(row) for row in A_train_dense])

                A_test_dense = list([list(row) for row in A_test_dense])
                for i in range(len(A_test_dense)):
                    A_test_dense[i].append(None)
                A_test_dense = list([tuple(row) for row in A_test_dense])

                trainset = data.construct_trainset(A_train_dense)
                testset = data.construct_testset(A_test_dense)

                # SVDpp:
                algo = surprise.SVDpp()
                algo.fit(trainset)
                predictions = algo.test(testset)
                print("model SVDpp: ")
                # Then compute RMSE
                accuracy.rmse(predictions)
                print("NDCG: " +
                      str(sur_ndcg(atstd, predictions, product_index)))
                print("Precision: " +
                      str(sur_precision(atstd, predictions, product_index)))

                # NMF:
                algo = surprise.NMF()
                algo.fit(trainset)
                predictions = algo.test(testset)
                print("model NMF: ")
Example No. 16
import surprise as sp
from surprise import Dataset
from surprise.model_selection import cross_validate
import NetflixDataLoad

# use only the first 100,000 rows for fast processing
reader = sp.Reader(rating_scale=(1, 5))  # load_from_df requires a Reader
data = Dataset.load_from_df(
    NetflixDataLoad.df_filterd[['Cust_Id', 'Movie_Id', 'Rating']][:100000],
    reader)

n_folds = 5

for algo in [sp.SVD(), sp.SVDpp(), sp.KNNBasic(), sp.KNNWithMeans()]:
    print(
        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=n_folds,
                       verbose=True))

# Output Example
# Evaluating RMSE, MAE of algorithm SVD on 5 split(s).
#
#             Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std
# RMSE        0.9311  0.9370  0.9320  0.9317  0.9391  0.9342  0.0032
# MAE         0.7350  0.7375  0.7341  0.7342  0.7375  0.7357  0.0015
# Fit time    6.53    7.11    7.23    7.15    3.99    6.40    1.23
# Test time   0.26    0.26    0.25    0.15    0.13    0.21    0.06
Example No. 17
data = surprise.Dataset.load_from_df(dftrain, reader) 
print("finished combining the data with the reader =", datetime.now().time())

# param_grid = {'lr_all': np.arange(0.008,0.011,0.001), 'reg_all' : [0.1,0.3, 0.5]}
# grid_s = surprise.model_selection.GridSearchCV(surprise.SVDpp,param_grid,measures = ['rmse','mae'],cv = cv)
# grid_s.fit(data)

# dict = grid_s.best_params['rmse']
# dict

# dftrain = train.drop('timestamp', axis = 'columns')
# dftrain = dftrain.reset_index(drop = True)
# data = surprise.Dataset.load_from_df(dftrain, reader) 
# dict = grid_s.best_params['rmse']

alg = surprise.SVDpp()  # lr_all=dict['lr_all'], reg_all=dict['reg_all']
print("finished creating the svdpp object =", datetime.now().time())
print("started model training =", datetime.now().time())


output = alg.fit(data.build_full_trainset())
print(output)
print("finished training the model =", datetime.now().time())


# add a dummy rating column so that test.values yields (uid, iid, rating) triples
test['rating'] = [1] * len(test)
predictions = alg.test(test.values)
del test['rating']
print("finished predictions on testset =", datetime.now().time())
finpred = [m.est for m in predictions]
Example No. 18
    # prepare data for normalization
    scaler = MinMaxScaler(feature_range=(0, 1))

    # train the normalization
    # normalize the dataset
    df[['rating']] = scaler.fit_transform(df[['rating']])
    print(df.head(100))

    # A reader is still needed but only the rating_scale param is required.
    reader = surprise.Reader(rating_scale=(0, 1))

    # The columns must correspond to user id, item id and ratings (in that order).
    dataset = surprise.Dataset.load_from_df(df[['uid', 'iid', 'rating']], reader)

    alg = surprise.SVDpp(lr_all=.001)
    output = alg.fit(dataset.build_full_trainset())
    print(output)

    '''
    pred = alg.predict(uid='3562446', iid='2982938')
    score = pred.est
    print(score)
    '''

    while True:
        print('input uid =>')
        puid = input()
        if puid == 'exit' or not puid:
            break
        if puid == '\r':
            continue
Example No. 19
def part3():
    file_path = 'DMA_project2_team%02d_part2_UIR.csv' % team
    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 10),
                    skip_lines=1)
    data = Dataset.load_from_file(file_path, reader=reader)

    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()

    # TODO: Requirement 3-2. User-based Recommendation
    uid_list = [
        'ffffbe8d854a4a5a8ab1a381224f5b80', 'ffe2f26d5c174e13b565d026e1d8c503',
        'ffdccaff893246519b64d76c3561d8c7', 'ffdb001850984ce69c5f91360ac16e9c',
        'ffca7b070c9d41e98eba01d23a920d52'
    ]
    # TODO - set algorithm for 3-2-1
    algo = surprise.KNNBasic(k=40,
                             min_k=1,
                             sim_options={
                                 'name': 'cosine',
                                 'user_based': True
                             },
                             verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-2-1.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-2-2
    algo = surprise.KNNWithMeans(k=40,
                                 min_k=1,
                                 sim_options={
                                     'name': 'pearson',
                                     'user_based': True
                                 },
                                 verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-2-2.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - 3-2-3. Best Model
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [True]
        }
    }

    # Select the best algo with grid search.
    print('Grid Search for user based model...')
    grid_KNNBasic = GridSearchCV(surprise.KNNBasic,
                                 measures=['rmse'],
                                 param_grid=parameters,
                                 cv=kfold)
    grid_KNNWithMeans = GridSearchCV(surprise.KNNWithMeans,
                                     measures=['rmse'],
                                     param_grid=parameters,
                                     cv=kfold)

    grid_KNNBasic.fit(data)
    grid_KNNWithMeans.fit(data)

    best_KNNBasic_score = grid_KNNBasic.best_score['rmse']
    best_KNNWithMeans_score = grid_KNNWithMeans.best_score['rmse']

    if best_KNNBasic_score < best_KNNWithMeans_score:
        algo_name = 'KNNBasic'
        best_algo_ub = grid_KNNBasic.best_estimator['rmse']
        with_parameters = grid_KNNBasic.best_params['rmse']
        score = best_KNNBasic_score

    else:
        algo_name = 'KNNWithMeans'
        best_algo_ub = grid_KNNWithMeans.best_estimator['rmse']
        with_parameters = grid_KNNWithMeans.best_params['rmse']
        score = best_KNNWithMeans_score

    print('The best UB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # TODO: Requirement 3-3. Item-based Recommendation
    iid_list = ['art', 'teaching', 'career', 'college', 'medicine']
    # TODO - set algorithm for 3-3-1
    algo = surprise.KNNBasic(k=40,
                             min_k=1,
                             sim_options={
                                 'name': 'cosine',
                                 'user_based': False
                             },
                             verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    with open('3-3-1.txt', 'w') as f:
        for iid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('Item ID %s top-10 results\n' % iid)
            for uid, score in ratings:
                f.write('User ID %s\tscore %s\n' % (uid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-3-2
    algo = surprise.KNNWithMeans(k=40,
                                 min_k=1,
                                 sim_options={
                                     'name': 'pearson',
                                     'user_based': False
                                 },
                                 verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    with open('3-3-2.txt', 'w') as f:
        for iid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('Item ID %s top-10 results\n' % iid)
            for uid, score in ratings:
                f.write('User ID %s\tscore %s\n' % (uid, str(score)))
            f.write('\n')

    # TODO - 3-3-3. Best Model
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [False]
        }
    }

    # Select the best algo with grid search.
    print('Grid Search for item based model...')
    grid_KNNBasic = GridSearchCV(surprise.KNNBasic,
                                 measures=['rmse'],
                                 param_grid=parameters,
                                 cv=kfold)
    grid_KNNWithMeans = GridSearchCV(surprise.KNNWithMeans,
                                     measures=['rmse'],
                                     param_grid=parameters,
                                     cv=kfold)

    grid_KNNBasic.fit(data)
    grid_KNNWithMeans.fit(data)

    best_KNNBasic_score = grid_KNNBasic.best_score['rmse']
    best_KNNWithMeans_score = grid_KNNWithMeans.best_score['rmse']

    if best_KNNBasic_score < best_KNNWithMeans_score:
        algo_name = 'KNNBasic'
        best_algo_ib = grid_KNNBasic.best_estimator['rmse']
        with_parameters = grid_KNNBasic.best_params['rmse']
        score = best_KNNBasic_score
    else:
        algo_name = 'KNNWithMeans'
        best_algo_ib = grid_KNNWithMeans.best_estimator['rmse']
        with_parameters = grid_KNNWithMeans.best_params['rmse']
        score = best_KNNWithMeans_score

    print('The best IB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # TODO: Requirement 3-4. Matrix-factorization Recommendation
    # TODO - set algorithm for 3-4-1
    algo = surprise.SVD(n_factors=100, n_epochs=50, biased=False)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-1.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-2
    algo = surprise.SVD(n_factors=200, n_epochs=100, biased=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-2.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-3
    algo = surprise.SVDpp(n_factors=100, n_epochs=50)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-3.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-4
    algo = surprise.SVDpp(n_factors=100, n_epochs=100)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-4.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - 3-4-5. Best Model
    kfold = KFold(n_splits=5, random_state=0)
    parameters_SVD = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200],
        'biased': [True, False]
    }
    grid_SVD = GridSearchCV(surprise.SVD,
                            measures=['rmse'],
                            param_grid=parameters_SVD,
                            cv=kfold)
    parameters_SVDpp = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200]
    }
    grid_SVDpp = GridSearchCV(surprise.SVDpp,
                              measures=['rmse'],
                              param_grid=parameters_SVDpp,
                              cv=kfold)

    grid_SVD.fit(data)
    grid_SVDpp.fit(data)

    best_SVD_score = grid_SVD.best_score['rmse']
    best_SVDpp_score = grid_SVDpp.best_score['rmse']

    if best_SVD_score < best_SVDpp_score:
        algo_name = 'SVD'
        best_algo_mf = grid_SVD.best_estimator['rmse']
        with_parameters = grid_SVD.best_params['rmse']
        score = best_SVD_score

    else:
        algo_name = 'SVDpp'
        best_algo_mf = grid_SVDpp.best_estimator['rmse']
        with_parameters = grid_SVDpp.best_params['rmse']
        score = best_SVDpp_score

    print('The best MF algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)
Example No. 20
            f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
        f.write('\n')

# TODO - 4-1-2. SVD, n_factors=200, n_epochs=100, biased=True
algo = surprise.SVD(n_factors=200, n_epochs=100, biased=True)
algo.fit(trainset)
results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
with open('4-1-2_results.txt', 'w') as f:
    for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])):
        f.write('User ID %s top-10 results\n' % uid)
        for iid, score in ratings:
            f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
        f.write('\n')

# TODO - 4-1-3. SVD++, n_factors=100, n_epochs=50
algo = surprise.SVDpp(n_factors=100, n_epochs=50)
algo.fit(trainset)
results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
with open('4-1-3_results.txt', 'w') as f:
    for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])):
        f.write('User ID %s top-10 results\n' % uid)
        for iid, score in ratings:
            f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
        f.write('\n')

# TODO - 4-1-4. SVD++, n_factors=50, n_epochs=100
algo = surprise.SVDpp(n_factors=50, n_epochs=100)
algo.fit(trainset)
results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
with open('4-1-4_results.txt', 'w') as f:
    for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])):
Example No. 21
    df = pd.DataFrame(ratings_dict)
    return sp.Dataset.load_from_df(df[['uid', 'vid', 'r']], reader)


def load_data(path, r_range):
    train_set = convert_to_df(np.load(path + ".train"), r_range)
    test_set = convert_to_df(np.load(path + ".test"), r_range)
    return (train_set.build_full_trainset(),
            test_set.build_full_trainset().build_testset())


if __name__ == '__main__':
    PREFIX = "/Users/morino/Downloads/dataset/"
    names = ['ml-latest-small/ml', 'BX-CSV-Dump/bx', 'jester/jester']
    teller = ["MovieLens", "BookCrossing", "Jester"]
    r_ranges = [(1, 5), (1, 10), (0, 20)]
    algos = [sp.SVD(biased=False), sp.SVDpp(), sp.NMF()]
    algos_names = ['SVD', 'SVD++', 'NMF']

    for i, name in enumerate(names):
        print("BEGIN {}".format(teller[i]))
        train_set, test_set = load_data(PREFIX + name, r_ranges[i])
        for j, algo in enumerate(algos):
            algo.fit(train_set)
            preds = algo.test(test_set)
            print("{} RMSE {}".format(algos_names[j], sp.accuracy.rmse(preds)))
        print("END {}".format(teller[i]))
Example No. 22
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)
knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)
knnBaseline = surprise.KNNBaseline()
knnBaseline_temp = surprise.model_selection.cross_validate(
    knnBaseline, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBaseline-----------------')
print(knnBaseline_temp)
svdpp = surprise.SVDpp()
svdpp_temp = surprise.model_selection.cross_validate(svdpp,
                                                     rating_data,
                                                     measures=['RMSE', 'MAE'],
                                                     cv=5,
                                                     verbose=True)
print('svdpp-----------------')
print(svdpp_temp)
nmf = surprise.NMF()
nmf_temp = surprise.model_selection.cross_validate(nmf,
                                                   rating_data,
                                                   measures=['RMSE', 'MAE'],
                                                   cv=5,
                                                   verbose=True)
print('nmf-----------------')
print(nmf_temp)
Example No. 23
        print('-' * 12)
        print('-' * 12)

    return hr, arhr


if __name__ == '__main__':
    # builtin dataset
    # data = env.Dataset.load_builtin('ml-100k')

    # ===============================  load data  ============================
    # ml-latest-small
    # file_path = 'input/ml-latest-small/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-100k
    file_path = 'input/ml-100k/u.data'
    reader = env.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-20m
    # file_path = 'input/ml-20m/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ==============================================================================

    data = env.Dataset.load_from_file(file_path, reader=reader)
    # data.split() and evaluate() below come from the old (pre-1.1) Surprise
    # API; current releases use model_selection.KFold / cross_validate instead.
    data.split(n_folds=5)
    algo = env.SVDpp()

    # evaluate_topn(algo, data, top_n=100, threshold=3, verbose=1)
    env.evaluate(algo, data, measures=['rmse', 'mae', 'fcp'], verbose=1)
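
    # A modern equivalent of the deprecated split()/evaluate() calls above,
    # under the current Surprise API (a sketch using the same data object):
    from surprise.model_selection import cross_validate
    cross_validate(env.SVDpp(), data, measures=['rmse', 'mae', 'fcp'],
                   cv=5, verbose=True)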
Example No. 24
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into surprise specific data-structure
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    
    # Get results & append algorithm name (pandas removed Series.append in
    # 2.0, so use pd.concat instead)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([
        tmp,
        pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                  index=['Algorithm'])
    ])

    # Store data
    benchmark.append(tmp)

# Store results
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
    'test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]