Exemple #1
0
    def getPrediction(UserId):
        """Train an SVD model on a small in-memory ratings table and
        return the top-10 recommended POI ids for ``UserId`` (None if
        the user has no entry in the top-N map)."""
        # Toy ratings table; a real deployment would load this from the
        # user database instead.
        ratings = pd.DataFrame({
            "userID": [1, 1, 3, 4, 4, 6],
            "POIID": [1, 2, 1, 4, 2, 6],
            "rating": [5, 5, 1, 4, 5, 3],
        })
        print(ratings)

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(ratings[['userID', 'POIID', 'rating']],
                                    reader)

        # Quick sanity check of the dataset with a cheap baseline.
        cross_validate(NormalPredictor(), data, cv=2)

        full_train = data.build_full_trainset()
        algo = SVD()
        algo.fit(full_train)

        # Predict a rating for every (user, item) pair absent from the
        # training set, then keep the 10 best items per user.
        predictions = algo.test(full_train.build_anti_testset())
        top_n = Predictions.get_top_n(predictions, n=10)

        for uid, user_ratings in top_n.items():
            if uid == UserId:
                return [iid for (iid, _) in user_ratings]
Exemple #2
0
def crossvalidate(data):
    """Cross-validate a panel of Surprise algorithms on ``data``.

    Each algorithm is evaluated with 5-fold CV on RMSE; the mean fold
    scores are collected into a DataFrame indexed by algorithm name and
    sorted by test RMSE (best first).
    """
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(),
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            CoClustering()
    ]:
        result = cross_validate(algorithm,
                                data,
                                measures=['RMSE'],
                                cv=5,
                                verbose=False)
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        # Class name from the instance repr, e.g.
        # '<surprise...SVD object at 0x...>' -> 'SVD'.
        name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        temp = pd.concat([temp, pd.Series([name], index=['Algorithm'])])
        results.append(temp)
    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
def normal_prediction():
    """Interactively evaluate NormalPredictor on one of two data sets.

    Prompts for Android (tab-separated) or WordPress (comma-separated)
    ratings, then prints RMSE/MAE over 10-fold cross-validation.
    """
    print('Algoritmo Baseline Only...')
    print('Que data desea utilizar?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding hack so file reads don't choke on UTF-8.
    # NOTE(review): reload/setdefaultencoding only exist on Python 2.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Compare as a string: on Python 3 input() returns str, so the old
    # `data_utilizar == 1` test could never select the Android branch.
    if str(data_utilizar).strip() == '1':
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    algo = NormalPredictor()

    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def test_performances():
    """Test the returned dict. Also do dumping."""
    here = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(here + '/custom_train', here + '/custom_test')]

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    algo = NormalPredictor()
    scratch_dir = tempfile.mkdtemp()  # scratch location for the dump files
    # Measure names use mixed case on purpose: the returned dict is
    # expected to be case-insensitive.
    performances = evaluate(algo,
                            data,
                            measures=['RmSe', 'Mae'],
                            with_dump=True,
                            dump_dir=scratch_dir,
                            verbose=2)
    shutil.rmtree(scratch_dir)  # clean up the scratch dir

    print(performances)
    # Every casing of a measure name must alias the very same entry.
    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
def EvaluateDifferentAlgorithms():
    """Benchmark a panel of Surprise algorithms on ``data_6months``.

    Runs 3-fold CV (RMSE) per algorithm and prints the cumulative
    leaderboard, sorted by RMSE, after each algorithm finishes.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Mean fold scores plus the algorithm's class name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
        benchmark.append(tmp)

        # Progress report: leaderboard so far (printed every iteration).
        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
Exemple #6
0
def BehaviorBasedCF():
    """Compare user-based and item-based cosine KNN against a random
    baseline on the MovieLens data set."""
    np.random.seed(0)
    random.seed(0)

    # Shared data set for every algorithm under evaluation.
    ml, evaluationData, rankings = MyDump.LoadMovieLensData(loader)
    evaluator = Evaluator(evaluationData, rankings, load=True)

    # User-based vs item-based cosine similarity.
    evaluator.AddAlgorithm(
        KNNBasic(sim_options={'name': 'cosine', 'user_based': True}),
        "User KNN")
    evaluator.AddAlgorithm(
        KNNBasic(sim_options={'name': 'cosine', 'user_based': False}),
        "Item KNN")

    # Random baseline.
    evaluator.AddAlgorithm(NormalPredictor(), "Random")

    # load is also False, cause simsMatrix needs to be loaded.
    evaluator.Evaluate(False)

    evaluator.SampleTopNRecs(ml)
Exemple #7
0
def ContentRecs():
    """Evaluate a content-based (item) recommender against a random
    baseline.

    The content algorithm computes the items' cosine-similarity matrix
    inside its fit(). HitRate/topN is not tested here because the
    algorithm recommends from all movies the user has not rated.
    """
    np.random.seed(0)
    random.seed(0)

    # Shared data set for both algorithms.
    ml, evaluationData, rankings = MyDump.LoadMovieLensData(loader)
    evaluator = Evaluator(evaluationData, rankings, load=True)

    evaluator.AddAlgorithm(ContentKNNAlgorithm(), "ContentKNN")

    # Random baseline.
    evaluator.AddAlgorithm(NormalPredictor(), "Random")

    # not topN, able load
    evaluator.Evaluate(False, True)

    # Recommend the default 10 items.
    evaluator.SampleTopNRecs(ml)
def benchmark(data):
    """Cross-validate a panel of algorithms (RMSE/MAE/FCP, 3-fold) on
    ``data`` and persist the leaderboard to 'Algorithm_Benchmark.csv',
    sorted by test RMSE."""
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3,
                                 verbose=False)
        # Mean fold scores plus the algorithm's class name.
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        output = pd.concat([output, pd.Series([name], index=['Algorithm'])])
        performance.append(output)
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
Exemple #9
0
def test_cross_validate(toy_data):
    """Check cross_validate's returned keys for a predefined CV iterator
    and for the default 5-fold CV with train measures enabled."""
    # Single predefined train/test fold on disk.
    here = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(here + '/custom_train', here + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=pkf,
                            verbose=1)
    # One fold -> exactly one entry per measure; no extra keys.
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # cv=None falls back to 5-fold CV; train_* keys must appear when
    # return_train_measures is True.
    ret = ms.cross_validate(algo, toy_data, measures=['rmse', 'mae'], cv=None,
                            return_train_measures=True, verbose=True)
    for key in ('test_rmse', 'test_mae', 'fit_time', 'test_time',
                'train_rmse', 'train_mae'):
        assert len(ret[key]) == 5
Exemple #10
0
def run_surprise():
    """Run 5-fold cross-validation (RMSE/MAE) for four baseline
    algorithms on the built-in MovieLens-100k data set, printing each
    result."""
    # Load the movielens-100k dataset (download it if needed).
    data = Dataset.load_builtin('ml-100k')

    # One loop replaces four copy-pasted cross_validate calls; the
    # evaluation settings are identical for every algorithm.
    for algo in (SVD(), NormalPredictor(), BaselineOnly(), KNNBasic()):
        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)
Exemple #11
0
    def recommender_random(self, train_file, test_file, output):
        """Fit a NormalPredictor (random) recommender on the train split,
        print its RMSE and MAE on the test split, and return the fitted
        algorithm."""
        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)

        # Random baseline: samples ratings from a fitted normal
        # distribution, ignoring user/item identity.
        predictor = NormalPredictor()
        predictor.fit(train)

        preds = predictor.test(test, verbose=False)

        print('RANDOM: ' + ' RMSE ' +
              str(rmse(preds, verbose=False)) + ' MAE ' +
              str(mae(preds, verbose=False)))

        return predictor
Exemple #12
0
def test_cross_validate():
    """Check cross_validate key shapes for a predefined single fold and
    for the default 5-fold split."""
    # Single predefined train/test fold on disk.
    here = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(here + '/custom_train', here + '/custom_test')]

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo,
                            data,
                            measures=['rmse', 'mae'],
                            cv=pkf,
                            verbose=1)
    # One fold -> one entry per measure; fcp was not requested.
    for key in ('test_rmse', 'test_mae', 'fit_time', 'test_time'):
        assert len(ret[key]) == 1
    assert 'test_fcp' not in ret

    # cv=None falls back to 5-fold CV.
    data = Dataset.load_from_file(here + '/custom_dataset', reader)
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=None)
    for key in ('test_rmse', 'test_mae', 'fit_time', 'test_time'):
        assert len(ret[key]) == 5
def check_for_args():
    """Append to the global ``alg_list`` one algorithm instance per
    recognized name found in ``sys.argv`` and return the list.

    Unrecognized arguments (including the script name itself) are
    silently ignored, matching the original elif chain.
    """
    # Name -> class; instantiated only when the name is requested, so
    # behavior matches the original lazy elif ladder.
    factories = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }
    for arg in sys.argv:
        if arg in factories:
            alg_list.append(factories[arg]())

    return alg_list
def model(data, datacsv):
    """Build a model: run a quick 2-fold cross-validation of
    NormalPredictor on ``datacsv`` and print the scores."""
    # SVD model instance (currently unused; the SVD cross-validation
    # call remains disabled, mirroring the original flow).
    algo = SVD()
    # 2-fold cross-validation with the random baseline.
    # print(cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True))
    print(cross_validate(NormalPredictor(), datacsv, cv=2))
Exemple #15
0
def prepareJob(userID):
    """Build a Surprise dataset from the global ``douban_comments``
    frame, train via SVDFun, and return the top-N movies for ``userID``,
    excluding movies the user has already rated."""
    # NOTE(review): duplicated() returns a boolean mask and is discarded
    # here; drop_duplicates() was probably intended -- confirm first.
    douban_comments.duplicated()
    # Columns 8, 9, 10 are assumed to be rating, movie id, user id --
    # TODO confirm against the CSV schema.
    comments = douban_comments.iloc[:, [8, 9, 10]]

    # Movies the target user has already rated (second column).
    rated = comments[comments['userId'] == userID].values
    ratedMovieList = [row[1] for row in rated]

    ratings = []
    movieids = []
    userIds = []
    for rating, movieid, userId in comments.values:
        # Convert both fields before appending so a failure on one
        # cannot leave the three lists out of sync.
        try:
            rating_int = int(rating)
            movieid_int = int(movieid)
        except (TypeError, ValueError):
            # Skip rows whose rating or movie id is not numeric
            # (narrowed from the original bare except).
            continue
        ratings.append(rating_int)
        movieids.append(movieid_int)
        userIds.append(userId)

    ratings_dict = {'itemID': movieids, 'userID': userIds, 'rating': ratings}

    df = pandas.DataFrame(ratings_dict)
    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

    # Quick sanity check of the dataset with a cheap baseline.
    cross_validate(NormalPredictor(), data, cv=2)

    movielist = SVDFun(data, set(userIds), set(movieids), userID)

    return getTopN(movielist, ratedMovieList)
Exemple #16
0
    def EvaluateAllModels(self):
        """Cross-validate every candidate algorithm on ``self.data`` and
        return the leaderboard sorted by test RMSE (lowest first).

        Example output:
                         test_rmse   fit_time  test_time
        Algorithm
        SVDpp             0.965824   9.401286   0.151476
        SVD               0.967286   1.474139   0.062471
        BaselineOnly      0.972408   0.108964   0.057277
        NMF               0.992677   4.073005   0.171846
        KNNWithZScore     1.001898   0.620192   0.083341
        KNNWithMeans      1.002924   0.489803   0.078121
        SlopeOne          1.006664  19.091191   1.275676
        KNNBaseline       1.007437   0.890452   0.088495
        KNNBasic          1.016717   0.432159   0.072929
        NormalPredictor   1.253265   0.041646   0.078105
        CoClustering      1.828291   3.020921   0.052071
        :return: leaderboard DataFrame; the lowest test_rmse comes first.
        """
        benchmark = []
        # Iterate over all algorithms
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NMF(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # Perform cross validation
            results = cross_validate(algorithm,
                                     self.data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Mean fold scores plus the algorithm's class name.
            tmp = pd.DataFrame.from_dict(results).mean(axis=0)
            name = str(algorithm).split(' ')[0].split('.')[-1]
            # pd.concat replaces Series.append, removed in pandas 2.0.
            tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
            benchmark.append(tmp)

        result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse')
        print(result)

        return result
def run(masked_R_coo, unmasked_vals_coo, unmasked_cold_coo, mask_coo, mask_csr, ks, aug):
    """Evaluate a small panel of recommender models in parallel on the
    prepared train/test/cold-start splits, then plot and save results."""
    trainset, testset, cold_testset = setup(masked_R_coo, unmasked_vals_coo, unmasked_cold_coo)

    bias_opts = {'method': 'sgd', 'learning_rate': .00005}
    models = [
        Model(name='random', algo=NormalPredictor(), ks=ks),
        Model(name='bias only', algo=BaselineOnly(verbose=False, bsl_options=bias_opts), ks=ks),
        Model(name='SVD', algo=SVD(verbose=False), ks=ks),
    ]

    # Fan the (model, splits) work items out over a process pool.
    work = [(m, trainset, testset, cold_testset) for m in models]
    with Pool() as pool:
        models = pool.starmap(run_model, work)

    show_and_save(models, aug)
Exemple #18
0
    def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
        """Train the selected algorithm on an 80/20 split of the ratings
        table and return its test RMSE rounded to 4 decimals.

        method: 1=SVD 2=SlopeOne 3=NMF 4=NormalPredictor 5=KNNBaseline
                6=KNNBasic 7=KNNWithMeans 8=KNNWithZScore 9=BaselineOnly
                (any other value: CoClustering)
        similarityMeasure: 1=cosine 2=pearson otherwise pearson_baseline
        isUserBased: "Yes" for user-based similarity, item-based otherwise
        """
        conn = sqlite3.connect(DATABASE_NAME)
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset, testset = train_test_split(data, test_size=.20)

        sim_name = {1: "cosine", 2: "pearson"}.get(similarityMeasure,
                                                   "pearson_baseline")
        sim_options = {'name': sim_name, 'user_based': isUserBased == "Yes"}

        # Lazy factories: only the selected algorithm is instantiated,
        # matching the original if/elif ladder.
        builders = {
            1: SVD,
            2: SlopeOne,
            3: NMF,
            4: NormalPredictor,
            5: lambda: KNNBaseline(sim_options=sim_options),
            6: lambda: KNNBasic(sim_options=sim_options),
            7: lambda: KNNWithMeans(sim_options=sim_options),
            8: lambda: KNNWithZScore(sim_options=sim_options),
            9: BaselineOnly,
        }
        algo = builders.get(method, CoClustering)()

        algo.fit(trainset)
        predictions = algo.test(testset)

        conn.close()

        return round(accuracy.rmse(predictions, verbose=False), 4)
def select_cf_model(algorithms=None):
    """Cross-validate candidate CF models and return (leaderboard, algos).

    algorithms: optional list of Surprise algorithm instances; defaults
    to the full panel. (A None default replaces the old mutable default
    list, which was instantiated at import time and shared across calls.)
    """
    if algorithms is None:
        algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
                      KNNBaseline(), KNNBasic(), KNNWithMeans(),
                      KNNWithZScore(), BaselineOnly(), CoClustering()]
    #=========================Create automated context to pick best CF model========================
    benchmark = []
    algos = []
    # Iterate over all algorithms
    for algorithm in algorithms:
        # 3-fold cross-validation (RMSE) against the module-level ``data``.
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
        algos.append(algorithm)
        # Mean fold scores plus the algorithm's class name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
        benchmark.append(tmp)

    out = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    return out, algos
Exemple #20
0
def func5():
    """Cross-validate NormalPredictor on a tiny hand-written ratings
    frame (demonstrates Dataset.load_from_df with mixed user-id types)."""
    import pandas as pd
    from surprise import NormalPredictor
    from surprise import Dataset
    from surprise import Reader
    from surprise.model_selection import cross_validate

    frame = pd.DataFrame({
        'itemID': [1, 1, 1, 2, 2],
        'userID': [9, 32, 2, 45, 'user_foo'],
        'rating': [3, 2, 4, 3, 1],
    })
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(frame[['userID', 'itemID', 'rating']], reader)
    cross_validate(NormalPredictor(), data, cv=2, verbose=True)
Exemple #21
0
def AutoRec():
    """Evaluate an RBM-based AutoRec model against a random baseline on
    the MovieLens data set."""
    np.random.seed(0)
    random.seed(0)

    ml, evaluationData, rankings = MyDump.LoadMovieLensData(loader)
    evaluator = Evaluator(evaluationData, rankings, loader)

    evaluator.AddAlgorithm(RBMAlgorithm(), "AutoRec")

    # Random baseline.
    evaluator.AddAlgorithm(NormalPredictor(), "Random")

    evaluator.Evaluate(doTopN=False, load=loader)

    evaluator.SampleTopNRecs(ml, loader)
Exemple #22
0
def CompareSVDRandom():
    """Compare an SVD recommender with a random baseline, logging how
    long each stage takes."""
    np.random.seed(0)
    random.seed(0)
    start_t = time.time()

    # Shared data set for the recommender algorithms.
    _, evaluationData, rankings = MyDump.LoadMovieLensData(loader)
    print(
        f'------time consumption: {time.time() - start_t} for LoadMovieLensData()'
    )
    start_t = time.time()

    # Construct an Evaluator over the shared data.
    evaluator = Evaluator(dataset=evaluationData,
                          rankings=rankings,
                          load=loader)

    print(
        f'------time consumption: {time.time() - start_t} for create an evaluator instance'
    )
    start_t = time.time()

    # SVD recommender plus a random baseline.
    evaluator.AddAlgorithm(SVD(random_state=10), "SVD")
    evaluator.AddAlgorithm(NormalPredictor(), "Random")

    start_t = time.time()
    evaluator.Evaluate(True, loader)  # doTopN, loader
    print(
        f'------time consumption: {time.time() - start_t} for evaluator.Evaluate()'
    )
    start_t = time.time()
Exemple #23
0
def matrix_factorization_param(data_cv):
    """Benchmark matrix-factorization-style algorithms on ``data_cv``,
    then grid-search SVD hyperparameters and return the best estimator
    (by RMSE)."""
    # Iterate over all algorithms
    benchmark = []

    for algorithm in [
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            NormalPredictor(),
            CoClustering()
    ]:
        # Perform cross validation
        results = model_selection.cross_validate(algorithm,
                                                 data_cv,
                                                 measures=['RMSE', 'MAE'],
                                                 cv=5,
                                                 verbose=False)
        # Mean fold scores plus the algorithm's class name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
        benchmark.append(tmp)

    # NOTE(review): sorted by test_mae although the variable is named
    # rmse -- kept as-is; confirm which metric was intended.
    rmse = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_mae')

    # Hyperparameter grid for SVD.
    param_grid = {
        'n_factors': [100, 150, 200],
        'n_epochs': [20, 40],
        'lr_all': [0.001, 0.005, 0.008],
        'reg_all': [0.075, 0.1, 0.15]
    }
    algorithm_gs = model_selection.GridSearchCV(SVD,
                                                param_grid,
                                                measures=['rmse'],
                                                cv=5,
                                                n_jobs=-1)
    algorithm_gs.fit(data_cv)

    # Best parameters for the model with the lowest RMSE.
    best_algo = algorithm_gs.best_estimator['rmse']
    return best_algo
Exemple #24
0
    def __init__(self, df, algo='KNN', user_based=False):
        """Wrap a Surprise model chosen by name ('KNN', 'SVD', 'SVD++'
        or 'Random') around the given ratings dataframe."""
        self.df = df
        self.algo = algo
        self.user_based = user_based

        reader = Reader(line_format='user item rating')
        self.eval_data = EvaluationData(
            Dataset.load_from_df(df=self.df, reader=reader))

        # KNN needs sim_options; the rest are plain constructors.
        # An unrecognized name leaves self.model unset (as before).
        builders = {'SVD': SVD, 'SVD++': SVDpp, 'Random': NormalPredictor}
        if self.algo == 'KNN':
            self.model = KNNBasic(sim_options={'name': 'cosine',
                                               'user_based': self.user_based})
        elif self.algo in builders:
            self.model = builders[self.algo]()
Exemple #25
0
def MF():
    """Compare SVD and SVD++ (latent-factor models) against a random
    baseline on MovieLens; the idea behind them is math -- latent
    features -- with the implementation provided by the library."""
    np.random.seed(0)
    random.seed(0)
    ml, evaluationData, rankings = MyDump.LoadMovieLensData(loader)
    evaluator = Evaluator(evaluationData, rankings, loader)

    evaluator.AddAlgorithm(SVD(random_state=10), "SVD")  # the same with before
    evaluator.AddAlgorithm(SVDpp(random_state=10), "SVDpp")
    evaluator.AddAlgorithm(NormalPredictor(), "Random")

    evaluator.Evaluate(doTopN=False, load=loader)
    evaluator.SampleTopNRecs(ml, loader)
Exemple #26
0
def surprise_bench(df):
    """
    Creates benchmark dataframe of SVD, NMF, NormalPredictor, and Baseline
    with 7-fold cross validation on Scottsdale reviews and returns the
    RMSE/MAE metrics sorted by test RMSE.
    """
    from surprise import (SVD, SVDpp, NMF, NormalPredictor, BaselineOnly)

    from surprise import Dataset
    from surprise import Reader

    from surprise.model_selection.validation import cross_validate
    from surprise import accuracy

    data = df[['user_id', 'business_id',
               'average_stars']].loc[df.city == 'Scottsdale']

    reader = Reader()

    data = Dataset.load_from_df(data, reader)
    benchmark = []

    # Iterate over all algorithms
    for algorithm in [
            SVD(n_factors=10),
            NMF(n_factors=10),
            NormalPredictor(),
            BaselineOnly()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 cv=7,
                                 verbose=False)

        # Mean fold scores plus the algorithm's class name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
        benchmark.append(tmp)

    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
Exemple #27
0
def main():
    """Benchmark several Surprise algorithms on the filtered book
    ratings, print precision/recall/F1 for each, and return the
    collected metrics keyed by algorithm class name."""
    book_df = pd.read_csv("../../data/processed/filtered_ratings.csv")
    # Reader object and rating scale specification
    book_df = book_df.drop('Unnamed: 0', axis=1)
    reader = Reader(rating_scale=(1, 5))
    # Load data
    data = Dataset.load_from_df(book_df[["user_id", "book_id", "rating"]],
                                reader)

    # Spilt data into train and test sets
    train_set, test_set = train_test_split(data, test_size=0.20)

    algorithm_list = [
        NormalPredictor(),
        BaselineOnly(),
        KNNWithZScore(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBaseline(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)),
        SVDpp(),
        SVD(),
        NMF()
    ]

    # Fix: the results dict was created but never filled; collect the
    # per-algorithm metrics so they can be inspected by the caller.
    results = {}
    for algo in algorithm_list:
        rmse, preci, recall, f1 = basic_model_based(train_set, test_set, algo)
        results[type(algo).__name__] = {'rmse': rmse, 'precision': preci,
                                        'recall': recall, 'f1': f1}
        print("Algorithm:", algo, preci, recall, f1)
        print(
            "**------------------------------------------------------------------------------------------**"
        )
    return results
Exemple #28
0
    def checkBestAlgorithm(self):
        """Cross-validate a panel of algorithms on the CSV ratings and
        return the (algorithm, rmse) pair with the lowest RMSE, printing
        the full leaderboard along the way."""
        self.df = pd.read_csv(csv_name)
        reader = Reader(rating_scale=(1, 10))
        data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                    reader)
        benchmark = []
        rmseTuple = []
        # Iterate over every candidate algorithm.
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # 3-fold cross-validation on RMSE.
            results = cross_validate(algorithm,
                                     data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Record the mean scores plus the algorithm's class name.
            tmp = pd.DataFrame.from_dict(results).mean(axis=0)
            rmseTuple.append((algorithm, tmp['test_rmse']))
            name = str(algorithm).split(' ')[0].split('.')[-1]
            # pd.concat replaces Series.append, removed in pandas 2.0.
            tmp = pd.concat([tmp, pd.Series([name], index=['Algorithm'])])
            benchmark.append(tmp)
        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
        print("\n")
        # Ascending by RMSE: the winner ends up first.
        rmseTuple.sort(key=lambda x: x[1])

        print("Best algorithm : ")
        # Derive a printable name from the winning tuple's repr.
        print(str(rmseTuple[0]).split(' ')[0].split('.')[-1])
        return rmseTuple[0]
Exemple #29
0
def contentGive(id):
    """Evaluate a ContentKNN recommender against a random baseline and
    sample top-N recommendations for the given id."""
    np.random.seed(0)
    random.seed(0)

    # Shared data set for the recommender algorithms.
    ml, evaluationData, rankings = LoadMovieLensData()

    # Construct an Evaluator over the shared data.
    evaluator = Evaluator(evaluationData, rankings)

    evaluator.AddAlgorithm(ContentKNNAlgorithm(), "ContentKNN")

    # Random baseline.
    evaluator.AddAlgorithm(NormalPredictor(), "Random")

    evaluator.Evaluate(False)

    evaluator.SampleTopNRecs(ml, id)
Exemple #30
0
def RBMtest():
    """Evaluate an RBM recommender (20 epochs) against a random
    baseline on the MovieLens data set."""
    np.random.seed(0)
    random.seed(0)

    ml, evaluationData, rankings = MyDump.LoadMovieLensData(loader)

    # Construct an Evaluator over the shared data.
    evaluator = Evaluator(evaluationData, rankings, loader)

    # RBM; more parameter combinations could be tuned here.
    evaluator.AddAlgorithm(RBMAlgorithm(epochs=20), "RBM")

    # Random baseline.
    evaluator.AddAlgorithm(NormalPredictor(), "Random")

    evaluator.Evaluate(doTopN=False, load=loader)

    evaluator.SampleTopNRecs(ml, loader)