Example #1
def temp1():
    """Train an SVD recommender on the ratings file and recommend movies.

    Reads ``../data/movies.csv`` and ``../data/ratings.csv``, fits a Surprise
    ``SVD`` model on 75% of the ratings, prints the RMSE measured on the
    remaining 25%, and then predicts ratings for user 1.

    Returns:
        A DataFrame indexed by movie title with a ``ratings`` column holding
        the ten highest predicted ratings among the movies user 1 has not
        rated, or ``None`` when user 1 is absent from the filtered ratings.
    """
    import pandas as pd
    from surprise import Dataset, Reader, SVD, accuracy
    from surprise.model_selection import train_test_split

    movies = pd.read_csv('../data/movies.csv')
    ratings = pd.read_csv('../data/ratings.csv')
    # NOTE(review): this keeps a single rating row per movie, which leaves
    # very little signal for collaborative filtering. Kept as in the original;
    # confirm that it is intended.
    ratings = ratings.drop_duplicates('movieId')

    # TODO: appending the ratings of an extra user before training was left
    # unfinished in the original and is not implemented here.

    # Keep only users that still have at least 55 ratings.
    reader = Reader(rating_scale=(1, 5))
    ratings_f = ratings.groupby('userId').filter(lambda x: len(x) >= 55)
    known_users = set(ratings_f.userId.unique().tolist())
    # Movie title -> movieId lookup used to label the predictions.
    Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))
    data = Dataset.load_from_df(ratings_f[['userId', 'movieId', 'rating']],
                                reader)

    # Train SVD on 75% of the known ratings; the model must be fitted before
    # test() or predict() can be used.
    trainset, testset = train_test_split(data, test_size=.25)
    algorithm = SVD()
    algorithm.fit(trainset)
    predictions = algorithm.test(testset)

    # Check the accuracy using Root Mean Square Error (printed by surprise).
    accuracy.rmse(predictions)

    def pred_user_rating(ui):
        """Return the 10 best predicted unseen movies for raw user id ``ui``."""
        if ui not in known_users:
            print("User Id does not exist in the list!")
            return None

        # Set membership keeps the "already rated" filter O(1) per movie.
        seen = set(ratings_f[ratings_f.userId == ui].movieId.tolist())
        predictedL = [
            (title, algorithm.predict(ui, movie_id).est)
            for title, movie_id in Mapping_file.items()
            if movie_id not in seen
        ]
        pdf = pd.DataFrame(predictedL, columns=['movies', 'ratings'])
        pdf.sort_values('ratings', ascending=False, inplace=True)
        pdf.set_index('movies', inplace=True)
        return pdf.head(10)

    user_id = 1
    return pred_user_rating(user_id)
Example #2
# data = surprise.Dataset.load_builtin('ml-100k')
# print(data)
# #data.split(n_folds=2)  # split data for 2-folds cross validation

# algo = SVD_SGD(learning_rate=.01, n_epochs=10, n_factors=10)
# cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
import pandas as pd
import numpy as np
import surprise # run 'pip install scikit-surprise' to install surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
# Read the review table from disk.
total_review_df = pd.read_csv("../data/total_review_df.csv")
# Wrap the (user, item, rating) columns in a Surprise dataset; the reader only
# supplies the rating scale here.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(total_review_df[["user_name","res_id","rating"]],reader)

# Use the SVD matrix-factorisation algorithm
algo = SVD()
# Run 10-fold cross-validation (cv=10) and print RMSE / MAE for every fold
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)



# Predict a single rating; the third argument (5) is passed as the true rating.
# NOTE(review): users come from the "user_name" column and items from "res_id",
# yet uid is an integer and iid looks like a name - the two may be swapped; confirm.
uid = 1
iid = "형석"
pred = algo.predict(uid,iid,5)
# !/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: lishuang
@description: Predict MovieLens ratings with neighbourhood-based collaborative filtering, evaluated with K-fold cross-validation
"""

from surprise import KNNWithZScore, Reader, Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# Load the data. The ratings file is a CSV, so the reader needs the comma
# separator, and the first line is skipped as a header.
# NOTE(review): `sep` and `skip_lines` were missing from the truncated call and
# are reconstructed from the file name; confirm against the data file.
reader = Reader(line_format='user item rating timestamp',
                sep=',', skip_lines=1)
data = Dataset.load_from_file('data/ratings.csv', reader)

# Item-based CF (user_based=False) scoring.
# Only the k most similar neighbours are used when computing a prediction.
algo = KNNWithZScore(k=40,
                     sim_options={
                         'user_based': False,
                         'verbose': 'True'
                     })

kf = KFold(n_splits=3)

for train_set, test_set in kf.split(data):
    # The model has to be (re)fitted on each training fold before it is tested.
    algo.fit(train_set)
    pred = algo.test(test_set)
    rmse = accuracy.rmse(pred, verbose=True)
import os
from surprise import Dataset, Reader

# Path of the ratings file
file_path = os.path.expanduser('ml-100k/u.data')
# Tell the reader how each line of the text file is laid out (tab-separated)
reader = Reader(line_format='user item rating timestamp', sep='\t')
# Load the data
data2 = Dataset.load_from_file(file_path, reader=reader)
trainset = data2.build_full_trainset()  # build a Trainset, which exposes the counts printed below
print('user count: ', trainset.n_users)  # number of users
print('item count', trainset.n_items)  # number of items (movies)
Example #5
def svd_impute(order_with_na):
    """Fill missing order amounts with SVD predictions.

    ``order_with_na`` is a wide table whose rows are attributes and whose
    columns are time slots; NaN cells mark the intervals to impute
    (temporary sold-out periods, per the original notes).

    * train set: the cells with an observed amount - used to fit SVD.
    * test set: the NaN cells - predicted by the fitted SVD.

    Returns:
        ``order_with_na`` itself when there is nothing to impute or fewer
        than 5 observed cells; otherwise a wide table (index ``ATTR_KEY``,
        columns ``TIME_KEY``) with the gaps filled by rounded predictions
        and the all-zero attributes restored as zero rows.
    """
    # Attributes whose every value is 0 are set aside and restored as zero
    # rows at the very end.
    is_all_zero = (order_with_na == 0).all(axis=1)
    all_zero_attributes = is_all_zero[is_all_zero].index
    order_with_na_ = order_with_na.drop(all_zero_attributes)

    # Surprise needs long (attribute, time, amount) rows.
    # NOTE(review): the reshaping expression was truncated in the source; it is
    # reconstructed from how the long table is used below (column order
    # ATTR_KEY, TIME_KEY, ORD_AMT_KEY and the final pivot). Confirm.
    sold_per_minute_long_reshape = (order_with_na_
                                    .rename_axis(index=ATTR_KEY)
                                    .reset_index()
                                    .melt(id_vars=ATTR_KEY,
                                          var_name=TIME_KEY,
                                          value_name=ORD_AMT_KEY))

    # Nothing to do when there is no missing cell.
    sold_out_filter = sold_per_minute_long_reshape[ORD_AMT_KEY].isna()
    if not sold_out_filter.any():
        return order_with_na

    # Too few observed cells to run cross-validation on.
    train_set = sold_per_minute_long_reshape.dropna().copy()
    if train_set.shape[0] < 5:
        return order_with_na

    test_set = sold_per_minute_long_reshape[sold_out_filter].copy()

    # Convert to the surprise data type; amounts play the role of ratings.
    max_value = train_set[ORD_AMT_KEY].max()
    train_set_svd_object = Dataset.load_from_df(
        train_set, Reader(rating_scale=(0, max_value)))

    # Candidate numbers of latent factors, spread between ~1/5 of the number
    # of attributes and (number of attributes - 1).
    num_of_factor_candidates = 5
    num_of_attr = train_set[ATTR_KEY].unique().shape[0]
    n_factor_min = (num_of_attr // num_of_factor_candidates) + 1
    n_factor_max = num_of_attr - 1
    factor_candidates = np.linspace(start=n_factor_min, stop=n_factor_max,
                                    num=num_of_factor_candidates)
    n_factors_grid_search_pool = np.unique(np.floor(factor_candidates)).astype(int)

    grid_search_pool = {'reg_all': [0],
                        'lr_all': [0.003, 0.001],
                        'n_factors': n_factors_grid_search_pool,
                        'n_epochs': [15, 22, 30],
                        'biased': [False]}
    error_measure = 'mae'

    def _grid_search(cv):
        """Fit a grid search with ``cv`` folds and return it."""
        searcher = GridSearchCV(SVD, grid_search_pool, measures=[error_measure],
                                cv=cv, n_jobs=N_CORE)  # n_jobs: parallel compute
        searcher.fit(train_set_svd_object)
        return searcher

    # NOTE(review): the source built a second searcher with cv=2 in an indented
    # block whose condition was lost, and never called fit(). It is
    # reconstructed here as a fallback when the 5-fold search cannot be run;
    # confirm the intended trigger.
    try:
        grid_searcher = _grid_search(cv=5)
    except ValueError:
        grid_searcher = _grid_search(cv=2)

    best_param = grid_searcher.best_params[error_measure]

    # Refit on every observed cell with the best parameters, then predict
    # each missing cell.
    svd = SVD(**best_param).fit(train_set_svd_object.build_full_trainset())
    test_set[ORD_AMT_KEY] = [
        svd.predict(attr, time_slot).est
        for attr, time_slot in zip(test_set[ATTR_KEY], test_set[TIME_KEY])
    ]

    filled_sold_per_minute_long_shape = pd.concat([train_set, test_set]).round(0)
    imputed = filled_sold_per_minute_long_shape.pivot(
        index=ATTR_KEY, columns=TIME_KEY, values=ORD_AMT_KEY)

    # Put the all-zero attributes back.
    for attribute in all_zero_attributes:
        imputed.loc[attribute, :] = 0

    return imputed
Example #6
    def train_collaborative_filtering(self, grid_search=False, gs_params=None):
        #transform page list in single value
        analytics_df_SVD = self.analytics_df.copy()
        analytics_df_SVD['ranking'] = analytics_df_SVD[[
            'totals.pageviews', 'totals.timeOnSite'
        ]].apply(lambda x: self.__generate_ranking(x), axis=1)
        analytics_df_SVD = analytics_df_SVD['pages_visited'].apply(lambda x: pd.Series(eval(x)))\
         .join(analytics_df_SVD[['visitId','ranking']], how='left')
        analytics_df_SVD = analytics_df_SVD.dropna()
        analytics_df_SVD = analytics_df_SVD[['visitId', 'ranking', 'pageId']]
        analytics_df_SVD['pageId'] = analytics_df_SVD['pageId'].apply(
            lambda x: int(x))

        # Saves Matrix for later use

        # A reader is still needed but only the rating_scale param is requiered.
        reader = Reader(rating_scale=(1, 4))

        # The columns must correspond to user id, item id and ratings (in that order).
        data = Dataset.load_from_df(
            analytics_df_SVD[['visitId', 'pageId', 'ranking']], reader)

        trainset, testset = train_test_split(data, test_size=.1)

        # If user desires to use GridSearch to find best params and algo
        if grid_search:
            if (not gs_params):
                param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],\
                            'reg_all': [0.08, 0.1, 0.15]}
                param_grid = gs_params
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            algo = gs.best_estimator['rmse']

        ## Comment next lines if you are searching the best params
        # We can now use this dataset as we please, e.g. calling cross_validate
            algo = SVD(n_factors=110, n_epochs=110, lr_all=0.008, reg_all=0.15)

                       measures=['RMSE', 'MAE'],

        test_pred = algo.test(testset)
        print("SVD : Test Set")
        accuracy.rmse(test_pred, verbose=True)

        # Dump algorithm
        print('Saving trained algo...', end=" ")
        algo_list = glob.glob('state/algo_*')
        file_name = 'state/algo_' + datetime.datetime.now().strftime(
        dump.dump(file_name, algo=algo)
        for file in algo_list:
Example #7
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]

        # * Precision at K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # * Recall at K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

# * Reader for the imported CSV: comma-separated, one header line to skip
reader = Reader(
    line_format="user item rating timestamp", sep=",", rating_scale=(1, 5), skip_lines=1
)
# * Loading the CSV
data = Dataset.load_from_file(
    file_path="../../ML_Dataset/ml-latest-small/ratings.csv", reader=reader
)
# * Dividing into train (75%) and test (25%) sets
trainset, testset = train_test_split(data, test_size=0.25)

# * Define a 5-fold cross-validation iterator
kf = KFold(n_splits=5)

# * Choosing Slope One as algorithm
algo = SlopeOne()

# * Train the algorithm on the trainset, and predict ratings for the testset
Example #8
import pandas as pd
from surprise import Reader
from surprise import Dataset
import time
import matplotlib.pyplot as plt
import psutil

# The snippet only imports names *from* surprise, so `surprise.KNNBasic` was a
# NameError; `surprise.evaluate` has also been removed from the library in
# favour of `model_selection.cross_validate`.
from surprise import KNNBasic
from surprise.model_selection import cross_validate

timex = []  # wall-clock seconds per experiment
mem = []
m1 = psutil.virtual_memory().percent  # memory usage (%) before the first run

# First dataset
start = time.time()
df1 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million1.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader)
algo = KNNBasic()
# Cross-validated RMSE; verbose=True keeps the per-fold report on stdout.
result1 = cross_validate(algo, data, measures=['RMSE'], verbose=True)
end = time.time()
print("Time1", end - start)
timex.append(end - start)
m2 = psutil.virtual_memory().percent  # memory usage (%) after the first run

# Second dataset
start = time.time()
df2 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million2.csv',
                  dtype={'rating': float})

import numpy as np
import pprint as pp
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise import NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering
from surprise.model_selection import RandomizedSearchCV
import datetime

start = datetime.datetime.now(
)  # Start time, used to measure how long selecting the estimators takes
reader = Reader(sep=',', skip_lines=1)
# NOTE(review): no file path is passed to load_from_file here; the argument
# appears to have been lost - confirm and restore it.
data = Dataset.load_from_file(
    reader=reader)  # Import the dataset
similarity_options = {
    'name': ['pearson_baseline'],
    'user_based': [True, False]
}  # Similarity options for the randomized search: Pearson baseline, user- and item-based
baseline_predictor_options = {
    'method': ['sgd'],
    'learning_rate': [0.002, 0.005, 0.01],
    'n_epochs': [50, 100, 150],
    'reg': [0.01, 0.02, 0.05]
}  # Baseline predictor options for the randomized search
grid_of_parameters = {
    parser = argparse.ArgumentParser()
                            'NormalPredictor', 'BaselineOnly', 'KNNBasic',
                            'KNNWithMeans', 'KNNWithZScore', 'KNNBaseline',
                            'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering'
    args = parser.parse_args()

    train_path = path + '/Data/train_format.txt'

    train_reader = Reader(line_format='user item rating timestamp',
                          rating_scale=(0, 5))
    trainset = Dataset.load_from_file(train_path, reader=train_reader)
    trainset = trainset.build_full_trainset()

    if args.model == 'NormalPredictor':
        model = surprise.NormalPredictor()
    elif args.model == 'BaselineOnly':
        model = surprise.BaselineOnly()
    elif args.model == 'KNNBasic':
        model = surprise.KNNBasic()
    elif args.model == 'KNNWithMeans':
        model = surprise.KNNWithMeans()
    elif args.model == 'KNNWithZScore':
        model = surprise.KNNWithZScore()
    elif args.model == 'KNNBaseline':
Example #11
def train_model():
    # Request handler: trains the requested model on the uploaded (or the
    # server-side) dataset, dumps it under ./model and answers either with
    # the zipped model or with a JSON summary of the error metrics.
    # NOTE(review): several lines of this function appear to be missing
    # (flagged below); it does not parse as written.

    # Send request to Nodejs server for authentication.
    if not is_good_request(request):
        return abort(400)

    # Extract data from request.
    # NOTE(review): unpacking data.values() depends on the key order of the
    # JSON body; reading the fields by key would be safer.
    data = request.get_json()
    dataset, data_header, model_name, params, train_type, save_on_server, save_on_local = data.values()

    # if not is_header_valid(data_header):
    #     return jsonify({'message': '[ERROR] Incorrect dataset format.'})

    # Use the data uploaded or data on server.
    df = pd.DataFrame(dataset, columns=data_header) if dataset else pd.read_csv('./data/data-new.csv', header=0)
    # NOTE(review): the `try:` that pairs with the `except ValueError` below
    # appears to be missing before the next line.
        train_set, test_set = build_train_test(df, Reader(), full=train_type == 'full')
    except ValueError:
        return jsonify({'error': 'Incorrect dataset format.'})

    if model_name == 'insvd':
        n_factors, n_epochs, lr_all, reg_all, random_state = params.values()
        # NOTE(review): ALLOWED_EXTENSIONS is not used anywhere in this
        # function; it looks unrelated to the training logic.
        ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}
        # Parse data types.
        n_factors = int(n_factors)
        n_epochs = int(n_epochs)
        lr_all = float(lr_all)
        reg_all = float(reg_all)
        random_state = int(random_state)
        model = InSVD(n_factors=n_factors, n_epochs=n_epochs,
                      lr_all=lr_all, reg_all=reg_all, random_state=random_state)
        # NOTE(review): the branch header for the KNN model appears to be
        # missing here, and the KNNBasic(...) call below is never closed.
        k, sim_options, random_state = params.values()
        model = KNNBasic(k=int(k), sim_options={'name': sim_options, 'user_based': False},

    # Fitting and testing.
    # NOTE(review): no fit() call is visible before test(); presumably the
    # model is meant to be fitted on train_set first - confirm.
    predictions = model.test(test_set)

    # Add suffix if not save on server.
    model_path = f'./model/{model_name}' if save_on_server else f'./model/{model_name}-temp'

    # Save.
    dump.dump(model_path, algo=model, predictions=predictions)
    # NOTE(review): the model_info dict literal below is never closed.
    model_info = {
        'rmse': rmse(predictions),
        'mae': mae(predictions),

    # Zip the trained model.
    # NOTE(review): the `try:` for the `except FileNotFoundError` below and the
    # statements that write into / close the archive appear to be missing.
        zip_obj = ZipFile(f'{model_path}.zip', 'w')
    except FileNotFoundError:
        return abort(404)

    # Clean-up hook that receives and returns the response.
    # NOTE(review): the deletion statements and whatever registers this hook
    # appear to be missing.
    def remove_dump_files(response):
        # If not save model on server, delete model dump file.
        if not save_on_server:

        # Always delete the .zip file.

        return response

    # When the client asked for a local copy, send the zip as an attachment
    # and carry the metrics in the X-Model-Info header.
    if save_on_local:
        with open(f'{model_path}.zip', 'rb') as f:
            model_zip = f.readlines()

        resp = Response(model_zip)
        resp.headers['X-Model-Info'] = json.dumps(model_info)
        resp.headers['Content-Type'] = 'application/zip'
        resp.headers['Content-Disposition'] = 'attachment; filename=%s;' % 'model.zip'

        return resp
        # return Response(model_zip, headers={
        #     'X-Info': json.dumps(model_info),
        #     'Content-Type': 'application/zip',
        #     'Content-Disposition': 'attachment; filename=%s;' % 'model.zip',
        # })

        # return send_from_directory('./model', model_file), 200

    # Otherwise answer with the metrics only.
    return jsonify(model_info)
Example #12
merged_data = result[[
    userID_column, itemID_column, itemName_column, ratings_column
# -


# # process data

# +
from surprise import Dataset
from surprise import Reader

data = merged_data[['user_id', 'book_id', 'rating']]
reader = Reader(rating_scale=(0.5, 4.5))
data = Dataset.load_from_df(data, reader)

popularity_rankings = merged_data['book_id'].value_counts()
rankings = pd.Series(range(1,
                           len(popularity_rankings) + 1, 1),

processed_data = ProcessData(data, rankings)
# Train on leave-One-Out train set
trainSet = processed_data.GetLOOCVTrainSet()
testSet = processed_data.GetLOOCVTestSet()
# -

# # run test
Example #13
'''Testing renaming of train() into fit()'''
import os

import pytest

from surprise import Dataset
from surprise import Reader
from surprise import AlgoBase
from surprise.model_selection import KFold

data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
data = Dataset.load_from_file(data_file, Reader('ml-100k'))
kf = KFold(n_splits=2)

def test_new_style_algo():
    '''Test that new algorithms (i.e. algoritms that only define fit()) can
    support both calls to fit() and to train()
    - algo.fit() is the new way of doing things
    - supporting algo.train() is needed for the (unlikely?) case where a user
    has defined custom tools that use algo.train().
    class CustomAlgoFit(AlgoBase):
        def __init__(self):
            self.cnt = -1

        def fit(self, trainset):

            AlgoBase.fit(self, trainset)
            self.est = 3
def main():
    """Run the Surprise KNN and SVD experiments on the four rating datasets.

    Four variants of the data are processed in turn: the basic ratings file
    and the versions restricted to users and movies with more than 10, 20
    and 50 ratings. For each variant the ratings are split into train and
    test parts, then for KNN and for SVD:

    1. Surprise's own grid search picks the best parameters,
    2. a manual grid search writes every evaluated combination to an
       ``.xlsx`` file, and
    3. the model is run with the best parameters and written to a ``.csv``.
    """
    # (input path, suffix appended to the output file names)
    datasets = (
        (DATA_PATH, ""),
        (DATA_PATH_10, "10"),
        (DATA_PATH_20, "20"),
        (DATA_PATH_50, "50"),
    )

    for i, (path, suffix) in enumerate(datasets):
        # Only the basic file is raw CSV; the filtered variants are Excel sheets.
        userId, movieId, rating = _load_ratings(path, from_excel=i > 0)

        # Indices that split_data shuffles before splitting.
        indices_to_shuffle = np.arange(len(userId))

        # Per the original comment, 70% of the data goes to the train part.
        test_ratio = 70
        (X_train_userId, X_train_movieId, X_train_rating,
         X_test_userId, X_test_movieId, X_test_rating) = split_data(
            indices_to_shuffle, userId, movieId, rating, test_ratio)

        data_train = _build_surprise_dataset(
            X_train_userId, X_train_movieId, X_train_rating)
        data_test = _build_surprise_dataset(
            X_test_userId, X_test_movieId, X_test_rating)

        # KNN grid (slow to explore in full). Best values noted in the
        # original: n_epochs=5, reg_u=5, reg_i=5.
        n_epochs = [5, 10, 15]
        reg_us = [5, 10, 15, 20]
        reg_is = [5, 10, 20]

        perf_knn_grid = grid_search_knn_surprise(data_train, n_epochs, reg_us, reg_is)
        # Manual grid search so every value is visible (Surprise's GridSearch
        # only reports the best result).
        grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is,
                        'surpise_manualGS_KNN' + suffix + '.xlsx')
        # KNN with the best params from Surprise's grid search.
        knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"],
                     perf_knn_grid["reg_i"], "surprise_bestKNN" + suffix + ".csv")

        # SVD grid (slow to explore in full). Best values noted in the
        # original: n_epochs=10, lr_all=0.00147, reg_all=0.2, init_mean=0.2,
        # n_factors=80.
        n_epochs = [5, 10]
        lr_alls = [0.00145, 0.00146, 0.00147]
        reg_alls = [0.2, 0.3]
        init_mean = [0, 0.2]
        n_factors = [80, 100, 120]

        perf_svd_grid = grid_search_svd_surprise(
            data_train, n_epochs, lr_alls, reg_alls, init_mean, n_factors)
        # NOTE(review): the output-file argument of this call was truncated in
        # the source; the name follows the KNN naming pattern - confirm.
        grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls,
                        init_mean, n_factors,
                        'surpise_manualGS_SVD' + suffix + '.xlsx')
        # SVD with the best params from Surprise's grid search.
        svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"],
                     perf_svd_grid["n_epochs"], perf_svd_grid["lr_all"],
                     perf_svd_grid["n_factors"], "surprise_bestSVD" + suffix + ".csv")


def _load_ratings(path, from_excel):
    """Return the (userId, movieId, rating) columns stored at ``path``."""
    if from_excel:
        data_import = pd.read_excel(path, index_col=0, header=0)
        return data_import["userId"], data_import["movieId"], data_import["rating"]
    data_import = np.genfromtxt(path, delimiter=",", skip_header=1, dtype=str)
    return construct_data(data_import)


def _build_surprise_dataset(user_ids, movie_ids, ratings):
    """Wrap parallel id/rating sequences in a Surprise dataset on a 1-5 scale."""
    frame = pd.DataFrame({'itemID': movie_ids,
                          'userID': user_ids,
                          'rating': ratings})
    # Surprise expects the columns in (user, item, rating) order.
    return Dataset.load_from_df(frame[['userID', 'itemID', 'rating']],
                                Reader(rating_scale=(1, 5)))
    `top_n:` a dictionary of the top k reccommendations for a given user

    `user:` internal user id(uid) used in the datasets like in ml-latest-parsed.csv

        a list containing the top k reccomendations for the given user
    matches = [[iid for (iid, _) in user_ratings]
               for uid, user_ratings in top_n.items() if uid == user]
    return matches

# Load the parsed ratings file.
# NOTE(review): the Reader(...) call was truncated after `rating_scale`; the
# comma separator is assumed from the .csv file name - confirm.
reader = Reader(line_format='user item rating timestamp',
                rating_scale=(0.5, 5),
                sep=',')
data = Dataset.load_from_file('./ml-latest-parsed.csv', reader=reader)

# Train on every known rating ...
trainset = data.build_full_trainset()

# ... and predict every (user, item) pair that has no rating yet.
testset = trainset.build_anti_testset()

#trainset, testset = train_test_split(data, test_size=.3)

# User-based KNN with cosine similarity.
print("Creating Model")
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 2}
algo = KNNBasic(k=40, min_k=2, sim_options=sim_options)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb  4 00:08:44 2019

@author: abhijithneilabraham
"""
from surprise import KNNBasic
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
import pandas as pd
# Load the table; the columns used below are 'id', 'male_or_female' and
# 'question1' ... 'question25'.
customer = pd.read_csv('names.csv')

# With load_from_df only the rating scale of the reader is relevant.
reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=',')
fieldnames = ['id', 'male_or_female']

# One dataset per question: the question column is temporarily appended as
# the third (rating) column, then removed again for the next iteration.
for i in range(25):
    fieldnames.insert(2, 'question' + str(i + 1))

    # Column order is (user, item, rating) = (id, male_or_female, questionN).
    data = Dataset.load_from_df(customer[fieldnames], reader)
    del fieldnames[2]
    trainset = data.build_full_trainset()

    algo = KNNBasic()
Example #17
mtarix_toGO['Norm_Tot_Amnt'] = (mtarix_toGO['Mean_amount'] - min_amt) / max_amt
#lower_bound = min(mtarix_toGO['Log_Mean_Amount'])
#upper_bound = max(mtarix_toGO['Log_Mean_Amount'])
#print lower_bound
#print upper_bound
# Remove the outliers
dfx = mtarix_toGO[mtarix_toGO['Norm_Tot_Amnt'] <= 0.4]
lower_bound = min(dfx['Norm_Tot_Amnt'])
upper_bound = max(dfx['Norm_Tot_Amnt'])
print 'Lower Bound normalized spending =', lower_bound
print 'Upper Bound normalized spending =', upper_bound
print 'Number of Transactions remaining after removing Outliers::', mtarix_toGO.shape[

#define the reader  with  upper and lower bounds , also now we are predicting Normalized Total Amount column
reader_x = Reader(rating_scale=(lower_bound, upper_bound))
data = Dataset.load_from_df(
    df=dfx[['CustomerID', 'StockCode', 'Norm_Tot_Amnt']], reader=reader_x)

#for i in range(9):
#    print (data.raw_ratings[0][2] - data.df['Log_Mean_amount'][0])

print 'difference in processed and pre-processed dataset = ', (
    data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'][0])

import time
start_time = time.time()

param_grid = {
    'n_factors': [2, 5, 10, 50],
    'n_epochs': [10, 50, 100],
Example #18
from surprise import Reader, Dataset
from surprise import KNNBasic, evaluate
import csv

reader = Reader(line_format='user item rating', sep=';', rating_scale=(-1, 3))

print("Loading data...")
data = Dataset.load_from_file('./tost/u.data', reader=reader)

#Build the trainset
trainset = data.build_full_trainset()

#Define similarities options
#sim_options = {
#    'name': 'cosine',
#    'user_based': True  # compute  similarities between items

print("Training data...")
#Build an algo, and train it
algo = KNNBasic()
print("> OK\n")

#Get predicitions
print("Predictions :")
ratings = vars(data).get('raw_ratings')

with open('tost/u.results', "wb") as csv_file:
Example #19
def predict(user_id, topk):
    """Score the movies a user liked with an SVD model and return the best.

    Loads ``../data/data.dat``, plots the rating distribution, drops movies
    and users whose number of ratings is below the 80th percentile, fits a
    Surprise ``SVD`` on the remaining ratings and estimates a score for
    every retained movie that ``user_id`` rated above 3.

    Args:
        user_id: raw user id as stored in the ``userid`` column.
        topk: number of rows to return.

    Returns:
        DataFrame with the user's rows plus an ``Estimate_Score`` column,
        sorted by that score in descending order and cut to ``topk`` rows.
    """
    ratings = pd.read_csv("../data/data.dat",
                          names=['userid', 'itemid', 'rating', 'timestamp'])
    ratings['rating'] = ratings['rating'].astype(float)
    print('Dataset 1 shape: {}'.format(ratings.shape))

    ratings.index = np.arange(0, len(ratings))
    print('Full dataset shape: {}'.format(ratings.shape))

    _plot_rating_distribution(ratings)

    # Movies / users with fewer ratings than the 80th percentile are dropped.
    drop_movie_list, movie_benchmark = _rarely_rated(ratings, 'itemid')
    print('Movie minimum times of review: {}'.format(movie_benchmark))
    drop_cust_list, cust_benchmark = _rarely_rated(ratings, 'userid')
    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(ratings.shape))
    df = ratings[~ratings['itemid'].isin(drop_movie_list)]
    df = df[~df['userid'].isin(drop_cust_list)]
    print('After Trim Shape: {}'.format(df.shape))

    # Movies this user rated above 3 that survived the trimming.
    user_some = ratings[(ratings['userid'] == user_id) & (ratings['rating'] > 3)]
    # Round trip through the index: moves 'itemid' to the first column and
    # renumbers the rows.
    user_some = user_some.set_index('itemid').reset_index()
    user_some = user_some[~user_some['itemid'].isin(drop_movie_list)].copy()

    reader = Reader()
    dataset = Dataset.load_from_df(df[['userid', 'itemid', 'rating']], reader)
    trainset = dataset.build_full_trainset()

    # The model has to be fitted before predict() can be used; the original
    # never trained it.
    svd = SVD()
    svd.fit(trainset)

    user_some['Estimate_Score'] = user_some['itemid'].apply(
        lambda x: svd.predict(user_id, x).est)
    user_some = user_some.sort_values('Estimate_Score', ascending=False)

    return user_some.head(topk)


def _plot_rating_distribution(ratings):
    """Draw a horizontal bar chart of how often each rating value occurs."""
    p = ratings.groupby('rating')['rating'].agg(['count'])
    counts = p['count']

    # Null count of the second column ('itemid'), as in the original.
    movie_count = ratings.isnull().sum().iloc[1]
    cust_count = ratings['userid'].nunique() - movie_count
    rating_count = ratings['userid'].count() - movie_count

    ax = p.plot(kind='barh', legend=False, figsize=(15, 10))
    # NOTE(review): the title and label calls were truncated in the source;
    # any styling keyword arguments they carried are lost.
    ax.set_title(
        'Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(
            movie_count, cust_count, rating_count))

    total = counts.sum()
    # Label at most the first five bars with their share of all ratings.
    for i, count in enumerate(counts.iloc[:5], start=1):
        ax.text(count / 4,
                i - 1,
                'Rating {}: {:.0f}%'.format(i, count * 100 / total))


def _rarely_rated(ratings, key):
    """Return (ids rated less often than the 80th percentile, that threshold)."""
    summary = ratings.groupby(key)['rating'].agg(['count', 'mean'])
    summary.index = summary.index.map(int)
    benchmark = round(summary['count'].quantile(0.8), 0)
    return summary[summary['count'] < benchmark].index, benchmark
from surprise import SlopeOne
from surprise import CoClustering
from surprise.model_selection import cross_validate

sys.path.insert(1, './')

from auto_surprise.engine import Engine

if __name__ == '__main__':
    # Benchmark driver: loads the libimseti ratings file and walks over the
    # Surprise algorithm classes listed below.
    print("Starting benchmark")
    # Surprise algorithms to evaluate
    algorithms = (
        SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline,
        CoClustering, BaselineOnly, NormalPredictor,
    )

    # Load dataset
    file_path = os.path.expanduser('../datasets/libimseti/ratings.dat')
    reader = Reader(line_format='user item rating timestamp', sep=',',
                    rating_scale=(1, 10))
    data = Dataset.load_from_file(file_path, reader=reader)

    # One list per reported column; each evaluated algorithm is expected to
    # append one entry to every list.
    benchmark_results = {
        'Algorithm': [],
        'RMSE': [],
        'MAE': [],
        'Best params': [],
        'Time': [],
    }

    # Evaluate Surprise Algorithms
    for algo in algorithms:
        algo_name = algo.__name__

        print("Running algorithm : %s" % algo_name)
Example #21
File: SVD++.py Project: pqz793/Web
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import os

# Load the training ratings from a local file rather than the built-in
# movielens-100k dataset.
#data = Dataset.load_builtin('ml-100k')
file_path = os.path.expanduser("../train1.csv")
# NOTE(review): this reader sets no `sep`, while the test reader below uses
# sep=',' -- confirm the delimiter actually used in train1.csv.
#reader = Reader(line_format="user item rating timestamp", sep=',')
reader = Reader(line_format="user item rating timestamp",
                rating_scale=(0, 5))

file_path1 = os.path.expanduser("../test1.csv")

reader1 = Reader(line_format="user item rating", sep=',')

# Parse each ratings file exactly once.
data = Dataset.load_from_file(file_path, reader=reader)
data1 = Dataset.load_from_file(file_path1, reader=reader1)
trainset = data.build_full_trainset()
#testset = data1.build_full_trainset()
# Use the SVD++ algorithm.
algo = SVDpp()

# Run 5-fold cross-validation and print results.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# Streamlit page fragment: fetch the review data, wrap it for Surprise and
# compute a few summary statistics. `st`, `ds` and `option1` are defined
# outside this excerpt.
#url = st.text_input('Enter the path for the data')
# NOTE(review): this message is written before ds.get_data() is called below.
st.write('The data is loaded')
#data_load_state = st.text('Loading the data')
data = ds.get_data(option1)

#data = ds.get_data(_file_path, 'data/data_subset.csv', 0.99)
#data = ds.get_data('/Users/lalitharahul/Desktop/AutoRecommender/RecServe/sample_us.tsv')
#data = ds.get_data(url)
#data_load_state.text('Data is preprocessed')
# Keep only the three rating columns and rename them to userID / itemID /
# rating before handing the frame to Surprise.
data_surprise = data[['customer_id', 'product_id', 'star_rating']]. \
rename(columns={'customer_id': 'userID', 'product_id': 'itemID', 'star_rating': 'rating'})

reader = Reader(rating_scale=(1.0, 5.0))
df_loaded = Dataset.load_from_df(data_surprise, reader)
#trainset = df_loaded.build_full_trainset()
results_list = []

# features
reviews = data.shape[0]  # number of rows in the review frame
n_users = data.customer_id.nunique()
n_products = data.product_id.nunique()
mean_rating = data.star_rating.mean()
rating_std = data.star_rating.std()
# Reviews as a percentage of all user x product pairs (i.e. how full the
# rating matrix is, despite the name).
sparsity = reviews * 100 / (n_users * n_products)
# Sidebar text box; the returned value is the text typed by the user.
url = st.sidebar.text_input('Enter CustomerID')
#if st.button('Enter'):
# st.write('Entered customer id')
#st.write('The Entered Customer Id is', url)

from __future__ import (absolute_import, division, print_function,
import os

from surprise import BaselineOnly
from surprise import Dataset
from surprise import evaluate
from surprise import Reader

# Evaluate BaselineOnly on the five predefined ml-100k train/test folds.

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
# The '%d' placeholder in each template is filled with the fold number.
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)

# We'll use an algorithm that predicts baseline estimates.
algo = BaselineOnly()

# Evaluate performances of our algorithm on the dataset.
# NOTE(review): evaluate() appears to have been dropped from newer Surprise
# releases in favour of surprise.model_selection.cross_validate -- confirm
# against the installed version.
evaluate(algo, data)
def _dump_clipped_ratings(users, out_path, low=1, high=6):
    """Write ``users`` to ``out_path`` as ``user<TAB>item<TAB>rating`` rows.

    ``users`` maps a user key to a mapping of item key -> raw value. Each raw
    value is converted to ``float`` and clipped into ``[low, high]`` before
    being written with three decimals. Users with no items produce no rows.
    """
    with open(out_path, 'w') as out:
        for user, items in users.items():
            for item, raw in items.items():
                rating = min(max(float(raw), low), high)
                out.write('%s\t%s\t%.03f\n' % (user, item, rating))


def main():
    """Convert the pickled train/test user dicts to rating files and score SVD.

    Steps:
      1. Load ``data/train_users_04.p`` and ``data/test_users_04.p`` and dump
         each one as a tab-separated file with ratings clipped to ``[1, 6]``
         (the same range given to the ``Reader``).
      2. Load the training file with Surprise and fit an ``SVD`` model on the
         full training set.
      3. Parse the test file into ``(user, item, rating)`` tuples and run
         ``algo.test`` on them.

    Returns:
        The list of predictions produced by ``algo.test``.
    """
    # NOTE: pickle executes code on load; the .p files must be trusted.
    with open('data/train_users_04.p', 'rb') as f:
        users = pickle.load(f)
    _dump_clipped_ratings(users, 'data/train_users.csv')
    print("finished train data")

    with open('data/test_users_04.p', 'rb') as f:
        users = pickle.load(f)
    _dump_clipped_ratings(users, 'data/test_users.csv')
    print("finished test data")

    ######### START OF TRAINING #########

    reader = Reader(line_format='user item rating',
                    rating_scale=(1, 6))

    train_data = Dataset.load_from_file('data/train_users.csv', reader=reader)

    # Rows with fewer than three tab-separated fields (e.g. the trailing
    # empty line) are ignored.
    test_data = []
    with open('data/test_users.csv', 'r') as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 2:
                test_data.append((fields[0], fields[1], float(fields[2])))

    algo = SVD(verbose=True)
    # The model must be fitted before test() can produce predictions.
    algo.fit(train_data.build_full_trainset())

    predictions = algo.test(test_data)
    return predictions

import pandas as pd
from surprise import Dataset
from surprise import BaselineOnly
from surprise import Reader
import sys

# Command-line arguments:
#   argv[1]  prefix (directory path including the trailing separator) that is
#            prepended to "yelp_train.csv"
#   argv[2]  path of the CSV holding the pairs to predict
#   argv[3]  path of the CSV the predictions are written to
input_path = sys.argv[1] + "yelp_train.csv"
test_file_name = sys.argv[2]
result_file_name = sys.argv[3]

reader = Reader(rating_scale=(0, 5))
train_read = pd.read_csv(input_path)
test_read = pd.read_csv(test_file_name)

# fit() and build_testset() both operate on a Trainset, so each frame is
# wrapped in a Dataset and converted right away.
# NOTE(review): assumes both CSVs hold exactly the user, item and rating
# columns in that order -- confirm against the input files.
train_load = Dataset.load_from_df(train_read, reader).build_full_trainset()
test_load = Dataset.load_from_df(test_read, reader).build_full_trainset()

bsl_options = {'method': 'als', 'n_epochs': 9, 'reg_u': 7.6, 'reg_i': 3.5}

algorithm = BaselineOnly(bsl_options=bsl_options)
results = algorithm.fit(train_load).test(test_load.build_testset())

# One "uid,iid,estimate" row per prediction, after the header line.
with open(result_file_name, "w") as fout:
    fout.write("user_id, business_id, prediction\n")
    for p in results:
        fout.write(str(p.uid) + "," + str(p.iid) + "," + str(p.est) + "\n")
Example #26
import os
import random

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import KNNBaseline
from surprise import evaluate
from surprise import GridSearch

# Shared fixture for the tests below: a single predefined train/test fold.
# The test and train files are from the ml-100k dataset (10% of u1.base and
# 10% of u1.test); both paths are resolved relative to this file's directory.
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))


def test_grid_search_cv_results():
    """Ensure that the number of parameter combinations is correct."""
    param_grid = {
        'n_epochs': [1, 2],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0],
    }
    grid_search = GridSearch(SVD, param_grid)
    grid_search.evaluate(data)
    # 2 * 2 * 2 * 1 * 1 = 8 combinations of the values listed above.
    assert len(grid_search.cv_results['params']) == 8
    df_movie_summary['count'] < movie_benchmark].index

# Per-customer rating summary; `df` and the aggregation spec `J` are defined
# outside this excerpt.
df_cust_summary = df.groupby('Cust_Id')['Rating'].agg(J)
df_cust_summary.index = df_cust_summary.index.map(int)
# NOTE(review): round() is applied to the whole summary frame here, so the
# 0.8 quantile itself is not rounded -- confirm this is intended.
cust_benchmark = round(df_cust_summary)['count'].quantile(0.8)
# NOTE(review): the index of low-count *customers* is stored under the name
# drop_movie_list -- looks like it may have been meant for a customer list;
# verify before relying on either name.
drop_movie_list = df_cust_summary[
    df_cust_summary['count'] < cust_benchmark].index

# pivot the data set to make a matrix form
# NOTE(review): the pivot_table call below is cut off in this excerpt (its
# remaining arguments and closing parenthesis are missing).
df_pivot = pd.pivot_table(df,

# Now start collaborative filtering
reader = Reader()

algo = SVD()

# Take input for Customer ID
customerID = input("Customer ID Please: ")
customerID = int(customerID)
print("We are recommending Movies for Customer ID: ", customerID)

# Ask how many movies to recommend.
number = input("Please input the total number of movies to be recommended: ")
number = int(number)
print("Number of Movies Being Recommended: ", number)

# Rows of `df` belonging to the chosen customer, indexed by Movie_Id.
sdata = df[(df['Cust_Id'] == customerID)]
sdata = sdata.set_index('Movie_Id')
Example #28
def train(params):
    """Train an SVD++ recommender on purchase events and export predictions.

    Reads ``buy.csv`` through ``read_dataframe``, gives every observed
    (entity, target entity) pair the rating ``MAX_RATING``, grid-searches the
    SVD++ hyper-parameters, predicts a score for every pair that is absent
    from the training data and writes the selected predictions per entity
    through ``write_dataframe`` as ``pred_user.csv``.

    Args:
        params: Job settings; the attributes ``client_id``, ``source_bucket``,
            ``job_id`` and ``job_dir`` are read.
    """
    # Load the training data and create a df for training
    temp_df = read_dataframe(params.client_id, params.source_bucket, 'buy.csv')
    raw_df = pd.DataFrame({
        'entity_id': temp_df['entity_id'],
        'target_entity_id': temp_df['target_entity_id'],
    })
    raw_df['rating'] = MAX_RATING

    # create the training set
    reader = Reader(rating_scale=(0, MAX_RATING))
    data = Dataset.load_from_df(
        raw_df[['entity_id', 'target_entity_id', 'rating']], reader)
    training_data = data.build_full_trainset()

    # Find optimal parameters
    print(' --> fitting the model')

    param_grid = {
        'n_epochs': [10, 30],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.2, 0.6],
    }
    gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)
    # best_estimator is only populated once the search has been run.
    gs.fit(data)

    # Build an model, and train it
    print(' --> build the model')

    model = gs.best_estimator['rmse']
    # The estimator returned by the search still has to be trained on the
    # full training set before it can predict.
    model.fit(training_data)

    # Batch predictions
    print(' --> batch predictions')

    unique_entity = np.unique(raw_df.entity_id.values)
    unique_target_entity = np.unique(raw_df.target_entity_id.values)

    # Score matrix (entities x target entities); -1.0 marks pairs that never
    # receive a prediction, i.e. pairs already present in the training data.
    px = pd.DataFrame(-1.0,
                      index=unique_entity,
                      columns=unique_target_entity)
    predx = training_data.build_anti_testset(fill=0)

    # The anti-testset yields raw ids, which is what predict() expects.
    for uid, iid, _ in predx:
        pred = model.predict(uid, iid)
        px.at[uid, iid] = round(pred.est, PRECISION)

    # create the export
    print(' --> create export')
    # A list keeps the column order deterministic and matches the order
    # handed to write_dataframe below.
    ex = pd.DataFrame(index=unique_entity,
                      columns=['entity_type', 'target_entity_type', 'values'])

    for entity in unique_entity:
        scores = px.loc[entity, :].sort_values(ascending=False)
        above = scores[scores > FILTER_THRESHOLD]
        kept = above[above < 1.0].head(MAX_PREDICTION)
        # Flatten to [target_id, score, target_id, score, ...].
        flat = [value
                for pair in zip(kept.index.tolist(), kept.values)
                for value in pair]

        ex.at[entity, 'entity_type'] = 'user'
        ex.at[entity, 'target_entity_type'] = 'item'
        ex.at[entity, 'values'] = flat

    write_dataframe(params.job_id, params.job_dir, 'pred_user.csv',
                    ['entity_type', 'target_entity_type', 'values'],
                    'entity_id', ex)
Example #29
from surprise import NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# Load the space-separated "user item rating" score file from the local
# sushi3-2016 directory.
reader = Reader(line_format='user item rating', sep=' ')
dataset = Dataset.load_from_file(
    './sushi3-2016/sushi3b.5000.10.score_converted', reader=reader)
# Full training set; not used by the cross-validation call below.
trainset = dataset.build_full_trainset()

# We'll use the NMF algorithm.
algo = NMF()

# Run 5-fold cross-validation and print results (RMSE and MAE per fold).
cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)
Example #30
from surprise import KNNBaseline, Reader
from surprise import Dataset

import pickle

# Rebuild the playlist-id -> playlist-name mapping from the pickled dict.
# NOTE: pickle executes code on load; only point this at a trusted file.
with open("popular_playlist.pk1", "rb") as playlist_file:
    id_name_dic = pickle.load(playlist_file)
# Rebuild the reverse playlist-name -> playlist-id mapping.
name_id_dic = {name: playlist_id
               for playlist_id, name in id_name_dic.items()}

file_path = os.path.expanduser("./popular_music_suprise_format.txt")
# Specify the file format.
reader = Reader(line_format="user item rating timestamp", sep=',')
# Read the data from the file.
music_data = Dataset.load_from_file(file_path, reader=reader)
# Build the full training set the neighbourhood model will work on.
trainset = music_data.build_full_trainset()

# Template: find the nearest users.
algo = KNNBaseline()

# dict views cannot be indexed on Python 3, so materialise the keys first.
current_playlist = list(name_id_dic)[39]
print("歌单名称:", current_playlist)

# 取出近邻