def final_model(data):
    """Pickles the collaborative filtering recommendation system model for repeat customers.

    Args:
        data -- a dataframe containing user id, item id, and ratings columns, in that order.
    """
    # Create a user-ratings Surprise matrix for fitting the model
    user_ratings_matrix = surprise_df(data)

    # Split the dataset into train and test sets to generate predictions
    train_set, test_set = train_test_split(user_ratings_matrix, test_size=0.2,
                                           random_state=19)

    # Best params determined using GridSearchCV
    params = {'n_factors': 10, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}
    svdpp = SVDpp(n_factors=params['n_factors'], n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'], reg_all=params['reg_all'])
    svdpp.fit(train_set)
    predictions = svdpp.test(test_set)

    # Use the Surprise dump wrapper to pickle the model
    dump.dump('repeat_customer_model', predictions=predictions, algo=svdpp, verbose=0)
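# A minimal sketch of reloading the model pickled by final_model() above,
# assuming the Surprise dump file 'repeat_customer_model' exists in the working
# directory; the uid/iid values below are placeholders, not real ids.
from surprise import dump

predictions, svdpp = dump.load('repeat_customer_model')
print(svdpp.predict(uid='42', iid='1001').est)  # estimated rating for one pair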
def top_ten_df(df):
    '''
    Inputs:
        df (pandas DataFrame): the dataframe to train on.
            NOTE: use f.df_samp_unique_vals() to get a smaller df if you
            don't have enough memory to run the full df.
    Outputs:
        top_ten_df (pandas DataFrame): a dataframe with the top ten
            predictions for every user in the original dataframe.
    '''
    data = f.read_data_surprise(df)

    # First, train an SVD++ algorithm on the entire dataset (choose 6x name filter)
    trainset = data.build_full_trainset()
    algo = SVDpp()  # n_epochs=18, lr_all=0.01, reg_all=0.175
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training
    # set. NOTE: build_anti_testset() takes the most RAM.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Create a dictionary of predictions, then turn it into a dataframe
    top_n = f.get_top_n(predictions, n=10)
    top_ten_df = pd.DataFrame(top_n)
    return top_ten_df
def __init__(self, dataset_path, books_path, model_path, algo='SVD'):
    """Init the recommendation engine given a Spark context and a dataset."""
    self.ratings_path = dataset_path
    self.model_path = model_path
    self.books_path = books_path
    if algo == 'SVD':
        self.algo = algo
        self.SVD = SVDpp()
    (bk, data, rankings) = self.loadBookData()
    self.bk = bk
    self.rankings = rankings
    self.dataset = EvaluationData(data, rankings)

    # ALS algorithm part
    self.spark = SparkSession\
        .builder\
        .appName("ReadMore")\
        .config("spark.executor.cores", '4')\
        .getOrCreate()
    self.bookID_to_name = self.loadBookNames()
    self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                   itemCol="bookId", ratingCol="rating",
                   coldStartStrategy="drop")
    self.ratings = self.loadDataFrame(dataset_path)
    self.train_ALSmodel()
def svd(user_id, area):
    algo = SVDpp(n_factors=100, n_epochs=15)

    # 3. Load the saved, trained model
    file_name = os.path.expanduser('./dump')
    # dump.dump(file_name, algo=algo)  # train once, then comment this line out
    _, algo = dump.load(file_name)

    # { product id (training data), area, product ID }
    Area = pd.read_csv('./area.csv')
    # e.g. area = "C", user_id = "A2CX7LUOHB2NDG"
    # Get the product ids for the requested area
    neww = Area[Area['area'] == area]['productID'].tolist()

    # Predict a rating for every product in the area for this user
    predictions = [algo.predict(str(user_id), str(productID)) for productID in neww]

    # Sort predictions by estimated rating, descending
    def sortkey_est(pred):
        return pred.est

    predictions.sort(key=sortkey_est, reverse=True)

    # Return the top five product ids
    top_product_id = [int(pred.iid) for pred in predictions]
    top_product_id = top_product_id[:5]
    return top_product_id
def time_location_model(df):
    """Shows the performance of a model based on just bias."""
    lower = df['date_dist_rating'].min()
    upper = df['date_dist_rating'].max()
    df = df.drop(columns=["rating", "dist_rating", "date_rating"], axis=1)
    reader = Reader(rating_scale=(lower, upper))  # TODO: figure out
    data = surprise.dataset.Dataset.load_from_df(df=df, reader=reader)

    ts = data.build_full_trainset()
    dusers = ts._raw2inner_id_users
    ditems = ts._raw2inner_id_items

    trainset, testset = train_test_split(data)
    algo = SVDpp()
    algo.fit(trainset)
    # testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
def __init__(self, k=5):
    if not isinstance(k, int) or k <= 0:
        raise ValueError("Parameter k should be a positive integer.")
    self.data = None
    self.k = k
    self.algo = SVDpp(n_factors=self.k)
    self.predictions = pd.DataFrame()
def trainSVD_surprise3D(training_data, colorlabels, plot=True, savefig="figures/"):
    # Alternatives tried:
    # algo = SVD(n_factors=4, n_epochs=1000, biased=True)
    # algo = SVD(n_factors=20, n_epochs=500, biased=False)
    algo = SVDpp(n_factors=10, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu  # user factors

    if plot:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111, projection='3d')
        ax.set_xlabel('First', fontsize=15)
        ax.set_ylabel('Second', fontsize=15)
        ax.set_title('Reduced SVD', fontsize=20)
        # Explore labeling colors with features like demographics, age
        scatter = ax.scatter(U[:, 0], U[:, 1], U[:, 2],
                             c=colorlabels, s=10, alpha=0.7)
        ax.grid()
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("state")
        if savefig:
            plt.savefig(savefig + "svd_counties_3D")
        plt.show()
def svdPP(data):
    # SVD++ algorithm
    print("\nTraining SVDPP model..\n")
    global x_test, y_test, testlen, trainlen, model_params, x_train, y_train, X, Y, avg_rat, cold_itm

    # Tuned hyperparameters
    p1, p2, p3 = [model_params[1]['n_epochs'],
                  model_params[1]['lr_all'],
                  model_params[1]['reg_all']]
    svdModel = SVDpp(n_epochs=p1, lr_all=p2, reg_all=p3)
    svdModel.fit(data.build_full_trainset())
    print("\nTraining done..\nPrediction started..")

    test = [(x_test[i][0], x_test[i][1], y_test[i]) for i in range(testlen)]
    predict = svdModel.test(test)

    # A second, default-parameter model for comparison
    svdModel_1 = SVDpp()
    svdModel_1.fit(data.build_full_trainset())
    predict1 = svdModel_1.test(test)

    usrA = [int(i[0]) - 1 for i in predict]
    itmA = [int(i[1]) - 1 for i in predict]
    res = [i[3] for i in predict]
    res1 = [i[3] for i in predict1]

    # Fall back to the user's average rating for cold-start items
    for i in range(testlen):
        if itmA[i] in cold_itm:
            res[i] = avg_rat[usrA[i]]
            res1[i] = avg_rat[usrA[i]]

    print("\nPrediction done..\n")
    return [res, res1, svdModel, svdModel_1]
class RecommenderSVDpp(Recommender):
    def __init__(self, recommendation_dataset: RecommendationDataSet):
        super(RecommenderSVDpp, self).__init__(recommendation_dataset.movies)
        self.algorithm = SVDpp()
        self.recommendation_dataset = recommendation_dataset

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20):
        # Get a dataset extended with the new user's ratings
        new_user_id, full_dataset = \
            self.recommendation_dataset.get_dataset_with_extended_user(watched)
        inner_user_id = full_dataset.to_inner_uid(new_user_id)

        # After extending the dataset we need to train the model again on the
        # whole dataset, now including the new user.
        self.algorithm.fit(full_dataset)

        # Watched movies, keyed by inner item id
        watched = {full_dataset.to_inner_iid(key): value
                   for key, value in watched.items()}

        # Predict a rating for every item the user has not watched
        test_items = [
            self.algorithm.predict(new_user_id, full_dataset.to_raw_iid(i))
            for i in range(0, full_dataset.n_items)
            if i not in watched
        ]

        topn_items = [i[0] for i in
                      get_top_n(test_items, n=k, minimum_rating=1.0)[new_user_id]]
        return self.movies.get_movie_by_movie_ids(topn_items)
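# A usage sketch for RecommenderSVDpp, assuming `recommendation_dataset` is an
# already-built RecommendationDataSet; the keys in `watched` are placeholder
# raw movie ids mapped to the new user's ratings.
watched = {1: 5.0, 260: 4.0, 296: 4.5}
recommender = RecommenderSVDpp(recommendation_dataset)
top_movies = recommender.get_recommendation(watched, k=20)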
def predict():
    global top_n
    global user_id
    print("--predict start--------------------------------")

    # Dataset import
    rating_data = pd.DataFrame(get_default_ratings())
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df=rating_data, reader=reader)
    trainset_2, testset_2 = train_test_split(data, test_size=0.3)

    algo = SVDpp()
    predictions = algo.fit(trainset_2).test(testset_2)
    top_n = get_top_n(predictions, n=10)
    print("--predict end--------------------------------")
def test_SVDpp_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # AMAN
    algo = SVDpp(n_factors=1, n_epochs=1, amau=False, missing_val=0,
                 downweight=.001)
    rmse_aman = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_aman

    # Mean-centered
    algo = SVDpp(n_factors=1, n_epochs=1, mean_centered=False)
    rmse_mean_centered = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_mean_centered

    # biased
    algo = SVD(n_factors=1, n_epochs=1, biased=False)
    rmse_biased = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_biased

    # The rest is OK but just takes too long for now...
def fit_model(mlr_df):
    algo = SVDpp()

    # Object to parse the data
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(mlr_df[['userId', 'id', 'rating']], reader)
    trainset = data.build_full_trainset()
    PREDICTOR = algo.fit(trainset)
    return PREDICTOR
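# A minimal usage sketch for fit_model(), assuming `mlr_df` is a ratings
# dataframe with 'userId', 'id', and 'rating' columns; the ids passed to
# predict() are placeholders.
predictor = fit_model(mlr_df)
print(predictor.predict(uid=1, iid=302).est)  # estimated rating on the 1-5 scale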
def SVDpp_calculation(data, trainset, testset, cv):
    start = time.time()
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    cross_validate_svdpp_dict = cross_validate(algo, data, measures=['RMSE'],
                                               cv=cv, verbose=True)
    end = time.time()
    elapsed = end - start
    return elapsed, cross_validate_svdpp_dict
def svdpp(trainset, testset):
    # Matrix factorization - SVD++
    print("\n" + "-" * 5 + " SVD++ algorithm using surprise package " + "-" * 5)
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
class TrainModel:
    # def __init__(self, method='als', n_epochs=20, sim_option='pearson_baseline'):
    #     self.algo = KNNBasic(bsl_options={'method': method, 'n_epochs': n_epochs},
    #                          sim_options={'name': sim_option, 'user_based': False})

    def __init__(self, lr_all=0.006, n_epochs=40):
        self.algo = SVDpp(lr_all=lr_all, n_epochs=n_epochs)
        self.reader = Reader(rating_scale=(0, 1))
        self.filename = 'trained_model.pkl'

    def read_from_df(self, dataframe, user_col, item_col, rating_col):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        trainset = data.build_full_trainset()
        return trainset

    def train_mod(self, dataframe, user_col, item_col, rating_col):
        self.algo.fit(
            self.read_from_df(dataframe, user_col, item_col, rating_col))

    def dump_model(self, predictions):
        saved_ent = dump.dump(self.filename, algo=self.algo,
                              predictions=predictions)
        return saved_ent

    def load_model(self):
        predictions, loaded_ent = dump.load(self.filename)
        return predictions, loaded_ent

    def get_user_pred(self, user_id, dataframe, user_col, item_col,
                      rating_col, n=2):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        testset = data.build_full_trainset().build_anti_testset()
        predictions = self.algo.test(testset)

        # Collect this user's estimates and keep the n highest
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id:
                top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        return predictions, top_n[:n]

    def get_user_pred_stable(self, user_id, predictions, n=2):
        # Same ranking as get_user_pred, but reuses precomputed predictions
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id:
                top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        return top_n[:n]
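# A usage sketch for TrainModel, assuming `df` is a dataframe whose ratings
# fall on the 0-1 scale declared in the Reader; the column names and user id
# below are placeholders.
tm = TrainModel(lr_all=0.006, n_epochs=40)
tm.train_mod(df, 'user', 'item', 'rating')

# Score every unseen pair once, then reuse the cached predictions
predictions, top2 = tm.get_user_pred('user_42', df, 'user', 'item', 'rating', n=2)
print(top2)  # [(item_id, estimated_rating), ...]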
def create_model(self):
    n = 1000000
    raw_data = self.get_ratings()[:n].fillna(0)[["userId", "id", "rating"]]
    reader = Reader()
    data = Dataset.load_from_df(raw_data, reader)

    svdpp = SVDpp()
    trainset = data.build_full_trainset()
    svdpp.fit(trainset)

    filename = "C:/datasets/the-movies-dataset/models/collaborative_based/coll_svdpp.sav"
    joblib.dump(svdpp, filename)
def svdpp(dataset):
    start = time.time()
    algo = SVDpp()
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('SVD++ runtime in minutes:', (end - start) / 60)
    return acc
class SvdPP(RecommenderBase):
    """SVDpp algorithm. Currently performing badly; just a draft."""

    def __init__(self, URM):
        print('train set built')
        # double check that the training set is built fine for sgd:
        # for u, i, r in self.trainset.all_ratings():
        #     a = 1

    def fit(self, urm, n_factors=20, n_epochs=20, lr_all=0.007, reg_all=0.02,
            init_mean=0, init_std_dev=0.1, verbose=True):
        # Create the training set from the nonzero entries of the URM
        r, c = urm.nonzero()
        ones = np.ones(len(r), dtype=np.int32)
        d = np.vstack((r, c, ones)).transpose()
        df = pd.DataFrame(d)
        df.columns = ['userID', 'itemID', 'rating']
        reader = Reader()
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        self.trainset = data.build_full_trainset()

        # Fit
        self.algo = SVDpp(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all,
                          reg_all=reg_all, init_mean=init_mean,
                          init_std_dev=init_std_dev, verbose=verbose)
        self.algo.fit(self.trainset)

    def recommend(self, userid, N=10, urm=None, filter_already_liked=True,
                  with_scores=True, items_to_exclude=[]):
        if len(items_to_exclude) > 0:
            raise NotImplementedError('Items to exclude functionality is not implemented yet')

        # Predict a score for every track
        r = np.empty([1])
        for i in range(d.N_TRACKS):
            p = self.algo.predict(userid, i)
            r = np.array([p[3]]) if i == 0 else np.concatenate((r, np.array([p[3]])))

        if filter_already_liked:
            if urm is None:
                raise ValueError('Please provide a URM in order to filter items already liked')
            else:
                r[urm.getrow(userid).nonzero()[1]] = 0

        # Take the N highest-scoring items
        l = [userid]
        ind = np.argpartition(r, -N)[-N:]
        for i in ind:
            if with_scores:
                l.append((i, r[i]))
            else:
                l.append(i)
        return l
def train(self, data):
    ratings_dict = {'itemID': data[:, 1],
                    'userID': data[:, 0],
                    'rating': data[:, 2]}
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                reader).build_full_trainset()
    # self.algo = KNNBasic(verbose=False)
    self.algo = SVDpp(verbose=True)
    self.algo.fit(data)
def model(train_set, test_set):
    params = {'n_factors': 3, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}
    svdpp = SVDpp(n_factors=params['n_factors'], n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'], reg_all=params['reg_all'])
    svdpp.fit(train_set)
    predictions = svdpp.test(test_set)
    rmse = accuracy.rmse(predictions, verbose=False)
    return predictions, rmse
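# A minimal sketch of preparing the train/test split that model() expects,
# assuming a ratings dataframe `df` with user, item, and rating columns on a
# 1-5 scale; the column names are placeholders.
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)
train_set, test_set = train_test_split(data, test_size=0.2, random_state=19)
predictions, rmse = model(train_set, test_set)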
def __init__(self):
    self.sales_list_df = self.getSalesList()
    self.product_df = self.ProductList()
    self.lower_rating = self.sales_list_df['sum_quantity'].min()
    self.upper_rating = self.sales_list_df['sum_quantity'].max()
    self.data = self.LoadDataset()
    self.train_set, self.test_set = train_test_split(self.data, test_size=0.20)
    self.algo = SVDpp()
    self.algo.fit(self.train_set)
    pred = self.algo.test(self.test_set)

    # Test score
    score = accuracy.rmse(pred)
def test_SVDpp_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # The rest is OK but just takes too long for now...
def test_SVDpp_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # The rest is OK but just takes too long for now...
def trainSVD_surprise(training_data, colorlabels, plot=True, simplify=False,
                      savefig="figures/"):
    # Alternatives tried:
    # algo = SVD(n_factors=4, n_epochs=1000, biased=True)
    # algo = SVD(n_factors=20, n_epochs=500, biased=False)
    algo = SVDpp(n_factors=3, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu  # user factors

    if plot:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlabel('First', fontsize=15)
        ax.set_ylabel('Second', fontsize=15)
        ax.set_title('Reduced SVD', fontsize=20)
        # Explore labeling colors with features like demographics, age
        scatter = ax.scatter(U[:, 0], U[:, 1], c=colorlabels, s=10, alpha=0.7)
        ax.grid()
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("state")
        if savefig:
            plt.savefig(savefig + "svd_counties")
        plt.show()

    if simplify:
        # Project the factors onto their top two singular directions
        U = U.transpose()
        A = np.linalg.svd(U)[0]
        U_proj = np.dot(A[:, :2].transpose(), U)

        # Rescale dimensions
        U_proj /= U_proj.std(axis=1).reshape(2, 1)

        if plot:
            fig = plt.figure(figsize=(8, 8))
            ax = fig.add_subplot(1, 1, 1)
            ax.set_xlabel('First', fontsize=15)
            ax.set_ylabel('Second', fontsize=15)
            ax.set_title('Reduced SVD', fontsize=20)
            scatter = ax.scatter(U_proj[0], U_proj[1], c=colorlabels, s=10)
            ax.grid()
            cbar = fig.colorbar(scatter, ax=ax)
            cbar.set_label("state")
            if savefig:
                plt.savefig(savefig + "svd_counties_simplfied")
            plt.show()
        return U_proj

    return U
def RecommendPredictions():
    # Load train data into a dataframe
    trainDF = pan.read_csv("data_source/train_count_norm_1_10.csv",
                           header=None, dtype={2: np.float16})
    trainDF = trainDF.fillna(10.0)
    reader = Reader(rating_scale=(1, 10))

    print("Load train set....")
    dataTrain = Dataset.load_from_df(trainDF[[0, 1, 2]], reader=reader)
    trainset = dataTrain.build_full_trainset()

    print("Initiate Training .....")
    algo = SVDpp(n_epochs=1, lr_all=0.01, reg_all=0.02, verbose=True)
    algo.fit(trainset)

    # Predictions for the test set with ground truth present
    print("Load test set...")
    testDF = pan.read_csv("data_source/test_count_norm_1_10.csv",
                          header=None, dtype={2: np.float16})
    testDF = testDF.fillna(10.0)
    dataTest = Dataset.load_from_df(testDF[[0, 1, 2]], reader=reader)
    testset = dataTest.build_full_trainset().build_testset()

    print("Start predictions")
    predictions = algo.test(testset)

    # Remove any stale results file before writing
    try:
        os.remove("data_source/predictions_results_svdpp.csv")
    except OSError:
        pass

    print("Saving Prediction results in File")
    resultFile = open("data_source/predictions_results_svdpp.csv", "a")
    csv_writer = csv.writer(resultFile)
    for item in predictions:
        predictionTuple = (item.uid, item.iid, item.r_ui, item.est)
        csv_writer.writerow(predictionTuple)
    resultFile.close()

    # Predictions for a test set with random products present: left as TODO
    # rmse = accuracy.rmse(predictions, verbose=True)
def svd_pp():
    print('SVD++ algorithm...')
    print('Which dataset do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    if data_utilizar == '1':
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    algo = SVDpp()
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def batchrunSVDpp(data, al, folds):
    '''
    Define a function to run batches of data.

    Args:
        data: data file name as a string.
        al: algorithm name as a string.
        folds: split the data into x folds for cross-validation, integer.

    Returns:
        None
    '''
    # Load the data with the given data format
    print("load data...")
    data = Dataset.load_from_file(path + data, reader=reader)

    # Split the data into x folds for cross-validation
    print("Split data....")
    data.split(n_folds=folds)

    # We'll use the famous SVDpp algorithm.
    if al == 'SVDpp':
        algo = SVDpp()
    elif al == 'Base':
        algo = BaselineOnly(bsl_options=bsl_options)

    # Evaluate the performance of the algorithm on the dataset.
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def check_for_args():
    args = sys.argv
    for arg in args:
        if arg == 'SVD':
            alg_list.append(SVD())
        elif arg == 'SVDpp':
            alg_list.append(SVDpp())
        elif arg == 'SlopeOne':
            alg_list.append(SlopeOne())
        elif arg == 'NMF':
            alg_list.append(NMF())
        elif arg == 'NormalPredictor':
            alg_list.append(NormalPredictor())
        elif arg == 'KNNBaseline':
            alg_list.append(KNNBaseline())
        elif arg == 'KNNBasic':
            alg_list.append(KNNBasic())
        elif arg == 'KNNWithMeans':
            alg_list.append(KNNWithMeans())
        elif arg == 'KNNWithZScore':
            alg_list.append(KNNWithZScore())
        elif arg == 'BaselineOnly':
            alg_list.append(BaselineOnly())
        elif arg == 'CoClustering':
            alg_list.append(CoClustering())
    return alg_list
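# A sketch of driving check_for_args() from the command line, assuming
# `alg_list` is the module-level list it appends to; the script name is
# hypothetical, e.g. `python benchmark.py SVDpp NMF`.
alg_list = []

if __name__ == '__main__':
    for algo in check_for_args():
        print(type(algo).__name__)  # names of the selected Surprise algorithms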
def EvaluateDifferentAlgorithms():
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
                      KNNBaseline(), KNNBasic(), KNNWithMeans(),
                      KNNWithZScore(), BaselineOnly(), CoClustering()]:
        # Perform cross-validation
        results = cross_validate(algorithm, data_6months, measures=['RMSE'],
                                 cv=3, verbose=False)

        # Get results and append the algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

    print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))
def batchRunBaseline(data, al, folds):
    '''
    Define a function to run batches of data.

    Args:
        data: data file name as a string.
        al: algorithm name as a string.
        folds: split the data into x folds for cross-validation, integer.

    Returns:
        None
    '''
    # Load the data with the given data format
    print("load data...")
    data = Dataset.load_from_file(path + data, reader=reader)

    # Split the data into x folds for cross-validation
    print("Split data....")
    data.split(n_folds=folds)

    # Baseline estimates are computed with SGD
    bsl_options = {'method': 'sgd',
                   'learning_rate': .00005,
                   }

    if al == 'Base':
        algo = BaselineOnly(bsl_options=bsl_options)
    elif al == 'SVDpp':
        algo = SVDpp()

    # Evaluate the performance of the algorithm on the dataset.
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)