def get_recommendation(user):
    conn = pymysql.connect(Account.link, Account.user, Account.password,
                           Account.db, charset="utf8mb4")
    df = pd.read_sql_query('SELECT * FROM USERS', conn)
    conn.close()  # close the connection before any early return
    if df.empty:
        return "Error - empty DF"
    # Anime can be rated from 1 - 10
    data = Dataset.load_from_df(df, Reader(rating_scale=(1, 10)))
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.train(trainset)
    # predict ratings for all (user, item) pairs that are NOT in the train set
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    # Get top 15 predictions per user
    top_n = get_top_n(predictions, n=15)
    if top_n.get(user) is None:
        return "Error - cannot find User"
    return [iid for (iid, _) in top_n[user]]
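# Several snippets here call get_top_n() without defining it. A minimal sketch
# follows, based on the well-known Surprise FAQ helper; the exact version used
# by these snippets may differ.
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Map each user id to their n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        # sort each user's candidate items by estimated rating, keep the best n
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n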
def svd(data, training, testing):
    '''
    Tune SVD parameters, then calculate RMSE and top-n predictions of SVD

    Args:
        data(Dataset): the whole dataset, split into folds for cross-validation
        training(Trainset): training dataset
        testing(list): test dataset

    Returns:
        rmse: RMSE of SVD with the optimized parameters
        top_n: top-n predictions per user (used to count unique recommended items)
    '''
    # candidate parameters
    param_grid = {'n_factors': [25, 50, 100, 250],
                  'n_epochs': [10, 20, 30, 40, 50]}
    # optimize parameters
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data)
    param = grid_search.best_params['RMSE']
    print('SVD:', param)
    # fit model using the optimized parameters
    svd = SVD(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    svd.train(training)
    # evaluate the model using test data
    predictions = svd.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
def train_from_dataset(self, filepath):
    """Train the algorithm from a ratings dataset.

    Use this to rebuild a dump of the trained algorithm if it's ever lost.
    """
    print("start training")
    # path to dataset file
    file_path = os.path.expanduser(filepath)
    reader = Reader(line_format='user item rating timestamp',
                    sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()
    # SVD style
    algo = SVD()
    # KNN style
    # sim_options = {'name': 'pearson_baseline', 'user_based': True}
    # algo = KNNBaseline(k=1, min_k=1, sim_options=sim_options)
    algo.train(trainset)
    print("end training")
    self.data = data
    self.algorithm = algo
def model(self, alg_key):
    reader = Reader(rating_scale=(1, 5))
    data_result = Dataset.load_from_df(
        self.make_df()[['user_id', 'place_id', 'score']], reader)
    # split the data into 10 folds for cross-validation
    data_result.split(n_folds=10)
    # evaluation
    if alg_key.lower() == "svd":
        alg = SVD()
    elif alg_key.lower() == "knn":
        alg = KNNBasic()
    elif alg_key.lower() == "nmf":
        alg = NMF()
    else:
        raise ValueError("Unknown algorithm key: %s" % alg_key)
    evaluate(alg, data_result, measures=['RMSE', 'MAE'])
    # prediction
    # e.g. user_0 smallShop_5645 2
    test_user = '******'
    test_id = 'smallShop_7089'
    real_score = 4
    trainset = data_result.build_full_trainset()
    alg.train(trainset)
    print(alg.predict(test_user, test_id, real_score))
def boost(examples, rounds=10):
    l = len(examples)  # number of training examples
    distr = normalize([1.] * l)
    hypotheses = [None] * rounds
    alpha = [0] * rounds
    for t in range(rounds):
        # create a training set by resampling based on the weight distribution
        sample = examples.iloc[[draw(distr) for _ in range(l)]].reset_index(drop=True)
        # create a trainset object
        reader = Reader()
        data = Dataset.load_from_df(sample[['user_id', 'business_id', 'rating']], reader)
        trainset = data.build_full_trainset()
        # Use SVD with surprise
        algo = SVD()
        algo.train(trainset)
        hypotheses[t] = algo
        # absolute prediction error on every original example
        abserr = [abs(examples.at[i, 'rating'] -
                      algo.predict(examples.at[i, 'user_id'],
                                   examples.at[i, 'business_id']).est)
                  for i in range(l)]
        # the weighted mean absolute error acts as the error threshold
        delta = sum(d * e for d, e in zip(distr, abserr))
        # -1 where the hypothesis did worse than the threshold, +1 otherwise
        hypRes = np.where(np.array(abserr) > delta, -1, 1)
        alpha[t] = 0.5 * math.log((1 - delta) / (.0001 + delta))
        # update weights: upweight badly predicted examples, then renormalize
        distr = normalize([d * math.exp(-alpha[t] * h)
                           for (d, h) in zip(distr, hypRes)])

    def finalHypothesis(uid, iid):
        # weighted vote over all round hypotheses
        return np.sign(sum(a * h.predict(uid, iid).est
                           for (a, h) in zip(alpha, hypotheses)))

    return finalHypothesis
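# boost() relies on normalize() and draw(), which the snippet never defines.
# A minimal sketch under the assumption that they mean "rescale weights to sum
# to 1" and "sample an index from a weight distribution":
import random

def normalize(weights):
    # rescale a weight vector so it sums to 1
    total = float(sum(weights))
    return [w / total for w in weights]

def draw(distr):
    # sample an index i with probability distr[i]
    r = random.random()
    cum = 0.0
    for i, w in enumerate(distr):
        cum += w
        if r <= cum:
            return i
    return len(distr) - 1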
def svd(trainset, testset, predset):
    modelname = 'svd'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SVD(n_factors=100, n_epochs=40,
               lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1,
               reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1)

    print('SVD Model')
    algo.train(trainset)

    predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(predictions, verbose=False))

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' RMSE on Test: ', rmse)

    # collect the estimated ratings for the test set
    preds = np.array([pred.est for pred in predictions])
    save_predictions(modelname, rmse, preds, 'test')

    print(' Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions])
    save_predictions(modelname, rmse, preds)
def train_cf_algo(model_data):
    print(">>> training cf model...")
    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(
        model_data[['msno', 'song_id', 'target']], reader)
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.train(trainset)
    return algo
def grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls,
                    init_mean, n_factors, file_name):
    print('SVD Surprise manual grid search')
    result_train = pd.DataFrame()
    result_test = pd.DataFrame()
    # loop over the candidate parameters
    for n_epoch in n_epochs:
        for lr_all in lr_alls:
            for reg_all in reg_alls:
                for n_factor in n_factors:
                    algo = SVD(reg_all=reg_all, init_mean=init_mean,
                               n_epochs=n_epoch, lr_all=lr_all,
                               n_factors=n_factor)
                    # Retrieve the trainset.
                    trainset = data_train.build_full_trainset()
                    # Build an algorithm, and train it.
                    algo.train(trainset)
                    # Evaluate the performance
                    perf_train = evaluate(algo, data_train, measures=['RMSE'])
                    perf_test = evaluate(algo, data_test, measures=['RMSE'])
                    perf_train["n_epoch"] = n_epoch
                    perf_train["lr_all"] = lr_all
                    perf_train["reg_all"] = reg_all
                    perf_train["init_mean"] = init_mean
                    perf_train["n_factor"] = n_factor
                    # Store the mean RMSE on train
                    perf_train["rmse"] = np.mean(perf_train['rmse'])
                    perf_test["n_epoch"] = n_epoch
                    perf_test["lr_all"] = lr_all
                    perf_test["reg_all"] = reg_all
                    perf_test["init_mean"] = init_mean
                    perf_test["n_factor"] = n_factor
                    # Store the mean RMSE on test
                    perf_test["rmse"] = np.mean(perf_test['rmse'])
                    # Store in a dataframe
                    result_train = result_train.append(perf_train, ignore_index=True)
                    result_test = result_test.append(perf_test, ignore_index=True)
    # Save the dataframes so we can inspect or plot the differences later
    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    result_train.to_excel(writer, 'Sheet1')
    result_test.to_excel(writer, 'Sheet2')
    writer.save()
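# A hypothetical call of grid_search_svd(); the grid values and the output
# file name below are illustrative assumptions, not from the original code:
# grid_search_svd(data_train, data_test,
#                 n_epochs=[20, 40], lr_alls=[0.002, 0.005], reg_alls=[0.02],
#                 init_mean=0, n_factors=[50, 100], file_name='svd_grid.xlsx')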
def _train_predict(self, node):
    file_name = 'file/%s.dat' % time.time()
    with open(file_name, 'w') as f:
        f.writelines(
            ['%s\t%s\t%s\t%s\n' % (line[0], line[1], line[2], line[3])
             for line in node.data])
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    surprise_data = Dataset.load_from_file(file_name, reader=reader)
    train_set = surprise_data.build_full_trainset()
    algo = SVD()
    algo.train(train_set)
    node.algo = algo
def startPredModel(ratings, fileOutput):
    reader = Reader()
    data = Dataset.load_from_df(ratings[['userId', 'imdbId', 'rating']], reader)
    # split into 5 folds for cross-validation
    data.split(n_folds=5)
    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])
    trainset = data.build_full_trainset()
    svd.train(trainset)
    dump.dump(fileOutput, None, svd, 1)
def svd_surprise(data_train, reg_all, init_mean, n_epochs, lr_all,
                 n_factors, name_file):
    print('SVD Surprise')
    # We construct our SVD algorithm with surprise and the best parameters
    algo = SVD(reg_all=reg_all, init_mean=init_mean, n_epochs=n_epochs,
               lr_all=lr_all, n_factors=n_factors)
    # Retrieve the trainset.
    trainset = data_train.build_full_trainset()
    # Build an algorithm, and train it.
    algo.train(trainset)
    # Evaluate the RMSE of the algorithm
    evaluate(algo, data_train, measures=['RMSE'])
    # Make the prediction
    make_prediction_surprise(algo, name_file)
def latentFeatures(self):
    # Load the MovieLens-20M training set from file and train on all of it.
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data = Dataset.load_from_file("../data/ml20m_train.csv", reader=reader)
    trainset = data.build_full_trainset()
    algo = SVD(n_factors=10)
    algo.train(trainset)
    # algo.pu holds the learned user factors, one row per inner user id;
    # this assumes self.r lists users in the same order as the training file,
    # so that unique() aligns with the inner ids.
    userLatentFeatures = pd.DataFrame(
        algo.pu,
        columns=["SVD_user_feature_" + str(i) for i in range(0, 10)])
    userLatentFeatures["userId"] = self.r.userId.unique()
    return userLatentFeatures.set_index("userId")
def inicializar_algoritmo():
    csv_df = pd.read_csv('Data/ratings.csv')
    users_mongo = list(users.get_users_ratings().find())
    mongo_df = pd.DataFrame(users_mongo)
    mongo_df_new = mongo_df[['userId', 'movieId', 'rating']]
    csv_df_new = csv_df[['userId', 'movieId', 'rating']]
    final_df = csv_df_new.append(mongo_df_new)
    final_df.columns = ['userID', 'itemID', 'rating']
    reader = Reader()
    data = Dataset.load_from_df(final_df, reader)
    # data can now be used normally
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.train(trainset)
    testset = trainset.build_anti_testset()
    return (algo, testset)
def model_train(rating_dataset=None):
    if rating_dataset is None:
        data = Dataset.load_builtin('ml-100k')
    else:
        # path to dataset file
        file_path = os.path.expanduser(rating_dataset)
        # As we're loading a custom dataset, we need to define a reader. In the
        # movielens-100k dataset, each line has the following format:
        # 'user item rating timestamp', separated by '\t' characters.
        reader = Reader(line_format='user item rating timestamp', sep='\t')
        data = Dataset.load_from_file(file_path, reader=reader)
    # Retrieve the trainset.
    trainset = data.build_full_trainset()
    # Build an algorithm, and train it.
    algo = SVD()
    algo.train(trainset)
    return algo
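# A minimal usage sketch for model_train(): with the built-in or file-based
# loaders, raw ids are strings, and .est holds the predicted rating.
algo = model_train()
print(algo.predict('196', '302').est)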
def restore(x):
    from scipy.sparse import coo_matrix
    # Write the sparse entries out as 'user item rating' triples,
    # with the row index as user and the column index as item (the original
    # stacked col before row, which swapped users and items at predict time).
    sparse_mat = coo_matrix(x)
    data = np.stack([sparse_mat.row, sparse_mat.col,
                     sparse_mat.data], axis=1).astype('int')
    np.savetxt('tmp.txt', data, fmt='%d')
    reader = Reader(line_format='user item rating', sep=' ',
                    rating_scale=(0, 255))
    dataset = Dataset.load_from_file('tmp.txt', reader=reader)
    trainset = dataset.build_full_trainset()
    algo = SVD()
    algo.train(trainset)
    # build a testset covering every (row, column) cell of x
    xx = np.arange(0, x.shape[0])
    yy = np.arange(0, x.shape[1])
    y3, x3 = np.meshgrid(yy, xx)
    testset = zip(x3.ravel().tolist(), y3.ravel().tolist())
    testset = [str(a) + ' ' + str(b) for (a, b) in testset]
    print(testset[:10])

    def my_predict(test):
        a, b = test.split()
        return algo.predict(uid=a, iid=b)

    predictions = []
    for test in testset:
        predictions.append(int(my_predict(test).est))
    # A multiprocessing pool could parallelize this, e.g.:
    # pool = mp.Pool(mp.cpu_count() * 2)
    # predictions = pool.map(my_predict, testset)
    # pool.close()
    # pool.join()
    return np.array(predictions).reshape(x.shape)
def svd_running_time(data):
    '''
    Calculates the running times for training and predictions for SVD

    Args:
        data(list of Dataset): a list of datasets with different numbers of users

    Returns:
        elapsedtime_SVDtrain: running times for training
        elapsedtime_SVDtest: running times for predictions on the testset
    '''
    elapsedtime_SVDtrain = []
    elapsedtime_SVDtest = []

    # tune the parameters on the entire data
    param_grid = {'n_factors': [25, 50, 100, 250],
                  'n_epochs': [10, 20, 30, 40, 50]}
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters, measure the running times on each dataset
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        svd = SVD(n_factors=n_factors, n_epochs=n_epochs)
        svd.train(training)
        elapsedtime_SVDtrain.append(time.time() - training_start)
        # prediction running time
        test_start = time.time()
        svd.test(testing)
        elapsedtime_SVDtest.append(time.time() - test_start)
    return elapsedtime_SVDtrain, elapsedtime_SVDtest
def collab_filter(md, ratings, links_small, credits, keywords, smd):
    # data pre-processing
    id_map = links_small[['movieId', 'tmdbId']]
    links_small = links_small[
        links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
    reader = Reader()
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    data.split(n_folds=5)
    svd = SVD()
    # evaluate(svd, data, measures=['RMSE', 'MAE'])
    trainset = data.build_full_trainset()
    svd.train(trainset)
    id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
    id_map.columns = ['movieId', 'id']
    id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
    return svd, id_map
# Evaluate performances of our algorithm on the dataset.
grid_search.evaluate(data)

# best RMSE score
print('best: ' + str(grid_search.best_score['RMSE']))
# combination of parameters that gave the best RMSE score
print('best params: ' + str(grid_search.best_params['RMSE']))

params = grid_search.best_params['RMSE']
algo_SVD = SVD(verbose=True,
               n_factors=params['n_factors'],
               n_epochs=params['n_epochs'],
               lr_all=params['lr_all'],
               reg_all=params['reg_all'])
algo_SVD.train(data_full)

#%%
datamat_filled_SVD = datamat_missing.copy().astype(np.float)
datamat_filled_NMF = datamat_missing.copy().astype(np.float)
for i in range(0, datamat_full.shape[0]):  # movie
    for j in range(0, datamat_full.shape[1]):  # user
        val = algo_SVD.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est
        datamat_filled_SVD[i, j] = val
        val = algo_NMF.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est
        datamat_filled_NMF[i, j] = val

#%% compute correlations between real and recovered ratings
def make_reccomendation(self, user_pref):
    # the filepath to the dataset
    file_path = 'ml-100k/u.data'
    # set up the Reader object
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Retrieve the trainset.
    trainset = data.build_full_trainset()
    # pick an algorithm:
    # we could use a K nearest-neighbors algorithm
    # algo = KNNBasic()
    # we're using a singular value decomposition algorithm
    algo = SVD()
    # train the algorithm on our data
    algo.train(trainset)
    # read the ratings file
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                          encoding='latin-1')
    # the as-of-now empty dict that will contain the raters and their likeness
    rater_likeness = {}
    # Loop through the ratings table; if our user and the rater agree, give
    # '1 pt' to the rater's likeness to our user. In the end the rater with
    # the most 'likeness' wins out and their recommendation will be queried.
    for index, row in ratings.iterrows():
        if row['movie_id'] in user_pref and user_pref[row['movie_id']] == row['rating']:
            if not row['user_id'] in rater_likeness:
                rater_likeness[row['user_id']] = 1
            else:
                rater_likeness[row['user_id']] += 1
    # determine which rater in the dict has the highest likeness
    best_count = 0
    for key, value in rater_likeness.items():
        if value >= best_count:
            best_count = value
            best_rater = key
    # predict() expects the raw user id as a string
    user_id = str(best_rater)
    # loop through the list of movies until we find one that our best rater
    # would rate at 4.6 or higher
    for i in range(1, 1683):
        item_id = str(i)
        if i not in user_pref:
            pred = algo.predict(user_id, item_id, r_ui=3, verbose=False)
            if pred.est >= 4.6:
                break
    return i
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    algo = 'SVD'
    algorithm = SVD()

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions",
    #              activity_type="Initialize Predictions - " + algo,
    #              rating=None)  # pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    # read the ratings from the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    reader = Reader(line_format='user item rating', sep=',',
                    rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    trainset = data.build_full_trainset()
    # algorithm = eval(algo + "()")  # set the algorithm
    algorithm.train(trainset)

    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    # only predict items the user has not rated yet
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])
    predicted_ratings = []
    for i in prediction_items:
        est = algorithm.predict(user_id, i).est
        predicted_ratings.append(est)

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series(
        [user_id for x in range(len(predictions.index))],
        index=predictions.index)
    predictions['prediction'] = predicted_ratings
    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    cols = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
            'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']
    df_pred = predictions[['item_id']].T
    df_pred.columns = cols
    df_pred['id'] = user_id
    df_pred = df_pred[['id'] + cols]
    df_pred['id'] = df_pred['id'].astype(int)
    df_pred.to_sql(prediction_table, engine, if_exists='append', index=False)
    session.commit()

    df_num_ratings = test_prediction
    df_num_ratings = df_num_ratings.head(n=20)
    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'},
                          inplace=True)
    df_num_ratings.to_sql('numeric_predictions', engine,
                          if_exists='append', index=False)
    session.commit()

    predcols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5',
                'num_6', 'num_7', 'num_8', 'num_9', 'num_10']
    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols
    df_num_ratings_transpose['id'] = user_id
    df_num_ratings_transpose = df_num_ratings_transpose[['id'] + predcols]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)
    df_num_ratings_transpose.to_sql(numeric_prediction_table, engine,
                                    if_exists='append', index=False)
    session.commit()
class SurpriseFeatureBuilder():
    def __init__(self, item_identifier='media_id',
                 train_file_path=TRAIN_FILE_PATH,
                 surprise_file_path=SURPRISE_FILE_PATH,
                 user_min_occurrence=20, item_min_occurrence=20):
        """SurpriseFeatureBuilder formats data for ingestion and uses SVD to
        build a feature for a given item_identifier.

        Arguments:
            item_identifier: string colname of the item
            train_file_path: string train file
            surprise_file_path: string location for the filtered data, to be read by Surprise
            user_min_occurrence: int user must appear at least this number of times to be included
            item_min_occurrence: int item must appear at least this number of times to be included
        """
        self.train_file_path = train_file_path
        self.surprise_file_path = surprise_file_path
        self.item_identifier = item_identifier
        self.user_min_occurrence = user_min_occurrence
        self.item_min_occurrence = item_min_occurrence
        self.svd = SVD()

    def make_surprise_file(self, user_min_occurrence=None,
                           item_min_occurrence=None):
        """Generates the file to be ingested by Surprise.

        Arguments:
            user_min_occurrence: int user must appear at least this number of times to be included
            item_min_occurrence: int item must appear at least this number of times to be included
        """
        if user_min_occurrence is None:
            user_min_occurrence = self.user_min_occurrence
        if item_min_occurrence is None:
            item_min_occurrence = self.item_min_occurrence
        data = pd.read_csv(self.train_file_path)
        # keep only users and items that occur often enough, then average
        # the ratings per (user, item) pair
        filtered_data = (data.groupby('user_id')
                         .filter(lambda x: len(x) >= user_min_occurrence)
                         .groupby(self.item_identifier)
                         .filter(lambda x: len(x) >= item_min_occurrence)
                         .groupby(['user_id', self.item_identifier])
                         .mean())
        print(filtered_data.shape)
        filtered_data.to_csv(path_or_buf=self.surprise_file_path,
                             columns=['is_listened'],
                             header=False, index=True)

    def make_file_if_missing(self):
        if not Path(self.surprise_file_path).is_file():
            print('File not found. Generating new input file')
            start_time = time.perf_counter()
            self.make_surprise_file()
            print('File generated in {}s'.format(
                time.perf_counter() - start_time))

    def delete_surprise_file(self):
        if Path(self.surprise_file_path).is_file():
            os.remove(self.surprise_file_path)

    def read_data(self):
        reader = dataset.Reader(line_format="user item rating", sep=',',
                                rating_scale=(0, 1), skip_lines=0)
        self.data = dataset.Dataset.load_from_file(self.surprise_file_path,
                                                   reader=reader)
        self.data.split(n_folds=5)

    def eval(self):
        # Evaluate performances of our algorithm on the dataset.
        perf = evaluate(self.svd, self.data, measures=['RMSE'])
        print_perf(perf)

    def parameter_tuning(self):
        param_grid = {
            'n_epochs': [20, 40],
            'lr_all': [0.002, 0.005],
            'reg_all': [0.01, 0.02, 0.04],
            'n_factors': [20, 50, 100]
        }
        print("Starting grid search...")
        start_time = time.perf_counter()
        self.grid_search = GridSearch(SVD, param_grid, measures=['RMSE'])
        self.grid_search.evaluate(self.data)
        print('Grid search took {}s'.format(time.perf_counter() - start_time))
        self.svd = self.grid_search.best_estimator['RMSE']
        print(self.grid_search.best_score['RMSE'])
        print(self.grid_search.best_params['RMSE'])

    def train(self):
        trainset = self.data.build_full_trainset()
        self.svd.train(trainset)

    def _predict(self, user_lst, item_lst):
        assert len(user_lst) == len(item_lst)
        # get a prediction for every (user, item) pair
        pred = [self.svd.predict(str(user_lst[idx]), str(item_lst[idx]))
                for idx in range(len(user_lst))]
        prediction = [p.est for p in pred]
        # 0, 1 or 2 depending on whether the user and the item were seen in training
        unseen = [sum([self.svd.trainset.knows_user(user_lst[i]),
                       self.svd.trainset.knows_item(item_lst[i])])
                  for i in range(len(user_lst))]
        return prediction, unseen

    def get_predictions(self, test_file_path):
        """Use the trained model on a test file.

        Arguments:
            test_file_path: string location of the test file
        """
        data = pd.read_csv(test_file_path)
        user_lst = data['user_id'].tolist()
        item_lst = data[self.item_identifier].tolist()
        predictions, unseen = self._predict(user_lst, item_lst)
        return {
            "{}_svd".format(self.item_identifier): predictions,
            "{}_unseen".format(self.item_identifier): unseen
        }
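# Hypothetical usage of SurpriseFeatureBuilder, assuming TRAIN_FILE_PATH and
# SURPRISE_FILE_PATH point at real CSV locations and 'test.csv' exists:
# fb = SurpriseFeatureBuilder(item_identifier='media_id')
# fb.make_file_if_missing()
# fb.read_data()
# fb.parameter_tuning()
# fb.train()
# features = fb.get_predictions('test.csv')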
# ratings_dict = {'item': items, 'rating': ratings, 'user': users}
# df = pd.DataFrame(ratings_dict)
# reader = Reader(rating_scale=(1, 10))
# obj = Dataset.load_from_df(df[['item', 'rating', 'user']], reader)

# As we're loading a custom dataset, we need to define a reader. Each line of
# this file has the format 'user item rating', separated by spaces.
reader = Reader(line_format='user item rating', sep=' ', rating_scale=(1, 10))
dataobj = Dataset.load_from_file('D:/GoogleDrive/mydata.csv', reader=reader)
traindata = dataobj.build_full_trainset()
algo = SVD(verbose=True, n_factors=5, n_epochs=100)
algo.train(traindata)

data_fill = data.copy()
for col in range(0, siz[1]):
    for row in range(0, siz[0]):
        # data_fill[row, col] = algo.predict('user%i' % (col + 1), 'item%i' % (row + 1)).est
        # raw ids read from a file are strings, so predict with string ids
        data_fill[row, col] = algo.predict(str(col + 1), str(row + 1)).est
print((np.round(data_fill)).astype(np.int))
print(data_full)
dataset = Dataset.load_from_df(ratings_dataset[['userId', 'movieId', 'rating']], reader)

# Use the split function to set up cross-validation
dataset.split(n_folds=6)

# Initialise the SVD model and specify the number of latent features;
# we can tune this parameter according to our requirements
svd = SVD(n_factors=25)

# Evaluate the model on root mean square error and mean absolute error
evaluate(svd, dataset, measures=['rmse', 'mae'])

# Build the dataset to train our model
train = dataset.build_full_trainset()

# train our model
svd.train(train)

# Make a new series with two columns in it: movie name and movie id
movies_dataset = movies_dataset.reset_index()
titles = movies_dataset['movie_name']
indices = pd.Series(movies_dataset.index, index=movies_dataset['movie_name'])

# Function to make recommendations for the user
def recommendataion(user_id, movie):
    result = []
    # Get the id of the movie for which the user wants recommendations
    ind = indices[movie].iloc[0]
    # Get all the cosine similarity scores for that movie
    sim_scores = list(enumerate(cosine_sim[ind]))
# print(improved_recommendations('Mean Girls', smd).head(10))

##############################################################################
# 3
reader = Reader()
ratings = pd.read_csv('./data/ratings_small.csv')
# print(ratings.head())
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)
svd = SVD()
# evaluate(svd, data, measures=['RMSE', 'MAE'])

# build the trainset and train
trainset = data.build_full_trainset()
svd.train(trainset)

# uid is the user id, iid is the movie id
a = svd.predict(uid=1, iid=302)
# print(a.est)

m_list = list(set(ratings['movieId']))

def CF_recsys(id):
    est_list = []
    for mv in m_list:
        est_list += [svd.predict(id, mv).est]
    df = pd.DataFrame({
        'id': m_list,
        'est': est_list
                sep=',', skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

svd_ambiente = SVD(n_epochs=100, lr_all=0.002, reg_all=0.2)

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(svd_ambiente, data, measures=['RMSE', 'MAE'])
print_perf(perf)

# Retrieve the trainset.
trainset = data.build_full_trainset()
svd_ambiente.train(trainset)

from sklearn.externals import joblib

joblib.dump(svd_ambiente, 'svd_ambiente.pkl')
# the model is loaded back like this
svd_ambiente = joblib.load('svd_ambiente.pkl')

test = pd.read_csv("/Volumes/Disco_SD/Set de datos/guia_oleo/ratings_test.csv",
                   sep=',', encoding="ISO-8859-1")

test_ambiente = pd.DataFrame()
for i in range(0, len(test.index)):
class SurSVD:
    def __init__(self, k=5):
        if not isinstance(k, int) or k <= 0:
            raise ValueError("Parameter k should be a positive integer.")
        self.data = None
        self.k = k
        self.algo = SVD(n_factors=self.k, biased=False, reg_all=0)
        self.predictions = pd.DataFrame()

    def fit_directly(self, data_long):
        """
        This function directly computes the predictions of the algorithm for
        the data provided. The data needs to be in the long shape format. It
        then adds the predictions made by the algorithm to the class
        attributes (maintaining the long format).
        :param data_long: pd.DataFrame | DataFrame in the long shape format
        :return void:
        """
        # Run SVD
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(data_long, reader)
        trainset = data.build_full_trainset()
        self.algo.train(trainset)
        testset = trainset.build_anti_testset()
        predictions = self.algo.test(testset)
        # Reconstruct predictions, keeping the estimated rating (est),
        # not the anti-testset fill value (r_ui)
        users = []
        items = []
        ratings = []
        dataframe = pd.DataFrame()
        for uid, iid, _, est, _ in predictions:
            users.append(uid)
            items.append(iid)
            ratings.append(est)
        dataframe["userID"] = users
        dataframe["itemID"] = items
        dataframe["ratings"] = ratings
        self.predictions = dataframe

    def fit(self, rating_matrix):
        """
        Fits the instance to the rating matrix. The index must be the users
        and the columns the items.
        :param rating_matrix: pd.DataFrame | rating matrix
        :return: void
        """
        data_long = rating_matrix.stack().reset_index()
        data_long.columns = ["user_id", "item_id", "ratings"]
        # Run SVD
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(data_long, reader)
        trainset = data.build_full_trainset()
        self.algo.train(trainset)
        testset = trainset.build_anti_testset()
        predictions = self.algo.test(testset)
        # Reconstruct predictions, keeping the estimated rating (est)
        users = []
        items = []
        ratings = []
        dataframe = pd.DataFrame()
        for uid, iid, _, est, _ in predictions:
            users.append(uid)
            items.append(iid)
            ratings.append(est)
        dataframe["userID"] = users
        dataframe["itemID"] = items
        dataframe["ratings"] = ratings
        self.predictions = dataframe

    def predict(self, user, item):
        """
        Predict the probability that the input user will like the input item.
        :param user: int | user ID
        :param item: int | item ID
        :return: float | probability that the user likes the item
        """
        cond1 = self.predictions["userID"] == user
        cond2 = self.predictions["itemID"] == item
        mask = cond1 & cond2
        temp = np.array(self.predictions.loc[mask, "ratings"])
        proba = np.sum(temp)
        return proba
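# A small usage sketch for SurSVD (the matrix below is illustrative): NaN
# cells are dropped by stack(), so they land in the anti-testset and receive
# predictions that predict() can then look up.
rm = pd.DataFrame([[1.0, None], [None, 0.0]],
                  index=[1, 2],       # user ids
                  columns=[10, 20])   # item ids
model = SurSVD(k=1)
model.fit(rm)
print(model.predict(1, 20))  # predicted probability that user 1 likes item 20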
################################### Collaborative Filtering ################################
reader = Reader()
ratings = pd.read_csv("./Movies/ratings_small.csv")
print("\n\nRatings:\n", ratings.head())

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

# Using Singular Value Decomposition (SVD) from the Surprise package
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

trainset = data.build_full_trainset()
svd.train(trainset)

print(ratings[ratings['userId'] == 1])
print(svd.predict(1, 302, 3))

def convert_int(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return np.nan

id_map = pd.read_csv('./Movies/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
# Delete unused columns
del dfRatings['date']
del dfRatings['train_id']
del dfTest['date']
del dfTest['test_id']

# Set the rating scale and create the data for Surprise to use
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    dfRatings[['user_id', 'business_id', 'rating']], reader)

factors = 50
train_set = data.build_full_trainset()

# Use SVD with surprise
algo = SVD(n_factors=factors)
algo.train(train_set)

# Write one predicted rating per test row
with open('SVDOutput.csv', 'w') as f:
    f.write("test_id,rating\n")
    for i in range(len(dfTest)):
        prediction = algo.predict(dfTest.at[i, 'user_id'],
                                  dfTest.at[i, 'business_id'],
                                  r_ui=4, verbose=True)
        predRating = prediction.est
        f.write(str(i) + "," + str(predRating) + '\n')
from surprise import Reader, Dataset
import surprise

# Define the format
reader = Reader(line_format='user item rating', sep=',')

# Load the data from the file using the reader format
data = Dataset.load_from_file('recomm.csv', reader=reader)

# Split data into 5 folds
data.split(n_folds=5)

from surprise import SVD, evaluate

algo = SVD()
evaluate(algo, data, measures=['RMSE', 'MAE'])

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.train(trainset)

# raw ids read from a file are strings
userid = str(10)
itemid = str(20)
actual_rating = 3

print(algo.predict(userid, str(40)))
a = algo.predict(userid, itemid)
t = a.est / 3
print(t)

from sklearn.externals import joblib

joblib.dump(algo, 'reccc.pkl')
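# The dumped model can later be restored and reused without retraining; a
# minimal sketch (note that sklearn.externals.joblib is deprecated in newer
# scikit-learn, where a plain `import joblib` is used instead):
algo = joblib.load('reccc.pkl')
print(algo.predict(str(10), str(20)).est)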
""" from __future__ import (absolute_import, division, print_function, unicode_literals) from surprise import Dataset from surprise import SVD from surprise import accuracy data = Dataset.load_builtin('ml-100k') algo = SVD() trainset = data.build_full_trainset() algo.train(trainset) testset = trainset.build_testset() predictions = algo.test(testset) # RMSE should be low as we are biased accuracy.rmse(predictions, verbose=True) # ~ 0.68 (which is low) # We can also do this during a cross-validation procedure! print('CV procedure:') data.split(3) for i, (trainset_cv, testset_cv) in enumerate(data.folds()): print('fold number', i + 1) algo.train(trainset_cv) print('On testset,', end=' ')