from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from surprise import Dataset, NormalPredictor, Reader

# Placeholder algorithm, used only to demonstrate the workflow.
algo = NormalPredictor()

# Raw ratings held as parallel columns. The column names are arbitrary.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1],
}
df = pd.DataFrame(ratings_dict)

# A Reader is still mandatory, although only rating_scale is required here.
reader = Reader(rating_scale=(1, 5))

# Column order passed to load_from_df must be: user id, item id, rating.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
data.split(2)

# From this point the dataset is used like any other: train, then test,
# once per fold.
for trainset, testset in data.folds():
    algo.train(trainset)
    algo.test(testset)
def _clean_ratings(frame):
    """Keep only user_id/item_id/rating, dropping NaN rows and duplicate rows."""
    return frame[['user_id', 'item_id', 'rating']].dropna().drop_duplicates()


def _wide_row(user_id, values, prefix, width=10):
    """Build a one-row frame: ``id`` then ``<prefix>_1`` .. ``<prefix>_<width>``.

    At most ``width`` leading entries of ``values`` are used. When fewer are
    supplied, the trailing columns hold ``None`` so the frame always has the
    same columns regardless of how many values were available.
    """
    top = pd.Series(values).head(width).tolist()
    columns = ['id'] + ['{}_{}'.format(prefix, rank)
                        for rank in range(1, width + 1)]
    row = [int(user_id)] + top + [None] * (width - len(top))
    # One single-element list per column so each column gets its own dtype.
    return pd.DataFrame(
        {name: [value] for name, value in zip(columns, row)},
        columns=columns)


def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    """Predict ratings for the items ``user_id`` has not rated and store them.

    Ratings are read from the ``ratings`` table and from ``data/ratings.csv``,
    a ``NormalPredictor`` is trained on their concatenation, and an estimate
    is computed for every id in the ``items`` table that the user has no
    rating for. Three appends are then made:

    * ``prediction_table``: one row, ``id`` plus ``pred_1`` .. ``pred_10``
      holding the item ids with the highest estimates;
    * ``numeric_predictions`` (fixed table name): the 20 highest estimates in
      long format, tagged with the algorithm label;
    * ``numeric_prediction_table``: one row, ``id`` plus ``num_1`` ..
      ``num_10`` holding the estimates matching ``pred_1`` .. ``pred_10``.

    If fewer than ten candidate items exist, the trailing ``pred_*`` /
    ``num_*`` columns are written as ``None``. If the user has already rated
    every item, nothing is written.

    Args:
        user_id: Id of the user to recommend for; must be convertible to int
            because it is stored in the integer ``id`` column.
        prediction_table: Name of the table receiving the top item ids.
        numeric_prediction_table: Name of the table receiving the matching
            estimated ratings.

    Returns:
        None.
    """
    algo = 'Normal Predictor'
    algorithm = NormalPredictor()
    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None) #pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))
    try:
        # Ratings come from two sources: the database and a CSV file whose
        # item column is called 'movie_id'.
        db_ratings = _clean_ratings(
            pd.read_sql('SELECT * FROM ratings;', con=engine))
        csv_ratings = _clean_ratings(
            pd.read_csv('data/ratings.csv', low_memory=False)
            .rename(columns={'movie_id': 'item_id'}))
        df_ratings = pd.concat([db_ratings, csv_ratings], axis=0)

        reader = Reader(line_format='user item rating', sep=',',
                        rating_scale=(1, 10))
        data = Dataset.load_from_df(df_ratings, reader=reader)
        trainset = data.build_full_trainset()
        # NOTE(review): newer Surprise releases appear to expose this as
        # fit() -- confirm against the pinned version before upgrading.
        algorithm.train(trainset)

        items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
        # A set gives O(1) membership tests instead of scanning an array
        # once per catalogue item.
        already_rated = set(
            df_ratings.loc[df_ratings['user_id'] == user_id, 'item_id']
            .unique())
        prediction_items = [item for item in items.id.unique()
                            if item not in already_rated]
        if not prediction_items:
            # Nothing left to recommend; avoid writing an empty row.
            return

        predicted_ratings = [algorithm.predict(user_id, item).est
                             for item in prediction_items]
        ranked = pd.DataFrame(
            {
                'user_id': user_id,
                'item_id': prediction_items,
                'prediction': predicted_ratings,
            },
            columns=['user_id', 'item_id', 'prediction'],
        ).sort_values('prediction', ascending=False)
        top_ten = ranked.head(n=10)

        # Wide row with the ids of the best-scoring items.
        df_pred = _wide_row(user_id, top_ten['item_id'], 'pred')
        df_pred.to_sql(prediction_table, engine, if_exists='append',
                       index=False)
        session.commit()

        # Long-format top 20. assign/rename return new frames, so the
        # ranked frame is never modified through a slice.
        df_num_ratings = (
            ranked.head(n=20)
            .assign(algorithm=algo)
            .rename(columns={'prediction': 'predicted_rating'}))
        df_num_ratings.to_sql('numeric_predictions', engine,
                              if_exists='append', index=False)
        session.commit()

        # Wide row with the estimates matching pred_1 .. pred_10.
        df_num_ratings_transpose = _wide_row(
            user_id, top_ten['prediction'], 'num')
        df_num_ratings_transpose.to_sql(numeric_prediction_table, engine,
                                        if_exists='append', index=False)
        session.commit()
    finally:
        # The engine and session are created per call, so release them per
        # call as well instead of leaving pooled connections behind.
        session.remove()
        engine.dispose()
#     test_set = pickle.load(f)
# with open('/Shared/bdagroup7/download/training_set.dat', "rb") as f:
#     training_set = pickle.load(f)

# Options for the similarity-based and baseline-based algorithms listed
# below. The active NormalPredictor is constructed without them.
sim_options = {
    'name': 'cosine',
    'min_support': 50,
    'user_based': True,
}
bsl_options = {
    'method': 'sgd',
    'learning_rate': 0.0005,
}

# Candidate algorithms -- keep exactly one of these active.
# algo = SVD()
# algo = KNNBasic(k=10, min_k=8, sim_options=sim_options)
# algo = KNNWithMeans(k=15, min_k=5, sim_options=sim_options)
# algo = CoClustering()
# algo = SVDpp()
algo = NormalPredictor()

# Fit on the training set, then predict every entry of the test set.
algo.train(training_set)
predictions = algo.test(test_set)

# Persist the raw predictions to disk.
with open('/Shared/bdagroup7/download/predictions_normal.dat', "wb") as f:
    pickle.dump(predictions, f)

# TODO: Ensemble

# Report the root-mean-square error of the predictions.
rmse = accuracy.rmse(predictions, verbose=True)
print("RMSE is: ")
print(rmse)