import argparse

from surprise import accuracy, BaselineOnly, CoClustering, Dataset, NMF, Reader, SlopeOne, SVD


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file_path", default="data/train.csv", help="training file path")
    parser.add_argument("--test_file_path", default="data/test.csv", help="testing file path")
    parser.add_argument("--approach", default="SVD", help="Baseline | SVD | SlopeOne | NMF | CoClustering")
    parser.add_argument("--output_ranking_file", default="ranking", help="output ranking for test")
    args = parser.parse_args()

    # Baseline estimates are fit with SGD, with heavier regularization on users than items.
    bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
    options = {
        "Baseline": BaselineOnly(bsl_options=bsl_options, verbose=True),
        "SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
        "SlopeOne": SlopeOne(),
        "NMF": NMF(),
        "CoClustering": CoClustering(),
    }
    algo = options[args.approach]

    # Both files are tab-separated 'user item rating timestamp' lines.
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    train_data = Dataset.load_from_file(args.train_file_path, reader=reader)
    test_data = Dataset.load_from_file(args.test_file_path, reader=reader)
    train_set = train_data.build_full_trainset()
    # The test file is converted into a plain list of (user, item, rating) triples.
    test_set = test_data.build_full_trainset().build_testset()

    print("training...")
    algo.fit(train_set)
    print("testing...")
    predictions = algo.test(test_set)
    accuracy.mae(predictions, verbose=True)
    accuracy.rmse(predictions, verbose=True)

    # Extra credit: ranking output plus top-k metrics (helpers defined elsewhere in the project).
    output_ranking(predictions, args.output_ranking_file + "_" + args.approach + ".out")
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=2.5)
    print("Precision:", sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
    print("F-measure:", f_measure(precisions, recalls))
    print("conversion_rate:", get_conversion_rate(predictions, k=10))
    print("ndcg:", get_ndcg(predictions, k_highest_scores=10))
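# precision_recall_at_k is one of the project helpers called above. A minimal
# sketch, assuming the standard Surprise FAQ recipe: it returns two dicts
# mapping each user id to that user's precision@k and recall@k, which main()
# then averages over users.
from collections import defaultdict


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return per-user precision@k and recall@k dicts."""
    # Group the (estimated, true) rating pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold and est >= threshold)
                              for est, true_r in user_ratings[:k])
        # Guard against empty recommended/relevant sets.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls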
def all_models(_file_path, modelname):
    # Assumes pandas as pd, surprise's Reader/Dataset/model_selection/train_test_split,
    # and the project modules ds (data loading), rec (top-n helper), and metrics
    # (precision/recall at k) are imported at module level.
    data = ds.get_data(_file_path)
    data_surprise = data[['customer_id', 'product_id', 'star_rating']] \
        .rename(columns={'customer_id': 'userID', 'product_id': 'itemID',
                         'star_rating': 'rating'})
    reader = Reader(rating_scale=(1.0, 5.0))
    df_loaded = Dataset.load_from_df(data_surprise, reader)

    results_list = []

    # Dataset-level features recorded alongside the accuracy metrics.
    reviews = data.shape[0]
    n_users = data.customer_id.nunique()
    n_products = data.product_id.nunique()
    mean_rating = data.star_rating.mean()
    rating_std = data.star_rating.std()
    # Percentage of the user-item matrix that is filled in.
    sparsity = reviews * 100 / (n_users * n_products)

    # 5-fold cross-validation for RMSE/MAE.
    results = model_selection.cross_validate(
        select_model(df_loaded, model_selection=modelname), df_loaded,
        measures=['RMSE', 'MAE'], cv=5, verbose=False)

    # A single hold-out split is used for the ranking metrics and recommendations.
    trainset, testset = train_test_split(df_loaded, test_size=.25)
    algo = select_model(df_loaded, model_selection=modelname)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Top-30 recommendations per user (raw ids are strings), flattened into a DataFrame.
    top_n = rec.get_top_n(predictions, n=30)
    print('Recommendations for the user')
    dfo = pd.DataFrame(columns=['UserId', 'ItemId'])
    for i, (uid, user_ratings) in enumerate(top_n.items()):
        dfo.loc[i] = [uid, user_ratings]

    # Join the recommendations back onto the raw data to show product titles.
    merge = dfo.merge(data, left_on='UserId', right_on='customer_id')
    merge1 = merge[['UserId', 'product_title']]
    print(merge1)

    # precision_recall_at_k returns per-user dicts; average them into mean
    # precision/recall at k for this split.
    precisions, recalls = metrics.precision_recall_at_k(predictions, k=5, threshold=4)
    map_k = sum(precisions.values()) / len(precisions)
    mar_k = sum(recalls.values()) / len(recalls)

    # Collect the cross-validation results, ranking metrics, and dataset
    # features into a single row for this algorithm.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    extras = pd.Series({'map_k': map_k, 'mar_k': mar_k,
                        'data': str(_file_path), 'Algorithm': str(modelname),
                        'reviews': reviews, 'n_users': n_users,
                        'n_products': n_products, 'mean_rating': mean_rating,
                        'std_rating': rating_std, 'sparsity': sparsity})
    tmp = pd.concat([tmp, extras])
    results_list.append(tmp)

    results_df = pd.DataFrame(results_list)
    return results_df
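# rec.get_top_n is a project helper. A minimal sketch, assuming the standard
# Surprise FAQ recipe: map each user id to their n highest-estimated
# (item id, estimated rating) pairs.
from collections import defaultdict


def get_top_n(predictions, n=10):
    """Return a dict mapping each user id to their top-n (iid, est) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Sort each user's items by estimated rating and keep the n best.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n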
def all_models(_file_path, _save_path):
    data = ds.get_data(_file_path, 'data/data_subset.csv', 0.99)
    data_surprise = data[['customer_id', 'product_id', 'star_rating']] \
        .rename(columns={'customer_id': 'userID', 'product_id': 'itemID',
                         'star_rating': 'rating'})
    reader = Reader(rating_scale=(1.0, 5.0))
    df_loaded = Dataset.load_from_df(data_surprise, reader)

    results_list = []

    # Dataset-level features recorded alongside the accuracy metrics.
    reviews = data.shape[0]
    n_users = data.customer_id.nunique()
    n_products = data.product_id.nunique()
    mean_rating = data.star_rating.mean()
    rating_std = data.star_rating.std()
    # Percentage of the user-item matrix that is filled in.
    sparsity = reviews * 100 / (n_users * n_products)

    for model in ['user_user', 'item_item', 'matrix_fact']:
        # 5-fold cross-validation for RMSE/MAE.
        results = model_selection.cross_validate(
            select_model(df_loaded, model_selection=model), df_loaded,
            measures=['RMSE', 'MAE'], cv=5, verbose=False)

        # Accumulate precision/recall at k over the same number of folds.
        kf = KFold(n_splits=5)
        map_k, mar_k = 0, 0
        algo = select_model(df_loaded, model_selection=model)
        for trainset, testset in kf.split(df_loaded):
            algo.fit(trainset)
            predictions = algo.test(testset)
            precisions, recalls = metrics.precision_recall_at_k(
                predictions, k=5, threshold=4)
            # precision_recall_at_k returns per-user dicts, so average them
            # before adding this fold's contribution.
            map_k += sum(precisions.values()) / len(precisions)
            mar_k += sum(recalls.values()) / len(recalls)

        # Collect the cross-validation results, ranking metrics, and dataset
        # features into a single row per algorithm.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        extras = pd.Series({'map_k': map_k / 5, 'mar_k': mar_k / 5,
                            'data': str(_file_path), 'Algorithm': str(model),
                            'reviews': reviews, 'n_users': n_users,
                            'n_products': n_products, 'mean_rating': mean_rating,
                            'std_rating': rating_std, 'sparsity': sparsity})
        tmp = pd.concat([tmp, extras])
        results_list.append(tmp)

    print(results_list)
    results_df = pd.DataFrame(results_list)
    # Append this run's rows to the results file, if a path was given.
    if _save_path:
        results_df.to_csv(_save_path, mode='a', index=False)
    return results_df
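# select_model is defined elsewhere in the project. A plausible sketch of the
# mapping it performs, assuming KNNBasic neighborhood models for the
# 'user_user'/'item_item' options and SVD for 'matrix_fact' (the actual
# parameter choices are not shown here):
from surprise import KNNBasic, SVD


def select_model(df_loaded, model_selection='user_user'):
    """Return an unfitted Surprise algorithm for the given model name."""
    # df_loaded is accepted to match the call sites; this sketch does not use it.
    if model_selection == 'user_user':
        return KNNBasic(sim_options={'user_based': True}, verbose=False)
    if model_selection == 'item_item':
        return KNNBasic(sim_options={'user_based': False}, verbose=False)
    if model_selection == 'matrix_fact':
        return SVD()
    raise ValueError('unknown model name: ' + str(model_selection))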
# Streamlit fragment: flatten the top-n recommendations into a DataFrame and
# display each user's recommended product titles (assumes streamlit as st).
dfo = pd.DataFrame(columns=['UserId', 'ItemId'])
for i, (uid, user_ratings) in enumerate(top_n.items()):
    dfo.loc[i] = [uid, user_ratings]
    print('user Id, Item Id')
    print(uid, [iid for (iid, _) in user_ratings])

# Join the recommendations back onto the raw data to show product titles.
merge = dfo.merge(data, left_on='UserId', right_on='customer_id')
merge1 = merge[['product_title']]
st.write(merge1)

# precision_recall_at_k returns per-user dicts; average before accumulating.
precisions, recalls = metrics.precision_recall_at_k(predictions, k=5, threshold=4)
map_k += sum(precisions.values()) / len(precisions)