def svd(user_id, area):
    """Return the best-rated product IDs for a user within one area.

    Loads a previously dumped model from ``./dump``, predicts a rating for
    every product that ``./area.csv`` lists for ``area``, and keeps the five
    highest estimates.

    Args:
        user_id: Raw user identifier; converted to ``str`` before predicting.
        area: Value matched against the ``area`` column of ``./area.csv``.

    Returns:
        list[int]: At most five product IDs, highest estimate first.
    """
    # The model is trained and dumped once elsewhere; here it is only loaded.
    # (Earlier versions built two throwaway SVDpp instances that were
    # overwritten by the loaded model right away.)
    file_name = os.path.expanduser('./dump')
    _, algo = dump.load(file_name)

    # Per the original note, area.csv holds: product ID as used in the
    # training data, area, product ID.
    area_table = pd.read_csv('./area.csv')
    product_ids = area_table[area_table['area'] == area]['productID'].tolist()

    predictions = [
        algo.predict(str(user_id), str(product_id))
        for product_id in product_ids
    ]
    predictions.sort(key=lambda pred: pred.est, reverse=True)

    # Convert only the entries that are actually returned.
    return [int(pred.iid) for pred in predictions[:5]]
def recommend_collaborative_implicit():
    """Serve implicit-feedback recommendations for one user.

    Reads ``customer_id``, ``top`` and ``user_id`` from the request query
    string, rebuilds the user/item matrix from the ``collaboratives``
    collection (documents with ``explicit: False``), loads the dumped model
    for that customer and asks it for ``top`` recommendations.

    Returns:
        A dict with the current user, the suggestions and ``top``; the string
        ``"Database not found"`` when ``getDb()`` is falsy; ``None`` for any
        request method other than GET.
    """
    if request.method != 'GET':
        return None

    db = getDb()
    if not db:
        return "Database not found"

    collaboratives = db.collaboratives
    customer_id = request.args.get('customer_id', default='')
    top = request.args.get('top', default='')
    user_id = request.args.get('user_id', default='')

    query = {'customer': ObjectId(customer_id), 'explicit': False}
    frame = pd.DataFrame(list(collaboratives.find(query)))
    frame = frame[['userId', 'itemId', 'feedBack']]
    frame = frame.rename(columns={'userId': 'user', 'itemId': 'item'})
    frame['user'] = frame['user'].astype("category")
    frame['item'] = frame['item'].astype("category")

    # cat.codes gives each user / item a dense integer id.
    frame['user_id'] = frame['user'].cat.codes
    frame['item_id'] = frame['item'].cat.codes

    # The item-by-user matrix is built but not used further in this function.
    item_user_matrix = sparse.csr_matrix(
        (frame['feedBack'].astype(float),
         (frame['item_id'], frame['user_id'])))
    user_item_matrix = sparse.csr_matrix(
        (frame['feedBack'].astype(float),
         (frame['user_id'], frame['item_id'])))

    # Dense code of the requested user (first matching row).
    user_code = frame[frame['user'] == user_id].iloc[0]['user_id']

    _, model = dump.load('models/' + customer_id +
                         '_collaborative_implicit')
    recommended = model.recommend(user_code,
                                  user_item_matrix,
                                  N=int(top),
                                  filter_already_liked_items=False)
    print('rec', recommended)

    suggestions = []
    for item_code, score in recommended:
        matching_rows = frame.item_id == item_code
        print('err', frame[matching_rows])
        suggestions.append({
            'item_id': str(frame.item.loc[matching_rows].iloc[0]),
            'score': str(score)
        })

    return {
        'data': {
            'current_user': {
                'id': str(user_id),
            },
            'suggestion': suggestions,
            'top': top
        }
    }
def recommend_from_param():
    """Return up to ten recommendations for the user named in the query string.

    Reads ``userid`` from ``request.args``, loads the dumped model from
    ``./SVD_model_couchDB``, scores the full ml-100k trainset and formats the
    entries that ``get_top_n`` yields for that user.

    Returns:
        The ``jsonify``-ed dict with ``source``, ``movieIds`` and
        ``predictedRatings``.
    """
    # TODO: add algorithm as a parameter
    userId = request.args['userid']
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    _, loaded_algo = dump.load(os.path.expanduser('./SVD_model_couchDB'))
    print("file loaded")
    predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
    recs = get_top_n(predictions_loaded_algo, 10)[int(userId)]
    print(recs)

    response_list = {
        'source': {
            "id": "SVD_model"
        },
        'movieIds': [],
        'predictedRatings': []
    }
    # Slice instead of indexing 0..9: a user with fewer than ten entries
    # previously raised IndexError here.
    for rec in recs[:10]:
        response = {'movieId': rec[0], 'predicted rating': rec[1]}
        response_list['predictedRatings'].append(response)
        response_list['movieIds'].append(rec[0])
    print(response_list)
    return jsonify(response_list)
def get_similar_items(item_name, n_similar_items=5):
    """
    Look up the items the dumped model considers closest to a given item.

    Parameters
    ----------
    item_name: raw id of the selected product, matched against the
        ``item_raw_id`` column of the ``item_id_mapping`` table.
    n_similar_items: number of neighbours requested, default=5.

    Returns
    -------
    List of raw item ids for the neighbours.
    """
    _, algo = load("backend/models/similar_items_algo.pkl")

    mapping = pd.read_sql_table("item_id_mapping", engine, index_col="index")
    matching_rows = mapping[mapping["item_raw_id"] == item_name]
    # int() on the selected column: relies on exactly one matching row.
    inner_id = int(matching_rows["item_inner_id"])

    neighbour_inner_ids = algo.get_neighbors(inner_id, k=n_similar_items)
    return [
        algo.trainset.to_raw_iid(neighbour)
        for neighbour in neighbour_inner_ids
    ]
def get_model(cls):
    """Return the model cached on ``cls``, loading it on first use.

    When ``cls.model`` is ``None`` the model is read with ``dump.load`` from
    ``model.pkl`` under ``model_path`` and stored back on ``cls``.

    Returns:
        The cached (or freshly loaded) model object.
    """
    # Identity check on purpose: ``== None`` dispatches to the model's own
    # ``__eq__``, which for array-like objects is element-wise and makes the
    # ``if`` raise instead of answering "is a model cached?".
    if cls.model is None:
        _, cls.model = dump.load(os.path.join(model_path, 'model.pkl'))
    return cls.model
def cf_model_load(file_path):
    """
    Load a previously saved dump.

    :param file_path: location the data was saved to
    :return: whatever ``load`` returns for that file; per the original note
        this is a ``(prediction, algo)`` tuple matching the saved format
    """
    loaded = load(file_name=file_path)
    return loaded
def recommend_from_form():
    """Return up to ``limit`` recommendations for the user given in the form.

    Reads ``userId`` and ``limit`` from ``request.form``, loads the dumped
    model from ``./SVD_model_couchDB``, scores the full ml-100k trainset and
    formats at most ``limit`` of the entries ``get_top_n`` yields for the user.

    Returns:
        The ``jsonify``-ed dict with ``source``, ``movieIds`` and
        ``predictedRatings``.
    """
    userId = request.form['userId']
    limit = int(request.form['limit'])

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    _, loaded_algo = dump.load(os.path.expanduser('./SVD_model_couchDB'))
    print("file loaded")
    predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
    recs = get_top_n(predictions_loaded_algo, limit)[int(userId)]

    response_list = {
        'source': {
            "id": "SVD_model"
        },
        'movieIds': [],
        'predictedRatings': []
    }
    # Emit no more than `limit` entries and never run past the end of recs.
    for position in range(min(limit, len(recs))):
        entry = recs[position]
        response_list['predictedRatings'].append({
            'movieId': entry[0],
            'predicted rating': entry[1]
        })
        response_list['movieIds'].append(entry[0])
        print(response_list)
    return jsonify(response_list)
def _compute_recommendations(trainset,
                             new_user_neighbor_raw_id,
                             n_recommendations=3):
    """
    Rank every mapped item for a neighbour user and keep the best ones.

    Parameters
    ----------
    trainset: data object (not used inside this function).
    new_user_neighbor_raw_id: id of the nearest neighbour, passed as the
        second argument of ``algo.predict``.
    n_recommendations: int, number of recommendations to return, default=3.

    Returns
    -------
    List with the ``n_recommendations`` highest-scoring keys.
    """
    _, algo = dump.load("backend/models/user_predictions_algo.pkl")
    item_id_mapping = pd.read_sql_table("item_id_mapping",
                                        engine,
                                        index_col="index")

    # NOTE(review): the item id is passed first and the neighbour id second;
    # confirm this matches the (uid, iid) order the model was trained with.
    scores = {}
    for raw_item in item_id_mapping["item_raw_id"]:
        prediction = algo.predict(raw_item, new_user_neighbor_raw_id)
        scores[prediction[0]] = prediction[3]

    score_frame = pd.DataFrame(scores.values(), scores.keys())
    ranked = score_frame.sort_values(0, ascending=False)
    return list(ranked.head(n_recommendations).index)
def fetch_recommendations(n_lojas=3, n_ofertas=2):
    """Sample recommended stores and attach their manual offers.

    Stores from ``lojas.csv`` are sampled with the model's predicted rating
    (for ``uid=0``) as weight. For each chosen store, up to ``n_ofertas``
    offers from ``ofertas_manuais.csv`` are sampled, weighted by the
    ``priority`` column.

    Args:
        n_lojas: Number of stores to sample.
        n_ofertas: Maximum number of offers sampled per store.

    Returns:
        pandas.DataFrame: the chosen stores joined with their offers
        (``_lojas`` / ``_ofertas`` suffixes on clashing columns). Stores
        without offers keep their row with empty offer columns.
    """
    model = load('recomendacao_lojas')[1]
    lojas = pd.read_csv('lojas.csv')
    lojas['id'] = lojas.index
    ofertas_manuais = pd.read_csv('ofertas_manuais.csv')

    ratings = [
        model.predict(uid=0, iid=loja_id).est
        for loja_id in lojas['id'].values
    ]
    lojas_escolhidas = lojas.sample(n_lojas,
                                    weights=np.array(ratings),
                                    axis=0)

    ofertas_por_loja = []
    for loja_id in lojas_escolhidas['id'].values:
        ofertas = ofertas_manuais[ofertas_manuais['lojas'] == loja_id]
        if ofertas.shape[0] == 0:
            # TODO: implement the case where a store has no manual offers.
            continue
        if ofertas.shape[0] >= n_ofertas:
            ofertas = ofertas.sample(n_ofertas, weights='priority', axis=0)
        ofertas_por_loja.append(ofertas)

    if ofertas_por_loja:
        ofertas_totais = pd.concat(ofertas_por_loja, axis=0)
    else:
        # No chosen store had offers: join against an empty frame with the
        # same columns instead of failing on ``None.set_index``.
        ofertas_totais = ofertas_manuais.iloc[0:0]

    return lojas_escolhidas.join(ofertas_totais.set_index('lojas'),
                                 lsuffix='_lojas',
                                 rsuffix='_ofertas',
                                 on='id')
def BaselineOnly_alg():
    """Evaluate the dumped 'BaselineOnly' model on the module-level testset.

    Prints the value returned by ``accuracy.rmse`` and dumps the predictions
    together with the algorithm to 'BSL_pred'.
    """
    print('Using BaselineOnly')
    baseline = dump.load('BaselineOnly')[1]
    predictions = baseline.test(testset)
    rmse = accuracy.rmse(predictions)
    print(rmse)
    dump.dump('BSL_pred', predictions, baseline)
def get_user_recommend(ratings, movies, USER, filename):
    """Score candidate movies for one user with a dumped model.

    Args:
        ratings: DataFrame with at least ``userId`` and ``movieId`` columns.
        movies: DataFrame with ``movieId``, ``title``, ``genres`` and
            ``release_date`` columns. It is modified in place: ``genres`` is
            parsed, ``year`` is added, other columns are dropped and
            ``movieId`` becomes the index.
        USER: User id to predict for.
        filename: Path of the dump holding the trained algorithm.

    Returns:
        pandas.DataFrame: candidate movies with an ``Estimate_Score`` column,
        sorted by that score in descending order.
    """
    wanted_columns = ['movieId', 'title', 'genres', 'year']

    movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(
        lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
    # NOTE(review): ``x != np.nan`` is always True, so unparseable dates end
    # up as the string 'NaT' rather than NaN. Left unchanged because the
    # ``dropna()`` below would otherwise start discarding those rows.
    movies['year'] = (pd.to_datetime(
        movies['release_date'], errors='coerce').apply(
            lambda x: str(x).split('-')[0] if x != np.nan else np.nan))
    # ``axis`` is passed by keyword: the positional form was removed in
    # pandas 2.0.
    movies.drop(movies.columns.difference(wanted_columns),
                axis=1,
                inplace=True)
    movies.set_index('movieId', inplace=True)

    user_ratings = ratings[(ratings['userId'] == USER)]
    user_ratings = user_ratings.set_index('movieId')
    user_ratings = user_ratings.join(movies)
    user_ratings.drop(user_ratings.columns.difference(wanted_columns),
                      axis=1,
                      inplace=True)

    movies_cut = movies[~movies.isin(user_ratings)].dropna()

    _, svd = dump.load(filename)
    user_predict = movies_cut.copy()
    user_predict = user_predict.reset_index()
    user_predict['Estimate_Score'] = user_predict['movieId'].apply(
        lambda x: svd.predict(USER, x).est)
    user_predict = user_predict.sort_values('Estimate_Score', ascending=False)
    return user_predict
def test_dump():
    """Round-trip predictions and an algorithm through dump/load.

    Fits a BaselineOnly model, dumps it with its predictions, then checks that
    both the reloaded predictions and the predictions recomputed by the
    reloaded algorithm equal the originals."""
    random.seed(0)

    here = os.path.dirname(__file__)
    train_file = os.path.join(here, './u1_ml100k_train')
    test_file = os.path.join(here, './u1_ml100k_test')
    folds = [(train_file, test_file)]
    data = Dataset.load_from_folds(folds, Reader('ml-100k'))

    splitter = PredefinedKFold()
    trainset, testset = next(splitter.split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    expected = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, expected, algo)
        reloaded_predictions, reloaded_algo = dump.load(tmp_file.name)

        recomputed = reloaded_algo.test(testset)
        assert expected == reloaded_predictions
        assert expected == recomputed
def load_prev_colab_results(self, user_id):
    """Predict ratings for routes the user has not rated and store the top 20.

    Loads the tuned model from 'SVD_tuned.p', predicts an estimate for every
    route in ``self.df20`` that has no row for ``user_id``, and stores the
    best (at most 20) in ``self.df_top_climbs_mf`` with the columns
    'predicted rating' and 'route id', best first.

    Args:
        user_id: Id matched against ``self.df20['user_id']``.
    """
    (_, algo_tuned) = dump.load('SVD_tuned.p')

    all_route_ids = self.df20['route_id'].unique()
    # Routes this user already has rows for. The column must be 'route_id':
    # selecting 'user_id' here meant nothing was ever excluded.
    rated_route_ids = self.df20.loc[self.df20['user_id'] == user_id,
                                    'route_id']
    iids_to_pred = np.setdiff1d(all_route_ids, rated_route_ids)

    # The third field is a placeholder rating; only ``est`` is read below.
    testset = [[user_id, iid, 2] for iid in iids_to_pred]
    predictions_tuned = algo_tuned.test(testset)
    pred_ratings_tuned = np.array([pred.est for pred in predictions_tuned])

    # Cap at the number of candidates: argpartition rejects a kth outside
    # the array, which happened whenever fewer than 20 routes remained.
    top_n = min(20, pred_ratings_tuned.size)
    if top_n:
        i_max = np.argpartition(pred_ratings_tuned, -top_n)[-top_n:]
        i_max = i_max[np.argsort(-pred_ratings_tuned[i_max])]
    else:
        i_max = np.array([], dtype=int)

    top_route_ids = iids_to_pred[i_max]
    self.df_top_climbs_mf = pd.DataFrame(top_route_ids,
                                         pred_ratings_tuned[i_max])
    self.df_top_climbs_mf = self.df_top_climbs_mf.reset_index()
    self.df_top_climbs_mf.columns = ['predicted rating', 'route id']
def get_svd_recommender(df, test_size=0.25, path="", exists=False):
    """
    Build and train an SVD recommender, or load a saved one.

    :param df: a dataframe with ``user_id``, ``beer_id`` and ``user_score`` columns
    :param test_size: the fraction of samples reserved for testing
    :param path: the path to an existing svd recommender that was saved to a file
    :param exists: whether to load the algo from ``path`` instead of training
    :return: when ``exists`` is true, only the loaded algorithm; otherwise the
        trained recommender, its list of predictions and its root mean square error
    """
    if exists:
        saved = dump.load(path)
        return saved[1]

    # Reader tells surprise the rating scale of the dataframe.
    reader = Reader(rating_scale=(1, 5))
    # Columns must be in this order: user, item, rating.
    rating_columns = df[['user_id', 'beer_id', 'user_score']]
    data = Dataset.load_from_df(rating_columns, reader)
    trainset, testset = train_test_split(data, test_size=test_size)

    algo = SVD()
    algo.fit(trainset)

    # test() yields prediction objects carrying est (the prediction) and
    # r_ui (the true rating), among other attributes.
    predictions = algo.test(testset)

    # rmse below 1 is considered low
    rmse = accuracy.rmse(predictions)
    # The MAE value is not returned; the call itself is kept.
    accuracy.mae(predictions)
    return algo, predictions, rmse
def svd4_user_movie_rate():
    """
    Return the SVD model trained on user/movie rating data.

    If a saved model exists at ``ALGO_RESULT_PATH + SVD_RESULT_USER2MOVIE`` it
    is loaded; otherwise the model is trained on the full dataset, saved to
    that path and returned.

    Returns
    -------
    The surprise algorithm object.
    """
    model_file = ALGO_RESULT_PATH + SVD_RESULT_USER2MOVIE
    if os.path.exists(model_file):
        # dump.load returns (predictions, algo). The dump written below
        # stores only the algo, so index 0 is None; take the algo.
        _, algo = dump.load(model_file)
        return algo

    # Read the data.
    reader = Reader(line_format="user item rating")
    data = Dataset.load_from_df(
        dataprocess.create_user_movie_rate().loc[:, ["user", "movie", "rate"]],
        reader=reader)
    trainset = data.build_full_trainset()

    # Configure and initialise the model.
    algo = SVD(n_epochs=N_EPOCHS_SVD, lr_all=LR_ALL_SVD, verbose=True)
    algo.fit(trainset)
    dump.dump(file_name=model_file, algo=algo, verbose=True)
    return algo
def knn_user_movie_rate():
    """
    Return the KNNBaseline model trained on user/movie rating data.

    If a saved model exists at ``ALGO_RESULT_PATH + KNN_RESULT_USER2MOVIE`` it
    is loaded; otherwise the model is trained on the full dataset, saved to
    that path and returned.

    Returns
    -------
    The surprise KNNBaseline object.
    """
    model_file = ALGO_RESULT_PATH + KNN_RESULT_USER2MOVIE
    if os.path.exists(model_file):
        # dump.load returns (predictions, algo). The dump written below
        # stores only the algo, so index 0 is None; take the algo.
        _, algo = dump.load(model_file)
        return algo

    # Read the data.
    reader = Reader(line_format="user item rating", sep=",")
    data = Dataset.load_from_df(
        dataprocess.create_user_movie_rate().loc[:, ["user", "movie", "rate"]],
        reader=reader)
    trainset = data.build_full_trainset()

    # Configure and initialise the model: item-based Pearson similarity.
    sim_options = {'name': 'pearson', "user_based": False}
    algo = KNNBaseline(k=10, sim_options=sim_options)
    algo.fit(trainset)
    dump.dump(file_name=model_file, algo=algo, verbose=True)
    return algo
def predict_new_user(newUser, pathToPivotData, pathToModel):
    """Recommend movies for a new user from the users closest to them.

    The new user is added to the pivoted rating table under the index
    ``'-99'``. The 20 existing users with the smallest Euclidean distance are
    selected, the dumped model predicts every movie column for each of them,
    and the 20 movies with the highest mean estimate are returned.

    Args:
        newUser: Data for the new user, passed to ``pd.DataFrame`` with index
            ``['-99']``; missing movies are filled with 0.
        pathToPivotData: CSV with the pivoted ratings (first column is the
            index).
        pathToModel: Path of the dump holding the trained algorithm.

    Returns:
        dict: movie -> mean predicted rating for the 20 best movies.
    """
    # Load the pivoted data and add the new user as row '-99'.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    dfPivot = pd.read_csv(pathToPivotData, index_col=0)
    dfPivot = pd.concat([dfPivot, pd.DataFrame(newUser, index=['-99'])])
    dfPivot = dfPivot.fillna(0)

    # Distance from the new user to every existing user. The new user's own
    # row is left out explicitly so that it can never be picked as a
    # neighbour, even when an existing user is at distance 0 as well.
    new_user_row = dfPivot.loc['-99']
    userDistance = {
        user: spatial.distance.euclidean(new_user_row, dfPivot.loc[user])
        for user in dfPivot.index
        if user != '-99'
    }

    # Keep the n closest users.
    n = 20
    similarUsers = sorted(userDistance.items(), key=lambda item: item[1])[:n]
    similarUsersKeys = [user for user, _ in similarUsers]

    # Load the recommender.
    _, loaded_algo = dump.load(pathToModel)

    # Predict every movie for every similar user; index 3 of a prediction is
    # the estimated rating.
    preds = {}
    for user in similarUsersKeys:
        preds[user] = {}
        for movie in list(dfPivot):
            preds[user][movie] = loaded_algo.predict(uid=str(user),
                                                     iid=str(movie))[3]
    predsDf = pd.DataFrame.from_dict(preds)

    # Average over the similar users and keep the best 20 movies.
    # TODO: add distance as weighting
    recommendedMovies = predsDf.mean(axis=1).sort_values(
        ascending=False)[:20].to_dict()
    print(recommendedMovies)
    return recommendedMovies
def test_dump_nothing():
    """Check that dumping with no payload loads back as two None values."""
    with tempfile.NamedTemporaryFile() as tmp_file:
        path = tmp_file.name
        dump.dump(path)
        loaded = dump.load(path)
        predictions, algo = loaded
        assert predictions is None
        assert algo is None
def test_dump_nothing():
    """Check that dumping with no payload loads back as two None values."""
    with tempfile.NamedTemporaryFile() as tmp_file:
        path = tmp_file.name
        dump.dump(path)
        loaded = dump.load(path)
        predictions, algo = loaded
        assert predictions is None
        assert algo is None
def retrieve():
    """Return recommended movie ids filtered by the request's id lists.

    Reads ``userId``, ``candidateIds``, ``excludeIds`` and ``limit`` from the
    JSON body, loads the dumped model from ``./SVD_model_couchDB``, scores the
    full ml-100k trainset and walks the user's entries from ``get_all_recs``
    in order, keeping ids that are in ``candidateIds`` and not in
    ``excludeIds`` until ``limit`` ids are collected.

    Returns:
        The ``jsonify``-ed dict with ``source`` and ``movieIds``.
    """
    print("request received!")
    payload = request.get_json()
    userId = payload.get("userId")
    candidateIds = payload.get("candidateIds")
    excludeIds = payload.get("excludeIds")
    limit = payload.get("limit")

    # Load data and model, then compute the recommendations.
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    _, loaded_algo = dump.load(os.path.expanduser('./SVD_model_couchDB'))
    print("file loaded")
    predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
    recs = get_all_recs(predictions_loaded_algo)[int(userId)]

    movie_ids = []
    response_list = {'source': {"id": "SVD_model"}, 'movieIds': movie_ids}

    # Format recommendations.
    for rec in recs:
        if len(movie_ids) >= int(limit):
            break
        movie_id = rec[0]
        if movie_id in candidateIds and movie_id not in excludeIds:
            movie_ids.append(movie_id)
    return jsonify(response_list)
def get_model():
    """Return the algorithm dumped at ``model_dump_path``, training if absent.

    When no file exists at that path, ``train(path)`` is called and its
    result is returned instead.
    """
    path = model_dump_path
    if not os.path.isfile(path):
        return train(path)
    return dump.load(path)[1]
def recommend_collaborative_explicit():
    """Serve explicit-feedback recommendations for one user.

    Reads ``customer_id``, ``top`` and ``user_id`` from the request query
    string, collects the items the user has no row for in the
    ``collaboratives`` collection (documents with ``explicit: True``),
    predicts them with the customer's dumped model and returns the ``top``
    highest estimates.

    Returns:
        A dict with the current user, the suggestions and ``top``; the string
        ``"Database not found"`` when ``getDb()`` is falsy; ``"Error in ..."``
        when any exception is raised; ``None`` for any request method other
        than GET.
    """
    if request.method != 'GET':
        return None

    try:
        db = getDb()
        if not db:
            return "Database not found"

        collaboratives = db.collaboratives
        customer_id = request.args.get('customer_id', default='')
        top = request.args.get('top', default='')
        user_id = request.args.get('user_id', default='')

        query = {'customer': ObjectId(customer_id), 'explicit': True}
        frame = pd.DataFrame(list(collaboratives.find(query)))
        frame = frame[['userId', 'itemId', 'feedBack']]
        frame = frame.rename(columns={
            'userId': 'user',
            'itemId': 'item',
            'feedBack': 'rating'
        })

        # All product ids, minus the ones this user already has rows for.
        all_items = frame['item'].unique()
        items_of_user = frame.loc[frame['user'] == int(user_id), 'item']
        iids_to_pred = np.setdiff1d(all_items, items_of_user)

        # 4. is a placeholder rating; only the estimate is read below.
        testset = [[user_id, iid, 4.] for iid in iids_to_pred]
        _, loaded_algo = dump.load('models/' + customer_id +
                                   '_collaborative_explicit')
        predictions = loaded_algo.test(testset)
        pred_ratings = np.array([pred.est for pred in predictions])

        # Indices of the `top` highest estimates, best first.
        best_indices = pred_ratings.argsort()[-int(top):][::-1]
        suggestions = [{
            'itemId': iids_to_pred[idx],
            'prediction_rating': pred_ratings[idx]
        } for idx in best_indices]

        return {
            'data': {
                'current_user': {
                    'id': user_id,
                },
                'suggestion': suggestions,
                'top': top
            }
        }
    except Exception as e:
        return "Error in " + str(e)
def SVD_alg():
    """Evaluate the dumped 'SVD' model on the module-level testset.

    Prints the value returned by ``accuracy.rmse`` and dumps the predictions
    together with the algorithm to 'SVD_pred'.
    """
    print('Using SVD')
    svd_model = dump.load('SVD')[1]
    predictions = svd_model.test(testset)
    rmse = accuracy.rmse(predictions)
    print(rmse)
    dump.dump('SVD_pred', predictions, svd_model)
def get_serialize_algo(self, score_min=50):
    """
    Load the serialized algo for a given minimum score.

    :param score_min: value embedded in the dump file name
        (``./score<score_min>.dump``)
    :return: the algorithm stored in that dump
    """
    dump_path = os.path.expanduser(f"./score{score_min!s}.dump")
    return dump.load(file_name=dump_path)[1]
def __init__(self, model_path):
    """Init RecSys

    Args:
        model_path (str): Path of the dump holding predictions and model
    """
    # The dump yields the predictions first and the model second.
    loaded = dump.load(model_path)
    self.predictions, self.model = loaded
    self.avg_recall = self.avg_precision = 0
def get_oracle_labels_for_test_set(dataset_name, switch_ensemble):
    """Build per-user labels from each recommender's NDCG on the test set.

    For every entry of ``RS`` whose dump is usable, the dumped algorithm
    predicts the ``<dataset_name>_test_ensembles.csv`` ratings and an NDCG
    score is computed per user. The scores of all recommenders, together with
    user features from the train split, are handed to
    ``create_best_RS_userwise_dataset``.

    Args:
        dataset_name: Prefix of the files under ``./created_data/``.
        switch_ensemble: Not used inside this function.

    Returns:
        DataFrame with ``userId`` and ``label`` columns, sorted by ``userId``.
    """
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data_prefix = "./created_data/" + dataset_name
    train = Dataset.load_from_file(data_prefix + "_train.csv", reader=reader)
    test_ensembles = Dataset.load_from_file(data_prefix +
                                            "_test_ensembles.csv",
                                            reader=reader)

    train_ratings = pd.DataFrame(
        train.raw_ratings,
        columns=["userId", "movieId", "rating", "timestamp"])
    uf = UserFeatures(train_ratings, False)
    all_features_df = uf.get_all_user_features()

    per_rs_scores = []
    for rs in RS:
        rs_name = rs["name"]
        # Memory error for 16GB machine or float division error for lastfm
        if "KNN" in rs_name and dataset_name in datasets_knn_mem_error:
            continue

        file_name = os.path.expanduser(
            './created_data/trained_RS/dump_file_' + dataset_name + '_' +
            rs_name)
        _, loaded_algo = dump.load(file_name)

        testset = test_ensembles.build_full_trainset().build_testset()
        predictions = loaded_algo.test(testset)
        predictions_df = pd.DataFrame(
            predictions,
            columns=["userId", "movieId", "rating", "prediction", "details"])
        predictions_with_relevance = remove_dataset_bias(predictions_df,
                                                         has_ns=True)

        scores = predictions_with_relevance.groupby("userId").agg(
            lambda r, f=calculate_ndcg_score: f(r, "prediction"))
        # Only the first aggregated column is kept, renamed to "NDCG".
        first_column = scores.columns[0]
        scores = scores[[first_column]].rename(
            index=str, columns={first_column: "NDCG"}).reset_index()
        scores["RS"] = rs_name
        per_rs_scores.append(scores)

    all_avg_errors = pd.concat(per_rs_scores).reset_index()
    assert not all_avg_errors.isnull().values.any()

    Xy = create_best_RS_userwise_dataset(all_avg_errors, all_features_df)
    if "amazon" not in dataset_name:
        Xy["userId"] = Xy["userId"].astype(int)
    return Xy.sort_values("userId")[["userId", "label"]]
def load_recommender(self, recommender):
    '''
    Load the pickled model for the requested recommender.

    Sets ``self.model_type`` and ``self.model`` and then calls
    ``self.set_data`` with the model. Any value other than 'knn' or 'svd'
    leaves the model as ``None``.

    Param
    ------
    recommender: str - accepts 'knn' or 'svd'
    '''
    if recommender == 'knn':
        pickle_name = 'knn_recommender.pickle'
    elif recommender == 'svd':
        pickle_name = 'mf_recommender.pickle'
    else:
        pickle_name = None

    model = None
    if pickle_name is not None:
        _, model = load(RECOMMENDERS_DIR + pickle_name)

    self.model_type = recommender
    self.model = model
    self.set_data(self.model)
def estimate():
    """Write a short evaluation report for the dumped SVD predictions.

    Loads 'models/dump_SVD_test', computes precision and recall at ``TOP_K``
    with ``THRESHOLD``, and writes the head of the prediction table plus the
    averaged precision and recall to 'estimation.txt'.
    """
    predictions_svd, algo_svd = dump.load('models/dump_SVD_test')
    precisions, recalls = precision_recall_at_k(predictions_svd,
                                                k=TOP_K,
                                                threshold=THRESHOLD)

    column_names = ['uid', 'iid', 'rui', 'est', 'details']
    df_svd = pd.DataFrame(predictions_svd, columns=column_names)
    df_svd['err'] = abs(df_svd.est - df_svd.rui)

    with open("estimation.txt", "w+") as report:
        report.write(f"SVD\n{df_svd.head()}\n")
        # Precision and recall are averaged over all users.
        report.write(
            f"Precision: {sum(precisions.values()) / len(precisions)}\n")
        report.write(f"Recall: {sum(recalls.values()) / len(recalls)}\n")
def load_model(path=None):
    '''Load the main machine learning model, logging before and after.

    When ``path`` is None, ``settings.MODEL_PATH`` is used.
    '''
    app.logger.debug('Loading model...')
    model_file = settings.MODEL_PATH if path is None else path
    model = dump.load(model_file)[1]
    app.logger.debug('Model has been loaded successfully')
    return model
def init():
    """Load the deployed model from file into the global ``model``."""
    global model
    # A single model is assumed. AZUREML_MODEL_DIR is an environment variable
    # created during deployment; it holds the path to the model folder
    # (./azureml-models/$MODEL_NAME/$VERSION), so the second-to-last path
    # component is what gets passed to Model.get_model_path.
    model_dir_component = os.getenv("AZUREML_MODEL_DIR").split('/')[-2]
    model_path = Model.get_model_path(model_dir_component)
    model = dump.load(model_path)[1]
def train_SVD():
    """Train SVD on the full ml-100k data, dump it, and reload it.

    The reloaded algorithm is run once over the trainset-derived testset;
    nothing is returned.
    """
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    algo = SVD()
    algo.fit(trainset)

    # Dump the algorithm.
    dump_path = os.path.expanduser('./SVD_model')
    dump.dump(dump_path, algo=algo)
    print("file dumped")

    # Load it back and run it over the same data.
    loaded_algo = dump.load('./SVD_model')[1]
    print("file loaded")
    loaded_algo.test(trainset.build_testset())
def test_dump(u1_ml100k):
    """Round-trip predictions and an algorithm through dump/load.

    Fits a BaselineOnly model, dumps it with its predictions, then checks that
    both the reloaded predictions and the predictions recomputed by the
    reloaded algorithm equal the originals."""
    random.seed(0)

    folds = PredefinedKFold().split(u1_ml100k)
    trainset, testset = next(folds)

    algo = BaselineOnly()
    algo.fit(trainset)
    expected = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, expected, algo)
        reloaded_predictions, reloaded_algo = dump.load(tmp_file.name)

        recomputed = reloaded_algo.test(testset)
        assert expected == reloaded_predictions
        assert expected == recomputed
then reloaded and can be used again for making predictions. """ from __future__ import (absolute_import, division, print_function, unicode_literals) import os from surprise import SVD from surprise import Dataset from surprise import dump data = Dataset.load_builtin('ml-100k') trainset = data.build_full_trainset() algo = SVD() algo.fit(trainset) # Compute predictions of the 'original' algorithm. predictions = algo.test(trainset.build_testset()) # Dump algorithm and reload it. file_name = os.path.expanduser('~/dump_file') dump.dump(file_name, algo=algo) _, loaded_algo = dump.load(file_name) # We now ensure that the algo is still the same by checking the predictions. predictions_loaded_algo = loaded_algo.test(trainset.build_testset()) assert predictions == predictions_loaded_algo print('Predictions are the same')
def load(cls, filename):
    """Build an instance of ``cls`` from the algorithm stored in a dump file.

    NOTE(review): the inner ``load`` call is expected to resolve to a
    module-level ``load`` function rather than to this method; confirm
    against the file's imports.
    """
    algo = load(filename)[1]
    return cls(algo)