def evaluate(user_model: UserModel, item_model: ItemModel, valid_data: List[TrainingSample]):
    """Score a user/item embedding pair on held-out data.

    Encodes every validation sample's row through ``user_model`` (on GPU),
    takes the item embedding table from ``item_model``, retrieves the 30
    nearest items per user by cosine distance, and scores the retrieval.

    Args:
        user_model: maps a sparse user/history matrix to user vectors.
        item_model: its ``_embeds`` weight matrix is used as item vectors.
        valid_data: samples carrying ``.row`` (sparse history row) and
            ``.target_items`` (set of ground-truth item indices).
            NOTE(review): assumes every ``target_items`` set is non-empty —
            an empty set would raise ZeroDivisionError; confirm upstream.

    Returns:
        Tuple ``(mean coverage@30, mean normalized AP@30)`` over samples.
    """
    from sklearn.neighbors import NearestNeighbors

    user_vectors = (
        user_model(
            coo_to_pytorch_sparse(
                sp.vstack([sample.row for sample in valid_data])
            ).cuda()
        )
        .data.cpu()
        .numpy()
    )
    item_vectors = item_model._embeds.weight.data.cpu().numpy()

    knn = NearestNeighbors(n_neighbors=30, metric="cosine")
    knn.fit(item_vectors)
    preds = knn.kneighbors(user_vectors, n_neighbors=30, return_distance=False)

    # Single pass over the predictions: the original iterated `preds` twice
    # with identical loops, recomputing `gt` for each metric.
    coverage = []
    ap = []
    for sample, recommended in zip(valid_data, preds):
        gt = sample.target_items
        coverage.append(len(gt.intersection(recommended)) / len(gt))
        ap.append(normalized_average_precision(gt, recommended))
    return np.mean(coverage), np.mean(ap)
def evalute_queries(queryset_file, max_records=1000):
    """Score ``PREDICTOR`` against a TSV/JSONL query set.

    Each line is either a single JSON object whose ``"target"`` list holds
    the next transaction, or two tab-separated JSON objects
    ``(query, next_transaction)``. The transaction history is sorted by
    ``datetime`` before prediction.

    Args:
        queryset_file: path to the query-set file.
        max_records: maximum number of lines to evaluate. Bug fixed here:
            the original broke only AFTER scoring line ``i == max_records``,
            so ``max_records + 1`` records were processed.

    Returns:
        Mean normalized average precision over the evaluated queries.
    """
    check_scores = []
    with open(queryset_file) as fin:
        for i, line in enumerate(tqdm(fin)):
            if i >= max_records:
                break
            splitted = line.strip().split("\t")
            if len(splitted) == 1:
                # Single-column format: target is embedded in the query JSON.
                query_data = json.loads(splitted[0])
                next_transaction = query_data["target"][0]
            else:
                query_data, next_transaction = map(json.loads, splitted)
                query_data["target"] = [next_transaction]
            query_data["transaction_history"] = sorted(
                query_data["transaction_history"], key=lambda x: x["datetime"]
            )
            recommended_items = PREDICTOR.predict(query_data, PREDICTOR.lgb_model)
            gt_items = query_data["target"][0]["product_ids"]
            nap = normalized_average_precision(gt_items, recommended_items)
            check_scores.append(nap)
    return np.mean(check_scores)
# NOTE(review): this chunk is truncated at the start — `params_rec)` closes a
# call whose opening is outside this view; do not reformat/refactor until the
# preceding lines are visible.
params_rec)
# Fit the cosine baseline and the full ranking model on the same training set.
cosine_model.fit_recommender(df_train_rec)
retailHeroModel = RetailHeroRecommender(df_products, params_rec, params_catboost)
retailHeroModel.train_model(df_train_rec, df_train_ranker, val_dict)
logger.debug('Succsessfully trained Model')
logger.debug('Saving...')
# Persist the trained recommender next to the data directory.
with open(os.path.join(DIR, 'model.pkl'), 'wb') as f:
    joblib.dump(retailHeroModel, f)
logger.debug('Validation...')
# Per-client validation: rebuild each client's query from raw purchases,
# recommend with both models, and score against val_dict ground truth.
scores = []
cosine_scores = []
for (cid, ds) in tqdm(df_test_clients.groupby('client_id')):
    query = get_json_rows_from_purchases(ds, cid)
    products_hist_counter, histdata_products = extract_data_from_json_dict(
        query)
    recs = retailHeroModel.recommend(products_hist_counter, histdata_products)
    cosine_recs = cosine_model.recommend(products_hist_counter)
    scores.append(normalized_average_precision(val_dict[cid], recs))
    cosine_scores.append(
        normalized_average_precision(val_dict[cid], cosine_recs))
# Report mean NAP@30 for the ranker vs. the cosine baseline.
logger.debug(f'RetailHeroRecommender MNAP@30: {np.mean(scores)}')
logger.debug(f'CosineRecommenderModel MNAP@30: {np.mean(cosine_scores)}')
# Score the held-out pool with the trained LightGBM ranker, save the model
# with a score-stamped filename, then build the serving-side predictor.
drop_cols = ["client_id", "target", "lgb_scores", "query_id"]
lgb_scores = gbm.predict(df_gbm_test.drop(drop_cols, axis=1, errors="ignore"))
df_gbm_test["lgb_scores"] = lgb_scores
# Per client: take the 30 highest-scored candidate idx values, preserving order.
lgb_ranked = (
    df_gbm_test.groupby("client_id")[["idx", "lgb_scores"]]
    .apply(
        lambda x: x.sort_values("lgb_scores", ascending=False)[:30]["idx"].tolist()
    )
    .to_dict()
)
gt_test = {item["client_id"]: item["products"] for item in gt_all_rec_test}
scores = []
for client_id, recommended_idx in lgb_ranked.items():
    ap = normalized_average_precision(gt_test[client_id], recommended_idx)
    scores.append(ap)
model_score = np.mean(scores)
logger.info(f"Test score: {model_score}")
# Encode hyper-parameters (except the noisy "metric" entry) into the filename
# so saved models are self-describing.
params_str = "__".join(
    "_".join(map(str, item)) for item in gbm.params.items() if item[0] != "metric"
)
model_filename = f"lgbm_model__pool_{N_POOL}__{params_str}__{model_score:.6f}.txt"
model_path = str(ASSETS_DIR / model_filename)
gbm.save_model(model_path)
logger.info(f"Model was saved to {model_path}")
# Check predictor
# NOTE(review): chunk is truncated here — the GBMPredictor(...) call is never
# closed in this view; remaining arguments follow outside this chunk.
PREDICTOR = GBMPredictor(
    lgbm_model_path=str(ASSETS_DIR / model_filename),
# Benchmark the saved recommender over sample queries: per-query latency and
# NAP@30, then aggregate stats.
metadata_products = pd.read_csv(os.path.join(DIR, 'products.csv'))

with open(os.path.join(DIR, 'model.pkl'), 'rb') as f:
    model = joblib.load(f)

time_benchmark = []  # fixed typo: was `time_bechmark`
scores = []

if __name__ == '__main__':
    for i in range(len(sample_queries)):
        query = json.loads(sample_queries.at[i, 'query'])
        actual = json.loads(sample_queries.at[i, 'next_trans'])['product_ids']
        # Queries with no history cannot be scored — skip them entirely.
        if len(query['transaction_history']) == 0:
            continue
        start_time = time.time()
        products_hist_counter, histdata_products = extract_data_from_json_dict(query)
        recs = model.recommend(products_hist_counter, metadata_products, histdata_products)
        finish_time = time.time()
        score = normalized_average_precision(actual, recs)
        scores.append(score)
        logger.debug(f'Query {i}, NAP@30: {score}, Num transactions: {len(query["transaction_history"])}')
        time_benchmark.append(finish_time - start_time)
    # Bug fix: report the number of queries actually processed — the original
    # logged len(sample_queries), which overcounts when empty-history queries
    # are skipped above.
    logger.debug(f'Ran {len(time_benchmark)} queries, Average elapsed time: {np.mean(time_benchmark)} ± {np.std(time_benchmark)}')
    logger.debug(f'MNAP@30: {np.mean(scores)}')
# Fit the collaborative-filtering model, persist it, and estimate quality on
# the last shard.
# model = implicit.nearest_neighbours.TFIDFRecommender(K=100)
# ALS should be trained with normalize = False
# model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12)
model.fit(train_mat.T)  # implicit expects items x users, hence the transpose

out_dir = cfg.ASSETS_DIR
os.makedirs(out_dir, exist_ok=True)
print(f"Dump model to {out_dir}")
# Bug fix: the original `pickle.dump(model, open(...))` leaked the file
# handle; use a context manager so the file is flushed and closed.
with open(out_dir / "model.pkl", "wb") as fout:
    pickle.dump(model, fout)

print("Estimate quality...")
scores = []
# Same leak fix for the shard file: open it under `with`.
with open(get_shard_path(cfg.NUM_SHARDS - 1)) as fin:
    for s in tqdm(fin):
        js = json.loads(s)
        row = make_coo_row(js["transaction_history"], product_encoder).tocsr()
        raw_recs = model.recommend(
            userid=0,
            user_items=row,
            N=30,
            filter_already_liked_items=False,
            recalculate_user=True,
        )
        recommended_items = product_encoder.toPid(
            [idx for (idx, score) in raw_recs])
        gt_items = js["target"][0]["product_ids"]
        nap = normalized_average_precision(gt_items, recommended_items)
        scores.append(nap)
print("nap: {}".format(np.mean(scores)))