# Example #1
def evaluate(user_model: UserModel, item_model: ItemModel,
             valid_data: List[TrainingSample]):
    """Evaluate user/item embedding models on a validation set.

    Encodes every validation sample with ``user_model``, retrieves the 30
    nearest item embeddings by cosine distance, and scores them against
    each sample's ``target_items``.

    :param user_model: model mapping a sparse user/history matrix to user
        vectors (run on GPU — ``.cuda()`` below).
    :param item_model: model whose ``_embeds`` weight matrix holds one row
        per item (private attribute access — presumably an embedding layer).
    :param valid_data: samples exposing ``.row`` (a scipy sparse row) and
        ``.target_items`` (a set-like of ground-truth item indices).
    :returns: tuple ``(mean coverage@30, mean NAP@30)``.
    """
    from sklearn.neighbors import NearestNeighbors

    user_vectors = (user_model(
        coo_to_pytorch_sparse(sp.vstack([sample.row for sample in valid_data
                                         ])).cuda()).data.cpu().numpy())
    item_vectors = item_model._embeds.weight.data.cpu().numpy()
    knn = NearestNeighbors(n_neighbors=30, metric="cosine")
    knn.fit(item_vectors)
    preds = knn.kneighbors(user_vectors, n_neighbors=30, return_distance=False)

    # Single pass computing both metrics — the original iterated the same
    # predictions twice in two identical loops.
    coverage = []
    ap = []
    for sample, recommended in zip(valid_data, preds):
        gt = sample.target_items
        denom = len(gt)
        # Guard: an empty ground-truth set previously raised ZeroDivisionError.
        coverage.append(len(gt.intersection(recommended)) / denom if denom else 0.0)
        ap.append(normalized_average_precision(gt, recommended))

    return np.mean(coverage), np.mean(ap)
def evalute_queries(queryset_file, max_records=1000):
    """Score PREDICTOR's recommendations over a saved query set.

    NOTE(review): the name keeps the historical "evalute" typo because
    callers elsewhere may reference it.

    Each line of ``queryset_file`` is either one JSON object whose
    ``"target"`` is embedded, or two tab-separated JSON objects
    (query, next transaction).

    :param queryset_file: path to the query-set file (UTF-8 text).
    :param max_records: maximum number of lines to score.
    :returns: mean normalized average precision over the scored lines.
    """
    check_scores = []
    with open(queryset_file, encoding="utf-8") as fin:
        for i, line in enumerate(tqdm(fin)):
            # Fix: the original broke only when i == max_records, which
            # processed max_records + 1 lines (i is zero-based).
            if i >= max_records:
                break
            splitted = line.strip().split("\t")
            if len(splitted) == 1:
                # Single-column format: the target is already in the query.
                query_data = json.loads(splitted[0])
                next_transaction = query_data["target"][0]
            else:
                query_data, next_transaction = map(json.loads, splitted)
                query_data["target"] = [next_transaction]

            # The predictor expects the history in chronological order.
            query_data["transaction_history"] = sorted(
                query_data["transaction_history"], key=lambda x: x["datetime"]
            )
            recommended_items = PREDICTOR.predict(query_data, PREDICTOR.lgb_model)

            gt_items = query_data["target"][0]["product_ids"]
            nap = normalized_average_precision(gt_items, recommended_items)
            check_scores.append(nap)
    return np.mean(check_scores)
# Example #3
                                          params_rec)
    # NOTE(review): this chunk begins mid-statement — the parenthesis closed
    # above belongs to a constructor call that starts before this excerpt.
    # Fit the baseline cosine recommender on the recommender training frame.
    cosine_model.fit_recommender(df_train_rec)

    # Train the main two-stage model (recommender + CatBoost ranker),
    # validated against val_dict.
    retailHeroModel = RetailHeroRecommender(df_products, params_rec,
                                            params_catboost)
    retailHeroModel.train_model(df_train_rec, df_train_ranker, val_dict)

    # NOTE(review): "Succsessfully" typo kept — runtime log string.
    logger.debug('Succsessfully trained Model')
    logger.debug('Saving...')

    # Persist the trained model next to this script.
    with open(os.path.join(DIR, 'model.pkl'), 'wb') as f:
        joblib.dump(retailHeroModel, f)

    logger.debug('Validation...')
    # Score both models per client. val_dict presumably maps client_id to
    # the ground-truth products — TODO confirm against its producer.
    scores = []
    cosine_scores = []
    for (cid, ds) in tqdm(df_test_clients.groupby('client_id')):
        query = get_json_rows_from_purchases(ds, cid)
        products_hist_counter, histdata_products = extract_data_from_json_dict(
            query)
        recs = retailHeroModel.recommend(products_hist_counter,
                                         histdata_products)
        cosine_recs = cosine_model.recommend(products_hist_counter)

        scores.append(normalized_average_precision(val_dict[cid], recs))
        cosine_scores.append(
            normalized_average_precision(val_dict[cid], cosine_recs))

    logger.debug(f'RetailHeroRecommender MNAP@30: {np.mean(scores)}')
    logger.debug(f'CosineRecommenderModel MNAP@30: {np.mean(cosine_scores)}')
    # Score the GBM ranker on the held-out frame; drop label/bookkeeping
    # columns first (errors="ignore" tolerates absent columns).
    drop_cols = ["client_id", "target", "lgb_scores", "query_id"]
    lgb_scores = gbm.predict(df_gbm_test.drop(drop_cols, axis=1, errors="ignore"))
    df_gbm_test["lgb_scores"] = lgb_scores

    # For each client keep the top-30 item indices by predicted score.
    lgb_ranked = (
        df_gbm_test.groupby("client_id")[["idx", "lgb_scores"]]
        .apply(
            lambda x: x.sort_values("lgb_scores", ascending=False)[:30]["idx"].tolist()
        )
        .to_dict()
    )

    # Mean NAP over clients against the ground-truth products.
    gt_test = {item["client_id"]: item["products"] for item in gt_all_rec_test}
    scores = []
    for client_id, recommended_idx in lgb_ranked.items():
        ap = normalized_average_precision(gt_test[client_id], recommended_idx)
        scores.append(ap)
    model_score = np.mean(scores)
    logger.info(f"Test score: {model_score}")

    # Encode the training params (minus "metric") into the model filename.
    params_str = "__".join(
        "_".join(map(str, item)) for item in gbm.params.items() if item[0] != "metric"
    )
    model_filename = f"lgbm_model__pool_{N_POOL}__{params_str}__{model_score:.6f}.txt"
    model_path = str(ASSETS_DIR / model_filename)
    gbm.save_model(model_path)
    logger.info(f"Model was saved to {model_path}")

    # Check predictor
    PREDICTOR = GBMPredictor(
        lgbm_model_path=str(ASSETS_DIR / model_filename),
        # NOTE(review): this call continues past the end of this excerpt.
# Product metadata consumed by model.recommend in the main loop below.
metadata_products = pd.read_csv(os.path.join(DIR, 'products.csv'))


# Load the previously trained model from disk.
# NOTE(review): joblib/pickle loading is only safe on trusted files.
with open(os.path.join(DIR, 'model.pkl'), 'rb') as f:
    model = joblib.load(f)

# Per-query wall-clock timings and NAP scores, filled by the main loop.
# NOTE(review): "bechmark" typo kept — the name is referenced below.
time_bechmark = []
scores = []
if __name__ == '__main__':
    # Replay each saved query against the loaded model, timing every call
    # and scoring the recommendations with NAP@30.
    for idx in range(len(sample_queries)):
        query = json.loads(sample_queries.at[idx, 'query'])
        actual = json.loads(sample_queries.at[idx, 'next_trans'])['product_ids']
        # Nothing to recommend from for clients without purchase history.
        if not query['transaction_history']:
            continue

        t_start = time.time()
        products_hist_counter, histdata_products = extract_data_from_json_dict(query)
        recs = model.recommend(products_hist_counter, metadata_products, histdata_products)
        t_end = time.time()

        score = normalized_average_precision(actual, recs)
        scores.append(score)
        logger.debug(f'Query {idx}, NAP@30: {score}, Num transactions: {len(query["transaction_history"])}')
        time_bechmark.append(t_end - t_start)

    logger.debug(f'Ran {len(sample_queries)} queries, Average elapsed time: {np.mean(time_bechmark)} ± {np.std(time_bechmark)}')
    logger.debug(f'MNAP@30: {np.mean(scores)}')
    # model = implicit.nearest_neighbours.TFIDFRecommender(K=100)

    # ALS should be trained with normalize = False
    # model = implicit.als.AlternatingLeastSquares(factors=16, regularization=1e-5, iterations=12)
    # implicit expects an item-user matrix, hence the transpose.
    model.fit(train_mat.T)

    out_dir = cfg.ASSETS_DIR
    os.makedirs(out_dir, exist_ok=True)
    print(f"Dump model to {out_dir}")
    # Fix: the original passed open(...) straight into pickle.dump, leaking
    # the file handle; the context manager closes it deterministically.
    with open(out_dir / "model.pkl", "wb") as fout:
        pickle.dump(model, fout)

    print("Estimate quality...")
    # Evaluate on the last shard: NAP@30 of the model's top-30 against the
    # first target transaction of each user.
    scores = []
    # Fix: the original opened the shard file inside a generator expression
    # and never closed it.
    with open(get_shard_path(cfg.NUM_SHARDS - 1)) as shard:
        for line in tqdm(shard):
            js = json.loads(line)
            row = make_coo_row(js["transaction_history"], product_encoder).tocsr()
            raw_recs = model.recommend(
                userid=0,
                user_items=row,
                N=30,
                filter_already_liked_items=False,
                recalculate_user=True,
            )

            recommended_items = product_encoder.toPid(
                [idx for (idx, score) in raw_recs])
            gt_items = js["target"][0]["product_ids"]
            nap = normalized_average_precision(gt_items, recommended_items)
            scores.append(nap)
    print("nap: {}".format(np.mean(scores)))