Example no. 1
def test_compute_ranking_predictions(rating_true):
    n_users = len(rating_true["userID"].unique())
    n_items = len(rating_true["itemID"].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true, remove_seen=True)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[0]["userID"]
    item = preds.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (
        preds["itemID"] == item)]["prediction"].values == pytest.approx(
            svd.predict(user, item).est, rel=TOL)
    # remove_seen=True: user-item pairs already present in rating_true are excluded
    assert pd.merge(rating_true, preds, on=["userID", "itemID"]).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={
            "userID": "uid",
            "itemID": "iid",
            "rating": "r"
        }),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
        remove_seen=False,
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[1]["uid"]
    item = preds.iloc[1]["iid"]
    assert preds[(preds["uid"] == user)
                 & (preds["iid"] == item)]["pred"].values == pytest.approx(
                     svd.predict(user, item).est, rel=TOL)

    # Test remove_seen=False
    assert (pd.merge(rating_true,
                     preds,
                     left_on=["userID", "itemID"],
                     right_on=["uid", "iid"]).shape[0] == rating_true.shape[0])
    assert preds.shape[0] == n_users * n_items
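
The assertions above pin down what compute_ranking_predictions is expected to do: score the full cross join of users and items with the fitted model, and, when remove_seen=True, drop the pairs already present in the input interactions. A minimal sketch of that behavior follows; it is not the library's actual implementation, the helper name is illustrative, and details such as dtype handling are simplified.

import itertools

import pandas as pd


def ranking_predictions_sketch(
    algo, data, usercol="userID", itemcol="itemID", predcol="prediction", remove_seen=False
):
    """Score every (user, item) pair with a fitted Surprise model."""
    users = data[usercol].unique()
    items = data[itemcol].unique()

    # Cross join of all users and all items, scored with algo.predict(u, i).est
    preds = pd.DataFrame(
        [(u, i, algo.predict(u, i).est) for u, i in itertools.product(users, items)],
        columns=[usercol, itemcol, predcol],
    )

    if remove_seen:
        # Drop pairs that already occur in the input interactions
        merged = preds.merge(
            data[[usercol, itemcol]], on=[usercol, itemcol], how="left", indicator=True
        )
        preds = merged[merged["_merge"] == "left_only"].drop(columns="_merge")

    return preds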
Example no. 2
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(path=os.path.join(params['datastore'], params['train_datapath']))
    validation_data = pd.read_pickle(path=os.path.join(params['datastore'], params['validation_datapath']))

    svd_params = {p: params[p] for p in ['random_state', 'n_epochs', 'verbose', 'biased', 'n_factors', 'init_mean',
                                         'init_std_dev', 'lr_all', 'reg_all', 'lr_bu', 'lr_bi', 'lr_pu', 'lr_qi',
                                         'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi']}
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(params['surprise_reader'])) \
        .build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params['rating_metrics']
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data, usercol=params['usercol'],
                                                 itemcol=params['itemcol'])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params['ranking_metrics']
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data, usercol=params['usercol'],
                                                      itemcol=params['itemcol'],
                                                      recommend_seen=params['recommend_seen'])
        k = params['k']
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get('NNI_OUTPUT_DIR')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params['primary_metric']] = temp_dict.pop('default')
        json.dump(temp_dict, fp)

    return svd
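
For reference, svd_training expects a flat params dictionary. The keys below mirror the lookups made inside the function; the values are purely illustrative placeholders, and actually running the sketch also requires the pickled train/validation splits on disk plus an NNI trial environment for nni.report_final_result and NNI_OUTPUT_DIR.

params = {
    "datastore": "./data",                 # hypothetical paths
    "train_datapath": "train.pkl",
    "validation_datapath": "validation.pkl",
    "surprise_reader": "ml-100k",          # built-in surprise.Reader name
    "usercol": "userID",
    "itemcol": "itemID",
    "rating_metrics": ["rmse"],            # names resolved via getattr(evaluation, ...)
    "ranking_metrics": ["ndcg_at_k"],
    "primary_metric": "rmse",
    "k": 10,
    "recommend_seen": False,
    # Surprise SVD hyper-parameters forwarded through svd_params
    "random_state": 42, "n_epochs": 30, "verbose": False, "biased": True,
    "n_factors": 100, "init_mean": 0.0, "init_std_dev": 0.1,
    "lr_all": 0.005, "reg_all": 0.02,
    "lr_bu": None, "lr_bi": None, "lr_pu": None, "lr_qi": None,
    "reg_bu": None, "reg_bi": None, "reg_pu": None, "reg_qi": None,
}

svd_model = svd_training(params)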
Example no. 3
def test_compute_ranking_predictions(rating_true):
    n_users = len(rating_true["userID"].unique())
    n_items = len(rating_true["itemID"].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()
    ).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[0]["userID"]
    item = preds.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (preds["itemID"] == item)][
        "prediction"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)
    # Test default recommend_seen=False
    assert pd.merge(rating_true, preds, on=["userID", "itemID"]).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={"userID": "uid", "itemID": "iid", "rating": "r"}),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
        recommend_seen=True,
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[1]["uid"]
    item = preds.iloc[1]["iid"]
    assert preds[(preds["uid"] == user) & (preds["iid"] == item)][
        "pred"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)
    # Test recommend_seen=True
    assert (
        pd.merge(
            rating_true, preds, left_on=["userID", "itemID"], right_on=["uid", "iid"]
        ).shape[0]
        == rating_true.shape[0]
    )
    assert preds.shape[0] == n_users * n_items
Example no. 4
def recommend_k_svd(model, test, train):
    with Timer() as t:
        topk_scores = compute_ranking_predictions(model,
                                                  train,
                                                  usercol=DEFAULT_USER_COL,
                                                  itemcol=DEFAULT_ITEM_COL,
                                                  predcol=DEFAULT_PREDICTION_COL,
                                                  recommend_seen=False)
    return topk_scores, t
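
A hedged usage sketch for recommend_k_svd: it assumes the function and the names it references (compute_ranking_predictions, Timer, and the DEFAULT_* column constants, which are expected to map to the userID/itemID/prediction column names) have already been imported from the benchmark module, and it uses toy DataFrames as illustrative stand-ins for the real train/test splits.

import pandas as pd
import surprise

# Toy interactions standing in for the real splits (illustrative only)
train = pd.DataFrame(
    {"userID": [1, 1, 2, 2, 3], "itemID": [10, 11, 10, 12, 11], "rating": [4, 3, 5, 2, 4]}
)
test = pd.DataFrame({"userID": [1, 2], "itemID": [12, 11], "rating": [3, 4]})

svd = surprise.SVD(random_state=0, n_epochs=5)
svd.fit(
    surprise.Dataset.load_from_df(train, reader=surprise.Reader(rating_scale=(1, 5)))
    .build_full_trainset()
)

topk_scores, t = recommend_k_svd(svd, test, train)
print("Scored {} user-item pairs in {:.2f} seconds".format(len(topk_scores), t.interval))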
Example no. 5
def test_compute_ranking_predictions(python_data):
    rating_true, _, _ = python_data(binary_rating=False)
    n_users = len(rating_true['userID'].unique())
    n_items = len(rating_true['itemID'].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true)
    assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
    assert preds['userID'].dtypes == rating_true['userID'].dtypes
    assert preds['itemID'].dtypes == rating_true['itemID'].dtypes
    user = preds.iloc[0]['userID']
    item = preds.iloc[0]['itemID']
    assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)
    # Test default recommend_seen=False
    assert pd.merge(rating_true, preds, on=['userID', 'itemID']).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={
            'userID': 'uid',
            'itemID': 'iid',
            'rating': 'r'
        }),
        usercol='uid',
        itemcol='iid',
        predcol='pred',
        recommend_seen=True)
    assert set(preds.columns) == {'uid', 'iid', 'pred'}
    assert preds['uid'].dtypes == rating_true['userID'].dtypes
    assert preds['iid'].dtypes == rating_true['itemID'].dtypes
    user = preds.iloc[1]['uid']
    item = preds.iloc[1]['iid']
    assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \
           pytest.approx(svd.predict(user, item).est, rel=TOL)
    # Test recommend_seen=True
    assert pd.merge(rating_true, preds, left_on=['userID', 'itemID'], right_on=['uid', 'iid']).shape[0] == \
           rating_true.shape[0]
    assert preds.shape[0] == n_users * n_items
Example no. 6
def recommend_k_svd(model, test, train):
    with Timer() as t:
        topk_scores = compute_ranking_predictions(
            model,
            train,
            usercol=DEFAULT_USER_COL,
            itemcol=DEFAULT_ITEM_COL,
            predcol=DEFAULT_PREDICTION_COL,
            recommend_seen=False)
    return topk_scores, t
Example no. 7
def compute_test_results(svd):
    test_results = {}
    predictions = predict(svd, test, usercol="userID", itemcol="itemID")
    for metric in RATING_METRICS:
        test_results[metric] = eval(metric)(test, predictions)

    all_predictions = compute_ranking_predictions(svd, train, usercol="userID", itemcol="itemID", remove_seen=REMOVE_SEEN)
    for metric in RANKING_METRICS:
        test_results[metric] = eval(metric)(test, all_predictions, col_prediction='prediction', k=K)
    return test_results
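
compute_test_results relies on several notebook-level globals. The sketch below shows what they might look like; the metric names, the import path, and the surrounding objects (svd, train, test, and the predict helper) are assumptions, and because the metrics are resolved with eval(metric), the imported functions must live in the same namespace where compute_test_results is defined.

# Assumed import path; newer releases expose these under recommenders.evaluation.python_evaluation
from reco_utils.evaluation.python_evaluation import rmse, mae, ndcg_at_k, precision_at_k

RATING_METRICS = ["rmse", "mae"]                   # resolved by eval(metric) above
RANKING_METRICS = ["ndcg_at_k", "precision_at_k"]
REMOVE_SEEN = True
K = 10

# svd, train, test and predict() are assumed to be defined, as in the earlier
# examples that fit surprise.SVD on a train split.
test_results = compute_test_results(svd)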
Example no. 8
def svd_training(args):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    print("Start training...")
    train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
    validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))

    svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased,
                       n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev,
                       lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu,
                       lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu,
                       reg_qi=args.reg_qi)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
        .build_full_trainset()
    svd.fit(train_set)

    print("Evaluating...")

    rating_metrics = args.rating_metrics
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol)
        for metric in rating_metrics:
            result = eval(metric)(validation_data, predictions)
            print(metric, result)
            if HAS_AML:
                run.log(metric, result)

    ranking_metrics = args.ranking_metrics
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol,
                                                       remove_seen=args.remove_seen)
        k = args.k
        for metric in ranking_metrics:
            result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
            print("{}@{}".format(metric, k), result)
            if HAS_AML:
                run.log(metric, result)

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    return svd
Example no. 9
def svd_training(args):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    print("Start training...")
    train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
    validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))

    svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased,
                       n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev,
                       lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu,
                       lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu,
                       reg_qi=args.reg_qi)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
        .build_full_trainset()
    svd.fit(train_set)

    print("Evaluating...")

    rating_metrics = args.rating_metrics
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol)
        for metric in rating_metrics:
            result = eval(metric)(validation_data, predictions)
            print(metric, result)
            if HAS_AML:
                run.log(metric, result)

    ranking_metrics = args.ranking_metrics
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol,
                                                       recommend_seen=args.recommend_seen)
        k = args.k
        for metric in ranking_metrics:
            result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
            print("{}@{}".format(metric, k), result)
            if HAS_AML:
                run.log(metric, result)

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    return svd
Example no. 10
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(
        path=os.path.join(params["datastore"], params["train_datapath"]))
    validation_data = pd.read_pickle(
        path=os.path.join(params["datastore"], params["validation_datapath"]))

    svd_params = {
        p: params[p]
        for p in [
            "random_state",
            "n_epochs",
            "verbose",
            "biased",
            "n_factors",
            "init_mean",
            "init_std_dev",
            "lr_all",
            "reg_all",
            "lr_bu",
            "lr_bi",
            "lr_pu",
            "lr_qi",
            "reg_bu",
            "reg_bi",
            "reg_pu",
            "reg_qi",
        ]
    }
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(
        train_data, reader=surprise.Reader(
            params["surprise_reader"])).build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params["rating_metrics"]
    if len(rating_metrics) > 0:
        predictions = predict(svd,
                              validation_data,
                              usercol=params["usercol"],
                              itemcol=params["itemcol"])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params["primary_metric"]:
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params["ranking_metrics"]
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=params["usercol"],
            itemcol=params["itemcol"],
            remove_seen=params["remove_seen"],
        )
        k = params["k"]
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 all_predictions,
                                                 col_prediction="prediction",
                                                 k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params["primary_metric"]:
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get("NNI_OUTPUT_DIR")
    with open(os.path.join(output_dir, "metrics.json"), "w") as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params["primary_metric"]] = temp_dict.pop("default")
        json.dump(temp_dict, fp)

    return svd
Example no. 11
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(
        path=os.path.join(params['datastore'], params['train_datapath']))
    validation_data = pd.read_pickle(
        path=os.path.join(params['datastore'], params['validation_datapath']))

    svd_params = {
        p: params[p]
        for p in [
            'random_state', 'n_epochs', 'verbose', 'biased', 'n_factors',
            'init_mean', 'init_std_dev', 'lr_all', 'reg_all', 'lr_bu', 'lr_bi',
            'lr_pu', 'lr_qi', 'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi'
        ]
    }
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(params['surprise_reader'])) \
        .build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params['rating_metrics']
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd,
                                                 validation_data,
                                                 usercol=params['usercol'],
                                                 itemcol=params['itemcol'])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params['ranking_metrics']
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=params['usercol'],
            itemcol=params['itemcol'],
            recommend_seen=params['recommend_seen'])
        k = params['k']
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 all_predictions,
                                                 col_prediction='prediction',
                                                 k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get('NNI_OUTPUT_DIR')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params['primary_metric']] = temp_dict.pop('default')
        json.dump(temp_dict, fp)

    return svd
Example no. 12
                                     "uid": 'User-Id',
                                     "iid": 'Snack Id',
                                     "est": 'Review'
                                 })
predictions = predictions.drop(["details", "r_ui"], axis="columns")

# In[100]:

predictions

# In[101]:

with Timer() as test_time:
    all_predictions = compute_ranking_predictions(svd,
                                                  train,
                                                  usercol='User-Id',
                                                  itemcol='Snack Id',
                                                  remove_seen=True)

print("Took {} seconds for prediction.".format(test_time.interval))

# In[102]:

suffixes = ["_true", "_pred"]
rating_true_pred = pd.merge(test,
                            predictions,
                            on=["User-Id", 'Snack Id'],
                            suffixes=suffixes)

# In[103]: