def test_predict(rating_true):
    """Check that predict() keeps column names, dtypes, and Surprise's estimates."""
    # Fit an SVD model on the full training set built from the ground-truth ratings
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()
    ).build_full_trainset()
    svd.fit(train_set)

    # With the default column names, the output should expose userID/itemID/prediction
    preds = predict(svd, rating_true)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    # The score for a known (user, item) pair should match Surprise's own estimate
    user = rating_true.iloc[0]["userID"]
    item = rating_true.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (preds["itemID"] == item)][
        "prediction"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)

    # Custom user/item/prediction column names should be honored end to end
    preds = predict(
        svd,
        rating_true.rename(columns={"userID": "uid", "itemID": "iid"}),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = rating_true.iloc[1]["userID"]
    item = rating_true.iloc[1]["itemID"]
    assert preds[(preds["uid"] == user) & (preds["iid"] == item)][
        "pred"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)
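
For context, the test above presumably relies on a pytest fixture that supplies a small explicit-ratings DataFrame, a TOL constant, and the predict helper from the recommenders package. A minimal sketch under those assumptions (fixture contents, TOL value, and import path are guesses, not taken from the original test module):

import pandas as pd
import pytest
import surprise

# Assumed import path for the helper under test
from recommenders.models.surprise.surprise_utils import predict

TOL = 0.001  # relative tolerance; the actual value is defined elsewhere in the test suite


@pytest.fixture
def rating_true():
    # Tiny explicit-feedback frame using the default column names
    return pd.DataFrame(
        {
            "userID": [1, 1, 2, 2, 3],
            "itemID": [10, 20, 10, 30, 20],
            "rating": [4.0, 5.0, 3.0, 2.0, 4.5],
        }
    )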

def predict_svd(model, test):
    """Score a test set with a trained Surprise model and time the call."""
    with Timer() as t:
        preds = predict(
            model,
            test,
            usercol=DEFAULT_USER_COL,
            itemcol=DEFAULT_ITEM_COL,
            predcol=DEFAULT_PREDICTION_COL,
        )
    return preds, t
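
The wrapper above depends on a Timer context manager and on the DEFAULT_* column-name constants. Assuming they come from the recommenders utilities (the import paths below are an assumption), a usage sketch:

# Assumed import paths for the timing helper and column-name constants
from recommenders.utils.timer import Timer
from recommenders.utils.constants import (
    DEFAULT_USER_COL,        # typically "userID"
    DEFAULT_ITEM_COL,        # typically "itemID"
    DEFAULT_PREDICTION_COL,  # typically "prediction"
)

# Hypothetical usage: score a held-out split with an already-fitted model
preds, t = predict_svd(svd, test_df)
print("Scoring took {} seconds".format(t))  # the Timer object is assumed to format its elapsed time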

def svd_training(params):
    """
    Train Surprise SVD with the given hyperparameters, evaluate it on the
    validation data, and report the resulting metrics to NNI.
    """
    logger.debug("Start training...")
    # Load the pre-split training and validation DataFrames from the datastore
    train_data = pd.read_pickle(
        os.path.join(params["datastore"], params["train_datapath"]))
    validation_data = pd.read_pickle(
        os.path.join(params["datastore"], params["validation_datapath"]))

    # Collect the SVD hyperparameters selected by the tuner
    svd_params = {
        p: params[p]
        for p in [
            "random_state",
            "n_epochs",
            "verbose",
            "biased",
            "n_factors",
            "init_mean",
            "init_std_dev",
            "lr_all",
            "reg_all",
            "lr_bu",
            "lr_bi",
            "lr_pu",
            "lr_qi",
            "reg_bu",
            "reg_bi",
            "reg_pu",
            "reg_qi",
        ]
    }
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(
        train_data, reader=surprise.Reader(params["surprise_reader"])
    ).build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    # Rating metrics (e.g. RMSE) compare point predictions against the validation ratings
    rating_metrics = params["rating_metrics"]
    if len(rating_metrics) > 0:
        predictions = predict(svd,
                              validation_data,
                              usercol=params["usercol"],
                              itemcol=params["itemcol"])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params["primary_metric"]:
                # NNI optimizes the value reported under the "default" key
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    # Ranking metrics are computed over top-k recommendations per user
    ranking_metrics = params["ranking_metrics"]
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=params["usercol"],
            itemcol=params["itemcol"],
            remove_seen=params["remove_seen"],
        )
        k = params["k"]
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 all_predictions,
                                                 col_prediction="prediction",
                                                 k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params["primary_metric"]:
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get("NNI_OUTPUT_DIR")
    with open(os.path.join(output_dir, "metrics.json"), "w") as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params["primary_metric"]] = temp_dict.pop("default")
        json.dump(temp_dict, fp)

    return svd
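
A hypothetical params dictionary for the NNI variant above; the keys mirror the lookups inside the function, but every value is a placeholder rather than a setting from the original experiment:

# Illustrative only: all values are placeholders, not taken from the original tuning run
params = {
    "datastore": "./data",
    "train_datapath": "train.pkl",
    "validation_datapath": "validation.pkl",
    "surprise_reader": "ml-100k",   # built-in format name passed to surprise.Reader()
    "usercol": "userID",
    "itemcol": "itemID",
    "remove_seen": True,
    "k": 10,
    "rating_metrics": ["rmse"],
    "ranking_metrics": ["ndcg_at_k"],
    "primary_metric": "rmse",
    # SVD hyperparameters forwarded to surprise.SVD(**svd_params); None keeps Surprise's defaults
    "random_state": 0,
    "n_epochs": 30,
    "verbose": False,
    "biased": True,
    "n_factors": 100,
    "init_mean": 0.0,
    "init_std_dev": 0.1,
    "lr_all": 0.005,
    "reg_all": 0.02,
    "lr_bu": None, "lr_bi": None, "lr_pu": None, "lr_qi": None,
    "reg_bu": None, "reg_bi": None, "reg_pu": None, "reg_qi": None,
}
svd = svd_training(params)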

def svd_training(args):
    """
    Train Surprise SVD with the given hyperparameters, evaluate it on the
    validation data, and log the metrics (to the Azure ML run when available).
    """
    print("Start training...")
    # Load the pre-split training and validation DataFrames from the datastore
    train_data = pd.read_pickle(
        os.path.join(args.datastore, args.train_datapath))
    validation_data = pd.read_pickle(
        os.path.join(args.datastore, args.validation_datapath))

    # Build the SVD model directly from the parsed command-line hyperparameters
    svd = surprise.SVD(
        random_state=args.random_state,
        n_epochs=args.epochs,
        verbose=args.verbose,
        biased=args.biased,
        n_factors=args.n_factors,
        init_mean=args.init_mean,
        init_std_dev=args.init_std_dev,
        lr_all=args.lr_all,
        reg_all=args.reg_all,
        lr_bu=args.lr_bu,
        lr_bi=args.lr_bi,
        lr_pu=args.lr_pu,
        lr_qi=args.lr_qi,
        reg_bu=args.reg_bu,
        reg_bi=args.reg_bi,
        reg_pu=args.reg_pu,
        reg_qi=args.reg_qi,
    )

    train_set = surprise.Dataset.load_from_df(
        train_data, reader=surprise.Reader(args.surprise_reader)
    ).build_full_trainset()
    svd.fit(train_set)

    print("Evaluating...")

    rating_metrics = args.rating_metrics
    if len(rating_metrics) > 0:
        predictions = predict(svd,
                              validation_data,
                              usercol=args.usercol,
                              itemcol=args.itemcol)
        for metric in rating_metrics:
            # Metric names are resolved with eval() against evaluation functions imported elsewhere in the script
            result = eval(metric)(validation_data, predictions)
            print(metric, result)
            if HAS_AML:
                run.log(metric, result)

    ranking_metrics = args.ranking_metrics
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=args.usercol,
            itemcol=args.itemcol,
            remove_seen=args.remove_seen,
        )
        k = args.k
        for metric in ranking_metrics:
            result = eval(metric)(validation_data,
                                  all_predictions,
                                  col_prediction="prediction",
                                  k=k)
            print("{}@{}".format(metric, k), result)
            if HAS_AML:
                run.log(metric, result)

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    return svd
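
For the argparse-driven variant above, the calling code presumably builds an args object whose attribute names mirror the accesses inside svd_training. A sketch using SimpleNamespace with placeholder values (and assuming metric functions such as rmse, plus HAS_AML and run, are defined at module level as in the original script):

from types import SimpleNamespace

# Illustrative only: every value is a placeholder, not a recommended setting
args = SimpleNamespace(
    datastore="./data",
    train_datapath="train.pkl",
    validation_datapath="validation.pkl",
    surprise_reader="ml-100k",
    usercol="userID",
    itemcol="itemID",
    rating_metrics=["rmse"],   # resolved via eval(), so rmse must be importable in the script
    ranking_metrics=[],
    remove_seen=True,
    k=10,
    random_state=0,
    epochs=30,
    verbose=True,
    biased=True,
    n_factors=100,
    init_mean=0.0,
    init_std_dev=0.1,
    lr_all=0.005,
    reg_all=0.02,
    lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None,
    reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None,
)
svd = svd_training(args)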