import numpy as np
import pandas as pd

from pysarplus import SARPlus


def test_e2e(spark, pandas_dummy_dataset, header):
    sar = SARPlus(spark, **header, cache_path="tests/test_e2e_cache")

    df = spark.createDataFrame(pandas_dummy_dataset)
    sar.fit(df)

    test_df = spark.createDataFrame(
        pd.DataFrame({
            header["col_user"]: [3],
            header["col_item"]: [2]
        }))

    r1 = (
        sar.recommend_k_items(test_df, top_k=3, remove_seen=False)
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    r2 = (
        sar.recommend_k_items(
            test_df,
            top_k=3,
            n_user_prediction_partitions=2,
            remove_seen=False,
            use_cache=True,
        )
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
    assert np.allclose(r1.score.values, r2.score.values, rtol=1e-3)
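# The tests on this page rely on pytest fixtures (spark, header, pandas_dummy_dataset, ...)
# defined elsewhere in the repository; the spark fixture is assumed to provide a
# SparkSession configured with the sarplus package. Below is a hypothetical sketch of
# what header and pandas_dummy_dataset might look like -- the actual conftest.py may differ.
import pandas as pd
import pytest


@pytest.fixture(scope="module")
def header():
    # Column-name mapping passed to SARPlus via **header; the names are inferred
    # from the tests (e.g. pred.MovieId below) and are illustrative only.
    return {
        "col_user": "UserId",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
    }


@pytest.fixture(scope="module")
def pandas_dummy_dataset(header):
    # Small synthetic interaction table for test_e2e (values are made up).
    return pd.DataFrame({
        header["col_user"]: [1, 1, 2, 2, 3, 3],
        header["col_item"]: [1, 2, 1, 3, 1, 2],
        header["col_rating"]: [1, 1, 1, 1, 1, 1],
    })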
import numpy as np
import pandas as pd

from pysarplus import SARPlus


def test_e2e(spark, pandas_dummy_dataset, header):
    sar = SARPlus(spark, **header)

    df = spark.createDataFrame(pandas_dummy_dataset)
    sar.fit(df)

    # assert 4*4 + 32 == sar.item_similarity.count()

    # print(sar.item_similarity
    # .toPandas()
    # .pivot_table(index='i1', columns='i2', values='value'))

    test_df = spark.createDataFrame(
        pd.DataFrame({
            header['col_user']: [3],
            header['col_item']: [2]
        }))

    r1 = (
        sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)
        .toPandas()
        .sort_values([header['col_user'], header['col_item']])
        .reset_index(drop=True)
    )

    r2 = (
        sar.recommend_k_items(
            test_df,
            "tests/test_e2e_cache",
            top_k=3,
            n_user_prediction_partitions=2,
            remove_seen=False,
        )
        .toPandas()
        .sort_values([header['col_user'], header['col_item']])
        .reset_index(drop=True)
    )

    assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
    assert np.allclose(r1.score.values, r2.score.values, rtol=1e-3)
Example #3
import numpy as np
import pandas as pd

from pysarplus import SARPlus


def test_userpred(
    spark, threshold, similarity_type, file, header, sar_settings, demo_usage_data
):
    time_now = demo_usage_data[header["col_timestamp"]].max()

    test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file)

    model = SARPlus(
        spark,
        **header,
        table_prefix=test_id,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=time_now,
        threshold=threshold,
        similarity_type=similarity_type
    )
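    # Background (inferred from SAR's time-decay formulation, not asserted by this
    # test): with timedecay_formula=True each rating is down-weighted by
    # 2 ** (-(time_now - t_event) / (time_decay_coefficient * 24 * 3600)),
    # so time_decay_coefficient=30 acts as a 30-day half-life on user affinities.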

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    url = (
        sar_settings["FILE_DIR"]
        + "userpred_"
        + file
        + str(threshold)
        + "_userid_only.csv"
    )

    pred_ref = pd.read_csv(url)
    pred_ref = (
        pd.wide_to_long(pred_ref, ["rec", "score"], "user", "idx")
        .sort_values("score", ascending=False)
        .reset_index(drop=True)
    )

    # Note: it's important to use a separate cache_path for each run, as runs sharing a cache interfere with each other
    pred = model.recommend_k_items(
        spark.createDataFrame(
            demo_usage_data[
                demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
            ]
        ),
        cache_path="test_userpred-" + test_id,
        top_k=10,
        n_user_prediction_partitions=1,
    )

    pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True)

    assert (pred.MovieId.values == pred_ref.rec.values).all()
    assert np.allclose(
        pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]
    )
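# Illustration of the pd.wide_to_long reshape used above, on a hypothetical
# two-recommendation reference row (the real userpred CSVs are wider):
import pandas as pd

wide = pd.DataFrame(
    {"user": [1], "rec1": [10], "rec2": [20], "score1": [0.9], "score2": [0.5]}
)
long = pd.wide_to_long(wide, ["rec", "score"], i="user", j="idx").reset_index()
# 'long' now has one row per (user, idx) with columns user, idx, rec, score,
# i.e. the shape in which pred_ref is sorted and compared against the model output.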
Example #4
from pysarplus import SARPlus, SARModel

# spark dataframe with user/item/rating/optional timestamp tuples
train_df = spark.createDataFrame([
    (1, 1, 1), 
    (1, 2, 1), 
    (2, 1, 1), 
    (3, 1, 1), 
    (3, 3, 1)], 
    ['user_id', 'item_id', 'rating'])

# spark dataframe with user/item/rating tuples for the users to score
test_df = spark.createDataFrame([
    (1, 1, 1), 
    (3, 3, 1)], 
    ['user_id', 'item_id', 'rating'])



model = SARPlus(
    spark, 
    col_user='user_id', 
    col_item='item_id', 
    col_rating='rating', 
    similarity_type='jaccard',
)
model.fit(train_df)


model.recommend_k_items(test_df, 'sarplus_cache', top_k=3).show()
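
# A minimal follow-up sketch, assuming the model and test_df above and that the
# result keeps the 'user_id' column plus a 'score' column (as in the tests above):
# bring the top-k recommendations back to the driver as a pandas DataFrame.
top_k = model.recommend_k_items(test_df, 'sarplus_cache', top_k=3)
top_k_pd = (
    top_k.toPandas()
    .sort_values(['user_id', 'score'], ascending=[True, False])
    .reset_index(drop=True)
)
print(top_k_pd)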