Example #1
def test_user_affinity(spark, demo_usage_data, sar_settings, header):
    time_now = demo_usage_data[header["col_timestamp"]].max()

    model = SARPlus(spark,
                    **header,
                    timedecay_formula=True,
                    time_decay_coefficient=30,
                    time_now=time_now,
                    similarity_type="cooccurrence")

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv")
    user_affinity_ref = pd.melt(user_affinity_ref,
                                id_vars=user_affinity_ref.columns[0],
                                value_vars=user_affinity_ref.columns[1:],
                                var_name='ItemId',
                                value_name='Rating')
    user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0]\
        .reset_index(drop=True)

    # construct dataframe with test user id we'd like to get the affinity for
    df_test = spark.createDataFrame(
        pd.DataFrame({header['col_user']: [sar_settings["TEST_USER_ID"]]}))
    user_affinity = model.get_user_affinity(df_test).toPandas().reset_index(
        drop=True)

    # verify that the item ids are the same
    assert (
        user_affinity[header['col_item']] == user_affinity_ref.ItemId).all()

    # and that the affinity scores agree within tolerance
    assert np.allclose(user_affinity_ref.Rating.values,
                       user_affinity[header['col_rating']].values,
                       atol=sar_settings["ATOL"])
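
All of the tests on this page lean on shared pytest fixtures (spark, header, sar_settings, demo_usage_data) defined elsewhere in the suite. A minimal sketch of what header and sar_settings could look like, reconstructed purely from how the tests index into them; the concrete values are assumptions, not the project's actual conftest:

import pytest

@pytest.fixture
def header():
    # column-name mapping unpacked into SARPlus(spark, **header);
    # "MovieId" and "Rating" are suggested by the assertions above,
    # the other names are assumed
    return {
        "col_user": "UserId",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
    }

@pytest.fixture
def sar_settings():
    return {
        "FILE_DIR": "tests/data/",  # assumed directory holding the reference CSVs
        "TEST_USER_ID": "user_42",  # assumed; any user id present in demo_usage_data
        "ATOL": 1e-8,               # assumed absolute tolerance for float comparisons
    }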
Example #2
def test_e2e(spark, pandas_dummy_dataset, header):
    sar = SARPlus(spark, **header)

    df = spark.createDataFrame(pandas_dummy_dataset)
    sar.fit(df)

    test_df = spark.createDataFrame(
        pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})
    )

    r1 = (
        sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    r2 = (
        sar.recommend_k_items(
            test_df,
            "tests/test_e2e_cache",
            top_k=3,
            n_user_prediction_partitions=2,
            remove_seen=False,
        )
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
    assert np.allclose(r1.score.values, r2.score.values, rtol=1e-3)
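
recommend_k_items materializes intermediate state under the cache path given above ("tests/test_e2e_cache"). If that path lives on local disk, an explicit cleanup between runs avoids stale caches; this teardown is an assumption about test hygiene, not part of the original test:

import shutil

# delete the on-disk cache so a later run cannot pick up stale results
shutil.rmtree("tests/test_e2e_cache", ignore_errors=True)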
Example #3
def test_e2e(spark, pandas_dummy_dataset, header):
    sar = SARPlus(spark, **header)

    df = spark.createDataFrame(pandas_dummy_dataset)
    sar.fit(df)

    test_df = spark.createDataFrame(
        pd.DataFrame({
            header['col_user']: [3],
            header['col_item']: [2]
        }))

    r1 = sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)\
        .toPandas()\
        .sort_values([header['col_user'], header['col_item']])\
        .reset_index(drop=True)

    r2 = sar.recommend_k_items(test_df, "tests/test_e2e_cache", top_k=3, n_user_prediction_partitions=2, remove_seen=False)\
        .toPandas()\
        .sort_values([header['col_user'], header['col_item']])\
        .reset_index(drop=True)

    assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
    assert np.allclose(r1.score.values, r2.score.values, rtol=1e-3)
Example #4
def test_userpred(
    spark, threshold, similarity_type, file, header, sar_settings, demo_usage_data
):
    time_now = demo_usage_data[header["col_timestamp"]].max()

    test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file)

    model = SARPlus(
        spark,
        **header,
        table_prefix=test_id,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=time_now,
        threshold=threshold,
        similarity_type=similarity_type
    )

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    url = (
        sar_settings["FILE_DIR"]
        + "userpred_"
        + file
        + str(threshold)
        + "_userid_only.csv"
    )

    pred_ref = pd.read_csv(url)
    pred_ref = (
        pd.wide_to_long(pred_ref, ["rec", "score"], "user", "idx")
        .sort_values("score", ascending=False)
        .reset_index(drop=True)
    )

    # Note: it's important to give each run its own cache_path,
    # since runs sharing a cache would interfere with each other
    pred = model.recommend_k_items(
        spark.createDataFrame(
            demo_usage_data[
                demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
            ]
        ),
        cache_path="test_userpred-" + test_id,
        top_k=10,
        n_user_prediction_partitions=1,
    )

    pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True)

    assert (pred.MovieId.values == pred_ref.rec.values).all()
    assert np.allclose(
        pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]
    )
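
The pd.wide_to_long call reshapes a reference file that presumably has columns user, rec1..recN, score1..scoreN into one row per (user, rank). A self-contained illustration on made-up data:

import pandas as pd

# made-up frame in the wide layout the reference CSV appears to use
wide = pd.DataFrame({
    "user": [1],
    "rec1": [10], "rec2": [20],
    "score1": [0.9], "score2": [0.7],
})
long = pd.wide_to_long(wide, ["rec", "score"], i="user", j="idx")
print(long.reset_index())
#    user  idx  rec  score
# 0     1    1   10    0.9
# 1     1    2   20    0.7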
Example #5
def test_sar_item_similarity(
    spark,
    threshold,
    similarity_type,
    file,
    demo_usage_data,
    sar_settings,
    header,
):

    model = SARPlus(
        spark,
        **header,
        timedecay_formula=False,
        time_decay_coefficient=30,
        time_now=None,
        threshold=threshold,
        similarity_type=similarity_type,
    )

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    # reference
    item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" +
                                      file + str(threshold) + ".csv")

    item_similarity_ref = pd.melt(
        item_similarity_ref,
        item_similarity_ref.columns[0],
        item_similarity_ref.columns[1:],
        "i2",
        "value",
    )
    item_similarity_ref.columns = ["i1", "i2", "value"]

    item_similarity_ref = (
        item_similarity_ref[item_similarity_ref.value > 0].sort_values(
            ["i1", "i2"]).reset_index(drop=True))
    # actual
    item_similarity = (model.item_similarity.toPandas().sort_values(
        ["i1", "i2"]).reset_index(drop=True))

    if similarity_type == "cooccurrence":
        assert (item_similarity_ref == item_similarity).all().all()
    else:
        assert ((item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]
                 ).all().all())

        assert np.allclose(
            item_similarity.value.values,
            item_similarity_ref.value.values,
            atol=sar_settings["ATOL"],
        )
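
pd.melt here undoes a pivot: the reference CSV appears to store the similarity matrix in wide form (first column is the source item, remaining columns are the target items), and melt flattens it into (i1, i2, value) triples. A toy illustration on assumed data:

import pandas as pd

# toy wide similarity matrix; 'item' names the source item of each row
wide = pd.DataFrame({"item": ["a", "b"], "a": [1.0, 0.5], "b": [0.5, 1.0]})
long = pd.melt(wide, id_vars="item", var_name="i2", value_name="value")
long.columns = ["i1", "i2", "value"]
print(long.sort_values(["i1", "i2"]).reset_index(drop=True))
#   i1 i2  value
# 0  a  a    1.0
# 1  a  b    0.5
# 2  b  a    0.5
# 3  b  b    1.0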
Example #6
def test_fit(spark, similarity_type, timedecay_formula,
             train_test_dummy_timestamp, header):
    model = SARPlus(spark, **header)

    trainset, testset = train_test_dummy_timestamp

    df = spark.createDataFrame(trainset)
    df.write.mode("overwrite").saveAsTable("trainset")

    df = spark.table("trainset")

    model.fit(df,
              timedecay_formula=timedecay_formula,
              similarity_type=similarity_type)
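
similarity_type and timedecay_formula arrive as fixture arguments, which suggests the suite parametrizes this test. A plausible parametrization sketch; the axes are assumptions ("cooccurrence" and "jaccard" appear elsewhere on this page, and "lift" is the usual third SAR similarity):

import pytest

# assumed parametrization; the project's actual conftest may differ
@pytest.mark.parametrize("timedecay_formula", [False, True])
@pytest.mark.parametrize("similarity_type",
                         ["cooccurrence", "jaccard", "lift"])
def test_fit(spark, similarity_type, timedecay_formula,
             train_test_dummy_timestamp, header):
    ...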
Example #7
def test_sar_item_similarity(spark, threshold, similarity_type, file,
                             demo_usage_data, sar_settings, header):

    model = SARPlus(spark,
                    **header,
                    timedecay_formula=False,
                    time_decay_coefficient=30,
                    time_now=None,
                    threshold=threshold,
                    similarity_type=similarity_type)

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    # reference
    item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" +
                                      file + str(threshold) + ".csv")

    item_similarity_ref = pd.melt(item_similarity_ref,
                                  item_similarity_ref.columns[0],
                                  item_similarity_ref.columns[1:], 'i2',
                                  'value')
    item_similarity_ref.columns = ['i1', 'i2', 'value']

    item_similarity_ref = item_similarity_ref[item_similarity_ref.value > 0]\
        .sort_values(['i1', 'i2'])\
        .reset_index(drop=True)

    # actual
    item_similarity = model.item_similarity\
        .toPandas()\
        .sort_values(['i1', 'i2'])\
        .reset_index(drop=True)

    if similarity_type == "cooccurrence":
        assert ((item_similarity_ref == item_similarity).all().all())
    else:
        assert ((item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]
                 ).all().all())

        assert np.allclose(item_similarity.value.values,
                           item_similarity_ref.value.values,
                           atol=sar_settings["ATOL"])
Example #8
from pysarplus import SARPlus, SARModel

# spark dataframe with user/item/rating/optional timestamp tuples
train_df = spark.createDataFrame([
    (1, 1, 1), 
    (1, 2, 1), 
    (2, 1, 1), 
    (3, 1, 1), 
    (3, 3, 1)], 
    ['user_id', 'item_id', 'rating'])

# spark dataframe with user/item tuples
test_df = spark.createDataFrame([
    (1, 1, 1), 
    (3, 3, 1)], 
    ['user_id', 'item_id', 'rating'])

model = SARPlus(
    spark, 
    col_user='user_id',
    col_item='item_id', 
    col_rating='rating', 
    similarity_type='jaccard',
)
model.fit(train_df)

model.recommend_k_items(test_df, 'sarplus_cache', top_k=3).show()
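
The tests above also exercise time decay, which down-weights older interactions. A variant of the same snippet with decay enabled, assuming train_df additionally carries a 'timestamp' column (the column name is an assumption; the decay settings mirror the tests, which also pin time_now to the data's newest timestamp):

model = SARPlus(
    spark,
    col_user='user_id',
    col_item='item_id',
    col_rating='rating',
    col_timestamp='timestamp',   # assumed column name
    similarity_type='jaccard',
    timedecay_formula=True,      # weight interactions by age, as in the tests
    time_decay_coefficient=30,   # decay coefficient the tests above use
)
model.fit(train_df)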