import numpy as np
import pandas as pd

from pysarplus import SARPlus


def test_e2e(spark, pandas_dummy_dataset, header):
    sar = SARPlus(spark, **header, cache_path="tests/test_e2e_cache")

    df = spark.createDataFrame(pandas_dummy_dataset)
    sar.fit(df)

    test_df = spark.createDataFrame(
        pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})
    )

    r1 = (
        sar.recommend_k_items(test_df, top_k=3, remove_seen=False)
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    r2 = (
        sar.recommend_k_items(
            test_df,
            top_k=3,
            n_user_prediction_partitions=2,
            remove_seen=False,
            use_cache=True,
        )
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    # Both prediction paths must agree on the recommended items and scores.
    assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
    assert np.allclose(r1.score.values, r2.score.values, rtol=1e-3)
# Variant of the end-to-end test that exercises the slow scoring path and the
# older recommend_k_items signature with a positional cache path.
def test_e2e_slow(spark, pandas_dummy_dataset, header):
    sar = SARPlus(spark, **header)

    df = spark.createDataFrame(pandas_dummy_dataset)
    sar.fit(df)

    # assert 4*4 + 32 == sar.item_similarity.count()
    # print(sar.item_similarity
    #       .toPandas()
    #       .pivot_table(index='i1', columns='i2', values='value'))

    test_df = spark.createDataFrame(
        pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})
    )

    r1 = (
        sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    r2 = (
        sar.recommend_k_items(
            test_df,
            "tests/test_e2e_cache",
            top_k=3,
            n_user_prediction_partitions=2,
            remove_seen=False,
        )
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
    assert np.allclose(r1.score.values, r2.score.values, rtol=1e-3)
def test_userpred(
    spark, threshold, similarity_type, file, header, sar_settings, demo_usage_data
):
    time_now = demo_usage_data[header["col_timestamp"]].max()

    test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file)

    model = SARPlus(
        spark,
        **header,
        table_prefix=test_id,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=time_now,
        threshold=threshold,
        similarity_type=similarity_type,
    )

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    url = (
        sar_settings["FILE_DIR"]
        + "userpred_"
        + file
        + str(threshold)
        + "_userid_only.csv"
    )

    pred_ref = pd.read_csv(url)
    pred_ref = (
        pd.wide_to_long(pred_ref, ["rec", "score"], "user", "idx")
        .sort_values("score", ascending=False)
        .reset_index(drop=True)
    )

    # Note: it's important to use a separate cache_path for each run,
    # as runs sharing a path interfere with each other.
    pred = model.recommend_k_items(
        spark.createDataFrame(
            demo_usage_data[
                demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
            ]
        ),
        cache_path="test_userpred-" + test_id,
        top_k=10,
        n_user_prediction_partitions=1,
    )

    pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True)

    assert (pred.MovieId.values == pred_ref.rec.values).all()
    assert np.allclose(
        pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]
    )
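# ---------------------------------------------------------------------------
# The tests above rely on pytest fixtures (spark, header, pandas_dummy_dataset,
# and so on) defined elsewhere in the suite. Below is a minimal sketch of what
# they might look like. The column names and dummy data are assumptions, not
# the actual fixture definitions; only header["col_item"] == "MovieId" is
# implied by the asserts in test_userpred.
# ---------------------------------------------------------------------------
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="module")
def spark():
    # Hypothetical fixture: a small local SparkSession for the tests.
    return (
        SparkSession.builder.master("local[2]")
        .appName("sarplus-tests")
        .getOrCreate()
    )


@pytest.fixture(scope="module")
def header():
    # Column-name mapping passed to SARPlus via **header.
    return {
        "col_user": "UserId",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
    }


@pytest.fixture(scope="module")
def pandas_dummy_dataset(header):
    # Hypothetical tiny interaction table; the real fixture may differ.
    return pd.DataFrame({
        header["col_user"]: [1, 1, 2, 2, 3, 3, 3],
        header["col_item"]: [1, 2, 1, 3, 1, 2, 3],
        header["col_rating"]: [1, 1, 1, 1, 1, 1, 1],
    })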
from pysarplus import SARPlus

# spark dataframe with user/item/rating (and optional timestamp) tuples
train_df = spark.createDataFrame(
    [(1, 1, 1), (1, 2, 1), (2, 1, 1), (3, 1, 1), (3, 3, 1)],
    ["user_id", "item_id", "rating"],
)

# spark dataframe with user/item tuples to score
test_df = spark.createDataFrame(
    [(1, 1, 1), (3, 3, 1)],
    ["user_id", "item_id", "rating"],
)

model = SARPlus(
    spark,
    col_user="user_id",
    col_item="item_id",
    col_rating="rating",
    similarity_type="jaccard",
)
model.fit(train_df)

model.recommend_k_items(test_df, "sarplus_cache", top_k=3).show()
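# A minimal follow-up sketch, assuming the same SparkSession ("spark") and the
# model fitted above: pull the recommendations back into pandas to inspect the
# top-k items and scores per user, reusing the same cache path.
top_k = (
    model.recommend_k_items(test_df, "sarplus_cache", top_k=3)
    .toPandas()
    .sort_values(["user_id", "score"], ascending=[True, False])
)
print(top_k)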