def test_fit(similarity_type, timedecay_formula, train_test_dummy_timestamp,
             header):
    model = SARSingleNode(similarity_type=similarity_type,
                          timedecay_formula=timedecay_formula,
                          **header)
    trainset, testset = train_test_dummy_timestamp
    model.fit(trainset)
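These test snippets rely on pytest fixtures (header, train_test_dummy_timestamp, and later demo_usage_data and sar_settings) that are not shown on this page. A minimal sketch of what the first two might look like; the column names and dummy values are assumptions, not the project's actual fixtures:

import pandas as pd
import pytest


@pytest.fixture
def header():
    # Column-name mapping unpacked into SARSingleNode via **header (names assumed)
    return {
        "col_user": "UserId",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
    }


@pytest.fixture
def train_test_dummy_timestamp(header):
    # Tiny hypothetical interaction log with timestamps, split into train and test
    df = pd.DataFrame({
        header["col_user"]: [1, 1, 1, 2, 2, 2],
        header["col_item"]: [1, 2, 3, 1, 4, 5],
        header["col_rating"]: [5.0, 4.0, 3.0, 5.0, 3.0, 2.0],
        header["col_timestamp"]: [10, 20, 30, 40, 50, 60],
    })
    return df.iloc[:4].copy(), df.iloc[4:].copy()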
Example #2
def test_user_affinity(demo_usage_data, sar_settings, header):
    time_now = demo_usage_data[header["col_timestamp"]].max()
    model = SARSingleNode(
        similarity_type="cooccurrence",
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=time_now,
        **header
    )
    model.fit(demo_usage_data)

    true_user_affinity, items = load_affinity(sar_settings["FILE_DIR"] + "user_aff.csv")
    user_index = model.user2index[sar_settings["TEST_USER_ID"]]
    sar_user_affinity = np.reshape(
        np.array(
            _rearrange_to_test(
                model.user_affinity, None, items, None, model.item2index
            )[
                user_index,
            ].todense()
        ),
        -1,
    )
    assert np.allclose(
        true_user_affinity.astype(sar_user_affinity.dtype),
        sar_user_affinity,
        atol=sar_settings["ATOL"],
    )
Example #3
def test_recommend_k_items(
    threshold, similarity_type, file, header, sar_settings, demo_usage_data
):
    time_now = demo_usage_data[header["col_timestamp"]].max()
    model = SARSingleNode(
        similarity_type=similarity_type,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=time_now,
        threshold=threshold,
        **header
    )
    model.fit(demo_usage_data)

    true_items, true_scores = load_userpred(
        sar_settings["FILE_DIR"]
        + "userpred_"
        + file
        + str(threshold)
        + "_userid_only.csv"
    )
    test_results = model.recommend_k_items(
        demo_usage_data[
            demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
        ],
        top_k=10,
        sort_top_k=True,
        remove_seen=True,
    )
    test_items = list(test_results[header["col_item"]])
    test_scores = np.array(test_results["prediction"])
    assert true_items == test_items
    assert np.allclose(true_scores, test_scores, atol=sar_settings["ATOL"])
def test_get_normalized_scores(header):
    train = pd.DataFrame({
        header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
        header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
        header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
        header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
    })
    test = pd.DataFrame({
        header["col_user"]: [1, 1, 1, 2, 2, 2],
        header["col_item"]: [5, 6, 7, 2, 3, 4],
        header["col_rating"]: [2.0, 1.0, 5.0, 3.0, 4.0, 5.0],
    })

    model = SARSingleNode(**header, timedecay_formula=True, normalize=True)
    model.fit(train)
    actual = model.score(test, remove_seen=True, normalize=True)
    expected = np.array([
        [-np.inf, -np.inf, -np.inf, -np.inf, 3.0, 3.0, 3.0],
        [-np.inf, 3.0, 3.0, 3.0, -np.inf, -np.inf, -np.inf],
    ])
    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, actual).all()

    actual = model.score(test, normalize=True)
    expected = np.array([
        [3.80000633, 4.14285448, 4.14285448, 4.14285448, 3.0, 3.0, 3.0],
        [2.8000859, 3.0, 3.0, 3.0, 2.71441353, 2.71441353, 2.71441353],
    ])

    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, actual).all()
Example #5
def test_get_normalized_scores(header):
    train = pd.DataFrame(
        {
            header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
            header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
            header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
            header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
        }
    )
    test = pd.DataFrame(
        {
            header["col_user"]: [1, 1, 1, 2, 2, 2],
            header["col_item"]: [5, 6, 7, 2, 3, 4],
            header["col_rating"]: [2.0, 1.0, 5.0, 3.0, 4.0, 5.0],
        }
    )

    model = SARSingleNode(**header, timedecay_formula=True, normalize=True)
    model.fit(train)
    actual = model.score(test, remove_seen=True)
    expected = np.array(
        [
            [-np.inf, -np.inf, -np.inf, -np.inf, 1.23512374, 1.23512374, 1.23512374],
            [-np.inf, 1.23512374, 1.23512374, 1.23512374, -np.inf, -np.inf, -np.inf],
        ]
    )
    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, np.asarray(actual)).all()

    actual = model.score(test)
    expected = np.array(
        [
            [
                3.11754872,
                4.29408577,
                4.29408577,
                4.29408577,
                1.23512374,
                1.23512374,
                1.23512374,
            ],
            [
                2.5293308,
                1.23511758,
                1.23511758,
                1.23511758,
                3.11767458,
                3.11767458,
                3.11767458,
            ],
        ]
    )

    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, np.asarray(actual)).all()
def test_fit(similarity_type, timedecay_formula, train_test_dummy_timestamp,
             header):
    model = SARSingleNode(remove_seen=True,
                          similarity_type=similarity_type,
                          timedecay_formula=timedecay_formula,
                          **header)
    trainset, testset = train_test_dummy_timestamp
    _apply_sar_hash_index(model, trainset, testset, header)

    model.fit(trainset)
def test_get_popularity_based_topk(header):

    train_df = pd.DataFrame({
        header["col_user"]: [1, 1, 1, 2, 2, 2, 3, 3, 3],
        header["col_item"]: [1, 2, 3, 1, 3, 4, 5, 6, 1],
        header["col_rating"]: [1, 2, 3, 1, 2, 3, 1, 2, 3]
    })

    sar = SARSingleNode(**header)
    sar.fit(train_df)

    expected = pd.DataFrame(dict(MovieId=[1, 3, 4], prediction=[3, 2, 1]))
    actual = sar.get_popularity_based_topk(top_k=3, sort_top_k=True)
    assert_frame_equal(expected, actual)
Example #8
def test_predict(
    similarity_type, timedecay_formula, train_test_dummy_timestamp, header
):
    model = SARSingleNode(
        similarity_type=similarity_type, timedecay_formula=timedecay_formula, **header
    )
    trainset, testset = train_test_dummy_timestamp
    model.fit(trainset)
    preds = model.predict(testset)

    assert len(preds) == 2
    assert isinstance(preds, pd.DataFrame)
    assert preds[header["col_user"]].dtype == trainset[header["col_user"]].dtype
    assert preds[header["col_item"]].dtype == trainset[header["col_item"]].dtype
    assert preds[DEFAULT_PREDICTION_COL].dtype == trainset[header["col_rating"]].dtype
Example #9
def test_predict_all_items(train_test_dummy_timestamp, header):
    model = SARSingleNode(**header)
    trainset, _ = train_test_dummy_timestamp
    model.fit(trainset)

    user_items = itertools.product(
        trainset[header["col_user"]].unique(), trainset[header["col_item"]].unique()
    )
    testset = pd.DataFrame(user_items, columns=[header["col_user"], header["col_item"]])
    preds = model.predict(testset)

    assert len(preds) == len(testset)
    assert isinstance(preds, pd.DataFrame)
    assert preds[header["col_user"]].dtype == trainset[header["col_user"]].dtype
    assert preds[header["col_item"]].dtype == trainset[header["col_item"]].dtype
    assert preds[DEFAULT_PREDICTION_COL].dtype == trainset[header["col_rating"]].dtype
Example #10
def test_get_item_based_topk(header, pandas_dummy):

    sar = SARSingleNode(**header)
    sar.fit(pandas_dummy)

    # test with just items provided
    expected = pd.DataFrame(
        dict(UserId=[0, 0, 0], MovieId=[8, 7, 6], prediction=[2.0, 2.0, 2.0])
    )
    items = pd.DataFrame({header["col_item"]: [1, 5, 10]})
    actual = sar.get_item_based_topk(items, top_k=3)
    assert_frame_equal(expected, actual, check_dtype=False)

    # test with items and users
    expected = pd.DataFrame(
        dict(
            UserId=[100, 100, 100, 1, 1, 1],
            MovieId=[8, 7, 6, 4, 3, 10],
            prediction=[2.0, 2.0, 2.0, 2.0, 2.0, 1.0],
        )
    )
    items = pd.DataFrame(
        {
            header["col_user"]: [100, 100, 1, 100, 1, 1],
            header["col_item"]: [1, 5, 1, 10, 2, 6],
        }
    )
    actual = sar.get_item_based_topk(items, top_k=3, sort_top_k=True)
    assert_frame_equal(expected, actual, check_dtype=False)

    # test with items, users, and ratings
    expected = pd.DataFrame(
        dict(
            UserId=[100, 100, 100, 1, 1, 1],
            MovieId=[2, 4, 3, 4, 3, 10],
            prediction=[5.0, 5.0, 5.0, 8.0, 8.0, 4.0],
        )
    ).set_index(["UserId", "MovieId"])
    items = pd.DataFrame(
        {
            header["col_user"]: [100, 100, 1, 100, 1, 1],
            header["col_item"]: [1, 5, 1, 10, 2, 6],
            header["col_rating"]: [5, 1, 3, 1, 5, 4],
        }
    )
    actual = sar.get_item_based_topk(items, top_k=3).set_index(["UserId", "MovieId"])
    assert_frame_equal(expected, actual, check_like=True)
def test_predict(similarity_type, timedecay_formula,
                 train_test_dummy_timestamp, header):
    model = SARSingleNode(remove_seen=True,
                          similarity_type=similarity_type,
                          timedecay_formula=timedecay_formula,
                          **header)
    trainset, testset = train_test_dummy_timestamp

    _apply_sar_hash_index(model, trainset, testset, header)

    model.fit(trainset)
    preds = model.predict(testset)

    assert len(preds) == 2
    assert isinstance(preds, pd.DataFrame)
    assert preds[header["col_user"]].dtype == object
    assert preds[header["col_item"]].dtype == object
    assert preds[PREDICTION_COL].dtype == float
def test_sar_item_similarity(threshold, similarity_type, file, demo_usage_data,
                             sar_settings, header):

    model = SARSingleNode(remove_seen=True,
                          similarity_type=similarity_type,
                          timedecay_formula=False,
                          time_decay_coefficient=30,
                          time_now=TIME_NOW,
                          threshold=threshold,
                          **header)

    _apply_sar_hash_index(model, demo_usage_data, None, header)

    model.fit(demo_usage_data)

    true_item_similarity, row_ids, col_ids = read_matrix(
        sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv")

    if similarity_type is "cooccurrence":
        test_item_similarity = _rearrange_to_test(
            model.item_similarity.todense(),
            row_ids,
            col_ids,
            model.item_map_dict,
            model.item_map_dict,
        )
        assert np.array_equal(
            true_item_similarity.astype(test_item_similarity.dtype),
            test_item_similarity,
        )
    else:
        test_item_similarity = _rearrange_to_test(
            np.array(model.item_similarity),
            row_ids,
            col_ids,
            model.item_map_dict,
            model.item_map_dict,
        )
        assert np.allclose(
            true_item_similarity.astype(test_item_similarity.dtype),
            test_item_similarity,
            atol=sar_settings["ATOL"],
        )
def SARtrain():
    data = pd.read_csv("SnacksData100.csv")
    data.loc[:, 'Ratings'] = data['Ratings'].astype(np.float32)
    header = {
        "col_user": "******",
        "col_item": "Product_Id",
        "col_rating": "Ratings",
        "col_timestamp": "timestamp",
    }
    train, test = python_stratified_split(data,
                                          ratio=0.75,
                                          col_user=header["col_user"],
                                          col_item=header["col_item"],
                                          seed=42)
    joblib.dump(test, 'testdata')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)-8s %(message)s')
    model = SARSingleNode(similarity_type="jaccard",
                          time_decay_coefficient=30,
                          time_now=None,
                          timedecay_formula=True,
                          **header)
    model.fit(train)
    joblib.dump(model, 'SARDump')
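A possible counterpart to SARtrain that reloads the dumped artifacts and produces recommendations; the file names match the joblib dumps above, while the function itself is only a sketch:

import joblib

def SARrecommend(top_k=10):
    # Reload the fitted model and the held-out split written by SARtrain
    model = joblib.load('SARDump')
    test = joblib.load('testdata')
    # Top-k items per user in the test set, excluding items seen during training
    return model.recommend_k_items(test, top_k=top_k, remove_seen=True)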
def train_sar(params, data):
    model = SARSingleNode(**params)
    model.set_index(data)
    with Timer() as t:
        model.fit(data)
    return model, t
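A hypothetical call site for train_sar; the params dict mirrors the constructor arguments used in the other examples on this page, and Timer is assumed to be the recommenders utility whose interval attribute holds elapsed seconds:

params = {
    "similarity_type": "jaccard",
    "time_decay_coefficient": 30,
    "time_now": None,
    "timedecay_formula": True,
    **header,
}
model, train_timer = train_sar(params, train)
print("Training took {:.2f} seconds".format(train_timer.interval))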
Example #15
header = {
    "col_user": "******",
    "col_item": "MovieId",
    "col_rating": "Rating",
    "col_timestamp": "Timestamp",
}

model = SARSingleNode(remove_seen=True,
                      similarity_type="jaccard",
                      time_decay_coefficient=30,
                      time_now=None,
                      timedecay_formula=True,
                      **header)

start_time = time.time()
model.fit(train)
train_time = time.time() - start_time

start_time = time.time()
topk = model.recommend_k_items(test)
test_time = time.time() - start_time

# TODO: remove this call when the model returns same type as input

topk['UserId'] = pd.to_numeric(topk['UserId'])
topk['MovieId'] = pd.to_numeric(topk['MovieId'])

mlask(begin="\n", end="\n")

mlcat(
    "Fit the SAR Model", """\
Example #16
header = {
    "col_user": "******",
    "col_item": "MovieId",
    "col_rating": "Rating",
    "col_timestamp": "Timestamp",
}

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')

model = SARSingleNode(remove_seen=True,
                      similarity_type="jaccard",
                      time_decay_coefficient=30,
                      time_now=None,
                      timedecay_formula=True,
                      **header)

# train the SAR model
start_time = time.time()

model.fit(train)

train_time = time.time() - start_time
run.log(name="Training time", value=train_time)

start_time = time.time()

top_k = model.recommend_k_items(test)

test_time = time.time() - start_time
run.log(name="Prediction time", value=test_time)

# TODO: remove this call when the model returns same type as input
top_k['UserId'] = pd.to_numeric(top_k['UserId'])
top_k['MovieId'] = pd.to_numeric(top_k['MovieId'])
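This snippet logs timings to an AzureML run; a natural follow-up (a sketch only, not part of the original script) would be to log ranking metrics for top_k as well, assuming the reco_utils evaluation helpers and the MovieLens column names used above:

from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k

# Ranking metrics against the held-out test split (k matches the default top_k of 10)
eval_map = map_at_k(test, top_k, col_user="UserId", col_item="MovieId",
                    col_rating="Rating", col_prediction="prediction", k=10)
eval_ndcg = ndcg_at_k(test, top_k, col_user="UserId", col_item="MovieId",
                      col_rating="Rating", col_prediction="prediction", k=10)
run.log(name="map", value=eval_map)
run.log(name="ndcg", value=eval_ndcg)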