def run_tests(IC_DATA_DIR):
    from lib.process_data import instacart_process
    from lib.data_class import DataSet
    from models.feature_models import MainFeatureModel

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # create feature model and transform dataset
    feature_model = MainFeatureModel()
    X = feature_model.transform(ic_dataset)
    assert X.shape == (12214, 8)

    feature_model = MainFeatureModel()
    X = feature_model.transform(ic_dataset, return_df=True, drop_categorical=False)
    assert X.shape == (12214, 10)

    log.info("feature_models tests passed!")
Example 2
def run_tests(IC_DATA_DIR):
    from lib.process_data import instacart_process
    from lib.data_class import DataSet

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # check user and product ids created correctly
    assert ic_dataset.user_ids.shape == (206, )
    assert len(ic_dataset.user_idx) == 206
    assert ic_dataset.prod_ids.shape == (6126, )
    assert len(ic_dataset.prod_idx) == 6126

    # perform train-test split
    train_dataset, test_dataset = ic_dataset.train_test_split()
    assert train_dataset.order_df.shape == (24939, 4)
    assert train_dataset.product_df.shape == (6126, 5)
    assert test_dataset.order_df.shape == (6093, 4)
    assert test_dataset.product_df.shape == (6126, 5)

    # check that prod_ids inherited correctly
    assert (train_dataset.prod_ids == ic_dataset.prod_ids).all()
    assert (test_dataset.prod_ids == ic_dataset.prod_ids).all()

    # create adversarial dataset
    adv_dataset = ic_dataset.make_adversarial()
    assert adv_dataset.order_df.shape == (31032, 4)
    assert adv_dataset.product_df.shape == (6126, 5)
    assert (adv_dataset.user_ids == ic_dataset.user_ids).all()

    # number of entries where the old and new dfs differ; should equal the number
    # of users (unless a product is swapped for itself, which is very unlikely)
    assert np.sum(ic_dataset.order_df.product_id.values !=
                  adv_dataset.order_df.product_id.values) == 206

    # test prior-last order split
    assert ic_dataset.prior_order_df.shape == (28906, 4)
    assert ic_dataset.prior_user_prod.shape == (12214, 2)
    assert ic_dataset.labels.shape == (12214, )
    assert ic_dataset.size == 12214

    # test user-product matrix
    assert ic_dataset.user_prod_matrix.shape == (206, 6126)

    log.info("data_class tests passed!")
Example 3
def run_tests(IC_DATA_DIR):
    from lib.process_data import instacart_process
    from lib.data_class import DataSet
    # module path assumed from the "baseline_models tests passed!" log message
    from models.baseline_models import (GetAllModel, RandomModel, LogisticModel,
                                        RandomForestModel, LGBoostModel)

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # perform train-test split
    train_dataset, test_dataset = ic_dataset.train_test_split()
    assert train_dataset.order_df.shape == (24939, 4)
    assert test_dataset.order_df.shape == (6093, 4)

    # getall model
    model = GetAllModel()
    model.fit(train_dataset)
    model.predict(test_dataset)
    assert model.preds.shape == test_dataset.labels.shape

    # random model
    model = RandomModel()
    model.fit(train_dataset)
    model.predict(test_dataset)
    assert model.preds.shape == test_dataset.labels.shape

    # logistic model
    model = LogisticModel()
    model.fit(train_dataset)
    model.predict(test_dataset)
    assert model.preds.shape == test_dataset.labels.shape

    # random forest model
    model = RandomForestModel()
    model.fit(train_dataset)
    model.predict(test_dataset)
    assert model.preds.shape == test_dataset.labels.shape

    # LGBoost model
    model = LGBoostModel()
    model.fit(train_dataset)
    model.predict(test_dataset)
    assert model.preds.shape == test_dataset.labels.shape

    log.info("baseline_models tests passed!")
Example 4
def run_tests(IC_DATA_DIR):
    from lib.process_data import instacart_process
    from lib.data_class import DataSet
    # module path assumed from the "latent_models tests passed!" log message
    from models.latent_models import (UserModel, TopUserModel, word2vecModel,
                                      TFIDFModel, ProductModel)

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # perform train-test split
    train_dataset, test_dataset = ic_dataset.train_test_split()
    assert train_dataset.order_df.shape == (24939, 4)
    assert test_dataset.order_df.shape == (6093, 4)

    # create user latent model and fit to train_dataset
    user_latent = UserModel()
    user_latent.fit(train_dataset, epochs=2)
    assert user_latent.transform(train_dataset).shape == (9447, 32)

    # test encoding and decoding works as expected
    encoded_upm = user_latent.encoder.predict(train_dataset.user_prod_matrix)
    decoded_upm = user_latent.decoder.predict(encoded_upm)
    autoencoded_upm = user_latent.autoencoder.predict(
        train_dataset.user_prod_matrix)
    assert encoded_upm.shape == (154, 32)
    assert decoded_upm.shape == (154, 6126)
    assert (decoded_upm == autoencoded_upm).all()

    # create topological user latent model and fit to train_dataset
    top_user_latent = TopUserModel()
    top_user_latent.fit_transform(train_dataset, epochs=2)
    assert top_user_latent.transform(test_dataset).shape == (2767, 219)

    # # manually compare original UPM and autoencoder prediction
    # print(train_dataset.user_prod_matrix[:5, :6])
    # print(autoencoded_upm[:5, :6])

    # transform test_dataset
    assert user_latent.transform(test_dataset).shape == (2767, 32)

    # create word2vec latent model, fit to train_dataset, and transform test_dataset
    # TODO: not retraining correctly...
    w2v_model = word2vecModel()
    w2v_model.fit(train_dataset)
    log.debug(w2v_model.model)
    assert w2v_model.transform(test_dataset).shape == (2767, 100)

    # create TFIDF latent model, fit to train_dataset, and transform test_dataset
    tfidf_model = TFIDFModel()
    tfidf_model.fit(train_dataset)
    assert tfidf_model.transform(test_dataset).shape == (2767, 20)

    # create product latent model (combining the previous two), fit to train_dataset,
    # and transform test_dataset
    product_latent = ProductModel()
    product_latent.fit(train_dataset)
    assert product_latent.transform(test_dataset).shape == (2767, 10)

    log.info("latent_models tests passed!")
def run_tests(IC_DATA_DIR):
    from lib.process_data import instacart_process
    from lib.data_class import DataSet
    from models.latent_models import UserModel, ProductModel
    from models.feature_models import MainFeatureModel
    # module path assumed from the "main_models tests passed!" log message
    from models.main_models import NonTopModel, TopModel

    # set random seed for consistent tests
    np.random.seed(42)

    # load data from instacart csv files (values below use testing directory)
    order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

    # create dataset
    ic_dataset = DataSet(order_df=order_data, product_df=product_data)

    # check dataframes created correctly
    assert ic_dataset.order_df.shape == (31032, 4)
    assert ic_dataset.product_df.shape == (6126, 5)

    # perform train-test split
    train_dataset, test_dataset = ic_dataset.train_test_split()
    assert train_dataset.order_df.shape == (24939, 4)
    assert test_dataset.order_df.shape == (6093, 4)

    # create user latent model, fit and transform
    user_latent = UserModel()
    user_latent.fit(train_dataset, epochs=2)
    assert user_latent.transform(train_dataset).shape == (9447, 32)

    # create product latent model, fit and transform
    product_latent = ProductModel()
    product_latent.fit(train_dataset)
    assert product_latent.transform(train_dataset).shape == (9447, 10)

    # create feature model
    feature_model = MainFeatureModel()
    X = feature_model.transform(train_dataset)
    assert X.shape == (9447, 8)

    # fit non-top model to train_dataset
    model = NonTopModel(user_latent_model=user_latent,
                        product_latent_model=product_latent,
                        feature_model=feature_model)
    model.fit(train_dataset, fit_latent=False, epochs=2)
    assert model.input_dim == 50

    # predict on test_dataset
    model.predict(test_dataset)
    assert model.preds.shape == (2767, )
    assert model.preds.shape == test_dataset.labels.shape

    # fit top model to train_dataset
    model = TopModel(user_latent_model=user_latent,
                     product_latent_model=product_latent,
                     feature_model=feature_model)
    model.fit(train_dataset, fit_latent=False, epochs=2)
    print(model.input_dim)

    # predict on test_dataset
    model.predict(test_dataset)
    assert model.preds.shape == (2767, )
    assert model.preds.shape == test_dataset.labels.shape

    log.info("main_models tests passed!")