Example 1
def test_RecommendationDataLoader(input_dataframe, target_dataframe,
                                  batch_size, num_sampling_users):
    common_users = input_dataframe.merge(target_dataframe,
                                         how='inner',
                                         on='user').user.unique()
    common_items = input_dataframe.merge(target_dataframe,
                                         how='inner',
                                         on='item').item.unique()

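    # keep only users and items that appear in both dataframes, so the target
    # matrix can be built with the same item/user id maps as the input matrix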
    input_dataframe = input_dataframe[
        input_dataframe.user.isin(common_users)
        & input_dataframe.item.isin(common_items)]
    target_dataframe = target_dataframe[
        target_dataframe.user.isin(common_users)
        & target_dataframe.item.isin(common_items)]

    interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
        input_dataframe, user_col='user', item_col='item', inter_col='inter')

    target_interactions_matrix, _, _ = dataframe_to_csr_matrix(
        target_dataframe,
        user_col='user',
        item_col='item',
        inter_col='inter',
        item_id_map=item_id_map,
        user_id_map=user_id_map)

    dataset = RecommendationDataset(interactions_matrix,
                                    target_interactions_matrix)

    dataloader = RecommendationDataLoader(
        dataset,
        batch_size=batch_size,
        negative_sampling=True,
        num_sampling_users=num_sampling_users)

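    # each batch is returned in sparse form; rebuild a dense tensor and check
    # the batch dimensions (the last batch may be smaller than batch_size)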
    for batch_idx, (input, target) in enumerate(dataloader, 1):
        input_idx, input_val, input_size, input_items = input.indices, input.values, input.size, input.items
        input_dense = torch.sparse.FloatTensor(input_idx, input_val,
                                               input_size).to_dense()

        target_idx, target_val, target_size, target_items = target.indices, target.values, target.size, target.items
        target_dense = torch.sparse.FloatTensor(target_idx, target_val,
                                                target_size).to_dense()

        assert target is not None

        assert input_dense.size(0) == batch_size \
               or batch_idx == len(dataloader) and input_dense.size(0) == len(dataset) % batch_size
        assert input_dense.size(1) == len(input_items)
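
The input_dataframe and target_dataframe arguments used throughout these examples are presumably pytest fixtures and are not shown here. A minimal stand-in, assuming they only need the user, item and inter columns consumed by dataframe_to_csr_matrix above (the concrete values below are hypothetical), could look like:

import pandas as pd

# hypothetical fixture data: implicit interactions (inter == 1)
input_dataframe = pd.DataFrame({
    'user': [0, 0, 1, 1, 2],
    'item': [10, 11, 10, 12, 11],
    'inter': [1, 1, 1, 1, 1],
})
target_dataframe = pd.DataFrame({
    'user': [0, 1, 2],
    'item': [12, 11, 10],
    'inter': [1, 1, 1],
})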
Example 2
def test_RecommendationDataset(input_dataframe):
    interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
        input_dataframe, user_col='user', item_col='item', inter_col='inter')

    dataset = RecommendationDataset(interactions_matrix)

    assert len(dataset) == len(np.unique(input_dataframe['user']))

    replica_df = pd.DataFrame(input_dataframe)

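    # each dataset entry holds a single user's interactions; match every stored
    # (user, item, inter) triple against the dataframe copy and drop it once seen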
    for index in range(len(dataset)):
        user_interactions, _ = dataset[index]
        user = user_interactions.users[0]
        assert user_interactions.interactions_matrix.getnnz() == len(
            replica_df[replica_df.user.map(user_id_map) == user])

        for item_id, inter_val in zip(
                user_interactions.interactions_matrix.nonzero()[1],
                user_interactions.interactions_matrix.data):
            assert len(
                replica_df[(replica_df.user.map(user_id_map) == user)
                           & (replica_df.item.map(item_id_map) == item_id)
                           & (replica_df.inter == inter_val)]) > 0
            replica_df = replica_df[~(
                (replica_df.user.map(user_id_map) == user)
                & (replica_df.item.map(item_id_map) == item_id)
                & (replica_df.inter == inter_val))]

        assert user_interactions.interactions_matrix.getnnz() > 0

    # check that both the returned interactions and the dataframe contain
    # the same set of interactions
    assert len(replica_df) == 0
Example 3
def test_RecommendationDataset_target(input_dataframe, target_dataframe):
    common_users = input_dataframe.merge(target_dataframe,
                                         how='inner',
                                         on='user').user.unique()
    common_items = input_dataframe.merge(target_dataframe,
                                         how='inner',
                                         on='item').item.unique()

    input_dataframe = input_dataframe[
        input_dataframe.user.isin(common_users)
        & input_dataframe.item.isin(common_items)]
    target_dataframe = target_dataframe[
        target_dataframe.user.isin(common_users)
        & target_dataframe.item.isin(common_items)]

    interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
        input_dataframe, user_col='user', item_col='item', inter_col='inter')

    target_interactions_matrix, _, _ = dataframe_to_csr_matrix(
        target_dataframe,
        user_col='user',
        item_col='item',
        inter_col='inter',
        item_id_map=item_id_map,
        user_id_map=user_id_map)

    dataset = RecommendationDataset(interactions_matrix,
                                    target_interactions_matrix)

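    # pick a random user: the input and target entries must refer to the same
    # user and both must contain at least one interaction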
    test_index = np.random.randint(0, len(dataset))

    input_interactions, target_interactions = dataset[test_index]

    assert input_interactions.users == target_interactions.users

    assert input_interactions.interactions_matrix.getnnz() > 0 \
           and target_interactions.interactions_matrix.getnnz() > 0
Example 4
def test_BatchCollator(input_dataframe, batch_size):
    interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
        input_dataframe, user_col='user', item_col='item', inter_col='inter')

    dataset = RecommendationDataset(interactions_matrix)

    batch_collator = BatchCollator(batch_size=batch_size,
                                   negative_sampling=True)

    big_batch, _ = dataset[np.arange(len(dataset))]

    batches = batch_collator.collate(big_batch)

    assert len(batches) == np.ceil(len(dataset) / batch_size)

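    # walk the collated batches and check each dense batch against the
    # corresponding slice of the full interactions matrix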
    current_batch = 0
    for batch in batches:
        input_idx, input_val, input_size, input_words = batch.indices, batch.values, batch.size, batch.items
        input_dense = torch.sparse.FloatTensor(input_idx, input_val,
                                               input_size).to_dense()

        batch_users = big_batch.users[current_batch:current_batch + batch_size]
        batch_sparse_matrix = big_batch.interactions_matrix[
            current_batch:current_batch + batch_size]

        num_values_per_user = [
            batch_sparse_matrix[i].getnnz() for i in range(len(batch_users))
        ]

        assert (input_dense > 0).float().sum(
            dim=1).tolist() == num_values_per_user

        item_idx_map = {
            item_id: item_idx
            for item_idx, item_id in enumerate(input_words.tolist())
        }

        for user_idx in range(len(batch_users)):
            for item_id, val in zip(batch_sparse_matrix[user_idx].nonzero()[1],
                                    batch_sparse_matrix[user_idx].data):
                assert item_id in input_words
                assert input_dense[user_idx, item_idx_map[item_id]] == val

        current_batch += batch_size
Example 5
def test_model(sparse, exp_recall_20, exp_recall_50, exp_ndcg_100):
  data_dir = 'tests/data/'
  model_dir = '/tmp/'

  train_df = pd.read_csv(data_dir + 'train.csv')
  val_df = pd.read_csv(data_dir + 'val.csv')

  # keep the items that exist in the training dataset
  val_df = val_df[val_df.sid.isin(train_df.sid.unique())]

  train_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(train_df, user_col='uid',
                                                                   item_col='sid', inter_col='watched')

  val_matrix, _, _ = dataframe_to_csr_matrix(val_df, user_col='uid',
                                             item_col='sid', inter_col='watched',
                                             item_id_map=item_id_map, user_id_map=user_id_map)

  train_dataset = RecommendationDataset(train_matrix)
  val_dataset = RecommendationDataset(val_matrix, train_matrix)


  use_cuda = False
  model = DynamicAutoencoder(hidden_layers=[200], activation_type='tanh',
                             noise_prob=0.5, sparse=sparse)
  trainer = Recoder(model=model, use_cuda=use_cuda, optimizer_type='adam',
                    loss='logloss')

  trainer.train(train_dataset=train_dataset, val_dataset=val_dataset,
                batch_size=500, lr=1e-3, weight_decay=2e-5,
                num_epochs=30, negative_sampling=True)

  # assert model metrics
  recall_20 = Recall(k=20, normalize=True)
  recall_50 = Recall(k=50, normalize=True)
  ndcg_100 = NDCG(k=100)

  results = trainer._evaluate(eval_dataset=val_dataset, num_recommendations=100,
                              metrics=[recall_20, recall_50, ndcg_100], batch_size=500)

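  # average the metric values returned by _evaluate before comparing them
  # to the expected scores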
  for metric, value in list(results.items()):
    results[metric] = np.mean(results[metric])

  assert np.isclose(results[recall_20], exp_recall_20, atol=0.01, rtol=0)
  assert np.isclose(results[recall_50], exp_recall_50, atol=0.01, rtol=0)
  assert np.isclose(results[ndcg_100], exp_ndcg_100, atol=0.01, rtol=0)

  # Save the model and evaluate again
  model_checkpoint = model_dir + 'test_model.model'
  state_file = trainer.save_state(model_checkpoint)

  model = DynamicAutoencoder(sparse=sparse)
  trainer = Recoder(model=model, use_cuda=use_cuda,
                    optimizer_type='adam', loss='logloss')

  trainer.init_from_model_file(state_file)

  results = trainer._evaluate(eval_dataset=val_dataset, num_recommendations=100,
                              metrics=[recall_20, recall_50, ndcg_100], batch_size=500)

  for metric, value in list(results.items()):
    results[metric] = np.mean(results[metric])

  assert np.isclose(results[recall_20], exp_recall_20, atol=0.01, rtol=0)
  assert np.isclose(results[recall_50], exp_recall_50, atol=0.01, rtol=0)
  assert np.isclose(results[ndcg_100], exp_ndcg_100, atol=0.01, rtol=0)

  os.remove(state_file)
Example 6
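    # (excerpt) this snippet starts inside what is presumably the
    # "if method == 'inference':" branch; recoder is assumed to have been
    # constructed just above this point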
    recoder.init_from_model_file(model_file)
    recommender = InferenceRecommender(recoder, num_recommendations)
elif method == 'similarity':
    embeddings_index = AnnoyEmbeddingsIndex()
    embeddings_index.load(index_file=index_file)
    cache_embeddings_index = MemCacheEmbeddingsIndex(embeddings_index)
    recommender = SimilarityRecommender(cache_embeddings_index,
                                        num_recommendations,
                                        scale=1,
                                        n=50)

train_df = pd.read_csv(data_dir + 'train.csv')
val_te_df = pd.read_csv(data_dir + 'test_te.csv')
val_tr_df = pd.read_csv(data_dir + 'test_tr.csv')

train_matrix, item_id_map, _ = dataframe_to_csr_matrix(train_df,
                                                       **common_params)

val_tr_matrix, _, user_id_map = dataframe_to_csr_matrix(
    val_tr_df, item_id_map=item_id_map, **common_params)
val_te_matrix, _, _ = dataframe_to_csr_matrix(val_te_df,
                                              item_id_map=item_id_map,
                                              user_id_map=user_id_map,
                                              **common_params)

val_tr_dataset = RecommendationDataset(val_tr_matrix, val_te_matrix)

metrics = [Recall(k=20), Recall(k=50), NDCG(k=100)]
evaluator = RecommenderEvaluator(recommender, metrics)

metrics_accumulated = evaluator.evaluate(val_tr_dataset, batch_size=500)
Example 7
def train_rs(proc_dir: str, model_dir: str, model_name: str, lr: float,
             lr_milestones: List[int], wd: float, epochs: int, emb_size: int,
             batch_size: int, valid_users_pct: float, valid_items_pct: float,
             wo_eval: bool):
    print('Reading data...')
    ds = pd.read_csv(path.join(proc_dir, 'ds.csv'))
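    # implicit feedback: every observed (user, item) pair gets an interaction value of 1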
    ds['inter'] = 1

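    # identity item-id map, so the matrix columns keep the original item ids
    # across the train/validation splits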
    item_identity = {i: i for i in ds['item']}

    if wo_eval:
        train = ds
    else:
        print('Train test split...')
        train, valid = train_test_split(ds, valid_users_pct)
        valid_t, valid_e = train_eval_split(valid, valid_items_pct)
        del valid
    del ds

    print('Making sparse matrices...')

    common_params = {
        'user_col': 'user',
        'item_col': 'item',
        'inter_col': 'inter',
    }

    train_matrix, _, _ = dataframe_to_csr_matrix(train,
                                                 item_id_map=item_identity,
                                                 **common_params)
    train_dataset = RecommendationDataset(train_matrix)
    del train

    if wo_eval:
        valid_dataset = None
    else:
        # noinspection PyUnboundLocalVariable
        val_t_matrix, _, user_id_map = dataframe_to_csr_matrix(
            valid_t, item_id_map=item_identity, **common_params)
        # noinspection PyUnboundLocalVariable
        val_e_matrix, _, _ = dataframe_to_csr_matrix(valid_e,
                                                     item_id_map=item_identity,
                                                     user_id_map=user_id_map,
                                                     **common_params)
        valid_dataset = RecommendationDataset(val_t_matrix, val_e_matrix)
        del valid_t, valid_e
    use_cuda = True

    print('Training model...')

    model = DynamicAutoencoder(hidden_layers=[emb_size],
                               activation_type='tanh',
                               noise_prob=0.5,
                               sparse=False)

    trainer = Recoder(model=model,
                      use_cuda=use_cuda,
                      optimizer_type='adam',
                      loss='logistic',
                      user_based=False)

    metrics = [
        Recall(k=20, normalize=True),
        Recall(k=50, normalize=True),
        NDCG(k=100)
    ]

    model_prefix = path.join(model_dir, model_name)
    eval_num_recs = 100
    trainer.train(train_dataset=train_dataset,
                  val_dataset=valid_dataset,
                  batch_size=batch_size,
                  lr=lr,
                  weight_decay=wd,
                  num_epochs=epochs,
                  negative_sampling=True,
                  lr_milestones=lr_milestones,
                  num_data_workers=mp.cpu_count(),
                  model_checkpoint_prefix=model_prefix,
                  checkpoint_freq=0,
                  eval_num_recommendations=eval_num_recs,
                  metrics=metrics,
                  eval_freq=5)

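    # the trainer writes its last checkpoint as '<prefix>_epoch_<num_epochs>.model';
    # rename it to a stable '<prefix>.model' path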
    actual_path = "{}_epoch_{}.model".format(model_prefix, epochs)
    shutil.move(actual_path, model_prefix + '.model')

    results = trainer._evaluate(valid_dataset, eval_num_recs, metrics,
                                batch_size)

    with open(model_prefix + '_metrics.json', 'w') as f:
        json.dump(
            {str(metric): np.mean(results[metric])
             for metric in metrics}, f)