def test_RecommendationDataLoader(input_dataframe, target_dataframe,
                                  batch_size, num_sampling_users):
  common_users = input_dataframe.merge(target_dataframe, how='inner', on='user').user.unique()
  common_items = input_dataframe.merge(target_dataframe, how='inner', on='item').item.unique()

  input_dataframe = input_dataframe[input_dataframe.user.isin(common_users)
                                    & input_dataframe.item.isin(common_items)]
  target_dataframe = target_dataframe[target_dataframe.user.isin(common_users)
                                      & target_dataframe.item.isin(common_items)]

  interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
      input_dataframe, user_col='user', item_col='item', inter_col='inter')
  target_interactions_matrix, _, _ = dataframe_to_csr_matrix(
      target_dataframe, user_col='user', item_col='item', inter_col='inter',
      item_id_map=item_id_map, user_id_map=user_id_map)

  dataset = RecommendationDataset(interactions_matrix, target_interactions_matrix)
  dataloader = RecommendationDataLoader(dataset, batch_size=batch_size,
                                        negative_sampling=True,
                                        num_sampling_users=num_sampling_users)

  for batch_idx, (input, target) in enumerate(dataloader, 1):
    input_idx, input_val, input_size, input_items = \
        input.indices, input.values, input.size, input.items
    input_dense = torch.sparse.FloatTensor(input_idx, input_val, input_size).to_dense()

    target_idx, target_val, target_size, target_items = \
        target.indices, target.values, target.size, target.items
    target_dense = torch.sparse.FloatTensor(target_idx, target_val, target_size).to_dense()

    assert target is not None
    # every batch is full-sized, except possibly the last one
    assert input_dense.size(0) == batch_size \
        or (batch_idx == len(dataloader)
            and input_dense.size(0) == len(dataset) % batch_size)
    assert input_dense.size(1) == len(input_items)
def test_RecommendationDataset(input_dataframe):
  interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
      input_dataframe, user_col='user', item_col='item', inter_col='inter')

  dataset = RecommendationDataset(interactions_matrix)

  assert len(dataset) == len(np.unique(input_dataframe['user']))

  replica_df = pd.DataFrame(input_dataframe)
  for index in range(len(dataset)):
    user_interactions, _ = dataset[index]
    user = user_interactions.users[0]

    assert user_interactions.interactions_matrix.getnnz() == len(
        replica_df[replica_df.user.map(user_id_map) == user])

    for item_id, inter_val in zip(user_interactions.interactions_matrix.nonzero()[1],
                                  user_interactions.interactions_matrix.data):
      assert len(replica_df[(replica_df.user.map(user_id_map) == user)
                            & (replica_df.item.map(item_id_map) == item_id)
                            & (replica_df.inter == inter_val)]) > 0
      # remove the matched interaction so it is not counted twice
      replica_df = replica_df[~((replica_df.user.map(user_id_map) == user)
                                & (replica_df.item.map(item_id_map) == item_id)
                                & (replica_df.inter == inter_val))]

    assert user_interactions.interactions_matrix.getnnz() > 0

  # check that the dataset and the dataframe contain the same set of interactions
  assert len(replica_df) == 0
def test_RecommendationDataset_target(input_dataframe, target_dataframe):
  common_users = input_dataframe.merge(target_dataframe, how='inner', on='user').user.unique()
  common_items = input_dataframe.merge(target_dataframe, how='inner', on='item').item.unique()

  input_dataframe = input_dataframe[input_dataframe.user.isin(common_users)
                                    & input_dataframe.item.isin(common_items)]
  target_dataframe = target_dataframe[target_dataframe.user.isin(common_users)
                                      & target_dataframe.item.isin(common_items)]

  interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
      input_dataframe, user_col='user', item_col='item', inter_col='inter')
  target_interactions_matrix, _, _ = dataframe_to_csr_matrix(
      target_dataframe, user_col='user', item_col='item', inter_col='inter',
      item_id_map=item_id_map, user_id_map=user_id_map)

  dataset = RecommendationDataset(interactions_matrix, target_interactions_matrix)

  test_index = np.random.randint(0, len(dataset))
  input_interactions, target_interactions = dataset[test_index]

  assert input_interactions.users == target_interactions.users
  assert input_interactions.interactions_matrix.getnnz() > 0 \
      and target_interactions.interactions_matrix.getnnz() > 0
def test_BatchCollator(input_dataframe, batch_size):
  interactions_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
      input_dataframe, user_col='user', item_col='item', inter_col='inter')

  dataset = RecommendationDataset(interactions_matrix)
  batch_collator = BatchCollator(batch_size=batch_size, negative_sampling=True)

  big_batch, _ = dataset[np.arange(len(dataset))]
  batches = batch_collator.collate(big_batch)

  assert len(batches) == np.ceil(len(dataset) / batch_size)

  current_batch = 0
  for batch in batches:
    input_idx, input_val, input_size, input_items = \
        batch.indices, batch.values, batch.size, batch.items
    input_dense = torch.sparse.FloatTensor(input_idx, input_val, input_size).to_dense()

    batch_users = big_batch.users[current_batch:current_batch + batch_size]
    batch_sparse_matrix = big_batch.interactions_matrix[current_batch:current_batch + batch_size]

    num_values_per_user = [batch_sparse_matrix[i].getnnz()
                           for i in range(len(batch_users))]

    assert (input_dense > 0).float().sum(dim=1).tolist() == num_values_per_user

    item_idx_map = {item_id: item_idx
                    for item_idx, item_id in enumerate(input_items.tolist())}

    for user_idx in range(len(batch_users)):
      for item_id, val in zip(batch_sparse_matrix[user_idx].nonzero()[1],
                              batch_sparse_matrix[user_idx].data):
        assert item_id in input_items
        assert input_dense[user_idx, item_idx_map[item_id]] == val

    current_batch += batch_size
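# The four tests above take their inputs from pytest fixtures that are not
# shown here. A minimal sketch of what a conftest.py could provide, assuming
# small synthetic interaction dataframes (the fixture names match the test
# parameters; the sizes and values below are made up for illustration):
import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def input_dataframe():
  rng = np.random.RandomState(42)
  return pd.DataFrame({
      'user': rng.randint(0, 20, size=100),
      'item': rng.randint(0, 50, size=100),
      'inter': np.ones(100),
  }).drop_duplicates(subset=['user', 'item'])


@pytest.fixture
def target_dataframe():
  rng = np.random.RandomState(7)
  return pd.DataFrame({
      'user': rng.randint(0, 20, size=100),
      'item': rng.randint(0, 50, size=100),
      'inter': np.ones(100),
  }).drop_duplicates(subset=['user', 'item'])


@pytest.fixture(params=[5, 10])
def batch_size(request):
  return request.param


@pytest.fixture(params=[0, 5])
def num_sampling_users(request):
  return request.param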
def test_model(sparse, exp_recall_20, exp_recall_50, exp_ndcg_100):
  data_dir = 'tests/data/'
  model_dir = '/tmp/'

  train_df = pd.read_csv(data_dir + 'train.csv')
  val_df = pd.read_csv(data_dir + 'val.csv')

  # keep only the items that exist in the training dataset
  val_df = val_df[val_df.sid.isin(train_df.sid.unique())]

  train_matrix, item_id_map, user_id_map = dataframe_to_csr_matrix(
      train_df, user_col='uid', item_col='sid', inter_col='watched')
  val_matrix, _, _ = dataframe_to_csr_matrix(
      val_df, user_col='uid', item_col='sid', inter_col='watched',
      item_id_map=item_id_map, user_id_map=user_id_map)

  train_dataset = RecommendationDataset(train_matrix)
  val_dataset = RecommendationDataset(val_matrix, train_matrix)

  use_cuda = False

  model = DynamicAutoencoder(hidden_layers=[200], activation_type='tanh',
                             noise_prob=0.5, sparse=sparse)
  trainer = Recoder(model=model, use_cuda=use_cuda,
                    optimizer_type='adam', loss='logloss')

  trainer.train(train_dataset=train_dataset, val_dataset=val_dataset,
                batch_size=500, lr=1e-3, weight_decay=2e-5,
                num_epochs=30, negative_sampling=True)

  # assert model metrics
  recall_20 = Recall(k=20, normalize=True)
  recall_50 = Recall(k=50, normalize=True)
  ndcg_100 = NDCG(k=100)

  results = trainer._evaluate(eval_dataset=val_dataset, num_recommendations=100,
                              metrics=[recall_20, recall_50, ndcg_100],
                              batch_size=500)
  for metric, values in list(results.items()):
    results[metric] = np.mean(values)

  assert np.isclose(results[recall_20], exp_recall_20, atol=0.01, rtol=0)
  assert np.isclose(results[recall_50], exp_recall_50, atol=0.01, rtol=0)
  assert np.isclose(results[ndcg_100], exp_ndcg_100, atol=0.01, rtol=0)

  # save the model, reload it into a fresh trainer, and evaluate again
  model_checkpoint = model_dir + 'test_model.model'
  state_file = trainer.save_state(model_checkpoint)

  model = DynamicAutoencoder(sparse=sparse)
  trainer = Recoder(model=model, use_cuda=use_cuda,
                    optimizer_type='adam', loss='logloss')
  trainer.init_from_model_file(state_file)

  results = trainer._evaluate(eval_dataset=val_dataset, num_recommendations=100,
                              metrics=[recall_20, recall_50, ndcg_100],
                              batch_size=500)
  for metric, values in list(results.items()):
    results[metric] = np.mean(values)

  assert np.isclose(results[recall_20], exp_recall_20, atol=0.01, rtol=0)
  assert np.isclose(results[recall_50], exp_recall_50, atol=0.01, rtol=0)
  assert np.isclose(results[ndcg_100], exp_ndcg_100, atol=0.01, rtol=0)

  os.remove(state_file)
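# test_model receives `sparse` and the expected metric values as parameters.
# One way to supply them is pytest.mark.parametrize; a sketch (the expected
# values below are placeholders, not measured results):
#
# @pytest.mark.parametrize('sparse,exp_recall_20,exp_recall_50,exp_ndcg_100', [
#     (False, 0.40, 0.50, 0.40),  # placeholders: substitute measured baselines
#     (True, 0.40, 0.50, 0.40),
# ])
# def test_model(sparse, exp_recall_20, exp_recall_50, exp_ndcg_100):
#     ...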
# (excerpt: `method`, `model_file`, `index_file`, `data_dir`, `common_params`,
#  `num_recommendations`, and `recoder` are defined earlier in the elided part
#  of this script; the `if` header below is reconstructed from the `elif`)
if method == 'inference':
  recoder.init_from_model_file(model_file)
  recommender = InferenceRecommender(recoder, num_recommendations)
elif method == 'similarity':
  embeddings_index = AnnoyEmbeddingsIndex()
  embeddings_index.load(index_file=index_file)

  cache_embeddings_index = MemCacheEmbeddingsIndex(embeddings_index)
  recommender = SimilarityRecommender(cache_embeddings_index,
                                      num_recommendations, scale=1, n=50)

train_df = pd.read_csv(data_dir + 'train.csv')
val_te_df = pd.read_csv(data_dir + 'test_te.csv')
val_tr_df = pd.read_csv(data_dir + 'test_tr.csv')

train_matrix, item_id_map, _ = dataframe_to_csr_matrix(train_df, **common_params)
val_tr_matrix, _, user_id_map = dataframe_to_csr_matrix(
    val_tr_df, item_id_map=item_id_map, **common_params)
val_te_matrix, _, _ = dataframe_to_csr_matrix(
    val_te_df, item_id_map=item_id_map, user_id_map=user_id_map,
    **common_params)

val_tr_dataset = RecommendationDataset(val_tr_matrix, val_te_matrix)

metrics = [Recall(k=20), Recall(k=50), NDCG(k=100)]
evaluator = RecommenderEvaluator(recommender, metrics)

metrics_accumulated = evaluator.evaluate(val_tr_dataset, batch_size=500)
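# Assuming `evaluate` returns a mapping from each metric to a list of per-user
# values (the `_evaluate` usage elsewhere in these scripts suggests this), the
# accumulated results can be reduced to a single mean per metric:
for metric in metrics:
  print('{}: {:.4f}'.format(metric, np.mean(metrics_accumulated[metric])))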
def train_rs(proc_dir: str, model_dir: str, model_name: str, lr: float,
             lr_milestones: List[int], wd: float, epochs: int, emb_size: int,
             batch_size: int, valid_users_pct: float, valid_items_pct: float,
             wo_eval: bool):
  print('Reading data...')
  ds = pd.read_csv(path.join(proc_dir, 'ds.csv'))
  ds['inter'] = 1
  item_identity = {i: i for i in ds['item']}

  if wo_eval:
    train = ds
  else:
    print('Train test split...')
    train, valid = train_test_split(ds, valid_users_pct)
    valid_t, valid_e = train_eval_split(valid, valid_items_pct)
    del valid
  del ds

  print('Making sparse matrices...')
  common_params = {
      'user_col': 'user',
      'item_col': 'item',
      'inter_col': 'inter',
  }
  train_matrix, _, _ = dataframe_to_csr_matrix(train, item_id_map=item_identity,
                                               **common_params)
  train_dataset = RecommendationDataset(train_matrix)
  del train

  if wo_eval:
    valid_dataset = None
  else:
    # noinspection PyUnboundLocalVariable
    val_t_matrix, _, user_id_map = dataframe_to_csr_matrix(
        valid_t, item_id_map=item_identity, **common_params)
    # noinspection PyUnboundLocalVariable
    val_e_matrix, _, _ = dataframe_to_csr_matrix(
        valid_e, item_id_map=item_identity, user_id_map=user_id_map,
        **common_params)
    valid_dataset = RecommendationDataset(val_t_matrix, val_e_matrix)
    del valid_t, valid_e

  use_cuda = True

  print('Training model...')
  model = DynamicAutoencoder(hidden_layers=[emb_size], activation_type='tanh',
                             noise_prob=0.5, sparse=False)
  trainer = Recoder(model=model, use_cuda=use_cuda, optimizer_type='adam',
                    loss='logistic', user_based=False)

  metrics = [
      Recall(k=20, normalize=True),
      Recall(k=50, normalize=True),
      NDCG(k=100),
  ]
  model_prefix = path.join(model_dir, model_name)
  eval_num_recs = 100

  trainer.train(train_dataset=train_dataset, val_dataset=valid_dataset,
                batch_size=batch_size, lr=lr, weight_decay=wd,
                num_epochs=epochs, negative_sampling=True,
                lr_milestones=lr_milestones, num_data_workers=mp.cpu_count(),
                model_checkpoint_prefix=model_prefix, checkpoint_freq=0,
                eval_num_recommendations=eval_num_recs, metrics=metrics,
                eval_freq=5)

  # the trainer names checkpoints by epoch; rename the final one
  actual_path = '{}_epoch_{}.model'.format(model_prefix, epochs)
  shutil.move(actual_path, model_prefix + '.model')

  # skip the final evaluation when training without a validation split,
  # otherwise _evaluate would be called with valid_dataset=None
  if not wo_eval:
    results = trainer._evaluate(valid_dataset, eval_num_recs, metrics, batch_size)
    with open(model_prefix + '_metrics.json', 'w') as f:
      json.dump({str(metric): np.mean(results[metric]) for metric in metrics}, f)
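# A possible command-line entry point for train_rs; the flag names and the
# defaults below are illustrative, not an existing CLI:
if __name__ == '__main__':
  import argparse

  parser = argparse.ArgumentParser(description='Train the recommender.')
  parser.add_argument('--proc-dir', required=True)
  parser.add_argument('--model-dir', required=True)
  parser.add_argument('--model-name', default='recoder')
  parser.add_argument('--lr', type=float, default=1e-3)
  parser.add_argument('--lr-milestones', type=int, nargs='*', default=[60, 80])
  parser.add_argument('--wd', type=float, default=2e-5)
  parser.add_argument('--epochs', type=int, default=100)
  parser.add_argument('--emb-size', type=int, default=200)
  parser.add_argument('--batch-size', type=int, default=500)
  parser.add_argument('--valid-users-pct', type=float, default=0.1)
  parser.add_argument('--valid-items-pct', type=float, default=0.2)
  parser.add_argument('--wo-eval', action='store_true')
  args = parser.parse_args()

  train_rs(args.proc_dir, args.model_dir, args.model_name, args.lr,
           args.lr_milestones, args.wd, args.epochs, args.emb_size,
           args.batch_size, args.valid_users_pct, args.valid_items_pct,
           args.wo_eval)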