# now the dataset of target users logger.info('Reading {}'.format(args.target_users)) targets = read_dataset(args.target_users, sep=',') # we need to merge with the new indices targets['user_idx'] = user_idx[targets['user_id'].values].values # finally interactions logger.info('Reading {}'.format(args.interactions)) interactions, n, n1 = read_interactions(args.interactions, sep=args.sep, user_to_idx=user_idx, item_to_idx=item_idx) interactions = interactions[interactions['item_idx'] >= 0.0] urm = df_to_csr(interactions, user_idx.shape[0], item_idx.shape[0], is_implicit=True) recommender = RecommenderClass() model_path = 'output/models/' + args.model_file # from where to load the already computed model (similarity matrix) recommender.load_weights(model_path) recs = recommender.make_prediction(targets['user_idx'].values, urm, recomendable_items, num=5) # open the prediction file and write the header if args.prediction_file: pfile = open(args.prediction_file, 'w') header = 'user_id,recommended_items' + '\n' pfile.write(header)
# --- Cross-validation evaluation fragment (ends inside the fold loop: the
# metric computation continues past this fragment) ---

# one accumulator array per ranking metric, one slot per CV fold
roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = np.zeros(
    args.cv_folds), np.zeros(args.cv_folds), np.zeros(args.cv_folds), np.zeros(
    args.cv_folds), np.zeros(args.cv_folds), np.zeros(args.cv_folds)
at = args.rec_length  # cutoff of the recommendation list
nfold = 0
# NOTE(review): nfold is presumably incremented later in the loop body
# (not visible in this fragment)
for train_df, test_df in k_fold_cv(dataset,
                                   user_key=args.user_key,
                                   item_key=args.item_key,
                                   k=args.cv_folds,
                                   clean_test=True,
                                   seed=args.rnd_seed):
    logger.info(train_df.shape)
    logger.info(test_df.shape)
    logger.info('Fold {}'.format(nfold + 1))
    # build sparse train/test matrices with a fixed (nusers x nitems) shape
    train = df_to_csr(train_df,
                      is_implicit=args.is_implicit,
                      nrows=nusers,
                      ncols=nitems)
    test = df_to_csr(test_df,
                     is_implicit=args.is_implicit,
                     nrows=nusers,
                     ncols=nitems)
    # train the recommender
    recommender = RecommenderClass(**init_args)
    logger.info('Recommender: {}'.format(recommender))
    tic = dt.now()
    logger.info('Training started')
    recommender.fit(train)
    logger.info('Training completed in {}'.format(dt.now() - tic))
    # evaluate the ranking quality
# --- Fragment: starts MID-CALL — these are the trailing keyword arguments of a
# read_dataset(...) call whose opening line is above this fragment ---
    columns=args.columns,
    user_key=args.user_key,
    item_key=args.item_key,
    rating_key=args.rating_key,
    item_to_idx=item_to_idx,
    user_to_idx=user_to_idx)
# build reverse maps (internal contiguous index -> original id)
idx_to_item = pd.Series(index=item_to_idx.data, data=item_to_idx.index)
idx_to_user = pd.Series(index=user_to_idx.data, data=user_to_idx.index)
# NOTE(review): shape is inferred from the TRAIN split only — assumes test
# indices are a subset of train indices; confirm
nusers, nitems = train_df.user_idx.max() + 1, train_df.item_idx.max() + 1
train = df_to_csr(train_df,
                  is_binary=args.is_binary,
                  nrows=nusers,
                  ncols=nitems,
                  user_key='user_idx',
                  item_key='item_idx',
                  rating_key=args.rating_key)
test = df_to_csr(test_df,
                 is_binary=args.is_binary,
                 nrows=nusers,
                 ncols=nitems,
                 user_key='user_idx',
                 item_key='item_idx',
                 rating_key=args.rating_key)
# train the recommender
recommender = RecommenderClass(**init_args)
logger.info('Recommender: {}'.format(recommender))
tic = dt.now()
# --- Fragment: starts MID-`try` — the matching `try:` for this `except` is
# above this fragment (parsing of key=value recommender parameters) ---
        init_args[key] = eval(value)
    # SECURITY(review): eval() on a command-line supplied value executes
    # arbitrary code, and the bare except hides real errors — consider
    # ast.literal_eval with a (ValueError, SyntaxError) handler
    except:
        init_args[key] = value
# convert the column argument to list
if args.columns is not None:
    args.columns = args.columns.split(',')
# read the dataset
logger.info('Reading {}'.format(args.train))
train_df = read_dataset(args.train, sep=',', header=0)
logger.info('Reading {}'.format(args.test))
test_df = read_dataset(args.test, sep=',', header=0)
# NOTE(review): shape uses train_df only — assumes test indices never exceed
# the train maxima; confirm
nusers, nitems = train_df.user_idx.max()+1, train_df.item_idx.max()+1
train = df_to_csr(train_df, nrows=nusers, ncols=nitems)
test = df_to_csr(test_df, nrows=nusers, ncols=nitems)
# train the recommender
recommender = RecommenderClass(**init_args)
logger.info('Recommender: {}'.format(recommender))
tic = dt.now()
logger.info('Training started')
recommender.fit(train)
logger.info('Training completed in {}'.format(dt.now() - tic))
# open the prediction file
# NOTE(review): pfile is presumably written to and closed later (past this
# fragment); confirm
if args.prediction_file:
    pfile = open(args.prediction_file, 'w')
    n = args.rec_length if args.rec_length is not None else nitems
    header = 'user_id,'
def grid_search_cv(RecommenderClass, dataset, param_space, metric=roc_auc,
                   at=None, cv_folds=5, is_binary=True, user_key='user_id',
                   item_key='item_id', rating_key='rating', rnd_seed=1234):
    """
    Finds the best hyper-parameters of a recommender algorithm with Grid Search

    :param RecommenderClass: Class of the recommender to tune (must be subclass of Recommender)
    :param dataset: data to use for tuning
    :param param_space: space of the parameters to explore (dict: name -> list of values)
    :param metric: ranking metric to maximize
    :param at: optional length of the recommendation list used in recommendation
    :param cv_folds: number of cross-validation iters
    :param is_binary: True to discard ratings, False otherwise
    :param user_key: name of the column with user ids in dataset
    :param item_key: name of the column with item ids in dataset
    :param rating_key: name of the column with ratings in dataset
    :param rnd_seed: random seed used for cross-validation
    :return: a tuple with (best configuration, best metric value)
    """
    tried_conf = []
    results = np.zeros(np.prod([len(v) for v in param_space.values()]),
                       dtype=np.float32)
    space_size = len(results)
    logger.info('Size of the parameter space: {} ({} cv trials)'.format(
        space_size, space_size * cv_folds))
    param_grid = ParameterGrid(param_space)
    # compute the cv splits ONCE and reuse them for every configuration
    nusers, nitems = dataset[user_key].max() + 1, dataset[item_key].max() + 1
    cv_split = []
    for train_df, test_df in k_fold_cv(dataset, user_key=user_key,
                                       item_key=item_key, k=cv_folds,
                                       clean_test=True, seed=rnd_seed):
        train = df_to_csr(train_df, is_binary=is_binary, nrows=nusers,
                          ncols=nitems, user_key=user_key, item_key=item_key,
                          rating_key=rating_key)
        test = df_to_csr(test_df, is_binary=is_binary, nrows=nusers,
                         ncols=nitems, user_key=user_key, item_key=item_key,
                         rating_key=rating_key)
        cv_split.append((train, test))

    for i, params in enumerate(param_grid):
        logger.info('Iteration {}/{}: {}'.format(i + 1, space_size, params))
        tried_conf.append(params)
        cv_result = 0.0
        for f, (train, test) in enumerate(cv_split):
            # train the recommender
            recommender = RecommenderClass(**params)
            recommender.fit(train)
            # evaluate the ranking quality over users that have at least one
            # relevant item in the test fold
            n_eval = 0
            metric_ = 0.0
            for test_user in range(nusers):
                relevant_items = test[test_user].indices
                if len(relevant_items) > 0:
                    n_eval += 1
                    # this will rank **all** items
                    recommended_items = recommender.recommend(
                        user_id=test_user, exclude_seen=True)
                    # evaluate the recommendation list with ranking metrics ONLY
                    if metric == roc_auc:
                        metric_ += roc_auc(recommended_items, relevant_items)
                    elif metric == ndcg:
                        metric_ += ndcg(recommended_items, relevant_items,
                                        relevance=test[test_user].data, at=at)
                    else:
                        metric_ += metric(recommended_items, relevant_items,
                                          at=at)
            # FIX: guard against a fold with no evaluable users
            # (previously raised ZeroDivisionError on metric_ /= n_eval)
            if n_eval > 0:
                metric_ /= n_eval
            cv_result += metric_
        # average value of the metric in cross-validation
        results[i] = cv_result / cv_folds
        logger.info('Result: {:.4f}'.format(results[i]))
    # return the best configuration; argmax is O(n) and clearer than the
    # previous full argsort()[-1]
    best = int(results.argmax())
    return tried_conf[best], results[best]
# read the dataset logger.info('Reading {}'.format(args.dataset)) dataset, idx_to_user, idx_to_item = read_dataset( args.dataset, header=args.header, sep=args.sep, columns=args.columns, make_implicit=args.make_implicit, implicit_th=args.implicit_th, item_key=args.item_key, user_key=args.user_key, rating_key=args.rating_key, item_to_idx=item_idx, user_to_idx=user_idx) nusers, nitems = len(idx_to_user), len(idx_to_item) logger.info('The dataset has {} users and {} items'.format(nusers, nitems)) # let's construct the training set train = df_to_csr(dataset, is_implicit=args.is_implicit, nrows=nusers, ncols=nitems) logger.info('The train set is a sparse matrix of shape: {}'.format(train.shape)) # train the recommender recommender = RecommenderClass(**init_args) logger.info('Recommender: {}'.format(recommender)) logger.info('Parameters: {}'.format(init_args if args.params else 'default')) tic = dt.now() logger.info('Training started') recommender.fit(train) logger.info('Training completed in {}'.format(dt.now() - tic))
# --- Data-cleaning fragment: keep only items with more than one interaction ---
num_users, num_items = len(users), len(items)
print("There are %d users and %d items" % (num_users, num_items))
# indexing of users and items: original id -> contiguous integer index
user_idx = pd.Series(index=users, data=np.arange(num_users))
item_idx = pd.Series(index=items, data=np.arange(num_items))
# building the final dataframe adding "user's index" and "item's index"
useful_interactions["user_idx"] = user_idx[
    useful_interactions["user_id"].values].values
useful_interactions["item_idx"] = item_idx[
    useful_interactions["item_id"].values].values
# items-as-rows CSR (note the swapped user/item keys): each row holds the
# interactions of one item
data_csr = df_to_csr(useful_interactions, num_items, num_users,
                     user_key='item_idx', item_key='user_idx')
# FIX(perf): per-row nnz of a CSR matrix is the difference of consecutive
# indptr entries — one vectorized pass instead of num_items row slices
# (data_csr[i].nnz == indptr[i+1] - indptr[i])
useful_items = np.flatnonzero(np.diff(data_csr.indptr) > 1)
useful_items = pd.DataFrame(useful_items, columns=['item_idx'])
# right merge keeps only the interactions whose item survived the filter
useful_interactions = useful_interactions.merge(useful_items, on='item_idx',
                                                how='right')
useful_interactions = useful_interactions.drop(['user_idx', 'item_idx'],
                                               axis=1)
# --- Fragment: starts MID-FUNCTION — this `return` belongs to an evaluation
# helper whose definition is above this fragment ---
    return roc_auc_, precision_, recall_, map_, mrr_, ndcg_


# hold-out evaluation script on the MovieLens 100k binary split
metric = roc_auc
cv_folds = 5
at = 10  # cutoff of the recommendation list
is_binary = True
train_df = read_dataset('../../data/ml100k/binary_holdout/train.csv',
                        sep=',', header=0)
test_df = read_dataset('../../data/ml100k/binary_holdout/test.csv',
                       sep=',', header=0)
# NOTE(review): shape inferred from train only — assumes test indices are a
# subset; confirm
nusers, nitems = train_df.user_idx.max() + 1, train_df.item_idx.max() + 1
train = df_to_csr(train_df, is_binary=is_binary, nrows=nusers, ncols=nitems)
test = df_to_csr(test_df, is_binary=is_binary, nrows=nusers, ncols=nitems)
# commented-out experiment variants kept verbatim below
#
# TopPop
#
# RecommenderClass = TopPop
# param_space = {}
# # Evaluate all the metrics over the hold out split
# recommender = RecommenderClass()
# metrics = holdout_eval(recommender, train, test, at=at)
# logger.info('Metrics: {}'.format(metrics))
#
#
# GlobalEffects
#
test_observed_df, test_hidden_df = per_user_holdout(test_users_df, user_key='user_idx', item_key='item_idx', n_observed=args.n_observed, seed=args.rnd_seed) tot_observed, tot_hidden = test_observed_df.shape[0], test_hidden_df.shape[0] logger.info('Observed ratings: {}({:.2f}%)'.format( tot_observed, tot_observed / (tot_observed + tot_hidden) * 100)) logger.info('Observed ratings: {}({:.2f}%)'.format( tot_hidden, tot_hidden / (tot_observed + tot_hidden) * 100)) # build the sparse matrices train = df_to_csr(train_users_df, is_binary=args.is_binary, nrows=nusers_train, ncols=nitems, item_key='item_idx', user_key='user_idx', rating_key=args.rating_key) test_observed = df_to_csr(test_observed_df, is_binary=args.is_binary, nrows=nusers_test, ncols=nitems, item_key='item_idx', user_key='user_idx', rating_key=args.rating_key) test_hidden = df_to_csr(test_hidden_df, is_binary=args.is_binary, nrows=nusers_test, ncols=nitems,
profiles = pd.read_csv("data/user_profile.csv", sep='\t') # now the dataset of target users logger.info('Reading {}'.format(args.target_users)) targets = read_dataset(args.target_users, sep=',') # print(set(targets_all['user_id'].values) <= set(idx_user.data)) #targets = targets_all.merge(profiles, how='inner', on='user_id') targets['user_idx'] = user_idx[targets['user_id'].values].values # finally interactions logger.info('Reading {}'.format(args.interactions)) interactions, n1, n2 = read_interactions(args.interactions, sep=args.sep, user_to_idx=user_idx, item_to_idx=item_idx) interactions = interactions[interactions['item_idx'] >= 0.0] urm = df_to_csr(interactions, user_idx.shape[0], len(interactions['item_idx'].unique()), is_implicit=True) recommender = CBFUsersRecommender() recommender.load_user_weights('output/models/sparse_cbf.npz') recs = recommender.make_prediction(targets['user_idx'].values, urm) print(recs.shape) # open the prediction file and write the header if args.prediction_file: pfile = open(args.prediction_file, 'w') header = 'user_id,recommended_items' + '\n' pfile.write(header) new_user_idx = targets['user_idx'].values for target in range(recs.shape[0]): user_id = idx_user[new_user_idx[target]]