def generate_dataset_table():

    headers = ['Dataset', 'Users', 'Items', 'Density', '95th/50th']
    rows = []

    for name, dataset in (('Movielens 10M', get_movielens_dataset('10M')),
                          ('Amazon', get_amazon_dataset()),
                          ('Goodbooks', get_goodbooks_dataset())):
        # Number of interactions per item: nonzeros in each column
        # of the user-item matrix.
        item_counts = dataset.tocoo().getnnz(axis=0)

        print('Dataset {}, ratio: {:0,}'.format(
            name,
            np.percentile(item_counts, 95) / np.percentile(item_counts, 50)))

        row = [
            name,
            '{:0,}'.format(dataset.num_users),
            '{:0,}'.format(dataset.num_items),
            len(dataset) / dataset.num_users / dataset.num_items,
            '{0:.2f}'.format(np.percentile(item_counts, 95) /
                             np.percentile(item_counts, 50))
        ]
        rows.append(row)

    return _full_width_table(
        tabulate(rows,
                 headers=headers,
                 floatfmt='.4f',
                 tablefmt='latex_booktabs'))
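# The _full_width_table helper is referenced above but not defined in this
# file. A minimal sketch, assuming it simply wraps the booktabs output in a
# \resizebox so the table spans the full column width (an assumption, not
# the experiment's actual implementation):
def _full_width_table(table):
    return ('\\resizebox{\\columnwidth}{!}{%\n' +
            table +
            '\n}')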
def main(args):

    status = 'available' if CUDA else 'not available'
    print('CUDA is {}!'.format(status))

    args = parse_args(args)

    # Fix the random state so splits are reproducible between runs.
    seed = 72
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if args.dataset == 'amazon':
        # The Amazon dataset has much shorter sequences.
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        dataset = get_goodbooks_dataset()
    else:
        dataset = get_movielens_dataset(args.dataset.upper())
        args.variant = args.dataset

    train, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, valid = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)

    train = train.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    valid = valid.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('model: {}, data: {}'.format(args.model, train))

    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)

    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space(args.model)

    # Run one hyperparameter evaluation per iteration, persisting the
    # trials to fname so the search can be resumed.
    for iteration in range(args.num_trials):
        print('Iteration {}'.format(iteration))

        trials = optimize(objective,
                          space,
                          trials_fname=fname,
                          max_evals=iteration + 1)

        summarize_trials(trials)
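# parse_args is called by main() but not shown here. A sketch of a
# compatible parser; the flag names and defaults below are assumptions
# inferred from how args is used in main(), not the actual definition.
import argparse

def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='1m')
    parser.add_argument('--model', type=str, default='lstm')
    parser.add_argument('--num_trials', type=int, default=10)
    return parser.parse_args(args)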
def load_data(dataset, random_state):

    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()
    else:
        dataset = get_movielens_dataset(dataset)

    train, rest = random_train_test_split(dataset,
                                          test_percentage=0.05,
                                          random_state=random_state)
    test, validation = random_train_test_split(rest,
                                               test_percentage=0.5,
                                               random_state=random_state)

    return train, validation, test
def load_data(dataset, random_state):

    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()
        # This is a dataset with shorter sequences
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
    else:
        dataset = get_movielens_dataset(dataset)

    train_nonsequence, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, validation = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)

    train = train_nonsequence.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    return train_nonsequence, train, validation, test
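# Example usage (assumed): build the sequence splits for MovieLens 1M.
# The variant string and seed are illustrative, not from the experiments.
random_state = np.random.RandomState(42)
train_nonsequence, train, validation, test = load_data('1M', random_state)
print(train_nonsequence)  # Interactions object for the training users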
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', type=str)
    parser.add_argument('model', type=str)
    args = parser.parse_args()

    random_state = np.random.RandomState(100)

    if args.dataset == 'movielens':
        dataset = get_movielens_dataset('1M')
        test_percentage = 0.2
    else:
        test_percentage = 0.01
        dataset = get_amazon_dataset(min_user_interactions=20,
                                     min_item_interactions=5)

    print(dataset)

    if args.model == 'sequence':
        # Cap sequence length at the 95th percentile of per-user
        # interaction counts.
        max_sequence_length = int(
            np.percentile(dataset.tocsr().getnnz(axis=1), 95))
        min_sequence_length = 20
        step_size = max_sequence_length

        train, rest = user_based_train_test_split(
            dataset,
            test_percentage=0.05,
            random_state=random_state)
        test, validation = user_based_train_test_split(
            rest,
            test_percentage=0.5,
            random_state=random_state)
        train = train.to_sequence(max_sequence_length=max_sequence_length,
                                  min_sequence_length=min_sequence_length,
                                  step_size=step_size)
parser.add_argument('--variant', type=str, default='100K')
parser.add_argument('--n_epoch', type=int, default=20)
parser.add_argument('--loss', type=str, default='bpr')
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--sparsity', type=float, default=0.05)
parser.add_argument('--data', type=str, default='synthetic')

args = parser.parse_args()

if args.data == 'synthetic':
    split, dataset = make_synthetic(args.sparsity)
elif args.data.lower() == 'movielens':
    print('MovieLens')
    dataset = get_movielens_dataset(variant=args.variant)
    split = 0.2
elif args.data.lower() == 'amazon':
    print('Amazon')
    dataset = get_amazon_dataset()
    split = 0.2
else:
    print('GoodBooks')
    dataset = get_goodbooks_dataset()
    split = 0.2

rmses = []
mrrs = []

rs = np.random.RandomState(100)

for i in range(5):
    print('Split - {}, Run {}'.format(split, i))

    train, test = random_train_test_split(dataset,
                                          random_state=rs,
                                          test_percentage=split)

    if args.model == 'implicit':
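        # The script breaks off at this branch. A hedged continuation
        # sketch: fit Spotlight's ImplicitFactorizationModel with the flags
        # defined above and record the test MRR. This is an assumption
        # about the missing code, not the experiment's actual version.
        from spotlight.factorization.implicit import ImplicitFactorizationModel
        from spotlight.evaluation import mrr_score

        model = ImplicitFactorizationModel(loss=args.loss,
                                           n_iter=args.n_epoch,
                                           learning_rate=args.lr)
        model.fit(train)
        mrrs.append(mrr_score(model, test, train=train).mean())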