Esempio n. 1
0
def generate_dataset_table():
    """Render a LaTeX summary table for the benchmark datasets.

    For Movielens 10M, Amazon and Goodbooks (in that order) the table
    reports the number of users, the number of items, the interaction
    density and the ratio between the 95th and 50th percentile of the
    per-column interaction counts.  The same ratio is also printed to
    stdout for every dataset as a side effect.

    Returns the result of passing the ``tabulate`` output (rendered in
    ``latex_booktabs`` format) through ``_full_width_table``.
    """
    column_titles = ['Dataset', 'Users', 'Items', 'Density', '95th/50th']

    # All three datasets are loaded up front, before any row is built.
    named_datasets = (
        ('Movielens 10M', get_movielens_dataset('10M')),
        ('Amazon', get_amazon_dataset()),
        ('Goodbooks', get_goodbooks_dataset()),
    )

    table_rows = []
    for label, interactions in named_datasets:
        # Number of stored entries in each column of the interaction matrix
        # (presumably one column per item, as the '95th/50th' item-popularity
        # column suggests).
        counts_per_column = interactions.tocoo().getnnz(axis=0)

        upper = np.percentile(counts_per_column, 95)
        median = np.percentile(counts_per_column, 50)
        tail_ratio = upper / median

        print('Dataset {}, ratio: {:0,}'.format(label, tail_ratio))

        # Density is left as a raw float so that tabulate's ``floatfmt``
        # controls its precision; every other cell is pre-formatted text.
        density = len(interactions) / interactions.num_users / interactions.num_items

        table_rows.append([
            label,
            '{:0,}'.format(interactions.num_users),
            '{:0,}'.format(interactions.num_items),
            density,
            '{0:.2f}'.format(tail_ratio),
        ])

    latex = tabulate(table_rows,
                     headers=column_titles,
                     floatfmt='.4f',
                     tablefmt='latex_booktabs')
    return _full_width_table(latex)
Esempio n. 2
0
def main(args):
    """Run a hyperparameter search for a sequence model on one dataset.

    ``args`` is forwarded to ``parse_args``; the parsed namespace must
    expose ``dataset``, ``model`` and ``num_trials``.  The dataset is split
    by user into train / validation / test parts, each part is converted
    to sequences, and ``optimize`` is then called once per trial with an
    increasing ``max_evals`` while a summary of the trials is printed
    after every call.
    """
    print("CUDA is {}!".format('available' if CUDA else 'not available'))
    args = parse_args(args)

    # Fix random_state
    seed = 72
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    # Amazon uses shorter sequence settings than the other datasets.
    uses_amazon = args.dataset == 'amazon'
    max_len, min_len = (50, 5) if uses_amazon else (100, 20)
    sequence_kwargs = {
        'max_sequence_length': max_len,
        'min_sequence_length': min_len,
        'step_size': max_len,
    }

    if uses_amazon:
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        dataset = get_goodbooks_dataset()
    else:
        # Anything else is treated as a Movielens variant name.
        dataset = get_movielens_dataset(args.dataset.upper())

    args.variant = args.dataset

    # 80% train; the remaining 20% is halved into test and validation.
    train, holdout = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, valid = user_based_train_test_split(
        holdout, test_percentage=0.5, random_state=random_state)

    # Converted in the order train, test, valid.
    train, test, valid = [
        part.to_sequence(**sequence_kwargs) for part in (train, test, valid)
    ]

    print('model: {}, data: {}'.format(args.model, train))

    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)
    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space(args.model)

    for trial_index in range(args.num_trials):
        print('Iteration {}'.format(trial_index))
        # max_evals grows by one per pass, with the same trials file name
        # handed to optimize every time.
        trials = optimize(objective,
                          space,
                          trials_fname=fname,
                          max_evals=trial_index + 1)

        summarize_trials(trials)
Esempio n. 3
0
def load_data(dataset, random_state):
    """Load a dataset by name and split it into train, validation and test.

    ``dataset`` is matched by substring: a value containing ``'goodbooks'``
    or ``'amazon'`` selects that dataset, and any other value is passed
    unchanged to ``get_movielens_dataset``.  ``random_state`` is forwarded
    to both ``random_train_test_split`` calls.

    Returns a ``(train, validation, test)`` tuple.
    """
    if 'goodbooks' in dataset:
        interactions = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        interactions = get_amazon_dataset()
    else:
        interactions = get_movielens_dataset(dataset)

    # Hold out 5% of the interactions ...
    train, holdout = random_train_test_split(
        interactions, test_percentage=0.05, random_state=random_state)

    # ... and divide the held-out part in half.
    test, validation = random_train_test_split(
        holdout, test_percentage=0.5, random_state=random_state)

    return train, validation, test
Esempio n. 4
0
def load_data(dataset, random_state):
    """Load a dataset by name and produce user-based sequence splits.

    ``dataset`` is matched by substring: a value containing ``'goodbooks'``
    or ``'amazon'`` selects that dataset, and any other value is passed
    unchanged to ``get_movielens_dataset``.  ``random_state`` is forwarded
    to both ``user_based_train_test_split`` calls.

    Returns ``(train_nonsequence, train, validation, test)``: the training
    interactions before conversion, followed by the sequence versions of
    the train, validation and test parts.
    """
    max_len, min_len = 100, 20

    if 'goodbooks' in dataset:
        interactions = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        interactions = get_amazon_dataset()
        # This is a dataset with shorter sequences
        max_len, min_len = 50, 5
    else:
        interactions = get_movielens_dataset(dataset)

    # The step size always equals the maximum sequence length.
    sequence_kwargs = {
        'max_sequence_length': max_len,
        'min_sequence_length': min_len,
        'step_size': max_len,
    }

    # 80% train; the remaining 20% is halved into test and validation.
    train_nonsequence, holdout = user_based_train_test_split(
        interactions, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(
        holdout, test_percentage=0.5, random_state=random_state)

    train = train_nonsequence.to_sequence(**sequence_kwargs)
    test = test.to_sequence(**sequence_kwargs)
    validation = validation.to_sequence(**sequence_kwargs)

    return train_nonsequence, train, validation, test
Esempio n. 5
0
if __name__ == '__main__':
    # Script entry point: takes two positional command-line arguments, the
    # dataset name and the model type, then loads and splits the data.
    # NOTE(review): this snippet is cut off in the middle of the final
    # to_sequence(...) call; the rest of the script is not visible here.

    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', type=str)
    parser.add_argument('model', type=str)

    args = parser.parse_args()

    # Fixed seed for the random state passed to the splits below.
    random_state = np.random.RandomState(100)

    # 'movielens' selects the Movielens 1M variant; any other value falls
    # through to the Amazon dataset with minimum-interaction thresholds.
    if args.dataset == 'movielens':
        dataset = get_movielens_dataset('1M')
        test_percentage = 0.2
    else:
        test_percentage = 0.01
        dataset = get_amazon_dataset(min_user_interactions=20,
                                     min_item_interactions=5)

    print(dataset)

    if args.model == 'sequence':
        # Maximum sequence length is the 95th percentile of the number of
        # stored entries per row of the CSR interaction matrix (presumably
        # one row per user -- confirm against the dataset class).
        max_sequence_length = int(
            np.percentile(dataset.tocsr().getnnz(axis=1), 95))
        min_sequence_length = 20
        step_size = max_sequence_length

        # NOTE(review): test_percentage chosen above is not used in the
        # visible part of this branch; the first split hard-codes 0.05.
        # Confirm whether that is intentional.
        train, rest = user_based_train_test_split(dataset,
                                                  test_percentage=0.05,
                                                  random_state=random_state)
        # The held-out part is divided in half into test and validation.
        test, validation = user_based_train_test_split(
            rest, test_percentage=0.5, random_state=random_state)
        train = train.to_sequence(max_sequence_length=max_sequence_length,
Esempio n. 6
0
 # Fragment of a larger script/function: registers optional command-line
 # arguments, picks a dataset and starts five evaluation runs.
 # NOTE(review): the creation of `parser` and the body of the final `if`
 # are outside the visible snippet.
 parser.add_argument('--variant', type=str, default='100K')
 # NOTE(review): the default is the string '20' although type=int; argparse
 # converts string defaults through `type`, but a plain int would be clearer.
 parser.add_argument('--n_epoch', type=int, default='20')
 parser.add_argument('--loss', type=str, default='bpr')
 parser.add_argument('--lr', type=float, default=1e-4)
 parser.add_argument('--sparsity', type=float, default=0.05)
 parser.add_argument('--data', type=str, default='synthetic')
 args = parser.parse_args()
 # Select the data source. 'synthetic' is matched case-sensitively, while
 # 'movielens' and 'amazon' are matched case-insensitively; any other value
 # falls through to the Goodbooks dataset.
 if str(args.data) == 'synthetic':
     # make_synthetic returns both the split fraction and the dataset.
     split, dataset = make_synthetic(args.sparsity)
 elif str(args.data).lower() == 'movielens':
     print('MovieLens')
     dataset = get_movielens_dataset(variant=args.variant)
     split = 0.2
 elif str(args.data).lower() == 'amazon':
     print('Amazon')
     dataset = get_amazon_dataset()
     split = 0.2
 else:
     print('GoodBook')
     dataset = get_goodbooks_dataset()
     split = 0.2
 rmses = []
 mrrs = []
 # One RandomState instance is shared by all five runs below.
 rs = np.random.RandomState(100)
 # NOTE(review): this drops into the interactive debugger on every
 # execution -- looks like leftover debugging; confirm before unattended use.
 pdb.set_trace()
 for i in range(5):
     print('Split - {} , Run {}'.format(split, i))
     train, test = random_train_test_split(dataset,
                                           random_state=rs,
                                           test_percentage=split)
     # NOTE(review): no '--model' argument is registered in the visible
     # lines; presumably it is added before this fragment -- verify.
     if args.model == 'implicit':