Example #1
0
def main(args):
    """Run an incremental hyperparameter search for a model on one dataset.

    ``args`` is the raw argument list; it is replaced by the parsed
    namespace via ``parse_args``.  Results are checkpointed to a pickle
    file named after the model and dataset.
    """
    cuda_status = 'available' if CUDA else 'not available'
    print("CUDA is {}!".format(cuda_status))
    args = parse_args(args)

    # Fix random_state so splits are reproducible across runs.
    seed = 72
    set_seed(seed)
    rng = np.random.RandomState(seed)

    # Sequence-window defaults; Amazon uses shorter windows.
    if args.dataset == 'amazon':
        max_len, min_len = 50, 5
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        max_len, min_len = 100, 20
        dataset = get_goodbooks_dataset()
    else:
        max_len, min_len = 100, 20
        dataset = get_movielens_dataset(args.dataset.upper())
    step = max_len

    args.variant = args.dataset

    # 80/10/10 user-based split: train vs rest, then rest into test/valid.
    train, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=rng)
    test, valid = user_based_train_test_split(
        rest, test_percentage=0.5, random_state=rng)

    def _to_sequences(interactions):
        # Convert an interactions split to fixed-window sequences.
        return interactions.to_sequence(
            max_sequence_length=max_len,
            min_sequence_length=min_len,
            step_size=step)

    train = _to_sequences(train)
    test = _to_sequences(test)
    valid = _to_sequences(valid)

    print('model: {}, data: {}'.format(args.model, train))

    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)
    objective = get_objective(train, valid, test, rng)
    space = hyperparameter_space(args.model)

    # One extra evaluation per iteration; optimize resumes from fname,
    # so trials accumulate across iterations.
    for iteration in range(args.num_trials):
        print('Iteration {}'.format(iteration))
        trials = optimize(objective,
                          space,
                          trials_fname=fname,
                          max_evals=iteration + 1)

        summarize_trials(trials)
Example #2
0
def generate_dataset_table():
    """Build a full-width LaTeX booktabs table of dataset statistics.

    For each dataset it reports user/item counts, interaction density,
    and the ratio of the 95th to the 50th percentile of per-item
    interaction counts (a popularity-skew measure).
    """
    headers = ['Dataset', 'Users', 'Items', 'Density', '95th/50th']

    named_datasets = (
        ('Movielens 10M', get_movielens_dataset('10M')),
        ('Amazon', get_amazon_dataset()),
        ('Goodbooks', get_goodbooks_dataset()),
    )

    rows = []
    for name, dataset in named_datasets:
        # Per-item interaction counts from the sparse matrix columns.
        item_counts = dataset.tocoo().getnnz(axis=0)
        skew_ratio = (np.percentile(item_counts, 95) /
                      np.percentile(item_counts, 50))

        print('Dataset {}, ratio: {:0,}'.format(name, skew_ratio))

        rows.append([
            name,
            '{:0,}'.format(dataset.num_users),
            '{:0,}'.format(dataset.num_items),
            len(dataset) / dataset.num_users / dataset.num_items,
            '{0:.2f}'.format(skew_ratio),
        ])

    return _full_width_table(
        tabulate(rows,
                 headers=headers,
                 floatfmt='.4f',
                 tablefmt='latex_booktabs'))
Example #3
0
def load_data(dataset, random_state):
    """Load a dataset by name and split it into train/validation/test.

    The split is random (not user-based): 95% train, then the remaining
    5% is halved into test and validation.
    """
    # Resolve the dataset name; anything unrecognised is treated as a
    # MovieLens variant string.
    if 'goodbooks' in dataset:
        interactions = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        interactions = get_amazon_dataset()
    else:
        interactions = get_movielens_dataset(dataset)

    train, holdout = random_train_test_split(
        interactions,
        test_percentage=0.05,
        random_state=random_state)

    test, validation = random_train_test_split(
        holdout,
        test_percentage=0.5,
        random_state=random_state)

    return train, validation, test
Example #4
0
def load_data(dataset, random_state):
    """Load a dataset, split it per-user, and convert splits to sequences.

    Returns ``(train_nonsequence, train, validation, test)`` where the
    first element keeps the raw (non-sequential) training interactions
    and the rest are sequence datasets.
    """
    # Default sequence windows; Amazon gets shorter ones below.
    max_len, min_len = 100, 20

    if 'goodbooks' in dataset:
        interactions = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        interactions = get_amazon_dataset()

        # This is a dataset with shorter sequences
        max_len, min_len = 50, 5
    else:
        interactions = get_movielens_dataset(dataset)

    step = max_len

    # 80/10/10 user-based split.
    train_nonsequence, holdout = user_based_train_test_split(
        interactions, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(
        holdout,
        test_percentage=0.5,
        random_state=random_state)

    def _as_sequences(part):
        # Apply the shared windowing parameters to one split.
        return part.to_sequence(
            max_sequence_length=max_len,
            min_sequence_length=min_len,
            step_size=step)

    train = _as_sequences(train_nonsequence)
    test = _as_sequences(test)
    validation = _as_sequences(validation)

    return train_nonsequence, train, validation, test
Example #5
0
 parser.add_argument('--sparsity', type=float, default=0.05)
 parser.add_argument('--data', type=str, default='synthetic')
 args = parser.parse_args()
 if str(args.data) == 'synthetic':
     split, dataset = make_synthetic(args.sparsity)
 elif str(args.data).lower() == 'movielens':
     print('MovieLens')
     dataset = get_movielens_dataset(variant=args.variant)
     split = 0.2
 elif str(args.data).lower() == 'amazon':
     print('Amazon')
     dataset = get_amazon_dataset()
     split = 0.2
 else:
     print('GoodBook')
     dataset = get_goodbooks_dataset()
     split = 0.2
 rmses = []
 mrrs = []
 rs = np.random.RandomState(100)
 pdb.set_trace()
 for i in range(5):
     print('Split - {} , Run {}'.format(split, i))
     train, test = random_train_test_split(dataset,
                                           random_state=rs,
                                           test_percentage=split)
     if args.model == 'implicit':
         model = ImplicitFactorizationModel(n_iter=args.n_epoch,
                                            loss=args.loss,
                                            use_cuda=True,
                                            learning_rate=args.lr,