Example #1
def load_data(dataset, random_state):

    dataset = get_movielens_dataset(dataset)

    # np.random.shuffle(dataset.timestamps)

    # max_sequence_length = int(np.percentile(dataset.tocsr()
    #                                         .getnnz(axis=1),
    #                                         80))
    max_sequence_length = 100
    min_sequence_length = 50
    step_size = max_sequence_length

    train_nonsequence, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    train = train_nonsequence.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    return train_nonsequence, train, validation, test
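
A minimal sketch of how these splits might be consumed, reusing the ImplicitSequenceModel and sequence_mrr_score helpers that appear in the later examples; the '1M' variant and the hyperparameters below are illustrative assumptions, not part of the original:

train_nonsequence, train, validation, test = load_data(
    '1M', np.random.RandomState(42))

model = ImplicitSequenceModel(n_iter=3,
                              representation='lstm',
                              loss='adaptive_hinge')
model.fit(train, verbose=True)

# One MRR value per validation sequence; average for a scalar summary.
print('Validation MRR: {:.4f}'.format(
    sequence_mrr_score(model, validation).mean()))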
Example #2
def main(args):
    status = 'available' if CUDA else 'not available'
    print("CUDA is {}!".format(status))
    args = parse_args(args)

    # Fix random_state
    seed = 72
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if args.dataset == 'amazon':
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
        dataset = get_amazon_dataset()
    elif args.dataset == 'goodbooks':
        dataset = get_goodbooks_dataset()
    else:
        dataset = get_movielens_dataset(args.dataset.upper())

    args.variant = args.dataset
    train, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, valid = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)
    train = train.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    valid = valid.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('model: {}, data: {}'.format(args.model, train))

    fname = 'experiment_{}_{}.pickle'.format(args.model, args.dataset)
    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space(args.model)

    for iteration in range(args.num_trials):
        print('Iteration {}'.format(iteration))
        trials = optimize(objective,
                          space,
                          trials_fname=fname,
                          max_evals=iteration + 1)

        summarize_trials(trials)
Example #3
def run(model_type=None):
    random_state = mrecsys.sequence.__random_state__

    if model_type is None:
        model_type = input('Enter model type (cnn / lstm / pooling): ')
    print('CUDA:', CUDA)
    interactions, time_code, _, _ = load_latest_interactions()
    train, rest = user_based_train_test_split(interactions,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   random_state=random_state)
    print('Split into \n {} and \n {} and \n {}.'.format(
        train, test, validation))

    tuning(train, test, validation, random_state, model_type, time_code)
Example #4
def _get_synthetic_data(num_users=100,
                        num_items=100,
                        num_interactions=10000,
                        randomness=0.01,
                        order=2,
                        random_state=None):

    interactions = synthetic.generate_sequential(
        num_users=num_users,
        num_items=num_items,
        num_interactions=num_interactions,
        concentration_parameter=randomness,
        order=order,
        random_state=random_state)

    print('Max prob {}'.format(
        (np.unique(interactions.item_ids, return_counts=True)[1] /
         num_interactions).max()))

    train, test = user_based_train_test_split(interactions,
                                              random_state=random_state)

    train = train.to_sequence(max_sequence_length=10)
    test = test.to_sequence(max_sequence_length=10)

    return train, test
Example #5
def data_implicit_sequence():

    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200

    interactions = movielens.get_movielens_dataset('100K')

    train, test = user_based_train_test_split(interactions,
                                              random_state=RANDOM_STATE)

    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)

    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)

    model = ImplicitSequenceModel(loss='adaptive_hinge',
                                  representation='lstm',
                                  batch_size=8,
                                  learning_rate=1e-2,
                                  l2=1e-3,
                                  n_iter=2,
                                  use_cuda=CUDA,
                                  random_state=RANDOM_STATE)

    model.fit(train, verbose=True)

    return train, test, model
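
A possible follow-on, scoring the fitted model on the held-out sequences with the sequence_mrr_score helper used elsewhere on this page (a sketch, not part of the original snippet):

train, test, model = data_implicit_sequence()

# One MRR value per test sequence; average for a scalar summary.
test_mrr = sequence_mrr_score(model, test)
print('Mean test MRR: {:.4f}'.format(test_mrr.mean()))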
Example #6
def _get_synthetic_data(num_users=100,
                        num_items=100,
                        num_interactions=10000,
                        randomness=0.01,
                        order=2,
                        max_sequence_length=10,
                        random_state=None):

    interactions = synthetic.generate_sequential(num_users=num_users,
                                                 num_items=num_items,
                                                 num_interactions=num_interactions,
                                                 concentration_parameter=randomness,
                                                 order=order,
                                                 random_state=random_state)

    print('Max prob {}'.format((np.unique(interactions.item_ids,
                                          return_counts=True)[1] /
                                num_interactions).max()))

    train, test = user_based_train_test_split(interactions,
                                              random_state=random_state)

    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              step_size=None)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            step_size=None)

    return train, test
Example #7
def test_user_based_split():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = (cross_validation.user_based_train_test_split(
        interactions, test_percentage=0.2, random_state=RANDOM_STATE))

    assert len(train) + len(test) == len(interactions)

    users_in_test = len(np.unique(test.user_ids))
    assert np.allclose(float(users_in_test) / interactions.num_users, 0.2, atol=0.001)
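
Because a user-based split assigns each user's interactions wholly to one side, a complementary check (a sketch against the same fixtures as the test above) is that the train and test user sets are disjoint:

def test_user_sets_disjoint():

    interactions = movielens.get_movielens_dataset('100K')

    train, test = cross_validation.user_based_train_test_split(
        interactions, test_percentage=0.2, random_state=RANDOM_STATE)

    # No user should appear on both sides of a user-based split.
    assert len(np.intersect1d(np.unique(train.user_ids),
                              np.unique(test.user_ids))) == 0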
Example #8
def load_data(dataset, random_state):

    max_sequence_length = 100
    min_sequence_length = 20
    step_size = max_sequence_length

    if 'goodbooks' in dataset:
        dataset = get_goodbooks_dataset()
    elif 'amazon' in dataset:
        dataset = get_amazon_dataset()

        # This is a dataset with shorter sequences
        max_sequence_length = 50
        min_sequence_length = 5
        step_size = max_sequence_length
    else:
        dataset = get_movielens_dataset(dataset)

    train_nonsequence, rest = user_based_train_test_split(
        dataset, test_percentage=0.2, random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)

    train = train_nonsequence.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    return train_nonsequence, train, validation, test
Example #9
def preprocess_generated(num_users=100, num_items=1000, num_interactions=10000):
    from spotlight.datasets.synthetic import generate_sequential
    from spotlight.cross_validation import user_based_train_test_split

    dataset = generate_sequential(num_users=num_users,
                                  num_items=num_items,
                                  num_interactions=num_interactions,
                                  concentration_parameter=0.0001,
                                  order=3)

    dat = {key: split for key, split in
           zip(["train", "test"], user_based_train_test_split(dataset))}
    dat_seq = {key: val.to_sequence() for key, val in dat.items()}

    ind2val = {}
    ind2val['itemId'] = {idx: item for idx, item in
                         enumerate(range(dataset.item_ids.max()))}

    return dat, dat_seq, ind2val
Example #10
def train_model(df, hyperparams):
    # Fix random_state
    seed = 42
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    max_sequence_length = 15
    min_sequence_length = 2
    step_size = 1

    # create dataset using interactions dataframe and timestamps
    dataset = Interactions(user_ids=np.array(df['user_id'], dtype='int32'),
                           item_ids=np.array(df['item_id'], dtype='int32'),
                           timestamps=df['entry_at'])

    # create training and test sets using a 80/20 split
    train, test = user_based_train_test_split(dataset,
                                              test_percentage=0.2,
                                              random_state=random_state)
    # convert to sequences
    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)

    print('data: {}'.format(train))

    # initialize and train model
    model = ImplicitSequenceModel(**hyperparams,
                                  use_cuda=CUDA,
                                  random_state=random_state)
    model.fit(train, verbose=True)

    # compute mrr score on test set
    test_mrr = sequence_mrr_score(model, test).mean()
    print('MRR score on test set: {}'.format(test_mrr))

    return model
Example #11
    df_out['timestamp'] = df_data[['timestamp']]
    df_out['step'] = df_data[['step']]
    df_out['item_recommendations'] = item_recommendations

    return df_out


print("Load Data")
train_csv = abspath("../../../resources/train_small_no_header.csv")
test_csv = abspath("../../../resources/test.csv")
subm_csv = abspath("../../../resources/myoutput.csv")

print(f"Reading {train_csv} ...")
df_train = pd.read_csv(train_csv)

# user_based_train_test_split expects an Interactions object rather than a
# CSV path; build one from the loaded dataframe first (the 'user_id',
# 'item_id' and 'timestamp' column names here are assumptions).
interactions = Interactions(user_ids=np.array(df_train['user_id'], dtype='int32'),
                            item_ids=np.array(df_train['item_id'], dtype='int32'),
                            timestamps=df_train['timestamp'])
train, test = user_based_train_test_split(interactions)
train = train.to_sequence()
test = test.to_sequence()

#print(f"Reading {test_csv} ...")
#df_test = pd.read_csv(test_csv)

print("Build and Fit Implicit Sequence Model")
model = ImplicitSequenceModel(n_iter=3, representation='cnn', loss='bpr')
#model.fit(df_train)

model.fit(train)

print("Calculate MRR Score")
mrr = sequence_mrr_score(model, test)
print("MRR Result: ", mrr)
Example #12
import numpy as np
from spotlight.cross_validation import user_based_train_test_split
from spotlight.evaluation import sequence_mrr_score
from spotlight.sequence.implicit import ImplicitSequenceModel
from spotlight.datasets.synthetic import generate_sequential

dataset = generate_sequential(num_users=100,
                              num_items=1000,
                              num_interactions=10000,
                              concentration_parameter=0.01,
                              order=3)
train, test = user_based_train_test_split(dataset)

train = train.to_sequence()
test = test.to_sequence()

model = ImplicitSequenceModel(n_iter=3, representation='cnn', loss='bpr')
model.fit(train)

mrr = sequence_mrr_score(model, test)
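
sequence_mrr_score returns one reciprocal-rank score per test sequence; averaging gives a scalar summary (a small follow-on sketch):

print('Mean MRR: {:.4f}'.format(mrr.mean()))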
Example #13
        results.save(hyperparameters, test_mrr.mean(), val_mrr.mean())

    return results


if __name__ == '__main__':

    max_sequence_length = 200
    min_sequence_length = 20
    step_size = 200
    random_state = np.random.RandomState(100)

    dataset = get_movielens_dataset('1M')

    train, rest = user_based_train_test_split(dataset,
                                              random_state=random_state)
    test, validation = user_based_train_test_split(rest,
                                                   test_percentage=0.5,
                                                   random_state=random_state)
    train = train.to_sequence(max_sequence_length=max_sequence_length,
                              min_sequence_length=min_sequence_length,
                              step_size=step_size)
    test = test.to_sequence(max_sequence_length=max_sequence_length,
                            min_sequence_length=min_sequence_length,
                            step_size=step_size)
    validation = validation.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    mode = sys.argv[1]
def main(max_evals):
    status = 'available' if CUDA else 'not available'
    print("CUDA is {}!".format(status))

    # Fix random_state
    seed = 42
    set_seed(seed)
    random_state = np.random.RandomState(seed)

    max_sequence_length = 15
    min_sequence_length = 2
    step_size = 1

    df = pd.read_csv(FILE_PATH)
    if 'time_of_day' in df.columns:
        df = df.drop(columns=['time_of_day', 'time_of_year', 'is_content_block'])
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0', 'js_key'])

    sub_col = 'subscriber_id'
    block_col = 'ddi_id'
    time_col = 'entry_at'

    # preprocess dataframe
    df[time_col] = pd.to_datetime(df[time_col])
    df.sort_values(by=time_col, inplace=True)
    df.reset_index(inplace=True)
    df.drop(columns='index', inplace=True)

    # create idx mapping compatible with spotlight, map users and items
    sub_mapping = {k: v for v, k in enumerate(df[sub_col].unique())}
    block_mapping = {k: v for v, k in enumerate(df[block_col].unique(), 1)}
    df['user_id'] = df[sub_col].map(sub_mapping)
    df['item_id'] = df[block_col].map(block_mapping)

    # create dataset using interactions and timestamps
    dataset = Interactions(user_ids=np.array(df['user_id'], dtype='int32'),
                           item_ids=np.array(df['item_id'], dtype='int32'),
                           timestamps=df[time_col])

    # create training, validation and test sets using a 80/10/10 split
    train, rest = user_based_train_test_split(
        dataset,
        test_percentage=0.2,
        random_state=random_state)
    test, valid = user_based_train_test_split(
        rest,
        test_percentage=0.5,
        random_state=random_state)
    # convert to sequences
    train = train.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    test = test.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)
    valid = valid.to_sequence(
        max_sequence_length=max_sequence_length,
        min_sequence_length=min_sequence_length,
        step_size=step_size)

    print('data: {}'.format(train))

    dtime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    fname = './experiment_{}.pickle'.format(dtime)
    objective = get_objective(train, valid, test, random_state)
    space = hyperparameter_space()

    trials = optimize(objective,
                      space,
                      trials_fname=fname,
                      max_evals=max_evals)

    summarize_trials(trials)

    return trials