# Fetch the held-out test split (train/val splits appear to be downloaded
# above this chunk — TODO confirm). urlretrieve raises on failure, unlike the
# original unchecked os.system("wget ...") call, and needs no external binary.
import urllib.request

urllib.request.urlretrieve(
    "https://s3.amazonaws.com/cornell-tech-sdl-rec-bias/dataset/citeulike/rsrf_user_data_test.npy",
    "rsrf_user_data_test.npy")

# Pre-split CiteULike implicit-feedback data; max_user/max_item are the
# fixed dimensions of this corpus.
raw_data = dict()
raw_data['train_data'] = np.load("rsrf_user_data_train.npy")
raw_data['val_data'] = np.load("rsrf_user_data_val.npy")
raw_data['test_data'] = np.load("rsrf_user_data_test.npy")
raw_data['max_user'] = 5551
raw_data['max_item'] = 16980
batch_size = 8000
test_batch_size = 1000
display_itr = 5000  # evaluate and report every 5000 training iterations

train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'], raw_data['max_item'], name='Train')
val_dataset = ImplicitDataset(raw_data['val_data'], raw_data['max_user'], raw_data['max_item'], name='Val')
test_dataset = ImplicitDataset(raw_data['test_data'], raw_data['max_user'], raw_data['max_item'], name='Test')

# Bayesian Personalized Ranking with 50-dim embeddings, Adam, L2 = 0.01.
bpr_model = BPR(batch_size=batch_size, max_user=train_dataset.max_user(), max_item=train_dataset.max_item(),
                dim_embed=50, opt='Adam', sess_config=None, l2_reg=0.01)
# Pairwise (positive, negative) sampling over training interactions.
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=1)
model_trainer = ImplicitModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size,
    train_dataset=train_dataset, model=bpr_model, sampler=sampler,
    eval_save_prefix="bpr-citeulike")
auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
precision_evaluator = Precision(precision_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

# Train for 100k iterations, scoring the validation split against 200
# sampled negatives per positive.
model_trainer.train(num_itr=100001, display_itr=display_itr, eval_datasets=[val_dataset],
                    evaluators=[auc_evaluator, recall_evaluator, precision_evaluator, ndcg_evaluator], num_negatives=200)

Example no. 2
0
                               raw_data['max_user'],
                               raw_data['max_item'],
                               name='Test')

# Visual BPR: concatenates a 20-dim learned item embedding with a 10-dim
# projection of the raw item features supplied in raw_data['item_features'].
model = ConcatVisualBPR(
    batch_size=batch_size, max_user=raw_data['max_user'],
    max_item=raw_data['max_item'], item_serving_size=item_serving_size,
    dim_embed=20, dim_ve=10, item_f_source=raw_data['item_features'],
    l2_reg=None, sess_config=sess_config)

# Pairwise (positive, negative) sampling with 5 worker processes.
sampler = PairwiseSampler(
    batch_size=batch_size, dataset=train_dataset, num_process=5)

model_trainer = ImplicitModelTrainer(
    batch_size=batch_size, test_batch_size=test_batch_size,
    item_serving_size=item_serving_size, train_dataset=train_dataset,
    model=model, sampler=sampler)

auc_evaluator = AUC()

# 100k iterations; AUC on both splits with 1000 sampled negatives each.
model_trainer.train(
    num_itr=int(1e5), display_itr=display_itr,
    eval_datasets=[val_dataset, test_dataset],
    evaluators=[auc_evaluator], num_negatives=1000)
                                name='Train')
val_dataset = ImplicitDataset(
    raw_data['val_data'], raw_data['max_user'], raw_data['max_item'],
    name='Val')

# BPR with 50-dim embeddings, Adam, L2 = 0.001.
bpr_model = BPR(
    batch_size=batch_size, max_user=train_dataset.max_user(),
    max_item=train_dataset.max_item(), dim_embed=50, l2_reg=0.001,
    opt='Adam', sess_config=None)

# Pairwise sampling with 4 worker processes.
sampler = PairwiseSampler(
    batch_size=batch_size, dataset=train_dataset, num_process=4)

model_trainer = ImplicitModelTrainer(
    batch_size=batch_size, test_batch_size=test_batch_size,
    train_dataset=train_dataset, model=bpr_model, sampler=sampler,
    eval_save_prefix="bpr-yahoo", item_serving_size=666)

auc_evaluator = AUC()

# 10k iterations; AUC on the validation split with 200 sampled negatives.
model_trainer.train(
    num_itr=10001, display_itr=display_itr, eval_datasets=[val_dataset],
    evaluators=[auc_evaluator], num_negatives=200)
Example no. 4
0
                              raw_data['max_item'],
                              name='Val')
test_dataset = ImplicitDataset(
    raw_data['test_data'], raw_data['max_user'], raw_data['max_item'],
    name='Test')

# Probabilistic Matrix Factorization, 50-dim embeddings, Adam.
model = PMF(
    batch_size=batch_size, max_user=train_dataset.max_user(),
    max_item=train_dataset.max_item(), dim_embed=50, opt='Adam',
    sess_config=sess_config)

# Pointwise sampling: 20% positives per batch, 5 worker processes.
sampler = PointwiseSampler(
    batch_size=batch_size, dataset=train_dataset, pos_ratio=0.2,
    num_process=5)

model_trainer = ImplicitModelTrainer(
    batch_size=batch_size, test_batch_size=test_batch_size,
    train_dataset=train_dataset, model=model, sampler=sampler)

auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

# 100k iterations; AUC and Recall@k on validation and test splits.
model_trainer.train(
    num_itr=int(1e5), display_itr=display_itr,
    eval_datasets=[val_dataset, test_dataset],
    evaluators=[auc_evaluator, recall_evaluator])
Example no. 5
0
def run_test_exp(model_name=None,
                 evaluator=None,
                 raw_data=None,
                 user_per=1.0,
                 keep_days=1,
                 l2_reg=0.001,
                 test_date=None,
                 outdir=None,
                 num_itr=1e4 + 1):
    """Train a recommender on ratings before ``test_date`` and evaluate it
    on one held-out rating per user from the following week.

    For a randomly chosen ``user_per`` fraction of users, training ratings
    older than ``keep_days`` days before ``test_date`` are removed
    (simulating history truncation) before training.

    Args:
        model_name: one of 'PMF', 'CML', 'BPR'; anything else aborts.
        evaluator: one of 'Recall', 'NDCG'; anything else aborts.
        raw_data: structured numpy array with 'user_id', 'item_id' and
            'timestamp' fields (timestamps comparable with datetime.date —
            TODO confirm dtype at call sites).
        user_per: fraction of users whose history is truncated.
        keep_days: days of history kept for truncated users.
        l2_reg: L2 regularization strength for the model.
        test_date: split date as a 'YYYY-MM-DD' string.
        outdir: prefix for saved .npy arrays and evaluation output.
        num_itr: number of training iterations; evaluation happens once at
            the end (display_itr == num_itr).

    NOTE(review): relies on module-level ``batch_size``, ``test_batch_size``
    and ``LOGGING`` globals — confirm they are defined before calling.
    """
    # The historical default (1e4 + 1) is a float; iteration counts must be
    # integral, so normalize once up front.
    num_itr = int(num_itr)

    # parse dataset into incremental training and testing set
    data = raw_data
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    train_data = data[data["timestamp"] < test_date]
    # Test window: [test_date, test_date + 7 days).
    test_data = data[(data["timestamp"] >= test_date) & (
        data["timestamp"] < (test_date + datetime.timedelta(days=7)))]
    # Fixed seed so the per-user test sample and the filtered-user draw are
    # reproducible across runs.
    np.random.seed(10)
    # Keep exactly one randomly chosen rating per user in the test window.
    test_data = np.asarray([
        np.random.choice(test_data[test_data["user_id"] == uid], 1)[0]
        for uid in np.unique(test_data["user_id"])
    ])

    # filter training data, for selected users keep only the latest n days of data
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(user_list,
                                   int(len(user_list) * user_per),
                                   replace=False)
    # Mask selects ratings to DROP: from filtered users, older than keep_days.
    filter_mask = (np.isin(train_data["user_id"], filter_user)) & (
        train_data["timestamp"] <
        (test_date - datetime.timedelta(days=keep_days)))

    # output filtered data and test data
    if LOGGING:
        # NOTE(review): plain string concatenation — outdir must end with a
        # path separator for these to land in the intended directory.
        np.save(outdir + "filtered_data.npy", train_data[filter_mask])
        np.save(outdir + "train_data.npy", train_data[~filter_mask])
        np.save(outdir + "test_data.npy", test_data)

    train_data = train_data[~filter_mask]
    print("ratings after filter:%d" % len(train_data))

    train_dataset = ImplicitDataset(train_data,
                                    max_user,
                                    max_item,
                                    name='Train')
    test_dataset = ImplicitDataset(test_data, max_user, max_item, name='Test')

    num_process = 8
    dim_embed = 50
    if model_name == 'PMF':
        model = PMF(batch_size=batch_size,
                    max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(),
                    dim_embed=dim_embed,
                    opt='Adam',
                    l2_reg=l2_reg)
        sampler = PointwiseSampler(batch_size=batch_size,
                                   dataset=train_dataset,
                                   pos_ratio=0.5,
                                   num_process=num_process)
    elif model_name == 'CML':
        model = CML(batch_size=batch_size,
                    max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(),
                    dim_embed=dim_embed,
                    opt='Adam',
                    l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size,
                                  dataset=train_dataset,
                                  num_process=num_process)
    elif model_name == 'BPR':
        model = BPR(batch_size=batch_size,
                    max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(),
                    dim_embed=dim_embed,
                    opt='Adam',
                    l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size,
                                  dataset=train_dataset,
                                  num_process=num_process)
    else:
        print("Wrong model assigned")
        return

    if evaluator == 'Recall':
        test_evaluator = Recall(
            recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    elif evaluator == 'NDCG':
        test_evaluator = NDCG(
            ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    else:
        # Fixed typo in original message ("assisgned").
        print("Wrong evaluator assigned")
        return

    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model,
                                         sampler=sampler,
                                         item_serving_size=1,
                                         eval_save_prefix=outdir)
    # display_itr == num_itr, so the single evaluation runs at the end.
    model_trainer.train(num_itr=num_itr + 1,
                        display_itr=num_itr,
                        eval_datasets=[test_dataset],
                        evaluators=[test_evaluator],
                        num_negatives=200)
Example no. 6
0
def run_exp(model_name=None,
            raw_data=None,
            user_per=1.0,
            keep_days=1,
            l2_reg=0.001,
            test_date=None,
            outdir=None):
    """Train a recommender on pre-``test_date`` ratings and evaluate on a
    per-user leave-one-out validation split.

    For a randomly chosen ``user_per`` fraction of users, training ratings
    older than ``keep_days`` days before ``test_date`` are removed before
    training. One rating per remaining user is then held out for validation.

    Args:
        model_name: one of 'PMF', 'CML', 'BPR'; anything else aborts.
        raw_data: structured numpy array with 'user_id', 'item_id' and
            'timestamp' fields (timestamps comparable with datetime.date —
            TODO confirm dtype at call sites).
        user_per: fraction of users whose history is truncated.
        keep_days: days of history kept for truncated users.
        l2_reg: L2 regularization strength for the model.
        test_date: split date as a 'YYYY-MM-DD' string.
        outdir: output prefix (currently unused in this function body).

    NOTE(review): relies on module-level ``batch_size``, ``test_batch_size``,
    ``num_itr`` and ``display_itr`` globals — unlike run_test_exp, the
    iteration counts are NOT parameters here; confirm they are defined.
    """
    # parse dataset into incremental training and testing set
    data = raw_data
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    train_data = data[data["timestamp"] < test_date]

    # Fixed seed so the filtered-user draw is reproducible across runs.
    np.random.seed(10)

    # filter training data, for selected users keep only the most recent n days of data
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(user_list,
                                   int(len(user_list) * user_per),
                                   replace=False)
    # Mask selects ratings to DROP: from filtered users, older than keep_days.
    mask = (np.isin(train_data["user_id"], filter_user)) & (
        train_data["timestamp"] <
        (test_date - datetime.timedelta(days=keep_days)))
    train_data = train_data[~mask]
    print("ratings after filter:%d" % len(train_data))

    # Hold out one rating per user for validation: the FIRST occurrence of
    # each user_id. The original comment claims this is the most recent
    # rating — that only holds if rows are sorted newest-first; TODO confirm.
    user_list = np.unique(train_data["user_id"])
    val_index = [
        np.where(train_data["user_id"] == uid)[0][0] for uid in user_list
    ]
    val_data = train_data[val_index]
    train_data = np.delete(train_data, val_index)
    # Fixed typo in original message ("trian data").
    print("train data: %d, validation data %d" %
          (len(train_data), len(val_data)))

    train_dataset = ImplicitDataset(train_data,
                                    max_user,
                                    max_item,
                                    name='Train')
    val_dataset = ImplicitDataset(val_data, max_user, max_item, name='Val')

    num_process = 8
    dim_embed = 50
    if model_name == 'PMF':
        model = PMF(batch_size=batch_size,
                    max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(),
                    dim_embed=dim_embed,
                    opt='Adam',
                    l2_reg=l2_reg)
        sampler = PointwiseSampler(batch_size=batch_size,
                                   dataset=train_dataset,
                                   pos_ratio=0.5,
                                   num_process=num_process)
    elif model_name == 'CML':
        model = CML(batch_size=batch_size,
                    max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(),
                    dim_embed=dim_embed,
                    opt='Adam',
                    l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size,
                                  dataset=train_dataset,
                                  num_process=num_process)
    elif model_name == 'BPR':
        model = BPR(batch_size=batch_size,
                    max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(),
                    dim_embed=dim_embed,
                    opt='Adam',
                    l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size,
                                  dataset=train_dataset,
                                  num_process=num_process)
    else:
        print("Wrong model assigned")
        return

    recall_evaluator = Recall(
        recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model,
                                         sampler=sampler,
                                         item_serving_size=1)
    model_trainer.train(num_itr=num_itr,
                        display_itr=display_itr,
                        eval_datasets=[val_dataset],
                        evaluators=[recall_evaluator, ndcg_evaluator],
                        num_negatives=200)