def train(args):
    """Train the model selected by ``args.model`` and report test metrics.

    Reads the ratings CSV named in the model's config, trains for
    ``config['num_epoch']`` epochs, and saves a checkpoint every time the
    validation metric improves on the best value seen so far (lower is
    better). Afterwards the best checkpoint is reloaded and evaluated on the
    test data, and the resulting metrics are printed.

    Args:
        args: Object whose ``model`` attribute is a key of ``MODEL_SPEC``.
    """
    # Load data.
    # NOTE: ``config`` is the very dict stored inside MODEL_SPEC, so setting
    # 'pretrain' below mutates the shared spec in place.
    spec = MODEL_SPEC[args.model]
    config = spec['config']
    config['pretrain'] = False
    wework_dir = config['data_dir']
    # header=0 together with ``names`` replaces the file's own header row;
    # the first (unnamed) column is kept under the empty-string name.
    wework_rating = pd.read_csv(
        os.path.join(wework_dir, config['train_filename']),
        sep=',',
        header=0,
        names=['', 'account_id', 'atlas_location_uuid', 'rating', 'timestamp', 'weight'],
        engine='python',
    )
    print('Range of AccountId is [{}, {}]'.format(wework_rating.account_id.min(), wework_rating.account_id.max()))
    print('Range of LocationId is [{}, {}]'.format(wework_rating.atlas_location_uuid.min(), wework_rating.atlas_location_uuid.max()))

    Engine = spec['engine']

    # Evaluation data. ``evaluate_data`` is read right after each call;
    # presumably test()/val() switch which split it exposes -- TODO confirm
    # against SampleGenerator.
    sample_generator = SampleGenerator(wework_rating, config)
    sample_generator.test()
    test_data = sample_generator.evaluate_data
    sample_generator.val()
    val_data = sample_generator.evaluate_data

    # Specify the exact model.
    engine = Engine(config)
    # gamma is the multiplicative learning-rate decay applied every epoch.
    scheduler = StepLR(engine.opt, step_size=1, gamma=0.75)

    train_negatives = []
    best_epoch = 0
    best_metric = float('inf')
    HR_10, NDCG_10 = 0, 0
    # Track whether engine.save() ever ran, so we never try to load a
    # checkpoint that does not exist (zero epochs, or a metric that never
    # compares below +inf, e.g. NaN).
    checkpoint_saved = False
    # Epoch id passed to the final evaluation; stays defined even when the
    # loop body never runs.
    last_epoch = 0
    for epoch in range(config['num_epoch']):
        last_epoch = epoch
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)

        train_loader, train_negative = sample_generator.instance_a_train_loader(config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        scheduler.step()
        plot_grad_flow(engine.model.named_parameters(), epoch)

        # Accumulate the negatives sampled so far across all epochs; they are
        # handed to every evaluate() call.
        train_negative = flatten(train_negative)
        if len(train_negatives) != 0:
            train_negatives = pd.concat([train_negatives, train_negative], axis=0)
        else:
            train_negatives = train_negative

        metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(val_data, train_negatives, epoch_id=epoch)
        if metric < best_metric:
            best_epoch = epoch
            best_metric = metric
            HR_10, NDCG_10 = HR10, NDCG10
            engine.save(config['alias'], epoch, HR_10, NDCG_10)
            checkpoint_saved = True
            print ('Epoch {}: found best results on validation data: metric = {:.4f}, HR10 = {:.4f}, NDCG10 = {:.4f}'.format(epoch, best_metric, HR_10, NDCG_10))

    if checkpoint_saved:
        engine.load(config['alias'], best_epoch, HR_10, NDCG_10)
    else:
        print('No checkpoint was saved during training; evaluating the current model state.')
    metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(test_data, train_negatives, epoch_id=last_epoch)
    # Implicit string concatenation keeps source indentation out of the
    # printed message (a backslash continuation inside the literal does not).
    print('Best Epoch {}: metric = {:.4f}, auc = {:.4f}, HR@5 = {:.4f}, HR@10 = {:.4f}, '
          'NDCG@5 = {:.4f}, NDCG@10 = {:.4f}'.format(best_epoch, metric, auc, HR5, HR10, NDCG5, NDCG10))
# Join the user_id table (built before this fragment) onto the ratings by 'uid'.
ml1m_rating = pd.merge(ml1m_rating, user_id, how='left', on=['uid'])

# Number the distinct 'mid' values 0..n-1 in order of first appearance and
# join that numbering back onto the ratings as 'itemId'.
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id = item_id.assign(itemId=np.arange(len(item_id)))
ml1m_rating = pd.merge(ml1m_rating, item_id, how='left', on=['mid'])

# Keep only the columns used downstream.
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(
    ml1m_rating['userId'].min(), ml1m_rating['userId'].max()))
print('Range of itemId is [{}, {}]'.format(
    ml1m_rating['itemId'].min(), ml1m_rating['itemId'].max()))

# Load the training data.
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data

# Choose the training parameters and the model to train, one config at a time.
for config in [gmf_config, mlp_config, neumf_config]:
    # The MLP config is tested first, then GMF; anything else gets NeuMF.
    engine = (
        MLPEngine(config) if config == mlp_config
        else GMFEngine(config) if config == gmf_config
        else NeuMFEngine(config)
    )
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        # A fresh loader is requested from the generator on every epoch.
        train_loader = sample_generator.instance_a_train_loader(
            config['num_negative'],
            config['batch_size'],
        )
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)
        # A checkpoint is written after every epoch, not only the best one.
        engine.save(config['alias'], epoch, hit_ratio, ndcg)
def main(params):
    """Train a CoNet model on sampled book/movie ratings and collect metrics.

    Args:
        params: Mapping that provides 'books_df', 'movies_df', 'sample_size',
            'use_itemVec', 'itemVec_file' (read only when 'use_itemVec' is
            truthy), 'latent_dim', 'num_negative', 'batch_size', 'layers',
            'id', 'epoch' and 'use_cuda'.

    Returns:
        A list with one ``[hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t]`` entry
        per epoch, in epoch order. The ``_s`` values come from the books
        ratings and the ``_t`` values from the movies ratings, assuming the
        engine reports them in the ``ratings_s``/``ratings_t`` order --
        TODO confirm against CoNetEngine.evaluate.
    """
    books_df_sample, movies_df_sample = sample_data(
        params['books_df'], params['movies_df'], params['sample_size'])
    if params['use_itemVec']:
        books_df_sample, movies_df_sample = assign_vec(
            books_df_sample, movies_df_sample, params['itemVec_file'])
    books_df_sample, movies_df_sample = lbencoder(
        books_df_sample, movies_df_sample, 5000)

    sample_generator = SampleGenerator(ratings_s=books_df_sample,
                                       ratings_t=movies_df_sample)
    evaluate_data = sample_generator.evaluate_data

    # str() each layer width before joining: str.join raises TypeError on
    # non-string items, and the same list is used as layer sizes in the config.
    layers_tag = ''.join(str(width) for width in params['layers'])
    alias = 'conetItemVecc_factor{}neg{}_bz{}_{}_reg_0.0000001_{}'.format(
        params['latent_dim'], params['num_negative'], params['batch_size'],
        layers_tag, params['id'])

    config = {
        'alias': alias,
        'num_epoch': params['epoch'],
        'batch_size': params['batch_size'],
        'optimizer': 'adam',
        'adam_lr': 1e-3,
        'num_users': books_df_sample['userId'].nunique(),
        'num_items_s': books_df_sample['itemId'].nunique(),
        'num_items_t': movies_df_sample['itemId'].nunique(),
        'device_id': 0,
        'latent_dim': params['latent_dim'],
        'num_negative': params['num_negative'],
        # layers[0] is the concat of latent user vector & latent item vector
        'layers': params['layers'],
        # MLP model is sensitive to hyper params
        'l2_regularization': 0.0000001,
        'use_cuda': params['use_cuda'],
        'pretrain': False,
        'model_dir':
        'checkpoints/{}_Epoch{}_HR_s{:.4f}_NDCG_s{:.4f}_HR_t{:.4f}_NDCG_t{:.4f}.model',
    }
    engine = CoNetEngine(config)

    # The loader is built once and reused for every epoch.
    train_loader = sample_generator.instance_a_train_loader(
        config['num_negative'], config['batch_size'])

    res = []
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t = engine.evaluate(
            evaluate_data, epoch_id=epoch)
        res.append([hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t])
        # A checkpoint is written after every epoch.
        engine.save(config['alias'], epoch, hit_ratio_s, ndcg_s, hit_ratio_t,
                    ndcg_t)
    return res
# Example #4
# 0
# Distinct users per domain.
movie_user = list(set(movie['userId']))   #2007 users
music_user = list(set(music['userId']))   #160 users

# Users that appear in both domains of each pair.
book_movie_overlap = list(set(book['userId']).intersection(movie['userId']))     # 195 users
movie_music_overlap = list(set(movie['userId']).intersection(music['userId']))   # 40 users
book_music_overlap = list(set(music['userId']).intersection(book['userId']))     # 23 users

# One sample generator (and its evaluation data) per domain.
sample_book_generator = SampleGenerator(ratings=book)
evaluate_book_data = sample_book_generator.evaluate_data
sample_movie_generator = SampleGenerator(ratings=movie)
evaluate_movie_data = sample_movie_generator.evaluate_data
sample_music_generator = SampleGenerator(ratings=music)
evaluate_music_data = sample_music_generator.evaluate_data

engine = Engine(config)
train_book_loader = sample_book_generator.instance_a_train_loader(config['batch_size'])
train_music_loader = sample_music_generator.instance_a_train_loader(config['batch_size'])
train_movie_loader = sample_movie_generator.instance_a_train_loader(config['batch_size'])

# Precomputed movie/music index lists, stored as JSON under the keys
# 'movie' and 'music'.
with open('overlap_movie_music_index', 'r', encoding='utf-8') as f:
    overlap = json.load(f)
movie_overlap = overlap['movie']
music_overlap = overlap['music']

# Loaders restricted to the users shared by the movie and music domains.
overlap_generator = OverlapGenerator(rating1=movie, rating2=music, users=movie_music_overlap)
overlap_movie_loader, overlap_music_loader, movie_user_embeddings, music_user_embeddings = overlap_generator.instance_a_train_loader(config['batch_size'])

# Metric loaders must come from metric_generator; the original called
# overlap_generator here, leaving metric_generator unused.
# NOTE(review): assumes MetricGenerator.instance_a_train_loader returns the
# same 4-tuple shape as OverlapGenerator's -- confirm against its definition.
metric_generator = MetricGenerator(rating1=movie, rating2=music, metric1=movie_overlap, metric2=music_overlap)
metric_movie_loader, metric_music_loader, movie_item_embeddings, music_item_embeddings = metric_generator.instance_a_train_loader(config['batch_size'])