def train(args):
    """Train the model selected by ``args.model`` and report test metrics.

    Loads the rating CSV named by the model spec, builds validation/test
    splits via SampleGenerator, trains for ``num_epoch`` epochs with a
    step-decayed learning rate, checkpoints the best epoch by validation
    metric (lower is better), then reloads that checkpoint and evaluates
    on the held-out test split.

    Args:
        args: namespace with a ``model`` attribute keying into MODEL_SPEC.
    """
    # Resolve the model spec and force training from scratch.
    spec = MODEL_SPEC[args.model]
    config = spec['config']
    config['pretrain'] = False

    # Load raw ratings. NOTE(review): the first (empty-named) column is
    # presumably a CSV row index — confirm against the data file.
    wework_dir = config['data_dir']
    wework_rating = pd.read_csv(
        os.path.join(wework_dir, config['train_filename']),
        sep=',',
        header=0,
        names=['', 'account_id', 'atlas_location_uuid', 'rating', 'timestamp', 'weight'],
        engine='python',
    )
    print('Range of AccountId is [{}, {}]'.format(
        wework_rating.account_id.min(), wework_rating.account_id.max()))
    print('Range of LocationId is [{}, {}]'.format(
        wework_rating.atlas_location_uuid.min(), wework_rating.atlas_location_uuid.max()))

    Engine = spec['engine']

    # Build evaluation splits. NOTE(review): SampleGenerator exposes the
    # split selected by the most recent test()/val() call through
    # .evaluate_data, so the call order below matters — verify.
    sample_generator = SampleGenerator(wework_rating, config)
    sample_generator.test()
    test_data = sample_generator.evaluate_data
    sample_generator.val()
    val_data = sample_generator.evaluate_data

    engine = Engine(config)
    # gamma = decaying factor applied to the learning rate after every epoch.
    scheduler = StepLR(engine.opt, step_size=1, gamma=0.75)

    train_negatives = None  # negatives accumulated across epochs (DataFrame)
    best_epoch = 0
    best_metric = float('inf')  # lower is better
    HR_10, NDCG_10 = 0, 0
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        train_loader, train_negative = sample_generator.instance_a_train_loader(
            config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        scheduler.step()
        plot_grad_flow(engine.model.named_parameters(), epoch)

        # Accumulate every negative sample drawn so far; evaluation uses
        # them to exclude training negatives.
        train_negative = flatten(train_negative)
        if train_negatives is None:
            train_negatives = train_negative
        else:
            train_negatives = pd.concat([train_negatives, train_negative], axis=0)

        metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(
            val_data, train_negatives, epoch_id=epoch)
        if metric < best_metric:
            # New best validation score: checkpoint this epoch.
            best_epoch = epoch
            best_metric = metric
            HR_10, NDCG_10 = HR10, NDCG10
            engine.save(config['alias'], epoch, HR_10, NDCG_10)
            print('Epoch {}: found best results on validation data: '
                  'metric = {:.4f}, HR10 = {:.4f}, NDCG10 = {:.4f}'.format(
                      epoch, best_metric, HR_10, NDCG_10))

    # Restore the best checkpoint and report final metrics on the test split.
    engine.load(config['alias'], best_epoch, HR_10, NDCG_10)
    metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(
        test_data, train_negatives, epoch_id=epoch)
    print('Best Epoch {}: metric = {:.4f}, auc = {:.4f}, HR@5 = {:.4f}, '
          'HR@10 = {:.4f}, NDCG@5 = {:.4f}, NDCG@10 = {:.4f}'.format(
              best_epoch, metric, auc, HR5, HR10, NDCG5, NDCG10))
# --- MovieLens-1M preprocessing and baseline training (script-level code) ---
# NOTE(review): `ml1m_rating`, `user_id`, and the *_config dicts are defined
# earlier in the file (outside this view) — verify before editing.

# Attach the dense zero-based `userId` index produced earlier to each rating.
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
# Build a dense zero-based `itemId` index from the distinct raw movie ids.
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
# Keep only the columns the sampler and engines consume.
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(), ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(), ml1m_rating.itemId.max()))
# Load training data
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data
# Specify training parameters and train each model
for config in [gmf_config, mlp_config, neumf_config]:
    # Pick the engine matching this config (equality comparison on the dicts).
    if config == mlp_config:
        engine = MLPEngine(config)
    elif config == gmf_config:
        engine = GMFEngine(config)
    else:
        engine = NeuMFEngine(config)
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        train_loader = sample_generator.instance_a_train_loader(
            config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)
        # Checkpoint after every epoch, tagged with the eval metrics.
        engine.save(config['alias'], epoch, hit_ratio, ndcg)
def main(params):
    """Train CoNet on a books→movies cross-domain sample.

    Prepares both rating domains from ``params``, builds the shared engine
    config, runs ``params['epoch']`` training epochs, and checkpoints after
    every epoch.

    Args:
        params: dict carrying the raw data frames, hyper-parameters, and
            run settings (see the config construction below for the keys).

    Returns:
        A list with one ``[hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t]``
        entry per epoch.
    """
    # Draw the working sample from both domains (source: books, target: movies).
    src_df, tgt_df = sample_data(
        params['books_df'], params['movies_df'], params['sample_size'])
    if params['use_itemVec']:
        # Optionally attach pretrained item vectors to both domains.
        src_df, tgt_df = assign_vec(src_df, tgt_df, params['itemVec_file'])
    src_df, tgt_df = lbencoder(src_df, tgt_df, 5000)

    generator = SampleGenerator(ratings_s=src_df, ratings_t=tgt_df)
    eval_data = generator.evaluate_data

    run_alias = 'conetItemVecc_factor{}neg{}_bz{}_{}_reg_0.0000001_{}'.format(
        params['latent_dim'], params['num_negative'], params['batch_size'],
        ''.join(params['layers']), params['id'])
    config = {
        'alias': run_alias,
        'num_epoch': params['epoch'],
        'batch_size': params['batch_size'],
        'optimizer': 'adam',
        'adam_lr': 1e-3,
        'num_users': src_df['userId'].nunique(),
        'num_items_s': src_df['itemId'].nunique(),
        'num_items_t': tgt_df['itemId'].nunique(),
        'device_id': 0,
        'latent_dim': params['latent_dim'],
        'num_negative': params['num_negative'],
        # layers[0] is the concat of latent user vector & latent item vector
        'layers': params['layers'],
        'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
        'use_cuda': params['use_cuda'],
        'pretrain': False,
        'model_dir': 'checkpoints/{}_Epoch{}_HR_s{:.4f}_NDCG_s{:.4f}_HR_t{:.4f}_NDCG_t{:.4f}.model',
    }

    engine = CoNetEngine(config)
    # The loader is built once and reused across all epochs.
    loader = generator.instance_a_train_loader(
        config['num_negative'], config['batch_size'])

    history = []
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        engine.train_an_epoch(loader, epoch_id=epoch)
        hr_s, ndcg_s, hr_t, ndcg_t = engine.evaluate(eval_data, epoch_id=epoch)
        history.append([hr_s, ndcg_s, hr_t, ndcg_t])
        engine.save(config['alias'], epoch, hr_s, ndcg_s, hr_t, ndcg_t)
    return history
# --- Cross-domain overlap setup (script-level code) ---
# NOTE(review): `book`, `movie`, `music`, `config`, `Engine`,
# `SampleGenerator`, `OverlapGenerator`, and `MetricGenerator` come from
# earlier in the file (outside this view).

movie_user = list(set(movie['userId']))  # 2007 users
music_user = list(set(music['userId']))  # 160 users
# Users shared between each pair of domains.
book_movie_overlap = list(set(book['userId']).intersection(movie['userId']))  # 195 users
movie_music_overlap = list(set(movie['userId']).intersection(music['userId']))  # 40 users
book_music_overlap = list(set(music['userId']).intersection(book['userId']))  # 23 users

# Per-domain samplers and their evaluation splits.
sample_book_generator = SampleGenerator(ratings=book)
evaluate_book_data = sample_book_generator.evaluate_data
sample_movie_generator = SampleGenerator(ratings=movie)
evaluate_movie_data = sample_movie_generator.evaluate_data
sample_music_generator = SampleGenerator(ratings=music)
evaluate_music_data = sample_music_generator.evaluate_data

engine = Engine(config)
train_book_loader = sample_book_generator.instance_a_train_loader(config['batch_size'])
train_music_loader = sample_music_generator.instance_a_train_loader(config['batch_size'])
train_movie_loader = sample_movie_generator.instance_a_train_loader(config['batch_size'])

# Precomputed index of overlapping users between the movie and music domains.
with open('overlap_movie_music_index', 'r') as f:
    overlap = json.load(f)
movie_overlap = overlap['movie']
music_overlap = overlap['music']
# book_music_overlap = list(np.random.choice(book_music_overlap,64,replace=True))

overlap_generator = OverlapGenerator(rating1=movie, rating2=music,
                                     users=movie_music_overlap)
overlap_movie_loader, overlap_music_loader, movie_user_embeddings, music_user_embeddings = \
    overlap_generator.instance_a_train_loader(config['batch_size'])

metric_generator = MetricGenerator(rating1=movie, rating2=music,
                                   metric1=movie_overlap, metric2=music_overlap)
# BUG FIX: the original called overlap_generator.instance_a_train_loader here
# (copy-paste from the block above), so the metric loaders were rebuilt from
# the overlap generator and `metric_generator` was never used. Call the
# freshly-constructed metric_generator instead.
metric_movie_loader, metric_music_loader, movie_item_embeddings, music_item_embeddings = \
    metric_generator.instance_a_train_loader(config['batch_size'])