def model_predict(path_finish, path_like):
    # Run inference with the pretrained "finish" model on the test set.
    Config["normal_config"]["pretrain"] = True
    Config["normal_config"]["pretrain_model_dir"] = path_finish
    engine_finish = ModelEngine(config=Config, model=NFFM)
    sample_generator = SampleGenerator()

    print()
    print("------------start testing finish--------------")
    test_loader = sample_generator.instance_a_loader(t="test")
    df_finish = engine_finish.predict(test_loader)
    print("------------finish testing -------------------")

    # Reuse the same test loader with the pretrained "like" model.
    print("------------start testing like ----------------")
    Config["normal_config"]["pretrain_model_dir"] = path_like
    engine_like = ModelEngine(config=Config, model=NFFM)
    df_like = engine_like.predict(test_loader)

    # Merge both probability columns into a single submission file.
    df_finish["like_probability"] = df_like["pred_probability"]
    df_finish.columns = ["uid", "item_id", "finish_probability", "like_probability"]
    df_finish.to_csv(Config["normal_config"]["predict_file"]
                     + Config["normal_config"]["model_name"],
                     index=False, float_format="%.6f")
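# Hypothetical invocation; both checkpoint paths are illustrative assumptions,
# not files produced by this repo.
model_predict(path_finish="checkpoints/nffm_finish_epoch19.model",
              path_like="checkpoints/nffm_like_epoch19.model")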
def train(args):
    # Load data
    spec = MODEL_SPEC[args.model]
    config = spec['config']
    config['pretrain'] = False
    wework_dir = config['data_dir']
    # The first (unnamed) column is the CSV's row index.
    wework_rating = pd.read_csv(os.path.join(wework_dir, config['train_filename']),
                                sep=',', header=0,
                                names=['', 'account_id', 'atlas_location_uuid',
                                       'rating', 'timestamp', 'weight'],
                                engine='python')
    print('Range of AccountId is [{}, {}]'.format(wework_rating.account_id.min(),
                                                  wework_rating.account_id.max()))
    print('Range of LocationId is [{}, {}]'.format(wework_rating.atlas_location_uuid.min(),
                                                   wework_rating.atlas_location_uuid.max()))

    Engine = spec['engine']

    # DataLoader for training; evaluate_data reflects whichever split was
    # selected last, so grab the test split before switching to validation.
    sample_generator = SampleGenerator(wework_rating, config)
    sample_generator.test()
    test_data = sample_generator.evaluate_data
    sample_generator.val()
    val_data = sample_generator.evaluate_data

    # Specify the exact model
    engine = Engine(config)
    # gamma = decaying factor for the learning-rate schedule
    scheduler = StepLR(engine.opt, step_size=1, gamma=0.75)

    train_negatives = []
    best_epoch = 0
    best_metric = float('inf')  # lower is better for this metric
    HR_10, NDCG_10 = 0, 0
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        train_loader, train_negative = sample_generator.instance_a_train_loader(
            config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        scheduler.step()
        plot_grad_flow(engine.model.named_parameters(), epoch)

        # Accumulate all negatives sampled so far so evaluation can exclude them.
        train_negative = flatten(train_negative)
        if len(train_negatives) != 0:
            train_negatives = pd.concat([train_negatives, train_negative], axis=0)
        else:
            train_negatives = train_negative

        metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(
            val_data, train_negatives, epoch_id=epoch)
        if metric < best_metric:
            best_epoch = epoch
            best_metric = metric
            HR_10, NDCG_10 = HR10, NDCG10
            engine.save(config['alias'], epoch, HR_10, NDCG_10)
            print('Epoch {}: found best results on validation data: '
                  'metric = {:.4f}, HR10 = {:.4f}, NDCG10 = {:.4f}'.format(
                      epoch, best_metric, HR_10, NDCG_10))

    # Reload the best checkpoint and report results on the held-out test set.
    engine.load(config['alias'], best_epoch, HR_10, NDCG_10)
    metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(
        test_data, train_negatives, epoch_id=epoch)
    print('Best Epoch {}: metric = {:.4f}, auc = {:.4f}, HR@5 = {:.4f}, '
          'HR@10 = {:.4f}, NDCG@5 = {:.4f}, NDCG@10 = {:.4f}'.format(
              best_epoch, metric, auc, HR5, HR10, NDCG5, NDCG10))
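# A minimal sketch of the MODEL_SPEC registry that train() indexes into. The
# engine class and every config value here are assumptions, not the repo's
# actual definitions; only the keys train() reads above are shown.
MODEL_SPEC = {
    'nffm': {
        'engine': Engine,              # hypothetical: whichever Engine subclass applies
        'config': {
            'alias': 'nffm_baseline',  # used when saving/loading checkpoints
            'data_dir': 'data/',
            'train_filename': 'train.csv',
            'num_epoch': 20,
            'num_negative': 4,
            'batch_size': 256,
        },
    },
}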
def test_nlp_algs(review_json):
    # Build BERT-based embeddings and wrap them for evaluation.
    bert_config = BertConfig()
    bert_engine = Engine(bert_config)
    bert_embedding_generator = BertEmbeddingGenerator(review_json)
    bert_tensors = generate_tensors(bert_embedding_generator, review_json)

    # Do the same with the Seq2Seq embedding pipeline.
    seq2seq_config = Seq2Seq.get_config()
    seq2seq_engine = Engine(seq2seq_config)
    seq2seq_embedding_generator = Seq2SeqEmbeddingGenerator(review_json)
    seq2seq_tensors = generate_tensors(seq2seq_embedding_generator, review_json)

    bert_rating_dataset = UserItemRatingDataset(bert_tensors)
    seq2seq_rating_dataset = UserItemRatingDataset(seq2seq_tensors)
    bert_evaluation_tool = SampleGenerator(bert_rating_dataset)
    seq2seq_evaluation_tool = SampleGenerator(seq2seq_rating_dataset)
    return (bert_evaluation_tool.evaluate_data(),
            seq2seq_evaluation_tool.evaluate_data())
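# Hypothetical driver; the reviews file name is an illustrative assumption,
# and review_json is taken to be the parsed payload both generators expect.
import json
with open("reviews.json") as f:
    review_json = json.load(f)
bert_eval_data, seq2seq_eval_data = test_nlp_algs(review_json)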
# Reindex users and items to contiguous ids starting at 0.
# Note: .reindex() without arguments is effectively a no-op here; the fresh
# ids come from np.arange below.
user_id = ml1m_rating[['uid']].drop_duplicates().reindex()
user_id['userId'] = np.arange(len(user_id))
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(), ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(), ml1m_rating.itemId.max()))

# Load training data
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data

# Specify the training config and train each model in turn
for config in [gmf_config, mlp_config, neumf_config]:
    if config == mlp_config:
        engine = MLPEngine(config)
    elif config == gmf_config:
        engine = GMFEngine(config)
    else:
        engine = NeuMFEngine(config)
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        train_loader = sample_generator.instance_a_train_loader(
            config['num_negative'], config['batch_size'])
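        # Hedged completion of the truncated loop body, mirroring the
        # train/evaluate/save pattern in the neighbouring snippets (assumed API):
        # engine.train_an_epoch(train_loader, epoch_id=epoch)
        # hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)
        # engine.save(config['alias'], epoch, hit_ratio, ndcg)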
elif args.model.lower() == "global_sum_embedding_gmf":
    config['latent_dim'] = config['latent_dim_mf']
    engine = New_Gloabl_sum_embedding_gmfEngine(config)
elif args.model.lower() == "global_sum_embedding_mlp":
    config['latent_dim'] = config['latent_dim_mf']
    engine = New_Gloabl_sum_embedding_MLPEngine(config)

# DataLoader for training
sample_generator = SampleGenerator(ratings=data_rating,
                                   train=data_rating_train,
                                   test=data_rating_test)

# Train this model
evaluate_data = sample_generator.evaluate_data
sample_train_data = sample_generator.sample_train_data
print("TRAINING:---------------------")
engine.evaluate(sample_train_data, epoch_id=0, save=False)
print("TESTING:----------------------")
hit_ratio_max, ndcg_max = engine.evaluate(evaluate_data, epoch_id=0)
for epoch in range(config['num_epoch']):
    print('Epoch {} starts !'.format(epoch))
    print('-' * 80)
    train_loader = sample_generator.instance_a_train_loader(config['num_negative'],
                                                            config['batch_size'])
    engine.train_an_epoch(train_loader, epoch_id=epoch)
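    # Hedged continuation: track the best test metrics each epoch, assuming
    # evaluate() returns (hit_ratio, ndcg) as the epoch-0 call above implies.
    hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)
    hit_ratio_max = max(hit_ratio_max, hit_ratio)
    ndcg_max = max(ndcg_max, ndcg)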
def main(params):
    books_df_sample, movies_df_sample = sample_data(params['books_df'],
                                                    params['movies_df'],
                                                    params['sample_size'])
    if params['use_itemVec']:
        books_df_sample, movies_df_sample = assign_vec(books_df_sample,
                                                       movies_df_sample,
                                                       params['itemVec_file'])
    books_df_sample, movies_df_sample = lbencoder(books_df_sample, movies_df_sample, 5000)
    sample_generator = SampleGenerator(ratings_s=books_df_sample,
                                       ratings_t=movies_df_sample)
    evaluate_data = sample_generator.evaluate_data

    # str() so integer layer sizes join cleanly into the alias string.
    alias = 'conetItemVecc_factor{}neg{}_bz{}_{}_reg_0.0000001_{}'.format(
        params['latent_dim'], params['num_negative'], params['batch_size'],
        ''.join(map(str, params['layers'])), params['id'])
    config = {
        'alias': alias,
        'num_epoch': params['epoch'],
        'batch_size': params['batch_size'],
        'optimizer': 'adam',
        'adam_lr': 1e-3,
        'num_users': books_df_sample['userId'].nunique(),
        'num_items_s': books_df_sample['itemId'].nunique(),
        'num_items_t': movies_df_sample['itemId'].nunique(),
        'device_id': 0,
        'latent_dim': params['latent_dim'],
        'num_negative': params['num_negative'],
        'layers': params['layers'],  # layers[0] is the concat of latent user vector & latent item vector
        'l2_regularization': 0.0000001,  # the MLP model is sensitive to hyperparameters
        'use_cuda': params['use_cuda'],
        'pretrain': False,
        'model_dir': 'checkpoints/{}_Epoch{}_HR_s{:.4f}_NDCG_s{:.4f}_HR_t{:.4f}_NDCG_t{:.4f}.model'
    }

    engine = CoNetEngine(config)
    train_loader = sample_generator.instance_a_train_loader(
        config['num_negative'], config['batch_size'])
    res = []
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t = engine.evaluate(
            evaluate_data, epoch_id=epoch)
        res.append([hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t])
        engine.save(config['alias'], epoch, hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t)
    return res
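# Hypothetical driver for main(); every value below is an illustrative
# assumption, chosen only to satisfy the params keys the function reads.
params = {
    'books_df': 'books_df.csv',
    'movies_df': 'movies_df.csv',
    'sample_size': 5000,
    'use_itemVec': False,
    'itemVec_file': None,
    'latent_dim': 8,
    'num_negative': 4,
    'batch_size': 256,
    'layers': [16, 64, 32, 16, 8],
    'id': 0,
    'epoch': 20,
    'use_cuda': False,
}
res = main(params)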
import os
import sys

w_dir = '/Users/linyishi/Desktop/毕业论文/recommendation_system/CoNet-torch/src'
os.chdir(w_dir)
sys.path.append(w_dir)

import pandas as pd
import numpy as np
from CoNet import CoNetEngine
from data import SampleGenerator

books_df_sample = pd.read_csv('books_df_sample.csv')
movies_df_sample = pd.read_csv('movies_df_sample.csv')
sample_generator = SampleGenerator(ratings_s=books_df_sample,
                                   ratings_t=movies_df_sample)
evaluate_data = sample_generator.evaluate_data

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

conet_config = {
    'alias': 'conet_factor8neg4_bz256_166432168_reg_0.0000001',
    'num_epoch': 20,
    'batch_size': 256,  # 1024,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
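    # The dict is truncated here. Judging from the CoNet config built in
    # main() above, the remaining keys would plausibly include the following
    # (all values are assumptions, not the original settings):
    #     'num_users': ..., 'num_items_s': ..., 'num_items_t': ...,
    #     'latent_dim': 8, 'num_negative': 4,
    #     'layers': [16, 64, 32, 16, 8],  # hinted at by "166432168" in the alias
    #     'l2_regularization': 0.0000001,
    #     'use_cuda': False, 'pretrain': False,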
# Embeddings are stored as stringified lists; eval() parses them back into
# Python lists (safe only because the CSVs are locally generated).
book['user_embedding'] = book['user_embedding'].map(eval)
book['item_embedding'] = book['item_embedding'].map(eval)
movie['user_embedding'] = movie['user_embedding'].map(eval)
movie['item_embedding'] = movie['item_embedding'].map(eval)
music['user_embedding'] = music['user_embedding'].map(eval)
music['item_embedding'] = music['item_embedding'].map(eval)

book_user = list(set(book['userId']))    # 1005 users
movie_user = list(set(movie['userId']))  # 2007 users
music_user = list(set(music['userId']))  # 160 users
book_movie_overlap = list(set(book['userId']).intersection(movie['userId']))   # 195 users
movie_music_overlap = list(set(movie['userId']).intersection(music['userId'])) # 40 users
book_music_overlap = list(set(music['userId']).intersection(book['userId']))   # 23 users

sample_book_generator = SampleGenerator(ratings=book)
evaluate_book_data = sample_book_generator.evaluate_data
sample_movie_generator = SampleGenerator(ratings=movie)
evaluate_movie_data = sample_movie_generator.evaluate_data
sample_music_generator = SampleGenerator(ratings=music)
evaluate_music_data = sample_music_generator.evaluate_data

engine = Engine(config)
train_book_loader = sample_book_generator.instance_a_train_loader(config['batch_size'])
train_music_loader = sample_music_generator.instance_a_train_loader(config['batch_size'])
train_movie_loader = sample_movie_generator.instance_a_train_loader(config['batch_size'])

with open('overlap_movie_music_index', 'r') as f:
    overlap = json.load(f)
movie_overlap = overlap['movie']
music_overlap = overlap['music']
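# Assumed shape of the 'overlap_movie_music_index' file (illustrative only):
# {"movie": [12, 87, ...], "music": [3, 41, ...]}
# i.e., aligned index lists for the users shared between the movie and music
# domains, matching the 40-user movie/music overlap counted above.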
from data import SampleGenerator
from config import Config
from models_engine import ModelEngine
from model.xDeepFM import xDeepFM
from model.mlp import MLP
# from model.DTFM import DTFM
from model.nffm import NFFM
from model.affm import AFFM

engine = ModelEngine(config=Config, model=AFFM)
sample_generator = SampleGenerator()
for epoch in range(Config["training_config"]['num_epoch']):
    print('Epoch {} starts!'.format(epoch))
    print('-' * 80)
    train_loader = sample_generator.instance_a_loader(t="train")
    engine.train_an_epoch(train_loader, epoch_id=epoch)
    # evaluation
    print()
    print("------------start evaluating-----------")
    evaluate_loader = sample_generator.instance_a_loader(t="val")
    auc = engine.evaluate(evaluate_loader, epoch_id=epoch)
    engine.save(epoch, auc=auc)
# close the HDF5 file
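# The comment above implies SampleGenerator holds an open HDF5 handle. A
# hedged cleanup sketch, assuming a close() method exists (hypothetical name):
# sample_generator.close()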
        200,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'l2_regularization': 0,
    'test_size': 500,
    'GNNStep': 3
}

train_data = get_train_data(config['train_path'], config['attack_types'])
valid_data = get_query(config['valid_query_path'])
test_data = get_query(config['test_query_path'])
sample_generator = SampleGenerator(config, train_data, valid_data, test_data)
engine = ProcedureEngine(config)
for epoch in range(config['num_epoch']):
    print('Epoch {} starts !'.format(epoch))
    print('_' * 80)
    # engine.train_an_epoch(sample_generator, epoch)
    # val_f1 = engine.evaluate(sample_generator, epoch)
    # engine.save(config['alias'], epoch, val_f1)
    engine.get_result(sample_generator, epoch)
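# The config dict at the top of this snippet is shown from mid-definition; a
# plausible head, inferred from the keys read later (every value here is an
# assumption, not the original setting):
# config = {
#     'alias': 'gnn_procedure',                     # hypothetical
#     'train_path': 'data/train.json',              # hypothetical
#     'valid_query_path': 'data/valid_query.json',  # hypothetical
#     'test_query_path': 'data/test_query.json',    # hypothetical
#     'attack_types': [...],
#     'num_epoch': 200,  # the bare "200," above most likely belongs to this key
#     ...
# }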