def bl_IsRec_best(a_dataset):
    """Train and evaluate the IsRec_best baseline for each LDA topic number."""
    model_name = 'IsRec_best'  # alternatives: 'IsRec', 'IsRec_best_modified'
    epoch_num = 20
    neighbor_size = 15
    topTopicNum = 3
    cluster_mode = 'LDA'
    cluster_mode_topic_nums = [50]  # other options: 10, 25, 75, 100, 125, 150
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)
    for topic_num in cluster_mode_topic_nums:
        HINRec_model = HINRec(model_name=model_name,
                              semantic_mode='TF_IDF',
                              epoch_num=epoch_num,
                              neighbor_size=neighbor_size,
                              topTopicNum=topTopicNum,
                              cluster_mode=cluster_mode,
                              cluster_mode_topic_num=topic_num)
        # Skip training when a trained weight file already exists.
        if os.path.exists(HINRec_model.weight_path):
            print('have trained,return!')
        else:
            HINRec_model.train(test_data)
            HINRec_model.save_model()
        evalute_by_epoch(HINRec_model, HINRec_model, HINRec_model.model_name,
                         test_data, evaluate_by_slt_apiNum=True)
def bl_IsRec(a_dataset):
    """Train and evaluate the IsRec baseline for every (epoch, topic) setting."""
    model_name = 'IsRec'  # ''
    epoch_nums = [20]  # other options: 15, 100, 1000
    neighbor_size = 15
    topTopicNums = [3]  # e.g. [3, 4, 5, 6]
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)
    for n_epochs in epoch_nums:
        for top_topic_num in topTopicNums:
            HINRec_model = HINRec(model_name=model_name,
                                  epoch_num=n_epochs,
                                  neighbor_size=neighbor_size,
                                  topTopicNum=top_topic_num)
            # Skip training when a trained weight file already exists.
            if os.path.exists(HINRec_model.weight_path):
                print('have trained,return!')
            else:
                HINRec_model.train(test_data)
                # HINRec_model.test_model(test_data)
                HINRec_model.save_model()
            evalute_by_epoch(HINRec_model, HINRec_model,
                             HINRec_model.model_name, test_data,
                             evaluate_by_slt_apiNum=True)
            # ,if_save_recommend_result=True)
def bl_PasRec():
    """Train and evaluate the PasRec baseline (2-path variant).

    Trains a HINRec model (unless trained weights already exist on disk),
    saves it, and evaluates it on the test split.
    """
    model_name = 'PasRec_2path'  # 'PasRec_2path'
    epoch_num = 60  # was 40; 40 was slightly worse than 20
    neighbor_size = 15
    topTopicNum = 3
    args = data_repository.get_args()
    # FIX: the original also bound train_data here but never used it;
    # only the test split is needed because HINRec.train() evaluates on
    # test_data every 20 epochs during training.
    test_data = data_repository.get_ds().test_data
    HINRec_model = HINRec(args, model_name=model_name, epoch_num=epoch_num,
                          neighbor_size=neighbor_size, topTopicNum=topTopicNum)
    if os.path.exists(HINRec_model.weight_path):
        print('have trained,return!')
    else:
        HINRec_model.train(test_data)
        HINRec_model.save_model()
    evalute_by_epoch(HINRec_model, HINRec_model, HINRec_model.model_name,
                     test_data, evaluate_by_slt_apiNum=False)
    # ,if_save_recommend_result=True)
def bl_DHSR(a_dataset):
    """Train the DHSR baseline model and evaluate it on the test split."""
    recommender = DHSR_model()
    core_model = recommender.get_model()
    # a_dataset.transfer()  # drop duplicate samples? when 'newScene' and need_slt_apis=False
    train_data, test_data = get_train_test_data(a_dataset.train_data,
                                                a_dataset.test_data)
    core_model = train_model(recommender, core_model, train_data, test_data,
                             *new_Para.param.train_paras)  # 'monitor loss&acc'
    recommender.save_sth()
    evalute_by_epoch(recommender, core_model, recommender.model_name,
                     test_data)
    # ,if_save_recommend_result=True,evaluate_by_slt_apiNum = True)
def bl_DHSR_new(a_dataset):
    """Train one DHSR variant per number of already-selected services and evaluate each."""
    train_datas, test_datas = a_dataset.transfer_false_test_DHSR(
        if_reduct_train=True)  # whether to reduce the training set
    # For each count of selected services, train the matching model and evaluate it.
    for slt_num in range(1, new_Para.param.slt_item_num + 1):
        train_data = train_datas[slt_num - 1]
        test_data = test_datas[slt_num - 1]
        # old_new = 'new', 'new_sigmoid'; 'new_reduct' works best
        recommender = DHSR_model(old_new='new_reduct', slt_num=slt_num)
        core_model = recommender.get_model()
        core_model = train_model(recommender, core_model, train_data,
                                 test_data,
                                 *new_Para.param.train_paras)  # 'monitor loss&acc'
        evalute_by_epoch(recommender, core_model, recommender.model_name,
                         test_data, evaluate_by_slt_apiNum=True)
        recommender.save_sth()
        print('DHSR, slt_num:{}, train_predict,done!'.format(slt_num))
def DINRec(a_dataset, new_old='new'):
    """Train the CI model, then the DIN-based recommender built on it, and evaluate."""
    train_data, test_data = a_dataset.train_data, a_dataset.test_data
    ci_recommender = CI_Model(new_old)  # 'old'
    ci_recommender.prepare()
    ci_core = ci_recommender.get_model()
    ci_core = train_model(ci_recommender, ci_core, train_data, test_data,
                          *new_Para.param.train_paras)
    # ,true_candidates_dict=HINRec_model.get_true_candi_apis()  'monitor loss&acc'
    din_recommender = DIN_Rec(ci_recommender,
                              new_Para.param.predict_fc_unit_nums)
    din_recommender.prepare()
    din_core = din_recommender.get_model()
    din_core = train_model(din_recommender, din_core, train_data, test_data,
                           *new_Para.param.train_paras)
    # ,true_candidates_dict=HINRec_model.get_true_candi_apis()  'monitor loss&acc'
    evalute_by_epoch(din_recommender, din_core, din_recommender.simple_name,
                     test_data, if_save_recommend_result=True,
                     evaluate_by_slt_apiNum=True)
def train_early_stop(recommend_model, model, train_data, test_data):
    """Train with early stopping on validation loss, then evaluate the result.

    :param recommend_model: recommender wrapper (instance building, optimizer, naming)
    :param model: the underlying model core whose weights are saved
    :param train_data: training split; the LAST element is the label array
    :param test_data: test split forwarded to evalute_by_epoch
    :return: the trained model core
    """
    # FIX: was `True if ...pairwise else False` — redundant ternary.
    pairwise = bool(data_repository.get_args().pairwise)
    train_labels = train_data[-1]
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], pairwise_train_phase_flag=pairwise)
    # FIX: renamed from `train_model`, which shadowed the module-level
    # train_model() helper used elsewhere in this file.
    fit_model = recommend_model.get_pairwise_model() if pairwise else model
    if pairwise:
        # Pairwise ranking: the model's output IS the loss value.
        fit_model.compile(optimizer=recommend_model.optimizer,
                          loss=lambda y_true, y_pred: y_pred,
                          metrics=['accuracy'])
    else:
        fit_model.compile(optimizer=recommend_model.optimizer,
                          loss='binary_crossentropy',
                          metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                                   verbose=2, mode='min')
    hist = fit_model.fit(
        [*train_instances_tuple], train_labels,
        epochs=data_repository.get_args().num_epochs,
        batch_size=data_repository.get_args().small_batch_size,
        callbacks=[early_stopping],
        validation_split=data_repository.get_args().validation_split,
        shuffle=True)
    model.save_weights(data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, 'min_loss'))  # !!! fix
    model_name = recommend_model.get_simple_name() + recommend_model.get_name(
    ) + '_min_loss'
    save_loss_acc(hist, model_name, if_multi_epoch=True)
    # FIX: dropped the unused `epoch_evaluate_result =` binding.
    evalute_by_epoch(recommend_model, model, model_name, test_data)
    return model
def train_best_NDCG_model(recommend_model, model, train_data, test_data,
                          true_candidates_dict=None, CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """Train epoch by epoch, evaluate after each, and keep the best-NDCG@5 model.

    :param recommend_model: the overall recommender (naming, instance building)
    :param model: the model core whose weights are saved/loaded
    :param train_data: training split (dict-like with a 'label' entry)
    :param test_data: test split
    :param true_candidates_dict: optional candidate restriction for evaluation
    :param CI_start_test_epoch: for CI_Model, skip evaluation for this many first epochs
    :param earlyStop_epochs: stop after this many epochs without NDCG@5 improvement
    :return: the model core with the best-epoch weights loaded
    """
    print('training_save_best_NDCG_model...')
    epoch_evaluate_results = []
    # model: pairwise wrapper when configured, otherwise the plain core
    train_model = recommend_model.get_pairwise_model(
    ) if data_repository.get_args().pairwise else model
    # data
    train_instances_dict = recommend_model.get_instances(
        train_data,
        pairwise_train_phase_flag=data_repository.get_args().pairwise)
    train_labels = train_data.get('label')
    if data_repository.get_args().final_activation == 'softmax':
        # a softmax head needs one-hot labels
        train_labels = utils.to_categorical(train_labels, num_classes=2)
    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(data_repository.get_args().num_epochs):
        if epoch == 0:  # compile once, before the first fit
            # loss_ = lambda y_true, y_pred: y_pred if data_repository.get_args().pairwise else 'binary_crossentropy'
            # train_model.compile(optimizer=recommend_model.optimizer, loss=loss_, metrics=['accuracy'])
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile,done!')
        print('Epoch {}'.format(epoch))
        hist = train_model.fit(
            train_instances_dict, np.array(train_labels),
            batch_size=data_repository.get_args().batch_size,
            epochs=1, verbose=1, shuffle=True,
            validation_split=data_repository.get_args().validation_split)
        print('Epoch {}, train done!'.format(epoch))
        # Record dataset / architecture / training settings (non-empty name
        # only on epoch 0).
        record_name = recommend_model.get_name() + data_repository.get_args(
        ).train_name if epoch == 0 else ''
        # per-epoch record of test-set effect, written to evalute.csv
        save_loss_acc(hist, record_name, epoch=epoch)
        # CI's first few epochs perform poorly; skip testing them for speed.
        first_test_epoch = CI_start_test_epoch if isinstance(
            recommend_model, CI_Model) else 0
        if epoch < first_test_epoch:
            epoch_evaluate_results.append(None)
            continue
        # evaluate this epoch
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model, model, record_name, test_data,
            record_time=True if epoch == 0 else False,
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)
        # only persist weights when NDCG@5 improves  TODO
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                data_repository.get_ds().new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        else:
            if epoch - best_epoch >= earlyStop_epochs:
                # no improvement for several epochs: stop early
                break
    # record the best epoch and the best NDCG@5
    with open(
            data_repository.get_ds().new_best_epoch_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(
            data_repository.get_ds().new_best_NDCG_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))
    # record the best metrics
    csv_table_name = 'best_indicaters\n'
    summary(evaluate_path, csv_table_name,
            epoch_evaluate_results[best_epoch],
            data_repository.get_args().topKs)
    # check whether the word-embedding matrix changed, especially the padding zeros
    # print('some embedding parameters after {} epoch:'.format(epoch))
    # print(recommend_model.embedding_layer.get_weights()[0][:2])
    # Delete the saved parameters of all non-best epochs (best-effort).
    # FIX: the original wrapped this cleanup AND load_weights in
    # try/finally with `return model` inside the finally, which silently
    # swallowed ANY exception — including a failed load_weights, so the
    # caller could receive a model without the best-epoch weights.
    # Cleanup failures are now ignored explicitly; load_weights errors
    # propagate.
    for i in range(data_repository.get_args().num_epochs):
        temp_path = data_repository.get_ds().new_model_para_path.format(
            recommend_model.model_dir, i)
        if i != best_epoch and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass
    model.load_weights(data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, best_epoch))
    return model
def test_model(self,test_data):
    """Evaluate this (already trained) model on *test_data* via evalute_by_epoch.

    The instance serves as both the recommender wrapper and the model
    argument; results are reported under ``self.model_name``.
    """
    evalute_by_epoch(self, self, self.model_name, test_data)