def official(self, model_path, test_data):
    """Run the saved model over ``test_data`` and collect its predictions.

    The network is built from ``self.preproc.train_embedding``, the weights
    at ``model_path`` are loaded, and ``test_data['data']`` is walked batch
    by batch in evaluation mode. A progress line is printed after every
    50th batch.

    Args:
        model_path: Location handed to ``self.load_model``.
        test_data: Mapping whose ``'data'`` entry is passed to ``BatchGen``.

    Returns:
        A ``(predictions, confidence, final_json)`` tuple of lists, each the
        concatenation of the corresponding value returned by
        ``self.predict`` for every batch.
    """
    print('-----------------------------------------------')
    print("Initializing model...")
    self.setup_model(self.preproc.train_embedding)
    self.load_model(model_path)

    print("Predicting in batches...")
    test_batches = BatchGen(
        self.opt,
        test_data['data'],
        self.use_cuda,
        self.preproc.train_vocab,
        self.preproc.train_char_vocab,
        evaluation=True,
    )

    predictions, confidence, final_json = [], [], []
    # Count batches from 1 so the progress line reads "50 / N", "100 / N", ...
    for batches_seen, test_batch in enumerate(test_batches, start=1):
        if batches_seen % 50 == 0:
            print(batches_seen, '/', len(test_batches))
        phrase, phrase_score, pred_json = self.predict(test_batch)
        predictions.extend(phrase)
        confidence.extend(phrase_score)
        final_json.extend(pred_json)

    return predictions, confidence, final_json
def train(self):
    """Train the network and periodically score it on the dev set.

    Vocabularies and the embedding come from ``self.preproc.load_data()``.
    The model is built with ``self.setup_model`` and, when ``'RESUME'`` is
    present in ``self.opt``, restored from ``opt['datadir']/opt['MODEL_PATH']``.
    Epochs run from ``self.epoch_start`` up to (excluding) ``opt['EPOCH']``.

    Before a batch is trained on, the dev set is evaluated if the batch is
    the last one of the epoch, the first batch of epoch 0 in resume mode, or
    a positive multiple of 1500. Whenever the dev F1 beats the best seen so
    far, ``best_model.pt``, ``prediction.json`` and
    ``score_per_instance.json`` are written into ``self.saveFolder``.
    """
    self.isTrain = True
    self.getSaveFolder()
    self.saveConf()
    self.vocab, self.char_vocab, vocab_embedding = self.preproc.load_data()
    self.log('-----------------------------------------------')
    self.log("Initializing model...")
    self.setup_model(vocab_embedding)

    if 'RESUME' in self.opt:
        self.load_model(
            os.path.join(self.opt['datadir'], self.opt['MODEL_PATH']))

    print('Loading train json...')
    train_json_path = os.path.join(
        self.opt['FEATURE_FOLDER'],
        self.data_prefix + 'train-preprocessed.json')
    with open(train_json_path, 'r') as handle:
        train_data = json.load(handle)

    print('Loading dev json...')
    dev_json_path = os.path.join(
        self.opt['FEATURE_FOLDER'],
        self.data_prefix + 'dev-preprocessed.json')
    with open(dev_json_path, 'r') as handle:
        dev_data = json.load(handle)

    best_f1_score = 0.0
    total_epochs = self.opt['EPOCH']
    for epoch in range(self.epoch_start, total_epochs):
        self.log('Epoch {}'.format(epoch))
        self.network.train()
        epoch_started_at = datetime.now()
        train_batches = BatchGen(self.opt, train_data['data'],
                                 self.use_cuda, self.vocab, self.char_vocab)
        dev_batches = BatchGen(self.opt, dev_data['data'], self.use_cuda,
                               self.vocab, self.char_vocab, evaluation=True)

        for i, batch in enumerate(train_batches):
            evaluate_now = (
                i == len(train_batches) - 1
                or (epoch == 0 and i == 0 and 'RESUME' in self.opt)
                or (i > 0 and i % 1500 == 0)
            )
            if evaluate_now:
                print('Saving folder is', self.saveFolder)
                print('Evaluating on dev set...')
                predictions, confidence = [], []
                dev_answer, final_json = [], []
                for dev_batch in dev_batches:
                    phrase, phrase_score, pred_json = self.predict(dev_batch)
                    final_json.extend(pred_json)
                    predictions.extend(phrase)
                    confidence.extend(phrase_score)
                    dev_answer.extend(dev_batch[-3])  # answer_str

                result, all_f1s = score(predictions, dev_answer, final_json)
                f1 = result['f1']

                if f1 > best_f1_score:
                    # New best: persist the weights and the predictions
                    # that produced this score.
                    self.save_for_predict(
                        os.path.join(self.saveFolder, 'best_model.pt'), epoch)
                    best_f1_score = f1

                    predictions_path = os.path.join(self.saveFolder,
                                                    'prediction.json')
                    with open(predictions_path, 'w') as output_file:
                        json.dump(final_json, output_file)

                    per_instance = [
                        {
                            'id': instance['id'],
                            'turn_id': instance['turn_id'],
                            'f1': instance_f1,
                        }
                        for instance, instance_f1 in zip(final_json, all_f1s)
                    ]
                    per_instance_path = os.path.join(
                        self.saveFolder, 'score_per_instance.json')
                    with open(per_instance_path, 'w') as output_file:
                        json.dump(per_instance, output_file)

                self.log(
                    "Epoch {0} - dev: F1: {1:.3f} (best F1: {2:.3f})".format(
                        epoch, f1, best_f1_score))
                self.log("Results breakdown\n{0}".format(result))

            self.update(batch)

            if i % 100 == 0:
                # Estimate the time left in this epoch from the average time
                # per batch so far; drop the fractional seconds.
                elapsed = datetime.now() - epoch_started_at
                batches_left = len(train_batches) - i - 1
                remaining = str(elapsed / (i + 1) * batches_left).split('.')[0]
                self.log(
                    'updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
                        self.updates, self.train_loss.avg, remaining))

        print("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / total_epochs))
        print('Config file is at ' + self.opt['confFile'])
def train(self):
    """Train batch by batch, evaluating on the dev data along the way.

    Each training batch is passed to ``self.update``, which performs the
    forward pass, gradient computation and parameter update. Every 1500
    batches (and at the end of each epoch, and on the first batch when
    resuming) ``self.predict`` is run over the dev data and scored. The
    parameters of the best-scoring model so far are stored in
    ``self.saveFolder``.
    """
    self.isTrain = True  # mark training mode
    self.getSaveFolder()
    self.saveConf()
    # Vocabularies and embedding come from the preprocessor.
    self.vocab, self.char_vocab, vocab_embedding = self.preproc.load_data()
    self.log('-----------------------------------------------')
    self.log('Initializing model...')
    self.setup_model(vocab_embedding)  # build the model

    # In resume mode, restore the previously stored model.
    if 'RESUME' in self.opt:
        self.load_model(
            os.path.join(self.opt['datadir'], self.opt['MODEL_PATH']))

    # Read the preprocessed training data.
    print('Loading train json')
    train_json_path = os.path.join(
        self.opt['FEATURE_FOLDER'],
        self.data_prefix + 'train-preprocessed.json')
    with open(train_json_path, 'r') as handle:
        train_data = json.load(handle)

    # Read the preprocessed dev data.
    print('Loading dev json')
    dev_json_path = os.path.join(
        self.opt['FEATURE_FOLDER'],
        self.data_prefix + 'dev-preprocessed.json')
    with open(dev_json_path, 'r') as handle:
        dev_data = json.load(handle)

    best_f1_score = 0.0  # highest dev F1 observed during training
    epoch_count = self.opt['EPOCH']  # EPOCH in the config = number of epochs
    for epoch in range(self.epoch_start, epoch_count):
        self.log('Epoch {}'.format(epoch))
        # Switch the network to training mode.
        self.network.train()
        epoch_began = datetime.now()
        # Batch iterator over the training data.
        train_batches = BatchGen(self.opt, train_data['data'],
                                 self.use_cuda, self.vocab, self.char_vocab)
        # Batch iterator over the dev data.
        dev_batches = BatchGen(self.opt, dev_data['data'], self.use_cuda,
                               self.vocab, self.char_vocab, evaluation=True)

        for i, batch in enumerate(train_batches):
            # Evaluate on the dev data at the end of each epoch, on the first
            # batch in resume mode, or every 1500 batches.
            time_to_evaluate = (
                i == len(train_batches) - 1
                or (epoch == 0 and i == 0 and 'RESUME' in self.opt)
                or (i > 0 and i % 1500 == 0)
            )
            if time_to_evaluate:
                print('Saving folder is', self.saveFolder)
                print('Evaluating on dev set...')
                predictions, confidence = [], []
                dev_answer, final_json = [], []
                for dev_batch in dev_batches:
                    # A prediction consists of the answer text, the answer
                    # score and the JSON-formatted result.
                    phrase, phrase_score, pred_json = self.predict(dev_batch)
                    final_json.extend(pred_json)
                    predictions.extend(phrase)
                    confidence.extend(phrase_score)
                    dev_answer.extend(dev_batch[-3])  # answer_str

                # Score the predictions against the reference answers.
                result, all_f1s = score(pred=predictions,
                                        truth=dev_answer,
                                        final_json=final_json)
                f1 = result['f1']

                # Store the model when its F1 beats every earlier one.
                if f1 > best_f1_score:
                    self.save_for_predict(
                        os.path.join(self.saveFolder, 'best_model.pt'), epoch)
                    best_f1_score = f1

                    predictions_path = os.path.join(self.saveFolder,
                                                    'prediction.json')
                    with open(predictions_path, 'w') as output_file:
                        json.dump(final_json, output_file)

                    per_instance = [
                        {
                            'id': instance['id'],
                            'turn_id': instance['turn_id'],
                            'f1': instance_f1,
                        }
                        for instance, instance_f1 in zip(final_json, all_f1s)
                    ]
                    per_instance_path = os.path.join(
                        self.saveFolder, 'score_per_instance.json')
                    with open(per_instance_path, 'w') as output_file:
                        json.dump(per_instance, output_file)

                self.log(
                    'Epoch {0} - dev F1: {1:.3f} (best F1: {2:.3f})'.format(
                        epoch, f1, best_f1_score))
                self.log('Results breakdown\n{0}'.format(result))

            # Forward pass, gradient computation and parameter update for
            # this batch.
            self.update(batch)

            if i % 100 == 0:
                # Remaining time is extrapolated from the mean time per batch
                # in this epoch; fractional seconds are cut off.
                elapsed = datetime.now() - epoch_began
                batches_left = len(train_batches) - i - 1
                remaining = str(elapsed / (i + 1) * batches_left).split('.')[0]
                self.log(
                    'updates[{0: 6}] train loss[{1: .5f}] remaining[{2}]'.format(
                        self.updates, self.train_loss.avg, remaining))

        print('PROGRESS: {0:.2F}%'.format(100.0 * (epoch + 1) / epoch_count))
        print('Config file is at ' + self.opt['confFile'])
def train(self):
    """Train the model, evaluating on the dev data every 1000 updates.

    The model is built with ``self.setup_model()`` and, when
    ``'CHECK_POINT'`` is present in ``self.opt``, restored from
    ``opt['datadir']/opt['CHECK_POINT_PATH']``. Epochs run from
    ``self.epoch_start`` up to (excluding) ``opt['EPOCH']``.

    Before a batch is trained on, if ``self.updates`` is a positive multiple
    of 1000 the dev batches are run through ``self.predict``. The flat
    prediction list is written to ``opt['OUTPUT_FILE'] +
    "prediction_file.json"`` and scored with ``eval_fn`` against the data
    in ``opt['Quac_DEV_FILE']``. When the resulting F1 beats the best seen
    so far, ``best_model.pt``, ``prediction.json`` and
    ``score_per_instance.json`` are written into ``self.result_file``.
    """
    self.getSaveFolder()
    self.saveConf()
    self.result_file = self.opt['RESULT_FILE']
    self.log('-----------------------------------------------')
    self.log("Initializing model...")
    self.setup_model()

    if 'CHECK_POINT' in self.opt:
        model_path = os.path.join(self.opt['datadir'],
                                  self.opt['CHECK_POINT_PATH'])
        self.load_model(model_path)

    print('Loaing train json...')
    train_json_path = os.path.join(
        self.opt['FEATURE_FOLDER'],
        self.data_prefix + 'train-preprocessed.json')
    with open(train_json_path, 'r') as f:
        train_data = json.load(f)

    dev_json_path = os.path.join(
        self.opt['FEATURE_FOLDER'],
        self.data_prefix + 'dev-preprocessed.json')
    with open(dev_json_path, 'r') as f:
        dev_data = json.load(f)

    output_prediction_file = self.opt['OUTPUT_FILE'] + "prediction_file.json"
    # Best dev F1 across the whole run (it is not reset between epochs).
    best_f1_score = 0
    num_epochs = self.opt['EPOCH']

    for epoch in range(self.epoch_start, num_epochs):
        self.log('\n########Epoch {}########\n'.format(epoch))
        # NOTE(review): unlike the sibling trainers, this variant does not
        # call self.network.train() here -- presumably self.update() handles
        # the mode switch; confirm.
        start_time = datetime.now()
        train_batches = BatchGen(self.opt, train_data['data'],
                                 self.use_cuda, is_training=True)
        dev_batches = BatchGen(self.opt, dev_data['data'],
                               self.use_cuda, is_training=False)

        for i, batch in enumerate(train_batches):
            # Evaluate before the update whenever the global update counter
            # is a positive multiple of 1000.
            if self.updates > 0 and self.updates % 1000 == 0:
                print('Saving folder is', self.saveFolder)
                print('Evaluating on dev set......')

                # final_json keeps one entry per dev batch;
                # all_predictions_list is flat across batches.
                final_json, all_predictions_list = [], []
                for dev_batch in dev_batches:
                    pred_json, all_predictions, _ = self.predict(dev_batch)
                    final_json.append(pred_json)
                    all_predictions_list += all_predictions

                with open(output_prediction_file, "w") as writer:
                    writer.write(
                        json.dumps(all_predictions_list, indent=4) + "\n")

                with open(self.opt['Quac_DEV_FILE'], 'r') as f:
                    val_file = json.load(f)['data']

                # Regroup as {id: {turn_id: [answer, 'y', 'y']}} for eval_fn.
                # NOTE(review): the two constant 'y' entries look like
                # placeholder yes/no and follow-up labels -- confirm against
                # eval_fn.
                grouped_predictions = {}
                for r in all_predictions_list:
                    turns = grouped_predictions.setdefault(r['id'], {})
                    turns[r['turn_id']] = [r['answer'], 'y', 'y']

                metric_json = eval_fn(val_file, grouped_predictions, False)
                final_f1 = metric_json['f1']

                if best_f1_score != 0:
                    print("Best F1 : {}".format(max(final_f1, best_f1_score)))

                if final_f1 > best_f1_score:
                    model_file = os.path.join(self.result_file,
                                              'best_model.pt')
                    self.save_for_predict(model_file, epoch)
                    best_f1_score = final_f1

                    pred_json_file = os.path.join(self.result_file,
                                                  'prediction.json')
                    with open(pred_json_file, 'w',
                              encoding='utf-8') as output_file:
                        json.dump(final_json, output_file, ensure_ascii=False)

                    # One record per dev batch, taken from its first
                    # prediction. The keys previously held each other's
                    # values ('id' <- turn_id, 'turn_id' <- id); they now
                    # map straight through. Empty batch results are skipped
                    # instead of raising IndexError.
                    # NOTE(review): only the first prediction of each batch
                    # is recorded -- confirm this is intended.
                    score_per_instance = [
                        {
                            'id': batch_preds[0]['id'],
                            'turn_id': batch_preds[0]['turn_id'],
                        }
                        for batch_preds in final_json
                        if batch_preds
                    ]
                    score_per_instance_json_file = os.path.join(
                        self.result_file, 'score_per_instance.json')
                    with open(score_per_instance_json_file,
                              'w') as output_file:
                        json.dump(score_per_instance, output_file)

                self.log(
                    "Epoch {0} - dev: F1: {1:.3f} (best F1: {2:.3f})\n".format(
                        epoch, final_f1, best_f1_score))

            self.update(batch)

            if i % 100 == 0:
                # Estimate the time left in this epoch from the mean time per
                # batch so far; fractional seconds are cut off.
                elapsed = datetime.now() - start_time
                batches_left = len(train_batches) - i - 1
                remaining = str(elapsed / (i + 1) * batches_left).split('.')[0]
                self.log(
                    '**********************EPOCH[{0:2}] i[{1:4}] updates[{2:6}] '
                    'train loss[{3:.5f}] remaining[{4}]'.format(
                        epoch, i, self.updates, self.train_loss.avg,
                        remaining))

        print("PROGRESS: {0:.2f}%".format(100.0 * (epoch + 1) / num_epochs))
        print('Config file is at ' + self.opt['confFile'])