def train_model(self, train_data, epoch_number=1):
    '''Train the underlying TensorFlow graph variables.

    Keyword arguments:
    self -- SRL model
    train_data -- training data, list of preprocessing.SentenceData instances.
    epoch_number -- number of epochs used during training.
    '''
    # Begin training
    with self._get_session() as session:
        # initialize variables
        self.init.run()
        # generate stream of data
        batch_generator = Batch_Generator(train_data, self.batch_size)
        # if possible, restore the variables' values from a previous session
        try:
            self.saver.restore(session, self.savefile)
        except Exception as exp:
            print(exp)
        writer = tf.summary.FileWriter(self.savefile, graph=self.graph)
        # start a new epoch -----------------------------------------------
        for epoch in range(epoch_number):
            print("Epoch number: " + str(epoch + 1))
            # start a new iteration in the epoch --------------------------
            for step in tqdm(range(len(batch_generator))):
                # get batch data
                batch = batch_generator.generate_next_batch()
                feed_dict = self._get_feed_dict(batch)
                # take a summary every 50 steps
                if step % 50 == 0 and self.savefile is not None and self.profile_data:
                    summ = session.run(self.summary, feed_dict=feed_dict)
                    writer.add_summary(summ, step)
                # optimize weights
                session.run(self.optimizer, feed_dict=feed_dict)
            # end of iteration -----------------------------------------
            # saving graph variables after epoch
            print('saving graph variables ...')
            self.saver.save(session, self.savefile)
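# A minimal usage sketch (illustrative, not from the original source):
# `SRLModel` and `load_training_data` are hypothetical placeholders for
# whatever concrete model class and data-loading helper the repository
# provides; only the train_model signature above is taken from the code.
#
# model = SRLModel(batch_size=32, savefile='../tmp/srl_model.ckpt')
# train_data = load_training_data('../data/train.conll2009')  # list of SentenceData
# model.train_model(train_data, epoch_number=5)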
sample_histograms(valid_sample, valid_labels, train_sample, train_labels,
                  train_weights, bins, args.output_dir)
#sys.exit()
callbacks = callback(args.model_out, args.patience, args.metrics)
if args.generator == 'ON':
    del train_sample
    if train_weights is not None:
        train_weights = gen_weights(args.n_train, weight_idx, train_weights)
    print('\nLAUNCHING GENERATOR FOR', np.diff(args.n_train)[0], 'TRAINING SAMPLES')
    eval_gen = Batch_Generator(data_files, args.n_eval, input_data, args.n_tracks,
                               args.n_classes, valid_batch_size, args.valid_cuts,
                               scaler, t_scaler, shuffle='OFF')
    train_gen = Batch_Generator(data_files, args.n_train, input_data, args.n_tracks,
                                args.n_classes, train_batch_size, args.train_cuts,
                                scaler, t_scaler, train_weights, shuffle='ON')
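# Illustrative continuation, not from the original script: generators like
# these are typically consumed by a Keras-style fit call. `model` and
# `args.n_epochs` are hypothetical names here; `callbacks` comes from above.
#
# history = model.fit(train_gen, validation_data=eval_gen,
#                     epochs=args.n_epochs, callbacks=callbacks, verbose=2)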
def predict(self, data_loader, data_set=None, hyper_param={}, use_cuda=None, rebuild=False):
    '''
    Predict the contents of test_data_mat_dict['y_rel_matrix'] and write the
    predictions back into that matrix; before prediction all entries are 0.
    :param
        @data_loader: (KGDataLoader),
        @hyper_param: (dict)
            @hyper_param['batch_size']  ## default 100
            @hyper_param['issave']      ## default False
            @hyper_param['result_dir']  ## default './result/'
    :return
        @result: list, len = number of sentences
            case = result[0]
            case['input']
            case['relation_list']
                r = case['relation_list'][0]
                r['relation']: '成立日期' (founding date)
                r['head']: '百度' (Baidu)
                r['tail']: '2016年04月08日'
    '''
    use_cuda = self.use_cuda if use_cuda is None else use_cuda
    if use_cuda:
        print('use cuda=========================')
        self.cuda()

    BATCH_SIZE = hyper_param.get('batch_size', 100)
    ISSAVE = hyper_param.get('issave', False)
    result_dir = hyper_param.get('result_dir', './result/')
    DATA_TYPE = 'rel'

    test_dataset = data_loader.dataset.test_dataset if data_set is None else data_set
    if rebuild:
        test_data_mat_dict = data_loader.transform(test_dataset, istest=True, data_type=DATA_TYPE)
    ## cache the preprocessed text so it can be reloaded directly while tuning, to save time *WARNING*
    else:
        old_test_dict_path = os.path.join(result_dir, 'test_data_mat_dict.pkl')
        if os.path.exists(old_test_dict_path):
            test_data_mat_dict = data_loader.load_preprocessed_data(old_test_dict_path)
            log('Reload preprocessed data successfully~')
        else:
            test_data_mat_dict = data_loader.transform(test_dataset, istest=True,
                                                       data_type=DATA_TYPE, ratio=0)
            data_loader.save_preprocessed_data(old_test_dict_path, test_data_mat_dict)
    print('test_dataset_length:', len(test_dataset))
    print('test_data_mat_dict_length:', test_data_mat_dict['cha_matrix'].shape)

    data_generator = Batch_Generator(test_data_mat_dict, batch_size=BATCH_SIZE,
                                     data_type=DATA_TYPE, isshuffle=False)
    self.eval()  # disable the dropout and batch-norm layers

    total_output_rel = []
    all_cnt = len(test_data_mat_dict['cha_matrix'])
    log('Predict start!', 0)
    for cnt, data_batch in enumerate(data_generator):
        x, pos, reltype, y_rel, y_ent, lens, data_list = data_batch
        pre_paths = self._output(x, reltype, lens)  ## pre_paths, (batch_size, T), torch.tensor
        if use_cuda:
            pre_paths = pre_paths.data.cpu().numpy().astype(int)
        else:
            pre_paths = pre_paths.data.numpy().astype(int)
        total_output_rel.append(pre_paths)
        if (cnt + 1) % 10 == 0:
            log(f'[PREDICT] step {(cnt+1)*BATCH_SIZE}/{all_cnt}', 1)

    ## mask positions whose index exceeds the sentence length
    pred_output = np.vstack(total_output_rel)  ### (N, max_length), numpy.array
    len_list = test_data_mat_dict['sentence_length']  ### (N), list
    pred_output = self._padding_mask(pred_output, len_list[:len(pred_output)])

    ## transform back to the dict form
    test_data_mat_dict['y_rel_matrix'] = pred_output
    result = data_loader.transform_back(test_data_mat_dict, data_type=DATA_TYPE)

    ## save the result
    if ISSAVE and result_dir:
        save_file = os.path.join(result_dir, 'predict.json')
        with open(save_file, 'w') as f:
            for data in result:
                temps = json.dumps(data, ensure_ascii=False)
                f.write(temps + '\n')
        log(f'save the predict result in {save_file}')
    print('final predict length:', len(result))
    return result
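# Usage sketch (illustrative; `model` and `loader` stand in for a trained
# instance of this class and a KGDataLoader, both hypothetical names):
#
# result = model.predict(loader, hyper_param={'batch_size': 64, 'issave': True,
#                                             'result_dir': './result/'})
# for r in result[0]['relation_list']:
#     print(r['relation'], r['head'], r['tail'])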
def train_model(self, data_loader: KGDataLoader, train_dataset=None, eval_dataset=None,
                hyper_param={}, use_cuda=None, rebuild=False):
    '''
    :param
        @data_loader: (KGDataLoader),
        @result_dir: (str) path to save the trained model and extracted dictionary
        @hyper_param: (dict)
            @hyper_param['EPOCH']
            @hyper_param['batch_size']
            @hyper_param['learning_rate_upper']
            @hyper_param['learning_rate_bert']
            @hyper_param['bert_finetune']
            @hyper_param['visualize_length']  # num of batches between two check points
            @hyper_param['isshuffle']
            @hyper_param['result_dir']
            @hyper_param['model_name']
    :return
        @loss_record,
        @score_record
    '''
    use_cuda = self.use_cuda if use_cuda is None else use_cuda
    if use_cuda:
        print('use cuda=========================')
        self.cuda()

    EPOCH = hyper_param.get('EPOCH', 3)
    BATCH_SIZE = hyper_param.get('batch_size', 4)
    LEARNING_RATE_upper = hyper_param.get('learning_rate_upper', 1e-2)
    LEARNING_RATE_bert = hyper_param.get('learning_rate_bert', 5e-5)
    bert_finetune = hyper_param.get('bert_finetune', True)
    visualize_length = hyper_param.get('visualize_length', 10)
    result_dir = hyper_param.get('result_dir', './result/')
    model_name = hyper_param.get('model_name', 'model.p')
    is_shuffle = hyper_param.get('isshuffle', True)
    DATA_TYPE = 'rel'

    train_dataset = data_loader.dataset.train_dataset if train_dataset is None else train_dataset
    if rebuild:
        train_data_mat_dict = data_loader.transform(train_dataset, data_type=DATA_TYPE)
    ## cache the preprocessed text so it can be reloaded directly while tuning, to save time *WARNING*
    else:
        old_train_dict_path = os.path.join(result_dir, 'train_data_mat_dict.pkl')
        if os.path.exists(old_train_dict_path):
            train_data_mat_dict = data_loader.load_preprocessed_data(old_train_dict_path)
            log('Reload preprocessed data successfully~')
        else:
            # train_data_mat_dict = data_loader.transform(train_dataset, data_type=DATA_TYPE)
            train_data_mat_dict = data_loader.transform(train_dataset, istest=False,
                                                        data_type=DATA_TYPE, ratio=0)
            data_loader.save_preprocessed_data(old_train_dict_path, train_data_mat_dict)

    data_generator = Batch_Generator(train_data_mat_dict, batch_size=BATCH_SIZE,
                                     data_type=DATA_TYPE, isshuffle=is_shuffle)
    print('train_data_set_length:', len(train_dataset))
    print('train_data_mat_dict_length:', train_data_mat_dict['cha_matrix'].shape)

    all_param = list(self.named_parameters())
    bert_param = [p for n, p in all_param if 'bert' in n]
    other_param = [p for n, p in all_param if 'bert' not in n]

    if bert_finetune:
        optimizer_grouped_parameters = [
            {'params': other_param, 'lr': LEARNING_RATE_upper},
            {'params': bert_param, 'lr': LEARNING_RATE_bert},
        ]
        optimizer = torch.optim.Adam(optimizer_grouped_parameters)
        log(f'****BERT_finetune, learning_rate_upper: {LEARNING_RATE_upper}, '
            f'learning_rate_bert: {LEARNING_RATE_bert}', 0)
    else:
        optimizer = torch.optim.Adam(other_param, lr=LEARNING_RATE_upper)
        log(f'****BERT_fix, learning_rate_upper: {LEARNING_RATE_upper}', 0)
    ##TODO: scheduler.step() below needs this LambdaLR; my_lr_lambda is defined elsewhere in the repo
    scheduler = LambdaLR(optimizer, lr_lambda=my_lr_lambda)
    # scheduler = transformers.optimization.get_cosine_schedule_with_warmup(
    #     optimizer, num_warmup_steps=int(EPOCH*0.2), num_training_steps=EPOCH)

    all_cnt = len(train_data_mat_dict['cha_matrix'])
    log(f'{model_name} Training start!', 0)
    loss_record = []
    score_record = []
    max_score = -1

    eval_param = {'batch_size': 100, 'issave': False, 'result_dir': result_dir}

    for epoch in range(EPOCH):
        self.train()
        log(f'EPOCH: {epoch+1}/{EPOCH}', 0)
        loss = 0.0
        for cnt, data_batch in enumerate(data_generator):
            x, pos, reltype, y_rel, y_ent, lens, data_list = data_batch
            loss_avg = self._loss(x, reltype, y_rel, lens)
            optimizer.zero_grad()
            loss_avg.backward()
            optimizer.step()

            loss += loss_avg
            if use_cuda:
                loss_record.append(loss_avg.cpu().item())
            else:
                loss_record.append(loss_avg.item())

            if (cnt + 1) % visualize_length == 0:
                loss_cur = loss / visualize_length
                log(f'[TRAIN] step: {(cnt+1)*BATCH_SIZE}/{all_cnt} | loss: {loss_cur:.4f}', 1)
                loss = 0.0
                # self.eval()
                # print(data_list[0]['input'])
                # pre_paths = self._output(x, reltype, lens)
                # print('predict-path')
                # print(pre_paths[0])
                # print('target-path')
                # print(y_rel[0])
                # self.train()

        temp_score = self.eval_model(data_loader, data_set=eval_dataset,
                                     hyper_param=eval_param, use_cuda=use_cuda)
        score_record.append(temp_score)
        scheduler.step()

        if temp_score[2] > max_score:
            max_score = temp_score[2]
            save_path = os.path.join(result_dir, model_name)
            self.save_model(save_path)
            print(f'Checkpoint saved successfully, current best score is {max_score}')
    log(f'the best score of the model is {max_score}')
    return loss_record, score_record
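# Usage sketch (illustrative; `model` and `loader` are hypothetical names for
# a model instance and its KGDataLoader):
#
# hyper_param = {'EPOCH': 5, 'batch_size': 16, 'learning_rate_upper': 1e-2,
#                'learning_rate_bert': 5e-5, 'bert_finetune': True,
#                'result_dir': './result/', 'model_name': 'rel_model.p'}
# loss_record, score_record = model.train_model(loader, hyper_param=hyper_param)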
if __name__ == '__main__':
    # load_bert_pretrained_dict()
    result_dir = './result/'
    data_set = AutoKGDataset('./data/d4/')
    train_dataset = data_set.train_dataset[:200]

    import os
    os.makedirs(result_dir, exist_ok=True)

    data_loader = KGDataLoader2(data_set, rebuild=False, temp_dir=result_dir)
    show_dict_info(data_loader)

    # train_data_mat_dict = data_loader.transform_rel(train_dataset, istest=False, ratio=0)
    train_data_mat_dict = data_loader.transform(train_dataset, istest=False,
                                                data_type='rel', ratio=0)
    data_generator = Batch_Generator(train_data_mat_dict, batch_size=4,
                                     data_type='rel', isshuffle=True)
    # data_generator = Batch_Generator(train_data_mat_dict, batch_size=4, data_type='ent', isshuffle=True)

    pred = data_loader.transform_back(train_data_mat_dict, data_type='rel')
    for i in range(len(train_dataset)):
        ori_data = train_dataset[i]
        pre_data = pred[i]
        print('origin sentence:')
        print(ori_data['input'])
        print('decode sentence')
        print(pre_data['input'])

    def str_relation_fn(item):
        return item['relation'] + '--' + item['head'] + '--' + item['tail']
def create_prediction_file(input_file, output_file, model, batch_size, classes, embedding_data):
    '''Create a prediction file containing the predicted semantic roles for the input sentences.

    This function creates an output file (in the CoNLL 2009 format) containing
    the predicted semantic roles for the sentences in the input file.

    Keyword arguments:
    input_file -- name of the file containing the sentences for the SRL task.
    output_file -- name of the file that will contain the predicted roles.
    model -- instance of srl_models.Model used for the prediction.
    batch_size -- batch size used by the model
    classes -- class dictionary mapping role indices back to labels (via i2c).
    embedding_data -- tuple (wordembeddings, posembeddings, depembeddings, predembeddings).
    '''
    from utils import Batch_Generator
    wordembeddings, posembeddings, depembeddings, predembeddings = embedding_data
    # load input data
    with open(input_file, 'r', encoding='utf-8') as infile:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            with model._get_session() as session:
                # initialize variables
                model.saver.restore(session, model.savefile)
                fileisover = False
                while not fileisover:
                    sentence_data, fileisover = SentenceData.get_sentence_data(infile)
                    if sentence_data:
                        digested_data = sentence_data.digest(
                            classes=classes,
                            wordembeddings=wordembeddings,
                            posembeddings=posembeddings,
                            depembeddings=depembeddings,
                            predembeddings=predembeddings)
                        digested_data.roles = np.zeros(
                            [digested_data.predicate_count(), len(digested_data)],
                            dtype=np.int32)

                        def i2roles(array_like):
                            # map class indices back to role labels, preserving shape
                            array, out = np.array(array_like), []
                            shape = array.shape
                            array_flat = np.reshape(array, [-1])
                            for i in array_flat:
                                out.append(classes.i2c[i])
                            return np.reshape(out, shape)

                        if digested_data.predicate_count():
                            generator = Batch_Generator([digested_data], batch_size)
                            total_predictions = []
                            for _ in range(len(generator)):
                                batch = generator.generate_next_batch()
                                predictions = model.predict(session, batch)
                                endindex = min(generator.batch_size,
                                               digested_data.predicate_count())
                                predictions = predictions.transpose()[:, :endindex]
                                predictions_text = i2roles(predictions)
                                total_predictions.extend(predictions_text)
                            sentence_data.roles = total_predictions
                        outfile.write(str(sentence_data) + "\n")
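# Usage sketch (illustrative; the paths, `model`, `classes`, and the embedding
# arrays are hypothetical placeholders):
#
# embedding_data = (wordembeddings, posembeddings, depembeddings, predembeddings)
# create_prediction_file('../data/test.conll2009', '../tmp/predictions.conll2009',
#                        model, batch_size=32, classes=classes,
#                        embedding_data=embedding_data)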
def evaluate_model(self, eval_data):
    '''Evaluate the neural model with respect to some evaluation data.

    This method returns the model's performance.

    Keyword arguments:
    self -- SRL model
    eval_data -- data used for evaluation (a list of preprocessing.SentenceData
        instances, as in train_model)

    Returns:
    Precision
    Recall
    F1 measure
    '''
    # Begin evaluation
    true_positive, true_negative, false_positive, false_negative = 0, 0, 0, 0
    given, present = 0, 0
    with self._get_session() as session:
        # initialize variables
        self.saver.restore(session, self.savefile)
        batch_generator = Batch_Generator(eval_data, self.batch_size)
        for _ in tqdm(range(len(batch_generator))):
            batch = batch_generator.generate_next_batch()
            roles = batch.roles
            seq_lens = batch.sequence_lengths
            feed_dict = self._get_feed_dict(batch)
            # get logits
            logits = session.run(self.logits, feed_dict=feed_dict)
            # get predictions
            predictions = np.argmax(logits, axis=2)
            # for each prediction check if it was correct
            for i in range(len(seq_lens)):
                for j in range(seq_lens[i]):
                    role_is_null = roles[i][j] == 0
                    pred_is_null = predictions[i][j] == 0
                    if predictions[i][j] == roles[i][j]:
                        if role_is_null:
                            true_negative += 1
                        else:
                            true_positive += 1
                    else:
                        if pred_is_null:
                            false_negative += 1
                        else:
                            false_positive += 1
                    if predictions[i][j] != 0:
                        given += 1
                    if not role_is_null:
                        present += 1
    precision = true_positive / given
    recall = true_positive / present
    f_measure = precision * recall * 2 / (precision + recall)
    '''
    with open('../tmp/eval_2_'+self.__class__.__name__+'.txt', 'w') as file:
        file.write('\n\nTrue positives:'+str(true_positive)+'\nFalse positives:'+str(false_positive)+'\nTrue negatives:'+str(true_negative)+'\nFalse negatives:'+str(false_negative)+'\n')
        file.write('\n\nTotal Precision:'+str(precision)+'\nTotal Recall:'+str(recall)+'\nTotal f1-measure:'+str(f_measure)+'\n')
    '''
    return precision, recall, f_measure
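# Sanity check of the metric definitions above with made-up counts (not from
# the source): suppose true_positive = 80, given = 100 predicted non-null
# labels, and present = 120 gold non-null labels. Then
#
#   precision = 80 / 100  = 0.800
#   recall    = 80 / 120  ≈ 0.667
#   f_measure = 2 * 0.800 * 0.667 / (0.800 + 0.667) ≈ 0.727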
def train_model(self, data_loader: KGDataLoader, train_dataset=None, eval_dataset=None,
                hyper_param={}, use_cuda=None, rebuild=False):
    '''
    :param
        @data_loader: (KGDataLoader),
        @result_dir: (str) path to save the trained model and extracted dictionary
        @hyper_param: (dict)
            @hyper_param['EPOCH']
            @hyper_param['batch_size']
            @hyper_param['learning_rate_upper']
            @hyper_param['learning_rate_bert']
            @hyper_param['bert_finetune']
            @hyper_param['visualize_length']  # num of batches between two check points
            @hyper_param['isshuffle']
            @hyper_param['result_dir']
            @hyper_param['model_name']
    :return
        @loss_record,
        @score_record
    '''
    use_cuda = self.use_cuda if use_cuda is None else use_cuda
    use_ema = True
    ema = EMA(self, mu=0.99) if use_ema else None
    if use_cuda:
        print('use cuda=========================')
        self.cuda()
    if use_ema:
        ema.register()

    EPOCH = hyper_param.get('EPOCH', 3)
    BATCH_SIZE = hyper_param.get('batch_size', 4)
    LEARNING_RATE_upper = hyper_param.get('learning_rate_upper', 1e-3)
    LEARNING_RATE_bert = hyper_param.get('learning_rate_bert', 5e-5)
    bert_finetune = hyper_param.get('bert_finetune', True)
    visualize_length = hyper_param.get('visualize_length', 10)
    result_dir = hyper_param.get('result_dir', './result/')
    model_name = hyper_param.get('model_name', 'model.p')
    is_shuffle = hyper_param.get('isshuffle', True)
    DATA_TYPE = 'ent'

    train_dataset = data_loader.dataset.train_dataset if train_dataset is None else train_dataset
    if rebuild:
        train_data_mat_dict = data_loader.transform(train_dataset, data_type=DATA_TYPE)
    ## cache the preprocessed text so it can be reloaded directly while tuning, to save time *WARNING*
    else:
        old_train_dict_path = os.path.join(result_dir, 'train_data_mat_dict.pkl')
        if os.path.exists(old_train_dict_path):
            train_data_mat_dict = data_loader.load_preprocessed_data(old_train_dict_path)
            log('Reload preprocessed data successfully~')
        else:
            train_data_mat_dict = data_loader.transform(train_dataset, data_type=DATA_TYPE)
            data_loader.save_preprocessed_data(old_train_dict_path, train_data_mat_dict)

    data_generator = Batch_Generator(train_data_mat_dict, batch_size=BATCH_SIZE,
                                     data_type=DATA_TYPE, isshuffle=is_shuffle)

    all_param = list(self.named_parameters())
    bert_param = [p for n, p in all_param if 'bert' in n]
    other_param = [p for n, p in all_param if 'bert' not in n]

    if bert_finetune:
        optimizer_grouped_parameters = [
            {'params': other_param, 'lr': LEARNING_RATE_upper},
            {'params': bert_param, 'lr': LEARNING_RATE_bert},
        ]
        optimizer = torch.optim.Adam(optimizer_grouped_parameters)
        log(f'****BERT_finetune, learning_rate_upper: {LEARNING_RATE_upper}, '
            f'learning_rate_bert: {LEARNING_RATE_bert}', 0)
    else:
        optimizer = torch.optim.Adam(other_param, lr=LEARNING_RATE_upper)
        log(f'****BERT_fix, learning_rate_upper: {LEARNING_RATE_upper}', 0)
    ##TODO: scheduler.step() below needs this LambdaLR; my_lr_lambda is defined elsewhere in the repo
    scheduler = LambdaLR(optimizer, lr_lambda=my_lr_lambda)
    # scheduler = transformers.optimization.get_cosine_schedule_with_warmup(
    #     optimizer, num_warmup_steps=int(EPOCH*0.2), num_training_steps=EPOCH)

    all_cnt = len(train_data_mat_dict['cha_matrix'])
    log(f'{model_name} Training start!', 0)
    loss_record = []
    score_record = []
    max_score = 0

    eval_param = {'batch_size': 100, 'issave': False, 'result_dir': result_dir}

    for epoch in range(EPOCH):
        self.train()
        log(f'EPOCH: {epoch+1}/{EPOCH}', 0)
        loss = 0.0

        ## back up the model parameters and the EMA shadow from before this epoch, for rollback  TODO:
        temp_param = self.backup_param()
        if use_ema:
            ema.backup_oldema()
        # print('before train bias', self.model.classifier.bias[:3])
        # print('before ema bias', list(ema.shadow.values())[200][:3], list(ema.shadow.keys())[200])
        # note: indexing both param_groups assumes bert_finetune=True (two groups)
        print(optimizer.state_dict()['param_groups'][0]['lr'],
              optimizer.state_dict()['param_groups'][1]['lr'])

        for cnt, data_batch in enumerate(data_generator):
            x, pos, _, _, y_ent, lens, data_list = data_batch
            loss_avg = self._loss(x, y_ent, lens)
            optimizer.zero_grad()
            loss_avg.backward()
            optimizer.step()
            if use_ema:
                ema.update()

            loss += loss_avg
            if use_cuda:
                loss_record.append(loss_avg.cpu().item())
            else:
                loss_record.append(loss_avg.item())

            if (cnt + 1) % visualize_length == 0:
                loss_cur = loss / visualize_length
                log(f'[TRAIN] step: {(cnt+1)*BATCH_SIZE}/{all_cnt} | loss: {loss_cur:.4f}', 1)
                loss = 0.0

        if use_ema:
            ema.apply_shadow()
        temp_score = self.eval_model(data_loader, data_set=eval_dataset,
                                     hyper_param=eval_param, use_cuda=use_cuda)
        score_record.append(temp_score)
        # scheduler.step()  #TODO:

        if temp_score[2] >= max_score:
            max_score = temp_score[2]
            save_path = os.path.join(result_dir, model_name)
            self.save_model(save_path)
            print(f'Checkpoint saved successfully, current best score is {max_score}')
            if use_ema:
                ema.restore()
                # print('restore bias', self.embed2sub.bias)  ##TODO:
        elif temp_score[2] < max_score:
            ### roll back to the parameters from before this epoch
            self.restore_param(temp_param)
            if use_ema:
                ema.return_oldema()
            scheduler.step()
            print(optimizer.state_dict()['param_groups'][0]['lr'],
                  optimizer.state_dict()['param_groups'][1]['lr'])
            if optimizer.state_dict()['param_groups'][0]['lr'] < 1e-4:
                print('early stop!!!')
                break
            else:
                if use_ema:
                    ema.restore()

    log(f'the best score of the model is {max_score}')
    return loss_record, score_record
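# The EMA helper used above is not defined in this file. Below is a minimal
# sketch of an exponential-moving-average weight tracker exposing the interface
# the training loop assumes (register/update/apply_shadow/restore plus the
# backup_oldema/return_oldema rollback hooks). It is an assumption about that
# helper's behavior, not the original implementation.
class EMA:
    def __init__(self, model, mu=0.99):
        self.model = model
        self.mu = mu
        self.shadow = {}      # EMA copies of the trainable parameters
        self.backup = {}      # raw weights, stashed while the shadow is applied
        self.old_shadow = {}  # shadow snapshot used for epoch-level rollback

    def register(self):
        # take an initial snapshot of every trainable parameter
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        # shadow = mu * shadow + (1 - mu) * param, called after each optimizer step
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name].mul_(self.mu).add_(param.data, alpha=1.0 - self.mu)

    def apply_shadow(self):
        # swap the EMA weights in for evaluation, keeping the raw weights aside
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def restore(self):
        # put the raw training weights back after evaluation
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.backup[name])
        self.backup = {}

    def backup_oldema(self):
        # snapshot the shadow so a bad epoch can be rolled back
        self.old_shadow = {k: v.clone() for k, v in self.shadow.items()}

    def return_oldema(self):
        # roll the shadow back to the snapshot taken before this epoch
        self.shadow = {k: v.clone() for k, v in self.old_shadow.items()}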