def test2():
    args = Args()
    ner_data_processor = TransformerNerDataProcessor()
    conll_2003 = Path(__file__).resolve().parent.parent.parent / "test_data/conll-2003"
    ner_data_processor.set_data_dir(conll_2003)
    labels, label2idx = ner_data_processor.get_labels(default='roberta')
    # train_examples = roberta_ner_data_processor.get_train_examples()
    train_examples = ner_data_processor.get_test_examples()

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    # tokenizer = XLNetTokenizer.from_pretrained("xlnet-base_uncased")
    features = transformer_convert_data_to_features(args, train_examples[:5], label2idx, tokenizer, max_seq_len=10)
    model = AlbertNerModel.from_pretrained("albert-base-v2", num_labels=len(label2idx))

    for idx, each_batch in enumerate(ner_data_loader(features, batch_size=5, task='test', auto=True)):
        original_mask = each_batch[1].numpy()
        print(original_mask, original_mask.shape)
        inputs = batch_to_model_inputs(each_batch)
        with torch.no_grad():
            logits, flatted_logits, loss = model(**inputs)
            logits = logits.numpy()
            print(logits)
            print(logits.shape)
        break
def test():
    from pprint import pprint

    roberta_ner_data_processor = TransformerNerDataProcessor()
    conll_2003 = Path(__file__).resolve().parent.parent.parent / "test_data/conll-2003"
    roberta_ner_data_processor.set_data_dir(conll_2003)
    labels, label2idx = roberta_ner_data_processor.get_labels(default='roberta')
    print(labels, label2idx)
    # train_examples = roberta_ner_data_processor.get_train_examples()
    train_examples = roberta_ner_data_processor.get_test_examples()
    pprint(train_examples[:5], indent=1)

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    # tokenizer = XLNetTokenizer.from_pretrained("xlnet-base_uncased")
    features = transformer_convert_data_to_features(train_examples[:5], label2idx, tokenizer, max_seq_len=10)
    model = RobertaNerModel.from_pretrained("roberta-base", num_labels=len(label2idx))
    # model = XLNetNerModel.from_pretrained("xlnet-base_uncased", num_labels=len(label2idx))

    y_trues, y_preds = [], []
    y_pred, y_true = [], []
    prev_gd = 0
    for idx, each_batch in enumerate(ner_data_loader(features, batch_size=5, task='test', auto=True)):
        # [idx*batch_size: (idx+1)*batch_size]
        print([(fea.input_tokens, fea.guards) for fea in features[idx * 2:(idx + 1) * 2]])
        print(each_batch)
        original_tkid = each_batch[0].numpy()
        original_mask = each_batch[1].numpy()
        original_labels = each_batch[3].numpy()
        guards = each_batch[4].numpy()
        print(guards)

        inputs = batch_to_model_inputs(each_batch)
        with torch.no_grad():
            logits, flatted_logits, loss = model(**inputs)
            # get softmax output of the raw logits (keep dimensions)
            raw_logits = torch.argmax(torch.nn.functional.log_softmax(logits, dim=2), dim=2)
            raw_logits = raw_logits.detach().cpu().numpy()
            logits = logits.numpy()
            loss = loss.numpy()
            print(logits.shape)
            # print(loss)

        # tk=token, mk=mask, lb=label, lgt=logits
        for mks, lbs, lgts, gds in zip(original_mask, original_labels, raw_logits, guards):
            connect_sent_flag = False
            for mk, lb, lgt, gd in zip(mks, lbs, lgts, gds):
                if mk == 0:
                    # after hitting the first masked position we can stop for the current sentence; the rest is padding
                    break
                if gd == 0 or prev_gd == gd:
                    continue
                if gd == -2:
                    connect_sent_flag = True
                    break
                if prev_gd != gd:
                    y_true.append(lb)
                    y_pred.append(lgt)
                    prev_gd = gd
            if connect_sent_flag:
                continue
            y_trues.append(y_true)
            y_preds.append(y_pred)
            y_pred, y_true = [], []
            prev_gd = 0

    print(y_trues)
    print(y_preds)
def _eval(args, model, features):
    """Common evaluation of test data with a pre-trained model; shared by eval and predict."""
    data_loader = ner_data_loader(features, batch_size=args.eval_batch_size, task='test', auto=True)
    eval_size = len(data_loader)
    args.logger.info("***** Running evaluation on {} number of test data *****".format(eval_size))
    args.logger.info(" Instantaneous batch size per GPU = {}".format(args.eval_batch_size))
    args.logger.info("******************************")

    # prepare processing results for each batch
    y_trues, y_preds = [], []
    y_pred, y_true = [], []
    prev_gd = 0

    # prediction
    model.eval()
    eval_loss = .0
    for batch in tqdm(data_loader, desc='evaluation', disable=False if args.progress_bar else True):
        original_tkid = batch[0].numpy()
        original_mask = batch[1].numpy()
        original_labels = batch[3].numpy()
        guards = batch[4].numpy()

        batch = tuple(b.to(args.device) for b in batch)
        eval_inputs = batch_to_model_inputs(batch, args.model_type)

        with torch.no_grad():
            raw_logits, _, loss = model(**eval_inputs)
            # get softmax output of the raw logits (keep dimensions)
            if not args.use_crf:
                raw_logits = torch.argmax(F.log_softmax(raw_logits, dim=2), dim=2)
            raw_logits = raw_logits.detach().cpu().numpy()
            # update evaluate loss
            eval_loss += loss.item()

        assert guards.shape == original_tkid.shape == original_mask.shape == original_labels.shape == raw_logits.shape, \
            """expect same dimension for all the inputs and outputs but get
            input_tokens: {}
            mask: {}
            label: {}
            logits: {}
            """.format(original_tkid.shape, original_mask.shape, original_labels.shape, raw_logits.shape)

        # tk=token, mk=mask, lb=label, lgt=logits
        for mks, lbs, lgts, gds in zip(original_mask, original_labels, raw_logits, guards):
            connect_sent_flag = False
            for mk, lb, lgt, gd in zip(mks, lbs, lgts, gds):
                if mk == 0:
                    # after hitting the first masked position we can stop for the current sentence;
                    # the rest is padding (not for xlnet, which pads on the left)
                    if args.model_type == "xlnet":
                        continue
                    else:
                        break
                if gd == 0 or prev_gd == gd:
                    continue
                if gd == NEXT_GUARD:
                    connect_sent_flag = True
                    break
                if prev_gd != gd:
                    y_true.append(args.idx2label[lb])
                    y_pred.append(args.idx2label[lgt])
                    prev_gd = gd
            if connect_sent_flag:
                continue
            y_trues.append(y_true)
            y_preds.append(y_pred)
            y_pred, y_true = [], []
            prev_gd = 0

    return y_trues, y_preds, round(eval_loss / eval_size, 4)
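# --- illustrative sketch (not part of the original module) ---
# The guard-based realignment above is the core of _eval: only the first sub-token of each
# word (a new, non-zero guard id) contributes a prediction, padding is skipped via the
# attention mask, and a NEXT_GUARD marker (-2 in the test() above) means the sentence
# continues in the next chunk. The toy function below replays a simplified version of that
# per-token branch on hand-made lists so the control flow can be checked in isolation;
# the function name and all values (mask, guards, labels, preds) are invented for
# demonstration only.
def _demo_guard_realignment():
    NEXT_GUARD = -2
    mask = [1, 1, 1, 1, 1, 0]                               # 1 = real token, 0 = padding
    guards = [0, 1, 1, 2, 3, 0]                             # 0 = special token; repeated id = extra sub-token of the same word
    labels = ['X', 'B-PER', 'B-PER', 'O', 'B-LOC', 'X']     # gold label per sub-token
    preds = ['X', 'B-PER', 'I-PER', 'O', 'O', 'X']          # predicted label per sub-token

    y_true, y_pred, prev_gd = [], [], 0
    for mk, lb, p, gd in zip(mask, labels, preds, guards):
        if mk == 0:
            break                                           # rest of the sequence is padding
        if gd == 0 or prev_gd == gd:
            continue                                        # special token or continuation sub-token
        if gd == NEXT_GUARD:
            break                                           # sentence continues in the next chunk
        y_true.append(lb)
        y_pred.append(p)
        prev_gd = gd
    # returns (['B-PER', 'O', 'B-LOC'], ['B-PER', 'O', 'O'])
    return y_true, y_pred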
def train(args, model, train_features, dev_features):
    """NER model training on the train dataset; select the model based on performance on the dev dataset."""
    # create data loader
    data_loader = ner_data_loader(train_features, batch_size=args.train_batch_size, task='train', auto=True)
    # total training step counts
    t_total = len(data_loader) // args.gradient_accumulation_steps * args.num_train_epochs

    # parameters for optimization
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

    # using fp16 for training relies on the Nvidia apex package
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # training linear warm-up setup
    scheduler = None
    if args.do_warmup:
        warmup_steps = np.dtype('int64').type(args.warmup_ratio * t_total)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    args.logger.info("***** Running training *****")
    args.logger.info(" Num data points = {}".format(len(data_loader)))
    args.logger.info(" Num Epochs = {}".format(args.num_train_epochs))
    args.logger.info(" Instantaneous batch size per GPU = {}".format(args.train_batch_size))
    args.logger.info(" Gradient Accumulation steps = {}".format(args.gradient_accumulation_steps))
    args.logger.info(" Total optimization steps = {}".format(t_total))
    args.logger.info(" Training steps (number of steps between two evaluations on dev) = {}".format(
        args.train_steps * args.gradient_accumulation_steps))
    args.logger.info("******************************")

    # create directory to save model
    new_model_dir = Path(args.new_model_dir)
    new_model_dir.mkdir(parents=True, exist_ok=True)
    # save label2idx json in new model directory
    json_dump(args.label2idx, new_model_dir / "label2idx.json")
    # save base model name to base_model_name.txt
    with open(new_model_dir / "base_model_name.txt", "w") as f:
        f.write('model_type: {}\nbase_model: {}\nconfig: {}\ntokenizer: {}'.format(
            args.model_type, args.pretrained_model, args.config_name, args.tokenizer_name))

    global_step = 0
    tr_loss = .0
    best_score, epoch_best_score = .0, .0
    early_stop_flag = 0

    model.zero_grad()
    epoch_iter = trange(int(args.num_train_epochs), desc="Epoch", disable=False if args.progress_bar else True)
    for epoch in epoch_iter:
        batch_iter = tqdm(iterable=data_loader, desc='Batch', disable=False if args.progress_bar else True)
        for step, batch in enumerate(batch_iter):
            model.train()
            batch = tuple(b.to(args.device) for b in batch)
            train_inputs = batch_to_model_inputs(batch, args.model_type)
            _, _, loss = model(**train_inputs)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            # loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                if args.do_warmup:
                    scheduler.step()
                model.zero_grad()
                global_step += 1

                # using training step
                if args.train_steps > 0 and (global_step + 1) % args.train_steps == 0 and epoch > 0:
                    # the current implementation will skip all evaluations in the first epoch
                    best_score, eval_loss = evaluate(args, model, new_model_dir, dev_features, epoch, global_step, best_score)
                    args.logger.info(
                        """Global step: {}; Epoch: {}; average_train_loss: {:.4f}; eval_loss: {:.4f}; current best score: {:.4f}""".format(
                            global_step, epoch + 1, round(tr_loss / global_step, 4), eval_loss, best_score))

        # default model selection method uses strict F1-score with beta=1; evaluate model after each epoch on dev
        if args.train_steps <= 0 or epoch == 0:
            best_score, eval_loss = evaluate(args, model, new_model_dir, dev_features, epoch, global_step, best_score)
            args.logger.info(
                """Global step: {}; Epoch: {}; average_train_loss: {:.4f}; eval_loss: {:.4f}; current best score: {:.4f}""".format(
                    global_step, epoch + 1, round(tr_loss / global_step, 4), eval_loss, best_score))

        # early stop check
        if epoch_best_score < best_score:
            epoch_best_score = best_score
            early_stop_flag = 0
        else:
            early_stop_flag += 1

        if 0 < args.early_stop <= early_stop_flag:
            args.logger.warn('Early stop activated; performance is no longer improving.')
            break
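# --- illustrative sketch (not part of the original module) ---
# train() and _eval() read their configuration from attributes on `args` rather than from
# keyword arguments. The helper below enumerates the attributes those two functions actually
# touch, using a plain SimpleNamespace; the function name and all concrete values are
# placeholders chosen for illustration, and the real project presumably builds this object
# from its own argument parser / Args class, so treat this only as a checklist of what must
# be present before calling train() or _eval().
import logging
from types import SimpleNamespace

def _demo_build_args(new_model_dir="./new_ner_model", label2idx=None):
    label2idx = label2idx or {"O": 0, "B-PER": 1, "I-PER": 2}  # placeholder label map
    return SimpleNamespace(
        # data / batching
        train_batch_size=8, eval_batch_size=8,
        gradient_accumulation_steps=1, num_train_epochs=3, train_steps=-1,
        # optimization
        learning_rate=1e-5, weight_decay=0.0, adam_epsilon=1e-8, max_grad_norm=1.0,
        do_warmup=True, warmup_ratio=0.1, fp16=False, fp16_opt_level="O1",
        # model / device
        model_type="roberta", pretrained_model="roberta-base",
        config_name="roberta-base", tokenizer_name="roberta-base",
        device="cpu", use_crf=False,
        # bookkeeping
        new_model_dir=new_model_dir, label2idx=label2idx,
        idx2label={v: k for k, v in label2idx.items()},
        early_stop=-1, progress_bar=True, logger=logging.getLogger("ner"),
    )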