def _compute(self, predictions, references, suffix=False):
    report = classification_report(y_true=references, y_pred=predictions,
                                   suffix=suffix, output_dict=True)
    report.pop("macro avg")
    report.pop("weighted avg")
    overall_score = report.pop("micro avg")

    scores = {}
    for type_name, score in report.items():
        # Create the per-type sub-dict before filling it (indexing an empty dict would raise KeyError).
        scores[type_name] = {
            "precision": score["precision"],
            "recall": score["recall"],
            "f1": score["f1-score"],
            "number": score["support"],
        }
    scores["overall_precision"] = overall_score["precision"]
    scores["overall_recall"] = overall_score["recall"]
    scores["overall_f1"] = overall_score["f1-score"]
    scores["overall_accuracy"] = accuracy_score(y_true=references, y_pred=predictions)

    return scores
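# A minimal usage sketch of the seqeval calls that _compute() above reshapes.
# Assumptions (not from the original snippet): seqeval is installed and the toy
# BIO sequences below are made up purely for illustration.
from seqeval.metrics import accuracy_score, classification_report

references = [["B-PER", "I-PER", "O", "B-LOC"], ["O", "B-ORG", "I-ORG", "O"]]
predictions = [["B-PER", "I-PER", "O", "B-LOC"], ["O", "B-ORG", "O", "O"]]

# With output_dict=True the report contains one entry per entity type plus the
# "micro avg"/"macro avg"/"weighted avg" entries that _compute() pops apart.
report = classification_report(y_true=references, y_pred=predictions, output_dict=True)
print(report["PER"])          # per-type precision / recall / f1-score / support
print(accuracy_score(y_true=references, y_pred=predictions))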
def model_evaluate_roberta(model, data, label, tag2id, batch_size, seq_len_list):
    id2tag = {value: key for key, value in tag2id.items()}
    pred_logits = model.predict(data, batch_size=batch_size)[0]
    # pred shape [batch_size, max_len]
    preds = np.argmax(pred_logits, axis=2).tolist()
    assert len(preds) == len(seq_len_list)

    # get predicted labels
    predict_label = []
    target_label = []
    for i in range(len(preds)):
        pred = preds[i][1:]  # drop the prediction for the first (special) token position
        temp = []
        true_label = label[i][:min(seq_len_list[i], len(pred))]
        for j in range(min(seq_len_list[i], len(pred))):
            temp.append(id2tag[pred[j]])
        assert len(temp) == len(true_label)
        target_label.append(true_label)
        predict_label.append(temp)

    # compute precision, recall and F1 score
    precision = precision_score(target_label, predict_label, average="macro", zero_division=0)
    recall = recall_score(target_label, predict_label, average="macro", zero_division=0)
    f1 = f1_score(target_label, predict_label, average="macro", zero_division=0)
    logger.info(classification_report(target_label, predict_label))
    return precision, recall, f1
def evaluate(self, data, labels):
    """Evaluate the performance of the NER model.

    Args:
        data: list of tokenized texts, like ``[['我', '是', '中', '国', '人']]``
        labels: list of list of str, the corresponding label strings
    """
    features, y = self.preprocessor.prepare_input(data, labels)
    pred_probs = self.model.predict(features)
    lengths = [
        min(len(label), pred_prob.shape[0])
        for label, pred_prob in zip(labels, pred_probs)
    ]
    y_pred = self.preprocessor.label_decode(pred_probs, lengths)

    r = metrics.recall_score(labels, y_pred)
    p = metrics.precision_score(labels, y_pred)
    f1 = metrics.f1_score(labels, y_pred)
    print('Recall: {}, Precision: {}, F1: {}'.format(r, p, f1))
    print(metrics.classification_report(labels, y_pred))
    return f1
def benchmark_flair_mdl():
    tagger = load_flair_ner_model()
    start = time.time()

    flair_sentences = []
    for i, sentence in enumerate(sentences_tokens):
        flair_sentence = Sentence()
        for token_txt in sentence:
            flair_sentence.add_token(Token(token_txt))
        flair_sentences.append(flair_sentence)

    tagger.predict(flair_sentences, verbose=True)
    predictions = [[tok.tags['ner'].value for tok in fs] for fs in flair_sentences]

    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens, time.time() - start))

    assert len(predictions) == num_sentences
    print(classification_report(sentences_entities, remove_miscs(predictions), digits=4))
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id): if args["max_steps"] > 0: num_train_steps = args["max_steps"] * args[ "gradient_accumulation_steps"] args["num_train_epochs"] = 1 else: num_train_steps = (math.ceil(num_train_examples / train_batch_size) // args["gradient_accumulation_steps"] * args["num_train_epochs"]) writer = tf.summary.create_file_writer("/tmp/mylogs") with strategy.scope(): loss_fct = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction=tf.keras.losses.Reduction.NONE) optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"]) if args["fp16"]: optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, "dynamic") loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32) gradient_accumulator = GradientAccumulator() logging.info("***** Running training *****") logging.info(" Num examples = %d", num_train_examples) logging.info(" Num Epochs = %d", args["num_train_epochs"]) logging.info(" Instantaneous batch size per device = %d", args["per_device_train_batch_size"]) logging.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", train_batch_size * args["gradient_accumulation_steps"], ) logging.info(" Gradient Accumulation steps = %d", args["gradient_accumulation_steps"]) logging.info(" Total training steps = %d", num_train_steps) model.summary() @tf.function def apply_gradients(): grads_and_vars = [] for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables): if gradient is not None: scaled_gradient = gradient / ( args["n_device"] * args["gradient_accumulation_steps"]) grads_and_vars.append((scaled_gradient, variable)) else: grads_and_vars.append((gradient, variable)) optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"]) gradient_accumulator.reset() @tf.function def train_step(train_features, train_labels): def step_fn(train_features, train_labels): inputs = { "attention_mask": train_features["attention_mask"], "training": True } if "token_type_ids" in train_features: inputs["token_type_ids"] = train_features["token_type_ids"] with tf.GradientTape() as tape: logits = model(train_features["input_ids"], **inputs)[0] active_loss = tf.reshape(train_labels, (-1, )) != pad_token_label_id active_logits = tf.boolean_mask( tf.reshape(logits, (-1, len(labels))), active_loss) active_labels = tf.boolean_mask( tf.reshape(train_labels, (-1, )), active_loss) cross_entropy = loss_fct(active_labels, active_logits) loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size) grads = tape.gradient(loss, model.trainable_variables) gradient_accumulator(grads) return cross_entropy per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels)) mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0) return mean_loss current_time = datetime.datetime.now() train_iterator = master_bar(range(args["num_train_epochs"])) global_step = 0 logging_loss = 0.0 for epoch in train_iterator: epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1) step = 1 with strategy.scope(): for train_features, train_labels in epoch_iterator: loss = train_step(train_features, train_labels) if step % args["gradient_accumulation_steps"] == 0: strategy.experimental_run_v2(apply_gradients) loss_metric(loss) global_step += 1 if args["logging_steps"] > 0 and global_step % args[ 
"logging_steps"] == 0: # Log metrics if ( args["n_device"] == 1 and args["evaluate_during_training"] ): # Only evaluate when single GPU otherwise metrics may not average well y_true, y_pred, eval_loss = evaluate( args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev") report = metrics.classification_report(y_true, y_pred, digits=4) logging.info("Eval at step " + str(global_step) + "\n" + report) logging.info("eval_loss: " + str(eval_loss)) precision = metrics.precision_score(y_true, y_pred) recall = metrics.recall_score(y_true, y_pred) f1 = metrics.f1_score(y_true, y_pred) with writer.as_default(): tf.summary.scalar("eval_loss", eval_loss, global_step) tf.summary.scalar("precision", precision, global_step) tf.summary.scalar("recall", recall, global_step) tf.summary.scalar("f1", f1, global_step) lr = optimizer.learning_rate learning_rate = lr(step) with writer.as_default(): tf.summary.scalar("lr", learning_rate, global_step) tf.summary.scalar( "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step) logging_loss = loss_metric.result() with writer.as_default(): tf.summary.scalar("loss", loss_metric.result(), step=step) if args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir = os.path.join( args["output_dir"], "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model.save_pretrained(output_dir) logging.info("Saving model checkpoint to %s", output_dir) train_iterator.child.comment = f"loss : {loss_metric.result()}" step += 1 train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}") loss_metric.reset_states() logging.info(" Training took time = {}".format(datetime.datetime.now() - current_time))
def main(args): with open(args.cache_dir / "vocab.pkl", "rb") as f: vocab: Vocab = pickle.load(f) tag_idx_path = args.cache_dir / "tag2idx.json" tag2idx: Dict[str, int] = json.loads(tag_idx_path.read_text()) data_paths = {split: args.data_dir / f"{split}.json" for split in SPLITS} data = { split: json.loads(path.read_text()) for split, path in data_paths.items() } datasets: Dict[str, SeqSlotDataset] = { split: SeqSlotDataset(split_data, vocab, tag2idx, args.max_len) for split, split_data in data.items() } # TODO: crecate DataLoader for train / dev datasets dataloaders = { split: DataLoader(dataset, batch_size=args.batch_size, shuffle=True, collate_fn=dataset.collate_fn) for split, dataset in datasets.items() } # COMPLETE embeddings = torch.load(args.cache_dir / "embeddings.pt") # TODO: init model and move model to target device(cpu / gpu) model = SeqSlot(embeddings, args.hidden_size, args.num_layers, args.dropout, args.bidirectional, datasets[TRAIN].num_classes) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) # COMPLETE # TODO: init optimizer optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # COMPLETE epoch_pbar = trange(args.num_epoch, desc="Epoch") for epoch in epoch_pbar: # TODO: Training loop - iterate over train dataloader and update model weights size = len(dataloaders[TRAIN].dataset) loss_fn = torch.nn.CrossEntropyLoss() model.train() for batch_num, batch in enumerate(dataloaders[TRAIN]): encoded = batch["encoded"] tag = batch["tag"] lens = batch["lens"] if torch.cuda.is_available(): encoded = encoded.cuda() tag = tag.cuda() pred = model(encoded, lens) pred = pred.view(-1, pred.shape[-1]) tag = tag.reshape(-1) loss = loss_fn(pred, tag) optimizer.zero_grad() loss.backward() optimizer.step() if batch_num % 50 == 0: loss, current = loss, batch_num * len(encoded) print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]") # COMPLETE # TODO: Evaluation loop - calculate accuracy and save model weights loss, size = 0, 0 y_true = [] y_pred = [] model.eval() with torch.no_grad(): for batch_num, batch in enumerate(dataloaders[DEV]): encoded = batch["encoded"] tag = batch["tag"] lens = batch["lens"] if torch.cuda.is_available(): encoded = encoded.cuda() tag = tag.cuda() pred = model(encoded, lens) pred = pred.view(-1, pred.shape[-1]) tag = tag.reshape(-1) loss += loss_fn(pred, tag) pred_tag = torch.argmax(pred, dim=1) pred_tag = pred_tag.view(-1, len(encoded)) tag = tag.view(-1, len(encoded)) size += len(encoded) tran_pred = pred_tag.t() tran_true = tag.t() tran_pred = [ list( map(datasets[DEV].idx2tag, (tran_pred[i][:lens[i]]).tolist())) for i in range(len(tran_pred)) ] tran_true = [ list( map(datasets[DEV].idx2tag, (tran_true[i][:lens[i]]).tolist())) for i in range(len(tran_true)) ] for i in range(len(tran_pred)): y_pred.append(tran_pred[i]) y_true.append(tran_true[i]) report = classification_report(y_pred, y_true, mode='strict', scheme=IOB2) print(report) join_correct = correct = 0 join_count = count = 0 for i in range(len(y_pred)): join = True for j in range(len(y_pred[i])): count += 1 if y_pred[i][j] == y_true[i][j]: correct += 1 else: join = False join_count += 1 if join: join_correct += 1 accuracy = correct / count join_ac = join_correct / join_count loss /= size print( f"Dev Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {loss:>8f} JoinAC: {(100*join_ac):>0.1f}% \n" ) torch.save(model, args.ckpt_dir / "best.pt")
def evaluate_on_model(ds_X_word, ds_X_char, ds_y): # load padding length padding_len = load_dict_after('padding_len.json') max_len = min(padding_len['max_len'], MAX_LEN) max_len_char = min(padding_len['max_len_char'], MAX_LEN_CHAR) # load the model in terms of CRF as output layer or Dense as output layer model = load_saved_model() # prepare the tags in terms of multiple output model, or single output y = [] if USE_CRF and MULTI_OUT: y = convert_to_multi_output(ds_y, max_len) else: for ds in ds_y: y.extend(ds) X_word = [] X_char = [] for x_word in ds_X_word: X_word.extend(x_word) for x_char in ds_X_char: X_char.extend(x_char) X_word = np.array(X_word, dtype="float32") X_char = np.array(X_char, dtype="float32") print(model.metrics_names) if MULTI_OUT: scores = model.evaluate([X_word, X_char], [ np.array(y[0], dtype="float32").reshape(len(X_word), max_len, 1), np.array(y[1], dtype="float32").reshape(len(X_word), max_len, 1), np.array(y[2], dtype="float32").reshape(len(X_word), max_len, 1), np.array(y[3], dtype="float32").reshape(len(X_word), max_len, 1), np.array(y[4], dtype="float32").reshape(len(X_word), max_len, 1) ], verbose=1) print(scores) else: # single output # get scores for test sets from each data-set for i in range(len(ds_X_word)): scores = model.evaluate([ds_X_word[i], ds_X_char[i]], np.array(ds_y[i], dtype="float32").reshape( len(ds_X_word[i]), max_len, 1), verbose=1) print(scores) if MULTI_OUT: test_pred = model.predict([X_word, X_char]) for i in range(len(ds_X_word)): tag2idx = load_dict_after('tag2idx' + str(i) + '.json') n_tags = len(tag2idx) idx2tag = flip_dict(tag2idx) conv_pred = [] conv_gold = [] for sentence_tag in test_pred[i]: p = np.argmax(sentence_tag, axis=-1) p = [idx2tag[tag_idx] for tag_idx in p] conv_pred.append(p) for sentence_tag in y[i]: sentence_tag = [idx2tag[tag_idx] for tag_idx in sentence_tag] conv_gold.append(sentence_tag) print("F1-score: {:.1%}".format(f1_score(conv_gold, conv_pred))) print(classification_report(conv_gold, conv_pred)) else: for i in range(len(ds_X_word)): x_word = ds_X_word[ i] # all the sentences in word-indexed form for dataset i x_char = ds_X_char[ i] # all the sentences in character-indexed form for dataset i y_sen = ds_y[i] # all the corresponding tags of the sentences y_sen = np.array(y_sen) #predict test_pred = model.predict([ np.array(x_word, dtype="float32"), np.array(x_char, dtype="float32") ]) tag2idx = load_dict_after('tag2idx.json') n_tags = len(tag2idx) idx2tag = flip_dict(tag2idx) conv_pred = [ ] # list to store the predicted tags, converted from indices conv_gold = [ ] # list to store the actual/ gold tags, converted from indices for sentence_tag in test_pred: p = np.argmax( sentence_tag, axis=-1 ) # for each word, get the tag with maximum probabiliity out of all the possible tags p = [idx2tag[tag_idx] for tag_idx in p] # convert each tag from indice to name conv_pred.append(p) for sentence_tag in y_sen: sentence_tag = [idx2tag[tag_idx] for tag_idx in sentence_tag] conv_gold.append(sentence_tag) print("F1-score: {:.1%}".format(f1_score(conv_gold, conv_pred))) print(classification_report(conv_gold, conv_pred))
def run_ner_w_args(args): if args.server_ip and args.server_port: # Distant debugging - see # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of # sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForNer.from_pretrained(args.bert_model, cache_dir=cache_dir, config_dir=args.config_dir, num_labels=num_labels, config=args.config) model_to_save = model.module if hasattr(model, 'module') else model # print(model_to_save.config, cache_dir) # print(args.config_dir, args.config) # exit() if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # def resolve_opt(pre_model_path, optimizer): # opt_path = os.path.join(args.bert_model, "opt.pth") # if os.path.exists(opt_path): # optimizer.load_state_dict( torch.load( opt_path ) ) # return optimizer # optimizer = resolve_opt(args.bert_model, optimizer) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() def warmup_linear(progress, warmup): if progress < warmup: return progress / warmup return max((progress - 1.) / (warmup - 1.), 0.) for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask) # input_ids, input_mask, segment_ids, label_ids = batch # loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles # this automatically lr_this_step = args.learning_rate * \ warmup_linear(global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) # Save optimizer output_optimizer_file = os.path.join(args.output_dir, "opt.pth") torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) tokenizer.save_vocabulary(args.output_dir) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) label_map = {i: label for i, label in enumerate(label_list, 1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: # output_config_file = os.path.join(args.output_dir, CONFIG_NAME) # output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) # config = BertConfig(output_config_file) # model = BertForTokenClassification(config, num_labels=num_labels) # model.load_state_dict(torch.load(output_model_file)) model = BertForNer.from_pretrained(args.bert_model, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = 
[] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} # for input_ids, input_mask, segment_ids, label_ids in # tqdm(eval_dataloader, desc="Evaluating"): for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == 11: y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) loss = tr_loss / global_step if args.do_train else None result = dict() result['loss'] = loss report = classification_report(y_true, y_pred, digits=4) logger.info("\n%s", report) print(report) result['f1'] = f1_score(y_true, y_pred) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) # writer.write(report) for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) return result
def test_classification_report(self):
    print(classification_report(self.y_true, self.y_pred))
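# The test above only makes sense with a fixture; a hypothetical minimal setUp
# (toy BIO sequences, assuming seqeval's classification_report) could look like:
import unittest
from seqeval.metrics import classification_report


class ClassificationReportTest(unittest.TestCase):
    def setUp(self):
        # Made-up gold and predicted tag sequences, for illustration only.
        self.y_true = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
        self.y_pred = [["B-PER", "I-PER", "O"], ["O", "O"]]

    def test_classification_report(self):
        print(classification_report(self.y_true, self.y_pred))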
    val_batch_labels = label_ids.to("cpu").numpy()
    predictions.extend(val_batch_preds)
    true_labels.extend(val_batch_labels)
    tmp_eval_accuracy = flat_accuracy(val_batch_labels, val_batch_preds)
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

# Evaluate loss, acc, conf. matrix, and class. report on the dev set
pred_tags = [[tag2name[i] for i in predictions]]
valid_tags = [[tag2name[i] for i in true_labels]]
cl_report = classification_report(valid_tags, pred_tags)
eval_loss = eval_loss / nb_eval_steps
tmp_accuracy = accuracy_score(valid_tags, pred_tags)

if tmp_accuracy > dev_best_acc:
    dev_best_acc = tmp_accuracy
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
    output_config_file = os.path.join(bert_out_address, "config.json")
    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)

# Report metrics
f1 = f1_score(valid_tags, pred_tags)
if f1 > dev_best_f1:
def evaluate(args, model, tokenizer, ngram_dict, processor, label_list): num_labels = len(label_list) + 1 eval_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="test") # Run prediction for full data eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # Eval! logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) model.eval() y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, segment_ids, label_ids, ngram_ids, ngram_positions, \ ngram_lengths, ngram_seg_ids, ngram_masks, valid_ids, l_mask = batch with torch.no_grad(): logits = model(input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=valid_ids, attention_mask_label=None, ngram_ids=ngram_ids, ngram_positions=ngram_positions) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.detach().cpu().numpy() for i, label in enumerate(label_ids): for j, m in enumerate(label): if j == 0: continue if label_ids[i][j] == num_labels - 1: break y_true.append(label_map[label_ids[i][j]]) y_pred.append(label_map[logits[i][j]]) if args.task_name == 'cwsmsra' or args.task_name == 'cwspku': #evaluating CWS result = cws_evaluate_word_PRF(y_pred, y_true) logger.info("=======entity level========") logger.info( "\n%s", ', '.join("%s: %s" % (key, val) for key, val in result.items())) logger.info("=======entity level========") else: #evaluating NER, POS report = classification_report(y_true, y_pred, digits=4) f = f1_score(y_true, y_pred) result = {"report": report, "f1": f} logger.info("=======entity level========") logger.info(report) logger.info("=======entity level========") return result
def show_ner_report(labels, preds): return classification_report(labels, preds, suffix=True)
def show_ner_report(labels, preds): return seqeval_metrics.classification_report(labels, preds, suffix=True)
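# Both helpers above pass suffix=True, which tells seqeval that the boundary
# marker comes last in each tag (e.g. "PER-B" instead of "B-PER"). A toy sketch
# with made-up suffix-style tags (assumes seqeval is installed):
from seqeval.metrics import classification_report

y_true = [["PER-B", "PER-I", "O", "LOC-B"]]
y_pred = [["PER-B", "PER-I", "O", "O"]]

print(classification_report(y_true, y_pred, suffix=True))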
def main(): logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y ', level=logging.INFO) logger = logging.getLogger(__name__) parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--data", default=None, type=str, required=True, help="Directory which has the data files for the task") parser.add_argument( "--output", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--overwrite", default=False, type=bool, help="Set it to True to overwrite output directory") args = parser.parse_args() if os.path.exists(args.output) and os.listdir( args.output) and not args.overwrite: raise ValueError( "Output directory ({}) already exists and is not empty. Set the overwrite flag to overwrite" .format(args.output)) if not os.path.exists(args.output): os.makedirs(args.output) train_batch_size = 32 valid_batch_size = 64 test_batch_size = 64 # padding sentences and labels to max_length of 128 max_seq_len = 128 EMBEDDING_DIM = 100 epochs = 10 split_train = split_text_label(os.path.join(args.data, "train.txt")) split_valid = split_text_label(os.path.join(args.data, "valid.txt")) split_test = split_text_label(os.path.join(args.data, "test.txt")) labelSet = set() wordSet = set() # words and labels for data in [split_train, split_valid, split_test]: for labeled_text in data: for word, label in labeled_text: labelSet.add(label) wordSet.add(word.lower()) # Sort the set to ensure '0' is assigned to 0 sorted_labels = sorted(list(labelSet), key=len) # Create mapping for labels label2Idx = {} for label in sorted_labels: label2Idx[label] = len(label2Idx) num_labels = len(label2Idx) idx2Label = {v: k for k, v in label2Idx.items()} pickle.dump(idx2Label, open(os.path.join(args.output, "idx2Label.pkl"), 'wb')) logger.info("Saved idx2Label pickle file") # Create mapping for words word2Idx = {} if len(word2Idx) == 0: word2Idx["PADDING_TOKEN"] = len(word2Idx) word2Idx["UNKNOWN_TOKEN"] = len(word2Idx) for word in wordSet: word2Idx[word] = len(word2Idx) logger.info("Total number of words is : %d ", len(word2Idx)) pickle.dump(word2Idx, open(os.path.join(args.output, "word2Idx.pkl"), 'wb')) logger.info("Saved word2Idx pickle file") # Loading glove embeddings embeddings_index = {} f = open('embeddings/glove.6B.100d.txt', encoding="utf-8") for line in f: values = line.strip().split(' ') word = values[0] # the first entry is the word coefs = np.asarray( values[1:], dtype='float32') #100d vectors representing the word embeddings_index[word] = coefs f.close() logger.info("Glove data loaded") #print(str(dict(itertools.islice(embeddings_index.items(), 2)))) embedding_matrix = np.zeros((len(word2Idx), EMBEDDING_DIM)) # Word embeddings for the tokens for word, i in word2Idx.items(): embedding_vector = embeddings_index.get(word) if embedding_vector is not None: embedding_matrix[i] = embedding_vector pickle.dump(embedding_matrix, open(os.path.join(args.output, "embedding.pkl"), 'wb')) logger.info("Saved Embedding matrix pickle") # Interesting - to check how many words were not there in Glove Embedding # indices = np.where(np.all(np.isclose(embedding_matrix, 0), axis=1)) # print(len(indices[0])) train_sentences, train_labels = createMatrices(split_train, word2Idx, label2Idx) valid_sentences, valid_labels = createMatrices(split_valid, word2Idx, label2Idx) test_sentences, test_labels = createMatrices(split_test, word2Idx, label2Idx) train_features, train_labels = padding(train_sentences, 
train_labels, max_seq_len, padding='post') valid_features, valid_labels = padding(valid_sentences, valid_labels, max_seq_len, padding='post') test_features, test_labels = padding(test_sentences, test_labels, max_seq_len, padding='post') logger.info( f"Train features shape is {train_features.shape} and labels shape is{train_labels.shape}" ) logger.info( f"Valid features shape is {valid_features.shape} and labels shape is{valid_labels.shape}" ) logger.info( f"Test features shape is {test_features.shape} and labels shape is{test_labels.shape}" ) train_dataset = tf.data.Dataset.from_tensor_slices( (train_features, train_labels)) valid_dataset = tf.data.Dataset.from_tensor_slices( (valid_features, valid_labels)) test_dataset = tf.data.Dataset.from_tensor_slices( (test_features, test_labels)) shuffled_train_dataset = train_dataset.shuffle( buffer_size=train_features.shape[0], reshuffle_each_iteration=True) batched_train_dataset = shuffled_train_dataset.batch(train_batch_size, drop_remainder=True) batched_valid_dataset = valid_dataset.batch(valid_batch_size, drop_remainder=True) batched_test_dataset = test_dataset.batch(test_batch_size, drop_remainder=True) epoch_bar = master_bar(range(epochs)) train_pb_max_len = math.ceil( float(len(train_features)) / float(train_batch_size)) valid_pb_max_len = math.ceil( float(len(valid_features)) / float(valid_batch_size)) test_pb_max_len = math.ceil( float(len(test_features)) / float(test_batch_size)) model = TFNer(max_seq_len=max_seq_len, embed_input_dim=len(word2Idx), embed_output_dim=EMBEDDING_DIM, weights=[embedding_matrix], num_labels=num_labels) optimizer = tf.keras.optimizers.Adam(learning_rate=0.01) scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) train_log_dir = f"{args.output}/logs/train" valid_log_dir = f"{args.output}/logs/valid" train_summary_writer = tf.summary.create_file_writer(train_log_dir) valid_summary_writer = tf.summary.create_file_writer(valid_log_dir) train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32) valid_loss_metric = tf.keras.metrics.Mean('valid_loss', dtype=tf.float32) def train_step_fn(sentences_batch, labels_batch): with tf.GradientTape() as tape: logits = model( sentences_batch) # batchsize, max_seq_len, num_labels loss = scce(labels_batch, logits) #batchsize,max_seq_len grads = tape.gradient(loss, model.trainable_variables) optimizer.apply_gradients(list(zip(grads, model.trainable_variables))) return loss, logits def valid_step_fn(sentences_batch, labels_batch): logits = model(sentences_batch) loss = scce(labels_batch, logits) return loss, logits for epoch in epoch_bar: with train_summary_writer.as_default(): for sentences_batch, labels_batch in progress_bar( batched_train_dataset, total=train_pb_max_len, parent=epoch_bar): loss, logits = train_step_fn(sentences_batch, labels_batch) train_loss_metric(loss) epoch_bar.child.comment = f'training loss : {train_loss_metric.result()}' tf.summary.scalar('training loss', train_loss_metric.result(), step=epoch) train_loss_metric.reset_states() with valid_summary_writer.as_default(): for sentences_batch, labels_batch in progress_bar( batched_valid_dataset, total=valid_pb_max_len, parent=epoch_bar): loss, logits = valid_step_fn(sentences_batch, labels_batch) valid_loss_metric.update_state(loss) epoch_bar.child.comment = f'validation loss : {valid_loss_metric.result()}' # Logging after each Epoch ! 
tf.summary.scalar('valid loss', valid_loss_metric.result(), step=epoch) valid_loss_metric.reset_states() model.save_weights(f"{args.output}/model_weights", save_format='tf') logger.info(f"Model weights saved") #Evaluating on test dataset test_model = TFNer(max_seq_len=max_seq_len, embed_input_dim=len(word2Idx), embed_output_dim=EMBEDDING_DIM, weights=[embedding_matrix], num_labels=num_labels) test_model.load_weights(f"{args.output}/model_weights") logger.info(f"Model weights restored") true_labels = [] pred_labels = [] for sentences_batch, labels_batch in progress_bar(batched_test_dataset, total=test_pb_max_len): logits = test_model(sentences_batch) temp1 = tf.nn.softmax(logits) preds = tf.argmax(temp1, axis=2) true_labels.append(np.asarray(labels_batch)) pred_labels.append(np.asarray(preds)) label_correct, label_pred = idx_to_label(pred_labels, true_labels, idx2Label) report = classification_report(label_correct, label_pred, digits=4) logger.info(f"Results for the test dataset") logger.info(f"\n{report}")
def evaluate(eval_ATE=True, eval_APC=True): # evaluate apc_result = {'max_apc_test_acc': 0, 'max_apc_test_f1': 0} ate_result = 0 y_true = [] y_pred = [] n_test_correct, n_test_total = 0, 0 test_apc_logits_all, test_polarities_all = None, None model.eval() label_map = {i: label for i, label in enumerate(label_list, 1)} for input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask in eval_dataloader: input_ids_spc = input_ids_spc.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) polarities = polarities.to(device) l_mask = l_mask.to(device) with torch.no_grad(): ate_logits, apc_logits = model(input_ids_spc, segment_ids, input_mask, valid_ids=valid_ids, polarities=polarities, attention_mask_label=l_mask) if eval_APC: polarities = model.get_batch_polarities(polarities) n_test_correct += (torch.argmax( apc_logits, -1) == polarities).sum().item() n_test_total += len(polarities) if test_polarities_all is None: test_polarities_all = polarities test_apc_logits_all = apc_logits else: test_polarities_all = torch.cat( (test_polarities_all, polarities), dim=0) test_apc_logits_all = torch.cat( (test_apc_logits_all, apc_logits), dim=0) if eval_ATE: if not args.use_bert_spc: label_ids = model.get_batch_token_labels_bert_base_indices( label_ids) ate_logits = torch.argmax(F.log_softmax(ate_logits, dim=2), dim=2) ate_logits = ate_logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_list): y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map.get(label_ids[i][j], 'O')) temp_2.append(label_map.get(ate_logits[i][j], 'O')) if eval_APC: test_acc = n_test_correct / n_test_total if args.dataset in {'camera', 'car', 'phone', 'notebook'}: test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(), test_polarities_all.cpu(), labels=[0, 1], average='macro') else: test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(), test_polarities_all.cpu(), labels=[0, 1, 2], average='macro') test_acc = round(test_acc * 100, 2) test_f1 = round(test_f1 * 100, 2) apc_result = { 'max_apc_test_acc': test_acc, 'max_apc_test_f1': test_f1 } if eval_ATE: report = classification_report(y_true, y_pred, digits=4) tmps = report.split() ate_result = round(float(tmps[7]) * 100, 2) return apc_result, ate_result
X_train = [sent2features(s) for s in data[996:]]
y_train = [sent2labels(s) for s in data[996:]]
X_test = [sent2features(s) for s in data[:996]]
y_test = [sent2labels(s) for s in data[:996]]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)

if __name__ == '__main__':
    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_test)

    y_p, y_t = [], []
    for i in range(len(y_pred)):
        for j in range(len(y_pred[i])):
            y_p.append(y_pred[i][j])
            y_t.append(y_test[i][j])

    print(metrics.flat_classification_report(y_test, y_pred, labels=corpus.labels))
    print(classification_report(y_t, y_p))
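# The snippet above prints both a token-level report (flat_classification_report)
# and a report over the flattened tag lists. A toy comparison with made-up tags
# (assuming sklearn-crfsuite and seqeval are installed) shows why token-level and
# entity-level scores can disagree:
from sklearn_crfsuite import metrics as crf_metrics
from seqeval.metrics import classification_report as entity_report

y_test_toy = [["B-PER", "I-PER", "O"]]
y_pred_toy = [["B-PER", "O", "O"]]  # second token of the PER entity is missed

# Token level: the B-PER token still counts as correct.
print(crf_metrics.flat_classification_report(y_test_toy, y_pred_toy, labels=["B-PER", "I-PER"]))

# Entity level: the PER span is not matched exactly, so PER precision/recall are 0.
print(entity_report(y_test_toy, y_pred_toy))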
def get_classification_report(self, index2label: Union[List[str], Dict[int, str]]):
    golds, preds = self._map_to_labels(index2label)
    cr = classification_report(golds, preds, digits=5)
    return report2dict(cr)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.", ) parser.add_argument( "--task_name", default=None, type=str, required=True, help="The name of the task to train.", ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval or not.") parser.add_argument( "--eval_on", default="dev", help="Whether to run eval on the dev set or test set.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--train_batch_size", default=32, type=int, help="Total batch size for training.", ) parser.add_argument( "--eval_batch_size", default=8, type=int, help="Total batch size for eval.", ) parser.add_argument( "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.", ) parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument( "--weight_decay", default=0.01, type=float, help="Weight deay if we apply some.", ) parser.add_argument( "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.", ) parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--no_cuda", action="store_true", help="Whether not to use CUDA when available", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--server_ip", type=str, default="", help="Can be used for distant debugging.", ) parser.add_argument( "--server_port", type=str, default="", help="Can be used for distant debugging.", ) args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = (args.train_batch_size // args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = (int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs) if args.local_rank != -1: num_train_optimization_steps = (num_train_optimization_steps // torch.distributed.get_world_size()) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab # Prepare model config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name) model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] 
warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, ) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps, ) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset( all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids, ) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = ( batch) loss = model( input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask, ) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = (model.module if hasattr(model, "module") else model ) # Only save the model it-self model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) label_map = {i: label for i, label in enumerate(label_list, 1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map, } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w"), ) # Load a trained model and config that you have fine-tuned else: # Load a trained model and vocabulary that you have fine-tuned model = Ner.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): if args.eval_on == "dev": eval_examples = processor.get_dev_examples(args.data_dir) elif args.eval_on == "test": eval_examples = processor.get_test_examples(args.data_dir) else: raise ValueError("eval on dev or test set only") eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset( all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids, ) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} for ( input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask, ) in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model( input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask, ) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = 
label_ids.to("cpu").numpy() input_mask = input_mask.to("cpu").numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map): y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) report = classification_report(y_true, y_pred, digits=4) logger.info("\n%s", report) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report)
async def accuracy(self, sources: Sources): if not os.path.isfile( os.path.join(self.parent.config.output_dir, "tf_model.h5") ): raise ModelNotTrained("Train model before assessing for accuracy.") config = self.parent.config._asdict() config["strategy"] = self.parent.config.strategy config["n_device"] = self.parent.config.n_device self.tokenizer = self.tokenizer_class.from_pretrained( config["output_dir"], do_lower_case=config["do_lower_case"] ) eval_batch_size = ( config["per_device_eval_batch_size"] * config["n_device"] ) data_df = await self._preprocess_data(sources) eval_dataset, num_eval_examples = self.get_dataset( data_df, self.tokenizer, self.pad_token_label_id, eval_batch_size, mode="accuracy", ) eval_dataset = self.parent.config.strategy.experimental_distribute_dataset( eval_dataset ) checkpoints = [] results = [] if config["eval_all_checkpoints"]: checkpoints = list( os.path.dirname(c) for c in sorted( pathlib( config["output_dir"] + "/**/" + TF2_WEIGHTS_NAME ).glob(recursive=True), key=lambda f: int("".join(filter(str.isdigit, f)) or -1), ) ) if len(checkpoints) == 0: checkpoints.append(config["output_dir"]) self.logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = ( checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final" ) with self.parent.config.strategy.scope(): self.model = self.model_class.from_pretrained(checkpoint) y_true, y_pred, eval_loss = self._custom_accuracy( eval_dataset, self.tokenizer, self.model, num_eval_examples, eval_batch_size, ) report = classification_report(y_true, y_pred, digits=4) if global_step: results.append( { global_step + "_report": report, global_step + "_loss": eval_loss, } ) output_eval_file = os.path.join( config["output_dir"], "accuracy_results.txt" ) # create the report and save in output_dir with self.tf.io.gfile.GFile(output_eval_file, "w") as writer: for res in results: for key, val in res.items(): if "loss" in key: self.logger.debug(key + " = " + str(val)) writer.write(key + " = " + str(val)) writer.write("\n") else: self.logger.debug(key) self.logger.debug("\n" + report) writer.write(key + "\n") writer.write(report) writer.write("\n") # Return accuracy for the last checkpoint return Accuracy(f1_score(y_true, y_pred))
def test(padded_X, X_lengths, padded_Y, model, batch_size, longest_sent, optimizer, label_map, device, upos=None, feats=None, fixes=None, results_dir=None, save_file=True, results_file="eval_results.txt"): y_corr_all = [] y_pred_all = [] for example_i in range(0, len(padded_X), batch_size): # TODO Erase this # If last batch size != 16 break if example_i + batch_size > len(padded_X): break X_ids = padded_X[example_i:min(example_i + batch_size, len(padded_X))] upos_ids, feats_ids, fixes_ids = None, None, None if upos is not None: upos_ids = upos[example_i:min(example_i + batch_size, len(upos))] if feats is not None: feats_ids = [ feat[example_i:min(example_i + batch_size, len(feat))] for feat in feats ] if fixes is not None: fixes_ids = fixes[example_i:min(example_i + batch_size, len(fixes))] X_leng = X_lengths[example_i:min(example_i + batch_size, len(X_lengths))] Y_ids = padded_Y[example_i:min(example_i + batch_size, len(padded_Y))] if upos is not None: if feats is not None: if fixes is not None: sorted_data = sorted(zip(X_leng, X_ids, Y_ids, upos_ids, feats_ids, fixes_ids), key=lambda pair: pair[0], reverse=True) X_leng, X_ids, Y_ids, upos_ids, feats_ids, fixes_ids = zip( *sorted_data) X_leng, X_ids, Y_ids, upos_ids, feats_ids, fixes_ids = list( X_leng), list(X_ids), list(Y_ids), list( upos_ids), list(feats_ids), list(fixes_ids) else: sorted_data = sorted(zip(X_leng, X_ids, Y_ids, upos_ids, *feats_ids), key=lambda pair: pair[0], reverse=True) X_leng, X_ids, Y_ids, upos_ids, *feats_ids = zip( *sorted_data) X_leng, X_ids, Y_ids, upos_ids = list(X_leng), list( X_ids), list(Y_ids), list(upos_ids) feats_ids = [list(feat_ids) for feat_ids in feats_ids] else: sorted_data = sorted(zip(X_leng, X_ids, Y_ids, upos_ids), key=lambda pair: pair[0], reverse=True) X_leng, X_ids, Y_ids, upos_ids = zip(*sorted_data) X_leng, X_ids, Y_ids, upos_ids = list(X_leng), list( X_ids), list(Y_ids), list(upos_ids) elif feats is not None: if fixes is not None: sorted_data = sorted(zip(X_leng, X_ids, Y_ids, feats_ids, fixes_ids), key=lambda pair: pair[0], reverse=True) X_leng, X_ids, Y_ids, feats_ids, fixes_ids = zip(*sorted_data) X_leng, X_ids, Y_ids, feats_ids, fixes_ids = list( X_leng), list(X_ids), list(Y_ids), list(feats_ids), list( fixes_ids) else: sorted_data = sorted(zip(X_leng, X_ids, Y_ids, *feats_ids), key=lambda pair: pair[0], reverse=True) X_leng, X_ids, Y_ids, *feats_ids = zip(*sorted_data) X_leng, X_ids, Y_ids, = list(X_leng), list(X_ids), list(Y_ids) feats_ids = [list(feat_ids) for feat_ids in feats_ids] elif fixes is not None: sorted_data = sorted(zip(X_leng, X_ids, Y_ids, upos_ids, feats_ids, fixes_ids), key=lambda pair: pair[0], reverse=True) X_leng, X_ids, Y_ids, fixes_ids = zip(*sorted_data) X_leng, X_ids, Y_ids, fixes_ids = list(X_leng), list(X_ids), list( Y_ids), list(fixes_ids) else: sorted_data = sorted(zip(X_leng, X_ids, Y_ids), key=lambda pair: pair[0], reverse=True) X_leng, X_ids, Y_ids = zip(*sorted_data) X_leng, X_ids, Y_ids = list(X_leng), list(X_ids), list(Y_ids) Y_ids = torch.tensor([index for exam in Y_ids for index in exam], dtype=torch.long) - 1 Y_ids = Y_ids.to(device) X_ids = torch.tensor(X_ids, dtype=torch.float32) X_ids = X_ids.to(device) if upos is not None: upos_ids = torch.tensor(upos_ids, dtype=torch.long) upos_ids = upos_ids.to(device) if feats is not None: for feat_i, feat_ids in enumerate(feats_ids): feat_ids = torch.tensor(feat_ids, dtype=torch.long) feats_ids[feat_i] = feat_ids.to(device) if fixes is not None: fixes_ids = torch.tensor(fixes_ids, dtype=torch.long) 
fixes_ids = fixes_ids.to(device) with torch.no_grad(): if fixes is not None: y_pred = model(X_ids, X_leng, upos_ids, feats_ids, fixes_ids) elif feats is not None: y_pred = model(X_ids, X_leng, upos_ids, feats_ids) elif upos is not None: y_pred = model(X_ids, X_leng, upos_ids) else: y_pred = model(X_ids, X_leng) y_pred = y_pred.detach().cpu() # reshape out_label_ids, create dict for mapping, map all out_label_ids to out_labels, stack them in one array, classification_report(Y_words, out_labels) y_pred_reshaped = torch.argmax(y_pred, dim=1) y_pred_reshaped = y_pred_reshaped.view(-1, longest_sent).numpy() y_corr_reshaped = Y_ids.view(-1, longest_sent).cpu().numpy() y_corr = [] y_pred_tags = [] for i_y in range(batch_size): y_corr_row = [] y_pred_tags_row = [] for j_y in range(X_leng[i_y]): y_corr_row.append(label_map[y_corr_reshaped[i_y][j_y]]) y_pred_tags_row.append(label_map[y_pred_reshaped[i_y][j_y]]) y_corr.append(y_corr_row) y_pred_tags.append(y_pred_tags_row) y_corr_all.extend(y_corr) y_pred_all.extend(y_pred_tags) optimizer.zero_grad() report = classification_report(y_corr_all, y_pred_all, digits=4) if save_file: if not os.path.exists(results_dir): os.mkdir(results_dir) output_eval_file = os.path.join(results_dir, results_file) with open(output_eval_file, "w") as writer: writer.write(report) print(report)
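The length-descending sort applied to every (X, Y, upos, feats, fixes) group above is the usual prerequisite for packed RNN inputs; the sketch below shows that idiom with a plain PyTorch call (the model class itself is not shown in this snippet, so treating the sort as packing preparation is an assumption).

import torch
from torch.nn.utils.rnn import pack_padded_sequence

batch = torch.zeros(3, 7, 5)   # (batch, max_len, feature_dim), already padded
lengths = [7, 4, 2]            # sorted longest-first, as the loop above arranges
packed = pack_padded_sequence(batch, lengths, batch_first=True)
# `packed` can be fed to an nn.LSTM so no computation is spent on padding positions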
def test_inv_classification_report(self): print( classification_report(self.y_true_inv, self.y_pred_inv, suffix=True))
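The suffix=True path exercised by this test expects the scheme letter at the end of the tag rather than at the front. A small illustration, assuming seqeval's suffix handling:

from seqeval.metrics import classification_report

y_true_inv = [["PER-B", "PER-I", "O"]]   # "PER-B" instead of "B-PER"
y_pred_inv = [["PER-B", "O", "O"]]
print(classification_report(y_true_inv, y_pred_inv, suffix=True))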
def evaluate(self, eval_dataset, output_dir): """ Evaluates the model on eval_dataset. Utility function to be used by the eval_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args pad_token_label_id = self.pad_token_label_id eval_output_dir = output_dir results = {} eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, disable=args["silent"]): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], } # XLM and RoBERTa don"t use segment_ids if args["model_type"] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps model_outputs = preds preds = np.argmax(preds, axis=2) label_map = {i: label for i, label in enumerate(self.labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != pad_token_label_id: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) result = { "eval_loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1_score": f1_score(out_label_list, preds_list), } results.update(result) output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: if args["classification_report"]: cls_report = classification_report(out_label_list, preds_list) writer.write("{}\n".format(cls_report)) for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) return results, model_outputs, preds_list
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--test_file", default='', type=str, help="Test file") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--validate_per_epoch", default=3, type=int, help="validations number per epoch") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_validation", action='store_true', help="Whether to run validation.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--drop", default=0.1, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_tag_labels(args.data_dir) global EVAL_TAGS EVAL_TAGS = [ label for label in label_list if label not in ['O', '[CLS]', '[SEP]'] ] # EVAL_TAGS = [f'{x}-{y}' for x in ['B', 'I'] for y in EVAL_TAGS] logger.info(EVAL_TAGS) num_labels = len(label_list) + 1 allowed_tags = set(EVAL_TAGS + ['O']) do_lower_case = 'uncased' in args.bert_model tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=do_lower_case) train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab # Prepare model config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name, hidden_dropout_prob=args.drop) print(config) model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} best_dev = 0.0 if args.do_validation: dev_examples = processor.get_dev_examples(args.data_dir, label_list) dev_features = convert_examples_to_features(dev_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Dev set *****") logger.info(" Num examples = %d", len(dev_examples)) all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in dev_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in dev_features], dtype=torch.long) dev_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) dev_sampler = SequentialSampler(dev_data) dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.eval_batch_size) validation_steps = int( len(train_examples) / args.train_batch_size) // args.validate_per_epoch if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() start_time = time.time() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask, device=device) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.do_validation and (step + 1) % validation_steps == 0: logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_dataloader), time.time() - start_time, tr_loss / nb_tr_steps)) model.eval() y_true = [] y_pred = [] label_map = { i: label for i, label in enumerate(label_list, 1) } label_map[0] = '[PAD]' for batch in tqdm(dev_dataloader, desc='Validation'): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, None, valid_ids, l_mask, device=device) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() # input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map) - 1: y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) y_true_copy = [[ x if x in allowed_tags else 'O' for x in y ] for y in y_true] y_pred_copy = [[ x if x in allowed_tags else 'O' for x in y ] for y in y_pred] report = classification_report(y_true_copy, y_pred_copy, digits=6) # report_dict = classification_report(y_true_copy, # y_pred_copy, # output_dict=True) # report_dict = report logger.info("***** Validation results *****") logger.info("\n%s", report) fscore = float([ line.strip().split()[4] for line in report.split('\n') if line.strip().startswith('micro') ][0]) if fscore > best_dev: logger.info(f'!!!Best dev: {fscore}') logger.info(f'at epoch: {epoch}') best_dev = fscore model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) label_map = { i: label for i, label in enumerate(label_list, 1) } label_map[0] = '[PAD]' model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open( os.path.join(args.output_dir, "model_config.json"), "w")) model.train() model = Ner.from_pretrained(args.output_dir) do_lower_case = 'uncased' in args.bert_model tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): args.test_file = os.path.join( args.data_dir, 'test.json') if args.test_file == '' else args.test_file eval_examples = processor.get_test_examples(args.test_file) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) 
logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] tag_scores = [] label_map = {i: label for i, label in enumerate(label_list, 1)} label_map[0] = '[PAD]' for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask, device=device) scores = np.max(F.softmax(logits, dim=-1).cpu().numpy(), axis=-1) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() # scores = scores.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] temp_3 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map) - 1: y_true.append(temp_1) y_pred.append(temp_2) tag_scores.append(temp_3) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) temp_3.append(scores[i][j]) y_true_copy = [[x if x in allowed_tags else 'O' for x in y] for y in y_true] y_pred_copy = [[x if x in allowed_tags else 'O' for x in y] for y in y_pred] # report = classification_report(y_true_copy, # y_pred_copy, digits=4) report = 'all scores are 0!\n' logger.info("\n%s", report) output_eval_file = os.path.join( args.output_dir, f"{args.test_file.split('/')[-1]}_eval_results.txt") output_preds_file = os.path.join( args.output_dir, f"{args.test_file.split('/')[-1]}_predictions.tsv") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report) prediction_results = { 'id': [ex.guid for ex in eval_examples], 'token': [ex.text_a for ex in eval_examples], 'tag_label': [' '.join(ex.label) for ex in eval_examples], 'tag_pred': [' '.join(pred) for pred in y_pred], 'scores': [' '.join([str(x) for x in score]) for score in tag_scores] } pd.DataFrame(prediction_results).to_csv(output_preds_file, sep='\t', index=False)
def main(): args: ModelArguments = get_arguments() process = NerProcessor(args.data_dir) label_list = process.get_labels() num_labels = len(label_list) + 1 if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( 'Output directory ({}) already exists and is not empty.'.format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: tokenizer = FullTokenizer(os.path.join(args.bert_model, "vocab.txt"), args.do_lower_case) if args.multi_gpu: if len(args.gpu.split(',')) == 1: strategy = tf.distribute.MirroredStrategy() else: # build the gpu device name list gpus = [f'/gpu:{gpu}' for gpu in args.gpu.split(',')] strategy = tf.distribute.MirroredStrategy(devices=gpus) else: strategy = tf.distribute.OneDeviceStrategy(device=args.gpu) if args.do_train: train_examples = process.get_train_examples() # total optimization steps drive the learning rate schedule, weight decay and warmup num_train_optimization_steps = int( len(train_examples) / args.train_batch_size) * args.num_train_epochs warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) # the learning rate should decay to zero by the end of training learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( initial_learning_rate=args.learning_rate, decay_steps=num_train_optimization_steps, end_learning_rate=0.) # layer norm and bias should not be weight decayed optimizer = AdamWeightDecay( learning_rate=learning_rate_fn, weight_decay_rate=args.weight_decay, beta_1=0.9, beta_2=0.99, epsilon=args.adam_epsilon, exclude_from_weight_decay=['layer_norm', 'bias']) with strategy.scope(): ner = BertNer(args.bert_model, tf.float32, args.num_labels, args.max_seq_length) # keep per-token losses; the reduction is applied explicitly in the train step loss_fct = tf.keras.losses.SparseCategoricalCrossentropy( reduction=tf.keras.losses.Reduction.NONE) label_map = {label: index for index, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info('*** Running training ***') logger.info(' Num Examples = %d', len(train_examples)) logger.info(' Batch Size = %d', args.train_batch_size) logger.info(' Num Steps = %d', num_train_optimization_steps) all_input_ids = tf.data.Dataset.from_tensor_slices( np.asarray([f.input_ids for f in train_features])) all_input_mask = tf.data.Dataset.from_tensor_slices( np.asarray([f.input_mask for f in train_features])) all_label_ids = tf.data.Dataset.from_tensor_slices( np.asarray([f.label_ids for f in train_features])) all_label_mask = tf.data.Dataset.from_tensor_slices( np.asarray([f.label_mask for f in train_features])) all_valid_ids = tf.data.Dataset.from_tensor_slices( np.asarray([f.valid_ids for f in train_features])) all_segment_ids = tf.data.Dataset.from_tensor_slices( np.asarray([f.segment_ids for f in train_features])) train_data = tf.data.Dataset.zip( (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids, all_label_ids, all_label_mask)) # set the shuffle buffer size, reshuffle the train data in each iteration shuffled_train_data = train_data.shuffle( buffer_size=int(len(train_features) * 0.1), seed=args.seed, reshuffle_each_iteration=True).batch(args.train_batch_size) distributed_data = strategy.experimental_distribute_dataset( shuffled_train_data) loss_metric = tf.keras.metrics.Mean() epoch_bar = master_bar(range(1)) def train_steps(input_ids, input_mask, segment_id, valid_ids, label_ids,
label_mask): def step_fn(_input_ids, _input_mask, _segment_id, _valid_ids, _label_ids, _label_mask): with tf.GradientTape() as tape: # forward pass on this replica's shard of the batch output = ner(_input_ids, _input_mask, _segment_id, _label_ids, training=True) # flatten the outputs and the label mask, then keep only labelled positions _label_mask = tf.reshape(_label_mask, (-1, )) output = tf.reshape(output, (-1, num_labels)) output = tf.boolean_mask(output, _label_mask) _label_ids = tf.reshape(_label_ids, (-1, )) _label_ids = tf.boolean_mask(_label_ids, _label_mask) cross_entropy = loss_fct(_label_ids, output) # scale the summed token loss to a per-example loss over the global batch loss = tf.reduce_sum( cross_entropy) * 1. / args.train_batch_size gradients = tape.gradient(loss, ner.trainable_variables) optimizer.apply_gradients( grads_and_vars=zip(gradients, ner.trainable_variables)) return cross_entropy # run the training step on all GPUs in parallel per_example_loss = strategy.experimental_run_v2( step_fn, args=(input_ids, input_mask, segment_id, valid_ids, label_ids, label_mask)) mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_loss, axis=0) return mean_loss pb_max_length = math.ceil(len(train_features) / args.train_batch_size) for epoch in epoch_bar: with strategy.scope(): for (input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask) in progress_bar(distributed_data, total=pb_max_length, parent=epoch_bar): loss = train_steps(input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask) loss_metric(loss) epoch_bar.child.comment = f'loss: {loss}' loss_metric.reset_states() ner.save_weights(os.path.join(args.output_dir, 'model.h5')) if args.do_eval: tokenizer = FullTokenizer(os.path.join(args.bert_model, 'vocab.txt'), do_lower_case=args.do_lower_case) ner = BertNer(args.bert_model, tf.float32, args.num_labels, args.max_seq_length) # run one dummy batch so the model variables are built before loading weights ids = tf.ones((1, 128), dtype=tf.float32) ner(ids, ids, ids, ids, ids, training=False) ner.load_weights(os.path.join(args.output_dir, 'model.h5')) # load the data if args.eval_on == 'dev': eval_examples = process.get_dev_examples() elif args.eval_on == 'test': eval_examples = process.get_test_examples() else: raise KeyError('eval_on argument is expected to be one of [dev, test]') eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) # print the eval info logger.info('*** Eval Examples ***') logger.info(' Num Examples = %d', len(eval_features)) logger.info(' Batch Size = %d', args.eval_batch_size) all_input_ids = tf.data.Dataset.from_tensor_slices( [f.input_ids for f in eval_features]) all_input_mask = tf.data.Dataset.from_tensor_slices( [f.input_mask for f in eval_features]) all_segment_ids = tf.data.Dataset.from_tensor_slices( [f.segment_ids for f in eval_features]) all_valid_ids = tf.data.Dataset.from_tensor_slices( [f.valid_ids for f in eval_features]) all_label_ids = tf.data.Dataset.from_tensor_slices( [f.label_ids for f in eval_features]) all_label_mask = tf.data.Dataset.from_tensor_slices( [f.label_mask for f in eval_features]) eval_data = tf.data.Dataset.zip( (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids, all_label_ids, all_label_mask)).batch(args.eval_batch_size) loss_metric = tf.metrics.Mean() epoch_bar = master_bar(range(1)) processor_bar_length = math.ceil( len(eval_features) / args.eval_batch_size) y_true, y_predict = [], [] for epoch in epoch_bar: for (input_ids, input_mask, segment_ids, valid_ids, label_ids, label_mask) in progress_bar(eval_data, total=processor_bar_length, parent=epoch_bar): logits = ner(input_ids, input_mask, segment_ids, valid_ids,
training=False) logits = tf.argmax(logits, axis=-1) label_predict = tf.boolean_mask(logits, label_mask) y_true.append(label_ids) y_predict.append(label_predict) report = classification_report(y_true, y_predict, digits=4) output_eval_file = os.path.join(args.output_dir, 'eval_result.txt') with open(output_eval_file, 'w', encoding='utf-8') as f: logger.info('*** Eval Result ***') logger.info(report) f.write(report)
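One caveat worth noting about the loop above: classification_report is entity-aware only when it receives tag strings, so id tensors are normally decoded through the label map before scoring. A hedged, self-contained sketch with an illustrative mapping (not taken from the script):

id2label = {1: "O", 2: "B-PER", 3: "I-PER"}            # illustrative mapping
ids = [[1, 2, 3], [1, 1, 2]]                           # per-sentence label ids
tags = [[id2label[i] for i in sent] for sent in ids]   # lists of tag strings for the report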
def evaluate(opt): # set config config = load_config(opt) if opt.num_threads > 0: torch.set_num_threads(opt.num_threads) config['opt'] = opt logger.info("%s", config) # set path set_path(config) # prepare test dataset test_loader = prepare_datasets(config) # load pytorch model checkpoint checkpoint = load_checkpoint(config) # prepare model and load parameters model = load_model(config, checkpoint) model.eval() # convert to onnx format if opt.convert_onnx: (x, y) = next(iter(test_loader)) x = to_device(x, opt.device) y = to_device(y, opt.device) convert_onnx(config, model, x) check_onnx(config) logger.info("[ONNX model saved at {}".format(opt.onnx_path)) # quantize onnx if opt.quantize_onnx: quantize_onnx(opt.onnx_path, opt.quantized_onnx_path) logger.info("[Quantized ONNX model saved at {}".format( opt.quantized_onnx_path)) return # load onnx model for using onnxruntime if opt.enable_ort: import onnxruntime as ort sess_options = ort.SessionOptions() sess_options.inter_op_num_threads = opt.num_threads sess_options.intra_op_num_threads = opt.num_threads ort_session = ort.InferenceSession(opt.onnx_path, sess_options=sess_options) # enable to use dynamic quantized model (pytorch>=1.3.0) if opt.enable_dqm and opt.device == 'cpu': model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) print(model) # evaluation preds = None ys = None n_batches = len(test_loader) total_examples = 0 whole_st_time = time.time() first_time = time.time() first_examples = 0 total_duration_time = 0.0 with torch.no_grad(): for i, (x, y) in enumerate(tqdm(test_loader, total=n_batches)): start_time = time.time() x = to_device(x, opt.device) y = to_device(y, opt.device) if opt.enable_ort: x = to_numpy(x) if config['emb_class'] == 'glove': ort_inputs = { ort_session.get_inputs()[0].name: x[0], ort_session.get_inputs()[1].name: x[1] } if opt.use_char_cnn: ort_inputs[ort_session.get_inputs()[2].name] = x[2] if config['emb_class'] in [ 'bert', 'distilbert', 'albert', 'roberta', 'bart', 'electra' ]: if config['emb_class'] in ['distilbert', 'bart']: ort_inputs = { ort_session.get_inputs()[0].name: x[0], ort_session.get_inputs()[1].name: x[1] } else: ort_inputs = { ort_session.get_inputs()[0].name: x[0], ort_session.get_inputs()[1].name: x[1], ort_session.get_inputs()[2].name: x[2] } if opt.bert_use_pos: ort_inputs[ort_session.get_inputs()[3].name] = x[3] if opt.use_crf: logits, prediction = ort_session.run(None, ort_inputs) prediction = to_device(torch.tensor(prediction), opt.device) logits = to_device(torch.tensor(logits), opt.device) else: logits = ort_session.run(None, ort_inputs)[0] logits = to_device(torch.tensor(logits), opt.device) else: if opt.use_crf: logits, prediction = model(x) else: logits = model(x) if preds is None: if opt.use_crf: preds = to_numpy(prediction) else: preds = to_numpy(logits) ys = to_numpy(y) else: if opt.use_crf: preds = np.append(preds, to_numpy(prediction), axis=0) else: preds = np.append(preds, to_numpy(logits), axis=0) ys = np.append(ys, to_numpy(y), axis=0) cur_examples = y.size(0) total_examples += cur_examples if i == 0: # first one may take longer time, so ignore in computing duration. 
first_time = float((time.time() - first_time) * 1000) first_examples = cur_examples if opt.num_examples != 0 and total_examples >= opt.num_examples: logger.info("[Stop Evaluation] : up to the {} examples".format( total_examples)) break duration_time = float((time.time() - start_time) * 1000) if i != 0: total_duration_time += duration_time ''' logger.info("[Elapsed Time] : {}ms".format(duration_time)) ''' whole_time = float((time.time() - whole_st_time) * 1000) avg_time = (whole_time - first_time) / (total_examples - first_examples) if not opt.use_crf: preds = np.argmax(preds, axis=2) # compute measure using seqeval labels = model.labels ys_lbs = [[] for _ in range(ys.shape[0])] preds_lbs = [[] for _ in range(ys.shape[0])] pad_label_id = config['pad_label_id'] for i in range(ys.shape[0]): # foreach sentence for j in range(ys.shape[1]): # foreach token if ys[i][j] != pad_label_id: ys_lbs[i].append(labels[ys[i][j]]) preds_lbs[i].append(labels[preds[i][j]]) ret = { "precision": precision_score(ys_lbs, preds_lbs), "recall": recall_score(ys_lbs, preds_lbs), "f1": f1_score(ys_lbs, preds_lbs), "report": classification_report(ys_lbs, preds_lbs, digits=4), } print(ret['report']) f1 = ret['f1'] # write predicted labels to file default_label = config['default_label'] write_prediction(opt, ys, preds, labels, pad_label_id, default_label) logger.info("[F1] : {}, {}".format(f1, total_examples)) logger.info("[Elapsed Time] : {} examples, {}ms, {}ms on average".format( total_examples, whole_time, avg_time)) logger.info( "[Elapsed Time(total_duration_time, average)] : {}ms, {}ms".format( total_duration_time, total_duration_time / (total_examples - 1)))
def report(labels, preds): return classification_report(labels, preds)
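A usage example for this thin wrapper, with hypothetical tag sequences:

labels = [["O", "B-ORG", "I-ORG", "O"]]
preds = [["O", "B-ORG", "O", "O"]]
print(report(labels, preds))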
with torch.no_grad(): tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # print(np.argmax(logits, axis=2).shape) predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) true_labels.extend(label_ids) tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += b_input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps print("Validation loss: {}".format(eval_loss)) print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps)) pred_tags = [[id2label[p_i] for p_i in p] for p in predictions] valid_tags = [[id2label[l_i] for l_i in l] for l in true_labels] with open("logs/logs_epoch_{}.txt".format(epoch), "w") as f: for tokens, pred, valid in zip(tokenized_test_text, pred_tags, valid_tags): f.write(" ".join(tokens) + "\n") f.write(" ".join(pred) + "\n") f.write(" ".join(valid) + "\n\n") print("F1-Score: {}".format(f1_score(valid_tags, pred_tags))) print(classification_report(valid_tags, pred_tags))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--percent", default=100, type=int, help="The percentage of examples used in the training data.\n") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--pretrain', action='store_true', help="Whether to load a pre-trained model for continuing training") parser.add_argument('--pretrained_model_file', type=str, help="The path of the pretrained_model_file") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) train_examples = train_examples[:int( len(train_examples) * args.percent / 100)] num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.pretrain: # Load a pre-trained model print('load a pre-trained model from ' + args.pretrained_model_file) pretrained_state_dict = torch.load(args.pretrained_model_file) model_state_dict = model.state_dict() print('pretrained_state_dict', pretrained_state_dict.keys()) print('model_state_dict', model_state_dict.keys()) pretrained_state = { k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.size() == model_state_dict[k].size() } model_state_dict.update(pretrained_state) 
print('updated_state_dict', model_state_dict.keys()) model.load_state_dict(model_state_dict) model.to(device) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 print('train loss', tr_loss) # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) label_map = {i: label for i, label in enumerate(label_list, 0)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) model_state_dict = torch.load(output_model_file) model = BertForTokenClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 0)} for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) logits = torch.argmax(logits, dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, mask in enumerate(input_mask): temp_1 = [] temp_2 = [] for j, m in enumerate(mask): if j == 0: continue if m: if label_map[label_ids[i][j]] != "X": 
temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) else: temp_1.pop() temp_2.pop() break if temp_1[-1] == '[SEP]': temp_1.pop() temp_2.pop() y_true.append(temp_1) y_pred.append(temp_2) report = classification_report(y_true, y_pred, digits=4) prediction_file = os.path.join(args.output_dir, 'predictions.txt') write_predictions(eval_examples, y_true, y_pred, prediction_file) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report)
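The evaluation loop above skips positions whose gold label is "X" and strips the trailing [SEP]; that convention stems from WordPiece sub-tokenization, where only the first piece of each word keeps the real tag. A small illustration with hypothetical tokens:

tokens = ["Washing", "##ton", "visited", "Paris"]
labels = ["B-PER", "X", "O", "B-LOC"]
kept = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "X"]
# -> [("Washing", "B-PER"), ("visited", "O"), ("Paris", "B-LOC")]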
def main(_): logging.set_verbosity(logging.INFO) args = flags.FLAGS.flag_values_dict() if (os.path.exists(args["output_dir"]) and os.listdir(args["output_dir"]) and args["do_train"] and not args["overwrite_output_dir"]): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args["output_dir"])) if args["fp16"]: tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": True}) if args["tpu"]: resolver = tf.distribute.cluster_resolver.TPUClusterResolver( tpu=args["tpu"]) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.experimental.TPUStrategy(resolver) args["n_device"] = args["num_tpu_cores"] elif len(args["gpus"].split(",")) > 1: args["n_device"] = len( [f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) strategy = tf.distribute.MirroredStrategy( devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) elif args["no_cuda"]: args["n_device"] = 1 strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: args["n_device"] = len(args["gpus"].split(",")) strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0]) logging.warning( "n_device: %s, distributed training: %s, 16-bits training: %s", args["n_device"], bool(args["n_device"] > 1), args["fp16"], ) labels = get_labels(args["labels"]) num_labels = len(labels) pad_token_label_id = -1 # IBO print(args["config_name"] if args["config_name"] else args["model_name_or_path"]) config = AutoConfig.from_pretrained( args["config_name"] if args["config_name"] else args["model_name_or_path"], num_labels=num_labels, cache_dir=args["cache_dir"], ) logging.info("Training/evaluation parameters %s", args) args["model_type"] = config.model_type # Training if args["do_train"]: tokenizer = AutoTokenizer.from_pretrained( args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"], do_lower_case=args["do_lower_case"], cache_dir=args["cache_dir"], ) with strategy.scope(): model = TFAutoModelForTokenClassification.from_pretrained( args["model_name_or_path"], from_pt=bool(".bin" in args["model_name_or_path"]), config=config, cache_dir=args["cache_dir"], ) train_batch_size = args["per_device_train_batch_size"] * args[ "n_device"] train_dataset, num_train_examples = load_and_cache_examples( args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train") train_dataset = strategy.experimental_distribute_dataset(train_dataset) train( args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id, ) os.makedirs(args["output_dir"], exist_ok=True) logging.info("Saving model to %s", args["output_dir"]) model.save_pretrained(args["output_dir"]) tokenizer.save_pretrained(args["output_dir"]) # Evaluation if args["do_eval"]: tokenizer = AutoTokenizer.from_pretrained( args["output_dir"], do_lower_case=args["do_lower_case"]) checkpoints = [] results = [] if args["eval_all_checkpoints"]: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int("".join(filter(str.isdigit, f)) or -1), )) logging.info("Evaluate the following checkpoints: %s", checkpoints) if len(checkpoints) == 0: checkpoints.append(args["output_dir"]) for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if re.match( ".*checkpoint-[0-9]", checkpoint) else "final" with strategy.scope(): model = 
TFAutoModelForTokenClassification.from_pretrained( checkpoint) y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev") report = metrics.classification_report(y_true, y_pred, digits=4) if global_step: results.append({ global_step + "_report": report, global_step + "_loss": eval_loss }) output_eval_file = os.path.join(args["output_dir"], "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: for res in results: for key, val in res.items(): if "loss" in key: logging.info(key + " = " + str(val)) writer.write(key + " = " + str(val)) writer.write("\n") else: logging.info(key) logging.info("\n" + report) writer.write(key + "\n") writer.write(report) writer.write("\n") if args["do_predict"]: tokenizer = AutoTokenizer.from_pretrained( args["output_dir"], do_lower_case=args["do_lower_case"]) model = TFAutoModelForTokenClassification.from_pretrained( args["output_dir"]) eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"] predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test") y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test") output_test_results_file = os.path.join(args["output_dir"], "test_results.txt") output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt") report = metrics.classification_report(y_true, y_pred, digits=4) with tf.io.gfile.GFile(output_test_results_file, "w") as writer: report = metrics.classification_report(y_true, y_pred, digits=4) logging.info("\n" + report) writer.write(report) writer.write("\n\nloss = " + str(pred_loss)) with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer: with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith( "-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not y_pred[example_id]: example_id += 1 elif y_pred[example_id]: output_line = line.split( )[0] + " " + y_pred[example_id].pop(0) + "\n" writer.write(output_line) else: logging.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""): eval_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode=mode) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=None, ) # Eval! logger.info("***** Running evaluation %s *****", prefix) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): with torch.no_grad(): inputs = { "input_ids": batch[0].to(args.device), "attention_mask": batch[1].to(args.device), "labels": batch[3].to(args.device), } if args.model_type in ["layoutlm"]: inputs["bbox"] = batch[4].to(args.device) inputs["token_type_ids"] = ( batch[2].to(args.device) if args.model_type in ["bert", "layoutlm"] else None ) # RoBERTa don"t use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] if args.n_gpu > 1: tmp_eval_loss = ( tmp_eval_loss.mean() ) # mean() to average on multi-gpu parallel evaluating eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 ) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=2) label_map = {i: label for i, label in enumerate(labels)} out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] for i in range(out_label_ids.shape[0]): for j in range(out_label_ids.shape[1]): if out_label_ids[i, j] != pad_token_label_id: out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) results = { "loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), } report = classification_report(out_label_list, preds_list) logger.info("\n" + report) logger.info("***** Eval results %s *****", prefix) for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) return results, preds_list