def valid_epoch(self, data):
    pbar = ProgressBar(n_total=len(data))
    self.epoch_reset()
    self.model.eval()
    with torch.no_grad():
        for step, batch in enumerate(data):
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            logits = self.model(input_ids, input_mask, segment_ids)
            self.outputs.append(logits.cpu().detach())
            self.targets.append(label_ids.cpu().detach())
            pbar.batch_step(step=step, info={}, bar_type='Evaluating')
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    loss = self.criterion(target=self.targets, output=self.outputs)
    self.result['valid_loss'] = loss.item()
    print("------------- valid result --------------")
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'valid_{metric.name()}'] = value
    if 'cuda' in str(self.device):
        torch.cuda.empty_cache()
    return self.result
def create_training_instances(input_file, tokenizer, max_seq_len, short_seq_prob,
                              max_ngram, masked_lm_prob, max_predictions_per_seq):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]
    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    #     entire paragraphs or arbitrary spans of text, because we use the
    #     sentence boundaries for the "next sentence prediction" task.
    # (2) Blank lines between documents. Document boundaries are needed so
    #     that the "next sentence prediction" task doesn't span between documents.
    with open(input_file, 'r') as f:
        lines = f.readlines()
    pbar = ProgressBar(n_total=len(lines), desc='read data')
    for line_cnt, line in enumerate(lines):
        line = line.strip()
        # Empty lines are used as document delimiters
        if not line:
            all_documents.append([])
        tokens = tokenizer.tokenize(line)
        if tokens:
            all_documents[-1].append(tokens)
        pbar(step=line_cnt)
    print(' ')
    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    random.shuffle(all_documents)
    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    pbar = ProgressBar(n_total=len(all_documents), desc='create instances')
    for document_index in range(len(all_documents)):
        instances.extend(
            create_instances_from_document(all_documents, document_index,
                                           max_seq_len, short_seq_prob, max_ngram,
                                           masked_lm_prob, max_predictions_per_seq,
                                           vocab_words))
        pbar(step=document_index)
    print(' ')
    # Log a few instances for sanity checking (guard against < 5 instances)
    for ex_idx in range(min(5, len(instances))):
        instance = instances[ex_idx]
        logger.info("-------------------------Example-----------------------")
        logger.info(f"id: {ex_idx}")
        logger.info(f"tokens: {' '.join([str(x) for x in instance['tokens']])}")
        logger.info(f"masked_lm_labels: {' '.join([str(x) for x in instance['masked_lm_labels']])}")
        logger.info(f"segment_ids: {' '.join([str(x) for x in instance['segment_ids']])}")
        logger.info(f"masked_lm_positions: {' '.join([str(x) for x in instance['masked_lm_positions']])}")
        logger.info(f"is_random_next: {instance['is_random_next']}")
    random.shuffle(instances)
    return instances
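# Usage sketch (an illustration, not part of this module): `create_training_instances`
# expects a BERT-style wordpiece tokenizer and a corpus file with one sentence per
# line and blank lines between documents. The parameter values below are assumptions:
#
#   tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
#   instances = create_training_instances(
#       input_file='corpus.txt', tokenizer=tokenizer, max_seq_len=128,
#       short_seq_prob=0.1, max_ngram=3, masked_lm_prob=0.15,
#       max_predictions_per_seq=20)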
def create_examples(self, lines, example_type, cached_examples_file):
    '''Creates examples for data.'''
    pbar = ProgressBar(n_total=len(lines))
    if cached_examples_file.exists():
        logger.info("Loading examples from cached file %s", cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        examples = []
        for i, line in enumerate(lines):
            guid = '%s-%d' % (example_type, i)
            text_a = line[0]
            label = line[1]
            # `np.float` was deprecated in NumPy 1.20 and later removed;
            # the builtin `float` is equivalent here.
            if isinstance(label, str):
                label = [float(x) for x in label.split(",")]
            else:
                label = [float(x) for x in list(label)]
            text_b = None
            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)
            pbar.batch_step(step=i, info={}, bar_type='create examples')
        logger.info("Saving examples into cached file %s", cached_examples_file)
        torch.save(examples, cached_examples_file)
    return examples
def predict(self, data, thresh):
    pbar = ProgressBar(n_total=len(data))
    all_logits = None
    y_true = None
    self.model.eval()
    with torch.no_grad():
        for step, batch in enumerate(data):
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            if y_true is None:
                y_true = label_ids.detach().cpu().numpy()
            else:
                y_true = np.concatenate(
                    [y_true, label_ids.detach().cpu().numpy()], axis=0)
            logits = self.model(input_ids, segment_ids, input_mask)
            logits = logits.sigmoid()
            if all_logits is None:
                all_logits = logits.detach().cpu().numpy()
            else:
                all_logits = np.concatenate(
                    [all_logits, logits.detach().cpu().numpy()], axis=0)
            pbar.batch_step(step=step, info={}, bar_type='Testing')
    y_pred = (all_logits > thresh) * 1
    micro = f1_score(y_true, y_pred, average='micro')
    macro = f1_score(y_true, y_pred, average='macro')
    score = (micro + macro) / 2
    self.logger.info("\nScore: micro {}, macro {} Average {}".format(micro, macro, score))
    if 'cuda' in str(self.device):
        torch.cuda.empty_cache()
    return all_logits, y_pred
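# A minimal sketch of how a global `thresh` might be chosen on a validation split
# before calling `predict` (a hypothetical helper, not part of this trainer; it
# assumes `logits` are sigmoid outputs as above and reuses sklearn's f1_score):
#
#   def search_threshold(logits, y_true, grid=np.arange(0.1, 0.9, 0.05)):
#       # Sweep cutoffs and keep the one maximizing the same score predict reports.
#       best_t, best_score = 0.5, -1.0
#       for t in grid:
#           y_pred = (logits > t) * 1
#           score = (f1_score(y_true, y_pred, average='micro')
#                    + f1_score(y_true, y_pred, average='macro')) / 2
#           if score > best_score:
#               best_t, best_score = t, score
#       return best_t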
def train_epoch(self, data):
    pbar = ProgressBar(n_total=len(data))
    tr_loss = AverageMeter()
    self.epoch_reset()
    for step, batch in enumerate(data):
        self.batch_reset()
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = self.model(input_ids, input_mask, segment_ids)
        loss = self.criterion(output=logits, target=label_ids)
        if len(self.n_gpu) >= 2:
            loss = loss.mean()
        if self.gradient_accumulation_steps > 1:
            loss = loss / self.gradient_accumulation_steps
        if self.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            clip_grad_norm_(amp.master_params(self.optimizer), self.grad_clip)
        else:
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.grad_clip)
        if (step + 1) % self.gradient_accumulation_steps == 0:
            # In PyTorch >= 1.1 the optimizer step must precede the scheduler step,
            # otherwise the first learning-rate value of the schedule is skipped.
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.zero_grad()
            self.global_step += 1
        if self.batch_metrics:
            for metric in self.batch_metrics:
                metric(logits=logits, target=label_ids)
                self.info[metric.name()] = metric.value()
        self.info['loss'] = loss.item()
        tr_loss.update(loss.item(), n=1)
        if self.verbose >= 1:
            pbar.batch_step(step=step, info=self.info, bar_type='Training')
        self.outputs.append(logits.cpu().detach())
        self.targets.append(label_ids.cpu().detach())
    print("\n------------- train result --------------")
    # epoch metrics
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    self.result['loss'] = tr_loss.avg
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'{metric.name()}'] = value
    if "cuda" in str(self.device):
        torch.cuda.empty_cache()
    return self.result
def create_examples(self, lines, example_type, cached_examples_file):
    '''Creates examples for data.'''
    pbar = ProgressBar(n_total=len(lines), desc='create examples')
    if cached_examples_file.exists():
        logger.info("Loading examples from cached file %s", cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        examples = []
        for i, line in enumerate(lines):
            guid = '%s-%d' % (example_type, i)
            text_a = line[0]
            text_b = line[1]
            label = int(line[2])
            example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
            examples.append(example)
            pbar(step=i)
        logger.info("Saving examples into cached file %s", cached_examples_file)
        torch.save(examples, cached_examples_file)
    return examples
def take_eval_steps(args, model, tokenizer, prune, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    results = {}
    for eval_task in eval_task_names:
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev')
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size, collate_fn=collate_fn)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        model = model.model
        model.eval()
        pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
        for step, batch in enumerate(eval_dataloader):
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids,
                                          inputs['labels'].detach().cpu().numpy(), axis=0)
            pbar(step)
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
    return results
def read_data_and_create_examples(self, example_type, cached_examples_file, input_file):
    if cached_examples_file.exists():
        logger.info("Loading examples from cached file %s", cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        # create examples
        df_dataset = pd.read_csv(input_file).fillna("")
        pbar = ProgressBar(n_total=len(df_dataset), desc='create examples')
        examples = []
        for i, row in df_dataset.iterrows():
            guid = '%s-%d' % (example_type, i)
            seq_id = row["id"]
            text_a = row["title"]
            text_b = row["content"]
            label = int(row["label"])
            example = InputExample(guid=guid, seq_id=seq_id, text_a=text_a,
                                   text_b=text_b, label=label)
            examples.append(example)
            pbar(step=i)
        logger.info("Saving examples into cached file %s", cached_examples_file)
        torch.save(examples, cached_examples_file)
    return examples
def evaluate(args, model, eval_dataloader, metrics):
    # Eval!
    logger.info("  Num examples = %d", len(eval_dataloader))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = AverageMeter()
    metrics.reset()
    preds = []
    targets = []
    pbar = ProgressBar(n_total=len(eval_dataloader), desc='Evaluating')
    for bid, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss, logits = outputs[:2]
            eval_loss.update(loss.item(), n=batch[0].size()[0])
            preds.append(logits.cpu().detach())
            targets.append(inputs['labels'].cpu().detach())
        pbar(bid)
    preds = torch.cat(preds, dim=0).cpu().detach()
    targets = torch.cat(targets, dim=0).cpu().detach()
    metrics(preds, targets)
    eval_log = {"eval_acc": metrics.value(), 'eval_loss': eval_loss.avg}
    return eval_log
def predict(args, model, tokenizer, prefix=""):
    '''Model prediction.'''
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                 batch_size=1, collate_fn=collate_fn)
    # Predict!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_submit_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
            preds = logits.detach().cpu().numpy()
            preds = np.argmax(preds, axis=2).tolist()
            preds = preds[0][1:-1]  # strip [CLS] and [SEP]
        tags = [args.id2label[x] for x in preds]
        label_entities = get_entities(preds, args.id2label, args.markup)  # extract entities
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(tags)
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    logger.info("\n")
    with open(output_submit_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                 batch_size=1, collate_fn=collate_fn)
    # Predict!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_submit_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": None,
                "end_positions": None
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        start_logits, end_logits = outputs[:2]
        R = bert_extract_item(start_logits, end_logits)
        if R:
            label_entities = [[args.id2label[x[0]], x[1], x[2]] for x in R]
        else:
            label_entities = []
        json_d = {}
        json_d['id'] = step
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    print(" ")
    with open(output_submit_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
def predict(args, model, pred_dataloader, config):
    # Predict (without computing metrics)
    logger.info("  Num examples = %d", len(pred_dataloader))
    logger.info("  Batch size = %d", args.eval_batch_size)
    seq_ids = []
    preds = []
    pbar = ProgressBar(n_total=len(pred_dataloader), desc='Predicting')
    for bid, batch in enumerate(pred_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) if isinstance(t, torch.Tensor) else t for t in batch)
        seq_ids += list(batch[-1])
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss, logits = outputs[:2]
            preds.append(logits.cpu().detach())
        pbar(bid)
    preds = torch.cat(preds, dim=0).cpu().detach()
    preds_label = torch.argmax(preds, dim=1)
    result_label = DataFrame(data={
        "id": Series(seq_ids),
        "label": Series(preds_label)
    })
    result_label.to_csv(config["predict_result"], index=False)
    preds_softmax = torch.softmax(preds, dim=1)
    result_softmax = DataFrame(data={
        "id": Series(seq_ids),
        "label_0": Series(preds_softmax[:, 0]),
        "label_1": Series(preds_softmax[:, 1]),
        "label_2": Series(preds_softmax[:, 2])
    })
    result_softmax.to_csv(config["predict_softmax"], index=False)
    return result_label
def create_features(self, examples, max_seq_len, cached_features_file):
    pbar = ProgressBar(n_total=len(examples))
    if cached_features_file.exists():
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        features = []
        for ex_id, example in enumerate(examples):
            tokens = self.tokenizer.tokenize(example.text)
            label_ids = example.labels
            if len(tokens) > max_seq_len:
                tokens = tokens[:max_seq_len]
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            padding = [self.pad_id] * (max_seq_len - len(input_ids))
            input_len = len(input_ids)
            input_ids += padding
            assert len(input_ids) == max_seq_len
            if ex_id < 2:
                logger.info("*** Example ***")
                logger.info(f"guid: {example.guid}")
                logger.info(f"tokens: {' '.join([str(x) for x in tokens])}")
                logger.info(f"input_ids: {' '.join([str(x) for x in input_ids])}")
            feature = InputFeature(input_ids=input_ids, label_ids=label_ids,
                                   input_len=input_len)
            features.append(feature)
            pbar.batch_step(step=ex_id, info={}, bar_type='create features')
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
    return features
def evaluate(args, model, tokenizer, prefix=""):
    metric = SpanEntityScore(args.id2label)
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_features = load_and_cache_examples(args, args.task_name, tokenizer, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_features))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    pbar = ProgressBar(n_total=len(eval_features), desc="Evaluating")
    for step, f in enumerate(eval_features):
        input_lens = f.input_len
        input_ids = torch.tensor([f.input_ids[:input_lens]], dtype=torch.long).to(args.device)
        input_mask = torch.tensor([f.input_mask[:input_lens]], dtype=torch.long).to(args.device)
        segment_ids = torch.tensor([f.segment_ids[:input_lens]], dtype=torch.long).to(args.device)
        start_ids = torch.tensor([f.start_ids[:input_lens]], dtype=torch.long).to(args.device)
        end_ids = torch.tensor([f.end_ids[:input_lens]], dtype=torch.long).to(args.device)
        subjects = f.subjects
        model.eval()
        with torch.no_grad():
            inputs = {"input_ids": input_ids,
                      "attention_mask": input_mask,
                      "start_positions": start_ids,
                      "end_positions": end_ids}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (segment_ids if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
        tmp_eval_loss, start_logits, end_logits = outputs[:3]
        R = bert_extract_item(start_logits, end_logits)
        T = subjects
        metric.update(true_subject=T, pred_subject=R)
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        pbar(step)
    logger.info("\n")
    eval_loss = eval_loss / nb_eval_steps
    eval_info, entity_info = metric.result()
    results = {f'{key}': value for key, value in eval_info.items()}
    results['loss'] = eval_loss
    logger.info("***** Eval results %s *****", prefix)
    info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    logger.info(info)
    logger.info("***** Entity results %s *****", prefix)
    for key in sorted(entity_info.keys()):
        print("******* %s results ********" % key)
        info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()])
        print(info)
    return results
def predict(args, model, tokenizer, lines, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, lines, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                 batch_size=1, collate_fn=collate_fn)
    # Predict!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, nn.DataParallel):
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": None,
                'input_lens': batch[4]
            }
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
            preds, _ = model.crf._obtain_labels(logits, args.id2label, inputs['input_lens'])
        preds = preds[0][1:-1]  # strip [CLS] and [SEP]
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join(preds)
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    print(results[:3])
def predict(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    pred_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    pred_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs):
        pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, data_type='test')
        if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(pred_output_dir)
        args.pred_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        pred_sampler = SequentialSampler(pred_dataset) if args.local_rank == -1 else DistributedSampler(pred_dataset)
        pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler,
                                     batch_size=args.pred_batch_size, collate_fn=collate_fn)
        logger.info("***** Running prediction {} *****".format(prefix))
        logger.info("  Num examples = %d", len(pred_dataset))
        logger.info("  Batch size = %d", args.pred_batch_size)
        nb_pred_steps = 0
        preds = None
        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predicting")
        for step, batch in enumerate(pred_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if (
                        'bert' in args.model_type or 'xlnet' in args.model_type) else None
                outputs = model(**inputs)
                _, logits = outputs[:2]
            nb_pred_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        output_pred_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
        with open(output_pred_file, "w") as writer:
            for pred in preds:
                writer.write(str(pred) + '\n')
    return results
def _create_examples(self, lines, example_type):
    '''Creates examples for data.'''
    examples = []
    for i, line in tqdm.tqdm(enumerate(lines), total=len(lines), desc='create examples'):
        id = line['query_id']
        context = line['passage']
        query = line['query']
        alternatives = line['alternatives'].split('|')
        random.shuffle(alternatives)
        if example_type == 'test':
            answer = None  # the test split has no answer field
        else:
            answer = self.get_anwser(line['answer'], alternatives)
        example = InputExample(example_id=id, question=query, contexts=context,
                               endings=alternatives, label=answer)
        examples.append(example)
    return examples
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size, collate_fn=collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    args.warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path) and "checkpoint" in args.model_name_or_path:
        # set global_step to global_step of last saved checkpoint from model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    tr_loss, logging_loss = 0.0, 0.0
    if args.do_adv:
        fgm = FGM(model, emb_name=args.adv_name, epsilon=args.adv_epsilon)
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if args.do_adv:
                # Adversarial training: perturb embeddings, backprop the adversarial
                # loss so its gradients accumulate, then restore the embeddings.
                fgm.attack()
                loss_adv = model(**inputs)[0]
                if args.n_gpu > 1:
                    loss_adv = loss_adv.mean()
                loss_adv.backward()
                fgm.restore()
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                # In PyTorch >= 1.1 the optimizer step must precede the scheduler step.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    print(" ")
                    if args.local_rank == -1:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = (model.module if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    tokenizer.save_vocabulary(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
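# For reference, FGM here is the usual embedding-space adversarial trick. A sketch
# of a typical implementation follows (the project's own FGM class may differ;
# `emb_name` matching 'word_embeddings' is an assumption):
#
#   class FGM:
#       def __init__(self, model, emb_name='word_embeddings', epsilon=1.0):
#           self.model, self.emb_name, self.epsilon = model, emb_name, epsilon
#           self.backup = {}
#
#       def attack(self):
#           # Add an epsilon-sized step along the gradient direction to the
#           # embedding weights, saving the originals for restore().
#           for name, param in self.model.named_parameters():
#               if param.requires_grad and self.emb_name in name and param.grad is not None:
#                   self.backup[name] = param.data.clone()
#                   norm = torch.norm(param.grad)
#                   if norm != 0:
#                       param.data.add_(self.epsilon * param.grad / norm)
#
#       def restore(self):
#           # Undo the perturbation after the adversarial backward pass.
#           for name, param in self.model.named_parameters():
#               if name in self.backup:
#                   param.data = self.backup[name]
#           self.backup = {}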
def evaluate(args, model, tokenizer, prefix=""):
    metric = SeqEntityScore(args.id2label, markup=args.markup)
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, collate_fn=collate_fn)
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        preds = np.argmax(logits.cpu().numpy(), axis=2).tolist()
        out_label_ids = inputs['labels'].cpu().numpy().tolist()
        for i, label in enumerate(out_label_ids):
            temp_1 = []
            temp_2 = []
            for j, m in enumerate(label):
                if j == 0:
                    continue
                elif out_label_ids[i][j] == args.label2id['[SEP]']:
                    metric.update(pred_paths=[temp_2], label_paths=[temp_1])
                    break
                else:
                    temp_1.append(args.id2label[out_label_ids[i][j]])
                    temp_2.append(preds[i][j])
        pbar(step)
    print(' ')
    eval_loss = eval_loss / nb_eval_steps
    eval_info, entity_info = metric.result()
    results = {f'{key}': value for key, value in eval_info.items()}
    results['loss'] = eval_loss
    logger.info("***** Eval results %s *****", prefix)
    info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    logger.info(info)
    logger.info("***** Entity results %s *****", prefix)
    for key in sorted(entity_info.keys()):
        logger.info("******* %s results ********" % key)
        info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()])
        logger.info(info)
    return results
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size, collate_fn=collate_fn)
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        num_training_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(num_training_steps * args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer = AdamW(params=optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=num_training_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", num_training_steps)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer)
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
            pbar(step, {'loss': loss.item()})
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
def test(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    test_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    test_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for test_task, test_output_dir in zip(test_task_names, test_outputs_dirs):
        test_dataset = load_and_cache_examples(args, test_task, tokenizer, data_type='test')
        if not os.path.exists(test_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(test_output_dir)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                     batch_size=args.eval_batch_size, collate_fn=collate_fn)
        # Test!
        logger.info("***** Running test {} *****".format(prefix))
        logger.info("  Num examples = %d", len(test_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        pbar = ProgressBar(n_total=len(test_dataloader), desc="Testing")
        for step, batch in enumerate(test_dataloader):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2],
                    'labels': batch[3]
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids,
                                          inputs['labels'].detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(test_task, preds, out_label_ids)
        results.update(result)
        logger.info("***** Test results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
        classreport = ClassReport([
            'Joint', 'Sequence', 'Progression', "Contrast", "Supplement",
            "Cause-Result", "Result-Cause", "Background", "Behavior-Purpose",
            "Purpose-Behavior", "Elaboration", "Summary", "Evaluation",
            "Statement-Illustration", "Illustration-Statement"
        ])
        classreport(preds, out_label_ids)
        logger.info("%s : %s", classreport.name(), classreport.value())
    return results
def evaluate(args, model, tokenizer, label_lists, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    logger.info("**** Evaluate *****")
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev')
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size, collate_fn=collate_fn_ner)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
        for step, batch in enumerate(eval_dataloader):
            now = datetime.datetime.now()
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2],
                          'labels': batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = np.argmax(logits.detach().cpu().numpy(), axis=2)
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds_argmax = np.argmax(logits.detach().cpu().numpy(), axis=2)
                preds = collate_pred(preds, preds_argmax, label_lists)
                out_label_ids = collate_pred(out_label_ids,
                                             inputs['labels'].detach().cpu().numpy(),
                                             label_lists)
            pbar(step)
            delta = (datetime.datetime.now() - now).microseconds / 1000
            logger.info("*** Evaluating timecost, input length %d, timecost %d",
                        len(batch[0]), delta)
        print(' ')
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        eval_loss = eval_loss / nb_eval_steps
        evaluater = NerAccuracyEvaluator(label_lists, "WORD")
        result = evaluater.evaluate(preds, out_label_ids, args.label_with_bi)
        results.update(result)
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
        # Store evaluate results
        if args.do_eval and args.output_eval:
            print_eval_output(args.output_dir, preds, out_label_ids, label_lists)
    return results
def predict(args, model, tokenizer, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                 batch_size=1, collate_fn=collate_fn)
    # Predict!
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.json")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")
    if isinstance(model, nn.DataParallel):
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None}
            if args.model_type != "distilbert":
                # XLM and RoBERTa don't use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
            tags = model.crf.decode(logits, inputs['attention_mask'])
            tags = tags.squeeze(0).cpu().numpy().tolist()
        preds = tags[0][1:-1]  # strip [CLS] and [SEP]
        label_entities = get_entities(preds, args.id2label, args.markup)
        json_d = {}
        json_d['id'] = step
        json_d['tag_seq'] = " ".join([args.id2label[x] for x in preds])
        json_d['entities'] = label_entities
        results.append(json_d)
        pbar(step)
    logger.info("\n")
    with open(output_predict_file, "w") as writer:
        for record in results:
            writer.write(json.dumps(record) + '\n')
    if args.task_name == 'cluener':
        output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json")
        test_text = []
        with open(os.path.join(args.data_dir, "test.json"), 'r') as fr:
            for line in fr:
                test_text.append(json.loads(line))
        test_submit = []
        for x, y in zip(test_text, results):
            json_d = {}
            json_d['id'] = x['id']
            json_d['label'] = {}
            entities = y['entities']
            words = list(x['text'])
            if len(entities) != 0:
                for subject in entities:
                    tag = subject[0]
                    start = subject[1]
                    end = subject[2]
                    word = "".join(words[start:end + 1])
                    if tag in json_d['label']:
                        if word in json_d['label'][tag]:
                            json_d['label'][tag][word].append([start, end])
                        else:
                            json_d['label'][tag][word] = [[start, end]]
                    else:
                        json_d['label'][tag] = {}
                        json_d['label'][tag][word] = [[start, end]]
            test_submit.append(json_d)
        json_to_text(output_submit_file, test_submit)
def train(args, train_dataloader, eval_dataloader, metrics, model):
    """ Train the model """
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    args.warmup_steps = t_total * args.warmup_proportion
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    best_acc = 0
    model.zero_grad()
    seed_everything(args.seed)
    for epoch in range(int(args.num_train_epochs)):
        tr_loss = AverageMeter()
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss.update(loss.item(), n=1)
            pbar(step, info={"loss": loss.item()})
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
        train_log = {'loss': tr_loss.avg}
        eval_log = evaluate(args, model, eval_dataloader, metrics)
        logs = dict(train_log, **eval_log)
        show_info = f'\nEpoch: {epoch} - ' + "-".join(
            [f' {key}: {value:.4f} ' for key, value in logs.items()])
        logger.info(show_info)
        if logs['eval_acc'] > best_acc:
            logger.info(f"\nEpoch {epoch}: eval_acc improved from {best_acc} to {logs['eval_acc']}")
            logger.info("save model to disk.")
            best_acc = logs['eval_acc']
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
            output_file = args.model_save_path
            output_file.mkdir(exist_ok=True)
            output_model_file = output_file / WEIGHTS_NAME
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = output_file / CONFIG_NAME
            with open(str(output_config_file), 'w') as f:
                f.write(model_to_save.config.to_json_string())
def evaluate(args, model, eval_dataloader, metrics):
    # Eval!
    logger.info("  Number of examples = %d", len(eval_dataloader))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = AverageMeter()
    metrics.reset()
    preds = []
    targets = []
    pbar = ProgressBar(n_total=len(eval_dataloader), desc='Evaluating')
    for bid, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            # Each batch is (input_ids, attention_mask, token_type_ids, labels):
            # three [batch, seq_len] LongTensors plus a [batch] label tensor.
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss, logits = outputs[:2]
            eval_loss.update(loss.item(), n=batch[0].size()[0])
            preds.append(logits.cpu().detach())
            targets.append(inputs['labels'].cpu().detach())
        pbar(bid)
    preds = torch.cat(preds, dim=0).cpu().detach()
    targets = torch.cat(targets, dim=0).cpu().detach()
    metrics(preds, targets)
    eval_log = {"eval_acc": metrics.value(), 'eval_loss': eval_loss.avg}
    return eval_log
def create_features(self, examples, max_seq_len, cached_features_file):
    '''
    The convention in BERT is:
    (a) For sequence pairs:
        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids:   0   0    0    0    0       0    0   0   1  1  1   1  1   1
    (b) For single sequences:
        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids:   0    0   0   0    0   0   0
    '''
    pbar = ProgressBar(n_total=len(examples), desc='create features')
    if cached_features_file.exists():
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        features = []
        for ex_id, example in enumerate(examples):
            tokens_a = self.tokenizer.tokenize(example.text_a)
            tokens_b = None
            label_id = example.label
            if example.text_b:
                tokens_b = self.tokenizer.tokenize(example.text_b)
                # Modifies `tokens_a` and `tokens_b` in place so that the total
                # length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3"
                self.truncate_seq_pair(tokens_a, tokens_b, max_length=max_seq_len - 3)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > max_seq_len - 2:
                    tokens_a = tokens_a[:max_seq_len - 2]
            tokens = ['[CLS]'] + tokens_a + ['[SEP]']
            segment_ids = [0] * len(tokens)
            if tokens_b:
                tokens += tokens_b + ['[SEP]']
                segment_ids += [1] * (len(tokens_b) + 1)
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            padding = [0] * (max_seq_len - len(input_ids))
            input_len = len(input_ids)
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            assert len(input_ids) == max_seq_len
            assert len(input_mask) == max_seq_len
            assert len(segment_ids) == max_seq_len
            if ex_id < 2:
                logger.info("*** Example ***")
                logger.info(f"guid: {example.guid}")
                logger.info(f"tokens: {' '.join([str(x) for x in tokens])}")
                logger.info(f"input_ids: {' '.join([str(x) for x in input_ids])}")
                logger.info(f"input_mask: {' '.join([str(x) for x in input_mask])}")
                logger.info(f"segment_ids: {' '.join([str(x) for x in segment_ids])}")
                logger.info(f"label id: {label_id}")
            feature = InputFeature(input_ids=input_ids, input_mask=input_mask,
                                   segment_ids=segment_ids, label_id=label_id,
                                   input_len=input_len)
            features.append(feature)
            pbar(step=ex_id)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
    return features
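# A minimal sketch (an illustration, not part of this class) of turning the cached
# `InputFeature` list into tensors for a DataLoader, matching the batch layout the
# evaluate/train loops above unpack:
#
#   from torch.utils.data import TensorDataset
#
#   all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
#   all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
#   all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
#   all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
#   dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)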
def train(args, train_features, model, tokenizer, use_crf):
    """ Train the model """
    # Prepare optimizer and schedule (linear warmup and decay).
    # Every module contributes two parameter groups: one with weight decay and
    # one without (bias and LayerNorm weights are excluded from decay), each at
    # its own learning rate.
    no_decay = ["bias", "LayerNorm.weight"]

    def param_groups(named_params, lr):
        return [{
            'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay,
            'lr': lr
        }, {
            'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': lr
        }]

    bert_params = list(model.bert.named_parameters())
    if args.model_encdec == 'bert2crf':
        optimizer_grouped_parameters = (
            param_groups(bert_params, args.learning_rate) +
            param_groups(list(model.crf.named_parameters()), args.crf_learning_rate) +
            param_groups(list(model.classifier.named_parameters()), args.crf_learning_rate))
    elif args.model_encdec == 'bert2gru':
        optimizer_grouped_parameters = (
            param_groups(bert_params, args.learning_rate) +
            param_groups(list(model.decoder.named_parameters()), args.crf_learning_rate) +
            param_groups(list(model.clsdense.named_parameters()), args.crf_learning_rate))
    elif args.model_encdec == 'bert2soft':
        optimizer_grouped_parameters = (
            param_groups(bert_params, args.learning_rate) +
            param_groups(list(model.classifier.named_parameters()), args.crf_learning_rate))
    elif args.model_encdec == 'multi2point':
        optimizer_grouped_parameters = (
            param_groups(bert_params, args.learning_rate) +
            param_groups(list(model.pointer.named_parameters()), args.point_learning_rate))
    t_total = len(train_features) // args.batch_size * args.num_train_epochs
    args.warmup_steps = int(t_total * args.warmup_proportion)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and \
            os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_features))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.batch_size * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    best_spanf = -1
    test_results = {}
    for ep in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_features) // args.batch_size, desc='Training')
        if ep == int(args.num_train_epochs) - 1:
            # On the final epoch, fold the dev set into the training data.
            eval_features = load_and_cache_examples(args, args.data_type, tokenizer, data_type='dev')
            train_features.extend(eval_features)
        step = 0
        for batch in batch_generator(features=train_features,
                                     batch_size=args.batch_size,
                                     use_crf=use_crf,
                                     answer_seq_len=args.answer_seq_len):
            (batch_input_ids, batch_input_mask, batch_segment_ids, batch_label_ids,
             batch_multi_span_label, batch_context_mask, batch_start_position,
             batch_end_position, batch_raw_labels, _, batch_example) = batch
            model.train()
            if args.model_encdec in ('bert2crf', 'bert2gru', 'bert2soft'):
                batch_inputs = tuple(t.to(args.device) for t in batch[0:6])
                inputs = {
                    "input_ids": batch_inputs[0],
                    "attention_mask": batch_inputs[1],
                    "token_type_ids": batch_inputs[2],
                    "context_mask": batch_inputs[5],
                    "labels": batch_inputs[3],
                    "testing": False
                }
            elif args.model_encdec == 'multi2point':
                batch_inputs = tuple(t.to(args.device) for t in batch[0:5])
                inputs = {
                    "input_ids": batch_inputs[0],
                    "attention_mask": batch_inputs[1],
                    "token_type_ids": batch_inputs[2],
                    "span_label": batch_inputs[4],
                    "testing": False
                }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuples in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            loss.backward()
            if step % 15 == 0:
                pbar(step, {'epoch': ep, 'loss': loss.item()})
            step += 1
            tr_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule (after optimizer.step() for PyTorch >= 1.1)
            model.zero_grad()
            global_step += 1
            if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # Log metrics
                if args.local_rank == -1:
                    # Only evaluate on a single GPU, otherwise metrics may not average well
                    results = evaluate(args=args, model=model, tokenizer=tokenizer,
                                       prefix="dev", use_crf=use_crf)
                    span_f = results['span_f']
                    if span_f > best_spanf:
                        output_dir = os.path.join(args.output_dir, "checkpoint-bestf")
                        if os.path.exists(output_dir):
                            shutil.rmtree(output_dir)
                            logger.info("Removed previous best checkpoint %s", output_dir)
                        logger.info("dev results: %s", results)
                        test_results = evaluate(args=args, model=model, tokenizer=tokenizer,
                                                prefix="test", use_crf=use_crf)
                        logger.info("test results: %s (epoch %d)", test_results, ep)
                        best_spanf = span_f
                        os.makedirs(output_dir)
                        model_to_save = (model.module if hasattr(model, "module") else model
                                         )  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        torch.save(args, os.path.join(output_dir, "training_args.bin"))
                        logger.info("Saving model checkpoint to %s", output_dir)
                        tokenizer.save_vocabulary(output_dir)
                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)
        np.random.seed()
        np.random.shuffle(train_features)
        logger.info("\n")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step, test_results
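# `seed_everything` is called above but defined elsewhere in the repo; a
# minimal sketch of what it presumably does (seeding every RNG the training
# loop touches, so runs are reproducible):
import os
import random

import numpy as np
import torch


def seed_everything(seed=42):
    '''Seeds Python, NumPy, and PyTorch (CPU and all GPUs) RNGs.'''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)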
def evaluate(args, model, tokenizer, prefix="dev", use_crf=False):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)
    eval_features = load_and_cache_examples(args, args.data_type, tokenizer, data_type=prefix)
    processor = processors[args.data_type]()
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_features))
    logger.info("  Batch size = %d", args.batch_size)
    pbar = ProgressBar(n_total=len(eval_features), desc="Evaluating " + prefix)
    if isinstance(model, nn.DataParallel):
        model = model.module
    pre_labels, tru_labels, eval_examples = [], [], []
    step = 0
    for batch in batch_generator(features=eval_features,
                                 batch_size=args.batch_size,
                                 use_crf=use_crf,
                                 answer_seq_len=args.answer_seq_len):
        (batch_input_ids, batch_input_mask, batch_segment_ids, batch_label_ids,
         batch_multi_span_label, batch_context_mask, batch_start_position,
         batch_end_position, batch_raw_labels, _, batch_example) = batch
        model.eval()
        if args.model_encdec in ('bert2crf', 'bert2gru', 'bert2soft'):
            batch_inputs = tuple(t.to(args.device) for t in batch[0:6])
            inputs = {
                "input_ids": batch_inputs[0],
                "attention_mask": batch_inputs[1],
                "token_type_ids": batch_inputs[2],
                "context_mask": batch_inputs[5],
                "testing": True
            }
        elif args.model_encdec == 'multi2point':
            batch_inputs = tuple(t.to(args.device) for t in batch[0:5])
            inputs = {
                "input_ids": batch_inputs[0],
                "attention_mask": batch_inputs[1],
                "token_type_ids": batch_inputs[2],
                "testing": True
            }
        outputs = model(**inputs)
        eval_examples.extend(batch_example)
        out_label_ids = batch[8].tolist()
        batch_lens = torch.sum(batch_context_mask, -1).cpu().numpy().tolist()
        if args.model_encdec == 'bert2crf':
            logits = outputs[0]
            tags = model.crf.decode(logits, inputs['attention_mask'])
            tags = tags.squeeze(0).cpu().numpy().tolist()
            for len_doc, cu_tags, cu_trus, exam in zip(batch_lens, tags, out_label_ids, batch_example):
                pre_labels.append(cu_tags[1:len_doc + 1])
                tru_labels.append(cu_trus[1:len_doc + 1])
        elif args.model_encdec == 'multi2point':
            start_label, end_label = outputs  # each is [batch, answer_seq_len]
            start_label = start_label.cpu().numpy().tolist()
            end_label = end_label.cpu().numpy().tolist()
            pres_batch = []
            for s_num, e_num in zip(start_label, end_label):
                # Convert each predicted (start, end) pointer pair into a tag
                # sequence: 1 marks the span start, 2 marks span continuation.
                pre_tag = [0] * args.max_seq_length
                for s, e in zip(s_num, e_num):
                    if s < e - 1:
                        pre_tag[s] = 1
                        pre_tag[s + 1:e] = [2] * (e - s - 1)
                    elif s == e - 1:
                        pre_tag[s] = 1
                pres_batch.append(pre_tag)
            for len_doc, cu_tags, cu_trus, exam in zip(batch_lens, pres_batch, out_label_ids, batch_example):
                pre_labels.append(cu_tags[1:len_doc + 1])
                tru_labels.append(cu_trus[1:len_doc + 1])
        elif args.model_encdec in ('bert2gru', 'bert2soft'):
            tags = outputs.detach().cpu().numpy().tolist()
            for len_doc, cu_tags, cu_trus, exam in zip(batch_lens, tags, out_label_ids, batch_example):
                pre_labels.append(cu_tags[1:len_doc + 1])
                tru_labels.append(cu_trus[1:len_doc + 1])
        step += 1
        if step % 20 == 0:
            pbar(step)
    logger.info("\n")
    results = get_prf(pre_labels, tru_labels, eval_examples)
    logger.info("***** Eval results %s *****", prefix)
    info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()])
    logger.info(info)
    return results
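# The multi2point branch above rewrites each predicted (start, end) pointer
# pair into a 1/2 tag sequence (1 = span start, 2 = inside the span, end
# exclusive). A standalone sketch of that conversion, with a hypothetical
# 8-token example, makes the boundary handling easy to check:
def spans_to_tags(starts, ends, seq_len):
    '''starts/ends: aligned lists of predicted span boundaries (end exclusive).'''
    tags = [0] * seq_len
    for s, e in zip(starts, ends):
        if s < e - 1:
            tags[s] = 1                      # span of length >= 2: mark start...
            tags[s + 1:e] = [2] * (e - s - 1)  # ...then fill the interior with 2s
        elif s == e - 1:
            tags[s] = 1                      # single-token span
    return tags


# A span covering tokens 2-4 (end exclusive at 5) in an 8-token sequence:
assert spans_to_tags([2], [5], 8) == [0, 0, 1, 2, 2, 0, 0, 0]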
def _test(self, args, model, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    test_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name, )
    test_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir, )
    results = {}
    for test_task, test_output_dir in zip(test_task_names, test_outputs_dirs):
        test_dataset = self.dataset["test_dataset"]
        if not os.path.exists(test_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(test_output_dir)
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        test_dataloader = DataLoader(test_dataset, batch_size=args.eval_batch_size)
        # Test!
        logger.info("***** Running test %s *****", prefix)
        logger.info("  Num examples = %d", len(test_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        pbar = ProgressBar(n_total=len(test_dataloader), desc="Testing")
        for step, batch in enumerate(test_dataloader):
            model.eval()
            with torch.no_grad():
                inputs = {
                    'input_ids': batch["input_ids"].to(args.device),
                    'attention_mask': batch['attention_mask'].to(args.device),
                    'token_type_ids': batch['token_type_ids'].to(args.device),
                    'labels': batch["labels"].to(args.device)
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs.loss, outputs.logits
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            pbar(step)
        print(' ')
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        self._save_result(args.model_type + str(prefix), preds, out_label_ids)
        preds = np.argmax(preds, axis=1)
        result = acc_and_f1(preds, out_label_ids, average="macro")
        results.update(result)
    return results
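# `acc_and_f1` is imported from the repo's metrics module and not shown here.
# A minimal sketch of the interface `_test` assumes, built on scikit-learn
# (the exact result keys are an assumption):
from sklearn.metrics import f1_score


def acc_and_f1(preds, labels, average="macro"):
    '''preds/labels: 1-D numpy arrays of class ids.'''
    acc = (preds == labels).mean()
    f1 = f1_score(y_true=labels, y_pred=preds, average=average)
    return {"acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2}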
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        num_training_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(num_training_steps * args.warmup_proportion)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=num_training_steps)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", num_training_steps)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1:
                        evaluate(args, model, tokenizer)
            pbar(step, {'loss': loss.item()})
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    # Save the final checkpoint once training is done.
    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
    logger.info("Saving model checkpoint to %s", output_dir)
    return global_step, tr_loss / global_step
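# The `collate_fn` passed to the DataLoader above is defined elsewhere in the
# repo. A plausible minimal sketch under the assumption that each dataset item
# is an (input_ids, attention_mask, token_type_ids, label) tuple, matching how
# `train` unpacks the batch: stack the per-example tensors and trim padding to
# the longest sequence in the batch so shorter batches run faster.
import torch


def collate_fn(batch):
    input_ids, attention_mask, token_type_ids, labels = map(torch.stack, zip(*batch))
    # Longest real (unpadded) sequence in this batch.
    max_len = int(attention_mask.sum(dim=1).max().item())
    return (input_ids[:, :max_len], attention_mask[:, :max_len],
            token_type_ids[:, :max_len], labels)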