def setup_default_optimizer(self, weight_decay: float = 0.0, learning_rate: float = 5e-5,
                            adam_epsilon: float = 1e-8, warmup_steps: int = 0, total_steps: int = 0):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in self.model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay
        },
        {
            'params': [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    self.optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    self.scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=warmup_steps, t_total=total_steps)
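# Illustrative sketch (not part of the snippet above): how the `no_decay` substring test
# splits parameters. The toy module is hypothetical; its attribute is deliberately named
# `LayerNorm` because the pytorch-transformers BERT implementation names its layer-norm
# submodules exactly that, which is what the 'LayerNorm.weight' pattern keys on.
import torch.nn as nn

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(4, 4)
        self.LayerNorm = nn.LayerNorm(4)

no_decay = ['bias', 'LayerNorm.weight']
toy = ToyBlock()
decayed = [n for n, _ in toy.named_parameters() if not any(nd in n for nd in no_decay)]
skipped = [n for n, _ in toy.named_parameters() if any(nd in n for nd in no_decay)]
print(decayed)  # ['dense.weight'] -- ordinary weight matrices keep weight decay
print(skipped)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias'] -- decay disabled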
def __init__(self, args):
    self.args = args
    self.train_batch_num = args['train_batch']
    self.Dataloader = Dataloader(args)
    print("preparing the train_data")
    self.train_data = self.Dataloader.load_train_batches()
    print("preparing the val_data")
    print("train data len:", len(self.train_data) * self.train_batch_num)
    self.cuda_gpu = torch.cuda.is_available() and args['use_gpu']
    print("building model:")
    self.global_model = Global_Model(args)
    if self.cuda_gpu:
        # torch.nn.DataParallel(self.global_model, device_ids=gpus).cuda()
        self.global_model = self.global_model.cuda()
    self.global_optimer = AdamW(self.global_model.parameters(), lr=args['global_lr'])
    num_total_steps = len(self.train_data) * args['global_epoch']
    num_warmup_steps = int(args['global_warmup_rate'] * num_total_steps)
    self.global_scheduler = WarmupLinearSchedule(self.global_optimer,
                                                 warmup_steps=num_warmup_steps,
                                                 t_total=num_total_steps)
def test_warmup_linear_scheduler(self):
    scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
    lrs = unwrap_schedule(scheduler, self.num_steps)
    expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]
    self.assertEqual(len(lrs[0]), 1)
    self.assertListEqual([l[0] for l in lrs], expected_learning_rates)

    scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
    lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps)
    self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2])
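# Sketch reproducing expected_learning_rates above from the linear warmup / linear decay rule.
# Assumptions not visible in the test itself: the optimizer's base learning rate is 10.0 and
# the unwrap helper records the learning rate after each of the 10 scheduler steps (steps 1..10).
base_lr, warmup_steps, t_total = 10.0, 2, 10

def linear_warmup_decay(step):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

print([base_lr * linear_warmup_decay(step) for step in range(1, t_total + 1)])
# [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0]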
def train_task(args, model, memory, train_dataset, valid_dataset):
    # train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.n_workers,
    #                               shuffle=not args.reproduce, collate_fn=dynamic_collate_fn)
    train_dataloader = DataLoader(train_dataset, num_workers=args.n_workers,
                                  collate_fn=dynamic_collate_fn,
                                  batch_sampler=DynamicBatchSampler(train_dataset, args.batch_size))
    # if valid_dataset:
    #     valid_dataloader = DataLoader(valid_dataset, batch_size=args.batch_size * 6,
    #                                   num_workers=args.n_workers, collate_fn=dynamic_collate_fn)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=len(train_dataset) // 10)
    model.zero_grad()
    tot_epoch_loss, tot_n_inputs = 0, 0

    def update_parameters(loss):
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        scheduler.step()
        optimizer.step()
        model.zero_grad()

    for step, batch in enumerate(train_dataloader):
        model.train()
        n_inputs, input_ids, masks, labels = prepare_inputs(batch)
        memory.add(input_ids, masks, labels)
        loss = model(input_ids=input_ids, attention_mask=masks, labels=labels)[0]
        update_parameters(loss)
        tot_n_inputs += n_inputs
        tot_epoch_loss += loss.item() * n_inputs

        if (step + 1) % args.logging_steps == 0:
            logger.info("progress: {:.2f} , step: {} , lr: {:.2E} , avg batch size: {:.1f} , avg loss: {:.3f}".format(
                tot_n_inputs / args.n_train, step + 1, scheduler.get_lr()[0],
                tot_n_inputs // (step + 1), tot_epoch_loss / tot_n_inputs))

        if args.replay_interval >= 1 and (step + 1) % args.replay_interval == 0:
            torch.cuda.empty_cache()
            del loss, input_ids, masks, labels
            input_ids, masks, labels = memory.sample(tot_n_inputs // (step + 1))
            loss = model(input_ids=input_ids, attention_mask=masks, labels=labels)[0]
            update_parameters(loss)

    logger.info("Finish training, avg loss: {:.3f}".format(tot_epoch_loss / tot_n_inputs))
    del optimizer, optimizer_grouped_parameters
    assert tot_n_inputs == len(train_dataset) == args.n_train
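# Hypothetical sketch of the replay-memory interface the loop above relies on
# (`memory.add(input_ids, masks, labels)` and `memory.sample(n)`). The real class in the
# source project is not shown and may differ (sampling strategy, storage, padding id);
# this only illustrates the minimal contract. Assumes one scalar label per example and
# pad id 0 for input ids and attention masks.
import random
import torch
import torch.nn.functional as F

class EpisodicMemory:
    def __init__(self):
        self.examples = []  # one (input_ids, attention_mask, label) tuple per stored example

    def add(self, input_ids, masks, labels):
        # store each example of the incoming batch on CPU
        for ids, mask, label in zip(input_ids, masks, labels):
            self.examples.append((ids.cpu(), mask.cpu(), label.cpu()))

    def sample(self, n):
        # draw n stored examples uniformly at random, re-pad to a common length and re-batch
        picked = random.sample(self.examples, min(n, len(self.examples)))
        max_len = max(p[0].size(0) for p in picked)
        input_ids = torch.stack([F.pad(p[0], (0, max_len - p[0].size(0))) for p in picked])
        masks = torch.stack([F.pad(p[1], (0, max_len - p[1].size(0))) for p in picked])
        labels = torch.stack([p[2] for p in picked])
        return input_ids.cuda(), masks.cuda(), labels.cuda()  # training loop above runs on GPU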
def build_optimizer(model, num_train_steps, learning_rate, adam_eps, warmup_steps, weight_decay):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_eps)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_steps)
    return optimizer, scheduler
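# Side note (assumes a newer `transformers` release is installed): WarmupLinearSchedule comes
# from the old pytorch-transformers 1.x API; later releases expose the same linear warmup +
# linear decay schedule as `get_linear_schedule_with_warmup`. A roughly equivalent builder:
import torch
from transformers import get_linear_schedule_with_warmup

def build_optimizer_v2(model, num_train_steps, learning_rate, adam_eps, warmup_steps, weight_decay):
    no_decay = ['bias', 'LayerNorm.weight']
    grouped = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = torch.optim.AdamW(grouped, lr=learning_rate, eps=adam_eps)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=num_train_steps)
    return optimizer, scheduler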
def initialize_training(args, device):
    """Initialize the tokenizer, the data loaders, the model and the tools of the optimization process."""
    # Create tokenizer, datasets and loaders
    tokenizer = EpisodeSummaryTokenizer.from_pretrained(
        args.gpt2_version,
        max_num_words=args.max_num_words,
        size_variance_handling=args.size_var_handling)
    train_dataset, val_dataset = create_datasets_from_jsons(args.json_paths, tokenizer, args.val_split)
    dataloaders = {
        'train': DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
                            collate_fn=tokenizer.pad_batch_to_same_size),
        'val': DataLoader(val_dataset, shuffle=False, batch_size=args.batch_size,
                          collate_fn=tokenizer.pad_batch_to_same_size)
    }

    # Load pre-trained network weights
    model = GPT2LMHeadModel.from_pretrained(args.gpt2_version)
    model = model.to(device)

    # Prepare optimizer and scheduler
    no_decay = ['bias', 'LayerNorm.weight']  # no decay for biases and layer norm
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=args.max_steps)
    model.zero_grad()

    train_state = make_train_state(save_path=args.model_save_path,
                                   early_stopping_patience=args.early_stopping_patience)

    return tokenizer, dataloaders, model, optimizer, scheduler, train_state
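# Hypothetical sketch of a pad-to-longest collate_fn. The real `pad_batch_to_same_size`
# belongs to EpisodeSummaryTokenizer and is not shown here; the pad id and the returned
# tensor layout below are assumptions for illustration only.
import torch

def pad_batch_to_longest(batch, pad_id=0):
    # `batch` is assumed to be a list of 1-D LongTensors of token ids
    max_len = max(seq.size(0) for seq in batch)
    padded = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
    for i, seq in enumerate(batch):
        padded[i, :seq.size(0)] = seq
    return padded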
def train_eval_bert(corpus, res_dir, model_name, cfg_model):
    idx2tag, tag2idx = make_bert_tag_dict_from_flair_corpus(corpus)
    model = BertForTokenClassificationCustom.from_pretrained(model_name,
                                                             cache_dir=cfg_model.cache_dir,
                                                             num_labels=len(tag2idx))
    tokenizer = BertTokenizer.from_pretrained(cfg_model.tokenizer,
                                              cache_dir=cfg_model.cache_dir,
                                              do_lower_case=('uncased' in cfg_model.tokenizer))
    w_decay = 0.01
    model = model.cuda()
    seq_tagger = SequenceTaggerBert(bert_model=model, bpe_tokenizer=tokenizer,
                                    idx2tag=idx2tag, tag2idx=tag2idx, max_len=cfg_model.max_len)

    train_dataset = prepare_flair_corpus(corpus.train)
    val_dataset = prepare_flair_corpus(corpus.dev)
    test_dataset = prepare_flair_corpus(corpus.test)

    optimizer = AdamW(get_model_parameters(model), lr=cfg_model.lr, betas=(0.9, 0.999),
                      eps=1e-6, weight_decay=w_decay, correct_bias=True)

    if cfg_model.sched == 'warmup':
        lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1,
                                            t_total=(len(corpus.train) / cfg_model.bs) * cfg_model.n_epochs)
        trainer = ModelTrainerBert(model=seq_tagger,
                                   optimizer=optimizer,
                                   lr_scheduler=lr_scheduler,
                                   train_dataset=train_dataset,
                                   val_dataset=val_dataset,
                                   update_scheduler='es',
                                   validation_metrics=[f1_entity_level],
                                   batch_size=cfg_model.bs)
    elif cfg_model.sched == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=cfg_model.patience, mode='min')
        trainer = ModelTrainerBert(model=seq_tagger,
                                   optimizer=optimizer,
                                   lr_scheduler=lr_scheduler,
                                   train_dataset=train_dataset,
                                   val_dataset=val_dataset,
                                   keep_best_model=True,
                                   restore_bm_on_lr_change=True,
                                   update_scheduler='ee',
                                   validation_metrics=[f1_entity_level],
                                   decision_metric=lambda metrics: -metrics[1],
                                   batch_size=cfg_model.bs)

    trainer.train(epochs=cfg_model.n_epochs)
    _, __, metrics = seq_tagger.predict(test_dataset, evaluate=True,
                                        metrics=[f1_entity_level, f1_token_level])
    print('Test performance:', metrics)
    return seq_tagger, metrics
def __init__(self, args, config, model, criterion, train_dataloader, valid_dataloader,
             logger, save_path, tb_writer):
    self.args = args
    self.config = config
    self.model = model
    self.criterion = criterion
    self.train_dataloader = train_dataloader
    self.valid_dataloader = valid_dataloader
    self.logger = logger
    self.save_path = save_path
    self.tb_writer = tb_writer

    self.t_total = len(self.train_dataloader) * self.args.epoch
    self.device = self.config.device
    self.optimizer = AdamW(self.get_model_parameters(), lr=self.config.learning_rate)
    self.scheduler = WarmupLinearSchedule(self.optimizer, 0.1 * self.t_total, self.t_total)

    self.global_step = 0
    self.best_eval_acc = 0.2
def train(train_iter, val_iter, model):
    opt = AdamW(model.parameters(), lr=1e-4, eps=1e-8)
    scheduler = WarmupLinearSchedule(opt, warmup_steps=20, t_total=2500)
    model.train()
    losses = []
    for i, ex in enumerate(train_iter):
        opt.zero_grad()
        words, mapper, _ = ex.word
        label, lengths = ex.head
        batch, _ = label.shape

        # Model
        final = model(words.cuda(), mapper)
        for b in range(batch):
            final[b, lengths[b]-1:, :] = 0
            final[b, :, lengths[b]-1:] = 0
        if not lengths.max() <= final.shape[1] + 1:
            print("fail")
            continue
        dist = DependencyCRF(final, lengths=lengths)
        labels = dist.struct.to_parts(label, lengths=lengths).type_as(final)
        log_prob = dist.log_prob(labels)
        loss = log_prob.sum()
        (-loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        scheduler.step()
        losses.append(loss.detach())

        if i % 50 == 1:
            print(-torch.tensor(losses).mean(), words.shape)
            losses = []
        if i % 600 == 500:
            validate(val_iter)
def setup_optim(self, t_total, warmup_step):
    args, model = self.args, self.model
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = []
    optimizer_grouped_parameters.append({
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    })
    optimizer_grouped_parameters.append({
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    })
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        self.model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if args.schedule_half:
        scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=0, min_lr=args.min_lr, mode='max')

    self.scheduler, self.optimizer = scheduler, optimizer
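# Note on the branch above: the two scheduler types are stepped differently. WarmupLinearSchedule
# is stepped once per optimizer update with no arguments, while ReduceLROnPlateau expects the
# monitored metric (mode='max' above suggests an accuracy-like score). A small helper sketch:
from torch.optim.lr_scheduler import ReduceLROnPlateau

def step_scheduler(scheduler, val_score=None):
    if isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step(val_score)  # call once per evaluation with the monitored value
    else:
        scheduler.step()           # call once per optimizer.step() during training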
def generate_bert_optimizer(t_total, bert_lr, model):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': gamma  # `gamma` (the weight-decay coefficient) is expected to be defined at module scope
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=bert_lr, eps=1e-8)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=t_total)
    return scheduler, optimizer
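# Usage sketch (illustrative; `model`, `train_dataloader` and `compute_loss` are placeholders):
# generate_bert_optimizer returns the pair as (scheduler, optimizer) -- the reverse of
# build_optimizer above -- so call sites must unpack it in that order.
def run_one_epoch(model, train_dataloader, compute_loss, t_total, bert_lr=2e-5):
    scheduler, optimizer = generate_bert_optimizer(t_total, bert_lr, model)
    for batch in train_dataloader:
        loss = compute_loss(model, batch)
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()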
def main(parser): # Config args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) # data_config = Config(json_path=data_dir / 'config.json') model_config = Config(json_path=model_dir / 'config.json') # Vocab & Tokenizer tok_path = get_tokenizer() # ./tokenizer_78b3253a26.model ptr_tokenizer = SentencepieceTokenizer(tok_path) _, vocab_of_gluonnlp = get_pytorch_kobert_model() token_to_idx = vocab_of_gluonnlp.token_to_idx model_config.vocab_size = len(token_to_idx) vocab = Vocabulary(token_to_idx=token_to_idx) print("len(token_to_idx): ", len(token_to_idx)) with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f: json.dump(token_to_idx, f, ensure_ascii=False, indent=4) # save vocab & tokenizer with open(model_dir / "vocab.pkl", 'wb') as f: pickle.dump(vocab, f) # load vocab & tokenizer with open(model_dir / "vocab.pkl", 'rb') as f: vocab = pickle.load(f) tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen) ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer, maxlen=model_config.maxlen, model_dir=model_dir) # Train & Val Datasets cwd = Path.cwd() data_in = cwd / "data_in" train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명" tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir) tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn, transform_target_fn=ner_formatter.transform_target_fn) tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False) # Model model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index)) model.train() # optim train_examples_len = len(tr_clf_ds) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total) device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') n_gpu = torch.cuda.device_count() # if n_gpu > 1: # model = torch.nn.DataParallel(model) model.to(device) # save tb_writer = SummaryWriter('{}/runs'.format(model_dir)) checkpoint_manager = CheckpointManager(model_dir) summary_manager = SummaryManager(model_dir) best_val_loss = 1e+10 best_train_acc = 0 # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(tr_clf_ds)) logger.info(" Num Epochs = %d", model_config.epochs) logger.info(" Instantaneous batch size per GPU = %d", model_config.batch_size) # logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", # args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_dev_acc, best_dev_loss = 0.0, 99999999999.0 best_steps = 0 model.zero_grad() set_seed() # Added here for reproductibility (even between python 2 and 3) # Train train_iterator = trange(int(model_config.epochs), desc="Epoch") for _epoch, _ in enumerate(train_iterator): epoch_iterator = tqdm(tr_clf_dl, desc="Iteration") # , disable=args.local_rank not in [-1, 0] epoch = _epoch for step, batch in enumerate(epoch_iterator): model.train() x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch) log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real) # loss: negative log-likelihood loss = -1 * log_likelihood if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if model_config.gradient_accumulation_steps > 1: loss = loss / model_config.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm) tr_loss += loss.item() if (step + 1) % model_config.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 with torch.no_grad(): sequence_of_tags = torch.tensor(sequence_of_tags) print("sequence_of_tags: ", sequence_of_tags) print("y_real: ", y_real) print("loss: ", loss) print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real)) mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean() tr_acc = mb_acc.item() tr_loss_avg = tr_loss / global_step tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc} # if step % 50 == 0: print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, global_step, tr_summary['loss'], tr_summary['acc'])) if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0: # Log metrics if model_config.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well pass tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step) logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step)) logging_loss = tr_loss if model_config.save_steps > 0 and global_step % model_config.save_steps == 0: # Save model checkpoint output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1)) if not os.path.exists(output_dir): os.makedirs(output_dir) logger.info("Saving model checkpoint to %s", output_dir) state = {'global_step': global_step + 1, 'model_state_dict': model.state_dict(), 'opt_state_dict': optimizer.state_dict()} summary = {'train': tr_summary} summary_manager.update(summary) summary_manager.save('summary.json') is_best = tr_acc >= best_train_acc # acc 기준 (원래는 train_acc가 아니라 val_acc로 해야) # Save if is_best: best_train_acc = tr_acc checkpoint_manager.save_checkpoint(state, 'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc)) else: torch.save(state, os.path.join(output_dir, 'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc))) tb_writer.close() logger.info(" global_step = %s, average 
loss = %s", global_step, tr_loss / global_step) return global_step, tr_loss / global_step, best_steps
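# Worked toy example (values invented) of the masked mini-batch accuracy computed above:
# (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()
import torch

PAD_ID = 0                                         # assumption: vocab.PAD_ID == 0
y_real = torch.tensor([[3, 4, 5, PAD_ID, PAD_ID]])
y_pred = torch.tensor([[3, 4, 9, PAD_ID, PAD_ID]])

correct = (y_pred == y_real).float()               # [[1., 1., 0., 1., 1.]]
mb_acc = correct[y_real != PAD_ID].mean()          # only the 3 real tokens count -> 2/3
print(mb_acc.item())                               # 0.666...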
class TransformerBase(TrainableModel): MODEL_CONFIGURATIONS = { 'bert': (BertConfig, BertTokenizer), 'bert+core': (BertConfig, BertTokenizer), 'xlnet': (XLNetConfig, XLNetTokenizer), 'xlm': (XLMConfig, XLMTokenizer), } def __init__(self, model_type, model_name_or_path, labels: List[str] = None, config_name=None, tokenizer_name=None, do_lower_case=False, output_path=None, device='cpu', n_gpus=0): assert model_type in self.MODEL_CONFIGURATIONS.keys( ), "unsupported model_type" self.model_type = model_type self.model_name_or_path = model_name_or_path self.labels = labels self.num_labels = len(self.labels) + 1 # +1 for 0 for padding self.labels_id_map = {k: v for k, v in enumerate(self.labels, 1)} self.do_lower_case = do_lower_case if output_path is not None and not os.path.exists(output_path): raise FileNotFoundError('output_path is not found') self.output_path = output_path self.model_class = None config_class, tokenizer_class = self.MODEL_CONFIGURATIONS[model_type] self.config_class = config_class self.tokenizer_class = tokenizer_class self.tokenizer_name = tokenizer_name self.tokenizer = self._load_tokenizer(self.tokenizer_name) self.config_name = config_name self.config = self._load_config(config_name) self.model = None self.device = device self.n_gpus = n_gpus self._optimizer = None self._scheduler = None def to(self, device='cpu', n_gpus=0): if self.model is not None: self.model.to(device) if n_gpus > 1: self.model = torch.nn.DataParallel(self.model) self.device = device self.n_gpus = n_gpus @property def optimizer(self): return self._optimizer @optimizer.setter def optimizer(self, opt): self._optimizer = opt @property def scheduler(self): return self._scheduler @scheduler.setter def scheduler(self, sch): self._scheduler = sch def setup_default_optimizer(self, weight_decay: float = 0.0, learning_rate: float = 5e-5, adam_epsilon: float = 1e-8, warmup_steps: int = 0, total_steps: int = 0): # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] self.optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) self.scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=warmup_steps, t_total=total_steps) def _load_config(self, config_name=None): config = self.config_class.from_pretrained( config_name if config_name else self.model_name_or_path, num_labels=self.num_labels) return config def _load_tokenizer(self, tokenizer_name=None): tokenizer = self.tokenizer_class.from_pretrained( tokenizer_name if tokenizer_name else self.model_name_or_path, do_lower_case=self.do_lower_case) return tokenizer def save_model(self, output_dir, save_checkpoint=False, args=None): """Save model/tokenizer/arguments to given output directory """ # Create output directory if needed if not os.path.exists(output_dir): os.makedirs(output_dir) logger.info("Saving model checkpoint to %s", output_dir) model_to_save = self.model.module if hasattr(self.model, 'module') else self.model model_to_save.save_pretrained(output_dir) if not save_checkpoint: if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) with io.open(output_dir + os.sep + 'labels.txt', 'w', encoding='utf-8') as fw: for l in self.labels: fw.write('{}\n'.format(l)) if args is not None: 
torch.save(args, os.path.join(output_dir, 'training_args.bin')) @classmethod def load_model(cls, model_path, model_type): # Load a trained model and vocabulary from given path if not os.path.exists(model_path): raise FileNotFoundError with io.open(model_path + os.sep + 'labels.txt') as fp: labels = [l.strip() for l in fp.readlines()] return cls(model_type=model_type, model_name_or_path=model_path, labels=labels) def get_train_steps_epochs(self, max_steps, num_train_epochs, gradient_accumulation_steps, num_samples): if max_steps > 0: t_total = max_steps num_train_epochs = max_steps // (num_samples // gradient_accumulation_steps) + 1 else: t_total = num_samples // gradient_accumulation_steps * num_train_epochs return t_total, num_train_epochs def _train(self, data_set: DataLoader, dev_data_set: Union[DataLoader, List[DataLoader]] = None, test_data_set: Union[DataLoader, List[DataLoader]] = None, gradient_accumulation_steps: int = 1, per_gpu_train_batch_size: int = 8, max_steps: int = -1, num_train_epochs: int = 3, max_grad_norm: float = 1.0, logging_steps: int = 50, save_steps: int = 100): """Run model training batch_mapper: a function that maps a batch into parameters that the model expects in the forward method (for use with custom heads and models). If None it will default to the basic models input structure. logging_callback_fn: a function that is called in each evaluation step with the model as a parameter. """ t_total, num_train_epochs = self.get_train_steps_epochs( max_steps, num_train_epochs, gradient_accumulation_steps, len(data_set)) if self.optimizer is None and self.scheduler is None: logger.info("Loading default optimizer and scheduler") self.setup_default_optimizer(total_steps=t_total) train_batch_size = per_gpu_train_batch_size * max(1, self.n_gpus) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(data_set.dataset)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", train_batch_size * gradient_accumulation_steps) logger.info(" Gradient Accumulation steps = %d", gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 self.model.zero_grad() epochs_eval = [] train_iterator = trange(num_train_epochs, desc="Epoch") for _ in train_iterator: epoch_iterator = tqdm(data_set, desc="Iteration") for step, batch in enumerate(epoch_iterator): self.model.train() batch = tuple(t.to(self.device) for t in batch) inputs = self._batch_mapper(batch) ouputs = self.model(**inputs) loss = ouputs[0] # get loss if self.n_gpus > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) tr_loss += loss.item() if (step + 1) % gradient_accumulation_steps == 0: self.optimizer.step() self.scheduler.step() # Update learning rate schedule self.model.zero_grad() global_step += 1 if logging_steps > 0 and global_step % logging_steps == 0: # Log metrics and run evaluation on dev/test self._evaluate_during_train(dev_data_set, test_data_set) logger.info('lr = {}'.format( self.scheduler.get_lr()[0])) logger.info('loss = {}'.format( (tr_loss - logging_loss) / logging_steps)) logging_loss = tr_loss if save_steps > 0 and global_step % save_steps == 0: # Save model checkpoint self.save_model_checkpoint( output_path=self.output_path, name='checkpoint-{}'.format(global_step)) if 0 < max_steps < global_step: epoch_iterator.close() break if 0 < max_steps < global_step: train_iterator.close() break epochs_eval.append( self._evaluate_during_train(dev_data_set, test_data_set)) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) return epochs_eval def _evaluate_during_train(self, dev_data_set, test_data_set) -> dict: ds_evals = {'dev': None, 'test': None} for ds_name, ds in zip(['dev', 'test'], [dev_data_set, test_data_set]): if ds is None: # got no data loader continue if isinstance(ds, DataLoader): ds = [ds] for d in ds: logits, label_ids = self._evaluate(d) ds_evals[ds_name] = self.evaluate_predictions( logits, label_ids) return ds_evals def _evaluate(self, data_set: DataLoader): logger.info("***** Running inference *****") logger.info(" Batch size: {}".format(data_set.batch_size)) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None for batch in tqdm(data_set, desc="Inference iteration"): self.model.eval() batch = tuple(t.to(self.device) for t in batch) with torch.no_grad(): inputs = self._batch_mapper(batch) outputs = self.model(**inputs) if 'labels' in inputs: tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() else: logits = outputs[0] nb_eval_steps += 1 model_output = logits.detach().cpu() model_out_label_ids = inputs['labels'].detach().cpu( ) if 'labels' in inputs else None if preds is None: preds = model_output out_label_ids = model_out_label_ids else: preds = torch.cat((preds, model_output), dim=0) out_label_ids = torch.cat( (out_label_ids, model_out_label_ids), dim=0) if out_label_ids is not None else None if out_label_ids is None: return preds return preds, out_label_ids def _batch_mapper(self, batch): mapping = { 'input_ids': batch[0], 'attention_mask': batch[1], # XLM don't use segment_ids 'token_type_ids': batch[2] if self.model_type in ['bert', 'xlnet'] else None } if len(batch) == 4: mapping.update({'labels': batch[3]}) return 
mapping def evaluate_predictions(self, logits, label_ids): raise Exception( 'evaluate_predictions method must be implemented in order to be used for ' 'dev/test set evaluation') def save_model_checkpoint(self, output_path, name): output_dir_path = os.path.join(output_path, name) self.save_model(output_dir_path, save_checkpoint=True)
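# Hypothetical sketch of a subclass filling in evaluate_predictions, which TransformerBase
# deliberately leaves unimplemented. The accuracy metric, the padding-label convention
# (label id 0 = padding, matching `num_labels = len(self.labels) + 1` above) and the class
# name are assumptions for illustration only.
import torch

class TokenClassifierExample(TransformerBase):
    def evaluate_predictions(self, logits, label_ids):
        preds = torch.argmax(logits, dim=-1)            # (batch, seq_len)
        mask = label_ids != 0                           # ignore padding positions
        correct = (preds == label_ids)[mask].float()
        return {'accuracy': correct.mean().item()}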
def fit(model, training_iter, eval_iter, num_train_steps, device, n_gpu, verbose=1): # ------------------结果可视化------------------------ if args.local_rank in [-1, 0]: TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now()) tb_writer = SummaryWriter('log/%s'%TIMESTAMP) # ---------------------优化器------------------------- param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] t_total = num_train_steps optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)#int(t_total*args.warmup_proportion) # ---------------------GPU半精度fp16----------------------------- if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # ---------------------模型初始化---------------------- model.to(device) tr_loss, logging_loss = 0.0, 0.0 # ------------------------训练------------------------------ best_f1 = 0 #start = time.time() global_step = 0 set_seed(args, n_gpu) # Added here for reproductibility (even between python 2 and 3) bar = tqdm(range(t_total), total = t_total) nb_tr_examples, nb_tr_steps = 0, 0 for step in bar: model.train() batch = next(training_iter) batch = tuple(t.to(device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids 'labels': batch[3]} encode = model(**inputs) encode = encode[0]#提取预测结果 loss = model.loss_fn(encode, labels=inputs['labels']) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() #torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() #torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() train_loss = round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += inputs['input_ids'].size(0) nb_tr_steps += 1 if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 if (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.local_rank in [-1, 0] and \ args.do_eval and 
(step+1)%(args.eval_steps*args.gradient_accumulation_steps)==0: # -----------------------验证---------------------------- model.eval() y_predicts, y_labels = [], [] eval_loss, eval_acc, eval_f1 = 0, 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for _, batch in enumerate(eval_iter): batch = tuple(t.to(device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids 'labels': batch[3]} with torch.no_grad(): encode = model(**inputs) encode = encode[0] # 提取预测结果 eval_los = model.loss_fn(encode, labels=inputs['labels']) predicts = model.predict(encode)#.detach().cpu().numpy() nb_eval_examples += inputs['input_ids'].size(0) nb_eval_steps += 1 eval_loss += eval_los.mean().item() y_predicts.append(torch.from_numpy(predicts)) labels = inputs['labels'].view(1, -1) labels = labels[labels != -1] y_labels.append(labels) eval_loss = eval_loss / nb_eval_steps eval_predicted = torch.cat(y_predicts, dim=0).cpu().numpy() eval_labeled = torch.cat(y_labels, dim=0).cpu().numpy() eval_f1 = model.acc_rec_f1(eval_predicted, eval_labeled)#eval_acc, eval_rec, logger.info( '\n\nglobal_step %d - train_loss: %4f - eval_loss: %4f - eval_f1:%4f\n' % (global_step, train_loss, eval_loss, eval_f1)) # 保存最好的模型 if eval_f1 > best_f1: best_f1 = eval_f1 save_model(model, args.output_dir) if args.local_rank in [-1, 0]: tb_writer.add_scalar('train_loss', train_loss, step)#.item() tb_writer.add_scalar('eval_loss', eval_loss, step)#.item() / count tb_writer.add_scalar('eval_f1', eval_f1, step)#eval_acc tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) if args.local_rank in [-1, 0]: tb_writer.close()
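# The fp16 branch in fit() above leaves gradient clipping commented out. With apex amp, the
# commonly documented pattern clips the master parameters after backprop through the scaled
# loss. Sketch under that assumption (argument names mirror fit()):
import torch

def backward_with_clipping(loss, model, optimizer, args):
    if args.fp16:
        from apex import amp
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)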
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader( ) num_train_optimization_steps = self.train_steps # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels) model = BertForSequenceClassification.from_pretrained( self.model_name_or_path, self.args, config=config) model.to(self.device) model.train() # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 best_MRR = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, response_mask=response_mask, history_mask=history_mask, labels=label_ids) tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % ( self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] scores = [] ID = [x.guid for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, utterance_mask, response_mask, history_mask, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) utterance_mask = utterance_mask.to(self.device) response_mask = response_mask.to(self.device) history_mask = history_mask.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, response_mask=response_mask, history_mask=history_mask, 
labels=label_ids) logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, response_mask=response_mask, history_mask=history_mask, ) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracyCQA(inference_logits, gold_labels) eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = compute_DOUBAN( ID, scores, gold_labels) r_at_1 = r_at_k(ID, scores, gold_labels, 1) r_at_2 = r_at_k(ID, scores, gold_labels, 2) r_at_5 = r_at_k(ID, scores, gold_labels, 5) # print('eval_mrr',eval_mrr) print('eval_F1', eval_accuracy, 'eval_MRR', eval_DOUBAN_MRR, 'eval_MAP', eval_DOUBAN_MAP, 'eval_Precision1', eval_Precision1, 'r10@1', r_at_1, 'r10@2', r_at_2, 'r10@5', r_at_5, 'global_step', global_step, 'loss', train_loss) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'eval_MRR': eval_DOUBAN_MRR, 'eval_MAP': eval_DOUBAN_MAP, 'eval_Precision1': eval_Precision1, 'r10@1': r_at_1, 'r10@2': r_at_2, 'r10@5': r_at_5, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') # if eval_accuracy > best_acc : if eval_DOUBAN_MRR > best_MRR: print("=" * 80) print("Best MRR", eval_DOUBAN_MRR) print("Saving Model......") # best_acc = eval_accuracy best_MRR = eval_DOUBAN_MRR # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." 
) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) # Prepare model model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 
training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and (step + 1) % ( args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if args.do_test: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False)
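# Worked example (numbers invented) of the accumulation arithmetic in main() above: the
# DataLoader is built with train_batch_size // gradient_accumulation_steps examples per
# forward pass and the optimizer steps once every gradient_accumulation_steps passes, so
# each parameter update still sees roughly train_batch_size examples.
per_gpu_train_batch_size = 8
n_gpu = 2
gradient_accumulation_steps = 4

train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)       # 16, as computed in main()
micro_batch = train_batch_size // gradient_accumulation_steps     # 4 examples per forward pass
examples_per_update = micro_batch * gradient_accumulation_steps   # 16 examples per optimizer.step()
print(train_batch_size, micro_batch, examples_per_update)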
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/douban/', type=str, required=False, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--task_name", default='ubuntu', type=str, required=False, help="The name of the task to train.") parser.add_argument( "--output_dir", default='/hdd/lujunyu/model/chatbert/douban_base_si_aug/', type=str, required=False, help="The output directory where the model checkpoints will be written." ) parser.add_argument( "--dialog_augmentation_path", default= '/hdd/lujunyu/dataset/multi_turn_corpus/douban/train_augment_3.txt', ## train_augment_3.txt type=str, help="Whether to use augmentation") ## Other parameters parser.add_argument( "--init_model_name", default='bert-base-chinese', type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=256, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.") parser.add_argument("--do_test", default=True, action='store_true', help="Whether to run eval on the test set.") parser.add_argument("--train_batch_size", default=800, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=100, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-6, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_steps", default=0.0, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=1e-3, type=float, help="weight_decay") parser.add_argument("--save_checkpoints_steps", default=25000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=12, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=20, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") bert_config = BertConfig.from_pretrained(args.init_model_name, num_labels=2) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): if args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) else: os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.init_model_name, do_lower_case=args.do_lower_case) if args.dialog_augmentation_path: train_dataset = DoubanDataset(file_path=args.dialog_augmentation_path, max_seq_length=args.max_seq_length, tokenizer=tokenizer) else: train_dataset = DoubanDataset(file_path=os.path.join( args.data_dir, "train.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer) eval_dataset = DoubanDataset(file_path=os.path.join( args.data_dir, "dev.txt"), max_seq_length=args.max_seq_length, tokenizer=tokenizer) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, sampler=RandomSampler(train_dataset), num_workers=4) eval_dataloader = torch.utils.data.DataLoader( eval_dataset, batch_size=args.eval_batch_size, sampler=SequentialSampler(eval_dataset), num_workers=4) model = BertForSequenceClassification.from_pretrained( '/hdd/lujunyu/model/chatbert/douban_base_si_aug_beifen/model.pt', config=bert_config) model.to(device) num_train_steps = None if args.do_train: num_train_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare optimizer param_optimizer = list(model.named_parameters()) # remove pooler, which is not used thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps) else: optimizer = None scheduler = None if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, 
device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 best_acc = 0.0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients scheduler.step() model.zero_grad() global_step += 1 if (step + 1) % args.save_checkpoints_steps == 0: ### Evaluate at the end of epoches model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 logits_all = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() for i in range(len(logits)): logits_all += [logits[i]] tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy } output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) output_eval_file = os.path.join(args.output_dir, "logits_dev.txt") with open(output_eval_file, "w") as f: for i in range(len(logits_all)): for j in range(len(logits_all[i])): f.write(str(logits_all[i][j])) if j == len(logits_all[i]) - 1: f.write("\n") else: f.write(" ") ### Save the best checkpoint if best_acc < eval_accuracy: try: ### Remove 'module' prefix when using DataParallel state_dict = model.module.state_dict() except AttributeError: state_dict = model.state_dict() torch.save(state_dict, os.path.join(args.output_dir, "model.pt")) best_acc = eval_accuracy logger.info('Saving the best model in {}'.format( os.path.join(args.output_dir, "model.pt"))) model.train()
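The update logic above (scale the loss by gradient_accumulation_steps, call backward on every batch, and only step the optimizer and scheduler and clear gradients every gradient_accumulation_steps batches) can be sketched in isolation as follows; plain SGD and a constant LambdaLR stand in for AdamW and WarmupLinearSchedule, and the tensors are toy data:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1.0)
accum_steps = 4

for step in range(16):
    x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))
    loss = torch.nn.functional.cross_entropy(model(x), y)
    (loss / accum_steps).backward()      # gradients accumulate across the micro-batches
    if (step + 1) % accum_steps == 0:
        optimizer.step()                 # one optimizer update per accumulation window
        scheduler.step()                 # advance the LR schedule once per update
        optimizer.zero_grad()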
def train(self): model = BertForSequenceClassification.from_pretrained( self.args.model_name_or_path, self.args, config=self.config) model.to(self.device) logger.info('准备数据') data = DATABDCI( debug=False, data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/', data_process_output='/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/') train_examples = data.read_examples( os.path.join(self.data_process_output, 'train.csv')) train_features = data.convert_examples_to_features( train_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # 这步干嘛的? train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size // self.gradient_accumulation_steps) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) best_acc = 0 global_step = 0 model.train() train_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(self.train_steps), total=self.train_steps) train_dataloader = cycle(train_dataloader) for step in bar: batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) train_loss += loss.item() train_loss = round( train_loss * self.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if self.do_eval and (step + 1) % ( self.eval_steps * self.gradient_accumulation_steps) == 0: inference_labels = [] scores = [] gold_labels = [] inference_logits = [] eval_examples = data.read_examples( os.path.join(self.data_process_output, 'dev.csv')) eval_features = data.convert_examples_to_features( eval_examples, self.tokenizer, self.max_seq_length) ID1 = [x.sentence_ID1 for x in eval_examples] ID2 = [x.sentence_ID2 for x in eval_examples] all_input_ids = torch.tensor(data.select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, 
all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 count = 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: # ID1_list_eachbatch = ID1[count*args.eval_batch_size:(count+1)*args.eval_batch_size] # ID2_list_eachbatch = ID2[count * args.eval_batch_size:(count + 1) * args.eval_batch_size] input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) # scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) # scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) # eval_mrr = compute_MRR(scores, gold_labels, ID1, ID2) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, # 'mrr':eval_mrr, 'loss': train_loss } output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
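The checkpointing branch above saves the underlying module's state_dict so that a model wrapped in DataParallel can be reloaded without the "module." prefix. A small sketch of that pattern, with an illustrative helper name and output path:

import os
import torch

def save_best(model, output_dir, filename="pytorch_model.bin"):
    os.makedirs(output_dir, exist_ok=True)
    # unwrap torch.nn.DataParallel (or DDP) before saving, as in the code above
    model_to_save = model.module if hasattr(model, "module") else model
    torch.save(model_to_save.state_dict(), os.path.join(output_dir, filename))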
def train(config, train_dataset, model, tokenizer): """ Train the model """ config.train_batch_size = config.per_gpu_train_batch_size * \ max(1, config.n_gpu) if config.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=config.train_batch_size) if config.max_steps > 0: t_total = config.max_steps config.num_train_epochs = config.max_steps // ( len(train_dataloader) // config.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader) // config.gradient_accumulation_steps * config.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=config.warmup_steps, t_total=t_total) if config.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if config.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", config.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", config.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", config.train_batch_size * config.gradient_accumulation_steps * (torch.distributed.get_world_size() if config.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", config.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(config.num_train_epochs), desc="Epoch", disable=config.local_rank not in [-1, 0]) # Added here for reproductibility (even between python 2 and 3) set_seed(config.seed) loss_current=100000 for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=config.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(config.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], # XLM and RoBERTa don't use segment_ids 'token_type_ids': batch[2], 'labels': batch[3], 'e1_mask': batch[4], 'e2_mask': batch[5], } outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if config.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if config.gradient_accumulation_steps > 1: loss = loss / config.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_( model.parameters(), config.max_grad_norm) tr_loss += loss.item() if (step + 1) % config.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if config.local_rank in [-1, 0] and config.save_steps > 0 and global_step % config.save_steps == 0: # Save model checkpoint output_dir = os.path.join( config.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, 'module') else model model_to_save.save_pretrained(output_dir) torch.save(config, os.path.join( output_dir, 'training_config.bin')) logger.info("Saving model checkpoint to %s", output_dir) if config.max_steps > 0 and global_step > config.max_steps: epoch_iterator.close() break if config.local_rank in [-1, 0] and config.logging_steps > 0 :#and global_step % config.logging_steps == 0: # Log metrics # Only evaluate when single GPU otherwise metrics may not average well if config.local_rank == -1 and config.evaluate_during_training: results = evaluate(config, model, tokenizer) logging_loss = tr_loss if config.max_steps > 0 and global_step > config.max_steps: train_iterator.close() break return global_step, tr_loss / global_step
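The two branches above that derive t_total (the number of optimizer updates handed to WarmupLinearSchedule) reduce to a small helper; the helper name and the example numbers below are illustrative only:

def total_optimization_steps(num_batches, accumulation_steps, num_epochs, max_steps=0):
    if max_steps > 0:
        # max_steps wins; the code above then recomputes num_train_epochs from it
        return max_steps
    return (num_batches // accumulation_steps) * num_epochs

# e.g. 1000 batches per epoch, accumulation of 4, 3 epochs -> 750 optimizer steps
assert total_optimization_steps(1000, 4, 3) == 750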
def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed, epochs, data_path, pretrained_path, valid_path, no_cuda=False, dropout=0.3, weight_decay=0.01, warmup_proportion=0.1, learning_rate=5e-5, adam_epsilon=1e-8, max_seq_length=128, squeeze=True, max_grad_norm=1.0, eval_batch_size=32, epoch_save_model=False, model_name='BERT', embedding_path=None, split_train_data=False, motherfile = False): if os.path.exists(output_dir) and os.listdir(output_dir): raise ValueError("Output directory (%s) already exists and is not empty." % output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO, filename=os.path.join(output_dir, "log.txt")) logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) logger = logging.getLogger(__name__) if gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1" % gradient_accumulation_steps) train_batch_size = train_batch_size // gradient_accumulation_steps random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) # add one for IGNORE label if motherfile: print(data_path) train_examples, train_label_list = get_examples_from_motherfile(data_path, 'train') val_examples, val_label_list = get_examples_from_motherfile(data_path, 'test') train_label_list.extend(val_label_list) label_list = list(set(train_label_list)) elif split_train_data: examples, label_list = get_examples(data_path, 'train') random.shuffle(examples) train_examples = examples[0:int(len(examples)*0.6)] val_examples = examples[int(len(examples)*0.6):int(len(examples)*0.8)] eval_examples = examples[int(len(examples)*0.8):] else: train_examples = None train_examples, label_list = get_examples(data_path, 'train') num_train_optimization_steps = 0 num_labels = len(label_list) + 1 num_train_optimization_steps = int( len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs hidden_size = 300 if pretrained_path == None else 768 if 'base' in pretrained_path else 1024 device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu' logger.info(device) print(pretrained_path) if model_name == 'HERBERT': model = AutoTokenizerForTokenClassification( pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout, device=device) elif model_name == 'BERT_MULTILINGUAL': model = BertBaseMultilingualCased( pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout, device=device) elif model_name == 'Reformer': model = Reformer(n_labels=num_labels, hidden_size=512, dropout=dropout, device=device, max_seq_length=max_seq_length, batch_size=train_batch_size) else: model = XLMRForTokenClassification(pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout=dropout, device=device) model.to(device) no_decay = ['bias', 'final_layer_norm.weight'] params = list(model.named_parameters()) optimizer_grouped_parameters = [ {'params': [p for n, p in params if not any( nd in n for nd in no_decay)], 'weight_decay': weight_decay}, {'params': [p for n, p in params if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] warmup_steps = int(warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) 
train_features = convert_examples_to_features( train_examples, label_list, max_seq_length, model.encode_word) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = create_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=train_batch_size) if not split_train_data: val_examples, _ = get_examples(valid_path, 'valid') val_features = convert_examples_to_features( val_examples, label_list, max_seq_length, model.encode_word) val_data = create_dataset(val_features) best_val_f1 = 0.0 for epoch_no in range(1, epochs+1): logger.info("Epoch %d" % epoch_no) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 model.train() steps = len(train_dataloader) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, label_ids, l_mask, valid_ids, = batch loss = model(input_ids, label_ids, l_mask, valid_ids) if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_( model.parameters(), max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if step % 5 == 0: logger.info('Step = %d/%d; Loss = %.4f' % (step+1, steps, tr_loss / (step+1))) if (step + 1) % gradient_accumulation_steps == 0: optimizer.step() scheduler.step() model.zero_grad() logger.info("\nTesting on validation set...") f1, report = evaluate_model(model, val_data, label_list, eval_batch_size, device) print(report) if f1 > best_val_f1: best_val_f1 = f1 logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1) logger.info("%s\n" % report) torch.save(model.state_dict(), open(os.path.join(output_dir, 'model.pt'), 'wb')) save_params(output_dir, dropout, num_labels, label_list) if epoch_save_model: epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no) os.makedirs(epoch_output_dir) torch.save(model.state_dict(), open(os.path.join(epoch_output_dir, 'model.pt'), 'wb')) save_params(epoch_output_dir, dropout, num_labels, label_list)
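WarmupLinearSchedule here comes from the older pytorch_transformers package: the learning rate ramps linearly over warmup_steps and then decays linearly to zero at t_total. A rough stand-in using a plain LambdaLR (a sketch of the schedule's shape, not the library's implementation; newer transformers releases expose a comparable get_linear_schedule_with_warmup):

import torch

def warmup_linear(warmup_steps, t_total):
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)                                 # linear warmup
        return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))   # linear decay
    return lr_lambda

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=5e-5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warmup_linear(100, 1000))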
def run_train(args): # --------- data processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case) label_list = processor.get_labels() label2id = {label: i for i, label in enumerate(label_list)} id2label = {i: label for i, label in enumerate(label_list)} train_data = processor.get_train(config['data_dir'] / f"{args.data_name}.train.pkl") train_examples = processor.create_examples(lines=train_data, example_type='train', cached_examples_file=config['data_dir'] / f"cached_train_examples_{args.arch}") train_features = processor.create_features(examples=train_examples, max_seq_len=args.train_max_seq_len, cached_features_file=config[ 'data_dir'] / "cached_train_features_{}_{}".format( args.train_max_seq_len, args.arch )) train_dataset = processor.create_dataset(train_features, is_sorted=args.sorted) if args.sorted: train_sampler = SequentialSampler(train_dataset) else: train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) valid_data = processor.get_dev(config['data_dir'] / f"{args.data_name}.valid.pkl") valid_examples = processor.create_examples(lines=valid_data, example_type='valid', cached_examples_file=config['data_dir'] / f"cached_valid_examples_{args.arch}") valid_features = processor.create_features(examples=valid_examples, max_seq_len=args.eval_max_seq_len, cached_features_file=config['data_dir'] / "cached_valid_features_{}_{}".format( args.eval_max_seq_len, args.arch)) valid_dataset = processor.create_dataset(valid_features) valid_sampler = SequentialSampler(valid_dataset) valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.eval_batch_size) # ------- model logger.info("initializing model") if args.resume_path: args.resume_path = Path(args.resume_path) model = BertForMultiClass.from_pretrained(args.resume_path, num_labels=len(label_list)) else: model = BertForMultiClass.from_pretrained(config['bert_model_dir'], num_labels=len(label_list)) t_total = int(len(train_dataloader) / args.gradient_accumulation_steps * args.epochs) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] warmup_steps = int(t_total * args.warmup_proportion) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # ---- callbacks logger.info("initializing callbacks") train_monitor = TrainingMonitor(file_dir=config['figure_dir'], arch=args.arch) model_checkpoint = ModelCheckpoint(checkpoint_dir=config['checkpoint_dir'], mode=args.mode, monitor=args.monitor, arch=args.arch, save_best_only=args.save_best) # **************************** training model *********************** logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Num Epochs = %d", args.epochs) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * ( torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) trainer = Trainer(n_gpu=args.n_gpu, model=model, epochs=args.epochs, logger=logger, criterion=CrossEntropy(), optimizer=optimizer, lr_scheduler=lr_scheduler, early_stopping=None, training_monitor=train_monitor, fp16=args.fp16, resume_path=args.resume_path, grad_clip=args.grad_clip, model_checkpoint=model_checkpoint, gradient_accumulation_steps=args.gradient_accumulation_steps, evaluate=F1Score(), class_report=ClassReport(target_names=[id2label[x] for x in range(len(label2id))])) trainer.train(train_data=train_dataloader, valid_data=valid_dataloader, seed=args.seed)
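run_train builds its label maps once and reuses them for the ClassReport target names, so target_names[i] always corresponds to class id i. A trivial illustration with made-up labels:

label_list = ["finance", "sports", "tech"]          # illustrative labels, not the task's
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
target_names = [id2label[x] for x in range(len(label2id))]
assert target_names == label_list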
def main(): torch.manual_seed(42) # Random #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 128, 'learning_rate': 0.01, 'num_epochs': 5, 'num_layers': 2, 'oversample': False, 'soft_labels': False} # Glove params = { 'batch_size': 32, 'dropout': 0, 'hidden_dim': 128, 'learning_rate': 0.001, 'num_epochs': 5, 'num_layers': 2, 'oversample': False, 'soft_labels': False } # Random #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 256, 'learning_rate': 0.0001, 'num_epochs': 5, 'num_layers': 3, 'oversample': False, 'soft_labels': False} #some params experiment_number = 1 test_percentage = 0.1 val_percentage = 0.2 batch_size = params["batch_size"] num_epochs = 5 #params["num_epochs"] dropout = params["dropout"] embedding_dim = 300 model_name = "CNN" #'Bert' #"CNN" #"LSTM" unsupervised = True embedding = "Glove" #"Random" ##"Glove" # "Both" # soft_labels = False combine = embedding == "Both" # LSTM parameters if model_name == "LSTM": hidden_dim = params["hidden_dim"] num_layers = params["num_layers"] # Bert parameter num_warmup_steps = 100 num_total_steps = 1000 if model_name == "Bert": embedding = "None" if embedding == "Both": combine = True embedding = "Random" else: combine = False learning_rate = params["learning_rate"] #5e-5, 3e-5, 2e-5 oversample_bool = False weighted_loss = True # load data dataset = Dataset("../data/cleaned_tweets_orig.csv", use_embedding=embedding, embedd_dim=embedding_dim, combine=combine, for_bert=(model_name == "Bert")) #dataset.oversample() train_data, val_test_data = split_dataset(dataset, test_percentage + val_percentage) val_data, test_data = split_dataset( val_test_data, test_percentage / (test_percentage + val_percentage)) # print(len(train_data)) #save_data(train_data, 'train') #save_data(test_data, 'test') #define loaders if oversample_bool: weights, targets = get_loss_weights(train_data, return_targets=True) class_sample_count = [ 1024 / 20, 13426, 2898 / 2 ] # dataset has 10 class-1 samples, 1 class-2 samples, etc. 
oversample_weights = 1 / torch.Tensor(class_sample_count) oversample_weights = oversample_weights[targets] # oversample_weights = torch.tensor([0.9414, 0.2242, 0.8344]) #torch.ones((3))- sampler = torch.utils.data.sampler.WeightedRandomSampler( oversample_weights, len(oversample_weights)) train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=my_collate, sampler=sampler) else: train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=my_collate) val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, collate_fn=my_collate) #define model if model_name == "CNN": vocab_size = len(dataset.vocab) model = CNN(vocab_size, embedding_dim, combine=combine) elif model_name == "LSTM": vocab_size = len(dataset.vocab) model = LSTM(vocab_size, embedding_dim, batch_size=batch_size, hidden_dim=hidden_dim, lstm_num_layers=num_layers, combine=combine, dropout=dropout) elif model_name == "Bert": model = BertForSequenceClassification.from_pretrained( "bert-base-uncased", num_labels=3) train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=bert_collate) val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, collate_fn=bert_collate) #device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #LOSS : weighted cross entropy loss, by class counts of other classess if weighted_loss: weights = torch.tensor([0.9414, 0.2242, 0.8344], device=device) else: weights = torch.ones(3, device=device) #weights = torch.tensor([1.0, 1.0, 1.0], device = device) #get_loss_weights(train_data).to(device) # not to run again criterion = nn.CrossEntropyLoss(weight=weights) if soft_labels: criterion = weighted_soft_cross_entropy #latent model if unsupervised: vocab_size = len(dataset.vocab) criterion = nn.CrossEntropyLoss(weight=weights, reduction='none') model = Rationalisation_model(vocab_size, embedding_dim=embedding_dim, model=model_name, batch_size=batch_size, combine=combine, criterion=criterion) if not model_name == "Bert": model.embedding.weight.data.copy_(dataset.vocab.vectors) if combine: model.embedding_glove.weight.data.copy_(dataset.glove.vectors) #model to device model.to(device) #optimiser optimizer = optim.Adam(model.parameters(), lr=learning_rate) if model_name == "Bert": optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False) # Linear scheduler for adaptive lr scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) else: scheduler = None plot_log = defaultdict(list) for epoch in range(num_epochs): #train and validate epoch_loss, epoch_acc = train_epoch(model, train_loader, optimizer, criterion, device, soft_labels=soft_labels, weights=weights, scheduler=scheduler, unsupervised=unsupervised) val_loss, val_acc = evaluate_epoch(model, val_loader, criterion, device, soft_labels=soft_labels, weights=weights, unsupervised=unsupervised) #save for plotting for name, point in zip( ["train_loss", "train_accuracy", "val_loss", "val_accuracy"], [epoch_loss, epoch_acc, val_loss, val_acc]): plot_log[f'{name}'] = point #realtime feel print(f'Epoch: {epoch+1}') print( f'\tTrain Loss: {epoch_loss:.5f} | Train Acc: {epoch_acc*100:.2f}%' ) print(f'\t Val. Loss: {val_loss:.5f} | Val. 
Acc: {val_acc*100:.2f}%') sample_sentences_and_z(model, train_loader, device, dataset.vocab) #save plot results_directory = f'plots/{experiment_number}' os.makedirs(results_directory, exist_ok=True) for name, data in plot_log.items(): save_plot(data, name, results_directory) #save model torch.save(model, os.path.join(results_directory, 'model_cnn.pth')) #confusion matrix and all that fun loss, acc, predictions, ground_truth = evaluate_epoch( model, val_loader, criterion, device, is_final=True, soft_labels=soft_labels, weights=weights, unsupervised=unsupervised) conf_matrix = confusion_matrix(ground_truth, predictions) class_report = classification_report(ground_truth, predictions) print('\nFinal Loss and Accuracy\n----------------\n') print(f'\t Val. Loss: {loss:.5f} | Val. Acc: {acc*100:.2f}%') print('\nCONFUSION MATRIX\n----------------\n') print(conf_matrix) print('\nCLASSIFICATION REPORT\n----------------------\n') print(class_report) plot_confusion_matrix(ground_truth, predictions, classes=["Hate speech", "Offensive", "Neither"], normalize=False, title='Confusion matrix') plt.show()
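The oversampling path above relies on WeightedRandomSampler with per-example weights equal to the inverse class frequency. A compact sketch with toy targets (the class counts and hard-coded weights in the script are specific to its dataset):

import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

targets = torch.tensor([0, 0, 0, 1, 2, 2])
class_sample_count = torch.bincount(targets).float()   # tensor([3., 1., 2.])
weights = (1.0 / class_sample_count)[targets]          # rarer classes get larger weights
sampler = WeightedRandomSampler(weights, num_samples=len(weights))
data = TensorDataset(torch.randn(6, 4), targets)
train_loader = DataLoader(data, batch_size=2, sampler=sampler)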
def train(self): # Model model = KobertBiLSTMCRF(config=self.model_config, num_classes=len(self.tr_ds.ner_to_index)) model.train() # optim train_examples_len = len(self.tr_ds) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs t_total = len( self.tr_dl ) // self.model_config.gradient_accumulation_steps * self.model_config.epochs optimizer = AdamW(optimizer_grouped_parameters, lr=self.model_config.learning_rate, eps=self.model_config.adam_epsilon) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=self.model_config.warmup_steps, t_total=t_total) device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') n_gpu = torch.cuda.device_count() # if n_gpu > 1: # model = torch.nn.DataParallel(model) model.to(device) # save tb_writer = SummaryWriter('{}/runs'.format(self.model_dir)) checkpoint_manager = CheckpointManager(self.model_dir) summary_manager = SummaryManager(self.model_dir) best_val_loss = 1e+10 best_train_acc = 0 # Train! self.logger.info("***** Running training *****") self.logger.info(" Num examples = %d", len(self.tr_ds)) self.logger.info(" Num Epochs = %d", self.model_config.epochs) self.logger.info(" Instantaneous batch size per GPU = %d", self.model_config.batch_size) # logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", # args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) self.logger.info(" Gradient Accumulation steps = %d", self.model_config.gradient_accumulation_steps) self.logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_dev_acc, best_dev_loss = 0.0, 99999999999.0 best_steps = 0 model.zero_grad() self.set_seed( ) # Added here for reproductibility (even between python 2 and 3) # Train train_iterator = trange(int(self.model_config.epochs), desc="Epoch") for _epoch, _ in enumerate(train_iterator): epoch_iterator = tqdm( self.tr_dl, desc="Iteration") # , disable=args.local_rank not in [-1, 0] epoch = _epoch for step, batch in enumerate(epoch_iterator): model.train() x_input, token_type_ids, y_real = map( lambda elm: elm.to(device), batch) log_likelihood, sequence_of_tags = model( x_input, token_type_ids, y_real) # loss: negative log-likelihood loss = -1 * log_likelihood if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if self.model_config.gradient_accumulation_steps > 1: loss = loss / self.model_config.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), self.model_config.max_grad_norm) tr_loss += loss.item() if (step + 1 ) % self.model_config.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 with torch.no_grad(): sequence_of_tags = torch.tensor(sequence_of_tags) print("sequence_of_tags: ", sequence_of_tags) print("y_real: ", y_real) print("loss: ", loss) print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real)) _tags = torch.squeeze(sequence_of_tags, dim=0) mb_acc = 
(_tags == y_real).float()[ y_real != self.vocab.PAD_ID].mean() #mb_acc = (sequence_of_tags == y_real).float()[y_real != self.vocab.PAD_ID].mean() tr_acc = mb_acc.item() tr_loss_avg = tr_loss / global_step tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc} # if step % 50 == 0: print( 'epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}' .format(epoch + 1, global_step, tr_summary['loss'], tr_summary['acc'])) if self.model_config.logging_steps > 0 and global_step % self.model_config.logging_steps == 0: # Log metrics if self.model_config.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well pass tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / self.model_config.logging_steps, global_step) self.logger.info( "Average loss: %s at global step: %s", str((tr_loss - logging_loss) / self.model_config.logging_steps), str(global_step)) logging_loss = tr_loss if self.model_config.save_steps > 0 and global_step % self.model_config.save_steps == 0: eval_summary, list_of_y_real, list_of_pred_tags = self.evaluate( model, self.val_dl) # Save model checkpoint output_dir = os.path.join(self.model_config.output_dir, 'epoch-{}'.format(epoch + 1)) if not os.path.exists(output_dir): os.makedirs(output_dir) self.logger.info("Saving model checkpoint to %s", output_dir) state = { 'global_step': global_step + 1, 'model_state_dict': model.state_dict(), 'opt_state_dict': optimizer.state_dict() } summary = {'train': tr_summary} summary_manager.update(summary) summary_manager.save('summary.json') is_best = tr_acc >= best_train_acc # acc 기준 (원래는 train_acc가 아니라 val_acc로 해야) # Save if is_best: best_train_acc = tr_acc checkpoint_manager.save_checkpoint( state, 'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format( epoch + 1, global_step, tr_acc)) print( "Saving model checkpoint as best-epoch-{}-step-{}-acc-{:.3f}.bin" .format(epoch + 1, global_step, best_dev_acc)) # print classification report and save confusion matrix cr_save_path = self.model_dir + '/best-epoch-{}-step-{}-acc-{:.3f}-cr.csv'.format( epoch + 1, global_step, best_dev_acc) cm_save_path = self.model_dir + '/best-epoch-{}-step-{}-acc-{:.3f}-cm.png'.format( epoch + 1, global_step, best_dev_acc) self.save_cr_and_cm(list_of_y_real, list_of_pred_tags, cr_save_path=cr_save_path, cm_save_path=cm_save_path) else: torch.save( state, os.path.join( output_dir, 'model-epoch-{}-step-{}-acc-{:.3f}.bin'. format(epoch + 1, global_step, tr_acc))) tb_writer.close() self.logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)
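The mb_acc line above is a masked token accuracy: predicted tag ids are compared to gold tag ids only at positions whose gold id is not the padding id. A self-contained sketch with toy tensors and an assumed PAD_ID of 0:

import torch

PAD_ID = 0  # assumption for the sketch; the code above uses self.vocab.PAD_ID
pred = torch.tensor([[2, 3, 1, 0], [4, 4, 0, 0]])
gold = torch.tensor([[2, 3, 2, PAD_ID], [4, 1, PAD_ID, PAD_ID]])
mask = gold != PAD_ID
mb_acc = (pred == gold).float()[mask].mean()   # 3 correct out of 5 non-pad positions
print(mb_acc.item())                           # 0.6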
def train(args, train_iter, dev, test, src_field, tgt_field, tag_field, checkpoint): # srcpadid = src_field.vocab.stoi['<pad>'] tokenizer = BertTokenizer.from_pretrained('bert-base-chinese') model = Classify_Extractor(args, tgt_field) if torch.cuda.is_available(): model.cuda() print_params(model) decay = args.decay if args.optimizer == 'bert': weight_decay = 0.0 no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] opt = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8) totalnum = 0 for i in train_iter: totalnum += 1 #print(args.lr) #print(args.maximum_steps) #exit() t_total = totalnum // decay * args.maximum_steps scheduler = WarmupLinearSchedule(opt, warmup_steps=0, t_total=t_total) else: opt = torch.optim.Adadelta(model.parameters(), lr=args.lr) best_e = 0.0 best_c = 0.0 best_epoch_for_c = 0 best_epoch_for_e = 0 offset = 0.0 pre_epoch = 0 patience_c = 0 patience_e = 0 if checkpoint is not None: print('model.load_state_dict(checkpoint[model])') model.load_state_dict(checkpoint['model']) if args.resume: opt.load_state_dict(checkpoint['optim']) best_f = checkpoint['f'] offset = checkpoint['iters'] pre_epoch = checkpoint['epoch'] print('*************************************') print('resume from {} epoch {} iters and best_f {}'.format( pre_epoch, offset, best_f)) print('*************************************') print("**************start training****************") start = time.time() for epoch in range(args.maxepoch): train_iter.init_epoch() epoch += pre_epoch for iters, train_batch in enumerate(train_iter): iters += offset model.train() # model.zero_grad() # model.constrain_transition() t1 = time.time() batch_src = train_batch.src #print(batch_src) #exit() src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src] maxlen = max([len(s) for s in batch_src]) src_mask = [] padded_sents = [] for s in src: new_s = s + [0] * (maxlen - len(s)) padded_sents.append(new_s) mask = [1] * len(s) + [0] * (maxlen - len(s)) src_mask.append(mask) # B T src = torch.tensor(padded_sents).long().cuda() # B T src_mask = torch.tensor(src_mask).byte().cuda() # src, src_mask = prepare_src(train_batch.src, srcpadid) tgt = prepare_tgt(train_batch.tgt) tag = train_batch.tag loss = model(src, src_mask, tgt, tag) # "update parameters" if decay > 1: loss = loss / decay loss.backward() # if args.grad_clip: # torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) if (iters + 1) % decay == 0: opt.step() scheduler.step() # Update learning rate schedule opt.zero_grad() # opt.step() t2 = time.time() loss = loss.item() print("epoch:{} iters:{} src:({},{}) tgt:({},{}) " "loss:{:.2f} t:{:.2f}".format(epoch + 1, iters + 1, *src.size(), *tgt.size(), loss, t2 - t1)) # if torch.cuda.is_available(): # torch.cuda.empty_cache() if (epoch + 1) % 1 == 0: print("=============validate model==============") with torch.no_grad(): dev.init_epoch() model.eval() # model.constrain_transition() sents = [] cy_true = [] cy_pred = [] for j, dev_batch in enumerate(dev): t1 = time.time() # src, src_mask = prepare_src(dev_batch.src, srcpadid) batch_src = dev_batch.src src = [ tokenizer.convert_tokens_to_ids(s) for s in batch_src ] maxlen = max([len(s) for s in batch_src]) src_mask = [] padded_sents = [] for s in src: new_s = s + [0] * (maxlen - len(s)) padded_sents.append(new_s) 
mask = [1] * len(s) + [0] * (maxlen - len(s)) src_mask.append(mask) # B T src = torch.tensor(padded_sents).long().cuda() # B T src_mask = torch.tensor(src_mask).byte().cuda() tgt = prepare_tgt(dev_batch.tgt) tag = dev_batch.tag.squeeze(-1) _, pre_tag = model.component_extraction(src, src_mask) pre_ctag = model.simile_classify(src, src_mask) cy_true.extend(tag.tolist()) cy_pred.extend(pre_ctag.tolist()) for sen, tags, p_tags, c_tags in zip( src, tgt, pre_tag, tag): sen = sen[:len(p_tags)].tolist() tags = tags[:len(p_tags)].tolist() if c_tags == 1: sents.append([ sen, [tgt_field.vocab.itos[t] for t in tags], [tgt_field.vocab.itos[t] for t in p_tags] ]) print('dev iters: {}, t:{}'.format(j, time.time() - t1)) _, eprecision, erecall, ef1 = evaluate(sents) cprecision = precision_score(cy_true, cy_pred) crecall = recall_score(cy_true, cy_pred) cf1 = f1_score(cy_true, cy_pred) print( 'epoch: {} classify--> precision: {} recall: {} f1: {} best:{}' .format(epoch + 1, cprecision, crecall, cf1, best_c)) print('extractor--> precision: {} recall: {} f1: {} best: {}'. format(eprecision, erecall, ef1, best_e)) if cf1 > best_c: best_c = cf1 best_epoch_for_c = epoch + 1 print( 'save best classifier model at epoch={}'.format(epoch + 1)) checkpoint = { 'model': model.state_dict(), 'optim': opt.state_dict(), 'args': args } torch.save( checkpoint, '{}/{}.classify.best.pt'.format( args.model_path, args.model)) patience_c = 0 else: patience_c += 1 if ef1 > best_e: best_e = ef1 best_epoch_for_e = epoch + 1 print( 'save best extractor model at epoch={}'.format(epoch + 1)) checkpoint = { 'model': model.state_dict(), 'optim': opt.state_dict(), 'args': args } torch.save( checkpoint, '{}/{}.extractor.best.pt'.format( args.model_path, args.model)) patience_e = 0 else: patience_e += 1 if patience_c > args.patience and patience_e > args.patience: print("early stop at {}".format(epoch)) break if args.decay: opt.param_groups[0]['lr'] = opt.param_groups[0]['lr'] * args.decay print('*******Done********{}'.format( time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) minutes = (time.time() - start) // 60 if minutes < 60: print( 'best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{} mins' .format(best_c, best_e, best_epoch_for_c, best_epoch_for_e, minutes)) else: hours = minutes / 60 print( 'best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{:.1f} hours' .format(best_c, best_e, best_epoch_for_c, best_epoch_for_e, hours)) print('*******Testing************') model1 = Classify_Extractor(args, tgt_field) model1.cuda() load_from = '{}/{}.classify.best.pt'.format(args.model_path, args.model) print('load the best model {}'.format(load_from)) checkpoint = torch.load(load_from, map_location='cpu') print('load parameters') model1.load_state_dict(checkpoint['model']) model2 = Classify_Extractor(args, tgt_field) model2.cuda() load_from = '{}/{}.extractor.best.pt'.format(args.model_path, args.model) print('load the best model {}'.format(load_from)) checkpoint = torch.load(load_from, map_location='cpu') print('load parameters') model2.load_state_dict(checkpoint['model']) with torch.no_grad(): test.init_epoch() model1.eval() model2.eval() sents = [] cy_true = [] cy_pred = [] for j, test_batch in enumerate(test): t1 = time.time() # src, src_mask = prepare_src(test_batch.src, srcpadid) batch_src = test_batch.src src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src] maxlen = max([len(s) for s in batch_src]) src_mask = [] padded_sents = [] for s in src: new_s = s + [0] * (maxlen - len(s)) padded_sents.append(new_s) 
mask = [1] * len(s) + [0] * (maxlen - len(s)) src_mask.append(mask) # B T src = torch.tensor(padded_sents).long().cuda() # B T src_mask = torch.tensor(src_mask).byte().cuda() tgt = prepare_tgt(test_batch.tgt) tag = test_batch.tag.squeeze(-1) _, pre_tag = model2.component_extraction(src, src_mask) pre_ctag = model1.simile_classify(src, src_mask) cy_true.extend(tag.tolist()) cy_pred.extend(pre_ctag.tolist()) # for sen, tags, p_tags in zip(src, tgt, pre_tag): # sen = sen[:len(p_tags)].tolist() # tags = tags[:len(p_tags)].tolist() # sents.append([sen, [tgt_field.vocab.itos[t] for t in tags], # [tgt_field.vocab.itos[t] for t in p_tags]]) for sen, tags, p_tags, c_tags in zip(src, tgt, pre_tag, pre_ctag): sen = sen[:len(p_tags)].tolist() tags = tags[:len(p_tags)].tolist() if c_tags == 1: sents.append([ sen, [tgt_field.vocab.itos[t] for t in tags], [tgt_field.vocab.itos[t] for t in p_tags] ]) elif c_tags == 0: sents.append([ sen, [tgt_field.vocab.itos[t] for t in tags], ['O' for t in p_tags] ]) print('test iters: {}, t:{}'.format(j, time.time() - t1)) _, eprecision, erecall, ef1 = evaluate(sents) cprecision = precision_score(cy_true, cy_pred) crecall = recall_score(cy_true, cy_pred) cf1 = f1_score(cy_true, cy_pred) print('Testing classify--> precision: {} recall: {} f1: {}'.format( cprecision, crecall, cf1)) print('extractor--> precision: {} recall: {} f1: {}'.format( eprecision, erecall, ef1))
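Both the training and evaluation loops above pad each batch of token ids to the batch maximum by hand and build a 0/1 attention mask alongside it. The core of that padding, with toy id lists instead of tokenizer output:

import torch

batch_ids = [[101, 2345, 102], [101, 56, 78, 90, 102]]
maxlen = max(len(s) for s in batch_ids)
padded_sents = [s + [0] * (maxlen - len(s)) for s in batch_ids]
mask_rows = [[1] * len(s) + [0] * (maxlen - len(s)) for s in batch_ids]
src = torch.tensor(padded_sents, dtype=torch.long)   # (B, T) token ids
src_mask = torch.tensor(mask_rows, dtype=torch.bool) # (B, T); the code above keeps it as .byte()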
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) data_splitList = DATACQA.load_data(os.path.join(self.data_dir, 'train.csv'),n_splits=5) for split_index,each_data in enumerate(data_splitList): # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels) model = BertForSequenceClassification.from_pretrained(self.model_name_or_path, self.args, config=config) model.to(self.device) logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader(each_data) num_train_optimization_steps = self.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': self.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] scores = [] questions = [x.text_a for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) # Run prediction for full data model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, 
axis=1)) scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracyCQA(inference_logits, gold_labels) eval_mrr = compute_MRR_CQA(scores,gold_labels,questions) eval_5R20 = compute_5R20(scores,gold_labels,questions) result = {'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'eval_MRR':eval_mrr, 'eval_5R20':eval_5R20, 'global_step': global_step, 'loss': train_loss} output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc : print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr(model,'module') else model output_model_file = os.path.join(self.output_dir, "pytorch_model_{}.bin".format(split_index)) torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) del model gc.collect()
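Rather than iterating over epochs, the fold loop above wraps the training DataLoader in itertools.cycle and draws a fixed number of steps from it. A minimal sketch of that pattern with toy data and an illustrative step count:

from itertools import cycle

import torch
from torch.utils.data import DataLoader, TensorDataset

data = TensorDataset(torch.randn(10, 4), torch.randint(0, 2, (10,)))
train_dataloader = cycle(DataLoader(data, batch_size=4, shuffle=True))
train_steps = 7                         # illustrative; the class reads this from its config
for step in range(train_steps):
    x, y = next(train_dataloader)       # silently restarts the loader when it is exhausted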
def train(model, criterion, dataset, logger, train_csv_logger, val_csv_logger, test_csv_logger, args, epoch_offset): model = model.cuda() # process generalization adjustment stuff adjustments = [float(c) for c in args.generalization_adjustment.split(',')] assert len(adjustments) in (1, dataset['train_data'].n_groups) if len(adjustments) == 1: adjustments = np.array(adjustments * dataset['train_data'].n_groups) else: adjustments = np.array(adjustments) train_loss_computer = LossComputer( criterion, is_robust=args.robust, dataset=dataset['train_data'], alpha=args.alpha, gamma=args.gamma, adj=adjustments, step_size=args.robust_step_size, normalize_loss=args.use_normalized_loss, btl=args.btl, min_var_weight=args.minimum_variational_weight, sp=args.sp, mode=args.mode, ratio=args.ratio) # BERT uses its own scheduler and optimizer if args.model == 'bert': no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon) t_total = len(dataset['train_loader']) * args.n_epochs print(f'\nt_total is {t_total}\n') scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) else: if args.adam: optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.weight_decay) else: optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) if args.scheduler: scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, 'min', factor=0.1, patience=5, threshold=0.0001, min_lr=0, eps=1e-08) else: scheduler = None best_val_acc = 0 for epoch in range(epoch_offset, epoch_offset + args.n_epochs): logger.write('\nEpoch [%d]:\n' % epoch) logger.write(f'Training:\n') run_epoch(epoch, model, optimizer, dataset['train_loader'], train_loss_computer, logger, train_csv_logger, args, is_training=True, show_progress=args.show_progress, log_every=args.log_every, scheduler=scheduler) logger.write(f'\nValidation:\n') val_loss_computer = LossComputer(criterion, is_robust=args.robust, dataset=dataset['val_data'], step_size=args.robust_step_size, alpha=args.alpha) run_epoch(epoch, model, optimizer, dataset['val_loader'], val_loss_computer, logger, val_csv_logger, args, is_training=False) # Test set; don't print to avoid peeking # if dataset['test_data'] is not None: # test_loss_computer = LossComputer( # criterion, # is_robust=args.robust, # dataset=dataset['test_data'], # step_size=args.robust_step_size, # alpha=args.alpha) # run_epoch( # epoch, model, optimizer, # dataset['test_loader'], # test_loss_computer, # None, test_csv_logger, args, # is_training=False) # Inspect learning rates if (epoch + 1) % 1 == 0: for param_group in optimizer.param_groups: curr_lr = param_group['lr'] logger.write('Current lr: %f\n' % curr_lr) if args.scheduler and args.model != 'bert': if args.robust: val_loss, _ = val_loss_computer.compute_robust_loss_greedy( val_loss_computer.avg_group_loss, val_loss_computer.avg_group_loss) else: val_loss = val_loss_computer.avg_actual_loss scheduler.step( val_loss) #scheduler step to update lr at the end of epoch if epoch % args.save_step == 0: torch.save(model, os.path.join(args.log_dir, '%d_model.pth' % epoch)) if 
args.save_last: torch.save(model, os.path.join(args.log_dir, 'last_model.pth')) if args.save_best: if args.robust or args.reweight_groups: curr_val_acc = min(val_loss_computer.avg_group_acc) else: curr_val_acc = val_loss_computer.avg_acc logger.write(f'Current validation accuracy: {curr_val_acc}\n') if curr_val_acc > best_val_acc: best_val_acc = curr_val_acc torch.save(model, os.path.join(args.log_dir, 'best_model.pth')) logger.write(f'Best model saved at epoch {epoch}\n') if args.automatic_adjustment: gen_gap = val_loss_computer.avg_group_loss - train_loss_computer.exp_avg_loss adjustments = gen_gap * torch.sqrt( train_loss_computer.group_counts) train_loss_computer.adj = adjustments logger.write('Adjustments updated\n') for group_idx in range(train_loss_computer.n_groups): logger.write( f' {train_loss_computer.get_group_name(group_idx)}:\t' f'adj = {train_loss_computer.adj[group_idx]:.3f}\n') logger.write('\n')
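# LossComputer is imported from elsewhere; the loop above only relies on a handful of its
# attributes (avg_group_loss, avg_group_acc, avg_actual_loss, exp_avg_loss, group_counts, adj).
# As a rough sketch of what its robust branch might compute when is_robust=True -- assuming a
# group-DRO style exponentiated-gradient update, which the real class may or may not use:
import torch

def robust_group_loss_sketch(per_sample_loss, group_idx, adv_probs, step_size, adj=None):
    """One reweighting step over per-group losses (illustrative only)."""
    n_groups = adv_probs.numel()
    group_loss = torch.zeros(n_groups, device=per_sample_loss.device)
    group_count = torch.zeros(n_groups, device=per_sample_loss.device)
    for g in range(n_groups):
        mask = group_idx == g
        group_count[g] = mask.sum()
        if mask.any():
            group_loss[g] = per_sample_loss[mask].mean()
    if adj is not None:  # generalization adjustment: penalize groups with few samples
        adj = torch.as_tensor(adj, dtype=group_loss.dtype, device=group_loss.device)
        group_loss = group_loss + adj / group_count.clamp(min=1).sqrt()
    # exponentiated-gradient update of the adversarial group weights
    adv_probs = adv_probs * torch.exp(step_size * group_loss.detach())
    adv_probs = adv_probs / adv_probs.sum()
    return (adv_probs * group_loss).sum(), adv_probs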
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--test_file", default='', type=str, help="Test file") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--validate_per_epoch", default=3, type=int, help="Number of validation runs per epoch.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_validation", action='store_true', help="Whether to run validation.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--drop", default=0.1, type=float, help="Hidden-layer dropout probability for BERT.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of update steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_tag_labels(args.data_dir) global EVAL_TAGS EVAL_TAGS = [ label for label in label_list if label not in ['O', '[CLS]', '[SEP]'] ] # EVAL_TAGS = [f'{x}-{y}' for x in ['B', 'I'] for y in EVAL_TAGS] logger.info(EVAL_TAGS) num_labels = len(label_list) + 1 allowed_tags = set(EVAL_TAGS + ['O']) do_lower_case = 'uncased' in args.bert_model tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=do_lower_case) train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab # Prepare model config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name, hidden_dropout_prob=args.drop) print(config) model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} best_dev = 0.0 if args.do_validation: dev_examples = processor.get_dev_examples(args.data_dir, label_list) dev_features = convert_examples_to_features(dev_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Dev set *****") logger.info(" Num examples = %d", len(dev_examples)) all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in dev_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in dev_features], dtype=torch.long) dev_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) dev_sampler = SequentialSampler(dev_data) dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.eval_batch_size) validation_steps = int( len(train_examples) / args.train_batch_size) // args.validate_per_epoch if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() start_time = time.time() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask, device=device) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.do_validation and (step + 1) % validation_steps == 0: logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_dataloader), time.time() - start_time, tr_loss / nb_tr_steps)) model.eval() y_true = [] y_pred = [] label_map = { i: label for i, label in enumerate(label_list, 1) } label_map[0] = '[PAD]' for batch in tqdm(dev_dataloader, desc='Validation'): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, None, valid_ids, l_mask, device=device) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() # input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map) - 1: y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) y_true_copy = [[ x if x in allowed_tags else 'O' for x in y ] for y in y_true] y_pred_copy = [[ x if x in allowed_tags else 'O' for x in y ] for y in y_pred] report = classification_report(y_true_copy, y_pred_copy, digits=6) # report_dict = classification_report(y_true_copy, # y_pred_copy, # output_dict=True) # report_dict = report logger.info("***** Validation results *****") logger.info("\n%s", report) fscore = float([ line.strip().split()[4] for line in report.split('\n') if line.strip().startswith('micro') ][0]) if fscore > best_dev: logger.info(f'!!!Best dev: {fscore}') logger.info(f'at epoch: {epoch}') best_dev = fscore model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) label_map = { i: label for i, label in enumerate(label_list, 1) } label_map[0] = '[PAD]' model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open( os.path.join(args.output_dir, "model_config.json"), "w")) model.train() model = Ner.from_pretrained(args.output_dir) do_lower_case = 'uncased' in args.bert_model tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): args.test_file = os.path.join( args.data_dir, 'test.json') if args.test_file == '' else args.test_file eval_examples = processor.get_test_examples(args.test_file) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) 
logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] tag_scores = [] label_map = {i: label for i, label in enumerate(label_list, 1)} label_map[0] = '[PAD]' for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask, device=device) scores = np.max(F.softmax(logits, dim=-1).cpu().numpy(), axis=-1) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() # scores = scores.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] temp_3 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map) - 1: y_true.append(temp_1) y_pred.append(temp_2) tag_scores.append(temp_3) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) temp_3.append(scores[i][j]) y_true_copy = [[x if x in allowed_tags else 'O' for x in y] for y in y_true] y_pred_copy = [[x if x in allowed_tags else 'O' for x in y] for y in y_pred] # report = classification_report(y_true_copy, # y_pred_copy, digits=4) report = 'all scores are 0!\n' logger.info("\n%s", report) output_eval_file = os.path.join( args.output_dir, f"{args.test_file.split('/')[-1]}_eval_results.txt") output_preds_file = os.path.join( args.output_dir, f"{args.test_file.split('/')[-1]}_predictions.tsv") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report) prediction_results = { 'id': [ex.guid for ex in eval_examples], 'token': [ex.text_a for ex in eval_examples], 'tag_label': [' '.join(ex.label) for ex in eval_examples], 'tag_pred': [' '.join(pred) for pred in y_pred], 'scores': [' '.join([str(x) for x in score]) for score in tag_scores] } pd.DataFrame(prediction_results).to_csv(output_preds_file, sep='\t', index=False)
def main(args): assert args.use_one_optim is True if args.use_cls_only: args.no_dial = True print("### use_cls_only: {:}".format(args.use_cls_only)) print("### no_dial: {:}".format(args.no_dial)) if args.recover_e > 0: raise NotImplementedError("This option is from my oldest code version. " "I have not checked it for this code version.") if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) print("### mkdir {:}".format(args.save_dir)) def worker_init_fn(worker_id): np.random.seed(args.random_seed + worker_id) n_gpu = 0 if torch.cuda.is_available() and (not args.use_cpu): n_gpu = torch.cuda.device_count() device = torch.device('cuda') print("### Device: {:}".format(device)) else: print("### Use CPU (Debugging)") device = torch.device("cpu") if args.random_seed < 0: print("### Pick a random seed") args.random_seed = random.sample(list(range(0, 100000)), 1)[0] print("### Random Seed: {:}".format(args.random_seed)) np.random.seed(args.random_seed) random.seed(args.random_seed) rng = random.Random(args.random_seed) torch.manual_seed(args.random_seed) if n_gpu > 0: if args.random_seed >= 0: torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) ontology = json.load(open(args.ontology_data)) slot_meta, ontology = make_slot_meta(ontology) op2id = OP_SET[args.op_code] print(op2id) tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) train_path = os.path.join(args.data_root, "train.pt") dev_path = os.path.join(args.data_root, "dev.pt") test_path = os.path.join(args.data_root, "test.pt") if not os.path.exists(test_path): test_data_raw = prepare_dataset(data_path=args.test_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) torch.save(test_data_raw, test_path) else: test_data_raw = torch.load(test_path) print("# test examples %d" % len(test_data_raw)) if not os.path.exists(train_path): train_data_raw = prepare_dataset(data_path=args.train_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) torch.save(train_data_raw, train_path) else: train_data_raw = torch.load(train_path) train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta, args.max_seq_length, rng, ontology, args.word_dropout, args.shuffle_state, args.shuffle_p, pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0], slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0], decoder_teacher_forcing=args.decoder_teacher_forcing, use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only) print("# train examples %d" % len(train_data_raw)) if not os.path.exists(dev_path): dev_data_raw = prepare_dataset(data_path=args.dev_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) torch.save(dev_data_raw, dev_path) else: dev_data_raw = torch.load(dev_path) print("# dev examples %d" % len(dev_data_raw)) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = args.dropout model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob model_config.hidden_dropout_prob = args.hidden_dropout_prob type_vocab_size = 4 dec_config = args model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id), 
op2id['update'], tokenizer.convert_tokens_to_ids(['[MASK]'])[0], tokenizer.convert_tokens_to_ids(['[SEP]'])[0], tokenizer.convert_tokens_to_ids(['[PAD]'])[0], tokenizer.convert_tokens_to_ids(['-'])[0], type_vocab_size, args.exclude_domain) if not os.path.exists(args.bert_ckpt_path): args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets') state_dict = torch.load(args.bert_ckpt_path, map_location='cpu') _k = 'embeddings.token_type_embeddings.weight' print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format( type_vocab_size, state_dict[_k].shape[0])) state_dict[_k].resize_( type_vocab_size, state_dict[_k].shape[1]) state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :]) state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :]) model.bert.load_state_dict(state_dict) print("\n### Done Load BERT") sys.stdout.flush() # re-initialize added special tokens ([SLOT], [NULL], [EOS]) model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02) model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) # re-initialize seg-2, seg-3 model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) model.to(device) num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs) if args.use_one_optim: print("### Use One Optim") param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr) scheduler = WarmupLinearSchedule(optimizer, int(num_train_steps * args.enc_warmup), t_total=num_train_steps) else: no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] enc_param_optimizer = list(model.bert.named_parameters()) # TODO: For BERT only print('### Optim BERT: {:}'.format(len(enc_param_optimizer))) enc_optimizer_grouped_parameters = [ {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr) enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup), t_total=num_train_steps) dec_param_optimizer = list(model.named_parameters()) # TODO: For other parameters print('### Optim All: {:}'.format(len(dec_param_optimizer))) dec_param_optimizer = [p for (n, p) in dec_param_optimizer if 'bert' not in n] print('### Optim OTH: {:}'.format(len(dec_param_optimizer))) dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr) dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup), t_total=num_train_steps) if args.recover_e > 0: model_recover, enc_recover, dec_recover = load(args, str(args.recover_e)) print("### Recover Model E{:}".format(args.recover_e)) sys.stdout.flush() model.load_state_dict(model_recover) print("### Recover Optim E{:}".format(args.recover_e)) sys.stdout.flush() enc_optimizer.load_state_dict(enc_recover) 
dec_optimizer.load_state_dict(dec_recover) if n_gpu > 1: model = torch.nn.DataParallel(model) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size, collate_fn=train_data.collate_fn, num_workers=args.num_workers, worker_init_fn=worker_init_fn) loss_fnc = nn.CrossEntropyLoss() best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0} start_time = time.time() for epoch in range(args.n_epochs): batch_loss = [] model.train() for step, batch in enumerate(train_dataloader): batch = [b.to(device) if (not isinstance(b, int)) and (not isinstance(b, dict) and (not isinstance(b, list)) and (not isinstance(b, np.ndarray))) else b for b in batch] input_ids_p, segment_ids_p, input_mask_p, \ state_position_ids, op_ids, domain_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, \ masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, n_total_pred = batch domain_scores, state_scores, loss_g = model(input_ids_p, segment_ids_p, input_mask_p, state_position_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu) if n_total_pred > 0: loss_g = loss_g.sum() / n_total_pred else: loss_g = 0 loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1)) if args.only_pred_op: loss = loss_s else: loss = loss_s + loss_g if args.exclude_domain is not True: loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1)) loss = loss + loss_d batch_loss.append(loss.item()) loss.backward() if args.use_one_optim: optimizer.step() scheduler.step() else: enc_optimizer.step() enc_scheduler.step() dec_optimizer.step() dec_scheduler.step() model.zero_grad() if step % 100 == 0: try: loss_g = loss_g.item() except AttributeError: loss_g = loss_g if args.exclude_domain is not True: print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \ % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), loss_s.item(), loss_g, loss_d.item())) else: print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \ % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), loss_s.item(), loss_g)) sys.stdout.flush() batch_loss = [] if args.use_one_optim: save(args, epoch + 1, model, optimizer) else: save(args, epoch + 1, model, enc_optimizer, dec_optimizer) if ((epoch+1) % args.eval_epoch == 0) and (epoch+1 >= 8): eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code, use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu) print("### Epoch {:} Score : ".format(epoch+1), eval_res) if eval_res['joint_acc'] > best_score['joint_acc']: best_score = eval_res print("### Best Joint Acc: {:} ###".format(best_score['joint_acc'])) print('\n') if epoch+1 >= 8: # To speed up eval_res_test = model_evaluation(model, test_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code, use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu) print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)
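# save() and load() are project helpers that are not shown here. Only to illustrate the pairing
# the recover branch expects (a model state dict plus one or two optimizer state dicts), a
# hypothetical layout might be the following -- the real helpers, filenames and contents may differ:
import os
import torch

def save_sketch(args, epoch, model, enc_optimizer, dec_optimizer=None):
    torch.save(model.state_dict(), os.path.join(args.save_dir, "model_e{}.bin".format(epoch)))
    torch.save(enc_optimizer.state_dict(), os.path.join(args.save_dir, "enc_optim_e{}.bin".format(epoch)))
    if dec_optimizer is not None:  # only present in the two-optimizer setting
        torch.save(dec_optimizer.state_dict(), os.path.join(args.save_dir, "dec_optim_e{}.bin".format(epoch)))

def load_sketch(args, epoch):
    model_recover = torch.load(os.path.join(args.save_dir, "model_e{}.bin".format(epoch)), map_location='cpu')
    enc_recover = torch.load(os.path.join(args.save_dir, "enc_optim_e{}.bin".format(epoch)), map_location='cpu')
    dec_recover = torch.load(os.path.join(args.save_dir, "dec_optim_e{}.bin".format(epoch)), map_location='cpu')
    return model_recover, enc_recover, dec_recover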
def train(args, train_dataset, model, dev_dataset): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter(log_dir=args.tensorboard_dir) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproducibility (even between python 2 and 3) for epoch in train_iterator: set_seed( args, epoch + 10 ) # Added here for reproducibility (even between python 2 and 3) # logger.info(" seed = %d", torch.initial_seed()) epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'entity_a': batch[1], 'entity_b': batch[2], 'attention_mask': batch[3], 'token_type_ids': batch[4] if args.model_type in ['bert', 'xlnet'] else None, # XLM doesn't use segment_ids 'labels': batch[5] } outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results, eval_loss = evaluate(args, model, dev_dataset) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) logger.info(" global step = %d", global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, 'module') else model model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if 0 < args.max_steps < global_step: epoch_iterator.close() break if 0 < args.max_steps < global_step: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
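# set_seed(args) and set_seed(args, epoch + 10) imply a seeding helper with an optional
# per-epoch offset; it is defined elsewhere, so this is only a guess at its shape
# (assuming args.seed exists, as in the other scripts in this collection):
import random
import numpy as np
import torch

def set_seed_sketch(args, offset=0):
    seed = args.seed + offset
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)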
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch") set_seed( args) # Added here for reproducibility (even between python 2 and 3) epochnum = 0 best_val_acc = None for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") logging_steps = 0 preds = None out_label_ids = None epochnum += 1 for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids 'labels': batch[3] } outputs = model(**inputs) loss, logits = outputs[:2] if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 logging_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs['labels'].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) preds = np.argmax(preds, axis=1) tr_acc = (preds == out_label_ids).mean() # evaluate model eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, type='dev') eval_loss, eval_acc = evaluate(args, model, eval_dataset) if not best_val_acc or eval_acc > best_val_acc: if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) best_val_acc = eval_acc else: args.learning_rate /= 4.0 for g in optimizer.param_groups: g['lr'] = args.learning_rate print( 'epoch {} | train_loss {:.6f} | train_acc {:.6f} | dev_loss {:.6f} | dev_acc {:.6f}' .format(epochnum, (tr_loss - logging_loss) / logging_steps, tr_acc, eval_loss, eval_acc)) logging_loss = tr_loss return global_step, tr_loss / global_step
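# The per-update pattern these loops share -- backward, clip, optimizer step, then scheduler
# step (the order PyTorch >= 1.1 expects, so the warmup/decay schedule advances after the
# weights move) -- condensed into one helper with illustrative names:
import torch

def update_step(model, optimizer, scheduler, loss, max_grad_norm=1.0):
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()   # apply the accumulated gradients first
    scheduler.step()   # then advance the linear warmup/decay schedule by one step
    model.zero_grad()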