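# NOTE: a minimal sketch of the imports these trainers rely on. The standard-library,
# torch, tqdm, and transformers imports follow directly from the calls used below;
# the project-internal module paths (models, datasets, checkpointing, evaluation)
# are assumptions and are left commented out because the actual layout may differ.
import os
import logging
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup

# Project-internal helpers referenced below (assumed locations):
# from models import Model, BertDPT, ElectraDPT, ElectraNSPDPT
# from data import (ResponseSelectionDataset, BertPostTrainingDataset,
#                   ElectraPostTrainingDataset)
# from utils.checkpointing import CheckpointManager, load_checkpoint
# from evaluation import Evaluation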
class ResponseSelection(object):
    def __init__(self, hparams):
        self.hparams = hparams
        self._logger = logging.getLogger(__name__)

        np.random.seed(hparams.random_seed)
        torch.manual_seed(hparams.random_seed)
        torch.cuda.manual_seed_all(hparams.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    def _build_dataloader(self):
        # =============================================================================
        # SETUP DATASET, DATALOADER
        # =============================================================================
        self.train_dataset = ResponseSelectionDataset(self.hparams, split="train")
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=self.hparams.train_batch_size,
            num_workers=self.hparams.cpu_workers,
            shuffle=True,
            drop_last=True)

        print("""
        # -------------------------------------------------------------------------
        #   DATALOADER FINISHED
        # -------------------------------------------------------------------------
        """)

    def _build_model(self):
        # =============================================================================
        # MODEL : Standard, Mention Pooling, Entity Marker
        # =============================================================================
        print('\t* Building model...')
        self.model = Model(self.hparams)
        self.model = self.model.to(self.device)
        # Use Multi-GPUs
        if -1 not in self.hparams.gpu_ids and len(self.hparams.gpu_ids) > 1:
            self.model = nn.DataParallel(self.model, self.hparams.gpu_ids)

        # =============================================================================
        # CRITERION
        # =============================================================================
        self.iterations = len(self.train_dataset) // self.hparams.virtual_batch_size

        # Prepare optimizer and schedule (linear warmup and decay)
        if self.hparams.optimizer_type == "Adam":
            self.optimizer = optim.Adam(self.model.parameters(),
                                        lr=self.hparams.learning_rate)
        elif self.hparams.optimizer_type == "AdamW":
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [p for n, p in self.model.named_parameters()
                               if not any(nd in n for nd in no_decay)],
                    'weight_decay': self.hparams.weight_decay
                },
                {
                    'params': [p for n, p in self.model.named_parameters()
                               if any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0
                }
            ]
            self.optimizer = AdamW(optimizer_grouped_parameters,
                                   lr=self.hparams.learning_rate,
                                   eps=self.hparams.adam_epsilon)
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=self.hparams.warmup_steps,
                num_training_steps=self.iterations * self.hparams.num_epochs)

    def _setup_training(self):
        if self.hparams.save_dirpath == 'checkpoints/':
            self.save_dirpath = os.path.join(self.hparams.root_dir,
                                             self.hparams.save_dirpath)
        self.summary_writer = SummaryWriter(self.save_dirpath)
        self.checkpoint_manager = CheckpointManager(self.model,
                                                    self.optimizer,
                                                    self.save_dirpath,
                                                    hparams=self.hparams)

        # If loading from checkpoint, adjust start epoch and load parameters.
        if self.hparams.load_pthpath == "":
            self.start_epoch = 1
        else:
            # "path/to/checkpoint_xx.pth" -> xx
            self.start_epoch = int(self.hparams.load_pthpath.split("_")[-1][:-4])
            self.start_epoch += 1
            model_state_dict, optimizer_state_dict = load_checkpoint(
                self.hparams.load_pthpath)
            if isinstance(self.model, nn.DataParallel):
                self.model.module.load_state_dict(model_state_dict)
            else:
                self.model.load_state_dict(model_state_dict)
            self.optimizer.load_state_dict(optimizer_state_dict)
            self.previous_model_path = self.hparams.load_pthpath
            print("Loaded model from {}".format(self.hparams.load_pthpath))

        print("""
        # -------------------------------------------------------------------------
        #   Setup Training Finished
        # -------------------------------------------------------------------------
        """)

    def train(self):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self._build_dataloader()
        self._build_model()
        self._setup_training()

        # ins, del, mod check!
        # Evaluation Setup
        evaluation = Evaluation(self.hparams, model=self.model)

        start_time = datetime.now().strftime('%H:%M:%S')
        self._logger.info("Start train model at %s" % start_time)

        train_begin = datetime.utcnow()  # New
        global_iteration_step = 0
        accu_loss, accu_res_sel_loss, accu_ins_loss, accu_del_loss, accu_srch_loss = 0, 0, 0, 0, 0
        accu_cnt = 0

        for epoch in range(self.start_epoch, self.hparams.num_epochs + 1):
            self.model.train()
            tqdm_batch_iterator = tqdm(self.train_dataloader)
            accu_batch = 0

            for batch_idx, batch in enumerate(tqdm_batch_iterator):
                buffer_batch = batch.copy()
                for task_key in batch:
                    for key in buffer_batch[task_key]:
                        buffer_batch[task_key][key] = buffer_batch[task_key][key].to(self.device)

                _, losses = self.model(buffer_batch)
                res_sel_loss, ins_loss, del_loss, srch_loss = losses

                if res_sel_loss is not None:
                    res_sel_loss = self.hparams.res_sel_loss_ratio * res_sel_loss.mean()
                    accu_res_sel_loss += res_sel_loss.item()
                if ins_loss is not None:
                    ins_loss = self.hparams.ins_loss_ratio * ins_loss.mean()
                    accu_ins_loss += ins_loss.item()
                if del_loss is not None:
                    del_loss = self.hparams.del_loss_ratio * del_loss.mean()
                    accu_del_loss += del_loss.item()
                if srch_loss is not None:
                    srch_loss = self.hparams.srch_loss_ratio * srch_loss.mean()
                    accu_srch_loss += srch_loss.item()

                loss = None
                for task_tensor_loss in [res_sel_loss, ins_loss, del_loss, srch_loss]:
                    if task_tensor_loss is not None:
                        loss = loss + task_tensor_loss if loss is not None else task_tensor_loss
                loss.backward()

                accu_loss += loss.item()
                accu_cnt += 1

                # TODO: virtual batch implementation
                accu_batch += buffer_batch["res_sel"]["label"].shape[0]

                if self.hparams.virtual_batch_size == accu_batch \
                        or batch_idx == (len(self.train_dataset) // self.hparams.train_batch_size):  # last batch
                    nn.utils.clip_grad_norm_(self.model.parameters(),
                                             self.hparams.max_gradient_norm)
                    self.optimizer.step()
                    if self.hparams.optimizer_type == "AdamW":
                        self.scheduler.step()
                    self.optimizer.zero_grad()

                    accu_batch = 0
                    global_iteration_step += 1

                    description = "[{}][Epoch: {:3d}][Iter: {:6d}][Loss: {:6f}][Res_Loss: {:4f}]" \
                                  "[Ins_Loss: {:4f}][Del_Loss: {:4f}][Srch_Loss: {:4f}][lr: {:7f}]".format(
                        datetime.utcnow() - train_begin, epoch, global_iteration_step,
                        accu_loss / accu_cnt, accu_res_sel_loss / accu_cnt,
                        accu_ins_loss / accu_cnt, accu_del_loss / accu_cnt,
                        accu_srch_loss / accu_cnt,
                        self.optimizer.param_groups[0]['lr'])
                    tqdm_batch_iterator.set_description(description)

                    # tensorboard
                    if global_iteration_step % self.hparams.tensorboard_step == 0:
                        self._logger.info(description)
                        accu_loss, accu_res_sel_loss, accu_ins_loss, accu_del_loss, accu_srch_loss, accu_cnt = 0, 0, 0, 0, 0, 0

            # -------------------------------------------------------------------------
            #   ON EPOCH END (checkpointing and validation)
            # -------------------------------------------------------------------------
            self.checkpoint_manager.step(epoch)
            self.previous_model_path = os.path.join(
                self.checkpoint_manager.ckpt_dirpath, "checkpoint_%d.pth" % (epoch))
            self._logger.info(self.previous_model_path)

            torch.cuda.empty_cache()
            self._logger.info("Evaluation after %d epoch" % epoch)
            evaluation.run_evaluate(self.previous_model_path)
            torch.cuda.empty_cache()
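# Hypothetical usage sketch (not part of the original sources): it enumerates the
# hparams fields the multi-task trainer above actually reads and launches training.
# All values are illustrative placeholders, and the dataset/model/evaluation classes
# may require additional fields not listed here.
def _example_multi_task_finetune():
    from argparse import Namespace
    hparams = Namespace(
        random_seed=42, root_dir=".", save_dirpath="checkpoints/", load_pthpath="",
        gpu_ids=[0], cpu_workers=4, train_batch_size=8, virtual_batch_size=32,
        num_epochs=3, optimizer_type="AdamW", learning_rate=3e-5, weight_decay=0.01,
        adam_epsilon=1e-8, warmup_steps=1000, max_gradient_norm=5.0,
        tensorboard_step=100, res_sel_loss_ratio=1.0, ins_loss_ratio=1.0,
        del_loss_ratio=1.0, srch_loss_ratio=1.0)
    ResponseSelection(hparams).train()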
class PostTraining(object):
    def __init__(self, hparams):
        self.hparams = hparams
        self._logger = logging.getLogger(__name__)

    def _build_dataloader(self):
        # =============================================================================
        # SETUP DATASET, DATALOADER
        # =============================================================================
        self.pretrained_type = self.hparams.model_type.split("_")[0]
        training_dataset_map = {
            "bert": BertPostTrainingDataset,
            "electra": ElectraPostTrainingDataset,
            "electra-nsp": BertPostTrainingDataset
        }
        self.train_dataset = training_dataset_map[self.pretrained_type](
            self.hparams, split="train")
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=self.hparams.train_batch_size,
            num_workers=self.hparams.cpu_workers,
            shuffle=True if self.pretrained_type == "electra" else False,
            drop_last=True)

        print("""
        # -------------------------------------------------------------------------
        #   DATALOADER FINISHED
        # -------------------------------------------------------------------------
        """)

    def _build_model(self):
        # =============================================================================
        # MODEL : Standard, Mention Pooling, Entity Marker
        # =============================================================================
        print('\t* Building model...')
        training_model_map = {
            "bert": BertDPT,
            "electra": ElectraDPT,
            "electra-nsp": ElectraNSPDPT
        }
        self.model = training_model_map[self.pretrained_type](self.hparams)
        self.model = self.model.to(self.device)
        # Use Multi-GPUs
        if -1 not in self.hparams.gpu_ids and len(self.hparams.gpu_ids) > 1:
            self.model = nn.DataParallel(self.model, self.hparams.gpu_ids)

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.hparams.learning_rate)
        self.iterations = len(self.train_dataset) // self.hparams.virtual_batch_size

        print("""
        # -------------------------------------------------------------------------
        #   Building Model Finished
        # -------------------------------------------------------------------------
        """)

    def _setup_training(self):
        if self.hparams.save_dirpath == 'checkpoints/':
            self.save_dirpath = os.path.join(self.hparams.root_dir,
                                             self.hparams.save_dirpath)
        self.summary_writer = SummaryWriter(self.save_dirpath)
        self.checkpoint_manager = CheckpointManager(self.model,
                                                    self.optimizer,
                                                    self.save_dirpath,
                                                    hparams=self.hparams)

        # If loading from checkpoint, adjust start epoch and load parameters.
        if self.hparams.load_pthpath == "":
            self.start_epoch = 1
        else:
            # "path/to/checkpoint_xx.pth" -> xx
            self.start_epoch = int(self.hparams.load_pthpath.split("_")[-1][:-4])
            self.start_epoch += 1
            model_state_dict, optimizer_state_dict = load_checkpoint(
                self.hparams.load_pthpath)
            if isinstance(self.model, nn.DataParallel):
                self.model.module.load_state_dict(model_state_dict)
            else:
                self.model.load_state_dict(model_state_dict)
            self.optimizer.load_state_dict(optimizer_state_dict)
            self.previous_model_path = self.hparams.load_pthpath
            print("Loaded model from {}".format(self.hparams.load_pthpath))

        print("""
        # -------------------------------------------------------------------------
        #   Setup Training Finished
        # -------------------------------------------------------------------------
        """)

    def train(self):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self._build_dataloader()
        self._build_model()
        self._setup_training()

        start_time = datetime.now().strftime('%H:%M:%S')
        self._logger.info("Start train model at %s" % start_time)

        train_begin = datetime.utcnow()  # New
        global_iteration_step = 0
        accu_electra_loss, accu_mlm_loss, accu_nsp_loss = 0, 0, 0
        accumulate_batch, accu_count = 0, 0

        for epoch in range(self.start_epoch, self.hparams.num_epochs):
            self.model.train()
            tqdm_batch_iterator = tqdm(self.train_dataloader)

            for batch_idx, batch in enumerate(tqdm_batch_iterator):
                buffer_batch = batch.copy()
                for key in batch:
                    buffer_batch[key] = buffer_batch[key].to(self.device)

                losses = self.model(buffer_batch)
                electra_loss, mlm_loss, nsp_loss = losses

                if electra_loss is not None:
                    electra_loss = electra_loss.mean()
                    accu_electra_loss += electra_loss.item()
                if mlm_loss is not None:
                    mlm_loss = mlm_loss.mean()
                    accu_mlm_loss += mlm_loss.item()
                if nsp_loss is not None:
                    nsp_loss = nsp_loss.mean()
                    accu_nsp_loss += nsp_loss.item()

                loss = None
                for task_tensor_loss in [electra_loss, mlm_loss, nsp_loss]:
                    if task_tensor_loss is not None:
                        loss = loss + task_tensor_loss if loss is not None else task_tensor_loss
                loss.backward()

                accu_count += 1

                # TODO: virtual batch implementation
                accumulate_batch += buffer_batch["input_ids"].shape[0]

                if self.hparams.virtual_batch_size == accumulate_batch \
                        or batch_idx == (len(self.train_dataset) // self.hparams.train_batch_size):  # last batch
                    # Clip gradients before the optimizer step so the clipping takes effect.
                    nn.utils.clip_grad_norm_(self.model.parameters(),
                                             self.hparams.max_gradient_norm)
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                    global_iteration_step += 1
                    description = "[{}][Epoch: {:3d}][Iter: {:6d}][ELECTRA_Loss: {:6f}][MLM_Loss: {:6f}][NSP_Loss: {:6f}][lr: {:7f}]".format(
                        datetime.utcnow() - train_begin, epoch, global_iteration_step,
                        (accu_electra_loss / accu_count), (accu_mlm_loss / accu_count),
                        (accu_nsp_loss / accu_count),
                        self.optimizer.param_groups[0]['lr'])
                    tqdm_batch_iterator.set_description(description)

                    # tensorboard
                    if global_iteration_step % self.hparams.tensorboard_step == 0:
                        self._logger.info(description)
                        accumulate_batch, accu_count = 0, 0
                        accu_electra_loss, accu_mlm_loss, accu_nsp_loss = 0, 0, 0

                    if global_iteration_step % self.hparams.checkpoint_save_step == 0:
                        # -------------------------------------------------------------------------
                        #   ON EPOCH END (checkpointing and validation)
                        # -------------------------------------------------------------------------
                        self.checkpoint_manager.step(global_iteration_step)
                        self.previous_model_path = os.path.join(
                            self.checkpoint_manager.ckpt_dirpath,
                            "checkpoint_%d.pth" % (global_iteration_step))
                        self._logger.info(self.previous_model_path)
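# Hypothetical usage sketch (not part of the original sources): launches domain
# post-training. `model_type` is split on "_" to select the dataset/model pair
# ("bert", "electra", or "electra-nsp"); all values below are placeholders.
def _example_post_training():
    from argparse import Namespace
    hparams = Namespace(
        model_type="bert_base", root_dir=".", save_dirpath="checkpoints/",
        load_pthpath="", gpu_ids=[0], cpu_workers=4, train_batch_size=8,
        virtual_batch_size=32, num_epochs=3, learning_rate=1e-4,
        max_gradient_norm=5.0, tensorboard_step=100, checkpoint_save_step=10000)
    PostTraining(hparams).train()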
class ResponseSelection(object):
    def __init__(self, hparams):
        self.hparams = hparams
        self._logger = logging.getLogger(__name__)

    def _build_dataloader(self):
        # =============================================================================
        # SETUP DATASET, DATALOADER
        # =============================================================================
        self.train_dataset = ResponseSelectionDataset(self.hparams, split="train")
        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=self.hparams.train_batch_size,
            num_workers=self.hparams.cpu_workers,
            shuffle=True,
            drop_last=True)

        print("""
        # -------------------------------------------------------------------------
        #   DATALOADER FINISHED
        # -------------------------------------------------------------------------
        """)

    def _build_model(self):
        # =============================================================================
        # MODEL : Standard, Mention Pooling, Entity Marker
        # =============================================================================
        print('\t* Building model...')
        self.model = Model(self.hparams)
        self.model = self.model.to(self.device)
        # Use Multi-GPUs
        if -1 not in self.hparams.gpu_ids and len(self.hparams.gpu_ids) > 1:
            self.model = nn.DataParallel(self.model, self.hparams.gpu_ids)

        # =============================================================================
        # CRITERION
        # =============================================================================
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.hparams.learning_rate)
        self.iterations = len(self.train_dataset) // self.hparams.virtual_batch_size

    def _setup_training(self):
        if self.hparams.save_dirpath == 'checkpoints/':
            self.save_dirpath = os.path.join(self.hparams.root_dir,
                                             self.hparams.save_dirpath)
        self.summary_writer = SummaryWriter(self.save_dirpath)
        self.checkpoint_manager = CheckpointManager(self.model,
                                                    self.optimizer,
                                                    self.save_dirpath,
                                                    hparams=self.hparams)

        # If loading from checkpoint, adjust start epoch and load parameters.
        if self.hparams.load_pthpath == "":
            self.start_epoch = 1
        else:
            # "path/to/checkpoint_xx.pth" -> xx
            self.start_epoch = int(self.hparams.load_pthpath.split("_")[-1][:-4])
            self.start_epoch += 1
            model_state_dict, optimizer_state_dict = load_checkpoint(
                self.hparams.load_pthpath)
            if isinstance(self.model, nn.DataParallel):
                self.model.module.load_state_dict(model_state_dict)
            else:
                self.model.load_state_dict(model_state_dict)
            self.optimizer.load_state_dict(optimizer_state_dict)
            self.previous_model_path = self.hparams.load_pthpath
            print("Loaded model from {}".format(self.hparams.load_pthpath))

        print("""
        # -------------------------------------------------------------------------
        #   Setup Training Finished
        # -------------------------------------------------------------------------
        """)

    def train(self):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self._build_dataloader()
        self._build_model()
        self._setup_training()

        # Evaluation Setup
        evaluation = Evaluation(self.hparams, model=self.model, split="test")

        start_time = datetime.now().strftime('%H:%M:%S')
        self._logger.info("Start train model at %s" % start_time)

        train_begin = datetime.utcnow()  # New
        global_iteration_step = 0
        accumulate_loss = 0
        accu_count = 0

        for epoch in range(self.start_epoch, self.hparams.num_epochs):
            self.model.train()
            tqdm_batch_iterator = tqdm(self.train_dataloader)
            accumulate_batch = 0

            for batch_idx, batch in enumerate(tqdm_batch_iterator):
                buffer_batch = batch.copy()
                for key in batch:
                    buffer_batch[key] = buffer_batch[key].to(self.device)

                logits = self.model(buffer_batch)
                loss = self.criterion(logits, buffer_batch["label"])
                loss.backward()

                accumulate_loss += loss.item()
                accu_count += 1

                # TODO: virtual batch implementation
                accumulate_batch += buffer_batch["label"].shape[0]

                if self.hparams.virtual_batch_size == accumulate_batch \
                        or batch_idx == (len(self.train_dataset) // self.hparams.train_batch_size):  # last batch
                    # Clip gradients before the optimizer step so the clipping takes effect.
                    nn.utils.clip_grad_norm_(self.model.parameters(),
                                             self.hparams.max_gradient_norm)
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                    accumulate_batch = 0
                    global_iteration_step += 1

                    description = "[{}][Epoch: {:3d}][Iter: {:6d}][Loss: {:6f}][lr: {:7f}]".format(
                        datetime.utcnow() - train_begin, epoch, global_iteration_step,
                        (accumulate_loss / accu_count),
                        self.optimizer.param_groups[0]['lr'])
                    tqdm_batch_iterator.set_description(description)

                    # tensorboard
                    if global_iteration_step % self.hparams.tensorboard_step == 0:
                        self._logger.info(description)
                        accumulate_loss, accu_count = 0, 0

            # -------------------------------------------------------------------------
            #   ON EPOCH END (checkpointing and validation)
            # -------------------------------------------------------------------------
            self.checkpoint_manager.step(epoch)
            self.previous_model_path = os.path.join(
                self.checkpoint_manager.ckpt_dirpath, "checkpoint_%d.pth" % (epoch))
            self._logger.info(self.previous_model_path)

            torch.cuda.empty_cache()
            self._logger.info("Evaluation after %d epoch" % epoch)
            evaluation.run_evaluate(self.previous_model_path)
            torch.cuda.empty_cache()
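# Standalone sketch of the "virtual batch" (gradient accumulation) pattern used by
# all three training loops above: gradients from several small batches accumulate
# until `virtual_batch_size` examples have been seen, then the optimizer takes a
# single clipped step. Toy model and data only; names here are illustrative and not
# part of the original repository.
def _virtual_batch_demo(virtual_batch_size=32, train_batch_size=8, max_norm=5.0):
    from torch.utils.data import TensorDataset

    model = nn.Linear(16, 1)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCEWithLogitsLoss()

    data = torch.randn(256, 16)
    labels = torch.randint(0, 2, (256, 1)).float()
    loader = DataLoader(TensorDataset(data, labels),
                        batch_size=train_batch_size, shuffle=True, drop_last=True)

    accumulated = 0
    for batch_idx, (x, y) in enumerate(loader):
        loss = criterion(model(x), y)
        loss.backward()                      # gradients accumulate across batches
        accumulated += x.shape[0]

        is_last = batch_idx == len(loader) - 1
        if accumulated >= virtual_batch_size or is_last:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm)  # clip before step
            optimizer.step()
            optimizer.zero_grad()
            accumulated = 0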