def train(args, train_dataset, t_model, s_model, order, d_criterion, tokenizer):
    """ Train the student model with distillation against the teacher """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    param_optimizer = list(s_model.named_parameters()) + list(d_criterion.named_parameters())

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        # Keep the amp-wrapped model in s_model so the loop below trains the wrapped model
        s_model, optimizer = amp.initialize(s_model, optimizer, opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    average_loss = 0.0
    train_avg_loss = 0.0
    soft_avg_loss = 0.0
    distill_avg_loss = 0.0
    s_model.zero_grad()
    train_iterator = range(int(args.num_train_epochs))
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        for step, batch in enumerate(train_dataloader):
            s_model.train()
            t_model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, attention_mask, token_type_ids, labels = batch[0], batch[1], batch[2], batch[3]
            train_loss, soft_loss, distill_loss = d_criterion(t_model=t_model,
                                                              s_model=s_model,
                                                              order=order,
                                                              input_ids=input_ids,
                                                              token_type_ids=token_type_ids,
                                                              attention_mask=attention_mask,
                                                              labels=labels,
                                                              args=args)
            loss = args.alpha * train_loss + (1 - args.alpha) * soft_loss + args.beta * distill_loss
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
                train_loss = train_loss.mean()
                soft_loss = soft_loss.mean()
                distill_loss = distill_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
                train_loss = train_loss / args.gradient_accumulation_steps
                soft_loss = soft_loss / args.gradient_accumulation_steps
                distill_loss = distill_loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(s_model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            average_loss += loss.item()
            train_avg_loss += train_loss.item()
            soft_avg_loss += soft_loss.item()
            distill_avg_loss += distill_loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                if args.schedule:
                    scheduler.step()  # Update learning rate schedule
                optimizer.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, s_model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('total loss', average_loss / args.logging_steps, global_step)
                    tb_writer.add_scalar('train loss', train_avg_loss / args.logging_steps, global_step)
                    tb_writer.add_scalar('soft loss', soft_avg_loss / args.logging_steps, global_step)
                    tb_writer.add_scalar('distill loss', distill_avg_loss / args.logging_steps, global_step)
                    average_loss = 0.0
                    train_avg_loss = 0.0
                    soft_avg_loss = 0.0
                    distill_avg_loss = 0.0

        # Save model checkpoint once per epoch
        output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(epoch + 1))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = s_model.module if hasattr(s_model, 'module') else s_model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        logger.info("Saving model checkpoint to %s", output_dir)

    return global_step, tr_loss / global_step
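# --- Illustrative sketch (an assumption, not the original d_criterion): one
# plausible shape for the three losses combined above as
# loss = alpha * train_loss + (1 - alpha) * soft_loss + beta * distill_loss.
# The temperature T and the hidden-state MSE term are hypothetical choices.
import torch.nn.functional as F

def distillation_losses(s_logits, t_logits, labels, s_hidden, t_hidden, T=2.0):
    # Hard-label cross entropy on the student's own predictions
    train_loss = F.cross_entropy(s_logits, labels)
    # Soft-label KL divergence against the teacher, scaled by T^2 as usual
    soft_loss = F.kl_div(F.log_softmax(s_logits / T, dim=-1),
                         F.softmax(t_logits / T, dim=-1),
                         reduction='batchmean') * (T * T)
    # Intermediate-representation matching (one common choice of distill loss)
    distill_loss = F.mse_loss(s_hidden, t_hidden)
    return train_loss, soft_loss, distill_loss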
class TransformerBase(TrainableModel):
    """ Transformers base model (for working with pytorch-transformers models) """
    MODEL_CONFIGURATIONS = {
        'bert': (BertConfig, BertTokenizer),
        'quant_bert': (QuantizedBertConfig, BertTokenizer),
        'xlnet': (XLNetConfig, XLNetTokenizer),
        'xlm': (XLMConfig, XLMTokenizer),
    }

    def __init__(self,
                 model_type: str,
                 model_name_or_path: str,
                 labels: List[str] = None,
                 num_labels: int = None,
                 config_name=None,
                 tokenizer_name=None,
                 do_lower_case=False,
                 output_path=None,
                 device='cpu',
                 n_gpus=0):
        """
        Transformers base model (for working with pytorch-transformers models)

        Args:
            model_type (str): transformer model type
            model_name_or_path (str): model name or path to model
            labels (List[str], optional): list of labels. Defaults to None.
            num_labels (int, optional): number of labels. Defaults to None.
            config_name (str, optional): configuration name. Defaults to None.
            tokenizer_name (str, optional): tokenizer name. Defaults to None.
            do_lower_case (bool, optional): lower case input words. Defaults to False.
            output_path (str, optional): model output path. Defaults to None.
            device (str, optional): backend device. Defaults to 'cpu'.
            n_gpus (int, optional): number of gpus. Defaults to 0.

        Raises:
            FileNotFoundError: if output_path is given but does not exist
        """
        assert model_type in self.MODEL_CONFIGURATIONS, "unsupported model_type"
        self.model_type = model_type
        self.model_name_or_path = model_name_or_path
        self.labels = labels
        self.num_labels = num_labels
        self.do_lower_case = do_lower_case
        if output_path is not None and not os.path.exists(output_path):
            raise FileNotFoundError('output_path is not found')
        self.output_path = output_path
        self.model_class = None
        config_class, tokenizer_class = self.MODEL_CONFIGURATIONS[model_type]
        self.config_class = config_class
        self.tokenizer_class = tokenizer_class
        self.tokenizer_name = tokenizer_name
        self.tokenizer = self._load_tokenizer(self.tokenizer_name)
        self.config_name = config_name
        self.config = self._load_config(config_name)
        self.model = None
        self.device = device
        self.n_gpus = n_gpus
        self._optimizer = None
        self._scheduler = None

    def to(self, device='cpu', n_gpus=0):
        if self.model is not None:
            self.model.to(device)
            if n_gpus > 1:
                self.model = torch.nn.DataParallel(self.model)
        self.device = device
        self.n_gpus = n_gpus

    @property
    def optimizer(self):
        return self._optimizer

    @optimizer.setter
    def optimizer(self, opt):
        self._optimizer = opt

    @property
    def scheduler(self):
        return self._scheduler

    @scheduler.setter
    def scheduler(self, sch):
        self._scheduler = sch

    def setup_default_optimizer(self,
                                weight_decay: float = 0.0,
                                learning_rate: float = 5e-5,
                                adam_epsilon: float = 1e-8,
                                warmup_steps: int = 0,
                                total_steps: int = 0):
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in self.model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
                                              warmup_steps=warmup_steps,
                                              t_total=total_steps)

    def _load_config(self, config_name=None):
        config = self.config_class.from_pretrained(
            config_name if config_name else self.model_name_or_path,
            num_labels=self.num_labels)
        return config

    def _load_tokenizer(self, tokenizer_name=None):
        tokenizer = self.tokenizer_class.from_pretrained(
            tokenizer_name if tokenizer_name else self.model_name_or_path,
            do_lower_case=self.do_lower_case)
        return tokenizer

    def save_model(self, output_dir: str, save_checkpoint: bool = False, args=None):
        """
        Save model/tokenizer/arguments to given output directory

        Args:
            output_dir (str): path to output directory
            save_checkpoint (bool, optional): save as checkpoint. Defaults to False.
            args (optional): arguments object to save. Defaults to None.
        """
        # Create output directory if needed
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        logger.info("Saving model checkpoint to %s", output_dir)
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(output_dir)
        if not save_checkpoint:
            if self.tokenizer is not None:
                self.tokenizer.save_pretrained(output_dir)
            with io.open(output_dir + os.sep + 'labels.txt', 'w', encoding='utf-8') as fw:
                for l in self.labels:
                    fw.write('{}\n'.format(l))
            if args is not None:
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))

    @classmethod
    def load_model(cls, model_path: str, model_type: str):
        """
        Create a TransformerBase model from a given path

        Args:
            model_path (str): path to model
            model_type (str): model type

        Returns:
            TransformerBase: model
        """
        # Load a trained model and vocabulary from given path
        if not os.path.exists(model_path):
            raise FileNotFoundError
        with io.open(model_path + os.sep + 'labels.txt') as fp:
            labels = [l.strip() for l in fp.readlines()]
        return cls(model_type=model_type, model_name_or_path=model_path, labels=labels)

    @staticmethod
    def get_train_steps_epochs(max_steps: int,
                               num_train_epochs: int,
                               gradient_accumulation_steps: int,
                               num_samples: int):
        """
        Get total train steps and epochs

        Args:
            max_steps (int): max steps
            num_train_epochs (int): num epochs
            gradient_accumulation_steps (int): gradient accumulation steps
            num_samples (int): number of samples

        Returns:
            Tuple: total steps, number of epochs
        """
        if max_steps > 0:
            t_total = max_steps
            num_train_epochs = max_steps // (num_samples // gradient_accumulation_steps) + 1
        else:
            t_total = num_samples // gradient_accumulation_steps * num_train_epochs
        return t_total, num_train_epochs

    def get_logits(self, batch):
        self.model.eval()
        inputs = self._batch_mapper(batch)
        outputs = self.model(**inputs)
        return outputs[-1]

    def _train(self,
               data_set: DataLoader,
               dev_data_set: Union[DataLoader, List[DataLoader]] = None,
               test_data_set: Union[DataLoader, List[DataLoader]] = None,
               gradient_accumulation_steps: int = 1,
               per_gpu_train_batch_size: int = 8,
               max_steps: int = -1,
               num_train_epochs: int = 3,
               max_grad_norm: float = 1.0,
               logging_steps: int = 50,
               save_steps: int = 100):
        """Run model training.

        Batches are mapped into the parameters that the model expects in its
        forward method by self._batch_mapper; override it for custom heads and
        models.
        """
        t_total, num_train_epochs = self.get_train_steps_epochs(
            max_steps, num_train_epochs, gradient_accumulation_steps, len(data_set))
        if self.optimizer is None and self.scheduler is None:
            logger.info("Loading default optimizer and scheduler")
            self.setup_default_optimizer(total_steps=t_total)
        train_batch_size = per_gpu_train_batch_size * max(1, self.n_gpus)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(data_set.dataset))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per GPU/CPU = %d", per_gpu_train_batch_size)
        logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                    train_batch_size * gradient_accumulation_steps)
        logger.info("  Gradient Accumulation steps = %d", gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        self.model.zero_grad()
        train_iterator = trange(num_train_epochs, desc="Epoch")
        for _ in train_iterator:
            epoch_iterator = tqdm(data_set, desc="Train iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)
                inputs = self._batch_mapper(batch)
                outputs = self.model(**inputs)
                loss = outputs[0]  # get loss
                if self.n_gpus > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
                tr_loss += loss.item()
                if (step + 1) % gradient_accumulation_steps == 0:
                    self.optimizer.step()
                    self.scheduler.step()
                    self.model.zero_grad()
                    global_step += 1
                    if logging_steps > 0 and global_step % logging_steps == 0:
                        # Log metrics and run evaluation on dev/test
                        for ds in [dev_data_set, test_data_set]:
                            if ds is None:  # got no data loader
                                continue
                            if isinstance(ds, DataLoader):
                                ds = [ds]
                            for d in ds:
                                logits, label_ids = self._evaluate(d)
                                self.evaluate_predictions(logits, label_ids)
                        logger.info('lr = {}'.format(self.scheduler.get_lr()[0]))
                        logger.info('loss = {}'.format((tr_loss - logging_loss) / logging_steps))
                        logging_loss = tr_loss
                    if save_steps > 0 and global_step % save_steps == 0:
                        # Save model checkpoint
                        self.save_model_checkpoint(output_path=self.output_path,
                                                   name='checkpoint-{}'.format(global_step))
                if 0 < max_steps < global_step:
                    epoch_iterator.close()
                    break
            if 0 < max_steps < global_step:
                train_iterator.close()
                break
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    def _evaluate(self, data_set: DataLoader):
        logger.info("***** Running inference *****")
        logger.info("  Batch size: {}".format(data_set.batch_size))
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(data_set, desc="Inference iteration"):
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = self._batch_mapper(batch)
                outputs = self.model(**inputs)
                if 'labels' in inputs:
                    tmp_eval_loss, logits = outputs[:2]
                    eval_loss += tmp_eval_loss.mean().item()
                else:
                    logits = outputs[0]
            nb_eval_steps += 1
            model_output = logits.detach().cpu()
            model_out_label_ids = inputs['labels'].detach().cpu() if 'labels' in inputs else None
            if preds is None:
                preds = model_output
                out_label_ids = model_out_label_ids
            else:
                preds = torch.cat((preds, model_output), dim=0)
                out_label_ids = torch.cat((out_label_ids, model_out_label_ids),
                                          dim=0) if out_label_ids is not None else None
        if out_label_ids is None:
            return preds
        return preds, out_label_ids

    def _batch_mapper(self, batch):
        mapping = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            # XLM doesn't use segment_ids
            'token_type_ids': batch[2] if self.model_type in ['bert', 'quant_bert', 'xlnet'] else None
        }
        if len(batch) == 4:
            mapping.update({'labels': batch[3]})
        return mapping

    def evaluate_predictions(self, logits, label_ids):
        raise NotImplementedError(
            'evaluate_predictions method must be implemented in order to '
            'be used for dev/test set evaluation')

    def save_model_checkpoint(self, output_path: str, name: str):
        """
        Save model checkpoint

        Args:
            output_path (str): output path
            name (str): name of checkpoint
        """
        output_dir_path = os.path.join(output_path, name)
        self.save_model(output_dir_path, save_checkpoint=True)
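# --- Illustrative sketch (an assumption, not part of the library above): a
# minimal concrete subclass. evaluate_predictions is abstract, so any real
# subclass must implement it; BertForSequenceClassification and the accuracy
# metric here are hypothetical choices.
class SequenceClassifier(TransformerBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = BertForSequenceClassification.from_pretrained(
            self.model_name_or_path, config=self.config)
        self.to(self.device, self.n_gpus)

    def evaluate_predictions(self, logits, label_ids):
        preds = logits.argmax(dim=-1)
        accuracy = (preds == label_ids).float().mean().item()
        logger.info('accuracy = %f', accuracy)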
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                      'labels': batch[3],
                      'weights': batch[4]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
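# --- Illustrative sketch (an assumption based on the documented behavior of
# pytorch-transformers' WarmupLinearSchedule): the LR multiplier the scheduler
# above applies at each optimizer update -- a linear ramp over warmup_steps,
# then a linear decay to zero at t_total.
def warmup_linear_lambda(step, warmup_steps, t_total):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))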
def train(args, train_dataset, model, tokenizer):
    """ Train the model. """
    tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError('Please install apex from https://www.github.com/nvidia/apex to use fp16 training.')
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info('***** Running training *****')
    logger.info('  Num examples = %d', len(train_dataset))
    logger.info('  Num Epochs = %d', args.num_train_epochs)
    logger.info('  Instantaneous batch size per GPU = %d', args.per_gpu_train_batch_size)
    logger.info('  Total train batch size (w. parallel & accumulation) = %d',
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info('  Gradient Accumulation steps = %d', args.gradient_accumulation_steps)
    logger.info('  Total optimization steps = %d', t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc='Epoch')
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc='Iteration')
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3],
                'ct_clf_input_ids': batch[4],
                'ct_clf_attention_mask': batch[5],
                'ct_clf_token_type_ids': batch[6],
                'categories': batch[7],
                'hand_features': batch[8]
            }
            outputs = model(**inputs)
            loss, clf_loss = outputs[0][0], outputs[1][0]  # model outputs are always tuple in pytorch_transformers (see doc)
            total_loss = loss + clf_loss
            if args.n_gpu > 1:
                total_loss = total_loss.mean()
            if args.gradient_accumulation_steps > 1:
                total_loss = total_loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(total_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += total_loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:
                        result = evaluate(args, model, tokenizer)
                        for key, value in result.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info('Saving model checkpoint to %s', output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
    return global_step, tr_loss / global_step
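# --- Illustrative helper (an assumption, not from the original code): the
# arithmetic the accumulation logic above relies on. Dividing the loss by
# gradient_accumulation_steps keeps gradient magnitudes comparable to a single
# large batch of size per_gpu_bs * n_gpu * accum_steps.
def effective_batch_and_updates(per_gpu_bs, n_gpu, accum_steps, num_batches, epochs):
    effective_batch_size = per_gpu_bs * max(1, n_gpu) * accum_steps
    total_optimizer_updates = num_batches // accum_steps * epochs
    return effective_batch_size, total_optimizer_updates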
def train(self):
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    # logger.info(f'Fold {split_index + 1}')
    train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()
    num_train_optimization_steps = self.train_steps

    # Prepare model
    config = BertConfig.from_pretrained(self.model_name_or_path)
    model = BertForTokenClassification.from_pretrained(self.model_name_or_path, self.args, config=config)
    model.to(self.device)
    model.train()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", self.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    best_acc = 0
    best_MRR = 0
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    train_dataloader = cycle(train_dataloader)

    for step in range(num_train_optimization_steps):
        batch = next(train_dataloader)
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_domainslot, label_domain, label_dependcy = batch
        loss_domainslot, loss_domain, loss_dependcy = model(
            input_ids=input_ids,
            token_type_ids=segment_ids,
            attention_mask=input_mask,
            label_domainslot=label_domainslot,
            label_domain=label_domain,
            label_dependcy=label_dependcy)
        loss = loss_domainslot + loss_domain + loss_dependcy
        tr_loss += loss.item()
        train_loss = round(tr_loss / (nb_tr_steps + 1), 4)
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

        loss.backward()
        if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            global_step += 1

        if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

        if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            for file in ['dev.csv']:
                inference_labels = []
                gold_labels_domainslot = []
                gold_labels_domain = []
                gold_labels_dependcy = []
                inference_logits = []
                scores_domainslot = []
                scores_domain = []
                scores_dependcy = []
                dialogueID = [x.guid for x in eval_examples]
                utterance_text = [x.text_eachturn for x in eval_examples]
                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", self.eval_batch_size)
                model.eval()
                eval_loss_domainslot, eval_loss_domain, eval_loss_dependcy = 0, 0, 0
                eval_accuracy_domainslot, eval_accuracy_domain, eval_accuracy_dependcy = 0, 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_domainslot, label_domain, label_dependcy in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_domainslot = label_domainslot.to(self.device)
                    label_domain = label_domain.to(self.device)
                    label_dependcy = label_dependcy.to(self.device)

                    with torch.no_grad():
                        batch_eval_loss_domainslot, batch_eval_loss_domain, batch_eval_loss_dependcy = model(
                            input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            label_domainslot=label_domainslot,
                            label_domain=label_domain,
                            label_dependcy=label_dependcy)
                        logits_domainslot, logits_domain, logits_dependcy = model(
                            input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask)
                    logits_domainslot = torch.sigmoid(logits_domainslot)
                    logits_domainslot = (logits_domainslot > 0.4).float()
                    logits_domainslot = logits_domainslot.cpu().long().numpy()
                    logits_domain = logits_domain.view(-1, self.num_labels_domain).detach().cpu().numpy()
                    logits_dependcy = logits_dependcy.view(-1, self.num_labels_dependcy).detach().cpu().numpy()
                    label_domainslot = label_domainslot.to('cpu').numpy()
                    label_domain = label_domain.view(-1).to('cpu').numpy()
                    label_dependcy = label_dependcy.view(-1).to('cpu').numpy()

                    scores_domainslot.append(logits_domainslot)
                    scores_domain.append(logits_domain)
                    scores_dependcy.append(logits_dependcy)
                    gold_labels_domainslot.append(label_domainslot)
                    gold_labels_domain.append(label_domain)
                    gold_labels_dependcy.append(label_dependcy)

                    eval_loss_domainslot += batch_eval_loss_domainslot.mean().item()
                    eval_loss_domain += batch_eval_loss_domain.mean().item()
                    eval_loss_dependcy += batch_eval_loss_dependcy.mean().item()
                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                gold_labels_domainslot = np.concatenate(gold_labels_domainslot, 0)
                gold_labels_domain = np.concatenate(gold_labels_domain, 0)
                gold_labels_dependcy = np.concatenate(gold_labels_dependcy, 0)
                scores_domainslot = np.concatenate(scores_domainslot, 0)
                scores_domain = np.concatenate(scores_domain, 0)
                scores_dependcy = np.concatenate(scores_dependcy, 0)
                model.train()
                eval_loss_domainslot = eval_loss_domainslot / nb_eval_steps
                eval_loss_domain = eval_loss_domain / nb_eval_steps
                eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps

                eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain, mode='domain')
                eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy, mode='dependcy')
                eval_jointGoal_domainslot = compute_jointGoal_domainslot(
                    dialogueID,
                    utterance_text,
                    scores_domainslot,
                    gold_labels_domainslot,
                    scores_domain,
                    gold_labels_domain,
                    scores_dependcy,
                    gold_labels_dependcy)

                print('eval_jointGoal_domainslot', eval_jointGoal_domainslot,
                      'eval_F1_domain', eval_accuracy_domain,
                      'eval_F1_dependcy', eval_accuracy_dependcy,
                      'global_step', global_step,
                      'loss', train_loss)
                result = {
                    'eval_jointGoal_domainslot': eval_jointGoal_domainslot,
                    'eval_loss_domainslot': eval_loss_domainslot,
                    'eval_loss_domain': eval_loss_domain,
                    'eval_loss_dependcy': eval_loss_dependcy,
                    'eval_F1_domain': eval_accuracy_domain,
                    'eval_F1_dependcy': eval_accuracy_dependcy,
                    'global_step': global_step,
                    'loss': train_loss}

                output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')

                if eval_accuracy_domain > best_acc:
                    print("=" * 80)
                    print("Best F1", eval_accuracy_domain)
                    print("Saving Model......")
                    best_acc = eval_accuracy_domain
                    # Save a trained model
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)
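# --- Illustrative sketch (an assumption): the cycle() used above to turn the
# DataLoader into an endless stream of batches. itertools.cycle would cache a
# whole epoch of tensors in memory, so a re-iterating generator is the usual
# choice for step-count-driven loops like the one above.
def cycle(dataloader):
    while True:
        for batch in dataloader:
            yield batch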
def train(args, train_dataset, model, tokenizer):
    """Train the model"""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps  # t_total: total number of optimization steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay);
    # parameters whose names match an entry in no_decay get no weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    # n: name, p: parameter, nd: no_decay
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    # tr_loss: train loss
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility (even between python 2 and 3)
    set_seed(args)
    best_f1 = 0
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            # a tuple here: the batch is fixed once moved to the device
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            # model outputs are always tuple in pytorch-transformers (see doc)
            outputs = model(**inputs)
            loss = outputs[0]
            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel (not distributed) training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.eval_steps > 0 and global_step % args.eval_steps == 0:
                    # Log metrics
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        if results['F1'] > best_f1:
                            best_f1 = results['F1']
                            best_save_dir = os.path.join(args.output_dir, 'best')
                            if not os.path.exists(best_save_dir):
                                os.makedirs(best_save_dir)
                            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                            model_to_save.save_pretrained(best_save_dir)
                            tokenizer.save_pretrained(best_save_dir)
                            torch.save(args, os.path.join(best_save_dir, 'training_args.bin'))
                            logger.info(f"Saving model checkpoint to {best_save_dir} with best F1: {best_f1}")
                        for key, value in results.items():
                            tb_writer.add_scalar(f'{key}/train', value, global_step)
                            logger.info(f'{key}/train: {value:.4}')
                    tb_writer.add_scalar('LR/train', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('Loss/train', (tr_loss - logging_loss) / args.eval_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    ckpt_save_dir = os.path.join(args.output_dir, f'checkpoint-{global_step}')
                    if not os.path.exists(ckpt_save_dir):
                        os.makedirs(ckpt_save_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(ckpt_save_dir)
                    torch.save(args, os.path.join(ckpt_save_dir, 'training_args.bin'))
                    logger.info(f"Saving model checkpoint to {ckpt_save_dir}")

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
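# --- Illustrative sketch (an assumption): a typical set_seed() matching the
# calls in the training functions above; args.seed and args.n_gpu follow this
# file's usage.
import random
import numpy as np

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)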
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--eval_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--model_name_or_path", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str)

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=256, type=int)
    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=False, type=boolean_string)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--learning_rate", default=3e-5, type=float)
    parser.add_argument("--num_train_epochs", default=10, type=float)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--use_weight", default=1, type=int)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=2019)
    parser.add_argument("--fp16", default=False)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--logging_steps", default=500, type=int)
    parser.add_argument("--clean", default=False, type=boolean_string, help="clean the output dir")
    parser.add_argument("--need_birnn", default=False, type=boolean_string)
    parser.add_argument("--rnn_dim", default=128, type=int)

    args = parser.parse_args()

    device = torch.device("cuda")
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_
    args.device = device
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger.info(f"device: {device} n_gpu: {n_gpu}")

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # now_time = datetime.datetime.now().strftime('%Y-%m-%d_%H')
    # tmp_dir = args.output_dir + '/' + str(now_time) + '_ernie'
    # if not os.path.exists(tmp_dir):
    #     os.makedirs(tmp_dir)
    # args.output_dir = tmp_dir

    if args.clean and args.do_train:
        # Clean the output directory
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print(c_path)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of output dir and data.conf')
                exit(-1)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if not os.path.exists(os.path.join(args.output_dir, "eval")):
        os.makedirs(os.path.join(args.output_dir, "eval"))

    writer = SummaryWriter(logdir=os.path.join(args.output_dir, "eval"), comment="Linear")

    processor = NerProcessor()
    label_list = processor.get_labels(args)
    num_labels = len(label_list)
    args.label_list = label_list

    if os.path.exists(os.path.join(args.output_dir, "label2id.pkl")):
        with open(os.path.join(args.output_dir, "label2id.pkl"), "rb") as f:
            label2id = pickle.load(f)
    else:
        label2id = {l: i for i, l in enumerate(label_list)}
        with open(os.path.join(args.output_dir, "label2id.pkl"), "wb") as f:
            pickle.dump(label2id, f)
    id2label = {value: key for key, value in label2id.items()}

    # Prepare optimizer and schedule (linear warmup and decay)
    if args.do_train:
        tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config = BertConfig.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels)
        model = BERT_BiLSTM_CRF.from_pretrained(args.model_name_or_path,
                                                config=config,
                                                need_birnn=args.need_birnn,
                                                rnn_dim=args.rnn_dim)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        train_examples, train_features, train_data = get_Dataset(args, processor, tokenizer, mode="train")
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        if args.do_eval:
            eval_examples, eval_features, eval_data = get_Dataset(args, processor, tokenizer, mode="eval")

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Total optimization steps = %d", t_total)

        model.train()
        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        best_f1 = 0.0
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids, label_ids, segment_ids, input_mask)
                loss = outputs
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        tr_loss_avg = (tr_loss - logging_loss) / args.logging_steps
                        writer.add_scalar("Train/loss", tr_loss_avg, global_step)
                        logging_loss = tr_loss

            if args.do_eval:
                all_ori_tokens_eval = [f.ori_tokens for f in eval_features]
                overall, by_type = evaluate(args, eval_data, model, id2label, all_ori_tokens_eval)

                # Add eval result to tensorboard
                f1_score = overall.fscore
                writer.add_scalar("Eval/precision", overall.prec, ep)
                writer.add_scalar("Eval/recall", overall.rec, ep)
                writer.add_scalar("Eval/f1_score", overall.fscore, ep)

                # Save the best performing model
                if f1_score > best_f1:
                    logger.info(f"----------the best f1 is {f1_score}---------")
                    best_f1 = f1_score
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)
                    # Good practice: save your training arguments together with the trained model
                    torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

            # logger.info(f'epoch {ep}, train loss: {tr_loss}')

        # writer.add_graph(model)
        writer.close()

    if args.do_test:
        # model = BertForTokenClassification.from_pretrained(args.output_dir)
        # model.to(device)
        label_map = {i: label for i, label in enumerate(label_list)}
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        args = torch.load(os.path.join(args.output_dir, 'training_args.bin'))
        model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir,
                                                need_birnn=args.need_birnn,
                                                rnn_dim=args.rnn_dim)
        model.to(device)

        test_examples, test_features, test_data = get_Dataset(args, processor, tokenizer, mode="test")
        logger.info("***** Running test *****")
        logger.info(f"  Num examples = {len(test_examples)}")
        logger.info(f"  Batch size = {args.eval_batch_size}")

        all_ori_tokens = [f.ori_tokens for f in test_features]
        all_ori_labels = [e.label.split(" ") for e in test_examples]
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)
        model.eval()

        pred_labels = []
        for b_i, (input_ids, input_mask, segment_ids, label_ids) in enumerate(
                tqdm(test_dataloader, desc="Predicting")):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model.predict(input_ids, segment_ids, input_mask)
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            # logits = logits.detach().cpu().numpy()
            for l in logits:
                pred_label = []
                for idx in l:
                    pred_label.append(id2label[idx])
                pred_labels.append(pred_label)

        assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels)
        print(len(pred_labels))
        with open(os.path.join(args.output_dir, "token_labels_.txt"), "w", encoding="utf-8") as f:
            for ori_tokens, ori_labels, prel in zip(all_ori_tokens, all_ori_labels, pred_labels):
                for ot, ol, pl in zip(ori_tokens, ori_labels, prel):
                    if ot in ["[CLS]", "[SEP]"]:
                        continue
                    f.write(f"{ot} {ol} {pl}\n")
                f.write("\n")
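# --- Illustrative sketch (an assumption): the boolean_string converter used by
# the argparse flags in main() above, since passing type=bool would treat any
# non-empty string (including "False") as True.
def boolean_string(s):
    if s not in {'False', 'True'}:
        raise ValueError('Not a valid boolean string')
    return s == 'True'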
def train_and_eval(args, train_dataset, dev_dataset, test_dataset, model, tokenizer):
    """Train and evaluate the model on some steps"""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(
            comment=f"_with-similarity_{args.task_name}_{args.mt_system}_{args.num_train_epochs}")  # TensorboardX

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)

    best_dev_model = None
    best_dev_result = 0.0
    best_dev_step = 0
    no_improve_step = 0  # stop training once there has been no improvement for no_improve_step_num steps
    no_improve_step_num = 200
    evaluate_f1 = "macro_f1"
    if args.task_name == "tacred":
        evaluate_f1 = "f1_micro"

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        if no_improve_step >= no_improve_step_num:
            logger.info(f"No improvement in {no_improve_step_num} steps, total global step: {global_step}")
            break
        for step, batch in enumerate(epoch_iterator):
            model.train()
            # Move all data to device in batch
            batch = tuple(t.to(args.device) for t in batch)
            # Traverse bag=[batch_size, source_num, *]
            # source_num=4: human-label, google, baidu, xiaoniu
            # for i in range(batch[0].size()[1]):
            inputs = {
                'input_ids': batch[0],  # along dim to select index
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                'entity_position': batch[3],
                'labels': batch[4],
                'loss_weight': args.loss_weight,
                'mode': 'train'
            }
            outputs = model(mt_system=args.mt_system, **inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            # print(f"batch loss: {loss}, similarity loss: {outputs[1]}")

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
            if no_improve_step >= no_improve_step_num:
                logger.info(f"No improvement in {no_improve_step_num} steps, total global step: {global_step}")
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
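# --- Illustrative sketch (an assumption): the early-stopping update that the
# no_improve_step counter above is meant to be driven by at each evaluation;
# the shown loop initializes the counter but its update happens elsewhere.
def update_early_stopping(metric, best_metric, no_improve_step):
    """Return the updated (best_metric, no_improve_step) pair."""
    if metric > best_metric:
        return metric, 0  # improvement: reset the patience counter
    return best_metric, no_improve_step + 1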
def train_val_test(args, model, train_loader, val_loader, test_loader, missing_keys=()):
    """ Train the model """
    t_total = len(train_loader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # Alternative grouping: give the newly initialized (missing) parameters their own learning rate
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in model.named_parameters() if n in missing_keys], 'lr': 0.001},
    #     {'params': [p for n, p in model.named_parameters() if n not in missing_keys]}
    # ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # The classification task is class-imbalanced, so weight the loss by class frequency.
    wt = torch.Tensor([2785 / 16560, 13775 / 16560])
    # wt = torch.Tensor([0.25, 0.75])  # alternative hand-tuned weighting
    loss_func = nn.CrossEntropyLoss(weight=wt).to(args.device)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_loader.dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size = %d", args.batch_size)
    logger.info("  Total train batch size (distributed & accumulation) = %d",
                args.batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    res_eval = []
    res_test = []
    eval_best = 0
    for _ in train_iterator:
        epoch_iterator = tqdm(train_loader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            targets = torch.LongTensor(batch['label']).to(args.device)
            outputs = model(input_ids=batch['input_ids'],
                            attention_mask=batch['input_mask'],
                            token_type_ids=batch['segment_ids'],
                            max_seqs_per_doc=args.max_seq_per_doc)
            prob = outputs[0]  # model outputs are always tuples in pytorch-transformers (see doc)
            loss = loss_func(prob, targets)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
            if (step + 1) % 500 == 0:
                logger.info('step: %d, training loss: %s', global_step, tr_loss / global_step)
                results_eval = evaluate(args, model, val_loader)
                results_test = evaluate(args, model, test_loader)
                res_eval.append(results_eval)
                res_test.append(results_test)
                if results_eval[-4] > eval_best:
                    eval_best = results_eval[-4]  # only checkpoint on an improved validation score
                    output_dir = os.path.join(
                        pretrained_path,
                        'AKI/model_{}_ep{}_{}pool_wd0.001_drop{}{}'.format(
                            args.sampling, args.num_train_epochs, args.pooling, args.dropout,
                            '_oneseq' if args.as_one_sequence else ''))
                    if not os.path.exists(os.path.join(pretrained_path, 'AKI')):
                        os.makedirs(os.path.join(pretrained_path, 'AKI'))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
    return global_step, tr_loss / global_step, np.array(res_eval), np.array(res_test)
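# Illustration: the class weighting hard-coded in train_val_test above can also
# be derived from label counts. A minimal sketch of one common recipe (inverse
# class frequency), not the weighting the function actually uses; `label_counts`
# is a hypothetical input.
import torch
import torch.nn as nn

def make_weighted_ce(label_counts, device='cpu'):
    """Build a CrossEntropyLoss whose weights grow as classes get rarer."""
    counts = torch.tensor(label_counts, dtype=torch.float)
    weights = counts.sum() / (len(counts) * counts)  # inverse-frequency weights
    return nn.CrossEntropyLoss(weight=weights.to(device))

# e.g. with the 2785/13775 split quoted above:
# criterion = make_weighted_ce([2785, 13775], device='cuda')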
def train(model, tokenizer, train_dataset, eval_dataset, output_dir, per_gpu_train_batch_size, max_steps, num_train_epochs, gradient_accumulation_steps, learning_rate, adam_epsilon, warmup_steps, max_grad_norm, weight_decay, logging_steps=50, evaluate_during_training=True, save_steps=50): """ Train the model """ n_gpu = torch.cuda.device_count() train_batch_size = per_gpu_train_batch_size * max(1, n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size) if max_steps > 0: t_total = max_steps num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader) // gradient_accumulation_steps * num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", train_batch_size * gradient_accumulation_steps) logger.info(" Gradient Accumulation steps = %d", gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss = 0.0 logging_loss = 0.0 model.zero_grad() tb_writer = SummaryWriter(log_dir=get_log_dir(output_dir)) device = get_device() train_iterator = trange(int(num_train_epochs), desc="Epoch", disable=False) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False) for step, batch in enumerate(epoch_iterator): model.train() input_ids, attention_mask, token_type_ids, labels = tuple( t.to(device) for t in batch) outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels) loss = outputs[0] if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) tr_loss += loss.item() if (step + 1) % gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 if logging_steps > 0 and global_step % logging_steps == 0: # Log metrics log_performance_evaluation(train_dataset, "train", global_step, model, output_dir, tb_writer) if evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well log_performance_evaluation(eval_dataset, "eval", global_step, model, output_dir, tb_writer) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / logging_steps, global_step) tb_writer.flush() logging_loss = tr_loss if save_steps > 0 and global_step % save_steps == 0: # Save model checkpoint save_model_checkpoint(tokenizer, model, global_step, output_dir) if 0 < max_steps < global_step: epoch_iterator.close() break if 0 < max_steps < global_step: 
train_iterator.close() break tb_writer.close() return global_step, tr_loss / global_step
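# Note on update ordering: several loops in this file call scheduler.step()
# before optimizer.step(); PyTorch >= 1.1 warns about that order and skips the
# first value of the schedule. A minimal sketch of one backward/update step
# with gradient accumulation in the expected order (all arguments are
# assumptions standing in for the objects built by the trainers above):
import torch

def accumulation_step(loss, model, optimizer, scheduler, step,
                      accum_steps=1, max_grad_norm=1.0):
    """Backward pass plus a (possibly deferred) parameter and LR update."""
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()    # update the weights first...
        scheduler.step()    # ...then advance the learning rate schedule
        optimizer.zero_grad()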
class Trainer: def __init__(self, args, config, model, criterion, train_dataloader, valid_dataloader, logger, save_path, tb_writer): self.args = args self.config = config self.model = model self.criterion = criterion self.train_dataloader = train_dataloader self.valid_dataloader = valid_dataloader self.logger = logger self.save_path = save_path self.tb_writer = tb_writer self.t_total = len(self.train_dataloader) * self.args.epoch self.device = self.config.device # self.optimizer = AdamW(self.get_model_parameters(), lr=self.config.learning_rate) self.optimizer = Adam(self.get_model_parameters(), lr=self.config.learning_rate) self.scheduler = WarmupLinearSchedule(self.optimizer, 0.1 * self.t_total, self.t_total) self.global_step = 0 self.best_eval_loss = 7.0 def get_model_parameters(self): param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] return optimizer_grouped_parameters def train(self): for epoch in range(self.args.epoch): self.train_epoch(epoch) self.evaluation(epoch) self.write_to_tb() self.save_model(epoch) self.tb_writer.close() def transform_to_bert_input(self, batch): input_ids = torch.from_numpy(batch[0]).to(self.device) valid_length = batch[1].clone().detach().to(self.device) token_type_ids = torch.tensor(batch[2]).long().to(self.device) label = torch.from_numpy(batch[3]).to(self.device) return input_ids, valid_length, token_type_ids, label def train_epoch(self, epoch): self.model.to(self.device) self.model.train() tr_correct_cnt, tr_total_cnt = 0, 0 tr_loss = 0.0 train_loader = self.train_dataloader for step, batch in enumerate(train_loader): self.model.zero_grad() input_idx, valid_length, token_type_idx, label = self.transform_to_bert_input( batch) output = self.model(input_idx, valid_length, token_type_idx) loss = self.criterion(output.view(-1, output.size(-1)), label.view(-1)) tr_loss += loss.item() loss.backward() if step > 0 and ( step) % self.config.gradient_accumulation_steps == 0: self.global_step += self.config.gradient_accumulation_steps self.optimizer.step() self.optimizer.zero_grad() self.scheduler.step() self.tr_avg_loss = tr_loss / step if self.global_step % 100 == 0: self.logger.info( 'epoch : {} /{}, global_step : {} /{}, loss: {:.3f}, tr_avg_loss: {:.3f}' .format(epoch + 1, self.args.epoch, self.global_step, self.t_total, loss.item(), self.tr_avg_loss)) def evaluation(self, epoch): self.model.eval() eval_loss = 0.0 eval_step = 1 self.logger.info('*****************Evaluation*****************') valid_loader = tqdm(self.valid_dataloader) for step, batch in enumerate(valid_loader): with torch.no_grad(): input_idx, valid_length, token_type_idx, label = self.transform_to_bert_input( batch) output = self.model(input_idx, valid_length, token_type_idx) loss = self.criterion(output.view(-1, output.size(-1)), label.view(-1)) eval_loss += loss.item() eval_step += 1.0 self.eval_avg_loss = eval_loss / eval_step self.logger.info( 'epoch : {} /{}, global_step : {} /{}, eval_loss: {:.3f}'.format( epoch + 1, self.args.epoch, self.global_step, self.t_total, self.eval_avg_loss)) def save_model(self, epoch): if self.eval_avg_loss < self.best_eval_loss: self.best_eval_loss = self.eval_avg_loss self.model.to(torch.device('cpu')) state = { 'epoch': epoch + 1, 'model_state_dict': 
self.model.state_dict()
            }
            save_model_path = '{}/epoch_{}_step_{}_tr_loss_{:.3f}_eval_loss_{:.3f}.pt'.format(
                self.save_path, epoch + 1, self.global_step, self.tr_avg_loss,
                self.eval_avg_loss)
            # Delete the previous checkpoint so only the best one is kept
            if len(glob.glob(self.save_path + '/epoch*.pt')) > 0:
                os.remove(glob.glob(self.save_path + '/epoch*.pt')[0])
            torch.save(state, save_model_path)
            self.logger.info(' Model saved to {}'.format(save_model_path))
            os.mkdir(self.save_path + '/epoch_{}_eval_loss_{:.3f}'.format(
                epoch + 1, self.eval_avg_loss))

    def write_to_tb(self):
        self.tb_writer.add_scalars('loss', {
            'train': self.tr_avg_loss,
            'val': self.eval_avg_loss
        }, self.global_step)
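# The Trainer above keeps at most one checkpoint on disk by deleting the
# previous epoch_*.pt before writing a new one. The same rotation extracted
# into a standalone sketch (the file naming is an assumption carried over from
# save_model above):
import glob
import os
import torch

def save_best_checkpoint(state, save_dir, filename):
    """Remove older epoch_*.pt checkpoints, then save `state` under `filename`."""
    for old in glob.glob(os.path.join(save_dir, 'epoch*.pt')):
        os.remove(old)
    path = os.path.join(save_dir, filename)
    torch.save(state, path)
    return path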
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch_num in train_iterator:
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                      'labels': batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuples in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    logger.info("Step {0}, Loss: {1}".format(global_step, loss.item()))
        if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
            eval_examples = read_cmsqa_examples(args.predict_file, is_training=True,
                                                num_choices=args.num_choices)
            eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                         args.max_seq_length, True)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=args.per_gpu_eval_batch_size)
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                device = args.device
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = None  # segment ids are deliberately dropped at eval time here
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                    logits = model(input_ids, segment_ids, input_mask)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_eval_accuracy = accuracy(logits, label_ids)
                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}
            if args.do_train:
                result['global_step'] = global_step
            output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                writer.write('Epoch' + str(epoch_num) + '\n')
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
        output_dir = os.path.join(args.output_dir, str(epoch_num))
        os.makedirs(output_dir, exist_ok=True)
        logger.info("Saving model checkpoint to %s", output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
    return global_step, tr_loss / global_step
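# The evaluation loop above relies on an accuracy(logits, labels) helper that is
# defined elsewhere. Judging from how its return value is summed and then
# divided by nb_eval_examples, it must return a count of correct predictions;
# a hypothetical sketch consistent with that usage:
import numpy as np

def accuracy(logits, labels):
    """Number of rows whose argmax matches the gold label."""
    preds = np.argmax(logits, axis=1)
    return int((preds == labels).sum())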
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--output_dir", default='fintuned_gpt', type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument('--num_prior', type=int, default=2) parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training \ steps to perform. Override num_train_epochs.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before\ performing a backward/update pass.") parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['[BOS]', '[SEP]', '[CLS]'] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name) model.resize_token_embeddings(new_num_tokens=len(tokenizer)) model.to(device) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding dataset...") train_dataset = load_dataset(tokenizer, args.dataset, num_prior=args.num_prior, t='train') eval_dataset = load_dataset(tokenizer, args.dataset, num_prior=args.num_prior, t='eval') datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) # Compute the max input length for the 
Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) train_tensor_dataset, eval_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer if args.do_train: if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps //\ (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader)\ // args.gradient_accumulation_steps * args.num_train_epochs param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] loss.backward() scheduler.step() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, scheduler.get_lr()[0]) # Save a trained model if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir) tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir) model.to(device) if args.do_eval: model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, 
lm_labels, mc_labels = batch with torch.no_grad(): _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids, lm_labels, mc_labels) mc_logits = mc_logits.detach().cpu().numpy() mc_labels = mc_labels.to('cpu').numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += mc_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples train_loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
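# The GPT training loop above smooths the displayed loss with an exponential
# moving average (0.7 * previous + 0.3 * current). The same update as a tiny
# helper, shown only for clarity:
def ema(prev, value, decay=0.7):
    """Exponential moving average; returns `value` unchanged on the first call."""
    return value if prev is None else decay * prev + (1.0 - decay) * value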
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) data_splitList = DATACQA.load_data(os.path.join( self.data_dir, 'train.csv'), n_splits=5) for split_index, each_data in enumerate(data_splitList): # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels) model = BertForSequenceClassification.from_pretrained( self.model_name_or_path, self.args, config=config) model.to(self.device) logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader( each_data) num_train_optimization_steps = self.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and ( step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] scores = [] questions = [x.text_a for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) # Run prediction for full data model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() 
inference_labels.append(np.argmax(logits, axis=1)) scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracyCQA(inference_logits, gold_labels) eval_mrr = compute_MRR_CQA(scores, gold_labels, questions) # eval_5R20 = compute_5R20(scores,gold_labels,questions) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'eval_MRR': eval_mrr, # 'eval_5R20':eval_5R20, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join( self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( self.output_dir, "pytorch_model_{}.bin".format(split_index)) torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) del model gc.collect()
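# compute_MRR_CQA above is project-specific and not shown in this file. For
# reference, mean reciprocal rank over per-question candidate lists can be
# sketched as below; this is a hypothetical reconstruction, assuming `scores`
# holds [N, 2] logits whose second column ranks candidates and label 1 marks
# the gold answer:
import numpy as np
from collections import defaultdict

def mean_reciprocal_rank(scores, labels, questions):
    groups = defaultdict(list)
    for score, label, question in zip(scores, labels, questions):
        groups[question].append((score[1], label))
    reciprocal_ranks = []
    for candidates in groups.values():
        ranked = sorted(candidates, key=lambda pair: -pair[0])
        for rank, (_, label) in enumerate(ranked, 1):
            if label == 1:
                reciprocal_ranks.append(1.0 / rank)
                break
    return float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0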
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter(log_dir=args.result_dir) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs data_length = len(train_dataloader) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=True) set_seed( args) # Added here for reproducibility (even between python 2 and 3) for epo in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True) for step, batch in enumerate(epoch_iterator): #inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs, labels = batch['masked_text_seq'], batch[ 'masked_text_label'] lang_attention_mask = batch['lang_attention_mask'] img_feats = batch['feature_all'] actions = batch['teacher'] inputs = inputs.to(args.device) labels = labels.to(args.device) actions = actions.to(args.device) img_feats = img_feats.to(args.device) lang_attention_mask = lang_attention_mask.to(args.device) model.train() #outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) if args.include_next: outputs = model(inputs, labels, actions, img_feats, lang_mask=lang_attention_mask) else: outputs = model(inputs, labels, None, img_feats, lang_mask=lang_attention_mask) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 #if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % data_length == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if step % 100 == 0: print("\n") print("PROGRESS: {}%".format( round((epo * len(train_dataloader) + step) * 100 / t_total, 4))) print("EVALERR: {}%".format(tr_loss / (global_step))) tb_writer.add_scalar("evalerr", tr_loss / global_step, global_step=global_step) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break return global_step, tr_loss / global_step
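# The commented-out mask_tokens call in the loop above refers to BERT-style
# masked language modelling (mask 15% of tokens; of those, 80% become [MASK],
# 10% a random token, 10% stay unchanged). A condensed sketch of that scheme;
# it omits the special-token filtering a full implementation performs, and the
# ignore index is -100 in recent transformers (older pytorch-transformers used -1):
import torch

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    labels = inputs.clone()
    masked = torch.bernoulli(torch.full(labels.shape, mlm_probability)).bool()
    labels[~masked] = -100  # only masked positions contribute to the loss
    # 80% of masked positions: replace with [MASK]
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    # half of the remainder (10% overall): replace with a random token
    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    random_ids = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[randomized] = random_ids[randomized]
    # the final 10% keep their original token
    return inputs, labels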
def train(args, train_dataset, model):
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    if args.max_steps > 0:
        # If a maximum number of steps is given, recompute the number of epochs from it
        total_train_steps = args.max_steps
        args.train_epoch_num = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        total_train_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.train_epoch_num

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.eps)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=total_train_steps)

    print("*********** Train ***********")
    print("\tExamples num: %d" % (len(train_dataset)))
    print("\tEpoch num: %d" % (args.train_epoch_num))
    print("\tBatch size: %d" % (args.train_batch_size))
    print("\tGradient accumulation steps: %d" % (args.gradient_accumulation_steps))
    print("\tTrain data loader num: %d" % (len(train_dataloader)))
    print("\tTotal optimization steps: %d" % (total_train_steps))

    global_step = 0
    train_loss = 0.0
    model.zero_grad()  # zero out the gradients
    train_iterator = trange(args.train_epoch_num, desc="Epoch")
    fix_seed(args)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            batch = [ex.to(args.device) for ex in batch]
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],  # segment_ids
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            # Clip the gradients to guard against exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            train_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Once the configured number of accumulation steps is reached, update the
                # weights and the learning rate, then clear the accumulated gradients.
                # Per the PyTorch WARNING, these two calls must run in this order.
                optimizer.step()   # apply the weight update
                scheduler.step()   # advance the learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save a checkpoint every save_steps optimization steps
                    checkpoint_path = os.path.join(args.output_model_dir,
                                                   'checkpoint-on-step-{}'.format(global_step))
                    if not os.path.exists(checkpoint_path):
                        os.makedirs(checkpoint_path)
                    # model.save_pretrained(checkpoint_path)
                    # torch.save(args, os.path.join(checkpoint_path, 'training_args.bin'))
                    model.config.save_pretrained(checkpoint_path)
                    torch.save(model.state_dict(),
                               os.path.join(checkpoint_path, "pytorch_model.bin"),
                               _use_new_zipfile_serialization=False)
                    torch.save(args, os.path.join(checkpoint_path, 'train_args.bin'),
                               _use_new_zipfile_serialization=False)
            if args.max_steps > 0 and global_step > args.max_steps:
                # Stop once max_steps is reached
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            # Stop once max_steps is reached
            train_iterator.close()
            break
    return global_step, train_loss / global_step
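# The checkpoints above are written with _use_new_zipfile_serialization=False so
# that PyTorch versions before 1.6 (which introduced the zipfile format) can
# still read them. A matching loader sketch; the file name mirrors the one used
# in the function above:
import os
import torch

def load_checkpoint(model, checkpoint_path):
    state = torch.load(os.path.join(checkpoint_path, "pytorch_model.bin"),
                       map_location='cpu')
    model.load_state_dict(state)
    return model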
def train(args, train_dataset, model, tokenizer): args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) ## DATALOADER train_sampler = SequentialSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) graph_train_dataloader_a, graph_train_dataloader_b = load_graph_examples( args, reverse=True) t_total = len(train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs assert len(train_dataset) == len(graph_train_dataloader_a.dataset) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) ## OPTIMIZERS AND SCHEDULERS if not args.text_only: graph_optimizer = AdamW(model.graph_encoder.parameters(), lr=args.graph_lr, weight_decay=args.weight_decay) linear_optimizer = AdamW(model.classifier.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) linear_c_optimizer = AdamW(model.classifier_c.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) linear_c2_optimizer = AdamW(model.classifier_c2.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) linear_type_optimizer = AdamW(model.classifier_type.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) bert_optimizer_grouped_parameters = get_bert_param_groups( model.text_encoder, args) bert_optimizer = AdamW(bert_optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, weight_decay=args.weight_decay) bert_scheduler = WarmupLinearSchedule(bert_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if not args.text_only: graph_scheduler = WarmupLinearSchedule(graph_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) linear_scheduler = WarmupLinearSchedule(linear_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) linear_c_scheduler = WarmupLinearSchedule(linear_c_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) linear_c2_scheduler = WarmupLinearSchedule(linear_c2_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) linear_type_scheduler = WarmupLinearSchedule( linear_type_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) ## TRAIN global_step = 0 best_val = 0. 
tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() set_seed(args) print('layernorm on: {}'.format(args.do_layernorm)) for _ in trange(int(args.num_train_epochs), desc='Epoch'): for batch, data_a, data_b in tqdm(zip(train_dataloader, graph_train_dataloader_a, graph_train_dataloader_b), desc='Iteration', total=len(train_dataloader)): model.train() batch = tuple(t.to(args.device) for t in batch) data_a, data_b = data_a.to(args.device), data_b.to(args.device) loss_combined = 0.0 if args.aux == 'all': modes = ['medsts', 'medsts_c', 'medsts_c2', 'medsts_type'] elif args.aux == 'medsts_c': modes = ['medsts', 'medsts_c'] elif args.aux == 'medsts_c2': modes = ['medsts', 'medsts_c2'] elif args.aux == 'medsts_type': modes = ['medsts', 'medsts_type'] else: modes = ['medsts'] for mode in modes: # torch.cuda.empty_cache() logits = model(batch[0], batch[1], batch[2], data_a, data_b, mode, args.do_layernorm) if mode == 'medsts': if args.kd: loss = F.mse_loss( logits, data_a.label ) + args.loss_factor * bounded_kd_loss( logits, batch[5], data_a.label, margin=args.margin) else: loss = F.mse_loss(logits, data_a.label) elif mode == 'medsts_c': loss = F.cross_entropy(logits, data_a.label_c) elif mode == 'medsts_c2': loss = F.cross_entropy(logits, data_a.label_c2) elif mode == 'medsts_type': loss = F.cross_entropy(logits, data_a.label_type) loss_combined += loss loss_combined.backward() torch.nn.utils.clip_grad_norm_(model.text_encoder.parameters(), args.max_grad_norm) tr_loss += loss.item() bert_scheduler.step() if not args.text_only: graph_scheduler.step() linear_scheduler.step() linear_c_scheduler.step() linear_c2_scheduler.step() linear_type_scheduler.step() bert_optimizer.step() if not args.text_only: graph_optimizer.step() linear_optimizer.step() linear_c_optimizer.step() linear_c2_optimizer.step() linear_type_optimizer.step() model.zero_grad() global_step += 1 args.logging_steps = len(train_dataloader) // 4 if global_step % args.logging_steps == 0: if args.do_eval: result, errors = evaluate(args, model, tokenizer) if result['pearson'] > best_val: best_val = result['pearson'] output_dir = os.path.join(args.output_dir, 'checkpoint') if not os.path.exists(output_dir): os.makedirs(output_dir) logger.info( "Saving checkpoint with acc {} at {}".format( best_val, output_dir)) torch.save( args, os.path.join(output_dir, 'training_args.bin')) model_to_save = model.module if hasattr( model, 'module') else model torch.save(model_to_save.state_dict(), os.path.join(output_dir, 'saved_model.pth')) tokenizer.save_pretrained(output_dir) #saving text encoder model.text_encoder.save_pretrained(output_dir) #error analysis errors.to_csv(os.path.join(output_dir, 'errors.csv'), index=False) #save results.txt best_eval_file = os.path.join(output_dir, 'eval_results.txt') with open(best_eval_file, 'a') as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("{} = {} \t step {} \n".format( key, str(result[key]), global_step)) else: if global_step == t_total - 5: print( '@@@@@@@@@@@@@@@@@@@@@@@@@ saving checkpoint @@@@@@@@@@@@@@@@@@@@@@' ) output_dir = os.path.join( args.output_dir, 'checkpoint_{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) logger.info( "Saving checkpoint at {}".format(output_dir)) model.text_encoder.save_pretrained(output_dir) torch.save( args, os.path.join(output_dir, 'training_args.bin')) model_to_save = model.module if hasattr( model, 'module') else model 
torch.save(model_to_save.state_dict(), os.path.join(output_dir, 'saved_model.pth')) tokenizer.save_pretrained(output_dir) logging_loss = tr_loss return global_step, tr_loss / global_step
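# The loop above steps six optimizer/scheduler pairs one by one. The same
# bookkeeping can be written once over a list; a sketch, with `pairs` as a
# hypothetical container of (optimizer, scheduler) tuples:
def step_all(pairs, model=None):
    """Step every optimizer before its scheduler, then clear gradients."""
    for optimizer, scheduler in pairs:
        optimizer.step()
        scheduler.step()
    if model is not None:
        model.zero_grad()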
def train_portion(ace_ontology, data_ace, ratio):
    # hyper-parameters
    n_epoch = 10
    batch_size = 12
    learning_rate = 5e-5
    adam_epsilon = 1e-8
    warmup_steps = 0
    max_grad_norm = 1.0

    # load model
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
    device = 'cuda'
    tokenizer, model = load_model('../data/my-bert-large-cased-squad/', device)
    # tokenizer, model = load_model('/home/jliu/data/BertModel/bert-large-cased', device)

    cut_idx = int(len(data_ace['train']) * ratio)
    print('Training examples', cut_idx)
    max_seq_len = 120
    training_set = build_examples(ace_ontology, data_ace['train'][:cut_idx], training=True)
    training_set = transfer_to_query_bert_format(training_set, tokenizer, max_seq_len, training=True)
    train_dataset = Dataset(batch_size, max_seq_len, training_set)

    # development set
    dev_set = build_examples(ace_ontology, data_ace['dev'], training=False)
    dev_set = transfer_to_query_bert_format(dev_set, tokenizer, max_seq_len, training=False)
    dev_dataset = Dataset(batch_size, max_seq_len, dev_set)

    t_total = int(n_epoch * len(training_set) / batch_size)
    no_decay = ['bias', 'LayerNorm.weight']
    # Note: weight decay is disabled for both groups here, so the no_decay split is a no-op.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    torch.cuda.empty_cache()
    global_step = 0
    for _ in range(n_epoch):
        for batch in train_dataset.get_tqdm(device, shuffle=True):
            global_step += 1
            model.train()
            input_ids, input_mask, segment_ids, start_positions, end_positions, token_to_orig_map, example = batch
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': segment_ids,
                      'start_positions': start_positions,
                      'end_positions': end_positions}
            outputs = model(**inputs)
            loss = outputs[0]
            loss = loss.mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
        model.eval()
        with torch.no_grad():
            model_evaluation(model, dev_dataset, device)
        print()
    # model.destroy()
    del model
    torch.cuda.empty_cache()
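# WarmupLinearSchedule, used throughout this file, ramps the learning rate
# linearly from 0 to the base LR over warmup_steps and then decays it linearly
# to 0 at t_total. Its closed form, written out for reference:
def warmup_linear_lr(step, base_lr, warmup_steps, t_total):
    if warmup_steps > 0 and step < warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr * max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))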
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--vocab_file", default='bert-base-uncased-vocab.txt', type=str, required=True) parser.add_argument("--model_file", default='bert-base-uncased.tar.gz', type=str, required=True) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument( "--predict_dir", default=None, type=str, required=True, help="The output directory where the predictions will be written.") # Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=2.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--view_id', type=int, default=1, help="view id of multi-view co-training(two-view)") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--adam_epsilon', type=float, default=1e-8) parser.add_argument('--max_grad_norm', type=float, default=1.0) # Base setting parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--max_ctx', type=int, default=2) parser.add_argument('--task_name', type=str, default='coqa_yesno') parser.add_argument('--bert_name', type=str, default='baseline') parser.add_argument('--reader_name', type=str, default='coqa') # model parameters parser.add_argument('--evidence_lambda', type=float, default=0.8) parser.add_argument('--tf_layers', type=int, default=1) parser.add_argument('--tf_inter_size', type=int, default=3072) # Parameters for running labeling model parser.add_argument('--do_label', default=False, action='store_true') parser.add_argument('--sentence_id_files', nargs='*') parser.add_argument('--weight_threshold', type=float, default=0.0) parser.add_argument('--only_correct', default=False, action='store_true') parser.add_argument('--label_threshold', type=float, default=0.0) args = parser.parse_args() logger = setting_logger(args.output_dir) logger.info('================== Program start. ========================') model_params = prepare_model_params(args) read_params = prepare_read_params(args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if args.do_train: if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) if args.do_predict: os.makedirs(args.predict_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.vocab_file) data_reader = initialize_reader(args.reader_name) num_train_steps = None if args.do_train or args.do_label: train_examples = data_reader.read(input_file=args.train_file, **read_params) cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), str(args.max_ctx), str(args.task_name)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except FileNotFoundError: train_features = data_reader.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) print(train_features[-1].unique_id) num_train_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if args.pretrain is not None: logger.info('Load pretrained model from {}'.format(args.pretrain)) model_state_dict = torch.load(args.pretrain, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) else: model = initialize_model(args.bert_name, args.model_file, **model_params) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
)
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used
    # and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    t_total = num_train_steps

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_proportion * t_total,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # Prepare data
    eval_examples = data_reader.read(input_file=args.predict_file, **read_params)
    eval_features = data_reader.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    eval_tensors = data_reader.data_to_tensors(eval_features)
    eval_data = TensorDataset(*eval_tensors)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

    if args.do_train:
        if args.do_label:
            logger.info('Training in state-wise mode.')
            sentence_id_file_list = args.sentence_id_files
            if sentence_id_file_list is not None:
                for file in sentence_id_file_list:
                    train_features = data_reader.generate_features_sentence_ids(train_features, file)
            else:
                train_features = data_reader.mask_all_sentence_ids(train_features)
                logger.info('No sentence id supervision is found.')
        else:
            logger.info('Training in the traditional way.')

        logger.info("Start training")
        train_loss = AverageMeter()
        best_acc = 0.0
        summary_writer = SummaryWriter(log_dir=args.output_dir)
        global_step = 0
        eval_loss = AverageMeter()

        train_tensors = data_reader.data_to_tensors(train_features)
        train_data = TensorDataset(*train_tensors)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # Train
            model.train()
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch =
batch_to_device( batch, device) # multi-gpu does scattering itself inputs = data_reader.generate_inputs( batch, train_features, do_label=args.do_label, model_state=ModelState.Train) loss = model(**inputs)['loss'] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: # optimizer.backward(loss) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule after the optimizer step model.zero_grad() global_step += 1 # # modify learning rate with special warm up BERT uses # """ 19.7.4: Fix learning rate bug. """ # if args.fp16: # """ 19.7.4 warmup_linear is used as the function in optimization not as the comment above. """ # lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) # for param_group in optimizer.param_groups: # param_group['lr'] = lr_this_step # summary_writer.add_scalar('lr', lr_this_step, global_step) # else: # summary_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) # # optimizer.step() # optimizer.zero_grad() # global_step += 1 train_loss.update(loss.item(), args.train_batch_size) summary_writer.add_scalar('train_loss', train_loss.avg, global_step) summary_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) # Evaluation model.eval() all_results = [] logger.info("Start evaluating") for eval_step, batch in enumerate( tqdm(eval_dataloader, desc="Evaluating")): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering itself inputs = data_reader.generate_inputs( batch, eval_features, do_label=args.do_label, model_state=ModelState.Evaluate) with torch.no_grad(): output_dict = model(**inputs) loss, batch_choice_logits = output_dict[ 'loss'], output_dict['yesno_logits'] eval_loss.update(loss.item(), args.predict_batch_size) summary_writer.add_scalar( 'eval_loss', eval_loss.avg, epoch * len(eval_dataloader) + eval_step) example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu( ).tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResultChoice(unique_id=unique_id, choice_logits=choice_logits)) data_reader.write_predictions(eval_examples, eval_features, all_results, None, null_score_diff_threshold=0.0) yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no') no_metric = data_reader.yesno_cate.f1_measure('no', 'yes') current_acc = yes_metric['accuracy'] summary_writer.add_scalar('eval_yes_f1', yes_metric['f1'], epoch) summary_writer.add_scalar('eval_yes_recall', yes_metric['recall'], epoch) summary_writer.add_scalar('eval_yes_precision', yes_metric['precision'], epoch) summary_writer.add_scalar('eval_no_f1', no_metric['f1'], epoch) summary_writer.add_scalar('eval_no_recall', no_metric['recall'], epoch) summary_writer.add_scalar('eval_no_precision', no_metric['precision'], epoch) summary_writer.add_scalar('eval_yesno_acc', current_acc, epoch) torch.cuda.empty_cache() if current_acc > best_acc: best_acc = current_acc model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
torch.save(model_to_save.state_dict(), output_model_file) logger.info('Epoch: %d, Accuracy: %f (Best Accuracy: %f)' % (epoch, current_acc, best_acc)) data_reader.yesno_cate.reset() summary_writer.close() # Loading trained model. output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") model_state_dict = torch.load(output_model_file, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) model.to(device) # Write Yes/No predictions if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = eval_examples test_features = eval_features test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] logger.info("Start predicting yes/no on Dev set.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs(batch, test_features, do_label=args.do_label, model_state=ModelState.Test) with torch.no_grad(): batch_choice_logits = model(**inputs)['yesno_logits'] example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu().tolist() test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( RawResultChoice(unique_id=unique_id, choice_logits=choice_logits)) output_prediction_file = os.path.join(args.predict_dir, 'predictions.json') data_reader.write_predictions(eval_examples, eval_features, all_results, output_prediction_file, null_score_diff_threshold=0.0) yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no') no_metric = data_reader.yesno_cate.f1_measure('no', 'yes') logger.info('Yes Metrics: %s' % json.dumps(yes_metric, indent=2)) logger.info('No Metrics: %s' % json.dumps(no_metric, indent=2)) # Labeling sentence id. 
if args.do_label and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = train_examples test_features = train_features test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running labeling *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] logger.info("Start labeling.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device(batch, device) inputs = data_reader.generate_inputs(batch, test_features, do_label=args.do_label, model_state=ModelState.Test) with torch.no_grad(): output_dict = model(**inputs) batch_choice_logits = output_dict['yesno_logits'] batch_max_weight_indexes = output_dict['max_weight_index'] batch_max_weight = output_dict['max_weight'] example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu().tolist() max_weight_index = batch_max_weight_indexes[i].detach().cpu( ).tolist() max_weight = batch_max_weight[i].detach().cpu().tolist() test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( WeightResultChoice(unique_id=unique_id, choice_logits=choice_logits, max_weight_index=max_weight_index, max_weight=max_weight)) output_prediction_file = os.path.join(args.predict_dir, 'sentence_id_file.json') data_reader.predict_sentence_ids( test_examples, test_features, all_results, output_prediction_file, weight_threshold=args.weight_threshold, only_correct=args.only_correct, label_threshold=args.label_threshold)
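# NOTE: the training, evaluation, and labeling loops above call `batch_to_device`
# and use `AverageMeter` objects, neither of which is defined in this excerpt.
# The following is a minimal sketch of what they presumably look like; the
# exact signatures and behavior are assumptions, not the original code.
import torch

def batch_to_device(batch, device):
    # Move every tensor in the batch tuple to the target device;
    # leave non-tensor entries (e.g. example indices kept on CPU) as-is.
    return tuple(t.to(device) if isinstance(t, torch.Tensor) else t
                 for t in batch)

class AverageMeter:
    """Tracks a running average, e.g. of the per-batch loss."""
    def __init__(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, value, n=1):
        # `value` is a batch-mean quantity, `n` the number of items it covers.
        self.sum += value * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)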
def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed, epochs, data_path, pretrained_path, valid_path=None, no_cuda=False, dropout=0.3, weight_decay=0.01, warmup_proportion=0.1, learning_rate=5e-5, adam_epsilon=1e-8, max_seq_length=128, squeeze=True, max_grad_norm=1.0, eval_batch_size=32, epoch_save_model=False, model_name='XLMR', embedding_path=None, split_train_data=False, data_divider=0.6, wandb=None, save=True, logger=None, json_dataset=False, label_file=None, xlm_dataset=False, div=None, div_2=None, motherfile=False, multi_source_labels=False, device=0): epoch_times = [] if wandb: import wandb wandb.init(project='ABOM-PolEmo', config={ "epochs": epochs, "language_model": pretrained_path, "batch_size": train_batch_size, "max_seq_length": max_seq_length, "warmup_proportion": warmup_proportion, "learning_rate": learning_rate, "gradient_accumulation_steps": gradient_accumulation_steps, "squeeze": squeeze, "dropout": dropout, "output_dir": output_dir }) if save and os.path.exists(output_dir) and os.listdir(output_dir): raise ValueError( "Output directory (%s) already exists and is not empty." % output_dir) if save and not os.path.exists(output_dir): os.makedirs(output_dir) if not logger: logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO, filename=os.path.join(output_dir, "log.txt")) logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) logger = logging.getLogger(__name__) if gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(gradient_accumulation_steps)) train_batch_size = train_batch_size // gradient_accumulation_steps random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if split_train_data: if json_dataset: examples, label_list = get_examples_from_json(data_path) elif motherfile: examples, label_list = get_examples_from_motherfile(data_path) elif xlm_dataset: examples, label_list = get_examples_from_xml(data_path) else: examples, label_list = get_examples(data_path, 'train') random.shuffle(examples) train_examples = examples[0:int(len(examples) * data_divider)] val_examples = examples[int(len(examples) * data_divider):] eval_examples = examples[( int(len(examples) * data_divider) + int(len(examples) * ((1 - data_divider) / 2))):] else: train_examples = None val_examples = None if json_dataset: examples, label_list = get_examples_from_json(data_path) elif motherfile: train_examples, train_label_list = get_examples_from_motherfile( data_path, 'train') val_examples, val_label_list = get_examples_from_motherfile( data_path, 'test') train_label_list.extend(val_label_list) label_list = list(set(train_label_list)) elif xlm_dataset: examples, label_list = get_examples_from_xml(data_path) else: train_examples, label_list = get_examples(data_path, 'train') logger.info("\nDATA SIZE\n") logger.info("Train = %d\n" % len(train_examples)) if val_examples is not None: logger.info("Val = %d\n" % len(val_examples)) num_train_optimization_steps = 0 num_labels = len(label_list) + 1 num_train_optimization_steps = int( len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs hidden_size = 300 if pretrained_path is None else 768 if 'base' in pretrained_path else 1024 device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu' logger.info(device) if model_name == 'HERBERT': model = AutoTokenizerForTokenClassification( pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size,
dropout_p=dropout, device=device) elif model_name == 'BERT_MULTILINGUAL': model = BertBaseMultilingualCased(pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout, device=device) elif model_name == 'REFORMER': model = Reformer(n_labels=num_labels, hidden_size=512, dropout=dropout, device=device, max_seq_length=max_seq_length, batch_size=train_batch_size) elif model_name == 'POLISH_ROBERTA': model = PolishRoberta(pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout, device=device) else: model = XLMRForTokenClassification(pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout=dropout, device=device) model.to(device) if wandb: wandb.watch(model) no_decay = ['bias', 'final_layer_norm.weight'] params = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay }, { 'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, model.encode_word) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = create_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size) if not split_train_data and not val_examples and not motherfile: val_examples, _ = get_examples(valid_path, 'valid') val_features = convert_examples_to_features(val_examples, label_list, max_seq_length, model.encode_word) val_data = create_dataset(val_features) best_val_f1 = 0.0 best_precision = 0.0 best_recall = 0.0 for epoch_no in range(1, epochs + 1): start = timer() epoch_stats = {"epoch": epoch_no} logger.info("Epoch %d" % epoch_no) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 model.train() steps = len(train_dataloader) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, label_ids, l_mask, valid_ids, = batch loss = model(input_ids, label_ids, l_mask, valid_ids) if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) tr_loss += loss.item() epoch_stats["loss"] = loss if wandb: wandb.log({"loss": loss}) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if step % 5 == 0: logger.info('Step = %d/%d; Loss = %.4f' % (step + 1, steps, tr_loss / (step + 1))) if (step + 1) % gradient_accumulation_steps == 0: optimizer.step() scheduler.step() model.zero_grad() del batch logger.info("\nTesting on validation set...") f1, report, entity_scores, precision, recall = evaluate_model( model, val_data, label_list, eval_batch_size, device) epoch_stats["validation_F1"] = f1 print(report) if f1 > best_val_f1: best_val_f1 = f1 best_precision = precision best_recall = recall logger.info( "\nFound better f1=%.4f on validation set. 
Saving model\n" % f1) logger.info("%s\n" % report) if save: torch.save( model.state_dict(), open(os.path.join(output_dir, 'model.pt'), 'wb')) save_params(output_dir, dropout, num_labels, label_list) if save and epoch_save_model: epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no) os.makedirs(epoch_output_dir) if save: torch.save( model.state_dict(), open(os.path.join(epoch_output_dir, 'model.pt'), 'wb')) save_params(epoch_output_dir, dropout, num_labels, label_list) if wandb: wandb.log(epoch_stats) epoch_times.append(timer() - start) model.cpu() del model, logger torch.cuda.empty_cache() print("Avg. epoch time") print(np.mean(epoch_times, axis=0)) print(max_seq_length) return best_val_f1, entity_scores, best_precision, epoch_times, best_recall
inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() logits = model(inputs)[0] idx = batch['sum_idx'].item() # index of separator token # only consider loss on reference summary just like seq2seq models shift_logits = logits[..., idx:-1, :].contiguous() shift_labels = labels[..., idx+1:].contiguous() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss = loss/args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 writer.add_scalar('lr', scheduler.get_lr()[0], global_step) writer.add_scalar('loss', (tr_loss - logging_loss)/args.gradient_accumulation_steps, global_step) logging_loss = tr_loss print("loss:", loss.item(), end='\n\n') if (step + 1)/args.gradient_accumulation_steps == 1.0: print('After 1st update: ', end='\n\n') generate_sample(valid_dataset, tokenizer, num=2, eval_step=False) if (step + 1) % (10*args.gradient_accumulation_steps) == 0: results = evaluate(args, model, valid_dataset, ignore_index, global_step) for key, value in results.items(): writer.add_scalar('eval_{}'.format(key), value, global_step)
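# NOTE: a toy illustration of the logit/label shift used in the loop above,
# where the loss is restricted to the reference summary that follows the
# separator token at `sum_idx`; the tensor sizes here are made up.
import torch
import torch.nn as nn

vocab_size, idx = 10, 3                       # idx = position of the separator
logits = torch.randn(1, 6, vocab_size)        # model outputs for 6 positions
labels = torch.randint(0, vocab_size, (1, 6))  # target token ids
# logits at positions idx..n-2 predict the tokens at positions idx+1..n-1,
# so only summary tokens contribute to the loss, as in a seq2seq objective.
shift_logits = logits[..., idx:-1, :].contiguous()
shift_labels = labels[..., idx + 1:].contiguous()
loss = nn.CrossEntropyLoss()(shift_logits.view(-1, vocab_size),
                             shift_labels.view(-1))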
def train(args, train_dataset, model, tokenizer): tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) ## DATALOADER train_sampler = SequentialSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) graph_train_dataloader_a, graph_train_dataloader_b = load_graph_examples( args) args.logging_steps = len(train_dataloader) args.save_steps = len(train_dataloader) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs assert len(train_dataset) == len(graph_train_dataloader_a.dataset) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) graph_optimizer = AdamW(model.graph_encoder.parameters(), lr=args.graph_lr, weight_decay=args.weight_decay) linear_optimizer = AdamW(model.classifier.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) linear_c_optimizer = AdamW(model.classifier_c.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) linear_type_optimizer = AdamW(model.classifier_type.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) # bert_optimizer_grouped_parameters = get_bert_param_groups(model, args) bert_optimizer_grouped_parameters = get_bert_param_groups( model.text_encoder, args) bert_optimizer = AdamW(bert_optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, weight_decay=args.weight_decay) if args.scheduler == 'linear': scheduler = WarmupLinearSchedule(bert_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) graph_scheduler = WarmupLinearSchedule(graph_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) linear_scheduler = WarmupLinearSchedule(linear_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) linear_c_scheduler = WarmupLinearSchedule( linear_c_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) linear_type_scheduler = WarmupLinearSchedule( linear_type_optimizer, warmup_steps=args.warmup_steps, t_total=t_total) elif args.scheduler == 'cosine': scheduler = WarmupCosineWithHardRestartsSchedule( bert_optimizer, warmup_steps=args.warmup_steps, t_total=t_total, cycles=2.) graph_scheduler = WarmupCosineWithHardRestartsSchedule( graph_optimizer, warmup_steps=args.warmup_steps, t_total=t_total, cycles=2.) linear_scheduler = WarmupCosineWithHardRestartsSchedule( linear_optimizer, warmup_steps=args.warmup_steps, t_total=t_total, cycles=2.) linear_c_scheduler = WarmupCosineWithHardRestartsSchedule( linear_c_optimizer, warmup_steps=args.warmup_steps, t_total=t_total, cycles=2.) linear_type_scheduler = WarmupCosineWithHardRestartsSchedule( linear_type_optimizer, warmup_steps=args.warmup_steps, t_total=t_total, cycles=2.)
## TRAIN global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() set_seed(args) for _ in trange(int(args.num_train_epochs), desc='Epoch'): for batch, data_a, data_b in tqdm(zip(train_dataloader, graph_train_dataloader_a, graph_train_dataloader_b), desc='Iteration', total=len(train_dataloader)): model.train() batch = tuple(t.to(args.device) for t in batch) data_a, data_b = data_a.to(args.device), data_b.to(args.device) loss_fcts = { 'mse': F.mse_loss, 'smooth_l1': F.smooth_l1_loss, 'l1': F.l1_loss } loss_fct = loss_fcts[args.loss_fct] loss_combined = 0.0 for mode in ['medsts', 'medsts_c', 'medsts_type']: torch.cuda.empty_cache() logits = model(batch[0], batch[1], batch[2], data_a, data_b, mode=mode) if mode == 'medsts': loss = loss_fct(logits, data_a.label) elif mode == 'medsts_c': loss = F.cross_entropy(logits, data_a.label_c) elif mode == 'medsts_type': loss = F.cross_entropy(logits, data_a.label_type) loss_combined += loss loss_combined.backward() if args.clip == 'all': torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.text_encoder.parameters(), args.max_grad_norm) tr_loss += loss_combined.item() bert_optimizer.step() graph_optimizer.step() linear_optimizer.step() linear_c_optimizer.step() linear_type_optimizer.step() scheduler.step() graph_scheduler.step() linear_scheduler.step() linear_c_scheduler.step() linear_type_scheduler.step() # print('learning rate: {} \t graph optimizer lr: {}'.format(linear_optimizer.param_groups[0]['lr'], graph_optimizer.param_groups[0]['lr'])) model.zero_grad() global_step += 1 args.logging_steps = len(train_dataloader) // 4 if args.logging_steps > 0 and global_step % args.logging_steps == 0: result = evaluate(args, model, tokenizer) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training # model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) # logger.info("Saving model checkpoint to %s", output_dir) # result = evaluate(args, model, tokenizer) tb_writer.close() return global_step, tr_loss / global_step
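# NOTE: `get_bert_param_groups` is called in the setup above but not defined
# in this excerpt. A plausible sketch, assuming it applies the usual no-decay
# split to the encoder's named parameters; the exact rules are an assumption.
def get_bert_param_groups(model, args):
    no_decay = ('bias', 'LayerNorm.weight')
    params = list(model.named_parameters())
    return [
        {'params': [p for n, p in params
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in params
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]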
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument("--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument("--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument("--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run testing.") parser.add_argument("--predict_eval", action='store_true', help="Whether to predict eval set.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--evaluate_during_training", action='store_true', help="Run evaluation during training at each logging step.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.") parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument("--eval_all_checkpoints", action='store_true', help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number") parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument('--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") parser.add_argument('--fp16_opt_level', type=str, default='O1', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") parser.add_argument("--freeze", default=0, type=int, required=False, help="freeze bert.") parser.add_argument("--not_do_eval_steps", default=0.35, type=float, help="not_do_eval_steps.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) # Prepare model model = BertForSequenceClassification_last3embedding_cls.from_pretrained(args.model_name_or_path,args,config=config) if args.fp16: model.half()
model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length,args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size//args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # otherwise it produces None grads that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps//args.gradient_accumulation_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc=0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps),total=num_train_optimization_steps) train_dataloader=cycle(train_dataloader) # Run an initial evaluation on the dev set first for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = 
SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = {'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) model.train() for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) nb_tr_examples += input_ids.size(0) del input_ids, input_mask, segment_ids, label_ids if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss=round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4) bar.set_description("loss {}".format(train_loss)) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and step>num_train_optimization_steps*args.not_do_eval_steps and (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0: for file in ['dev.csv']: inference_labels=[] gold_labels=[] inference_logits=[] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training = True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,args.split_num,False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels=np.concatenate(gold_labels,0) inference_logits=np.concatenate(inference_logits,0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = {'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*'*80) writer.write('\n') if eval_accuracy>best_acc and 'dev' in file: print("="*80) print("Best F1",eval_accuracy) print("Saving Model......") best_acc=eval_accuracy # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("="*80) else: print("="*80) if args.do_test: del model gc.collect() args.do_train=False model = BertForSequenceClassification_last3embedding_cls.from_pretrained(os.path.join(args.output_dir, "pytorch_model.bin"),args,config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file,flag in [('dev.csv','dev'),('test.csv','test')]: inference_labels=[] gold_labels=[] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training = False) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,args.split_num,False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels=np.concatenate(gold_labels,0) logits=np.concatenate(inference_labels,0) print(flag, accuracy(logits, gold_labels)) if flag=='test': df=pd.read_csv(os.path.join(args.data_dir, file)) df['label_0']=logits[:,0] df['label_1']=logits[:,1] df['label_2']=logits[:,2] df[['id','label_0','label_1','label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"),index=False) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False) if args.predict_eval: del model gc.collect() args.do_train = False model = BertForSequenceClassification_last3embedding_cls.from_pretrained(os.path.join(args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP 
except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False)
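# NOTE: the dev/test scoring above calls an `accuracy` helper that is not
# shown in this excerpt. A minimal sketch consistent with how it is called
# (raw logits vs. integer gold labels); the original may differ, and note
# that its value is logged under the name "eval_F1" above.
import numpy as np

def accuracy(out, labels):
    # argmax over the class dimension, then the fraction of exact matches
    preds = np.argmax(out, axis=1)
    return float((preds == labels).mean())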
def main(): parser = argparse.ArgumentParser() parser = add_xlmr_args(parser) args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory (%s) already exists and is not empty." % args.output_dir) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO, filename=os.path.join(args.output_dir, "log.txt")) logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) logger = logging.getLogger(__name__) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval and not args.prediction: raise ValueError( "At least one of `do_train`, `do_eval` or `prediction` must be set." ) processor = NerProcessor() label_list = processor.get_labels(args.data_dir) print(*label_list, sep="\n") num_labels = len(label_list) + 1 # add one for IGNORE label train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # preparing model configs hidden_size = 768 if 'base' in args.pretrained_path else 1024 # TODO: move this inside model.__init__ #device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu' device = "cuda:0" logger.info(device) # creating model model = XLMRForTokenClassification(pretrained_path=args.pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=args.dropout, device=device) model.to(device) no_decay = ['bias', 'final_layer_norm.weight'] params = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) # freeze model if necessary if args.freeze_model: logger.info("Freezing XLM-R model...") for n, p in model.named_parameters(): if 'xlmr' in n and p.requires_grad: p.requires_grad = False if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) #global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, model.encode_word) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = create_dataset(train_features) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # getting validation samples val_examples = processor.get_dev_examples(args.data_dir) val_features = convert_examples_to_features(val_examples, label_list, args.max_seq_length, model.encode_word) val_data = create_dataset(val_features) best_val_f1 = 0.0 for epoch_no in range(1, args.num_train_epochs + 1): logger.info("Epoch %d" % epoch_no) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 model.train() steps = len(train_dataloader) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, label_ids, l_mask, valid_ids, = batch loss = model(input_ids, label_ids, l_mask, valid_ids) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if step % 100 == 99: logger.info('Step = %d/%d; Loss = %.4f' % (step + 1, steps, tr_loss / (step + 1))) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() logger.info("\nTesting on validation set...") f1, report = evaluate_model(model, val_data, label_list, args.eval_batch_size, device) if f1 > best_val_f1: best_val_f1 = f1 logger.info( "\nFound better f1=%.4f on validation set. 
Saving model\n" % f1) logger.info("%s\n" % report) torch.save( model.state_dict(), open(os.path.join(args.output_dir, 'model.pt'), 'wb')) else: logger.info("\nNo better F1 score: {}\n".format(f1)) else: # load a saved model state_dict = torch.load( open(os.path.join(args.output_dir, 'model.pt'), 'rb')) model.load_state_dict(state_dict) logger.info("Loaded saved model") model.to(device) if args.do_eval: if args.eval_on == "dev": eval_examples = processor.get_dev_examples(args.data_dir) elif args.eval_on == "test": eval_examples = processor.get_test_examples(args.data_dir) else: raise ValueError("eval on dev or test set only") eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, model.encode_word) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_data = create_dataset(eval_features) f1_score, report = evaluate_model(model, eval_data, label_list, args.eval_batch_size, device) logger.info("\n%s", report) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Writing results to file *****") writer.write(report) logger.info("Done.") if args.prediction and args.eval_on: eval_examples = processor.get_test_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, model.encode_word) eval_data = create_dataset(eval_features) predictions = predict_model(model, eval_data, label_list, args.eval_batch_size, device) with codecs.open(args.prediction, "w", "utf8") as fout: for tokens, labels in zip(eval_examples, predictions): for token, label in zip(tokens.text_a.split(" "), labels): fout.write("%s\t%s\n" % (token, label)) fout.write("\n")
def train(args, train_dataloader, model_vae, encoder_tokenizer, decoder_tokenizer, table_name): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) # train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) # model_encoder, model_decoder, model_connector = model_vae.encoder, model_vae.decoder, model_vae.linear no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model_vae.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model_vae.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model_vae, optimizer = amp.initialize(model_vae, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model_vae = torch.nn.DataParallel(model_vae, device_ids=range(args.n_gpu)).to(args.device) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model_vae = torch.nn.parallel.DistributedDataParallel(model_vae, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", train_dataloader.num_examples) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model_vae.zero_grad()

    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])

    # Cyclical KL-weight (beta) schedule, one value per training iteration
    # (frange_cycle_zero_linear is sketched below)
    n_iter = int(args.num_train_epochs) * len(train_dataloader)
    beta_t_list = frange_cycle_zero_linear(n_iter, start=0.0, stop=args.beta,
                                           n_cycle=1,
                                           ratio_increase=args.ratio_increase,
                                           ratio_zero=args.ratio_zero)

    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            tokenized_text0, tokenized_text1, tokenized_text_lengths = batch

            # Skip batches with token ids outside either vocabulary
            # (">=" rather than ">": valid ids run from 0 to len(vocab) - 1)
            if (tokenized_text0 >= len(encoder_tokenizer)).sum().item() > 0 \
                    or (tokenized_text1 >= len(decoder_tokenizer)).sum().item() > 0:
                continue

            # Prepare input-output data for reconstruction
            inputs, labels = mask_tokens(tokenized_text0, encoder_tokenizer, args) \
                if args.mlm else (tokenized_text0, tokenized_text1)
            labels = tokenized_text1  # note: overrides the MLM labels above
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)

            model_vae.train()

            # Anneal the KL weight and pick the free-bits mode accordingly
            beta_t = beta_t_list[step + epoch * len(epoch_iterator)]
            model_vae.module.args.beta = beta_t
            if beta_t == 0.0:
                model_vae.module.args.fb_mode = 0
            else:
                model_vae.module.args.fb_mode = 1
            if args.use_deterministic_connect:
                model_vae.module.args.fb_mode = 2

            loss_rec, loss_kl, loss = model_vae(inputs, labels)

            if args.n_gpu > 1:
                loss_rec = loss_rec.mean()  # mean() to average on multi-gpu parallel training
                loss_kl = loss_kl.mean()
                loss = loss.mean()

            if args.use_philly:
                print("PROGRESS: {}%".format(
                    round(100 * (step + epoch * len(epoch_iterator)) /
                          (int(args.num_train_epochs) * len(epoch_iterator)), 4)))
                print("EVALERR: {}%".format(loss_rec))

            epoch_iterator.set_description(
                (f'iter: {step + epoch * len(epoch_iterator)}; loss: {loss.item():.3f}; '
                 f'loss_rec: {loss_rec.item():.3f}; loss_kl: {loss_kl.item():.3f}; '
                 f'beta: {model_vae.module.args.beta:.3f}'))

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                   args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_vae.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model_vae.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 \
                        and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model_vae, encoder_tokenizer, decoder_tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 \
                        and global_step % args.save_steps == 0:
                    # Save encoder model checkpoint
                    output_encoder_dir = os.path.join(
                        args.output_dir, 'checkpoint-encoder-{}'.format(global_step))
                    if not os.path.exists(output_encoder_dir):
                        os.makedirs(output_encoder_dir)
                    model_encoder_to_save = model_vae.module.encoder if hasattr(
                        model_vae, 'module') else model_vae.encoder  # Take care of distributed/parallel training
                    if args.use_philly:
                        # Philly storage can fail transiently; retry until the save succeeds
                        save_solid = False
                        while not save_solid:
                            try:
                                model_encoder_to_save.save_pretrained(output_encoder_dir)
                                torch.save(args, os.path.join(output_encoder_dir, 'training_args.bin'))
                                logger.info("Saving model checkpoint to %s", output_encoder_dir)
                                save_solid = True
                            except Exception:
                                pass
                    else:
                        model_encoder_to_save.save_pretrained(output_encoder_dir)
                        torch.save(args, os.path.join(output_encoder_dir, 'training_args.bin'))
                        logger.info("Saving model checkpoint to %s", output_encoder_dir)

                    # Save decoder model checkpoint
                    output_decoder_dir = os.path.join(
                        args.output_dir, 'checkpoint-decoder-{}'.format(global_step))
                    if not os.path.exists(output_decoder_dir):
                        os.makedirs(output_decoder_dir)
                    model_decoder_to_save = model_vae.module.decoder if hasattr(
                        model_vae, 'module') else model_vae.decoder  # Take care of distributed/parallel training
                    if args.use_philly:
                        save_solid = False
                        while not save_solid:
                            try:
                                model_decoder_to_save.save_pretrained(output_decoder_dir)
                                torch.save(args, os.path.join(output_decoder_dir, 'training_args.bin'))
                                logger.info("Saving model checkpoint to %s", output_decoder_dir)
                                save_solid = True
                            except Exception:
                                pass
                    else:
                        model_decoder_to_save.save_pretrained(output_decoder_dir)
                        torch.save(args, os.path.join(output_decoder_dir, 'training_args.bin'))
                        logger.info("Saving model checkpoint to %s", output_decoder_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
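# `frange_cycle_zero_linear` is called above but not defined in this section.
# A minimal sketch of a cyclical KL-weight schedule consistent with that call:
# each cycle holds beta at `start` for the first `ratio_zero` fraction of the
# cycle, increases it linearly to `stop` over the next `ratio_increase`
# fraction, then holds it at `stop`. This is an assumption about the helper,
# not necessarily the exact original implementation.
import numpy as np

def frange_cycle_zero_linear(n_iter, start=0.0, stop=1.0, n_cycle=4,
                             ratio_increase=0.5, ratio_zero=0.25):
    L = np.ones(n_iter) * stop
    period = n_iter / n_cycle
    step = (stop - start) / (period * ratio_increase)  # slope of the linear ramp
    for c in range(n_cycle):
        v, i = start, 0
        while v <= stop and int(i + c * period) < n_iter:
            if i < period * ratio_zero:
                L[int(i + c * period)] = start  # zero phase
            else:
                L[int(i + c * period)] = v      # linear-increase phase
                v += step
            i += 1
    return L  # remaining positions stay at `stop`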
def train(args, train_dataset, model, tokenizer):
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Log and save once per epoch
    args.logging_steps = len(train_dataloader)
    args.save_steps = len(train_dataloader)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    optimizer_grouped_parameters = get_bert_param_groups(model, args)  # see sketch below
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule after the optimizer step
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
    return global_step, tr_loss / global_step
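# `get_bert_param_groups` is referenced above but not defined in this file.
# A minimal sketch, assuming it reproduces the usual grouping that exempts
# bias and LayerNorm weights from weight decay (the same split that the
# commented-out code it replaced applied); the real helper may differ.
def get_bert_param_groups(model, args):
    no_decay = ['bias', 'LayerNorm.weight']
    return [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]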
def main():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    torch.set_num_threads(1)
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                        "Sequences longer than this will be truncated, and sequences shorter "
                        "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval or not.")
    parser.add_argument("--eval_on", default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=128, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=128, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=1e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=0.01, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. "
                        "Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = FormationProcessor()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples()
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model, num_labels=1,
                                        finetuning_task=args.task_name)
    model = formation_model.Formation_model.from_pretrained(args.bert_model,
                                                            from_tf=False, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train and args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer)
        logger.info("***** Reading Training Data *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        loss_test = nn.L1Loss()

        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples()
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples()
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer)
        logger.info("***** Reading Evaluation Data *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        for epoch in range(int(args.num_train_epochs)):
            epoch_start_time = time.time()
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                             attention_mask=input_mask, labels=label_ids)[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

            model.eval()
            with torch.no_grad():
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                y_true = []
                y_pred = []
                for step, batch in enumerate(eval_dataloader):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch
                    logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                   attention_mask=input_mask)[0]
                    batch_loss = loss_test(logits, label_ids)
                    eval_loss += batch_loss.item()
                    y_true.append(label_ids)
                    y_pred.append(logits)

            # Print the epoch summary
            print('[%03d/%03d] %2.2f sec(s) Train Loss: %3.6f | Val Loss: %3.6f' %
                  (epoch + 1, args.num_train_epochs, time.time() - epoch_start_time,
                   tr_loss / len(train_dataloader), eval_loss / len(eval_dataloader)))

        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": 1
        }
        json.dump(model_config,
                  open(os.path.join(args.output_dir, "model_config.json"), "w"))
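# `FormationProcessor` and `convert_examples_to_features` are defined elsewhere
# in the repo. A minimal sketch of the feature object the main() above relies
# on, with field names taken from the attribute accesses in that function
# (input_ids, input_mask, segment_ids, label_id); the real class may differ.
class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids      # token ids, padded to max_seq_length
        self.input_mask = input_mask    # 1 for real tokens, 0 for padding
        self.segment_ids = segment_ids  # token type ids
        self.label_id = label_id        # float target (num_labels=1 regression)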
def main():
    parser = argparse.ArgumentParser()
    parser = add_xlmr_args(parser)
    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = en_fr_processor()
    train_examples = processor.get_train_examples(args.data_dir)

    # Preparing model configs
    hidden_size = 768 if 'base' in args.pretrained_path else 1024  # TODO: move this inside model.__init__
    device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu'

    # Creating model
    model = XLMR_Encoder_Decoder(pretrained_path=args.pretrained_path,
                                 hidden_size=hidden_size,
                                 dropout_p=args.dropout,
                                 device=device)
    model.encoder.to(device)
    model.decoder.to(device)

    # named_parameters() returns a generator, so materialize both lists before concatenating
    params = list(model.encoder.named_parameters()) + list(model.decoder.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in params]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

    train_features = convert_examples_to_features(train_examples, args.max_seq_length,
                                                  model.encoder.encode_word)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)

    train_data = create_dataset(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Schedule over the true number of optimization steps (the original
    # placeholder of warmup_steps=1, t_total=1 would decay the LR to zero
    # after the first update)
    t_total = len(train_dataloader) * args.num_train_epochs
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=1, t_total=t_total)

    for _ in tqdm(range(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        tbar = tqdm(train_dataloader, desc="Iteration")
        model.encoder.train()
        for step, batch in enumerate(tbar):
            batch = tuple(t.to(device) for t in batch)
            src_tensor, target_tensor = batch
            enc_out = model.encoder(src_tensor)
            # NOTE: this loop is incomplete as written: there is no decoder
            # forward, no loss, and no backward pass, so optimizer.step() has
            # no gradients to apply. See the hedged sketch of a full step below.
            torch.nn.utils.clip_grad_norm_(model.encoder.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.encoder.zero_grad()
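# A minimal sketch of the decoder/loss/backward step missing from the loop
# above, under an assumed interface where `model.decoder` consumes the encoder
# output together with the gold target ids and returns per-token logits; the
# decoder signature and `pad_id` are assumptions, not confirmed by this file.
import torch
import torch.nn.functional as F

def training_step(model, optimizer, scheduler, src_tensor, target_tensor,
                  max_grad_norm, pad_id=1):
    enc_out = model.encoder(src_tensor)
    logits = model.decoder(enc_out, target_tensor)  # hypothetical signature
    loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                           target_tensor.view(-1),
                           ignore_index=pad_id)     # skip padding positions
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.encoder.parameters(), max_grad_norm)
    torch.nn.utils.clip_grad_norm_(model.decoder.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    return loss.item()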
def train(train_dataset, model, tokenizer):
    tb_writer = SummaryWriter()
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args['train_batch_size'])
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total)

    model.to(device)
    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])

    if torch.cuda.device_count() > 1:
        # No distributed process group is initialized here, so plain
        # DataParallel is the applicable multi-GPU wrapper
        model = torch.nn.DataParallel(model)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")
    for _ in train_iterator:
        epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                      'labels': batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if isinstance(model, torch.nn.DataParallel):
                loss = loss.mean()  # average the per-GPU losses
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule after the optimizer step
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['evaluate_during_training']:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args['logging_steps'],
                                         global_step)
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

    tb_writer.close()
    return global_step, tr_loss / global_step
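# This notebook-style train() reads its hyperparameters from a module-level
# `args` dict (and a module-level `device`) rather than argparse. A sketch of
# the keys it expects, all taken from the lookups in the function above; the
# values here are illustrative only.
args = {
    'train_batch_size': 32,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 3,
    'weight_decay': 0.01,
    'learning_rate': 5e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'model_type': 'bert',
    'max_grad_norm': 1.0,
    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 1000,
    'output_dir': './outputs',
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')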
def train(self):
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()

    config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=3)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(self.model_name_or_path,
                                                          self.args, config=config)
    model.to(self.device)

    num_train_optimization_steps = self.train_steps

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps,
                                     t_total=self.train_steps)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", self.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    best_acc = 0
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    train_dataloader = cycle(train_dataloader)

    for step in range(num_train_optimization_steps):
        batch = next(train_dataloader)
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                     attention_mask=input_mask, labels=label_ids)
        tr_loss += loss.item()
        train_loss = round(tr_loss / (nb_tr_steps + 1), 4)
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

        loss.backward()
        if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate schedule after the optimizer step
            optimizer.zero_grad()
            global_step += 1

        if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

        if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            for file in ['dev.csv']:
                inference_labels = []
                gold_labels = []
                inference_logits = []
                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", self.eval_batch_size)
                # Run prediction for full data
                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)
                    with torch.no_grad():
                        tmp_eval_loss = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask,
                                              labels=label_ids)
                        logits = model(input_ids=input_ids,
                                       token_type_ids=segment_ids,
                                       attention_mask=input_mask)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    inference_labels.append(np.argmax(logits, axis=1))
                    gold_labels.append(label_ids)
                    inference_logits.append(logits)
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                gold_labels = np.concatenate(gold_labels, 0)
                inference_logits = np.concatenate(inference_logits, 0)
                model.train()
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = self.accuracy(inference_logits, gold_labels)

                result = {'eval_loss': eval_loss,
                          'eval_F1': eval_accuracy,
                          'global_step': global_step,
                          'loss': train_loss}

                output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')

                if eval_accuracy > best_acc and 'dev' in file:
                    print("=" * 80)
                    print("Best F1", eval_accuracy)
                    print("Saving Model......")
                    best_acc = eval_accuracy
                    # Save a trained model
                    model_to_save = model.module if hasattr(
                        model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)

    if self.do_test:
        del model
        gc.collect()
        self.do_train = False
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(self.output_dir, "pytorch_model.bin"), self.args, config=config)
        model.to(self.device)

        for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = self.read_examples(os.path.join(self.data_dir, file),
                                               is_training=False)
            logger.info("  Num examples = %d", len(eval_examples))
            eval_features = self.convert_examples_to_features(eval_examples, self.tokenizer,
                                                              self.max_seq_length)
            all_input_ids = torch.tensor(self.select_field(eval_features, 'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(self.select_field(eval_features, 'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(self.select_field(eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=self.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(self.device)
                input_mask = input_mask.to(self.device)
                segment_ids = segment_ids.to(self.device)
                label_ids = label_ids.to(self.device)
                with torch.no_grad():
                    logits = model(input_ids=input_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)

            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            if flag == 'dev':
                print(flag, self.accuracy(logits, gold_labels))
            if flag == 'test':
                df = pd.read_csv(os.path.join(self.data_dir, file),
                                 names=['id', 'content', 'title', 'label'])
                predict = np.argmax(logits, axis=1).tolist()
                print(df.shape[0])
                print(len(predict))
                df['labelpre'] = predict
                df[['id', 'labelpre']].to_csv(os.path.join(self.output_dir, "sub.csv"),
                                              index=False, header=False)
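# `self.accuracy` is defined elsewhere in this class; its result is reported
# above under the key 'eval_F1'. A minimal sketch under the assumption that it
# computes macro F1 from raw logits via scikit-learn; the real helper may
# compute plain accuracy or a different F1 average.
from sklearn.metrics import f1_score

def accuracy(self, logits, labels):
    preds = np.argmax(logits, axis=1)  # predicted class = highest logit
    return f1_score(labels, preds, average='macro')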