def get_optimization(model, float16, learning_rate, total_steps, schedule,
                     warmup_rate, weight_decay_rate, max_grad_norm, opt_pooler=False):
    # Prepare optimizer
    assert 0.0 <= warmup_rate <= 1.0
    param_optimizer = list(model.named_parameters())
    # Hack to remove the pooler, which is not used;
    # otherwise it produces None grads that break apex.
    if opt_pooler is False:
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': weight_decay_rate
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    if float16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=max_grad_norm)
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = BERTAdam(params=optimizer_parameters,
                             lr=learning_rate,
                             warmup=warmup_rate,
                             max_grad_norm=max_grad_norm,
                             t_total=total_steps,
                             schedule=schedule,
                             weight_decay_rate=weight_decay_rate)
    return optimizer
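# Illustrative usage sketch (not from the original file): one way get_optimization
# might be called for an fp32 fine-tuning run. The model class, the 'warmup_linear'
# schedule name, and the hyperparameter values are placeholders/assumptions,
# not taken from this file.
#
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# optimizer = get_optimization(model,
#                              float16=False,
#                              learning_rate=3e-5,
#                              total_steps=10000,
#                              schedule='warmup_linear',
#                              warmup_rate=0.1,
#                              weight_decay_rate=0.01,
#                              max_grad_norm=1.0,
#                              opt_pooler=False)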
def main(args): device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) processor = DataProcessor() label_list = processor.get_labels(args.data_dir, args.negative_label) label2id = {label: i for i, label in enumerate(label_list)} id2label = {i: label for i, label in enumerate(label_list)} num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) special_tokens = {} if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir, tokenizer) eval_features = convert_examples_to_features( eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode) logger.info("***** Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_span_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) eval_label_ids = all_label_ids if args.do_train: train_examples = processor.get_train_examples(args.data_dir,tokenizer) train_features = convert_examples_to_features( train_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_span_ids = torch.tensor([f.span_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_span_ids) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * 
args.num_train_epochs logger.info("***** Training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_result = None eval_step = max(1, len(train_batches) // args.eval_per_epoch) lrs = [args.learning_rate] if args.learning_rate else \ [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5] for lr in lrs: model = BertForMTB.from_pretrained(args.model,model_name = args.model,num_labels=num_labels,examples = train_examples,mode = args.repre_mode) # BertForMTB.from_pretrained(args.model ,model_name = args.model,num_labels=num_labels,examples = train_examples,mode = args.repre_mode) # BertForSequenceClassification.from_pretrained( # args.model, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE), num_labels=num_labels,examples = train_examples) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: # from apex.optimizers import FP16_Optimizer # from apex.optimizers import FusedAdam from apex.contrib.optimizers import FP16_Optimizer from apex.contrib.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex" "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) start_time = time.time() global_step = 0 tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) if args.train_mode == 'random' or args.train_mode == 'random_sorted': random.shuffle(train_batches) for step, batch in enumerate(train_batches): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, span_ids = batch model_output = model(input_ids, input_mask, segment_ids,span_ids ,label_ids,output_attentions = True,output_hidden_states=True) print(model_output) loss = model_output.loss if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = lr * \ warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0: logger.info('Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'.format( epoch, step + 1, len(train_batches), time.time() - start_time, tr_loss / nb_tr_steps)) save_model = False if args.do_eval: preds, result = evaluate(model, device, eval_dataloader, 
eval_label_ids, num_labels) model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size logger.info("First 20 predictions:") for pred, label in zip(preds[:20], eval_label_ids.numpy()[:20]): sign = u'\u2713' if pred == label else u'\u2718' logger.info("pred = %s, label = %s %s" % (id2label[pred], id2label[label], sign)) if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]): best_result = result save_model = True logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.2f" % (args.eval_metric, str(lr), epoch, result[args.eval_metric] * 100.0)) else: save_model = True if save_model: model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) if best_result: output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_eval: if args.eval_test: eval_examples = processor.get_test_examples(args.data_dir, tokenizer) eval_features = convert_examples_to_features( eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode) logger.info("***** Test *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) eval_label_ids = all_label_ids model = BertForMTB.from_pretrained(args.model,model_name = args.model,num_labels=num_labels,examples = eval_examples,mode = args.repre_mode) # from_pretrained(args.output_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels) with open(os.path.join(args.output_dir, "predictions.txt"), "w") as f: for ex, pred in zip(eval_examples, preds): f.write("%s\t%s\n" % (ex.guid, id2label[pred])) with open(os.path.join(args.output_dir, "test_results.txt"), "w") as f: for key in sorted(result.keys()): f.write("%s = %s\n" % (key, str(result[key])))
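# warmup_linear() is used by the fp16 branch of the training loop above but is not
# defined in this excerpt. A minimal sketch, assuming the classic
# pytorch-pretrained-bert linear warmup / linear decay helper; the repo's own
# implementation may differ (e.g., it may clamp the decayed value at zero).
def _warmup_linear_sketch(x, warmup=0.002):
    """x is the fraction of training completed (global_step / total_steps)."""
    if x < warmup:
        return x / warmup   # ramp linearly up to the base learning rate
    return 1.0 - x          # then decay linearly

# Example: lr_this_step = lr * _warmup_linear_sketch(global_step / num_train_optimization_steps, 0.1)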
class Seq2SeqTrainer: """ Seq2SeqTrainer """ def __init__(self, model, criterion, opt_config, print_freq=10, save_freq=1000, grad_clip=float('inf'), batch_first=False, save_info={}, save_path='.', train_iterations=0, checkpoint_filename='checkpoint%s.pth', keep_checkpoints=5, math='fp32', loss_scaling={}, cuda=True, distributed=False, distributed_overlap_allreduce=False, distributed_overlap_num_allreduce_streams=1, distributed_overlap_allreduce_messagesize=1e7, distributed_overlap_allreduce_communicators=None, intra_epoch_eval=0, prealloc_mode='always', iter_size=1, verbose=False, args=None): """ Constructor for the Seq2SeqTrainer. :param model: model to train :param criterion: criterion (loss function) :param opt_config: dictionary with options for the optimizer :param print_freq: prints short summary every 'print_freq' iterations :param save_freq: saves checkpoint every 'save_freq' iterations :param grad_clip: coefficient for gradient clipping :param batch_first: if True the model uses (batch,seq,feature) tensors, if false the model uses (seq, batch, feature) :param save_info: dict with additional state stored in each checkpoint :param save_path: path to the directiory for checkpoints :param train_iterations: total number of training iterations to execute :param checkpoint_filename: name of files with checkpoints :param keep_checkpoints: max number of checkpoints to keep :param math: arithmetic type :param loss_scaling: options for dynamic loss scaling :param cuda: if True use cuda, if False train on cpu :param distributed: if True run distributed training :param intra_epoch_eval: number of additional eval runs within each training epoch :param prealloc_mode: controls preallocation, choices=['off', 'once', 'always'] :param iter_size: number of iterations between weight updates :param verbose: enables verbose logging """ super(Seq2SeqTrainer, self).__init__() self.model = model self.criterion = criterion self.epoch = 0 self.save_info = save_info self.save_path = save_path self.save_freq = save_freq self.save_counter = 0 self.checkpoint_filename = checkpoint_filename self.checkpoint_counter = cycle(range(keep_checkpoints)) self.opt_config = opt_config self.cuda = cuda self.distributed = distributed self.print_freq = print_freq self.batch_first = batch_first self.verbose = verbose self.loss = None self.translator = None self.scheduler = None self.intra_epoch_eval = intra_epoch_eval self.iter_size = iter_size self.prealloc_mode = prealloc_mode self.preallocated = False # Assume multi-tensor apply if with APEX DDP self.args = args self.use_mt = (distributed and iter_size == 1 and \ opt_config['optimizer'] == 'FusedAdam') # Use APEX gradient average if gradient accumulation option enabled self.retain_allreduce_buffers = True if iter_size == 1 else False self.gradient_average = False if iter_size == 1 else True if cuda: self.model = self.model.cuda() self.criterion = self.criterion.cuda() params = self.model.parameters() if math == 'fp16': self.model = self.model.half() if distributed and self.args.distributed_weight_update != 2: self.model = DDP(self.model, message_size=distributed_overlap_allreduce_messagesize, delay_allreduce=(not distributed_overlap_allreduce), num_allreduce_streams=distributed_overlap_num_allreduce_streams, allreduce_communicators=distributed_overlap_allreduce_communicators, retain_allreduce_buffers=self.retain_allreduce_buffers, gradient_average=self.gradient_average) if self.args.distributed_weight_update == 2: # gradient clipping maintained by DistributedFusedAdam 
self.fp_optimizer = DwuFp16Optimizer( self.model, loss_scale=loss_scaling['init_scale'], dls_upscale_interval=loss_scaling['upscale_interval'] ) params = list(self.model.parameters()) else: self.fp_optimizer = Fp16Optimizer( self.model, grad_clip, use_mt=self.use_mt, loss_scale=loss_scaling['init_scale'], dls_upscale_interval=loss_scaling['upscale_interval'] ) params = self.fp_optimizer.fp32_params if isinstance(self.fp_optimizer.fp32_params, list) \ else [self.fp_optimizer.fp32_params] elif math == 'fp32': if distributed: self.model = DDP(self.model, message_size=distributed_overlap_allreduce_messagesize, delay_allreduce=(not distributed_overlap_allreduce)) self.fp_optimizer = Fp32Optimizer(self.model, grad_clip) # params = self.model.parameters() opt_name = opt_config.pop('optimizer') if opt_name == 'FusedAdam': if math == 'fp16' or math == 'fp32': if self.args.distributed_weight_update == 2: dwu_args = self.distributed_weight_update_config self.optimizer = DistributedFusedAdam(params, max_grad_norm=grad_clip, **dwu_args, **opt_config) self.optimizer.set_global_scale(1.0) # used for grad norm clipping in step function else: # Maintain grad norm and scaling by ourselves self.optimizer = FusedAdam(params, use_mt=self.use_mt, **opt_config) else: self.optimizer = FusedAdam(params, use_mt=self.use_mt, max_grad_norm=grad_clip, amp_scale_adjustment=get_world_size(), **opt_config) else: self.optimizer = torch.optim.__dict__[opt_name](params, **opt_config) logging.info(f'Using optimizer: {self.optimizer}') log_event(key=constants.OPT_NAME, value=constants.ADAM, sync=False) log_event(key=constants.OPT_BASE_LR, value=opt_config['lr'], sync=False) log_event(key=constants.OPT_ADAM_BETA_1, value=self.optimizer.defaults['betas'][0], sync=False) log_event(key=constants.OPT_ADAM_BETA_2, value=self.optimizer.defaults['betas'][1], sync=False) log_event(key=constants.OPT_ADAM_EPSILON, value=self.optimizer.defaults['eps'], sync=False) @property def distributed_weight_update_config(self): """ Return a kwarg dictionary that provides arguments for the distributed weight update feature. """ return { 'dwu_group_size': self.args.dwu_group_size, 'dwu_num_blocks': self.args.dwu_num_blocks, 'dwu_num_chunks': self.args.dwu_num_chunks, 'dwu_num_rs_pg': self.args.dwu_num_rs_pg, 'dwu_num_ar_pg': self.args.dwu_num_ar_pg, 'dwu_num_ag_pg': self.args.dwu_num_ag_pg, 'overlap_reductions': self.args.dwu_overlap_reductions, 'full_pipeline': self.args.dwu_full_pipeline, 'compute_L2_grad_norm': self.args.dwu_grad_norm, 'e5m2_allgather': self.args.dwu_e5m2_allgather, 'predivide': False, 'flat_mt': True, } def iterate(self, src, tgt, update=True, training=True): """ Performs one iteration of the training/validation. 
:param src: batch of examples from the source language :param tgt: batch of examples from the target language :param update: if True: optimizer does update of the weights :param training: if True: executes optimizer """ src, src_length = src tgt, tgt_length = tgt src_length = torch.LongTensor(src_length) tgt_length = torch.LongTensor(tgt_length) num_toks = {} num_toks['tgt'] = int(sum(tgt_length - 1)) num_toks['src'] = int(sum(src_length)) if self.cuda: src = src.cuda(non_blocking=True) tgt = tgt.cuda(non_blocking=True) if self.batch_first: output = self.model(src, src_length, tgt[:, :-1]) tgt_labels = tgt[:, 1:] T, B = output.size(1), output.size(0) else: output = self.model(src, src_length, tgt[:-1]) tgt_labels = tgt[1:] T, B = output.size(0), output.size(1) loss = self.criterion(output.view(T * B, -1), tgt_labels.contiguous().view(-1)) loss_per_batch = torch.empty((1), dtype=torch.float, device='cpu', requires_grad=False, pin_memory=True) loss_per_batch.copy_(loss, non_blocking=True) loss /= (B * self.iter_size) if training: self.fp_optimizer.step(loss, self.optimizer, self.scheduler, update) loss_per_batch = loss_per_batch.item() loss_per_token = loss_per_batch / num_toks['tgt'] loss_per_sentence = loss_per_batch / B return loss_per_token, loss_per_sentence, num_toks def feed_data(self, data_loader, training=True): """ Runs training or validation on batches from data_loader. :param data_loader: data loader :param training: if True runs training else runs validation """ if training: assert self.optimizer is not None eval_fractions = np.linspace(0, 1, self.intra_epoch_eval+2)[1:-1] iters_with_update = len(data_loader) // self.iter_size eval_iters = (eval_fractions * iters_with_update).astype(int) eval_iters = eval_iters * self.iter_size eval_iters = set(eval_iters) batch_time = AverageMeter(skip_first=False) data_time = AverageMeter(skip_first=False) losses_per_token = AverageMeter(skip_first=False) losses_per_sentence = AverageMeter(skip_first=False) tot_tok_time = AverageMeter(skip_first=False) src_tok_time = AverageMeter(skip_first=False) tgt_tok_time = AverageMeter(skip_first=False) batch_size = data_loader.batch_size end = time.time() for i, (src, tgt) in enumerate(data_loader): self.save_counter += 1 # measure data loading time data_time.update(time.time() - end) update = False if i % self.iter_size == self.iter_size - 1: update = True # do a train/evaluate iteration stats = self.iterate(src, tgt, update, training=training) loss_per_token, loss_per_sentence, num_toks = stats # measure accuracy and record loss losses_per_token.update(loss_per_token, num_toks['tgt']) losses_per_sentence.update(loss_per_sentence, batch_size) # measure elapsed time elapsed = time.time() - end batch_time.update(elapsed) src_tok_time.update(num_toks['src'] / elapsed) tgt_tok_time.update(num_toks['tgt'] / elapsed) tot_num_toks = num_toks['tgt'] + num_toks['src'] tot_tok_time.update(tot_num_toks / elapsed) self.loss = losses_per_token.avg if training and i in eval_iters: assert self.translator is not None test_bleu, _ = self.translator.run(calc_bleu=True, epoch=self.epoch, iteration=i) log = [] log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]'] log += [f'BLEU: {test_bleu:.2f}'] log = '\t'.join(log) logging.info(log) self.model.train() self.preallocate(data_loader.batch_size, data_loader.dataset.max_len, training=True) if i % self.print_freq == 0: phase = 'TRAIN' if training else 'VALIDATION' log = [] log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]'] log += [f'Time {batch_time.val:.3f} 
({batch_time.avg:.3f})'] log += [f'Data {data_time.val:.2e} ({data_time.avg:.2e})'] log += [f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})'] if self.verbose: log += [f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})'] log += [f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})'] log += [f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})'] log += [f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})'] if training: lr = self.optimizer.param_groups[0]['lr'] log += [f'LR {lr:.3e}'] log = '\t'.join(log) logging.info(log) save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1) if training and save_chkpt: self.save_counter = 0 self.save_info['iteration'] = i identifier = next(self.checkpoint_counter, -1) if identifier != -1: with sync_workers() as rank: if rank == 0: self.save(identifier=identifier) end = time.time() tot_tok_time.reduce('sum') losses_per_token.reduce('mean') return losses_per_token.avg, tot_tok_time.avg def preallocate(self, batch_size, max_length, training): """ Generates maximum sequence length batch and runs forward and backward pass without updating model parameters. :param batch_size: batch size for preallocation :param max_length: max sequence length for preallocation :param training: if True preallocates memory for backward pass """ if self.prealloc_mode == 'always' or (self.prealloc_mode == 'once' and not self.preallocated): logging.info('Executing preallocation') torch.cuda.empty_cache() src_length = [max_length] * batch_size tgt_length = [max_length] * batch_size if self.batch_first: shape = (batch_size, max_length) else: shape = (max_length, batch_size) src = torch.full(shape, 4, dtype=torch.int64) tgt = torch.full(shape, 4, dtype=torch.int64) src = src, src_length tgt = tgt, tgt_length self.iterate(src, tgt, update=False, training=training) self.model.zero_grad() self.preallocated = True def optimize(self, data_loader): """ Sets model in training mode, preallocates memory and runs training on data provided by data_loader. :param data_loader: data loader """ torch.set_grad_enabled(True) self.model.train() self.preallocate(data_loader.batch_size, data_loader.dataset.max_len, training=True) output = self.feed_data(data_loader, training=True) self.model.zero_grad() return output def evaluate(self, data_loader): """ Sets model in eval mode, disables gradients, preallocates memory and runs validation on data provided by data_loader. :param data_loader: data loader """ torch.set_grad_enabled(False) self.model.eval() self.preallocate(data_loader.batch_size, data_loader.dataset.max_len, training=False) output = self.feed_data(data_loader, training=False) self.model.zero_grad() return output def load(self, filename): """ Loads checkpoint from filename. 
:param filename: path to the checkpoint file """ if os.path.isfile(filename): checkpoint = torch.load(filename, map_location={'cuda:0': 'cpu'}) if self.distributed: self.model.module.load_state_dict(checkpoint['state_dict']) else: self.model.load_state_dict(checkpoint['state_dict']) self.fp_optimizer.initialize_model(self.model) self.optimizer.load_state_dict(checkpoint['optimizer']) assert self.scheduler is not None self.scheduler.load_state_dict(checkpoint['scheduler']) self.epoch = checkpoint['epoch'] self.loss = checkpoint['loss'] logging.info(f'Loaded checkpoint {filename} (epoch {self.epoch})') else: logging.error(f'Invalid checkpoint: {filename}') def save(self, identifier=None, is_best=False, save_all=False): """ Stores checkpoint to a file. :param identifier: identifier for periodic checkpoint :param is_best: if True stores checkpoint to 'model_best.pth' :param save_all: if True stores checkpoint after completed training epoch """ def write_checkpoint(state, filename): filename = os.path.join(self.save_path, filename) logging.info(f'Saving model to {filename}') torch.save(state, filename) if self.distributed: model_state = self.model.module.state_dict() else: model_state = self.model.state_dict() assert self.scheduler is not None state = { 'epoch': self.epoch, 'state_dict': model_state, 'optimizer': self.optimizer.state_dict(), 'scheduler': self.scheduler.state_dict(), 'loss': getattr(self, 'loss', None), } state = dict(list(state.items()) + list(self.save_info.items())) if identifier is not None: filename = self.checkpoint_filename % identifier write_checkpoint(state, filename) if is_best: filename = 'model_best.pth' write_checkpoint(state, filename) if save_all: filename = f'checkpoint_epoch_{self.epoch:03d}.pth' write_checkpoint(state, filename)
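# Illustrative usage sketch (not part of the original file): wiring up Seq2SeqTrainer
# for plain fp32, single-GPU training. The model, criterion, scheduler, data loaders,
# and args object are placeholders, and the concrete option values are assumptions.
#
# trainer = Seq2SeqTrainer(model=model,
#                          criterion=criterion,
#                          opt_config={'optimizer': 'Adam', 'lr': 1e-3},
#                          grad_clip=5.0,
#                          save_path='results/checkpoints',
#                          math='fp32',
#                          cuda=True,
#                          distributed=False,
#                          args=args)
# trainer.scheduler = scheduler   # save()/load() assume a scheduler is attached
# train_loss, train_tok_per_sec = trainer.optimize(train_loader)
# val_loss, _ = trainer.evaluate(val_loader)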
def _prepare_optimizer(self, learning_rate, loss_scale, warmup_proportion,
                       num_train_optimization_steps):
    """Initialize the optimizer

    Arguments:
        learning_rate {float} -- The initial learning rate for Adam
        loss_scale {float} -- Loss scaling to improve fp16 numeric stability.
            Only used when fp16 is set to True.
            0 (default value): dynamic loss scaling.
            Positive power of 2: static loss scaling value.
        warmup_proportion {float} -- Proportion of training to perform linear
            learning rate warmup for. E.g., 0.1 = 10% of training.
        num_train_optimization_steps {int} -- Number of optimization steps

    Returns:
        (Optimizer, scheduler or None) -- The optimizer and warmup schedule to use while training
    """
    param_optimizer = list(self.model.named_parameters())
    # Hack to remove the pooler, which is not used;
    # otherwise it produces None grads that break apex.
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if self.fp16:
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
        # get_linear_schedule_with_warmup expects a warmup step count, not a proportion.
        warmup_linear = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(warmup_proportion * num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)
    else:
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=learning_rate,
        )
        warmup_linear = None
    return optimizer, warmup_linear
    'weight_decay': 0.0
}]

logger.info("Loading Apex and building the FusedAdam optimizer")
if fp16:
    try:
        from apex.contrib.optimizers import FP16_Optimizer, FusedAdam
    except ImportError:
        raise ImportError(
            "To use distributed and fp16 training, please install apex from the branch "
            "bertonazureml/apex at https://www.github.com/microsoft/apex."
        )
    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=job_config.get_learning_rate(),
                          bias_correction=False,
                          max_grad_norm=1.0)
    if loss_scale == 0:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
else:
    optimizer = AdamW(optimizer_grouped_parameters, lr=job_config.get_learning_rate())

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=job_config.get_warmup_proportion(),
    num_training_steps=job_config.get_total_training_steps())

global_step = 0
start_epoch = 0
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--train_data_dir", default="", type=str, # required=True, help="The input train corpus.", ) parser.add_argument( "--val_data_dir", default="", type=str, # required=True, help="The input val corpus.", ) parser.add_argument( "--from_pretrained", default="", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, # required=True, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_interbert.json", type=str, # required=True, help="The config file which specified the model details.", ) ## Other parameters parser.add_argument( "--max_seq_length", default=36, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.", ) parser.add_argument("--predict_feature", action="store_true", help="visual target.") parser.add_argument( "--train_batch_size", default=512, type=int, help="Total batch size for training.", ) parser.add_argument( "--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.", ) parser.add_argument( "--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--start_epoch", default=0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--continue_training", action="store_true", help="if we need to continue a stopped pretraining procedure, add this" ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--img_weight", default=1, type=float, help="weight for image loss") parser.add_argument("--itm_weight", default=1, type=float, help="weight for itm loss") parser.add_argument("--text_weight", default=1, type=float, help="weight for text loss") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", type=bool, default=True, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=3, help="Number of workers in the dataloader.", ) parser.add_argument( "--save_name", default='', type=str, help="save name for training.", ) parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of the model need to fixed.") parser.add_argument("--distributed", action="store_true", help="whether use chunck for parallel training.") parser.add_argument("--without_coattention", action="store_true", help="whether pair loss.") parser.add_argument("--span_mask", action="store_true", help="whether to use span_masking.") parser.add_argument("--cond_mask", action="store_true", help="Whether to use conditional masking method.") parser.add_argument("--dynamic_masking", action="store_true", help="whether to use dynamic masking") args = parser.parse_args() print(args) if args.save_name is not '': timeStamp = args.save_name else: timeStamp = strftime("%d-%b-%y-%X-%a", gmtime()) timeStamp += "_{:0>6d}".format(random.randint(0, 10e6)) savePath = os.path.join(args.output_dir, timeStamp) if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if args.freeze > config.t_biattention_id[0]: config.fixed_t_layer = config.t_biattention_id[0] if args.without_coattention: config.with_coattention = False # # save all the hidden parameters. # with open(os.path.join(savePath, 'command.txt'), 'w') as f: # print(args, file=f) # Python 3.x # print('\n', file=f) # print(config, file=f) bert_weight_name = json.load( open("config/" + "bert-base-uncased_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) num_train_optimization_steps = None viz = TBlogger("logs", timeStamp) train_dataset = ConceptCapLoaderTrain(args.train_data_dir, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, predict_feature=args.predict_feature, num_workers=args.num_workers, distributed=args.distributed, span_mask=args.span_mask, cond_mask=args.cond_mask) validation_dataset = ConceptCapLoaderVal( args.val_data_dir, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, predict_feature=args.predict_feature, num_workers=2, distributed=args.distributed, span_mask=args.span_mask, cond_mask=args.cond_mask) if args.continue_training: assert args.start_epoch > 0 # must have pretrained at least one epoch num_train_optimization_steps = ( int(train_dataset.num_dataset / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs) if args.cond_mask: num_train_optimization_steps *= 2 finished_steps = ( int(train_dataset.num_dataset / args.train_batch_size / args.gradient_accumulation_steps) * args.start_epoch) if args.cond_mask: finished_steps *= 2 else: num_train_optimization_steps = ( int(train_dataset.num_dataset / args.train_batch_size / args.gradient_accumulation_steps) * (args.num_train_epochs - args.start_epoch)) if args.cond_mask: num_train_optimization_steps *= 2 finished_steps = 0 default_gpu = False if dist.is_available() and args.distributed: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: # save all the hidden parameters. with open(os.path.join(savePath, 'command.txt'), 'w') as f: print(args, file=f) # Python 3.x print('\n', file=f) print(config, file=f) # pdb.set_trace() if args.predict_feature: config.v_target_size = 2048 config.predict_feature = True else: config.v_target_size = 1601 config.predict_feature = False if args.from_pretrained: if args.continue_training: ckpt_load_path = os.path.join( args.from_pretrained, "pytorch_model_{}.bin".format(int(args.start_epoch) - 1)) model = InterBertForMultiModalPreTraining.from_pretrained( ckpt_load_path, config) else: model = InterBertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config) else: model = InterBertForMultiModalPreTraining(config) model.cuda() if args.fp16: model.half() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if 'embeddings' in name: bert_weight_name_filtered.append(name) elif 'encoder' in name: layer_num = name.split('.')[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) if not args.from_pretrained: param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] else: optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if key[12:] in bert_weight_name: lr = args.learning_rate * 0.1 else: lr = args.learning_rate if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) # set different parameters for vision branch and lanugage branch. if args.fp16: try: from apex.contrib.optimizers import FP16_Optimizer from apex.contrib.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam( optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0, ) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: if args.from_pretrained: optimizer = BertAdam( optimizer_grouped_parameters, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, ) else: optimizer = BertAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, ) if args.continue_training: opt_state_dict_path = os.path.join( args.from_pretrained, "optimizer_state_{}.bin".format(int(args.start_epoch) - 1)) optimizer.load_state_dict( torch.load(opt_state_dict_path, map_location='cpu')) logger.info("***** Running training *****") logger.info(" Num examples = %d", train_dataset.num_dataset) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps - finished_steps) startIterID = 0 global_step = finished_steps masked_loss_v_tmp = 0 masked_loss_t_tmp = 0 next_sentence_loss_tmp = 0 loss_tmp = 0 start_t = timer() for epochId in range(int(args.start_epoch), int(args.num_train_epochs)): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # iter_dataloader = iter(train_dataloader) for step, batch in enumerate(train_dataset): iterId = startIterID + step + (epochId * len(train_dataset)) # batch = iter_dataloader.next() batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask, multimodal_mask, image_ids = ( batch) masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, multimodal_mask, lm_label_ids, image_label, image_target, is_next, ) if args.without_coattention: next_sentence_loss = next_sentence_loss * 0 masked_loss_v = masked_loss_v * args.img_weight next_sentence_loss = next_sentence_loss * args.itm_weight loss = masked_loss_t * args.text_weight + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if math.isnan(loss.item()): pdb.set_trace() tr_loss += loss.item() rank = 0 if dist.is_available() and args.distributed: rank = dist.get_rank() else: rank = 0 viz.linePlot(iterId, loss.item(), "loss_" + str(rank), "train") viz.linePlot(iterId, masked_loss_t.item(), "masked_loss_t_" + str(rank), "train") viz.linePlot(iterId, masked_loss_v.item(), "masked_loss_v_" + str(rank), "train") viz.linePlot(iterId, next_sentence_loss.item(), "next_sentence_loss_" + str(rank), "train") # viz.linePlot(iterId, optimizer.get_lr()[0], 'learning_rate', 'train') loss_tmp += loss.item() masked_loss_v_tmp += masked_loss_v.item() masked_loss_t_tmp += masked_loss_t.item() next_sentence_loss_tmp += next_sentence_loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion, ) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if step % 20 == 0 and step != 0: masked_loss_t_tmp = masked_loss_t_tmp / 20.0 masked_loss_v_tmp = masked_loss_v_tmp / 20.0 next_sentence_loss_tmp = next_sentence_loss_tmp / 20.0 loss_tmp = loss_tmp / 20.0 end_t = timer() timeStamp = strftime("%a %d %b %y %X", gmtime()) Ep = epochId + nb_tr_steps / float(len(train_dataset)) printFormat = "[%s][Ep: %.2f][Iter: %d][Time: %5.2fs][Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g][LR: %.8g]" printInfo = [ timeStamp, Ep, nb_tr_steps, end_t - start_t, loss_tmp, masked_loss_v_tmp, masked_loss_t_tmp, next_sentence_loss_tmp, optimizer.get_lr()[0], ] start_t = end_t print(printFormat % tuple(printInfo)) masked_loss_v_tmp = 0 masked_loss_t_tmp = 0 next_sentence_loss_tmp = 0 loss_tmp = 0 # Do the evaluation torch.set_grad_enabled(False) start_t = timer() numBatches = len(validation_dataset) eval_masked_loss_t = 0 eval_masked_loss_v = 0 eval_next_sentence_loss = 0 eval_total_loss = 0 model.eval() for step, batch in enumerate(validation_dataset): batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask, multimodal_mask, image_ids = ( batch) masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, multimodal_mask, lm_label_ids, image_label, image_target, is_next, ) masked_loss_v = masked_loss_v * args.img_weight loss = masked_loss_t + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() eval_masked_loss_t += masked_loss_t.item() eval_masked_loss_v += masked_loss_v.item() eval_next_sentence_loss += next_sentence_loss.item() eval_total_loss += loss.item() end_t = timer() delta_t = " Time: %5.2fs" % (end_t - start_t) start_t = end_t progressString = "\r Evaluating split '%s' [%d/%d]\t" + delta_t sys.stdout.write(progressString % ('val', step + 1, numBatches)) sys.stdout.flush() eval_masked_loss_t = eval_masked_loss_t / float(numBatches) eval_masked_loss_v = eval_masked_loss_v / float(numBatches) eval_next_sentence_loss = eval_next_sentence_loss / float(numBatches) eval_total_loss = eval_total_loss / float(numBatches) printFormat = "Evaluation: [Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g]" printInfo = [ eval_total_loss, eval_masked_loss_v, eval_masked_loss_t, eval_next_sentence_loss ] print(printFormat % tuple(printInfo)) torch.set_grad_enabled(True) viz.linePlot(epochId, eval_total_loss, "loss_" + str(rank), "val") viz.linePlot(epochId, eval_masked_loss_t, "masked_loss_t_" + str(rank), "val") viz.linePlot(epochId, eval_masked_loss_v, "masked_loss_v_" + str(rank), "val") viz.linePlot(epochId, eval_next_sentence_loss, "next_sentence_loss_" + str(rank), "val") if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = ( model.module if hasattr(model, "module") else model ) # Only save the model it-self output_model_file = os.path.join( savePath, "pytorch_model_" + str(epochId) + ".bin") torch.save(model_to_save.state_dict(), output_model_file) output_opt_state_dict_file = os.path.join( savePath, "optimizer_state_" + str(epochId) + ".bin") torch.save(optimizer.state_dict(), output_opt_state_dict_file) if args.dynamic_masking and epochId + 1 < int(args.num_train_epochs): del train_dataset gc.collect() train_dataset = ConceptCapLoaderTrain( args.train_data_dir, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, predict_feature=args.predict_feature, num_workers=args.num_workers, distributed=args.distributed, span_mask=args.span_mask)
def main(opts): if opts.local_rank == -1: assert torch.cuda.is_available() device = torch.device("cuda") n_gpu = 1 else: torch.cuda.set_device(opts.local_rank) device = torch.device("cuda", opts.local_rank) # Initializes the distributed backend which will take care of # sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') n_gpu = torch.distributed.get_world_size() logger.info("device: {} n_gpu: {}, distributed training: {}, " "16-bits training: {}".format( device, n_gpu, bool(opts.local_rank != -1), opts.fp16)) opts.n_gpu = n_gpu if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) is_master = opts.local_rank == -1 or torch.distributed.get_rank() == 0 # if is_master: # save_training_meta(opts) random.seed(opts.seed) np.random.seed(opts.seed) torch.manual_seed(opts.seed) if n_gpu > 0: torch.cuda.manual_seed_all(opts.seed) tokenizer = BertTokenizer.from_pretrained( opts.bert_model, do_lower_case='uncased' in opts.bert_model) # train_examples = None print("Loading Train Dataset", opts.train_file) vocab_dump = torch.load(opts.vocab_file) print ('vocab dump', vocab_dump) # vocab = vocab_dump['tgt'].fields[0][1].vocab.stoi vocab = vocab_dump['src'].fields[0][1].vocab.stoi # f = open("./vocab_test_tgt.txt", "w") # f.write(str(vocab)) # f.close() # f = open("./vocab_bert.txt", "w") # f.write(str(tokenizer.get_vocab())) # f.close() train_dataset = BertDataset(opts.train_file, tokenizer, vocab, seq_len=opts.max_seq_length, max_len=opts.max_sent_length) print ('train dataset',train_dataset[0]) # Prepare model print ('len train dataset',len(train_dataset)) model = BertForSeq2seq.from_pretrained(opts.bert_model) embedding = convert_embedding( tokenizer, vocab, model.bert.embeddings.word_embeddings.weight) # changing the output embedding layer to have words from new vocab instead of old bert vocab? 
model.update_output_layer(embedding) if opts.fp16: model.half() model.to(device) if opts.local_rank != -1: # need to make sure models are the same in the beginning params = [p.data for p in model.parameters()] broadcast_tensors(params) for name, module in model.named_modules(): # we might want to tune dropout for smaller dataset if isinstance(module, torch.nn.Dropout): module.p = opts.dropout # Prepare optimizer param_optimizer = [(n, p) for n, p in model.named_parameters() if 'pooler' not in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if opts.fp16: try: from apex.contrib.optimizers import FP16_Optimizer from apex.contrib.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from " "https://www.github.com/nvidia/apex " "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=opts.learning_rate, bias_correction=False, max_grad_norm=1.0) if opts.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=opts.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=opts.learning_rate) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", opts.train_batch_size) logger.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) logger.info(" Num steps = %d", opts.num_train_steps) if opts.local_rank == -1: train_sampler = TokenBucketSampler( train_dataset.lens, bucket_size=8192, batch_size=opts.train_batch_size, droplast=True) train_dataloader = DataLoader(train_dataset, batch_sampler=train_sampler, num_workers=4, collate_fn=BertDataset.pad_collate) else: train_sampler = DistributedTokenBucketSampler( n_gpu, opts.local_rank, train_dataset.lens, bucket_size=8192, batch_size=opts.train_batch_size, droplast=True) train_dataloader = DataLoader(train_dataset, batch_sampler=train_sampler, num_workers=4, collate_fn=BertDataset.pad_collate) if is_master: TB_LOGGER.create(join(opts.output_dir, 'log')) running_loss = RunningMeter('loss') model.train() if is_master: pbar = tqdm(total=opts.num_train_steps) else: logger.disabled = True pbar = None n_examples = 0 n_epoch = 0 start = time() while True: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) if t is not None else t for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch n_examples += input_ids.size(0) mask = lm_label_ids != -1 loss = model(input_ids, segment_ids, input_mask, lm_label_ids, mask, True) if opts.fp16: optimizer.backward(loss) else: loss.backward() running_loss(loss.item()) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 if opts.fp16: # modify learning rate with special warm up BERT uses # if opts.fp16 is False, AdamW is used that handles # this automatically lr_this_step = opts.learning_rate * warmup_linear( global_step/opts.num_train_steps, opts.warmup_proportion) if lr_this_step < 0: # save guard for possible miscalculation of train steps lr_this_step = 1e-8 for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step TB_LOGGER.add_scalar('lr', lr_this_step, global_step) # NOTE running loss not gathered across GPUs for speed 
TB_LOGGER.add_scalar('loss', running_loss.val, global_step) TB_LOGGER.step() if opts.local_rank != -1: # gather gradients from every processes grads = [p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None] all_reduce_and_rescale_tensors(grads, float(1)) optimizer.step() optimizer.zero_grad() if pbar is not None: pbar.update(1) if global_step % 5 == 0: torch.cuda.empty_cache() if global_step % 100 == 0: if opts.local_rank != -1: total = sum(all_gather_list(n_examples)) else: total = n_examples if is_master: ex_per_sec = int(total / (time()-start)) logger.info(f'{total} examples trained at ' f'{ex_per_sec} ex/s') TB_LOGGER.add_scalar('ex_per_s', ex_per_sec, global_step) if global_step % opts.valid_steps == 0: logger.info(f"start validation at Step {global_step}") with torch.no_grad(): val_log = validate(model, opts.valid_src, opts.valid_tgt, tokenizer, vocab, device, opts.local_rank) logger.info(f"Val Acc: {val_log['val_acc']}; " f"Val Loss: {val_log['val_loss']}") TB_LOGGER.log_scaler_dict(val_log) if is_master: output_model_file = join( opts.output_dir, 'ckpt', f"model_step_{global_step}.pt") # save cpu checkpoint state_dict = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in model.state_dict().items()} torch.save(state_dict, output_model_file) if global_step >= opts.num_train_steps: break if global_step >= opts.num_train_steps: break n_epoch += 1 if is_master: logger.info(f"finished {n_epoch} epochs") if opts.num_train_steps % opts.valid_steps != 0: with torch.no_grad(): val_log = validate(model, opts.valid_src, opts.valid_tgt, tokenizer, vocab, device, opts.local_rank) TB_LOGGER.log_scaler_dict(val_log) if is_master: output_model_file = join(opts.output_dir, 'ckpt', f"model_step_{global_step}.pt") # save cpu checkpoint state_dict = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in model.state_dict().items()} torch.save(model.state_dict(), output_model_file)
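# Minimal self-contained sketch (not from the original files) of the
# accumulate-then-step pattern used by the training loops above: gradients are
# accumulated over `accum_steps` micro-batches, the learning rate is set by hand
# from a linear warmup/decay schedule, then the optimizer steps once.
# `model`, `batches`, and `compute_loss` are placeholders; fp16 runs call
# optimizer.backward(loss) on the FP16_Optimizer wrapper instead of loss.backward().
import torch

def train_steps(model, batches, compute_loss, base_lr=1e-4, accum_steps=2,
                total_steps=1000, warmup=0.1):
    optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr)
    global_step = 0
    for step, batch in enumerate(batches):
        loss = compute_loss(model, batch) / accum_steps   # scale loss for accumulation
        loss.backward()
        if (step + 1) % accum_steps == 0:
            x = global_step / total_steps
            lr = base_lr * (x / warmup if x < warmup else max(0.0, 1.0 - x))
            for group in optimizer.param_groups:          # manual LR update, as in the loops above
                group['lr'] = lr
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
    return global_step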