def get_optimization(model,
                     float16,
                     learning_rate,
                     total_steps,
                     schedule,
                     warmup_rate,
                     weight_decay_rate,
                     max_grad_norm,
                     opt_pooler=False):
    # Prepare optimizer
    assert 0.0 <= warmup_rate <= 1.0
    param_optimizer = list(model.named_parameters())

    # Hack: drop the pooler, which is unused and therefore produces a None
    # grad that breaks apex.
    if not opt_pooler:
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay_rate': weight_decay_rate,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0,
        },
    ]
    if float16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=max_grad_norm)
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = BERTAdam(params=optimizer_parameters,
                             lr=learning_rate,
                             warmup=warmup_rate,
                             max_grad_norm=max_grad_norm,
                             t_total=total_steps,
                             schedule=schedule,
                             weight_decay_rate=weight_decay_rate)

    return optimizer
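
A minimal usage sketch, assuming BERTAdam is importable in this module; the toy model and hyperparameter values are placeholders rather than recommended settings:

# Hypothetical fp32 call; any nn.Module with named parameters will do here.
import torch

toy_model = torch.nn.Linear(768, 2)
optimizer = get_optimization(toy_model,
                             float16=False,              # fp32 path -> BERTAdam
                             learning_rate=3e-5,
                             total_steps=1000,
                             schedule='warmup_linear',   # schedule name is an assumption
                             warmup_rate=0.1,
                             weight_decay_rate=0.01,
                             max_grad_norm=1.0)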
Example #2
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if args.do_train:
        logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w'))
    else:
        logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w'))
    logger.info(args)
    logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format(
        device, n_gpu, args.fp16))

    processor = DataProcessor()
    label_list = processor.get_labels(args.data_dir, args.negative_label)
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    num_labels = len(label_list)
    tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case)

    special_tokens = {}
    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir, tokenizer)
        eval_features = convert_examples_to_features(
            eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode)
        logger.info("***** Dev *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_span_ids)
        eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
        eval_label_ids = all_label_ids

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir, tokenizer)
        train_features = convert_examples_to_features(
                train_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode)

        if args.train_mode == 'sorted' or args.train_mode == 'random_sorted':
            train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask))
        else:
            random.shuffle(train_features)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_span_ids = torch.tensor([f.span_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_span_ids)
        train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size)
        train_batches = list(train_dataloader)

        num_train_optimization_steps = \
            len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        logger.info("***** Training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_result = None
        eval_step = max(1, len(train_batches) // args.eval_per_epoch)
        lrs = [args.learning_rate] if args.learning_rate else \
            [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5]
        for lr in lrs:
            model = BertForMTB.from_pretrained(args.model,
                                               model_name=args.model,
                                               num_labels=num_labels,
                                               examples=train_examples,
                                               mode=args.repre_mode)
            # Alternative:
            # BertForSequenceClassification.from_pretrained(
            #     args.model, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE),
            #     num_labels=num_labels, examples=train_examples)
            if args.fp16:
                model.half()
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)

            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
            if args.fp16:
                try:
                    # from apex.optimizers import FP16_Optimizer
                    # from apex.optimizers import FusedAdam
                    from apex.contrib.optimizers import FP16_Optimizer
                    from apex.contrib.optimizers import FusedAdam
                except ImportError:
                    raise ImportError("Please install apex from https://www.github.com/nvidia/apex"
                                      "to use distributed and fp16 training.")

                optimizer = FusedAdam(optimizer_grouped_parameters,
                                      lr=lr,
                                      bias_correction=False,
                                      max_grad_norm=1.0)
                if args.loss_scale == 0:
                    optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
                else:
                    optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

            else:
                optimizer = BertAdam(optimizer_grouped_parameters,
                                     lr=lr,
                                     warmup=args.warmup_proportion,
                                     t_total=num_train_optimization_steps)

            start_time = time.time()
            global_step = 0
            tr_loss = 0
            nb_tr_examples = 0
            nb_tr_steps = 0
            for epoch in range(int(args.num_train_epochs)):
                model.train()
                logger.info("Start epoch #{} (lr = {})...".format(epoch, lr))
                if args.train_mode == 'random' or args.train_mode == 'random_sorted':
                    random.shuffle(train_batches)
                for step, batch in enumerate(train_batches):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids, span_ids = batch
                    model_output = model(input_ids, input_mask, segment_ids, span_ids, label_ids,
                                         output_attentions=True, output_hidden_states=True)
                    loss = model_output.loss
                    if n_gpu > 1:
                        loss = loss.mean()
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            lr_this_step = lr * \
                                warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                    if (step + 1) % eval_step == 0:
                        logger.info('Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'.format(
                                     epoch, step + 1, len(train_batches),
                                     time.time() - start_time, tr_loss / nb_tr_steps))
                        save_model = False
                        if args.do_eval:
                            preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels)
                            model.train()
                            result['global_step'] = global_step
                            result['epoch'] = epoch
                            result['learning_rate'] = lr
                            result['batch_size'] = args.train_batch_size
                            logger.info("First 20 predictions:")
                            for pred, label in zip(preds[:20], eval_label_ids.numpy()[:20]):
                                sign = u'\u2713' if pred == label else u'\u2718'
                                logger.info("pred = %s, label = %s %s" % (id2label[pred], id2label[label], sign))
                            if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]):
                                best_result = result
                                save_model = True
                                logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.2f" %
                                            (args.eval_metric, str(lr), epoch, result[args.eval_metric] * 100.0))
                        else:
                            save_model = True

                        if save_model:
                            model_to_save = model.module if hasattr(model, 'module') else model
                            output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                            torch.save(model_to_save.state_dict(), output_model_file)
                            model_to_save.config.to_json_file(output_config_file)
                            tokenizer.save_vocabulary(args.output_dir)
                            if best_result:
                                output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                                with open(output_eval_file, "w") as writer:
                                    for key in sorted(result.keys()):
                                        writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_eval:
        if args.eval_test:
            eval_examples = processor.get_test_examples(args.data_dir, tokenizer)
            eval_features = convert_examples_to_features(
                eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode)
            logger.info("***** Test *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_span_ids)
            eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
            eval_label_ids = all_label_ids
        model = BertForMTB.from_pretrained(args.model,
                                           model_name=args.model,
                                           num_labels=num_labels,
                                           examples=eval_examples,
                                           mode=args.repre_mode)
        # Alternative: from_pretrained(args.output_dir, num_labels=num_labels)
        if args.fp16:
            model.half()
        model.to(device)
        preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels)
        with open(os.path.join(args.output_dir, "predictions.txt"), "w") as f:
            for ex, pred in zip(eval_examples, preds):
                f.write("%s\t%s\n" % (ex.guid, id2label[pred]))
        with open(os.path.join(args.output_dir, "test_results.txt"), "w") as f:
            for key in sorted(result.keys()):
                f.write("%s = %s\n" % (key, str(result[key])))
Example #3
    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 train_iterations=0,
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 loss_scaling={},
                 cuda=True,
                 distributed=False,
                 distributed_overlap_allreduce=False,
                 distributed_overlap_num_allreduce_streams=1,
                 distributed_overlap_allreduce_messagesize=1e7,
                 distributed_overlap_allreduce_communicators=None,
                 intra_epoch_eval=0,
                 prealloc_mode='always',
                 iter_size=1,
                 verbose=False,
                 args=None):
        """
        Constructor for the Seq2SeqTrainer.

        :param model: model to train
        :param criterion: criterion (loss function)
        :param opt_config: dictionary with options for the optimizer
        :param print_freq: prints short summary every 'print_freq' iterations
        :param save_freq: saves checkpoint every 'save_freq' iterations
        :param grad_clip: coefficient for gradient clipping
        :param batch_first: if True the model uses (batch, seq, feature) tensors,
            if False the model uses (seq, batch, feature)
        :param save_info: dict with additional state stored in each checkpoint
        :param save_path: path to the directory for checkpoints
        :param train_iterations: total number of training iterations to execute
        :param checkpoint_filename: name of files with checkpoints
        :param keep_checkpoints: max number of checkpoints to keep
        :param math: arithmetic type
        :param loss_scaling: options for dynamic loss scaling
        :param cuda: if True use cuda, if False train on cpu
        :param distributed: if True run distributed training
        :param intra_epoch_eval: number of additional eval runs within each
            training epoch
        :param prealloc_mode: controls preallocation,
            choices=['off', 'once', 'always']
        :param iter_size: number of iterations between weight updates
        :param verbose: enables verbose logging
        """
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.translator = None
        self.scheduler = None
        self.intra_epoch_eval = intra_epoch_eval
        self.iter_size = iter_size
        self.prealloc_mode = prealloc_mode
        self.preallocated = False
        
        # Assume multi-tensor apply when running with APEX DDP
        self.args = args
        self.use_mt = (distributed and iter_size == 1 and
                       opt_config['optimizer'] == 'FusedAdam')

        # Use APEX gradient averaging only when gradient accumulation is enabled
        self.retain_allreduce_buffers = iter_size == 1
        self.gradient_average = iter_size != 1

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        params = self.model.parameters()
        if math == 'fp16':
            self.model = self.model.half()
            if distributed and self.args.distributed_weight_update != 2:
                self.model = DDP(self.model,
                                 message_size=distributed_overlap_allreduce_messagesize,
                                 delay_allreduce=(not distributed_overlap_allreduce),
                                 num_allreduce_streams=distributed_overlap_num_allreduce_streams,
                                 allreduce_communicators=distributed_overlap_allreduce_communicators,
                                 retain_allreduce_buffers=self.retain_allreduce_buffers,
                                 gradient_average=self.gradient_average)

            if self.args.distributed_weight_update == 2:
                # gradient clipping maintained by DistributedFusedAdam
                self.fp_optimizer = DwuFp16Optimizer(
                    self.model,
                    loss_scale=loss_scaling['init_scale'],
                    dls_upscale_interval=loss_scaling['upscale_interval']
                    )
                params = list(self.model.parameters())
            else:
                self.fp_optimizer = Fp16Optimizer(
                    self.model, grad_clip,
                    use_mt=self.use_mt,
                    loss_scale=loss_scaling['init_scale'],
                    dls_upscale_interval=loss_scaling['upscale_interval']
                    )
                params = self.fp_optimizer.fp32_params if isinstance(self.fp_optimizer.fp32_params, list) \
                    else [self.fp_optimizer.fp32_params]
        elif math == 'fp32':
            if distributed:
                self.model = DDP(self.model,
                                 message_size=distributed_overlap_allreduce_messagesize,
                                 delay_allreduce=(not distributed_overlap_allreduce))
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            # params = self.model.parameters()

        opt_name = opt_config.pop('optimizer')
        if opt_name == 'FusedAdam':
            if math == 'fp16' or math == 'fp32':
                if self.args.distributed_weight_update == 2:
                    dwu_args = self.distributed_weight_update_config
                    self.optimizer = DistributedFusedAdam(params, max_grad_norm=grad_clip,
                                                          **dwu_args, **opt_config)
                    self.optimizer.set_global_scale(1.0) # used for grad norm clipping in step function
                else:
                    # Maintain grad norm and scaling by ourselves
                    self.optimizer = FusedAdam(params, use_mt=self.use_mt, **opt_config)
            else:
                self.optimizer = FusedAdam(params, use_mt=self.use_mt, max_grad_norm=grad_clip,
                                           amp_scale_adjustment=get_world_size(), **opt_config)
        else:
            self.optimizer = torch.optim.__dict__[opt_name](params,
                                                            **opt_config)
        logging.info(f'Using optimizer: {self.optimizer}')

        log_event(key=constants.OPT_NAME,
                  value=constants.ADAM, sync=False)
        log_event(key=constants.OPT_BASE_LR,
                  value=opt_config['lr'], sync=False)
        log_event(key=constants.OPT_ADAM_BETA_1,
                  value=self.optimizer.defaults['betas'][0], sync=False)
        log_event(key=constants.OPT_ADAM_BETA_2,
                  value=self.optimizer.defaults['betas'][1], sync=False)
        log_event(key=constants.OPT_ADAM_EPSILON,
                  value=self.optimizer.defaults['eps'], sync=False)
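
A hedged construction sketch: the constructor above expects opt_config to carry an 'optimizer' name plus that optimizer's keyword arguments, and loss_scaling to carry 'init_scale' and 'upscale_interval' when math='fp16'. The model and criterion below are placeholders.

# Hypothetical single-process fp32 setup; my_model and my_criterion are placeholders.
opt_config = {'optimizer': 'Adam', 'lr': 2e-3, 'betas': (0.9, 0.999), 'eps': 1e-8}
trainer = Seq2SeqTrainer(model=my_model,
                         criterion=my_criterion,
                         opt_config=opt_config,
                         grad_clip=5.0,
                         math='fp32',
                         cuda=False,          # keep the sketch CPU-only
                         distributed=False)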
Example #4
class Seq2SeqTrainer:
    """
    Seq2SeqTrainer
    """
    def __init__(self,
                 model,
                 criterion,
                 opt_config,
                 print_freq=10,
                 save_freq=1000,
                 grad_clip=float('inf'),
                 batch_first=False,
                 save_info={},
                 save_path='.',
                 train_iterations=0,
                 checkpoint_filename='checkpoint%s.pth',
                 keep_checkpoints=5,
                 math='fp32',
                 loss_scaling={},
                 cuda=True,
                 distributed=False,
                 distributed_overlap_allreduce=False,
                 distributed_overlap_num_allreduce_streams=1,
                 distributed_overlap_allreduce_messagesize=1e7,
                 distributed_overlap_allreduce_communicators=None,
                 intra_epoch_eval=0,
                 prealloc_mode='always',
                 iter_size=1,
                 verbose=False,
                 args=None):
        """
        Constructor for the Seq2SeqTrainer.

        :param model: model to train
        :param criterion: criterion (loss function)
        :param opt_config: dictionary with options for the optimizer
        :param print_freq: prints short summary every 'print_freq' iterations
        :param save_freq: saves checkpoint every 'save_freq' iterations
        :param grad_clip: coefficient for gradient clipping
        :param batch_first: if True the model uses (batch, seq, feature) tensors,
            if False the model uses (seq, batch, feature)
        :param save_info: dict with additional state stored in each checkpoint
        :param save_path: path to the directory for checkpoints
        :param train_iterations: total number of training iterations to execute
        :param checkpoint_filename: name of files with checkpoints
        :param keep_checkpoints: max number of checkpoints to keep
        :param math: arithmetic type
        :param loss_scaling: options for dynamic loss scaling
        :param cuda: if True use cuda, if False train on cpu
        :param distributed: if True run distributed training
        :param intra_epoch_eval: number of additional eval runs within each
            training epoch
        :param prealloc_mode: controls preallocation,
            choices=['off', 'once', 'always']
        :param iter_size: number of iterations between weight updates
        :param verbose: enables verbose logging
        """
        super(Seq2SeqTrainer, self).__init__()
        self.model = model
        self.criterion = criterion
        self.epoch = 0
        self.save_info = save_info
        self.save_path = save_path
        self.save_freq = save_freq
        self.save_counter = 0
        self.checkpoint_filename = checkpoint_filename
        self.checkpoint_counter = cycle(range(keep_checkpoints))
        self.opt_config = opt_config
        self.cuda = cuda
        self.distributed = distributed
        self.print_freq = print_freq
        self.batch_first = batch_first
        self.verbose = verbose
        self.loss = None
        self.translator = None
        self.scheduler = None
        self.intra_epoch_eval = intra_epoch_eval
        self.iter_size = iter_size
        self.prealloc_mode = prealloc_mode
        self.preallocated = False
        
        # Assume multi-tensor apply when running with APEX DDP
        self.args = args
        self.use_mt = (distributed and iter_size == 1 and
                       opt_config['optimizer'] == 'FusedAdam')

        # Use APEX gradient averaging only when gradient accumulation is enabled
        self.retain_allreduce_buffers = iter_size == 1
        self.gradient_average = iter_size != 1

        if cuda:
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()

        params = self.model.parameters()
        if math == 'fp16':
            self.model = self.model.half()
            if distributed and self.args.distributed_weight_update != 2:
                self.model = DDP(self.model,
                                 message_size=distributed_overlap_allreduce_messagesize,
                                 delay_allreduce=(not distributed_overlap_allreduce),
                                 num_allreduce_streams=distributed_overlap_num_allreduce_streams,
                                 allreduce_communicators=distributed_overlap_allreduce_communicators,
                                 retain_allreduce_buffers=self.retain_allreduce_buffers,
                                 gradient_average=self.gradient_average)

            if self.args.distributed_weight_update == 2:
                # gradient clipping maintained by DistributedFusedAdam
                self.fp_optimizer = DwuFp16Optimizer(
                    self.model,
                    loss_scale=loss_scaling['init_scale'],
                    dls_upscale_interval=loss_scaling['upscale_interval']
                    )
                params = list(self.model.parameters())
            else:
                self.fp_optimizer = Fp16Optimizer(
                    self.model, grad_clip,
                    use_mt=self.use_mt,
                    loss_scale=loss_scaling['init_scale'],
                    dls_upscale_interval=loss_scaling['upscale_interval']
                    )
                params = self.fp_optimizer.fp32_params if isinstance(self.fp_optimizer.fp32_params, list) \
                    else [self.fp_optimizer.fp32_params]
        elif math == 'fp32':
            if distributed:
                self.model = DDP(self.model,
                                 message_size=distributed_overlap_allreduce_messagesize,
                                 delay_allreduce=(not distributed_overlap_allreduce))
            self.fp_optimizer = Fp32Optimizer(self.model, grad_clip)
            # params = self.model.parameters()

        opt_name = opt_config.pop('optimizer')
        if opt_name == 'FusedAdam':
            if math == 'fp16' or math == 'fp32':
                if self.args.distributed_weight_update == 2:
                    dwu_args = self.distributed_weight_update_config
                    self.optimizer = DistributedFusedAdam(params, max_grad_norm=grad_clip,
                                                          **dwu_args, **opt_config)
                    self.optimizer.set_global_scale(1.0) # used for grad norm clipping in step function
                else:
                    # Maintain grad norm and scaling by ourselves
                    self.optimizer = FusedAdam(params, use_mt=self.use_mt, **opt_config)
            else:
                self.optimizer = FusedAdam(params, use_mt=self.use_mt, max_grad_norm=grad_clip,
                                           amp_scale_adjustment=get_world_size(), **opt_config)
        else:
            self.optimizer = torch.optim.__dict__[opt_name](params,
                                                            **opt_config)
        logging.info(f'Using optimizer: {self.optimizer}')

        log_event(key=constants.OPT_NAME,
                  value=constants.ADAM, sync=False)
        log_event(key=constants.OPT_BASE_LR,
                  value=opt_config['lr'], sync=False)
        log_event(key=constants.OPT_ADAM_BETA_1,
                  value=self.optimizer.defaults['betas'][0], sync=False)
        log_event(key=constants.OPT_ADAM_BETA_2,
                  value=self.optimizer.defaults['betas'][1], sync=False)
        log_event(key=constants.OPT_ADAM_EPSILON,
                  value=self.optimizer.defaults['eps'], sync=False)

    @property
    def distributed_weight_update_config(self):
        """
        Return a kwarg dictionary that provides arguments for the distributed
        weight update feature.
        """
        return {
            'dwu_group_size': self.args.dwu_group_size,
            'dwu_num_blocks': self.args.dwu_num_blocks,
            'dwu_num_chunks': self.args.dwu_num_chunks,
            'dwu_num_rs_pg': self.args.dwu_num_rs_pg,
            'dwu_num_ar_pg': self.args.dwu_num_ar_pg,
            'dwu_num_ag_pg': self.args.dwu_num_ag_pg,
            'overlap_reductions': self.args.dwu_overlap_reductions,
            'full_pipeline': self.args.dwu_full_pipeline,
            'compute_L2_grad_norm': self.args.dwu_grad_norm,
            'e5m2_allgather': self.args.dwu_e5m2_allgather,
            'predivide': False,
            'flat_mt': True,
        }

    def iterate(self, src, tgt, update=True, training=True):
        """
        Performs one iteration of the training/validation.

        :param src: batch of examples from the source language
        :param tgt: batch of examples from the target language
        :param update: if True: optimizer does update of the weights
        :param training: if True: executes optimizer
        """
        src, src_length = src
        tgt, tgt_length = tgt
        src_length = torch.LongTensor(src_length)
        tgt_length = torch.LongTensor(tgt_length)

        num_toks = {}
        num_toks['tgt'] = int(sum(tgt_length - 1))
        num_toks['src'] = int(sum(src_length))

        if self.cuda:
            src = src.cuda(non_blocking=True)
            tgt = tgt.cuda(non_blocking=True)

        if self.batch_first:
            output = self.model(src, src_length, tgt[:, :-1])
            tgt_labels = tgt[:, 1:]
            T, B = output.size(1), output.size(0)
        else:
            output = self.model(src, src_length, tgt[:-1])
            tgt_labels = tgt[1:]
            T, B = output.size(0), output.size(1)

        loss = self.criterion(output.view(T * B, -1),
                              tgt_labels.contiguous().view(-1))

        loss_per_batch = torch.empty((1), dtype=torch.float, device='cpu',
            requires_grad=False, pin_memory=True)
        loss_per_batch.copy_(loss, non_blocking=True)
        loss /= (B * self.iter_size)

        if training:
            self.fp_optimizer.step(loss, self.optimizer, self.scheduler,
                                   update)

        loss_per_batch = loss_per_batch.item()
        loss_per_token = loss_per_batch / num_toks['tgt']
        loss_per_sentence = loss_per_batch / B

        return loss_per_token, loss_per_sentence, num_toks

    def feed_data(self, data_loader, training=True):
        """
        Runs training or validation on batches from data_loader.

        :param data_loader: data loader
        :param training: if True runs training else runs validation
        """
        if training:
            assert self.optimizer is not None
            eval_fractions = np.linspace(0, 1, self.intra_epoch_eval+2)[1:-1]
            iters_with_update = len(data_loader) // self.iter_size
            eval_iters = (eval_fractions * iters_with_update).astype(int)
            eval_iters = eval_iters * self.iter_size
            eval_iters = set(eval_iters)

        batch_time = AverageMeter(skip_first=False)
        data_time = AverageMeter(skip_first=False)
        losses_per_token = AverageMeter(skip_first=False)
        losses_per_sentence = AverageMeter(skip_first=False)

        tot_tok_time = AverageMeter(skip_first=False)
        src_tok_time = AverageMeter(skip_first=False)
        tgt_tok_time = AverageMeter(skip_first=False)

        batch_size = data_loader.batch_size

        end = time.time()
        for i, (src, tgt) in enumerate(data_loader):
            self.save_counter += 1
            # measure data loading time
            data_time.update(time.time() - end)

            update = False
            if i % self.iter_size == self.iter_size - 1:
                update = True

            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, update, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats

            # measure accuracy and record loss
            losses_per_token.update(loss_per_token, num_toks['tgt'])
            losses_per_sentence.update(loss_per_sentence, batch_size)

            # measure elapsed time
            elapsed = time.time() - end
            batch_time.update(elapsed)
            src_tok_time.update(num_toks['src'] / elapsed)
            tgt_tok_time.update(num_toks['tgt'] / elapsed)
            tot_num_toks = num_toks['tgt'] + num_toks['src']
            tot_tok_time.update(tot_num_toks / elapsed)
            self.loss = losses_per_token.avg

            if training and i in eval_iters:
                assert self.translator is not None
                test_bleu, _ = self.translator.run(calc_bleu=True,
                                                   epoch=self.epoch,
                                                   iteration=i)

                log = []
                log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'BLEU: {test_bleu:.2f}']
                log = '\t'.join(log)
                logging.info(log)

                self.model.train()
                self.preallocate(data_loader.batch_size,
                                 data_loader.dataset.max_len, training=True)

            if i % self.print_freq == 0:
                phase = 'TRAIN' if training else 'VALIDATION'
                log = []
                log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
                log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
                log += [f'Data {data_time.val:.2e} ({data_time.avg:.2e})']
                log += [f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})']
                if self.verbose:
                    log += [f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})']
                    log += [f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})']
                    log += [f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})']
                log += [f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})']
                if training:
                    lr = self.optimizer.param_groups[0]['lr']
                    log += [f'LR {lr:.3e}']
                log = '\t'.join(log)
                logging.info(log)

            save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
            if training and save_chkpt:
                self.save_counter = 0
                self.save_info['iteration'] = i
                identifier = next(self.checkpoint_counter, -1)
                if identifier != -1:
                    with sync_workers() as rank:
                        if rank == 0:
                            self.save(identifier=identifier)

            end = time.time()

        tot_tok_time.reduce('sum')
        losses_per_token.reduce('mean')

        return losses_per_token.avg, tot_tok_time.avg

    def preallocate(self, batch_size, max_length, training):
        """
        Generates maximum sequence length batch and runs forward and backward
        pass without updating model parameters.

        :param batch_size: batch size for preallocation
        :param max_length: max sequence length for preallocation
        :param training: if True preallocates memory for backward pass
        """
        if self.prealloc_mode == 'always' or (self.prealloc_mode == 'once' and
                                              not self.preallocated):
            logging.info('Executing preallocation')
            torch.cuda.empty_cache()

            src_length = [max_length] * batch_size
            tgt_length = [max_length] * batch_size

            if self.batch_first:
                shape = (batch_size, max_length)
            else:
                shape = (max_length, batch_size)

            src = torch.full(shape, 4, dtype=torch.int64)
            tgt = torch.full(shape, 4, dtype=torch.int64)
            src = src, src_length
            tgt = tgt, tgt_length
            self.iterate(src, tgt, update=False, training=training)
            self.model.zero_grad()
            self.preallocated = True

    def optimize(self, data_loader):
        """
        Sets model in training mode, preallocates memory and runs training on
        data provided by data_loader.

        :param data_loader: data loader
        """
        torch.set_grad_enabled(True)
        self.model.train()
        self.preallocate(data_loader.batch_size, data_loader.dataset.max_len,
                         training=True)

        output = self.feed_data(data_loader, training=True)

        self.model.zero_grad()
        return output

    def evaluate(self, data_loader):
        """
        Sets model in eval mode, disables gradients, preallocates memory and
        runs validation on data provided by data_loader.

        :param data_loader: data loader
        """
        torch.set_grad_enabled(False)
        self.model.eval()
        self.preallocate(data_loader.batch_size, data_loader.dataset.max_len,
                         training=False)

        output = self.feed_data(data_loader, training=False)

        self.model.zero_grad()
        return output

    def load(self, filename):
        """
        Loads checkpoint from filename.

        :param filename: path to the checkpoint file
        """
        if os.path.isfile(filename):
            checkpoint = torch.load(filename, map_location={'cuda:0': 'cpu'})
            if self.distributed:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            self.fp_optimizer.initialize_model(self.model)
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            assert self.scheduler is not None
            self.scheduler.load_state_dict(checkpoint['scheduler'])
            self.epoch = checkpoint['epoch']
            self.loss = checkpoint['loss']
            logging.info(f'Loaded checkpoint {filename} (epoch {self.epoch})')
        else:
            logging.error(f'Invalid checkpoint: {filename}')

    def save(self, identifier=None, is_best=False, save_all=False):
        """
        Stores checkpoint to a file.

        :param identifier: identifier for periodic checkpoint
        :param is_best: if True stores checkpoint to 'model_best.pth'
        :param save_all: if True stores checkpoint after completed training
            epoch
        """

        def write_checkpoint(state, filename):
            filename = os.path.join(self.save_path, filename)
            logging.info(f'Saving model to {filename}')
            torch.save(state, filename)

        if self.distributed:
            model_state = self.model.module.state_dict()
        else:
            model_state = self.model.state_dict()

        assert self.scheduler is not None
        state = {
            'epoch': self.epoch,
            'state_dict': model_state,
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'loss': getattr(self, 'loss', None),
        }
        state = dict(list(state.items()) + list(self.save_info.items()))

        if identifier is not None:
            filename = self.checkpoint_filename % identifier
            write_checkpoint(state, filename)

        if is_best:
            filename = 'model_best.pth'
            write_checkpoint(state, filename)

        if save_all:
            filename = f'checkpoint_epoch_{self.epoch:03d}.pth'
            write_checkpoint(state, filename)
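
A hedged sketch of driving the trainer per epoch; the data loaders and scheduler are placeholders, assigned the way the class reads them (a scheduler must be set before save(), and a translator only if intra_epoch_eval > 0):

# Hypothetical training driver; train_loader, val_loader and my_scheduler are placeholders.
trainer.scheduler = my_scheduler        # consumed by fp_optimizer.step(), save() and load()
trainer.translator = None               # only required when intra_epoch_eval > 0

for epoch in range(num_epochs):
    trainer.epoch = epoch
    train_loss, train_tok_per_s = trainer.optimize(train_loader)
    val_loss, _ = trainer.evaluate(val_loader)
    trainer.save(save_all=True)         # writes checkpoint_epoch_{epoch:03d}.pth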
Example #5
    def _prepare_optimizer(self, learning_rate, loss_scale, warmup_proportion,
                           num_train_optimization_steps):
        """Initialize the optimizer

        Arguments:
            learning_rate {float} -- The initial learning rate for Adam
            loss_scale {float} -- Loss scaling to improve fp16 numeric
                stability. Only used when fp16 set to True.
                0 (default value): dynamic loss scaling.
                Positive power of 2: static loss scaling value.
            warmup_proportion {float} -- Proportion of training to perform
                                         linear learning rate warmup for.
                                         E.g., 0.1 = 10% of training
            num_train_optimization_steps {int} -- Number of optimization steps

        Returns:
            Optimizer -- The optimizer to use while training
        """
        param_optimizer = list(self.model.named_parameters())

        # Hack to remove the pooler, which is unused and thus produces a None
        # grad that breaks apex. Note: the filter is effectively disabled here;
        # this line just copies the parameter list.
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        if self.fp16:
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=loss_scale)

            # num_warmup_steps expects a step count, so convert the proportion
            warmup_linear = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=int(warmup_proportion * num_train_optimization_steps),
                num_training_steps=num_train_optimization_steps)

        else:
            optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=learning_rate,
            )
            warmup_linear = None

        return optimizer, warmup_linear
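
A hedged sketch of how the returned pair might be consumed in a training step; fine_tuner, batch, and the hyperparameter values are placeholders, and the scheduler is stepped only when the fp16 branch above actually created one:

# Hypothetical usage; fine_tuner is an instance of the class defining _prepare_optimizer.
optimizer, warmup_linear = fine_tuner._prepare_optimizer(
    learning_rate=3e-5,
    loss_scale=0,                       # 0 -> dynamic loss scaling when fp16
    warmup_proportion=0.1,
    num_train_optimization_steps=10000)

loss = fine_tuner.model(**batch)[0]     # placeholder forward pass
if fine_tuner.fp16:
    optimizer.backward(loss)            # FP16_Optimizer scales the loss internally
else:
    loss.backward()
optimizer.step()
if warmup_linear is not None:           # scheduler exists only in the fp16 branch
    warmup_linear.step()
optimizer.zero_grad()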
Example #6
        'weight_decay':
        0.0
    }]

    logger.info("Loading Apex and building the FusedAdam optimizer")

    if fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer, FusedAdam
        except ImportError:
            raise ImportError(
                "To use distributed and fp16 training, please install apex from the branch bertonazureml/apex at https://www.github.com/microsoft/apex."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=job_config.get_learning_rate(),
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=job_config.get_learning_rate())
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=job_config.get_warmup_proportion(),
            num_training_steps=job_config.get_total_training_steps())

    global_step = 0
    start_epoch = 0
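
For context, a hedged sketch of the per-step update each branch above implies; loss, fp16, and the gradient-accumulation details are placeholders:

# Hypothetical step logic matching the two branches above.
if fp16:
    optimizer.backward(loss)   # FP16_Optimizer scales the loss before backward
    optimizer.step()           # unscales grads; skips the update on overflow
    optimizer.zero_grad()
else:
    loss.backward()
    optimizer.step()           # plain AdamW update
    scheduler.step()           # advance the linear warmup/decay schedule
    optimizer.zero_grad()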
Example #7
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_dir",
        default="",
        type=str,
        # required=True,
        help="The input train corpus.",
    )
    parser.add_argument(
        "--val_data_dir",
        default="",
        type=str,
        # required=True,
        help="The input val corpus.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        # required=True,
        help=
        "The output directory where the model checkpoints will be written.",
    )

    parser.add_argument(
        "--config_file",
        default="config/bert_base_6layer_interbert.json",
        type=str,
        # required=True,
        help="The config file which specified the model details.",
    )
    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=36,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument("--predict_feature",
                        action="store_true",
                        help="visual target.")

    parser.add_argument(
        "--train_batch_size",
        default=512,
        type=int,
        help="Total batch size for training.",
    )
    parser.add_argument(
        "--learning_rate",
        default=1e-4,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--start_epoch",
        default=0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--continue_training",
        action="store_true",
        help="if we need to continue a stopped pretraining procedure, add this"
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--img_weight",
                        default=1,
                        type=float,
                        help="weight for image loss")
    parser.add_argument("--itm_weight",
                        default=1,
                        type=float,
                        help="weight for itm loss")
    parser.add_argument("--text_weight",
                        default=1,
                        type=float,
                        help="weight for text loss")
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        type=bool,
        default=True,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=3,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of the model need to fixed.")
    parser.add_argument("--distributed",
                        action="store_true",
                        help="whether use chunck for parallel training.")
    parser.add_argument("--without_coattention",
                        action="store_true",
                        help="whether pair loss.")
    parser.add_argument("--span_mask",
                        action="store_true",
                        help="whether to use span_masking.")
    parser.add_argument("--cond_mask",
                        action="store_true",
                        help="Whether to use conditional masking method.")
    parser.add_argument("--dynamic_masking",
                        action="store_true",
                        help="whether to use dynamic masking")
    args = parser.parse_args()

    print(args)
    if args.save_name != '':
        timeStamp = args.save_name
    else:
        timeStamp = strftime("%d-%b-%y-%X-%a", gmtime())
        timeStamp += "_{:0>6d}".format(random.randint(0, int(10e6)))

    savePath = os.path.join(args.output_dir, timeStamp)

    if not os.path.exists(savePath):
        os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)

    if args.freeze > config.t_biattention_id[0]:
        config.fixed_t_layer = config.t_biattention_id[0]

    if args.without_coattention:
        config.with_coattention = False
    # # save all the hidden parameters.
    # with open(os.path.join(savePath, 'command.txt'), 'w') as f:
    #     print(args, file=f)  # Python 3.x
    #     print('\n', file=f)
    #     print(config, file=f)

    bert_weight_name = json.load(
        open("config/" + "bert-base-uncased_weight_name.json", "r"))
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None

    viz = TBlogger("logs", timeStamp)

    train_dataset = ConceptCapLoaderTrain(args.train_data_dir,
                                          tokenizer,
                                          seq_len=args.max_seq_length,
                                          batch_size=args.train_batch_size,
                                          predict_feature=args.predict_feature,
                                          num_workers=args.num_workers,
                                          distributed=args.distributed,
                                          span_mask=args.span_mask,
                                          cond_mask=args.cond_mask)

    validation_dataset = ConceptCapLoaderVal(
        args.val_data_dir,
        tokenizer,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        predict_feature=args.predict_feature,
        num_workers=2,
        distributed=args.distributed,
        span_mask=args.span_mask,
        cond_mask=args.cond_mask)

    if args.continue_training:
        assert args.start_epoch > 0  # must have pretrained at least one epoch
        num_train_optimization_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) * args.num_train_epochs)
        if args.cond_mask:
            num_train_optimization_steps *= 2
        finished_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) * args.start_epoch)
        if args.cond_mask:
            finished_steps *= 2
    else:
        num_train_optimization_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) *
            (args.num_train_epochs - args.start_epoch))
        if args.cond_mask:
            num_train_optimization_steps *= 2
        finished_steps = 0

    default_gpu = False
    if dist.is_available() and args.distributed:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, 'command.txt'), 'w') as f:
            print(args, file=f)  # Python 3.x
            print('\n', file=f)
            print(config, file=f)

    # pdb.set_trace()
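    # Presumed meaning (assumption, not stated in this snippet): with
    # predict_feature the model regresses the 2048-d region feature vector,
    # otherwise it predicts a distribution over the detector's 1601 classes
    # (1600 object classes plus background).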
    if args.predict_feature:
        config.v_target_size = 2048
        config.predict_feature = True
    else:
        config.v_target_size = 1601
        config.predict_feature = False

    if args.from_pretrained:
        if args.continue_training:
            ckpt_load_path = os.path.join(
                args.from_pretrained,
                "pytorch_model_{}.bin".format(int(args.start_epoch) - 1))
            model = InterBertForMultiModalPreTraining.from_pretrained(
                ckpt_load_path, config)
        else:
            model = InterBertForMultiModalPreTraining.from_pretrained(
                args.from_pretrained, config)
    else:
        model = InterBertForMultiModalPreTraining(config)

    model.cuda()

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if 'embeddings' in name:
                bert_weight_name_filtered.append(name)
            elif 'encoder' in name:
                layer_num = name.split('.')[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
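            # key[12:] presumably strips a fixed 12-character prefix such as
            # "module.bert." so the remainder matches bert_weight_name entries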
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
    else:
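        # When starting from a pretrained checkpoint, pretrained BERT weights
        # get a reduced learning rate (0.1 * learning_rate) while newly added
        # weights use the full rate; each parameter becomes its own optimizer
        # group so lr and weight_decay can be set per parameter.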
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate

                if any(nd in key for nd in no_decay):
                    # no weight decay for biases and LayerNorm parameters
                    optimizer_grouped_parameters += [{
                        "params": [value],
                        "lr": lr,
                        "weight_decay": 0.0
                    }]
                else:
                    optimizer_grouped_parameters += [{
                        "params": [value],
                        "lr": lr,
                        "weight_decay": 0.01
                    }]
        if default_gpu:
            print(len(list(model.named_parameters())),
                  len(optimizer_grouped_parameters))

    # set different parameters for vision branch and language branch.
    if args.fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            bias_correction=False,
            max_grad_norm=1.0,
        )
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        if args.from_pretrained:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps,
            )
        else:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps,
            )

        if args.continue_training:
            opt_state_dict_path = os.path.join(
                args.from_pretrained,
                "optimizer_state_{}.bin".format(int(args.start_epoch) - 1))
            optimizer.load_state_dict(
                torch.load(opt_state_dict_path, map_location='cpu'))

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_dataset.num_dataset)
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d",
                num_train_optimization_steps - finished_steps)

    startIterID = 0
    global_step = finished_steps
    masked_loss_v_tmp = 0
    masked_loss_t_tmp = 0
    next_sentence_loss_tmp = 0
    loss_tmp = 0
    start_t = timer()

    for epochId in range(int(args.start_epoch), int(args.num_train_epochs)):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # iter_dataloader = iter(train_dataloader)
        for step, batch in enumerate(train_dataset):
            iterId = startIterID + step + (epochId * len(train_dataset))
            # batch = iter_dataloader.next()
            batch = tuple(
                t.cuda(device=device, non_blocking=True) for t in batch)

            (input_ids, input_mask, segment_ids, lm_label_ids, is_next,
             image_feat, image_loc, image_target, image_label, image_mask,
             multimodal_mask, image_ids) = batch

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                multimodal_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            if args.without_coattention:
                next_sentence_loss = next_sentence_loss * 0

            masked_loss_v = masked_loss_v * args.img_weight
            next_sentence_loss = next_sentence_loss * args.itm_weight
            loss = masked_loss_t * args.text_weight + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if math.isnan(loss.item()):
                pdb.set_trace()

            tr_loss += loss.item()

            if dist.is_available() and args.distributed:
                rank = dist.get_rank()
            else:
                rank = 0

            viz.linePlot(iterId, loss.item(), "loss_" + str(rank), "train")
            viz.linePlot(iterId, masked_loss_t.item(),
                         "masked_loss_t_" + str(rank), "train")
            viz.linePlot(iterId, masked_loss_v.item(),
                         "masked_loss_v_" + str(rank), "train")
            viz.linePlot(iterId, next_sentence_loss.item(),
                         "next_sentence_loss_" + str(rank), "train")
            # viz.linePlot(iterId, optimizer.get_lr()[0], 'learning_rate', 'train')

            loss_tmp += loss.item()
            masked_loss_v_tmp += masked_loss_v.item()
            masked_loss_t_tmp += masked_loss_t.item()
            next_sentence_loss_tmp += next_sentence_loss.item()

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with the special warmup BERT uses
                    # (when args.fp16 is False, BertAdam handles this
                    #  automatically; a sketch of warmup_linear follows this
                    #  example)
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion,
                    )
                    for param_group in optimizer.param_groups:
                        param_group["lr"] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if step % 20 == 0 and step != 0:
                masked_loss_t_tmp = masked_loss_t_tmp / 20.0
                masked_loss_v_tmp = masked_loss_v_tmp / 20.0
                next_sentence_loss_tmp = next_sentence_loss_tmp / 20.0
                loss_tmp = loss_tmp / 20.0

                end_t = timer()
                timeStamp = strftime("%a %d %b %y %X", gmtime())

                Ep = epochId + nb_tr_steps / float(len(train_dataset))
                printFormat = "[%s][Ep: %.2f][Iter: %d][Time: %5.2fs][Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g][LR: %.8g]"

                printInfo = [
                    timeStamp,
                    Ep,
                    nb_tr_steps,
                    end_t - start_t,
                    loss_tmp,
                    masked_loss_v_tmp,
                    masked_loss_t_tmp,
                    next_sentence_loss_tmp,
                    optimizer.get_lr()[0],
                ]

                start_t = end_t
                print(printFormat % tuple(printInfo))

                masked_loss_v_tmp = 0
                masked_loss_t_tmp = 0
                next_sentence_loss_tmp = 0
                loss_tmp = 0

        # Do the evaluation
        torch.set_grad_enabled(False)
        start_t = timer()
        numBatches = len(validation_dataset)
        eval_masked_loss_t = 0
        eval_masked_loss_v = 0
        eval_next_sentence_loss = 0
        eval_total_loss = 0

        model.eval()
        for step, batch in enumerate(validation_dataset):
            batch = tuple(
                t.cuda(device=device, non_blocking=True) for t in batch)

            (input_ids, input_mask, segment_ids, lm_label_ids, is_next,
             image_feat, image_loc, image_target, image_label, image_mask,
             multimodal_mask, image_ids) = batch

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                multimodal_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            eval_masked_loss_t += masked_loss_t.item()
            eval_masked_loss_v += masked_loss_v.item()
            eval_next_sentence_loss += next_sentence_loss.item()
            eval_total_loss += loss.item()

            end_t = timer()
            delta_t = " Time: %5.2fs" % (end_t - start_t)
            start_t = end_t
            progressString = "\r Evaluating split '%s' [%d/%d]\t" + delta_t
            sys.stdout.write(progressString % ('val', step + 1, numBatches))
            sys.stdout.flush()

        eval_masked_loss_t = eval_masked_loss_t / float(numBatches)
        eval_masked_loss_v = eval_masked_loss_v / float(numBatches)
        eval_next_sentence_loss = eval_next_sentence_loss / float(numBatches)
        eval_total_loss = eval_total_loss / float(numBatches)

        printFormat = "Evaluation: [Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g]"
        printInfo = [
            eval_total_loss, eval_masked_loss_v, eval_masked_loss_t,
            eval_next_sentence_loss
        ]

        print(printFormat % tuple(printInfo))
        torch.set_grad_enabled(True)

        viz.linePlot(epochId, eval_total_loss, "loss_" + str(rank), "val")
        viz.linePlot(epochId, eval_masked_loss_t, "masked_loss_t_" + str(rank),
                     "val")
        viz.linePlot(epochId, eval_masked_loss_v, "masked_loss_v_" + str(rank),
                     "val")
        viz.linePlot(epochId, eval_next_sentence_loss,
                     "next_sentence_loss_" + str(rank), "val")

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model it-self
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            torch.save(model_to_save.state_dict(), output_model_file)
            output_opt_state_dict_file = os.path.join(
                savePath, "optimizer_state_" + str(epochId) + ".bin")
            torch.save(optimizer.state_dict(), output_opt_state_dict_file)

        if args.dynamic_masking and epochId + 1 < int(args.num_train_epochs):
            del train_dataset
            gc.collect()
            train_dataset = ConceptCapLoaderTrain(
                args.train_data_dir,
                tokenizer,
                seq_len=args.max_seq_length,
                batch_size=args.train_batch_size,
                predict_feature=args.predict_feature,
                num_workers=args.num_workers,
                distributed=args.distributed,
                span_mask=args.span_mask)
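
The fp16 branch of the training loop above calls a warmup_linear() helper that
is not defined in this snippet (Example #8 below relies on the same helper). A
minimal sketch of the schedule it appears to assume, in the style of the
pytorch-pretrained-bert linear warmup followed by linear decay (the exact
original helper may differ), is:

def warmup_linear(x, warmup=0.002):
    # x is training progress in [0, 1] (global_step / total_steps).
    # Ramp the LR multiplier linearly from 0 to 1 over the first `warmup`
    # fraction of training, then decay linearly back toward 0.
    if x < warmup:
        return x / warmup
    return max(0.0, 1.0 - x)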
Example #8
0
def main(opts):
    if opts.local_rank == -1:
        assert torch.cuda.is_available()
        device = torch.device("cuda")
        n_gpu = 1
    else:
        torch.cuda.set_device(opts.local_rank)
        device = torch.device("cuda", opts.local_rank)
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.distributed.get_world_size()
        logger.info("device: {} n_gpu: {}, distributed training: {}, "
                    "16-bits training: {}".format(
                        device, n_gpu, bool(opts.local_rank != -1), opts.fp16))
    opts.n_gpu = n_gpu

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                            opts.gradient_accumulation_steps))

    is_master = opts.local_rank == -1 or torch.distributed.get_rank() == 0

    # if is_master:
    #     save_training_meta(opts)

    random.seed(opts.seed)
    np.random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(opts.seed)

    tokenizer = BertTokenizer.from_pretrained(
        opts.bert_model, do_lower_case='uncased' in opts.bert_model)



    # train_examples = None
    print("Loading Train Dataset", opts.train_file)
    vocab_dump = torch.load(opts.vocab_file)
    print('vocab dump', vocab_dump)

    # vocab = vocab_dump['tgt'].fields[0][1].vocab.stoi
    vocab = vocab_dump['src'].fields[0][1].vocab.stoi
    # f = open("./vocab_test_tgt.txt", "w")
    # f.write(str(vocab))
    # f.close()

    # f = open("./vocab_bert.txt", "w")
    # f.write(str(tokenizer.get_vocab()))
    # f.close()

    train_dataset = BertDataset(opts.train_file, tokenizer, vocab,
                                seq_len=opts.max_seq_length,
                                max_len=opts.max_sent_length)
    print('train dataset', train_dataset[0])
    # Prepare model
    print('len train dataset', len(train_dataset))

    model = BertForSeq2seq.from_pretrained(opts.bert_model)

    embedding = convert_embedding(
        tokenizer, vocab, model.bert.embeddings.word_embeddings.weight)
    # swap the output embedding layer to use the new target vocabulary
    # instead of the original BERT vocabulary
    model.update_output_layer(embedding)

    if opts.fp16:
        model.half()
    model.to(device)

    if opts.local_rank != -1:
        # need to make sure models are the same in the beginning
        params = [p.data for p in model.parameters()]
        broadcast_tensors(params)
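        # broadcast_tensors is a project helper; conceptually (a sketch,
        # assuming torch.distributed is already initialized) it does:
        #     for t in params:
        #         torch.distributed.broadcast(t, src=0)
        # so every rank starts training from rank 0's parameter values.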
    for name, module in model.named_modules():
        # we might want to tune dropout for smaller dataset
        if isinstance(module, torch.nn.Dropout):
            module.p = opts.dropout

    # Prepare optimizer
    param_optimizer = [(n, p) for n, p in model.named_parameters()
                       if 'pooler' not in n]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    if opts.fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam

        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=opts.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if opts.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=opts.loss_scale)

    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=opts.learning_rate)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", opts.train_batch_size)
    logger.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    logger.info("  Num steps = %d", opts.num_train_steps)

    if opts.local_rank == -1:
        train_sampler = TokenBucketSampler(
            train_dataset.lens,
            bucket_size=8192,
            batch_size=opts.train_batch_size,
            droplast=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_sampler=train_sampler,
                                      num_workers=4,
                                      collate_fn=BertDataset.pad_collate)
    else:
        train_sampler = DistributedTokenBucketSampler(
            n_gpu, opts.local_rank,
            train_dataset.lens,
            bucket_size=8192,
            batch_size=opts.train_batch_size,
            droplast=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_sampler=train_sampler,
                                      num_workers=4,
                                      collate_fn=BertDataset.pad_collate)

    if is_master:
        TB_LOGGER.create(join(opts.output_dir, 'log'))
    running_loss = RunningMeter('loss')
    model.train()
    if is_master:
        pbar = tqdm(total=opts.num_train_steps)
    else:
        logger.disabled = True
        pbar = None
    n_examples = 0
    n_epoch = 0
    start = time()
    while True:
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) if t is not None else t for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            n_examples += input_ids.size(0)
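            # lm_label_ids is -1 at positions that are not MLM targets, so
            # this mask selects only the masked tokens for the loss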
            mask = lm_label_ids != -1
            loss = model(input_ids, segment_ids, input_mask,
                         lm_label_ids, mask, True)
            if opts.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            running_loss(loss.item())
            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1
                if opts.fp16:
                    # modify learning rate with the special warmup BERT uses;
                    # note: when opts.fp16 is False, the plain AdamW above is
                    # used and no warmup schedule is applied in this loop
                    lr_this_step = opts.learning_rate * warmup_linear(
                        global_step/opts.num_train_steps,
                        opts.warmup_proportion)
                    if lr_this_step < 0:
                        # safeguard against possible miscalculation of train steps
                        lr_this_step = 1e-8
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    TB_LOGGER.add_scalar('lr',
                                         lr_this_step, global_step)

                # NOTE running loss not gathered across GPUs for speed
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                if opts.local_rank != -1:
                    # gather gradients from all processes
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))
                optimizer.step()
                optimizer.zero_grad()
                if pbar is not None:
                    pbar.update(1)
                if global_step % 5 == 0:
                    torch.cuda.empty_cache()
                if global_step % 100 == 0:
                    if opts.local_rank != -1:
                        total = sum(all_gather_list(n_examples))
                    else:
                        total = n_examples
                    if is_master:
                        ex_per_sec = int(total / (time()-start))
                        logger.info(f'{total} examples trained at '
                                    f'{ex_per_sec} ex/s')
                        TB_LOGGER.add_scalar('ex_per_s', ex_per_sec,
                                             global_step)

                if global_step % opts.valid_steps == 0:
                    logger.info(f"start validation at Step {global_step}")
                    with torch.no_grad():
                        val_log = validate(model,
                                           opts.valid_src, opts.valid_tgt,
                                           tokenizer, vocab, device,
                                           opts.local_rank)
                    logger.info(f"Val Acc: {val_log['val_acc']}; "
                                f"Val Loss: {val_log['val_loss']}")
                    TB_LOGGER.log_scaler_dict(val_log)
                    if is_master:
                        output_model_file = join(
                            opts.output_dir, 'ckpt',
                            f"model_step_{global_step}.pt")
                        # save cpu checkpoint
                        state_dict = {k: v.cpu() if isinstance(v, torch.Tensor)
                                      else v
                                      for k, v in model.state_dict().items()}
                        torch.save(state_dict, output_model_file)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        if is_master:
            logger.info(f"finished {n_epoch} epochs")
    if opts.num_train_steps % opts.valid_steps != 0:
        with torch.no_grad():
            val_log = validate(model, opts.valid_src, opts.valid_tgt,
                               tokenizer, vocab, device, opts.local_rank)
        TB_LOGGER.log_scaler_dict(val_log)
        if is_master:
            output_model_file = join(opts.output_dir, 'ckpt',
                                     f"model_step_{global_step}.pt")
            # save cpu checkpoint
            state_dict = {k: v.cpu() if isinstance(v, torch.Tensor)
                          else v
                          for k, v in model.state_dict().items()}
            torch.save(state_dict, output_model_file)
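
The distributed branch of this example calls two helpers whose bodies are not
shown: broadcast_tensors() to synchronize the initial weights and
all_reduce_and_rescale_tensors() to combine gradients across processes. A
minimal, unbuffered sketch of what such helpers could look like on top of
torch.distributed (the simple per-tensor loop is an assumption; real
implementations usually pack tensors into flat buffers for speed):

import torch.distributed as dist


def broadcast_tensors(tensors, root=0):
    # Copy rank `root`'s values into the same tensors on every other rank.
    for t in tensors:
        dist.broadcast(t, src=root)


def all_reduce_and_rescale_tensors(tensors, rescale_denom):
    # Sum each tensor across all processes, then rescale in place.
    # The training loop above passes rescale_denom=1.0, i.e. it keeps the
    # summed gradients.
    for t in tensors:
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        if rescale_denom != 1.0:
            t.div_(rescale_denom)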