Code example #1
def get_constant_schedule_with_warmup(optimizer, epochs, batch_size, n_samples):
    warmup_proportion = 0.3
    n_steps = int(np.ceil(n_samples / batch_size))
    num_training_steps = n_steps * epochs
    num_warmup_steps = int(warmup_proportion * num_training_steps)
    sch = optimization.get_constant_schedule_with_warmup(optimizer, num_warmup_steps)
    return sch
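
For quick reference, here is a hedged, self-contained sketch of the same pattern using the library call directly; the toy model, learning rate, and dataset numbers are illustrative assumptions, not values from the project above.

import numpy as np
import torch
from transformers import optimization

# Assumed toy numbers: 3 epochs over 10,000 samples with batch size 32.
epochs, batch_size, n_samples = 3, 32, 10_000
num_training_steps = int(np.ceil(n_samples / batch_size)) * epochs
num_warmup_steps = int(0.3 * num_training_steps)  # 30% linear warmup, as in the helper above

model = torch.nn.Linear(10, 2)  # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = optimization.get_constant_schedule_with_warmup(optimizer, num_warmup_steps)

for _ in range(10):
    optimizer.step()
    scheduler.step()  # one scheduler step per optimizer step (i.e. per batch)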
Code example #2
def init_fn(optimizer, epochs, batch_size, n_samples):
    # NOTE: warmup_proportion is not defined in this snippet; in the original
    # project it is presumably a constant from the enclosing scope (e.g. 0.3).
    n_steps = int(np.ceil(n_samples / batch_size))
    num_training_steps = n_steps * epochs
    num_warmup_steps = int(warmup_proportion * num_training_steps)
    sch = optimization.get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps)
    update_in_batch, update_in_epoch = True, False
    return sch, update_in_batch, update_in_epoch
Code example #3
def __init__(self,
             optimizer: Optimizer,
             num_warmup_steps: int,
             last_epoch: int = -1) -> None:
    lr_scheduler = get_constant_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        last_epoch=last_epoch)
    super().__init__(lr_scheduler)
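
The constructor above apparently belongs to a thin wrapper class that builds the Hugging Face scheduler and delegates to it. The base class below is a stand-in assumption (the real parent class is not shown in the snippet), included only to make the delegation pattern concrete and runnable.

import torch
from torch.optim import Optimizer
from transformers import get_constant_schedule_with_warmup


class _SchedulerWrapper:
    # Assumed stand-in for the unknown base class: it simply forwards step()
    # to the wrapped scheduler.
    def __init__(self, lr_scheduler) -> None:
        self.lr_scheduler = lr_scheduler

    def step(self) -> None:
        self.lr_scheduler.step()


class ConstantWithWarmupScheduler(_SchedulerWrapper):
    def __init__(self,
                 optimizer: Optimizer,
                 num_warmup_steps: int,
                 last_epoch: int = -1) -> None:
        lr_scheduler = get_constant_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            last_epoch=last_epoch)
        super().__init__(lr_scheduler)


# Example usage with an assumed toy optimizer.
optimizer = torch.optim.AdamW(torch.nn.Linear(4, 1).parameters(), lr=1e-4)
scheduler = ConstantWithWarmupScheduler(optimizer, num_warmup_steps=100)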
Code example #4
File: model.py  Project: andim461/project-ML
    def configure_optimizers(self):

        optimizer = AdamW(params=self.parameters(),
                          lr=self.hparams['learning_rate'])
        warmup_steps = self.hparams['warmup_steps']
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)

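        # 'interval': 'step' makes Lightning call scheduler.step() after every optimizer step instead of once per epoch.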
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]
Code example #5
    def _create_lr_scheduler(self) -> Dict:
        """Returns one of three default schedulers

        Possibilities: constant/linear/cosine schedule with or without warmup
        """
        steps_per_epoch = math.ceil(
            len(self._train_instances) / self._trainer_config.batch_size
        )
        try:
            training_steps = min(
                self._trainer_config.max_steps,
                self._trainer_config.max_epochs * steps_per_epoch,
            )
        # One or both of the max_* is None:
        except TypeError:
            training_steps = (
                self._trainer_config.max_steps
                # 1000 is the default of the lightning trainer
                or (self._trainer_config.max_epochs or 1000) * steps_per_epoch
            )

        if self._trainer_config.lr_decay == "linear":
            scheduler = get_linear_schedule_with_warmup(
                optimizer=self._pipeline.model.optimizer,
                num_warmup_steps=self._trainer_config.warmup_steps,
                num_training_steps=training_steps,
            )
        elif self._trainer_config.lr_decay == "cosine":
            scheduler = get_cosine_schedule_with_warmup(
                optimizer=self._pipeline.model.optimizer,
                num_warmup_steps=self._trainer_config.warmup_steps,
                num_training_steps=training_steps,
            )
        else:
            scheduler = get_constant_schedule_with_warmup(
                optimizer=self._pipeline.model.optimizer,
                num_warmup_steps=self._trainer_config.warmup_steps,
            )

        return {
            "scheduler": scheduler,
            "interval": "step",
            "name": "learning_rate",
        }
Code example #6
def train(args):
    # torch.multiprocessing.set_sharing_strategy('file_system')
    # too many barriers / one node data parallel and multiple node DDP
    os.environ['MASTER_ADDR'] = args["master_addr"]
    os.environ['MASTER_PORT'] = args["master_port"]
    os.environ['TOKENIZERS_PARALLELISM'] = "true"
    torch.backends.cudnn.benchmark = True
    rank = args["nr"]
    gpus = args["gpus_per_node"]
    if args["cpu"]:
        assert args["world_size"] == 1
        device = torch.device("cpu")
        barrier = get_barrier(False)
    else:
        dist.init_process_group(args["dist_backend"], rank=rank, world_size=args["world_size"])
        device = torch.device('cuda:0')  # Unique only on individual node.
        torch.cuda.set_device(device)
        barrier = get_barrier(True)

    set_seeds(args["seed"])
    mconf = model_config.to_dict()
    config = dict(md_config=md_config, sm_config=sm_config)[mconf.pop("model_size")]
    tokenizer = get_tokenizer(mconf.pop("tokenizer_name"))
    config.vocab_size = len(tokenizer) + 22
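    # Adjust the sequence-length budget and position embeddings to make room for the extra highway CLS tokens.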
    config.tokenizer_length = 1024
    config.tokenizer_length = config.tokenizer_length - config.num_highway_cls_tokens
    config.max_position_embeddings = config.max_position_embeddings + config.num_highway_cls_tokens

    collate_fn = get_collate_fn(config.num_highway_cls_tokens, tokenizer.pad_token_id)

    model = FastFormerForFusedELECTRAPretraining(config, tokenizer=tokenizer, **mconf).to(device)
    print("Trainable Params = %s" % (numel(model) / 1_000_000))
    if args["pretrained_model"] is not None:
        model.load_state_dict(torch.load(args["pretrained_model"], map_location={'cuda:%d' % 0: 'cuda:%d' % 0}))
    model.data_parallel = True
    # Take model to local rank
    if args["cpu"]:
        ddp_model = model
    else:
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        ddp_model = DDP(model, device_ids=[0], find_unused_parameters=True)
    all_params = list(filter(lambda p: p.requires_grad, ddp_model.parameters()))
    optc = optimizer_config.to_dict()
    optimizer = AdamW(all_params, lr=optc["lr"], eps=optc["eps"], weight_decay=optc["weight_decay"], betas=(optc["beta_1"], optc["beta_2"]))
    optimizer.zero_grad()
    scaler = GradScaler()

    model_save_dir = args["model_save_dir"]
    model_save_name = args["model_save_name"]
    if rank == 0:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)
    assert os.path.exists(model_save_dir)
    barrier()
    print("Optimizer Created for Rank = %s" % rank)
    shuffle_dataset = args["shuffle_dataset"]
    sampling_fraction = optc["sampling_fraction"]
    if not args["validate_only"] and not args["test_only"]:
        train_loader = build_dataloader(args["train_dataset"], shuffle_dataset, sampling_fraction, config, collate_fn, tokenizer, world_size=args["world_size"], num_workers=args["num_workers"])

    print("Data Loaded for Rank = %s" % rank)
    validate_every_steps = args["validate_every_steps"]
    log_every_steps = args["log_every_steps"]
    save_every_steps = args["save_every_steps"]
    scheduler = optimization.get_constant_schedule_with_warmup(optimizer, optc["warmup_steps"])
    gradient_clipping = optc["gradient_clipping"]
    _ = model.train()
    barrier()

    start_time = time.time()
    batch_times = []
    model_times = []
    full_times = []
    print("Start Training for Rank = %s" % rank)
    for step, batch in enumerate(train_loader):
        model.zero_grad()
        optimizer.zero_grad()
        if step == 0:
            print("First Batch Training for Rank = %s" % rank)
        # if step <= 39:
        #     continue
        gen_batch_time = time.time() - start_time
        batch_times.append(gen_batch_time)
        if (step + 1) % save_every_steps == 0:
            if rank == 0:
                torch.save(ddp_model.state_dict(), os.path.join(model_save_dir, model_save_name))
            barrier()
        if (step + 1) % validate_every_steps == 0:
            if rank == 0:
                val_results = LargeValidator(args["validation_dataset"], ddp_model, config, device, tokenizer)()
                print("Rank = %s, steps = %s, Val = %s" % (rank, step, val_results))
            barrier()
        record_accuracy = False
        if (step + 1) % log_every_steps == 0:
            record_accuracy = True

        batch["record_accuracy"] = record_accuracy
        labels = batch["label_mlm_input_ids"] if "label_mlm_input_ids" in batch else batch["input_ids"]
        labels = labels.to(device)
        model_start_time = time.time()
        if args["cpu"]:
            output = ddp_model(**batch, labels=labels)
            output = {key: [item[key] for item in output]
                      for key in list(functools.reduce(
                    lambda x, y: x.union(y),
                    (set(dicts.keys()) for dicts in output)
                ))
                      }
            output = {k: torch.mean(v) for k, v in output.items()}
            loss = output["loss"]
            loss_dict = output["loss_dict"]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(all_params, gradient_clipping)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        else:
            with autocast():

                output = ddp_model(**batch, labels=labels)
                output = {key: [item[key] for item in output]
                          for key in list(functools.reduce(
                        lambda x, y: x.union(y),
                        (set(dicts.keys()) for dicts in output)
                    ))
                          }
                output = {k: torch.mean(v) for k, v in output.items()}
                loss = output["loss"]
                loss_dict = output["loss_dict"]
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(all_params, gradient_clipping)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()
        model_end_time = time.time() - model_start_time
        model_times.append(model_end_time)
        full_time = time.time() - start_time
        full_times.append(full_time)
        start_time = time.time()
        if (step + 1) % log_every_steps == 0:
            print("Rank = %s, steps = %s, batch_size = %s, Loss = %s, Accuracy = %s" % (rank, step, batch["input_ids"].size(), loss_dict, output["accuracy_hist"]))
            print("Batch time = %s, Model Time = %s, Full time = %s" % (np.mean(batch_times), np.mean(model_times), np.mean(full_times)))
            batch_times = []
            model_times = []
            full_times = []
            clean_memory()
            barrier()



    # Take inputs to local_rank

    # TODO: validate on multigpu, sort the val datasets alphabetically and let the gpu with rank == dataset rank in sort pick up the dataset. GPUs with rank > len(datasetDict) stay idle.
    # TODO: select one dataset and make full batch from it, this way rebalancing can be easy.
    # TODO: dataset rebalancing.
    # TODO: save model only in local_rank == 0 process
    # TODO: Check if all initialised model weights are same??
    # I've been tracking an ema of sample training loss during training and using that to guide weighted data sampling (rather than the typical uniform sampling). Seems to help with a variety of real world datasets where the bulk of the data is often very similar and easy to learn but certain subpopulations are much more challenging.

    pass
Code example #7
def main(args):
    local_config = json.load(open(args.local_config_path))
    local_config['loss'] = args.loss
    local_config['data_dir'] = args.data_dir
    local_config['train_batch_size'] = args.train_batch_size
    local_config[
        'gradient_accumulation_steps'] = args.gradient_accumulation_steps
    local_config['lr_scheduler'] = args.lr_scheduler
    local_config['model_name'] = args.model_name
    local_config['pool_type'] = args.pool_type
    local_config['seed'] = args.seed
    local_config['do_train'] = args.do_train
    local_config['do_validation'] = args.do_validation
    local_config['do_eval'] = args.do_eval
    local_config['use_cuda'] = args.use_cuda.lower() == 'true'
    local_config['num_train_epochs'] = args.num_train_epochs
    local_config['eval_batch_size'] = args.eval_batch_size
    local_config['max_seq_len'] = args.max_seq_len
    local_config['syns'] = ["Target", "Synonym"]
    local_config['target_embeddings'] = args.target_embeddings
    local_config['symmetric'] = args.symmetric.lower() == 'true'
    local_config['mask_syns'] = args.mask_syns
    local_config['train_scd'] = args.train_scd
    local_config['ckpt_path'] = args.ckpt_path
    local_config['head_batchnorm'] = args.head_batchnorm
    local_config['head_hidden_size'] = args.head_hidden_size
    local_config['linear_head'] = args.linear_head.lower() == 'true'
    local_config['emb_size_for_cosine'] = args.emb_size_for_cosine
    local_config['add_fc_layer'] = args.add_fc_layer

    if local_config['do_train'] and os.path.exists(args.output_dir):
        from glob import glob
        model_weights = glob(os.path.join(args.output_dir, '*.bin'))
        if model_weights:
            print(f'{model_weights}: already computed: skipping ...')
            return
        else:
            print(
                f'{args.output_dir} already exists, but contains no model weights ...'
            )
            return

    device = torch.device("cuda" if local_config['use_cuda'] else "cpu")
    n_gpu = torch.cuda.device_count()

    if local_config['gradient_accumulation_steps'] < 1:
        raise ValueError(
            "gradient_accumulation_steps parameter should be >= 1")

    local_config['train_batch_size'] = \
        local_config['train_batch_size'] // local_config['gradient_accumulation_steps']

    if local_config['do_train']:
        random.seed(local_config['seed'])
        np.random.seed(local_config['seed'])
        torch.manual_seed(local_config['seed'])

    if n_gpu > 0:
        torch.cuda.manual_seed_all(local_config['seed'])

    if not local_config['do_train'] and not local_config['do_eval']:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if local_config['do_train'] and not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        os.makedirs(os.path.join(args.output_dir, 'nen-nen-weights'))
    elif local_config['do_train'] or local_config['do_validation']:
        raise ValueError(args.output_dir, 'output_dir already exists')

    suffix = datetime.now().isoformat().replace('-', '_').replace(
        ':', '_').split('.')[0].replace('T', '-')
    if local_config['do_train']:
        train_writer = SummaryWriter(log_dir=os.path.join(
            args.output_dir, f'tensorboard-{suffix}', 'train'))
        dev_writer = SummaryWriter(log_dir=os.path.join(
            args.output_dir, f'tensorboard-{suffix}', 'dev'))

        logger.addHandler(
            logging.FileHandler(
                os.path.join(args.output_dir, f"train_{suffix}.log"), 'w'))
        eval_logger.addHandler(
            logging.FileHandler(
                os.path.join(args.output_dir, f"scores_{suffix}.log"), 'w'))
    else:
        logger.addHandler(
            logging.FileHandler(
                os.path.join(args.ckpt_path, f"eval_{suffix}.log"), 'w'))

    logger.info(args)
    logger.info(json.dumps(vars(args), indent=4))
    if args.do_train:
        json.dump(
            local_config,
            open(os.path.join(args.output_dir, 'local_config.json'), 'w'))
        json.dump(vars(args),
                  open(os.path.join(args.output_dir, 'args.json'), 'w'))
    logger.info("device: {}, n_gpu: {}".format(device, n_gpu))

    with open(os.path.join(args.output_dir, 'local_config.json'), 'w') as outp:
        json.dump(local_config, outp, indent=4)
    with open(os.path.join(args.output_dir, 'args.json'), 'w') as outp:
        json.dump(vars(args), outp, indent=4)

    syns = sorted(local_config['syns'])
    id2classifier = {i: classifier for i, classifier in enumerate(syns)}

    model_name = local_config['model_name']
    data_processor = DataProcessor()

    train_dir = os.path.join(local_config['data_dir'], 'train/')
    dev_dir = os.path.join(local_config['data_dir'], 'dev')

    if local_config['do_train']:

        config = configs[local_config['model_name']]
        config = config.from_pretrained(local_config['model_name'],
                                        hidden_dropout_prob=args.dropout)
        if args.ckpt_path != '':
            model_path = args.ckpt_path
        else:
            model_path = local_config['model_name']
        model = models[model_name].from_pretrained(
            model_path,
            cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE),
            local_config=local_config,
            data_processor=data_processor,
            config=config)

        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                param for name, param in param_optimizer
                if not any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            float(args.weight_decay)
        }, {
            'params': [
                param for name, param in param_optimizer
                if any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=float(args.learning_rate),
                          eps=1e-6,
                          betas=(0.9, 0.98),
                          correct_bias=True)

        train_features = model.convert_dataset_to_features(train_dir, logger)

        if args.train_mode == 'sorted' or args.train_mode == 'random_sorted':
            train_features = sorted(train_features,
                                    key=lambda f: np.sum(f.input_mask))
        else:
            random.shuffle(train_features)


#        import pdb; pdb.set_trace()
        train_dataloader = \
            get_dataloader_and_tensors(train_features, local_config['train_batch_size'])
        train_batches = [batch for batch in train_dataloader]

        num_train_optimization_steps = \
            len(train_batches) // local_config['gradient_accumulation_steps'] * \
                local_config['num_train_epochs']

        warmup_steps = int(args.warmup_proportion *
                           num_train_optimization_steps)
        if local_config['lr_scheduler'] == 'linear_warmup':
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=num_train_optimization_steps)
        elif local_config['lr_scheduler'] == 'constant_warmup':
            scheduler = get_constant_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps)
        else:
            raise ValueError(
                f"unsupported lr_scheduler: {local_config['lr_scheduler']}")
        logger.info("***** Training *****")
        logger.info("  Num examples = %d", len(train_features))
        logger.info("  Batch size = %d", local_config['train_batch_size'])
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if local_config['do_validation']:
            dev_features = model.convert_dataset_to_features(dev_dir, logger)
            logger.info("***** Dev *****")
            logger.info("  Num examples = %d", len(dev_features))
            logger.info("  Batch size = %d", local_config['eval_batch_size'])
            dev_dataloader = \
                get_dataloader_and_tensors(dev_features, local_config['eval_batch_size'])
            test_dir = os.path.join(local_config['data_dir'], 'test/')
            if os.path.exists(test_dir):
                test_features = model.convert_dataset_to_features(
                    test_dir, test_logger)
                logger.info("***** Test *****")
                logger.info("  Num examples = %d", len(test_features))
                logger.info("  Batch size = %d",
                            local_config['eval_batch_size'])

                test_dataloader = \
                    get_dataloader_and_tensors(test_features, local_config['eval_batch_size'])

        best_result = defaultdict(float)

        eval_step = max(1, len(train_batches) // args.eval_per_epoch)

        start_time = time.time()
        global_step = 0

        model.to(device)
        lr = float(args.learning_rate)
        for epoch in range(1, 1 + local_config['num_train_epochs']):
            tr_loss = 0
            nb_tr_examples = 0
            nb_tr_steps = 0
            cur_train_loss = defaultdict(float)

            model.train()
            logger.info("Start epoch #{} (lr = {})...".format(
                epoch,
                scheduler.get_lr()[0]))
            if args.train_mode == 'random' or args.train_mode == 'random_sorted':
                random.shuffle(train_batches)

            train_bar = tqdm(train_batches,
                             total=len(train_batches),
                             desc='training ... ')
            for step, batch in enumerate(train_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, token_type_ids, \
                syn_labels, positions = batch
                train_loss, _ = model(input_ids=input_ids,
                                      token_type_ids=token_type_ids,
                                      attention_mask=input_mask,
                                      input_labels={
                                          'syn_labels': syn_labels,
                                          'positions': positions
                                      })
                loss = train_loss['total'].mean().item()
                for key in train_loss:
                    cur_train_loss[key] += train_loss[key].mean().item()

                train_bar.set_description(
                    f'training... [epoch == {epoch} / {local_config["num_train_epochs"]}, loss == {loss}]'
                )

                loss_to_optimize = train_loss['total']

                if local_config['gradient_accumulation_steps'] > 1:
                    loss_to_optimize = \
                        loss_to_optimize / local_config['gradient_accumulation_steps']

                loss_to_optimize.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

                tr_loss += loss_to_optimize.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step +
                        1) % local_config['gradient_accumulation_steps'] == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                if local_config['do_validation'] and (step +
                                                      1) % eval_step == 0:
                    logger.info(
                        'Ep: {}, Stp: {}/{}, usd_t={:.2f}s, loss={:.6f}'.
                        format(epoch, step + 1, len(train_batches),
                               time.time() - start_time,
                               tr_loss / nb_tr_steps))

                    cur_train_mean_loss = {}
                    for key, value in cur_train_loss.items():
                        cur_train_mean_loss[f'train_{key}_loss'] = \
                            value / nb_tr_steps

                    dev_predictions = os.path.join(args.output_dir,
                                                   'dev_predictions')

                    metrics = predict(model,
                                      dev_dataloader,
                                      dev_predictions,
                                      dev_features,
                                      args,
                                      cur_train_mean_loss=cur_train_mean_loss,
                                      logger=eval_logger)

                    metrics['global_step'] = global_step
                    metrics['epoch'] = epoch
                    metrics['learning_rate'] = scheduler.get_lr()[0]
                    metrics['batch_size'] = \
                        local_config['train_batch_size'] * local_config['gradient_accumulation_steps']

                    for key, value in metrics.items():
                        dev_writer.add_scalar(key, value, global_step)
                    scores_to_logger = tuple([
                        round(metrics[save_by_score] * 100.0, 2)
                        for save_by_score in args.save_by_score.split('+')
                    ])
                    logger.info(
                        f"dev %s (lr=%s, epoch=%d): %s" %
                        (args.save_by_score, str(
                            scheduler.get_lr()[0]), epoch, scores_to_logger))

                    predict_parts = [
                        part for part in metrics if part.endswith('.score')
                        and metrics[part] > args.start_save_threshold
                        and metrics[part] > best_result[part]
                    ]
                    if len(predict_parts) > 0:
                        best_dev_predictions = os.path.join(
                            args.output_dir, 'best_dev_predictions')
                        dev_predictions = os.path.join(args.output_dir,
                                                       'dev_predictions')
                        os.makedirs(best_dev_predictions, exist_ok=True)
                        for part in predict_parts:
                            logger.info(
                                "!!! Best dev %s (lr=%s, epoch=%d): %.2f -> %.2f"
                                % (part, str(scheduler.get_lr()[0]), epoch,
                                   best_result[part] * 100.0,
                                   metrics[part] * 100.0))
                            best_result[part] = metrics[part]
                            if [
                                    save_weight for save_weight in
                                    args.save_by_score.split('+')
                                    if save_weight == part
                            ]:
                                os.makedirs(os.path.join(
                                    args.output_dir, part),
                                            exist_ok=True)
                                output_model_file = os.path.join(
                                    args.output_dir, part, WEIGHTS_NAME)
                                save_model(args, model, output_model_file,
                                           metrics)
                            if 'nen-nen' not in part:
                                os.system(
                                    f'cp {dev_predictions}/{".".join(part.split(".")[1:-1])}* {best_dev_predictions}/'
                                )
                            else:
                                output_model_file = os.path.join(
                                    args.output_dir, 'nen-nen-weights',
                                    WEIGHTS_NAME)
                                save_model(args, model, output_model_file,
                                           metrics)

                        # dev_predictions = os.path.join(args.output_dir, 'dev_predictions')
                        # predict(
                        #     model, dev_dataloader, dev_predictions,
                        #     dev_features, args, only_parts='+'.join(predict_parts)
                        # )
                        # best_dev_predictions = os.path.join(args.output_dir, 'best_dev_predictions')
                        # os.makedirs(best_dev_predictions, exist_ok=True)
                        # os.system(f'mv {dev_predictions}/* {best_dev_predictions}/')
                        if 'scd' not in '+'.join(
                                predict_parts) and os.path.exists(test_dir):
                            test_predictions = os.path.join(
                                args.output_dir, 'test_predictions')
                            test_metrics = predict(
                                model,
                                test_dataloader,
                                test_predictions,
                                test_features,
                                args,
                                only_parts='+'.join([
                                    'test' + part[3:] for part in predict_parts
                                    if 'nen-nen' not in part
                                ]))
                            best_test_predictions = os.path.join(
                                args.output_dir, 'best_test_predictions')
                            os.makedirs(best_test_predictions, exist_ok=True)
                            os.system(
                                f'mv {test_predictions}/* {best_test_predictions}/'
                            )

                            for key, value in test_metrics.items():
                                if key.endswith('score'):
                                    dev_writer.add_scalar(
                                        key, value, global_step)

            if args.log_train_metrics:
                metrics = predict(model,
                                  train_dataloader,
                                  os.path.join(args.output_dir,
                                               'train_predictions'),
                                  train_features,
                                  args,
                                  logger=logger)
                metrics['global_step'] = global_step
                metrics['epoch'] = epoch
                metrics['learning_rate'] = scheduler.get_lr()[0]
                metrics['batch_size'] = \
                    local_config['train_batch_size'] * local_config['gradient_accumulation_steps']

                for key, value in metrics.items():
                    train_writer.add_scalar(key, value, global_step)

    if local_config['do_eval']:
        assert args.ckpt_path != '', 'in do_eval mode ckpt_path should be specified'
        test_dir = args.eval_input_dir
        config = configs[model_name].from_pretrained(model_name)
        model = models[model_name].from_pretrained(
            args.ckpt_path,
            local_config=local_config,
            data_processor=data_processor,
            config=config)
        model.to(device)
        test_features = model.convert_dataset_to_features(
            test_dir, test_logger)
        logger.info("***** Test *****")
        logger.info("  Num examples = %d", len(test_features))
        logger.info("  Batch size = %d", local_config['eval_batch_size'])

        test_dataloader = \
            get_dataloader_and_tensors(test_features, local_config['eval_batch_size'])

        metrics = predict(model,
                          test_dataloader,
                          os.path.join(args.output_dir, args.eval_output_dir),
                          test_features,
                          args,
                          compute_metrics=True)
        print(metrics)
        with open(
                os.path.join(args.output_dir, args.eval_output_dir,
                             'metrics.txt'), 'w') as outp:
            print(metrics, file=outp)
Code example #8
    def train(
        self,
        train_dataset,
        output_dir,
        show_running_loss=True,
        eval_data=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        model = self.model
        args = self.args
        device = self.device

        tb_writer = SummaryWriter(logdir=args.tensorboard_dir)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            sampler=train_sampler,
            batch_size=args.train_batch_size,
            num_workers=self.args.dataloader_num_workers,
        )

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = (
                args.max_steps
                // (len(train_dataloader) // args.gradient_accumulation_steps)
                + 1
            )
        else:
            t_total = (
                len(train_dataloader)
                // args.gradient_accumulation_steps
                * args.num_train_epochs
            )

        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = []
        custom_parameter_names = set()
        for group in self.args.custom_parameter_groups:
            params = group.pop("params")
            custom_parameter_names.update(params)
            param_group = {**group}
            param_group["params"] = [
                p for n, p in model.named_parameters() if n in params
            ]
            optimizer_grouped_parameters.append(param_group)

        for group in self.args.custom_layer_parameters:
            layer_number = group.pop("layer")
            layer = f"layer.{layer_number}."
            group_d = {**group}
            group_nd = {**group}
            group_nd["weight_decay"] = 0.0
            params_d = []
            params_nd = []
            for n, p in model.named_parameters():
                if n not in custom_parameter_names and layer in n:
                    if any(nd in n for nd in no_decay):
                        params_nd.append(p)
                    else:
                        params_d.append(p)
                    custom_parameter_names.add(n)
            group_d["params"] = params_d
            group_nd["params"] = params_nd

            optimizer_grouped_parameters.append(group_d)
            optimizer_grouped_parameters.append(group_nd)

        if not self.args.train_custom_parameters_only:
            optimizer_grouped_parameters.extend(
                [
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names
                            and not any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": args.weight_decay,
                    },
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names
                            and any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": 0.0,
                    },
                ]
            )

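        # Derive warmup steps from warmup_ratio unless an explicit warmup_steps value was provided.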
        warmup_steps = math.ceil(t_total * args.warmup_ratio)
        args.warmup_steps = (
            warmup_steps if args.warmup_steps == 0 else args.warmup_steps
        )

        if args.optimizer == "AdamW":
            optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                eps=args.adam_epsilon,
            )
        elif args.optimizer == "Adafactor":
            optimizer = Adafactor(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                eps=args.adafactor_eps,
                clip_threshold=args.adafactor_clip_threshold,
                decay_rate=args.adafactor_decay_rate,
                beta1=args.adafactor_beta1,
                weight_decay=args.weight_decay,
                scale_parameter=args.adafactor_scale_parameter,
                relative_step=args.adafactor_relative_step,
                warmup_init=args.adafactor_warmup_init,
            )
            print("Using Adafactor for T5")
        else:
            raise ValueError(
                "{} is not a valid optimizer class. Please use one of ('AdamW', 'Adafactor') instead.".format(
                    args.optimizer
                )
            )

        if args.scheduler == "constant_schedule":
            scheduler = get_constant_schedule(optimizer)

        elif args.scheduler == "constant_schedule_with_warmup":
            scheduler = get_constant_schedule_with_warmup(
                optimizer, num_warmup_steps=args.warmup_steps
            )

        elif args.scheduler == "linear_schedule_with_warmup":
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
            )

        elif args.scheduler == "cosine_schedule_with_warmup":
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup":
            scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "polynomial_decay_schedule_with_warmup":
            scheduler = get_polynomial_decay_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                lr_end=args.polynomial_decay_schedule_lr_end,
                power=args.polynomial_decay_schedule_power,
            )

        else:
            raise ValueError("{} is not a valid scheduler.".format(args.scheduler))

        if (
            args.model_name
            and os.path.isfile(os.path.join(args.model_name, "optimizer.pt"))
            and os.path.isfile(os.path.join(args.model_name, "scheduler.pt"))
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(args.model_name, "optimizer.pt"))
            )
            scheduler.load_state_dict(
                torch.load(os.path.join(args.model_name, "scheduler.pt"))
            )

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        logger.info(" Training started")

        global_step = 0
        training_progress_scores = None
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(
            int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0
        )
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0
        steps_trained_in_current_epoch = 0
        epochs_trained = 0

        if args.model_name and os.path.exists(args.model_name):
            try:
                # set global_step to the global_step of the last saved checkpoint from the model path
                checkpoint_suffix = args.model_name.split("/")[-1].split("-")
                if len(checkpoint_suffix) > 2:
                    checkpoint_suffix = checkpoint_suffix[1]
                else:
                    checkpoint_suffix = checkpoint_suffix[-1]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (
                    len(train_dataloader) // args.gradient_accumulation_steps
                )
                steps_trained_in_current_epoch = global_step % (
                    len(train_dataloader) // args.gradient_accumulation_steps
                )

                logger.info(
                    "   Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("   Continuing training from epoch %d", epochs_trained)
                logger.info("   Continuing training from global step %d", global_step)
                logger.info(
                    "   Will skip the first %d steps in the current epoch",
                    steps_trained_in_current_epoch,
                )
            except ValueError:
                logger.info("   Starting fine-tuning.")

        if args.evaluate_during_training:
            training_progress_scores = self._create_training_progress_scores(**kwargs)

        if args.wandb_project:
            wandb.init(
                project=args.wandb_project,
                config={**asdict(args)},
                **args.wandb_kwargs,
            )
            wandb.run._label(repo="simpletransformers")
            wandb.watch(self.model)

        if args.fp16:
            from torch.cuda import amp

            scaler = amp.GradScaler()

        for current_epoch in train_iterator:
            model.train()
            if epochs_trained > 0:
                epochs_trained -= 1
                continue
            train_iterator.set_description(
                f"Epoch {epoch_number + 1} of {args.num_train_epochs}"
            )
            batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
            for step, batch in enumerate(batch_iterator):
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                inputs = self._get_inputs_dict(batch)
                if args.fp16:
                    with amp.autocast():
                        outputs = model(**inputs)
                        # model outputs are always tuple in pytorch-transformers (see doc)
                        loss = outputs[0]
                else:
                    outputs = model(**inputs)
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = outputs[0]

                if args.n_gpu > 1:
                    loss = (
                        loss.mean()
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    batch_iterator.set_description(
                        f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}"
                    )

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scaler.unscale_(optimizer)
                    if args.optimizer == "AdamW":
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(), args.max_grad_norm
                        )

                    if args.fp16:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        tb_writer.add_scalar(
                            "lr", scheduler.get_last_lr()[0], global_step
                        )
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / args.logging_steps,
                            global_step,
                        )
                        logging_loss = tr_loss
                        if args.wandb_project or self.is_sweeping:
                            wandb.log(
                                {
                                    "Training loss": current_loss,
                                    "lr": scheduler.get_last_lr()[0],
                                    "global_step": global_step,
                                }
                            )

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step)
                        )

                        self.save_model(
                            output_dir_current, optimizer, scheduler, model=model
                        )

                    if args.evaluate_during_training and (
                        args.evaluate_during_training_steps > 0
                        and global_step % args.evaluate_during_training_steps == 0
                    ):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = self.eval_model(
                            eval_data,
                            verbose=verbose and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent,
                            **kwargs,
                        )
                        for key, value in results.items():
                            try:
                                tb_writer.add_scalar(
                                    "eval_{}".format(key), value, global_step
                                )
                            except (NotImplementedError, AssertionError):
                                pass

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step)
                        )

                        if args.save_eval_checkpoints:
                            self.save_model(
                                output_dir_current,
                                optimizer,
                                scheduler,
                                model=model,
                                results=results,
                            )

                        training_progress_scores["global_step"].append(global_step)
                        training_progress_scores["train_loss"].append(current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(
                                args.output_dir, "training_progress_scores.csv"
                            ),
                            index=False,
                        )

                        if args.wandb_project or self.is_sweeping:
                            wandb.log(self._get_last_metrics(training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[args.early_stopping_metric]
                            self.save_model(
                                args.best_model_dir,
                                optimizer,
                                scheduler,
                                model=model,
                                results=results,
                            )
                        if best_eval_metric and args.early_stopping_metric_minimize:
                            if (
                                results[args.early_stopping_metric] - best_eval_metric
                                < args.early_stopping_delta
                            ):
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(
                                    args.best_model_dir,
                                    optimizer,
                                    scheduler,
                                    model=model,
                                    results=results,
                                )
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if (
                                        early_stopping_counter
                                        < args.early_stopping_patience
                                    ):
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )
                        else:
                            if (
                                results[args.early_stopping_metric] - best_eval_metric
                                > args.early_stopping_delta
                            ):
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(
                                    args.best_model_dir,
                                    optimizer,
                                    scheduler,
                                    model=model,
                                    results=results,
                                )
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if (
                                        early_stopping_counter
                                        < args.early_stopping_patience
                                    ):
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )
                        model.train()

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)
            )

            if args.save_model_every_epoch or args.evaluate_during_training:
                os.makedirs(output_dir_current, exist_ok=True)

            if args.save_model_every_epoch:
                self.save_model(output_dir_current, optimizer, scheduler, model=model)

            if args.evaluate_during_training and args.evaluate_each_epoch:
                results = self.eval_model(
                    eval_data,
                    verbose=verbose and args.evaluate_during_training_verbose,
                    silent=args.evaluate_during_training_silent,
                    **kwargs,
                )

                if args.save_eval_checkpoints:
                    self.save_model(
                        output_dir_current, optimizer, scheduler, results=results
                    )

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(
                    os.path.join(args.output_dir, "training_progress_scores.csv"),
                    index=False,
                )

                if args.wandb_project or self.is_sweeping:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args.early_stopping_metric]
                    self.save_model(
                        args.best_model_dir,
                        optimizer,
                        scheduler,
                        model=model,
                        results=results,
                    )
                if best_eval_metric and args.early_stopping_metric_minimize:
                    if (
                        results[args.early_stopping_metric] - best_eval_metric
                        < args.early_stopping_delta
                    ):
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(
                            args.best_model_dir,
                            optimizer,
                            scheduler,
                            model=model,
                            results=results,
                        )
                        early_stopping_counter = 0
                    else:
                        if (
                            args.use_early_stopping
                            and args.early_stopping_consider_epochs
                        ):
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )
                else:
                    if (
                        results[args.early_stopping_metric] - best_eval_metric
                        > args.early_stopping_delta
                    ):
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(
                            args.best_model_dir,
                            optimizer,
                            scheduler,
                            model=model,
                            results=results,
                        )
                        early_stopping_counter = 0
                    else:
                        if (
                            args.use_early_stopping
                            and args.early_stopping_consider_epochs
                        ):
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )

        return (
            global_step,
            tr_loss / global_step
            if not self.args.evaluate_during_training
            else training_progress_scores,
        )
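
The early-stopping branches above hinge on a signed comparison of the new metric against the running best. Below is a minimal sketch of just that comparison; the helper name `improved` and its signature are illustrative, not part of the snippet:

def improved(current: float, best: float, minimize: bool, delta: float) -> bool:
    # Mirrors the branches above: when minimizing, the new value resets the patience
    # counter while its excess over the best stays below delta; when maximizing,
    # it must exceed the best by more than delta.
    diff = current - best
    return diff < delta if minimize else diff > delta

# e.g. improved(0.295, 0.30, minimize=True, delta=0.01) -> True, so the counter is reset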
Code example #9
0
 def __new__(cls, optimizer, *args, **kwargs):
     return get_constant_schedule_with_warmup(optimizer, *args, **kwargs)
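
Because `__new__` returns the result of `get_constant_schedule_with_warmup`, "instantiating" this class yields the Hugging Face scheduler object directly. A hedged usage sketch; the class name `ConstantWarmupScheduler` and the toy model are assumptions, not from the snippet:

import torch
from transformers import get_constant_schedule_with_warmup

class ConstantWarmupScheduler:
    def __new__(cls, optimizer, *args, **kwargs):
        # The constructor is bypassed: the HF LambdaLR scheduler is returned directly.
        return get_constant_schedule_with_warmup(optimizer, *args, **kwargs)

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = ConstantWarmupScheduler(optimizer, num_warmup_steps=100)
print(type(scheduler).__name__)  # LambdaLR, not ConstantWarmupScheduler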
Code example #10
0
    def train(
        self, train_dataloader, output_dir, show_running_loss=True, eval_dataloader=None, verbose=True, **kwargs,
    ):
        """
        Trains the model using the batches yielded by train_dataloader.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args.tensorboard_dir)

        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = []
        custom_parameter_names = set()
        for group in self.args.custom_parameter_groups:
            params = group.pop("params")
            custom_parameter_names.update(params)
            param_group = {**group}
            param_group["params"] = [p for n, p in model.named_parameters() if n in params]
            optimizer_grouped_parameters.append(param_group)

        for group in self.args.custom_layer_parameters:
            layer_number = group.pop("layer")
            layer = f"layer.{layer_number}."
            group_d = {**group}
            group_nd = {**group}
            group_nd["weight_decay"] = 0.0
            params_d = []
            params_nd = []
            for n, p in model.named_parameters():
                if n not in custom_parameter_names and layer in n:
                    if any(nd in n for nd in no_decay):
                        params_nd.append(p)
                    else:
                        params_d.append(p)
                    custom_parameter_names.add(n)
            group_d["params"] = params_d
            group_nd["params"] = params_nd

            optimizer_grouped_parameters.append(group_d)
            optimizer_grouped_parameters.append(group_nd)

        if not self.args.train_custom_parameters_only:
            optimizer_grouped_parameters.extend(
                [
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names and not any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": args.weight_decay,
                    },
                    {
                        "params": [
                            p
                            for n, p in model.named_parameters()
                            if n not in custom_parameter_names and any(nd in n for nd in no_decay)
                        ],
                        "weight_decay": 0.0,
                    },
                ]
            )

        warmup_steps = math.ceil(t_total * args.warmup_ratio)
        args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps

        if args.optimizer == "AdamW":
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        elif args.optimizer == "Adafactor":
            optimizer = Adafactor(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                eps=args.adafactor_eps,
                clip_threshold=args.adafactor_clip_threshold,
                decay_rate=args.adafactor_decay_rate,
                beta1=args.adafactor_beta1,
                weight_decay=args.weight_decay,
                scale_parameter=args.adafactor_scale_parameter,
                relative_step=args.adafactor_relative_step,
                warmup_init=args.adafactor_warmup_init,
            )
            print("Using Adafactor for T5")
        else:
            raise ValueError(
                "{} is not a valid optimizer class. Please use one of ('AdamW', 'Adafactor') instead.".format(
                    args.optimizer
                )
            )

        if args.scheduler == "constant_schedule":
            scheduler = get_constant_schedule(optimizer)

        elif args.scheduler == "constant_schedule_with_warmup":
            scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps)

        elif args.scheduler == "linear_schedule_with_warmup":
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
            )

        elif args.scheduler == "cosine_schedule_with_warmup":
            scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup":
            scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                num_cycles=args.cosine_schedule_num_cycles,
            )

        elif args.scheduler == "polynomial_decay_schedule_with_warmup":
            scheduler = get_polynomial_decay_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_steps,
                num_training_steps=t_total,
                lr_end=args.polynomial_decay_schedule_lr_end,
                power=args.polynomial_decay_schedule_power,
            )

        else:
            raise ValueError("{} is not a valid scheduler.".format(args.scheduler))

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        training_progress_scores = None
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0

        if args.evaluate_during_training:
            training_progress_scores = self._create_training_progress_scores(**kwargs)

        if args.wandb_project:
            wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs)
            wandb.watch(self.model)

        if args.fp16:
            from torch.cuda import amp

            scaler = amp.GradScaler()

        for _ in train_iterator:
            model.train()
            train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}")
            batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
            for step, batch in enumerate(batch_iterator):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, labels, mc_labels, token_type_ids = batch

                if args.fp16:
                    with amp.autocast():
                        outputs = model(
                            input_ids,
                            token_type_ids=token_type_ids,
                            mc_token_ids=mc_token_ids,
                            mc_labels=mc_labels,
                            labels=labels,
                        )

                        lm_loss, mc_loss = outputs[:2]
                        # model outputs are always tuple in pytorch-transformers (see doc)
                        loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef
                else:
                    outputs = model(
                        input_ids,
                        token_type_ids=token_type_ids,
                        mc_token_ids=mc_token_ids,
                        mc_labels=mc_labels,
                        labels=labels,
                    )

                    lm_loss, mc_loss = outputs[:2]
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % current_loss, end="")

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scaler.unscale_(optimizer)
                    if args.optimizer == "AdamW":
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                    if args.fp16:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                        logging_loss = tr_loss
                        if args.wandb_project or self.is_sweeping:
                            wandb.log(
                                {
                                    "Training loss": current_loss,
                                    "lr": scheduler.get_last_lr()[0],
                                    "global_step": global_step,
                                }
                            )

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        self.save_model(output_dir_current, model=model)

                    if args.evaluate_during_training and (
                        args.evaluate_during_training_steps > 0
                        and global_step % args.evaluate_during_training_steps == 0
                    ):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(
                            eval_dataloader,
                            verbose=verbose and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                        output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        if args.save_eval_checkpoints:
                            self.save_model(output_dir_current, model=model, results=results)

                        training_progress_scores["global_step"].append(global_step)
                        training_progress_scores["train_loss"].append(current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args.output_dir, "training_progress_scores.csv"), index=False,
                        )

                        if args.wandb_project or self.is_sweeping:
                            wandb.log(self._get_last_metrics(training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[args.early_stopping_metric]
                            self.save_model(args.best_model_dir, model=model, results=results)
                        if best_eval_metric and args.early_stopping_metric_minimize:
                            if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(args.best_model_dir, model=model, results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(f" No improvement in {args.early_stopping_metric}")
                                            logger.info(f" Current step: {early_stopping_counter}")
                                            logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                                    else:
                                        if verbose:
                                            logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )
                        else:
                            if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                                best_eval_metric = results[args.early_stopping_metric]
                                self.save_model(args.best_model_dir, model=model, results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(f" No improvement in {args.early_stopping_metric}")
                                            logger.info(f" Current step: {early_stopping_counter}")
                                            logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                                    else:
                                        if verbose:
                                            logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step
                                            if not self.args.evaluate_during_training
                                            else training_progress_scores,
                                        )

            epoch_number += 1
            output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args.save_model_every_epoch or args.evaluate_during_training:
                os.makedirs(output_dir_current, exist_ok=True)

            if args.save_model_every_epoch:
                self.save_model(output_dir_current, model=model)

            if args.evaluate_during_training and args.evaluate_each_epoch:
                results, _, _ = self.eval_model(
                    eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=True, **kwargs,
                )

                self.save_model(output_dir_current, results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False)

                if args.wandb_project or self.is_sweeping:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args.early_stopping_metric]
                    self.save_model(args.best_model_dir, model=model, results=results)
                if best_eval_metric and args.early_stopping_metric_minimize:
                    if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir, model=model, results=results)
                        early_stopping_counter = 0
                else:
                    if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir, model=model, results=results)
                        early_stopping_counter = 0
                model.train()

        return (
            global_step,
            tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores,
        )
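
The scheduler block above dispatches on `args.scheduler` and falls back to `warmup_ratio * t_total` warmup steps when no explicit count is configured. A condensed sketch of that dispatch as a standalone helper; the function name `build_scheduler` and the default `warmup_ratio` value are assumptions made for illustration:

import math
from transformers import (
    get_constant_schedule,
    get_constant_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

def build_scheduler(name, optimizer, t_total, warmup_steps=0, warmup_ratio=0.06):
    # Fall back to a proportion of the total steps when no explicit warmup length is given.
    warmup = warmup_steps or math.ceil(t_total * warmup_ratio)
    if name == "constant_schedule":
        return get_constant_schedule(optimizer)
    if name == "constant_schedule_with_warmup":
        return get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup)
    if name == "linear_schedule_with_warmup":
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup, num_training_steps=t_total
        )
    if name == "cosine_schedule_with_warmup":
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup, num_training_steps=t_total
        )
    raise ValueError("{} is not a valid scheduler.".format(name))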
Code example #11
0
# test_data = encode(mrpc_sets['test'])
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments

train_loader = DataLoader(train_data, batch_size=16)
batch = next(iter(train_loader))
# quick sanity check that the loader yields batches
for i, batch in enumerate(train_loader):
    print('bloop')
trainer_args = TrainingArguments(
    output_dir=
    '/home/ahoffman/research/transformers/examples/alex/tutorials/out',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    # do_predict=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    fp16=False)

from transformers.optimization import AdamW, get_constant_schedule_with_warmup
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=.1)
scheduler = get_constant_schedule_with_warmup(optimizer, 500)

trainer = Trainer(model=model,
                  args=trainer_args,
                  tokenizer=tokenizer,
                  train_dataset=train_data,
                  eval_dataset=valid_data,
                  optimizers=(optimizer, scheduler))

trainer.train()
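
Passing `optimizers=(optimizer, scheduler)` makes `Trainer` use these objects instead of building its own. If the warmup length should track the dataset size rather than a fixed 500 steps, one possible derivation is sketched below; the epoch count and warmup proportion are assumptions, not values from the snippet:

num_epochs = 3
warmup_proportion = 0.1
total_steps = len(train_loader) * num_epochs  # one optimizer step per batch
num_warmup_steps = int(warmup_proportion * total_steps)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps)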
Code example #12
0
def main(args):
    if not args.do_eval:
        assert all([
            x in ['true', 'false'] for x in
            [args.use_cuda, args.symmetric, args.linear_head, args.siamese]
        ])
        args.use_cuda = args.use_cuda.lower() == 'true'
        args.symmetric = args.symmetric.lower() == 'true'
        args.linear_head = args.linear_head.lower() == 'true'
        args.siamese = args.siamese.lower() == 'true'

    if args.siamese:
        assert args.train_batch_size % 2 == 0, 'train batch size should be even in siamese mode'
        assert not args.symmetric

    if args.do_train and os.path.exists(args.output_dir):
        model_weights = glob(os.path.join(args.output_dir, '*.bin'))
        if model_weights:
            print(f'{model_weights}: already computed: skipping ...')
            return
        else:
            print(
                f'already existing {args.output_dir}. but without model weights ...'
            )
            return

    device = torch.device("cuda" if args.use_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "gradient_accumulation_steps parameter should be >= 1")

    if args.do_train:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if args.do_train and not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        os.makedirs(os.path.join(args.output_dir, 'nen-nen-weights'))
    elif args.do_train or args.do_validation:
        raise ValueError(f'{args.output_dir} already exists')

    suffix = datetime.now().isoformat().replace('-', '_').replace(
        ':', '_').split('.')[0].replace('T', '-')

    if args.do_train:
        train_writer = SummaryWriter(log_dir=os.path.join(
            args.output_dir, f'tensorboard-{suffix}', 'train'))
        dev_writer = SummaryWriter(log_dir=os.path.join(
            args.output_dir, f'tensorboard-{suffix}', 'dev'))
        test_writer = SummaryWriter(log_dir=os.path.join(
            args.output_dir, f'tensorboard-{suffix}', 'test'))

        logger.addHandler(
            logging.FileHandler(
                os.path.join(args.output_dir, f"train_logs_{suffix}.log"),
                'w'))
    else:
        logger.addHandler(
            logging.FileHandler(
                os.path.join(args.ckpt_path, f"eval_logs_{suffix}.log"), 'w'))

    logger.info(json.dumps(vars(args), indent=4))
    if args.do_train:
        json.dump(vars(args),
                  open(os.path.join(args.output_dir, 'args.json'), 'w'),
                  indent=4)
    logger.info("device: {}, n_gpu: {}".format(device, n_gpu))

    args.train_batch_size = \
        args.train_batch_size // args.gradient_accumulation_steps

    model_name = args.model_name
    data_processor = DataProcessor()

    train_dir = os.path.join(args.data_dir, 'train/')
    dev_dir = os.path.join(args.data_dir, 'dev')

    if args.do_train:
        config = configs[args.model_name]
        config = config.from_pretrained(args.model_name,
                                        hidden_dropout_prob=args.dropout)
        if args.ckpt_path != '':
            model_path = args.ckpt_path
        else:
            model_path = args.model_name

        model = models[model_name]
        model = model.from_pretrained(
            model_path,
            cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE),
            args=args,
            data_processor=data_processor,
            config=config)

        if args.freeze_featurizer:
            trainable_weights = []
            for name, parameter in model.named_parameters():
                # if name not in ['syn_clf.bn1.weight', 'syn_clf.bn1.bias', 'syn_clf.bn1.running_mean', 'syn_clf.bn1.running_var', 'syn_clf.dense.weight', 'syn_clf.dense.bias', 'syn_clf.out_proj.weight', 'syn_clf.out_proj.bias']:
                if name.startswith('roberta'):
                    parameter.requires_grad = False
                else:
                    trainable_weights.append(name)
            logger.info(f'trainable weights: {trainable_weights}')

        model.to(device)

        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                param for name, param in param_optimizer
                if not any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            float(args.weight_decay)
        }, {
            'params': [
                param for name, param in param_optimizer
                if any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=float(args.learning_rate),
                          eps=1e-6,
                          betas=(0.9, 0.98),
                          correct_bias=True)

        train_features = model.convert_dataset_to_features(train_dir, logger)

        train_dataloader = \
            get_dataloader_and_tensors(train_features, args.train_batch_size, 'siamese_random' if args.siamese else 'random')
        train_batches_len = len(train_dataloader)

        num_train_optimization_steps = \
            train_batches_len // args.gradient_accumulation_steps * \
                args.num_train_epochs

        warmup_steps = int(args.warmup_proportion *
                           num_train_optimization_steps)
        if args.lr_scheduler == 'linear_warmup':
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=num_train_optimization_steps)
        elif args.lr_scheduler == 'constant_warmup':
            scheduler = get_constant_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps)

        if args.fp16:
            from apex import amp
            model, optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=args.fp16_opt_level,
                # loss_scale=args.loss_scale,
                # min_loss_scale=args.fp16_min_loss_scale,
                # max_loss_scale=args.fp16_max_loss_scale,
            )
        logger.info("***** Training *****")
        logger.info("  Num examples = %d", len(train_features))
        logger.info("  Batch size = %d",
                    args.train_batch_size * args.gradient_accumulation_steps)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_features = None

        if args.do_validation:
            dev_features = model.convert_dataset_to_features(dev_dir, logger)
            logger.info("***** Dev *****")
            logger.info("  Num examples = %d", len(dev_features))
            logger.info("  Batch size = %d", args.eval_batch_size)
            dev_dataloader = \
                get_dataloader_and_tensors(dev_features, args.eval_batch_size, 'sequential')
            test_dir = os.path.join(args.data_dir, 'test/')
            if os.path.exists(test_dir):
                test_features = model.convert_dataset_to_features(
                    test_dir, logger)
                logger.info("***** Test *****")
                logger.info("  Num examples = %d", len(test_features))
                logger.info("  Batch size = %d", args.eval_batch_size)

                test_dataloader = \
                    get_dataloader_and_tensors(test_features, args.eval_batch_size, 'sequential')

        best_result = defaultdict(float)

        eval_step = max(1, train_batches_len // args.eval_per_epoch)

        start_time = time.time()
        global_step = 0

        lr = float(args.learning_rate)
        for epoch in range(1, 1 + args.num_train_epochs):
            tr_loss = 0
            nb_tr_examples = 0
            nb_tr_steps = 0
            cur_train_loss = defaultdict(float)

            model.train()
            logger.info("Start epoch #{} (lr = {})...".format(
                epoch,
                scheduler.get_lr()[0]))

            train_bar = tqdm(train_dataloader,
                             total=train_batches_len,
                             desc='training ... ')
            for step, batch in enumerate(train_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, token_type_ids, \
                    syn_labels, positions = batch

                train_loss, _ = model(input_ids=input_ids,
                                      token_type_ids=token_type_ids,
                                      attention_mask=input_mask,
                                      input_labels={
                                          'syn_labels': syn_labels,
                                          'positions': positions
                                      })
                loss = train_loss['total'].mean().item()
                for key in train_loss:
                    cur_train_loss[key] += train_loss[key].mean().item()

                train_bar.set_description(
                    f'training... [epoch == {epoch} / {args.num_train_epochs}, loss == {loss}]'
                )

                loss_to_optimize = train_loss['total']

                if args.gradient_accumulation_steps > 1:
                    loss_to_optimize = \
                        loss_to_optimize / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss_to_optimize,
                                        optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss_to_optimize.backward()

                tr_loss += loss_to_optimize.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    # optimizer.zero_grad()
                    model.zero_grad()
                    global_step += 1

                if args.do_validation and (step + 1) % eval_step == 0:
                    logger.info(
                        'Ep: {}, Stp: {}/{}, usd_t={:.2f}s, loss={:.6f}'.
                        format(epoch, step + 1, train_batches_len,
                               time.time() - start_time,
                               tr_loss / nb_tr_steps))
                    cur_train_mean_loss = {}
                    for key, value in cur_train_loss.items():
                        cur_train_mean_loss[f'{key}_loss'] = \
                            value / nb_tr_steps

                    dev_predictions = os.path.join(args.output_dir,
                                                   'dev_predictions')

                    metrics = model.predict(dev_dataloader,
                                            dev_predictions,
                                            dev_features,
                                            compute_metrics=True)

                    metrics['global_step'] = global_step
                    metrics['epoch'] = epoch
                    metrics['learning_rate'] = scheduler.get_lr()[0]
                    metrics['batch_size'] = \
                        args.train_batch_size * args.gradient_accumulation_steps

                    for key, value in metrics.items():
                        dev_writer.add_scalar(key, value, global_step)
                    for key, value in cur_train_mean_loss.items():
                        train_writer.add_scalar(key, value, global_step)
                    scores_to_logger = tuple([
                        round(metrics[save_by_score] * 100.0, 2)
                        for save_by_score in args.save_by_score.split('+')
                    ])
                    logger.info(
                        "dev %s (lr=%s, epoch=%d): %s" %
                        (args.save_by_score, str(
                            scheduler.get_lr()[0]), epoch, scores_to_logger))

                    improved_parts = [
                        part for part in metrics if part.endswith('.score')
                        and metrics[part] > args.start_save_threshold
                        and metrics[part] > best_result[part]
                    ]
                    if improved_parts:
                        best_dev_predictions = os.path.join(
                            args.output_dir, 'best-dev-predictions')
                        dev_predictions = os.path.join(args.output_dir,
                                                       'dev_predictions')
                        os.makedirs(best_dev_predictions, exist_ok=True)
                        os.makedirs(dev_predictions, exist_ok=True)
                        for part in improved_parts:
                            logger.info(
                                "!!! Best dev %s (lr=%s, epoch=%d): %.2f -> %.2f"
                                % (part, str(scheduler.get_lr()[0]), epoch,
                                   best_result[part] * 100.0,
                                   metrics[part] * 100.0))
                            best_result[part] = metrics[part]
                            dev_writer.add_scalar('best_' + part,
                                                  metrics[part], global_step)

                            if [
                                    save_weight for save_weight in
                                    args.save_by_score.split('+')
                                    if save_weight == part
                            ]:
                                os.makedirs(os.path.join(
                                    args.output_dir, part),
                                            exist_ok=True)
                                output_model_file = os.path.join(
                                    args.output_dir, part, WEIGHTS_NAME)
                                save_model(args, model, output_model_file,
                                           metrics)
                            best_dev_files = [
                                file.split('/')[-1]
                                for file in glob(f'{dev_predictions}/*')
                                if part.split('.')[1] in file
                            ]
                            for dev_file in best_dev_files:
                                logger.info(
                                    f'{dev_predictions}/{dev_file} -> {best_dev_predictions}/'
                                )
                                os.system(
                                    f'cp {dev_predictions}/{dev_file} {best_dev_predictions}/'
                                )

                        if args.log_test_metrics and os.path.exists(test_dir):
                            test_predictions = os.path.join(
                                args.output_dir, 'test_predictions')
                            test_metrics = model.predict(test_dataloader,
                                                         test_predictions,
                                                         test_features,
                                                         compute_metrics=True)
                            best_test_predictions = os.path.join(
                                args.output_dir, 'best-test-predictions')
                            os.makedirs(best_test_predictions, exist_ok=True)
                            corresp_test_files = [
                                file.split('/')[-1]
                                for file in glob(f'{test_predictions}/*')
                                if any([
                                    part.split('.')[1] in file
                                    for part in improved_parts
                                ])
                            ]
                            for test_file in corresp_test_files:
                                logger.info(
                                    f'{test_predictions}/{test_file} -> {best_test_predictions}/'
                                )
                                os.system(
                                    f'cp {test_predictions}/{test_file} {best_test_predictions}/'
                                )

                            for key, value in test_metrics.items():
                                if key.endswith('.score'):
                                    test_writer.add_scalar(
                                        key, value, global_step)
                                if key in improved_parts:
                                    test_writer.add_scalar(
                                        'best_' + key, value, global_step)
                        if any([
                                'nen-nen.score' in part
                                for part in improved_parts
                        ]):
                            best_dev_nen_nen_path = os.path.join(
                                args.output_dir,
                                'best-dev-nen-nen-predictions')
                            os.makedirs(best_dev_nen_nen_path, exist_ok=True)
                            os.system(
                                f'mv {dev_predictions}/* {best_dev_nen_nen_path}/'
                            )
                            if args.log_test_metrics and os.path.exists(
                                    test_dir):
                                best_test_nen_nen_path = os.path.join(
                                    args.output_dir,
                                    'best-test-nen-nen-predictions')
                                os.makedirs(best_test_nen_nen_path,
                                            exist_ok=True)
                                os.system(
                                    f'mv {test_predictions}/* {best_test_nen_nen_path}/'
                                )

    if args.do_eval:
        assert args.ckpt_path != '', 'in do_eval mode ckpt_path should be specified'
        test_dir = args.eval_input_dir
        config = configs[model_name].from_pretrained(model_name)
        model = models[model_name]
        model = model.from_pretrained(args.ckpt_path,
                                      args=args,
                                      data_processor=data_processor,
                                      config=config)
        model.to(device)
        test_features = model.convert_dataset_to_features(test_dir, logger)
        logger.info("***** Test *****")
        logger.info("  Num examples = %d", len(test_features))
        logger.info("  Batch size = %d", args.eval_batch_size)

        test_dataloader = \
            get_dataloader_and_tensors(test_features, args.eval_batch_size, 'sequential')

        metrics = model.predict(test_dataloader,
                                os.path.join(args.output_dir,
                                             args.eval_output_dir),
                                test_features,
                                compute_metrics=True)
        logger.info(json.dumps(metrics, indent=4))
        with open(
                os.path.join(args.output_dir, args.eval_output_dir,
                             'metrics.txt'), 'w') as outp:
            json.dump(metrics, outp, indent=4)
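
In this script `warmup_steps` is `warmup_proportion * num_train_optimization_steps`, and `constant_warmup` selects `get_constant_schedule_with_warmup`. A small standalone check of what that schedule does to the learning rate; the toy model and step counts are assumptions:

import torch
from transformers import get_constant_schedule_with_warmup

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=10)

for step in range(15):
    optimizer.step()
    scheduler.step()
    if step in (0, 4, 9, 14):
        # LR ramps linearly during the warmup steps, then stays at the base value.
        print(step, scheduler.get_last_lr()[0])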
Code example #13
0
def main(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.do_train:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        suffix = datetime.now().isoformat().replace('-', '_').replace(
            ':', '_').split('.')[0].replace('T', '-')
        logger.addHandler(
            logging.FileHandler(
                os.path.join(args.output_dir, f"train_{suffix}.log"), 'w'))
        eval_logger.addHandler(
            logging.FileHandler(
                os.path.join(args.output_dir, f"scores_{suffix}.log"), 'w'))
    else:
        logger.addHandler(
            logging.FileHandler(os.path.join(args.output_dir, "eval.log"),
                                'w'))
    logger.info(args)
    logger.info("device: {}, n_gpu: {}".format(device, n_gpu))

    processor = DataProcessor(tag_format=args.tag_format,
                              filter_non_causal=args.only_task_2
                              or args.only_bert_ner)

    if args.only_task_2 or args.only_bert_ner:
        model_name = f'{args.model}-fake'
        assert args.text_clf_weight == 0.0, f"Training only on task 2 requires to set " \
                                            f"text_clf_weight to zero. {args.text_clf_weight} passed."
        assert args.eval_metric.startswith('sequence'), f"Training only on task 2 requires to set task 2 related " \
                                                        f"metric. {args.eval_metric} passed."
    else:
        model_name = args.model

    text_labels_list = processor.get_text_labels(args.data_dir, logger)
    sequence_labels_list = processor.get_sequence_labels(args.data_dir, logger)

    label2id = {
        'text': {label: i
                 for i, label in enumerate(text_labels_list)},
        'sequence':
        {label: i
         for i, label in enumerate(sequence_labels_list, 1)}
    }

    id2label = {
        'text': {i: label
                 for i, label in enumerate(text_labels_list)},
        'sequence':
        {i: label
         for i, label in enumerate(sequence_labels_list, 1)}
    }

    num_text_labels = len(text_labels_list)
    num_sequence_labels = len(sequence_labels_list) + 1

    # do_lower_case = 'uncased' in args.model
    do_lower_case = True
    tokenizer = tokenizers[args.model].from_pretrained(
        args.model, do_lower_case=do_lower_case)

    if args.do_train:
        config = configs[args.model]
        config = config.from_pretrained(args.model,
                                        hidden_dropout_prob=args.dropout)
        model = models[model_name].from_pretrained(
            args.model,
            cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE),
            num_text_labels=num_text_labels,
            num_sequence_labels=num_sequence_labels,
            sequence_clf_weight=args.sequence_clf_weight,
            text_clf_weight=args.text_clf_weight,
            pooling_type=args.bert_ner_pool_type,
            config=config)
        print("text and sequence tasks weights:", model.text_clf_weight,
              model.sequence_clf_weight)

    else:
        model = models[model_name].from_pretrained(
            args.output_dir,
            num_sequence_labels=num_sequence_labels,
            num_text_labels=num_text_labels,
            text_clf_weight=args.text_clf_weight,
            sequence_clf_weight=args.sequence_clf_weight,
            pooling_type=args.bert_ner_pool_type)

    model.to(device)

    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = model.convert_examples_to_features(eval_examples, label2id,
                                                       args.max_seq_length,
                                                       tokenizer, logger)
    logger.info("***** Dev *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_dataloader, eval_text_labels_ids, eval_sequence_labels_ids = \
        get_dataloader_and_text_ids_with_sequence_ids(eval_features, args.eval_batch_size)

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        train_features = model.convert_examples_to_features(
            train_examples, label2id, args.max_seq_length, tokenizer, logger)

        if args.train_mode == 'sorted' or args.train_mode == 'random_sorted':
            train_features = sorted(train_features,
                                    key=lambda f: np.sum(f.input_mask))
        else:
            random.shuffle(train_features)

        train_dataloader, _, _ = \
            get_dataloader_and_text_ids_with_sequence_ids(train_features, args.train_batch_size)
        train_batches = [batch for batch in train_dataloader]

        num_train_optimization_steps = \
            len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
        warmup_steps = int(args.warmup_proportion *
                           num_train_optimization_steps)

        logger.info("***** Training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_result = None
        eval_step = max(1, len(train_batches) // args.eval_per_epoch)
        lr = float(args.learning_rate)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                param for name, param in param_optimizer
                if not any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            float(args.weight_decay)
        }, {
            'params': [
                param for name, param in param_optimizer
                if any(nd in name for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
        if args.lr_schedule == 'constant_warmup':
            print('lr schedule = constant_warmup')
            scheduler = get_constant_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps)
        else:
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=num_train_optimization_steps)

        start_time = time.time()
        global_step = 0
        tr_loss = 0
        nb_tr_examples = 0
        nb_tr_steps = 0
        for epoch in range(1, 1 + int(args.num_train_epochs)):
            model.train()
            logger.info("Start epoch #{} (lr = {})...".format(epoch, lr))
            if args.train_mode == 'random' or args.train_mode == 'random_sorted':
                random.shuffle(train_batches)

            for step, batch in enumerate(
                    tqdm(train_batches,
                         total=len(train_batches),
                         desc='fitting ... ')):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, text_labels_ids, sequence_labels_ids, token_pos_ids = batch
                loss = model(input_ids=input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=input_mask,
                             text_labels=text_labels_ids,
                             sequence_labels=sequence_labels_ids,
                             token_pos_ids=token_pos_ids)

                if n_gpu > 1:
                    loss = loss.mean()

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                if args.do_validate and (step + 1) % eval_step == 0:
                    logger.info(
                        'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'
                        .format(epoch, step + 1, len(train_batches),
                                time.time() - start_time,
                                tr_loss / nb_tr_steps))
                    save_model = False

                    preds, result, scores = evaluate(
                        model, device, eval_dataloader, eval_text_labels_ids,
                        eval_sequence_labels_ids, num_text_labels,
                        num_sequence_labels, label2id)
                    model.train()
                    result['global_step'] = global_step
                    result['epoch'] = epoch
                    result['learning_rate'] = lr
                    result['batch_size'] = args.train_batch_size
                    if not args.only_task_2 and not args.only_bert_ner:
                        logger.info("First 20 predictions:")
                        for text_pred, text_label in zip(
                                preds['text'][:20],
                                eval_text_labels_ids.numpy()[:20]):
                            sign = u'\u2713' if text_pred == text_label else u'\u2718'
                            logger.info("pred = %s, label = %s %s" %
                                        (id2label['text'][text_pred],
                                         id2label['text'][text_label], sign))

                    if (best_result is
                            None) or (result[args.eval_metric] >
                                      best_result[args.eval_metric]):
                        best_result = result
                        save_model = True
                        logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.2f" %
                                    (args.eval_metric, str(lr), epoch,
                                     result[args.eval_metric] * 100.0))

                    if save_model:
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, WEIGHTS_NAME)
                        output_config_file = os.path.join(
                            args.output_dir, CONFIG_NAME)
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)
                        if best_result:
                            output_eval_file = os.path.join(
                                args.output_dir, "eval_results.txt")
                            with open(output_eval_file, "w") as writer:
                                for key in sorted(result.keys()):
                                    writer.write("%s = %s\n" %
                                                 (key, str(result[key])))
    if args.do_eval:
        test_file = os.path.join(
            args.data_dir,
            'test.json') if args.test_file == '' else args.test_file
        eval_examples = processor.get_test_examples(test_file)

        eval_features = model.convert_examples_to_features(
            eval_examples, label2id, args.max_seq_length, tokenizer, logger)
        logger.info("***** Test *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        eval_dataloader, eval_text_labels_ids, eval_sequence_labels_ids = \
            get_dataloader_and_text_ids_with_sequence_ids(eval_features, args.eval_batch_size)

        preds, result, scores = evaluate(model,
                                         device,
                                         eval_dataloader,
                                         eval_text_labels_ids,
                                         eval_sequence_labels_ids,
                                         num_text_labels,
                                         num_sequence_labels,
                                         label2id,
                                         compute_scores=False)

        aggregated_results = {}
        task = "sequence"
        eval_orig_positions_map = [
            ex.orig_positions_map for ex in eval_features
        ]
        aggregated_results[task] = [
            list(pred[orig_positions]) + [label2id[task]['0']] *
            (len(ex.tokens) - len(orig_positions))
            for pred, orig_positions, ex in zip(
                preds[task], eval_orig_positions_map, eval_examples)
        ]

        aggregated_results[f'{task}_scores'] = [
            list(score[orig_positions]) + [0.999] *
            (len(ex.tokens) - len(orig_positions))
            for score, orig_positions, ex in zip(
                scores[task], eval_orig_positions_map, eval_examples)
        ]

        prediction_results = {
            'idx': [ex.guid for ex in eval_examples],
            'tokens': [' '.join(ex.tokens) for ex in eval_examples],
            'sequence_labels':
            [' '.join(ex.sequence_labels) for ex in eval_examples],
            'text_label': [ex.text_label for ex in eval_examples],
            'text_pred': [id2label['text'][x] for x in preds['text']],
            'sequence_pred': [
                ' '.join(
                    [id2label['sequence'][x] if x != 0 else '0' for x in sent])
                for sent in aggregated_results['sequence']
            ],
            'sequence_scores': [
                ' '.join([str(score) for score in sent])
                for sent in aggregated_results['sequence_scores']
            ],
            'task_id': [ex.task_id for ex in eval_examples],
            'text': [ex.text for ex in eval_examples]
        }

        prediction_results = pd.DataFrame(prediction_results)
        prediction_results.to_csv(os.path.join(
            args.output_dir,
            f"{args.test_file.split('/')[-1]}_predictions.tsv"),
                                  sep='\t',
                                  index=False)
        with open(
                os.path.join(
                    args.output_dir,
                    f"{args.test_file.split('/')[-1]}_eval_results.txt"),
                "w") as f:

            for key in sorted(result.keys()):
                f.write("%s = %s\n" % (key, str(result[key])))
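
The training loops above scale the loss by `gradient_accumulation_steps`, clip gradients, and only step the optimizer and scheduler every N micro-batches. A minimal self-contained sketch of that pattern; the model, data, and hyperparameter values are placeholders:

import torch
from transformers import get_constant_schedule_with_warmup

model = torch.nn.Linear(16, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=4)
grad_accum, max_grad_norm = 2, 1.0

for step in range(8):
    x = torch.randn(4, 16)
    y = torch.randint(0, 2, (4,))
    loss = torch.nn.functional.cross_entropy(model(x), y)
    (loss / grad_accum).backward()  # scale so accumulated gradients average out
    if (step + 1) % grad_accum == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # advance the warmup/constant LR schedule
        optimizer.zero_grad()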