Example #1
def initialize(config):
    model = utils.get_model(config["model"])
    # Adapt model for distributed backend if provided
    model = idist.auto_model(model)

    optimizer = utils.get_optimizer(
        config["optimizer"],
        model,
        learning_rate=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    # Adapt optimizer for distributed backend if provided
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
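The tuple returned by initialize() above is wired into an ignite Engine elsewhere in the script. As a point of reference, here is a minimal, self-contained sketch of that wiring; the tiny model, the training step and the milestone numbers are placeholders, not part of the original example.

import torch
import torch.nn as nn
import torch.optim as optim
from ignite.engine import Engine, Events
from ignite.contrib.handlers import PiecewiseLinear

model = nn.Linear(4, 2)
optimizer = optim.SGD(model.parameters(), lr=0.0)
criterion = nn.CrossEntropyLoss()
# Warm up from 0.0 to 0.1 over the first 100 iterations, then decay linearly
# back to 0.0 by iteration 1000 (same shape as milestones_values above).
lr_scheduler = PiecewiseLinear(
    optimizer, param_name="lr",
    milestones_values=[(0, 0.0), (100, 0.1), (1000, 0.0)])

def train_step(engine, batch):
    model.train()
    x, y = batch
    loss = criterion(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

trainer = Engine(train_step)
# The scheduler is applied before every iteration, as in the other examples on this page.
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
trainer.run([(torch.randn(8, 4), torch.randint(0, 2, (8,)))], max_epochs=2)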
Example #2
    def learningrate(self, trainer, args):
        """Linearly decreasing learning rate."""
        scheduler = PiecewiseLinear(
            self.optimizer, "lr",
            [(0, self.hps.lr),
             (self.hps.n_epochs * len(args["loader_train"]), 0.0)])
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
Example #3
def initialize(config):
    model = utils.get_model(config["model"], config["model_dir"],
                            config["dropout"], config["n_fc"],
                            config["num_classes"])

    config["learning_rate"] *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.BCEWithLogitsLoss()

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
Example #4
def initialize(config):
    model = utils.get_model(config["model"])
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model, find_unused_parameters=True)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    le = config["num_iters_per_epoch"]
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    return model, optimizer, criterion, lr_scheduler
Example #5
def finetune_model(args, model, loader):
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    def update(engine, batch):
        model.train()
        batch = tuple(batch[input_name].to(args.device)
                      for input_name in MODEL_INPUTS)
        input_ids, lm_labels, token_type_ids, nodes_ids, attention_mask = batch
        if (not args.graph and not args.edge_list):
            nodes_ids = None
        if (not args.unilm): attention_mask = None
        (lm_loss), *_ = model(input_ids=input_ids,
                              token_type_ids=token_type_ids,
                              labels=lm_labels,
                              nodes=nodes_ids,
                              attention_mask=attention_mask)
        loss = lm_loss / args.gradient_accumulation_steps

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)
    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.run(loader, max_epochs=args.n_epochs)
    return model
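For context, finetune_model() above only reads a handful of attributes from args. A hypothetical call might look like the sketch below; the attribute values are illustrative defaults, and model / loader are assumed to exist elsewhere.

from argparse import Namespace

args = Namespace(
    lr=6.25e-5,                    # starting value of the linear lr decay
    device="cpu",
    graph=False, edge_list=False,  # when both are False, nodes_ids is set to None
    unilm=False,                   # when False, attention_mask is set to None
    gradient_accumulation_steps=8,
    max_norm=1.0,
    n_epochs=3,
)
# model = finetune_model(args, model, loader)  # model and loader assumed to exist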
Example #6
    def _run(self, trainer, optimizer, output_transform, num_iter, end_lr,
             step_mode, smooth_f, diverge_th):

        self._history = {"lr": [], "loss": []}
        self._best_loss = None
        self._diverge_flag = False

        # attach LRScheduler to trainer.
        if num_iter is None:
            num_iter = trainer.state.epoch_length * trainer.state.max_epochs
        else:
            max_iter = trainer.state.epoch_length * trainer.state.max_epochs
            if num_iter > max_iter:
                warnings.warn(
                    "Desired num_iter {} is unreachable with the current run setup of {} iteration "
                    "({} epochs)".format(num_iter, max_iter,
                                         trainer.state.max_epochs),
                    UserWarning,
                )

        if not trainer.has_event_handler(self._reached_num_iterations):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._reached_num_iterations, num_iter)

        # attach loss and lr logging
        if not trainer.has_event_handler(self._log_lr_and_loss):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._log_lr_and_loss, output_transform,
                                      smooth_f, diverge_th)

        self.logger.debug(
            "Running LR finder for {} iterations".format(num_iter))
        # Initialize the proper learning rate policy
        if step_mode.lower() == "exp":
            self._lr_schedule = LRScheduler(
                _ExponentialLR(optimizer, end_lr, num_iter))
        else:
            start_lr = optimizer.param_groups[0]["lr"]
            self._lr_schedule = PiecewiseLinear(optimizer,
                                                param_name="lr",
                                                milestones_values=[
                                                    (0, start_lr),
                                                    (num_iter, end_lr)
                                                ])
        if not trainer.has_event_handler(self._lr_schedule):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._lr_schedule, num_iter)
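The linear policy built in the non-"exp" branch above is just a two-milestone PiecewiseLinear from start_lr to end_lr. Assuming a recent ignite release where the inherited ParamScheduler.simulate_values classmethod is available, its shape can be inspected offline without an optimizer or a trainer; the start/end values below are made up.

from ignite.contrib.handlers import PiecewiseLinear

# Simulate 100 iterations of the linear policy: lr ramps from 0.01 to 1.0.
values = PiecewiseLinear.simulate_values(
    num_events=100,
    param_name="lr",
    milestones_values=[(0, 0.01), (100, 1.0)])
print(values[0], values[50], values[-1])  # [0, 0.01] [50, 0.505] [99, ~0.99]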
Example #7
def get_lr_scheduler(optimizer, lr_max_value, lr_max_value_epoch, num_epochs, epoch_length):
    milestones_values = [
        (0, 0.00001), 
        (epoch_length * lr_max_value_epoch, lr_max_value), 
        (epoch_length * num_epochs - 1, 0.00001)
    ]
    lr_scheduler1 = PiecewiseLinear(optimizer, "lr", milestones_values=milestones_values, param_group_index=0)

    milestones_values = [
        (0, 0.00002), 
        (epoch_length * lr_max_value_epoch, lr_max_value * 2),
        (epoch_length * lr_max_value_epoch  + 5, lr_max_value),
        (epoch_length * num_epochs - 1, 0.00002)
    ]
    lr_scheduler2 = PiecewiseLinear(optimizer, "lr", milestones_values=milestones_values, param_group_index=1)

    lr_scheduler = ParamGroupScheduler(
        [lr_scheduler1, lr_scheduler2],
        ["lr scheduler (non-biases)", "lr scheduler (biases)"]
    )
    
    return lr_scheduler
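get_lr_scheduler() above assumes an optimizer that already has two parameter groups: index 0 for non-bias weights and index 1 for biases. A minimal sketch of such an optimizer follows (the model and learning-rate values are placeholders), assuming get_lr_scheduler from this example is in scope.

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(128, 10)
non_biases = [p for name, p in model.named_parameters() if "bias" not in name]
biases = [p for name, p in model.named_parameters() if "bias" in name]
optimizer = optim.SGD(
    [{"params": non_biases},  # param_group_index=0
     {"params": biases}],     # param_group_index=1
    lr=1e-5)

lr_scheduler = get_lr_scheduler(optimizer, lr_max_value=1e-3, lr_max_value_epoch=5,
                                num_epochs=20, epoch_length=100)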
Example #8
    def assign_schedulers(self):
        """
        Assigns the step schedulers for each network parameters group
        :return: (list) with parameters group step schedulers for ignite
        """
        schedulers = []

        milestones = ((0, 2e-4), (8499, 2e-4), (8500, 1e-4), (16999, 1e-4),
                      (17000, 5e-5), (25999, 5e-5), (26000, 2.5e-5))

        multi_lr = PiecewiseLinear(self.optimizer,
                                   "lr",
                                   milestones_values=milestones)
        schedulers.append(multi_lr)
        return schedulers
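Note that the milestones above encode a step (piecewise-constant) schedule rather than a ramp: each value is held until one iteration before the next breakpoint and then changes over a single iteration. A quick way to check this, assuming ParamScheduler.simulate_values is available in the installed ignite version:

from ignite.contrib.handlers import PiecewiseLinear

values = dict(PiecewiseLinear.simulate_values(
    num_events=8501,
    param_name="lr",
    milestones_values=[(0, 2e-4), (8499, 2e-4), (8500, 1e-4)]))
print(values[8499], values[8500])  # 0.0002 0.0001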
Example #9
def train():
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    model = Bert_SQG().to(device)  # move the model to the selected device
    optimizer = AdamW(model.parameters(), lr=3e-5)
    # Assumed loss: the original snippet used an undefined `criterion`; this choice
    # matches the "nll" metric defined below.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

    ds = dataloader.BertSQG_DataClass()
    dl = DataLoader(ds, num_workers=4, batch_size=4)
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, 3e-5),
                                 (EPOCHS * len(ds) // BATCH_SIZE, 0.0)])
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}

    def update(engine, batch):
        model.train()
        for i in range(0, len(batch) - 1):
            x = batch[i].to(device)
            y = batch[i + 1].to(device)
            y_prime = model(x)
            loss = criterion(y_prime[-1], y[-1]) / ITERATION_STEP
            loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        if engine.state.iteration % ITERATION_STEP == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    tb_logger = TensorboardLogger(log_dir='./logs')
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)

    checkpoint_handler = ModelCheckpoint('./checkpoint',
                                         '_checkpoint',
                                         save_interval=1,
                                         n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                              {'bert_sqg': getattr(model, 'module', model)})
    trainer.run(dl, max_epochs=EPOCHS)
    tb_logger.close()
Example #10
    def train_model(self, n_epochs, train_loader, val_loader, eval_before_start=True):
        # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
        self.trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: self.evaluator.run(val_loader))
        self.trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: self.update_epoch())
        if eval_before_start:
            self.trainer.add_event_handler(Events.STARTED, lambda _: self.evaluator.run(val_loader))

        # Linearly decrease the learning rate from lr to zero
        scheduler = PiecewiseLinear(self.optimizer, "lr", [(0, self.lr), (n_epochs * len(train_loader), 0.0)])
        self.trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

        # Prepare metrics
        RunningAverage(output_transform=lambda x: x).attach(self.trainer, "loss")
        metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
                   "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
        metrics["average_ppl"] = MetricsLambda(math.exp, metrics["nll"])
        for name, metric in metrics.items():
            metric.attach(self.evaluator, name)

        # On the main process: add progress bar, tensorboard, checkpoints and save model
        pbar = ProgressBar(persist=True)
        pbar.attach(self.trainer, metric_names=["loss"])

        if not self.verbose:
            pbar_eval = ProgressBar(persist=False)
            pbar_eval.attach(self.evaluator)

        self.evaluator.add_event_handler(Events.STARTED, lambda _: self.logger.info(f'Beginning validation for epoch {self.epoch}...'))
        self.evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(self.evaluator.state.metrics)))

        self.tb_logger.attach(self.trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        self.tb_logger.attach(self.trainer, log_handler=OptimizerParamsHandler(self.optimizer), event_name=Events.ITERATION_STARTED)
        self.tb_logger.attach(self.evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=self.trainer),
                              event_name=Events.EPOCH_COMPLETED)

        self.trainer.add_event_handler(Events.EPOCH_COMPLETED, self.checkpoint_handler,
                                       {'mymodel': getattr(self.model, 'module', self.model)})  # "getattr" takes care of distributed encapsulation

        # Run the training
        self.trainer.run(train_loader, max_epochs=n_epochs)

        # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
        if n_epochs > 0:
            os.rename(self.checkpoint_handler._saved[-1][1][-1], os.path.join(cfg.checkpoint_log_folder, self.name, WEIGHTS_NAME))
            self.tb_logger.close()
Example #11
def initialize(config):
    model = get_model(config.model, config.model_dir, config.dropout, config.n_fc, config.num_classes)

    config.learning_rate *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = nn.BCEWithLogitsLoss()

    le = config.num_iters_per_epoch
    milestones_values = [
        (0, 0.0),
        (le * config.num_warmup_epochs, config.learning_rate),
        (le * config.max_epochs, 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    return model, optimizer, loss_fn, lr_scheduler
Example #12
def initialize(config):
    model = utils.get_model(config["model"], config["num_classes"])
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        #        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)

    # criterion = nn.CrossEntropyLoss().to(idist.device())
    criterion = nn.CrossEntropyLoss()

    le = config["num_iters_per_epoch"]
    cl = config["learning_rate"]
    # print("%d, %f" %(le,cl))
    milestones_values = [
        (30 * le, cl),
        (45 * le, 0.5 * cl),
        (46 * le, 0.1 * cl),
        (60 * le, 0.1 * cl),
        (61 * le, 0.01 * cl),
        (90 * le, 0.01 * cl),
        (120 * le, 0.001 * cl),
        # (le * config["num_warmup_epochs"], config["learning_rate"]),
        # (le * config["num_epochs"], 0.0),
    ]
    # print(milestones_values)
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["lr_step_size"], gamma=config["lr_gamma"])

    return model, optimizer, criterion, lr_scheduler
Example #13
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='persona_comet_weak_label_preprocessed',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations",
                        type=int,
                        default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--num_beams",
                        type=int,
                        default=5,
                        help="Number of beams for comet expansion")
    parser.add_argument("--test_run_num",
                        type=int,
                        default=-1,
                        help="Datapoints to run with in a test run")
    parser.add_argument("--exp_name",
                        type=str,
                        default="",
                        required=True,
                        help="Provide an experiment name")
    parser.add_argument("--do_train", action='store_true', help="Do training")
    parser.add_argument("--do_eval", action='store_true', help="Do Evaluation")
    parser.add_argument("--no_persona",
                        action='store_true',
                        help="No Persona Evaluation")
    parser.add_argument("--no_comet_persona",
                        action='store_true',
                        help="No Persona Evaluation")
    parser.add_argument("--uniform_prior",
                        action='store_true',
                        help="Uniform prior")
    parser.add_argument("--entropy_regularize_prior_wt",
                        type=float,
                        default=0.0,
                        help="entropy regularize prior")
    parser.add_argument("--training_type",
                        type=str,
                        default="",
                        help="Marginalize or Reinforce")
    parser.add_argument("--use_baseline",
                        action='store_true',
                        help="Use baseline")
    parser.add_argument("--moving_avg_ratio",
                        type=float,
                        default=0.99,
                        help="Moving avg ratio for running mean baseline")
    parser.add_argument("--reinforce_loss_coef",
                        type=float,
                        default=0.99,
                        help="Loss coef for reinforce")
    parser.add_argument("--prior_model",
                        type=str,
                        default="bow",
                        help="Prior model selection")
    parser.add_argument("--log_dir",
                        type=str,
                        default="",
                        required=True,
                        help="Provide a log dir")
    parser.add_argument("--use_structured_prior",
                        action='store_true',
                        default=False,
                        help="Use effect type as feature")
    parser.add_argument("--use_structured_prior_binarypotential",
                        action='store_true',
                        default=False,
                        help="")
    parser.add_argument("--effect_emb_dim",
                        type=int,
                        default=6,
                        help="Embedding type while computing effect feature")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    print(
        "Running process {}".format(args.local_rank)
    )  # printed by all distributed processes
    print("Arguments: {}".format(pformat(args)))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    print("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    if args.do_eval and not args.do_train:
        print('Loading model from checkpoint {}'.format(args.model_checkpoint))
    # model = model_class.from_pretrained(args.model_checkpoint)
    # model.to(args.device)

    model = LatentMarginalizedModel(args, generator_class=model_class)
    print('Num parameters: {}'.format(count_parameters(model)))
    model.to(args.device)

    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    print("Prepare datasets")
    start = datetime.now()

    train_dataset = PersonaChatDataset(args, tokenizer, split='train')
    if args.do_eval:
        val_dataset = PersonaChatDataset(args, tokenizer, split='valid')

    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)

    if args.no_comet_persona:
        max_num_persona = MAX_NUM_PERSONA
    else:
        max_num_persona = MAX_NUM_COMET_PERSONA

    train_loader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=args.train_batch_size,
                              collate_fn=partial(
                                  collate_dialog,
                                  max_num_persona=max_num_persona),
                              pin_memory=True)

    if args.do_eval:
        val_loader = DataLoader(val_dataset,
                                shuffle=False,
                                batch_size=args.valid_batch_size,
                                collate_fn=partial(
                                    collate_dialog,
                                    max_num_persona=max_num_persona),
                                pin_memory=True)

    print('{} - Data loaded. Starting training'.format(datetime.now() - start))

    # train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):

        model.train()

        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, token_type_ids, lm_labels, mc_token_ids, mc_labels, persona, history, effects = batch

        (lm_loss), (mc_loss), (loss_prior), (conditional_lm_loss), (
            num_labels), (track_rewards) = model(input_ids=input_ids,
                                                 token_type_ids=token_type_ids,
                                                 mc_token_ids=mc_token_ids,
                                                 lm_labels=lm_labels,
                                                 mc_labels=mc_labels,
                                                 persona=persona,
                                                 history=history,
                                                 effects=effects)

        loss = (lm_loss * args.lm_coef +
                mc_loss * args.mc_coef) / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)

        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return loss.item(), lm_loss.item(), mc_loss.item(), loss_prior.item(
        ), conditional_lm_loss.item(), track_rewards.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):

        model.eval()

        with torch.no_grad():

            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

            # print(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses

            lm_logits, mc_logits, *_ = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )

            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)

            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    # if args.distributed:
    #     trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
    #     evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lm_loss")
    RunningAverage(output_transform=lambda x: x[2]).attach(trainer, "mc_loss")
    RunningAverage(output_transform=lambda x: x[3]).attach(
        trainer, "prior_loss")
    RunningAverage(output_transform=lambda x: x[4]).attach(
        trainer, "cond_lm_loss")
    RunningAverage(output_transform=lambda x: x[5]).attach(trainer, "rewards")

    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }

    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })

    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    def print_model_save(engine):
        print("Training complete. Saving Model.")

    def print_validation(engine):
        print("Model saved. Starting validation.")

    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer,
                    metric_names=[
                        "loss", "lm_loss", "mc_loss", "prior_loss",
                        "cond_lm_loss", "rewards"
                    ])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint, args.exp_name)
        log_dir = os.path.join(args.log_dir, log_dir)

        print("Logging at log dir: {}".format(log_dir))

        # tb stuff
        # tb_logger = TensorboardLogger(log_dir)
        # tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        # tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        # save model checkpoints
        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=None)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, print_model_save)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        # getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

        # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
        trainer.add_event_handler(Events.EPOCH_COMPLETED, print_validation)
        if args.do_eval:
            trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                      lambda _: evaluator.run(val_loader))
            if args.n_epochs < 1:
                trainer.add_event_handler(Events.COMPLETED,
                                          lambda _: evaluator.run(val_loader))
            if args.eval_before_start:
                trainer.add_event_handler(Events.STARTED,
                                          lambda _: evaluator.run(val_loader))

    # Run the training
    if args.do_train:
        trainer.run(train_loader, max_epochs=args.n_epochs)
    if args.do_eval and not args.do_train:
        print('Running only Evaluation. No Training.')
        evaluator.run(val_loader)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0 and args.do_train:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
Example #14
    # model
    model = CustomCNNv3()
    model = model.to(device)

    # loss function
    loss_fn = nn.MSELoss(reduction='sum').to(device)

    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr,
                                 weight_decay=args.decay)

    milestones_values = [(70, 1e-4), (100, 1e-5), (200, 1e-5)]
    experiment.log_parameter("milestones_values", str(milestones_values))
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss_fn,
                                        device=device)
    evaluator = create_supervised_evaluator(
        model,
        metrics={
            'mae': CrowdCountingMeanAbsoluteError(),
            'mse': CrowdCountingMeanSquaredError(),
            'loss': Loss(loss_fn)
        },
        device=device)
    print(model)
Example #15

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0][0], x[1][0]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))
Example #16
def _test_setup_common_training_handlers(
    dirname, device, rank=0, local_rank=0, distributed=False, lr_scheduler=None, save_handler=None
):

    lr = 0.01
    step_size = 100
    gamma = 0.5
    num_iters = 100
    num_epochs = 10

    model = DummyModel().to(device)
    if distributed and "cuda" in device:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank,], output_device=local_rank)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    if lr_scheduler is None:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
    elif isinstance(lr_scheduler, str) and lr_scheduler == "ignite|LRScheduler":
        from ignite.contrib.handlers import LRScheduler

        lr_scheduler = LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma))
    elif isinstance(lr_scheduler, str) and lr_scheduler == "ignite":
        from ignite.contrib.handlers import PiecewiseLinear

        milestones_values = [(0, 0.0), (step_size, lr), (num_iters * (num_epochs - 1), 0.0)]
        lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)
    else:
        raise ValueError(f"Unknown lr_scheduler: {lr_scheduler}")

    def update_fn(engine, batch):
        optimizer.zero_grad()
        x = torch.tensor([batch], requires_grad=True, device=device)
        y_pred = model(x)
        loss = y_pred.mean()
        loss.backward()
        optimizer.step()
        return loss

    train_sampler = None
    if distributed and idist.get_world_size() > 1:
        train_sampler = MagicMock(spec=DistributedSampler)
        train_sampler.set_epoch = MagicMock()

    trainer = Engine(update_fn)
    setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save={"model": model, "optimizer": optimizer},
        save_every_iters=75,
        output_path=dirname,
        save_handler=save_handler,
        lr_scheduler=lr_scheduler,
        with_gpu_stats=False,
        output_names=["batch_loss",],
        with_pbars=True,
        with_pbar_on_iters=True,
        log_every_iters=50,
    )

    data = [i * 0.1 for i in range(num_iters)]
    trainer.run(data, max_epochs=num_epochs)

    # check handlers
    handlers = trainer._event_handlers[Events.ITERATION_COMPLETED]
    for cls in [
        TerminateOnNan,
    ]:
        assert any([isinstance(h[0], cls) for h in handlers]), f"{handlers}"
    assert "batch_loss" in trainer.state.metrics

    # Check saved checkpoint
    if rank == 0:
        if save_handler is not None:
            dirname = save_handler.dirname
        checkpoints = list(os.listdir(dirname))
        assert len(checkpoints) == 1
        for v in [
            "training_checkpoint",
        ]:
            assert any([v in c for c in checkpoints])

    # Check LR scheduling
    assert optimizer.param_groups[0]["lr"] <= lr * gamma ** (
        num_iters * num_epochs / step_size
    ), f"{optimizer.param_groups[0]['lr']} vs {lr * gamma ** (num_iters * num_epochs / step_size)}"
Example #17
def train(
    distributed=False, local_rank=-1, lr = 6.25e-5, dataset_path='../data/personachat_self_original.json', 
    dataset_cache=cached_path('../data/personachat_self_original.json'),
    model_checkpoint='gpt2', num_candidates=2, max_history=5, train_batch_size=2, valid_batch_size=2,
    gradient_accumulation_steps=8, lm_coef=1.0, mc_coef=1.0, max_norm=1.0, n_epochs=10, 
    personality_permutations=1, eval_before_start=False, device = 'cuda' if torch.cuda.is_available() else 'cpu',
    fp16=''
    ):
    '''
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()
    
    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))
    '''
    
    args = None
    
    # Initialize distributed training if needed
    distributed = (local_rank != -1)
    if distributed:
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    #logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    print(f'{datetime.now()}: Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning')
    
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    
    # We will use 5 special tokens:
    # - <bos> to indicate the start of the sequence
    # - <eos> to indicate the end of the sequence
    # - <speaker1> to indicate the beginning and the tokens of an utterance from the user
    # - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
    # - <pad> as a padding token to build batches of sequences
    special_tokens = {
        'bos_token': '<bos>',
        'eos_token': '<eos>',
        'additional_special_tokens': ['<speaker1>', '<speaker2>'],
        'pad_token': '<pad>'
    }

    # We can add these special tokens to the vocabulary and the embeddings of the model:
    tokenizer.add_special_tokens(special_tokens)
    #model.config.num_special_tokens = len(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16)
    if distributed:
        model = DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

    #logger.info("Prepare datasets")
    print(f'{datetime.now()}: prepare datasets')
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(device) for input_tensor in batch)
        
        lm_loss, mc_loss, _, _, _ = model(*batch)
        loss = (lm_loss * lm_coef + mc_loss * mc_coef) / gradient_accumulation_steps
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        if engine.state.iteration % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            print(f'{datetime.now()}: {tokenizer.decode(input_ids[0, -1, :].tolist())}')
            model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, lr), (n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics 
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    #metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
    #                "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir='../logs')
        #tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        #tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        #tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        #checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        #trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, 
        #                          {'mymodel': getattr(model, 'module', model)})  
        # "getattr" take care of distributed encapsulation

        #torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin')
        #getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        #tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=n_epochs)
Example #18
def train(args):
    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer, _, vocab = get_kogpt2_tokenizer()
    model = get_kogpt2_model()
    model.to(args.device)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    logger.info("Prepare datasets")
    train_loader, val_loader = get_data_loaders(args, tokenizer, vocab)

    def update(engine, batch):
        model.train()

        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, labels, token_type_ids = batch

        loss, *_ = model(input_ids,
                         token_type_ids=token_type_ids,
                         labels=labels)
        loss = loss / args.gradient_accumulation_steps

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)

        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return loss.item()

    trainer = Engine(update)

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses
            logits, *_ = model(input_ids, token_type_ids=token_type_ids)
            logits_flat_shifted = logits[..., :-1, :].contiguous().view(
                -1, logits.size(-1))
            labels_flat_shifted = labels[..., 1:].contiguous().view(-1)
            return (logits_flat_shifted), (labels_flat_shifted)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0], x[1])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0], x[1]))
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    evaluator.add_event_handler(
        Events.COMPLETED, lambda _: pbar.log_message(
            "Validation: %s" % pformat(evaluator.state.metrics)))

    log_dir = make_logdir("kogpt2_personachat")
    tb_logger = TensorboardLogger(log_dir)

    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(
        evaluator,
        log_handler=OutputHandler(
            tag="validation",
            metric_names=list(metrics.keys()),
            global_step_transform=global_step_from_engine(trainer)),
        event_name=Events.EPOCH_COMPLETED)

    checkpoint_handler = ModelCheckpoint(log_dir,
                                         'checkpoint',
                                         save_interval=1,
                                         n_saved=3)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'mymodel': getattr(model, 'module', model)
         })  # "getattr" takes care of distributed encapsulation

    torch.save(args, log_dir + '/model_training_args.bin')
    getattr(model, 'module',
            model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
    # tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    # TODO: PR in ignite to have better access to saved file paths (cleaner)
    os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
              os.path.join(log_dir, WEIGHTS_NAME))
    tb_logger.close()
Example #19
def train():
    config_file = "configs/train_daily_dialog_emotion_action_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", config.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(
            input_tensor.to(config.device) for input_tensor in batch)
        lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels,
                                 token_type_ids, token_emotion_ids,
                                 token_action_ids)
        loss = (lm_loss * config.lm_coef +
                mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids,
                                  token_action_ids=token_action_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[
                1]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, config.lr),
                                 (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], config),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation

        torch.save(config,
                   tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
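Note on the distributed metrics above: `average_distributed_scalar` is not defined in this snippet. Below is a minimal sketch of what such a helper usually looks like, assuming `config` exposes `local_rank` and `device` as in the example (the name and signature are taken from the call sites above, the body is an assumption):

import torch

def average_distributed_scalar(scalar, config):
    # Average a Python scalar across processes so validation metrics agree on every rank.
    # No-op when not running distributed (sketch; assumes torch.distributed is initialized otherwise).
    if config.local_rank == -1:
        return scalar
    tensor = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    return tensor.item()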
Exemple #20
0
def train():
    os.environ['CUDA_VISIBLE_DEVICES'] = '7'

    parser = ArgumentParser()
    parser.add_argument('--gpt2', action='store_true', help="use gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="uer/gpt2-chinese-cluecorpussmall", help="Path or URL of the model")
    parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step")
    parser.add_argument('--pretrained', action='store_true', help="If False train from scratch")
    parser.add_argument("--data_path", type=str, default="data/autocloze.json",
                        help="Path or url of the dataset. ")
    parser.add_argument("--train_path", type=str, default="data/toy_train.txt",
                        help="Path of the train dataset for dist dataset. ")
    parser.add_argument("--valid_path", type=str, default="data/toy_valid.txt",
                        help="Path of the valid dataset for dist dataset. ")
    #--------------------------------------------------------------
    parser.add_argument("--dataset_cache", type=str, default="dataset_zh",
                        help="Path or url of the dataset cache")
    parser.add_argument('--log_file', '-log_file', type=str, default="", help="Output logs to a file under this path")
    parser.add_argument("--num_workers", type=int, default=8, help="Number of subprocesses for data loading")
    parser.add_argument("--n_epochs", type=int, default=40, help="Number of training epochs")
    parser.add_argument("--train_batch_size", type=int, default=1, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=1, help="Batch size for validation")
    parser.add_argument("--max_history", type=int, default=15, help="Number of previous exchanges to keep in history")
    parser.add_argument("--scheduler", type=str, default="noam", choices=['noam', 'linear'], help="method of optim")
    parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps")
    parser.add_argument("--valid_steps", type=int, default=5000, help="Perfom validation every X steps")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=64,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()
    print('cuda ',torch.cuda.is_available())
    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    '''if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    '''
    args.device = torch.device("cuda")
    print('device ',args.device)
    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    #model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel
    #config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config
    model_class = GPT2LMHeadModel
    config_class = GPT2Config
    tokenizer_class = BertTokenizer
    print('pretrained:',args.pretrained)
    if args.pretrained:
        print("----------------pretrained")
        tokenizer = BertTokenizer.from_pretrained(args.model_checkpoint, do_lower_case=True)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
        model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall",from_tf=True)
        #print('generate')
        #print(text_generator("这是很久之前的事情了", max_length=100, do_sample=True))

    #args.device=torch.device("cuda", 2)
    
    model.to(args.device)
    
    optimizer = AdamW([{'params': model.parameters(), 'initial_lr': args.lr}], lr=args.lr, correct_bias=True)

    logger.info("Prepare datasets")
    loader_class = build_dist_loaders if not args.data_path else build_dataloaders
    train_loader, val_loader, train_sampler, valid_sampler = loader_class(args, tokenizer, logger)

    logger.info("Prepare datasets ends")
    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
        model=model.module
    #if isinstance(model,torch.nn.DataParallel):
    
    #print('params:',params_count(model))

    #tokens_embed = model.transformer.get_input_embeddings()
    # Training function and trainer
    def update(engine, batch):
        input_ids, token_type_ids, lm_labels = tuple(input_tensor.to(args.device) for input_tensor in batch)
        
        #for i in range(input_ids.size()[0]):
        #    for j in range(input_ids.size()[1]):
        #        if input_ids[i,j]==-1:
        #            input_ids[i,j]=-100
        #        if lm_labels[i,j]==-1:
        #            lm_labels[i,j]=-100
        #one=torch.tensor(-100)
        #input_ids=torch.where(input_ids==-1,one,input_ids)
        #lm_labels=torch.where(lm_labels==-1,one,lm_labels)
        #print('traindata',input_ids,lm_labels)

        #lm_labels=input_ids
        r'''input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        inputs_embeds = tokens_embed(input_ids) * math.sqrt(tokens_embed.embedding_dim)'''

        model.train()
        #(lm_loss), *_ = model(inputs_embeds=inputs_embeds, labels=lm_labels,return_dict=0)
        (lm_loss), *_ = model(input_ids=input_ids, labels=lm_labels,return_dict=False)
        #print('lm_loss',lm_loss)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item(), optimizer.param_groups[0]['lr']

    trainer = Engine(update)
    

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    cntepoch=0
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            input_ids, token_type_ids, lm_labels = tuple(input_tensor.to(args.device) for input_tensor in batch)
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            #one = torch.tensor(-100)
            #input_ids=torch.where(input_ids==-1,one,input_ids)
            #print('validdata',input_ids,lm_labels)
            #lm_labels=input_ids
            r'''input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            inputs_embeds = tokens_embed(input_ids) * math.sqrt(tokens_embed.embedding_dim)'''
            

            #lm_logits, *_ = model(inputs_embeds=inputs_embeds,return_dict=0)
            lm_logits, *_ = model(input_ids=input_ids,return_dict=False)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Evaluation during training
    @trainer.on(Events.ITERATION_STARTED)
    def log_iterations(engine):
        # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0:
        if engine.state.iteration % args.valid_steps == 0:
            evaluator.run(val_loader)

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # noam decrease the learning rate
    # model_size = model.config.n_embd
    model_size = args.n_emd
    noam_lambda = lambda step: (
            model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * args.warmup_steps ** (-1.5)))
    noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0], x[1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints
    # And save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True, mininterval=2)
        pbar.attach(trainer, metric_names=["loss", "lr"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                              another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=6)
        # save model after evaluation
        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
            'mymodel': getattr(model, 'module', model)})
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
            'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.logdir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)
    
    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1],
                  os.path.join(tb_logger.writer.logdir,
                               WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
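Note on the "noam" option above: the LambdaLR factor grows roughly linearly for `warmup_steps` iterations and then decays as `step ** -0.5`; the optimizer's base lr is multiplied by this factor. A self-contained way to inspect the shape (the constants below simply mirror the defaults above and are otherwise arbitrary):

# Pure-Python sketch of the Noam factor used by the scheduler above.
model_size, warmup_steps = 768, 5000
noam = lambda step: model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * warmup_steps ** (-1.5))
for step in (0, 2500, 4999, 10000, 50000):
    print(step, noam(step))  # rises until ~warmup_steps, then decays as step ** -0.5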
Exemple #21
0
def main():
  parser = argparse.ArgumentParser()

  # Required parameters
  parser.add_argument("--model", type=str, default='ffn', help="model's name")
  parser.add_argument("--mode", type=int, choices=[0, 1, 2], default=None)
  parser.add_argument("--SNRdb", type=float, default=None)
  parser.add_argument("--pilot_version", type=int, choices=[1, 2], default=1)
  parser.add_argument("--loss_type", type=str, default="BCELoss")
  parser.add_argument("--train_batch_size", type=int, default=128)
  parser.add_argument("--valid_batch_size", type=int, default=128)
  parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
  parser.add_argument("--max_norm", type=float, default=-1)
  parser.add_argument("--lr", type=float, default=1e-3)
  parser.add_argument("--noise_lambda", type=float, default=1.0)
  parser.add_argument("--lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default="linear")
  parser.add_argument("--reset_lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default=None)
  parser.add_argument("--reset_trainer", action='store_true')
  parser.add_argument("--modify_model", action='store_true')
  parser.add_argument("--wd", type=float, default=1e-4, help="weight decay")
  parser.add_argument("--eval_iter", type=int, default=10)
  parser.add_argument("--save_iter", type=int, default=10)
  parser.add_argument("--n_epochs", type=int, default=10)
  parser.add_argument("--flush_dataset", type=int, default=0)
  parser.add_argument("--no_cache", action='store_true')
  parser.add_argument("--with_pure_y", action='store_true') 
  parser.add_argument("--with_h", action='store_true') 
  parser.add_argument("--only_l1", action='store_true', help="Only loss 1")
  parser.add_argument("--interpolation", action='store_true', help="if interpolate between pure and reconstruction.") 
  parser.add_argument("--data_dir", type=str, default="data")
  parser.add_argument("--cache_dir", type=str, default="train_cache")
  parser.add_argument("--output_path", type=str, default="runs", help="model save")
  parser.add_argument("--resume_from", type=str, default=None, help="resume training.")
  parser.add_argument("--first_cache_index", type=int, default=0)
  parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                      help="Device (cuda or cpu)")
  parser.add_argument("--local_rank", type=int, default=-1,
                      help="Local rank for distributed training (-1: not distributed)")
  parser.add_argument("--seed", type=int, default=43)
  parser.add_argument("--debug", action='store_true')
  args = parser.parse_args()

  args.output_path = os.path.join(args.output_path, f'pilot_{args.pilot_version}')
  args.cache_dir = os.path.join(args.data_dir, args.cache_dir)
  # Setup CUDA, GPU & distributed training
  args.distributed = (args.local_rank != -1)
  if not args.distributed:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method='env://')
  args.n_gpu = torch.cuda.device_count() if not args.distributed else 1
  args.device = device

  # Set seed
  set_seed(args)
  logger = setup_logger("trainer", distributed_rank=args.local_rank)

  # Model construction
  model = getattr(models, args.model)(args)
  model = model.to(device)
  optimizer = AdamW(model.parameters(), lr = args.lr, weight_decay=args.wd)

  if args.loss_type == "MSELoss":
    criterion = nn.MSELoss(reduction='sum').to(device)
  else:
    criterion = getattr(nn, args.loss_type, getattr(auxiliary, args.loss_type, None))().to(device)
  criterion2 = nn.MSELoss(reduction='sum').to(device)

  if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
    )

  train_dataset = SIGDataset(args, data_type="train")
  valid_dataset = SIGDataset(args, data_type="valid")
  train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
  valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
  train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, pin_memory=True, shuffle=(not args.distributed))
  valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, pin_memory=True, shuffle=False)
  
  lr_scheduler = None
  if args.lr_scheduler == "linear":
    lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
  elif args.lr_scheduler == "cycle":
    lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader))
  elif args.lr_scheduler == "cosine":
    lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader))

  # Training function and trainer
  def update(engine, batch):
      model.train()
      y, x_label, y_pure, H = train_dataset.prepare_batch(batch, device=args.device)

      if args.with_pure_y and args.with_h:
        x_pred, y_pure_pred, H_pred = model(y, pure=y_pure, H=H, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        if args.loss_type == "MSELoss":
          loss_1 = loss_1 / x_pred.size(0)
        loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps
        loss_noise_h = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps
        if args.only_l1:
          loss = loss_1
        else:
          loss = loss_1 + loss_noise * args.noise_lambda + loss_noise_h
        output = (loss.item(), loss_1.item(), loss_noise.item(), loss_noise_h.item())
      elif args.with_pure_y:
        x_pred, y_pure_pred = model(y, pure=y_pure if args.interpolation else None, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps
        loss = loss_1 + loss_noise * args.noise_lambda
        output = (loss.item(), loss_1.item(), loss_noise.item())
      elif args.with_h:
        x_pred, H_pred = model(y, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss_noise = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps
        loss = loss_1 + loss_noise * args.noise_lambda
        output = (loss.item(), loss_1.item(), loss_noise.item())
      else:
        x_pred = model(y)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss = loss_1
        output = (loss.item(), loss_1.item(), torch.zeros_like(loss_1).item())

      loss.backward()
      if args.max_norm > 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
      if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
      return output
  trainer = Engine(update)

  to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
  metric_names = ["loss", "l1", "ln"]
  if args.with_pure_y and args.with_h:
    metric_names.append("lnH")

  common.setup_common_training_handlers(
    trainer=trainer,
    train_sampler=train_loader.sampler,
    to_save=to_save,
    save_every_iters=len(train_loader) * args.save_iter,
    lr_scheduler=lr_scheduler,
    output_names=metric_names,
    with_pbars=False,
    clear_cuda_cache=False,
    output_path=args.output_path,
    n_saved=2,
  )

  resume_from = args.resume_from
  if resume_from is not None:
    checkpoint_fp = Path(resume_from)
    assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
    logger.info("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
    checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu")
    if args.reset_trainer:
      to_save.pop("trainer")
    checkpoint_to_load = to_save if 'validation' not in resume_from else {"model": model}
    Checkpoint.load_objects(to_load=checkpoint_to_load, checkpoint=checkpoint)
    if args.reset_lr_scheduler is not None:
      if args.reset_lr_scheduler == "linear":
        lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
      elif args.reset_lr_scheduler == "cycle":
        lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader))
      elif args.reset_lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader))

  metrics = {
    "accuracy": Accuracy(lambda output: (torch.round(output[0][0]), output[1][0])), 
    "loss_1": Loss(criterion, output_transform=lambda output: (output[0][0], output[1][0])),
    "loss_noise": Loss(criterion2, output_transform=lambda output: (output[0][1], output[1][1]))
  }
  if args.with_pure_y and args.with_h:
    metrics["loss_noise_h"] = Loss(criterion2, output_transform=lambda output: (output[0][2], output[1][2]))

  def _inference(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]:
    model.eval()
    with torch.no_grad():
      x, y, x_pure, H = valid_dataset.prepare_batch(batch, device=args.device, non_blocking=True)
      if args.with_pure_y and args.with_h:
        y_pred, x_pure_pred, h_pred = model(x, opp=True)
        outputs = (y_pred, x_pure_pred, h_pred), (y, x_pure, H)
      elif args.with_pure_y:
        y_pred, x_pure_pred = model(x, opp=True)
        outputs = (y_pred, x_pure_pred), (y, x_pure)
      elif args.with_h:
        y_pred, h_pred = model(x, opp=True)
        outputs = (y_pred, h_pred), (y, H)
      else:
        y_pred = model(x)
        x_pure_pred = x_pure
        outputs = (y_pred, x_pure_pred), (y, x_pure)       
      return outputs
  evaluator = Engine(_inference)
  for name, metric in metrics.items():
      metric.attach(evaluator, name)

  trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.eval_iter), lambda _: evaluator.run(valid_loader))

  if args.flush_dataset > 0:
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.n_epochs//args.flush_dataset), 
                  lambda _: train_loader.dataset.reset() if args.no_cache else train_loader.dataset.reload())

  # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
  if args.local_rank in [-1, 0]:
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=metric_names, output_transform=lambda _: {"lr": f"{optimizer.param_groups[0]['lr']:.2e}"})
    evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    tb_logger = common.setup_tb_logging(args.output_path, trainer, optimizer, evaluators={'validation': evaluator}, log_every_iters=1)

  # Store 3 best models by validation accuracy:
  common.gen_save_best_models_by_val_score(
    save_handler=DiskSaver(args.output_path, require_empty=False),
    evaluator=evaluator,
    models={"model": model},
    metric_name="accuracy",
    n_saved=3,
    trainer=trainer,
    tag="validation"
  )

  # Run the training
  trainer.run(train_loader, max_epochs=args.n_epochs)

  if args.local_rank in [-1, 0]:
    tb_logger.close()
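Note on `Events.EPOCH_COMPLETED(every=args.eval_iter)` above: ignite event filters let a handler fire only on every N-th occurrence of an event. A toy, self-contained sketch (assuming an ignite version that supports event filters, which the example above already relies on):

from ignite.engine import Engine, Events

toy_trainer = Engine(lambda engine, batch: None)  # no-op update function, only here to demonstrate the filter

@toy_trainer.on(Events.EPOCH_COMPLETED(every=2))
def periodic_validation(engine):
    print(f"epoch {engine.state.epoch}: evaluator.run(valid_loader) would go here")

toy_trainer.run(range(4), max_epochs=6)  # the handler fires at epochs 2, 4 and 6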
Exemple #22
0
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_checkpoint", type=str, default=PRETRAINED_MODEL_URL, help="Path to the pretrained model checkpoint")
    parser.add_argument("--dataset_path", type=str, default='../data/sst', help="Directory to dataset.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path to dataset cache")
    parser.add_argument("--logdir", type=str, default='./transformer_results', help="Path to logs")
    parser.add_argument("--num_classes", type=int, default=5, help="Number of classes for the target classification task")
    parser.add_argument("--adapters_dim", type=int, default=-1, help="If >0 add adapters to the model with adapters_dim dimension")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout for transformer module")
    parser.add_argument("--clf_loss_coef", type=float, default=1, help="If >0 add a classification loss")
    parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=32, help="Batch size for validation")
    parser.add_argument("--valid_pct", type=float, default=0.1, help="Percentage of training data to use for validation")
    parser.add_argument("--lr", type=float, default=6.5e-5, help="Learning rate")
    parser.add_argument("--n_warmup", type=int, default=10, help="Number of warmup iterations")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--gradient_acc_steps", type=int, default=2, help="Number of update steps to accumulate before a backward pass.")
    parser.add_argument("--init_range", type=float, default=0.02, help="Normal initialization standard deviation")

    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    args = parser.parse_args()

    # Define pretrained model and optimizer
    model, state_dict, config = load_pretrained_model(args)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=False)
    num_model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Model has {num_model_params:,} parameters")
    # Define datasets
    datasets = read_sst5(args.dataset_path)

    # Define labels
    labels = list(set(datasets["train"][LABEL_COL].tolist()))
    assert len(labels) == args.num_classes  # Specified number of classes should be equal to that in the given dataset!
    label2int = {label: i for i, label in enumerate(labels)}
    int2label = {i: label for label, i in label2int.items()}

    # Get BertTokenizer for this pretrained model
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    clf_token = tokenizer.vocab['[CLS]']  # classifier token
    pad_token = tokenizer.vocab['[PAD]']  # pad token
    processor = TextProcessor(tokenizer, label2int, clf_token, pad_token, max_length=config.num_max_positions)

    train_dl = processor.create_dataloader(datasets["train"],
                                           shuffle=True,
                                           batch_size=args.train_batch_size,
                                           valid_pct=None)

    valid_dl = processor.create_dataloader(datasets["dev"],
                                           batch_size=args.train_batch_size,
                                           valid_pct=None)

    test_dl = processor.create_dataloader(datasets["test"],
                                          batch_size=args.valid_batch_size,
                                          valid_pct=None)

    # Training function and trainer
    def update(engine, batch):
        "update function for training"
        model.train()
        inputs, labels = (t.to(args.device) for t in batch)
        inputs = inputs.transpose(0, 1).contiguous()  # to shape [seq length, batch]
        _, loss = model(inputs,
                        clf_tokens_mask=(inputs == clf_token),
                        clf_labels=labels)
        loss = loss / args.gradient_acc_steps
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch, labels = (t.to(args.device) for t in batch)
            inputs = batch.transpose(0, 1).contiguous()  # to shape [seq length, batch]
            clf_logits = model(inputs,
                               clf_tokens_mask=(inputs == clf_token),
                               padding_mask=(batch == pad_token))
        return clf_logits, labels
    evaluator = Engine(inference)

    # add metric to evaluator
    Accuracy().attach(evaluator, "accuracy")

    # add evaluator to trainer: eval on valid set after each epoch
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(valid_dl)
        print(f"validation epoch: {engine.state.epoch} acc: {100*evaluator.state.metrics['accuracy']:.3f}%")

    # Learning rate schedule: linearly warm-up to lr and then to zero
    scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 0.0), (args.n_warmup, args.lr),
                                (len(train_dl) * args.n_epochs, 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Add progressbar with loss
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

    # Save checkpoints and finetuning config
    checkpoint_handler = ModelCheckpoint(args.logdir, 'checkpoint',
                                         save_interval=1, require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'sst_model': model})

    # Save metadata
    torch.save({
        "config": config,
        "config_ft": args,
        "int2label": int2label
    }, os.path.join(args.logdir, "model_training_args.bin"))

    # Run trainer
    trainer.run(train_dl, max_epochs=args.n_epochs)
    # Evaluate
    evaluator.run(test_dl)
    print(f"test results - acc: {100*evaluator.state.metrics['accuracy']:.3f}")
    # Save fine-tuned model weights
    torch.save(model.state_dict(), os.path.join(args.logdir, "model_weights.pth"))
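Note on the warm-up schedule above: `PiecewiseLinear` linearly interpolates the parameter between the given (event, value) milestones. A throwaway sketch for inspecting the resulting values outside of a real training run (the milestone numbers here are illustrative assumptions, not the script's actual settings):

import torch
from ignite.contrib.handlers import PiecewiseLinear

dummy_optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=6.5e-5)
schedule = PiecewiseLinear(dummy_optimizer, "lr", [(0, 0.0), (10, 6.5e-5), (1000, 0.0)])
for it in range(1001):
    schedule(None)  # what attaching the scheduler to Events.ITERATION_STARTED does once per iteration
    if it in (0, 10, 500, 1000):
        print(it, dummy_optimizer.param_groups[0]["lr"])  # 0.0 -> peak lr -> back towards 0.0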
Exemple #23
0
class FastaiLRFinder:
    """Learning rate finder handler for supervised trainers.

    While attached, the handler increases the learning rate in between two
    boundaries in a linear or exponential manner. It provides valuable
    information on how well the network can be trained over a range of learning
    rates and what can be an optimal learning rate.

    Examples:

    .. code-block:: python

        from ignite.contrib.handlers import FastaiLRFinder

        trainer = ...
        model = ...
        optimizer = ...

        lr_finder = FastaiLRFinder()
        to_save = {"model": model, "optimizer": optimizer}

        with lr_finder.attach(trainer, to_save=to_save) as trainer_with_lr_finder:
            trainer_with_lr_finder.run(dataloader)

        # Get lr_finder results
        lr_finder.get_results()

        # Plot lr_finder results (requires matplotlib)
        lr_finder.plot()

        # get lr_finder suggestion for lr
        lr_finder.lr_suggestion()


    Note:
        When the context manager is exited, all LR finder handlers are removed.

    Note:
        Please also keep in mind that all other handlers attached to the trainer will be executed during the LR finder's run.

    Note:
        This class may require `matplotlib` package to be installed to plot learning rate range test:

        .. code-block:: bash

            pip install matplotlib


    References:

        Cyclical Learning Rates for Training Neural Networks:
        https://arxiv.org/abs/1506.01186

        fastai/lr_find: https://github.com/fastai/fastai
    """
    def __init__(self):
        self._diverge_flag = False
        self._history = None
        self._best_loss = None
        self._lr_schedule = None
        self.logger = logging.getLogger(__name__)

    def _run(self, trainer, optimizer, output_transform, num_iter, end_lr,
             step_mode, smooth_f, diverge_th):

        self._history = {"lr": [], "loss": []}
        self._best_loss = None
        self._diverge_flag = False

        # attach LRScheduler to trainer.
        if num_iter is None:
            num_iter = trainer.state.epoch_length * trainer.state.max_epochs
        else:
            max_iter = trainer.state.epoch_length * trainer.state.max_epochs
            if num_iter > max_iter:
                warnings.warn(
                    "Desired num_iter {} is unreachable with the current run setup of {} iteration "
                    "({} epochs)".format(num_iter, max_iter,
                                         trainer.state.max_epochs),
                    UserWarning,
                )

        if not trainer.has_event_handler(self._reached_num_iterations):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._reached_num_iterations, num_iter)

        # attach loss and lr logging
        if not trainer.has_event_handler(self._log_lr_and_loss):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._log_lr_and_loss, output_transform,
                                      smooth_f, diverge_th)

        self.logger.debug(
            "Running LR finder for {} iterations".format(num_iter))
        # Initialize the proper learning rate policy
        if step_mode.lower() == "exp":
            self._lr_schedule = LRScheduler(
                _ExponentialLR(optimizer, end_lr, num_iter))
        else:
            start_lr = optimizer.param_groups[0]["lr"]
            self._lr_schedule = PiecewiseLinear(optimizer,
                                                param_name="lr",
                                                milestones_values=[
                                                    (0, start_lr),
                                                    (num_iter, end_lr)
                                                ])
        if not trainer.has_event_handler(self._lr_schedule):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._lr_schedule, num_iter)

    def _reset(self, trainer):
        self.logger.debug("Completed LR finder run")
        trainer.remove_event_handler(self._lr_schedule,
                                     Events.ITERATION_COMPLETED)
        trainer.remove_event_handler(self._log_lr_and_loss,
                                     Events.ITERATION_COMPLETED)
        trainer.remove_event_handler(self._reached_num_iterations,
                                     Events.ITERATION_COMPLETED)

    def _log_lr_and_loss(self, trainer, output_transform, smooth_f,
                         diverge_th):
        output = trainer.state.output
        loss = output_transform(output)
        lr = self._lr_schedule.get_param()
        self._history["lr"].append(lr)
        if trainer.state.iteration == 1:
            self._best_loss = loss
        else:
            if smooth_f > 0:
                loss = smooth_f * loss + (1 -
                                          smooth_f) * self._history["loss"][-1]
            if loss < self._best_loss:
                self._best_loss = loss
        self._history["loss"].append(loss)

        # Check if the loss has diverged; if it has, stop the trainer
        if self._history["loss"][-1] > diverge_th * self._best_loss:
            self._diverge_flag = True
            self.logger.info("Stopping early, the loss has diverged")
            trainer.terminate()

    def _reached_num_iterations(self, trainer, num_iter):
        if trainer.state.iteration > num_iter:
            trainer.terminate()

    def _warning(self, _):
        if not self._diverge_flag:
            warnings.warn(
                "Run completed without loss diverging, increase end_lr, decrease diverge_th or look"
                " at lr_finder.plot()",
                UserWarning,
            )

    def _detach(self, trainer):
        """
        Detaches lr_finder from trainer.

        Args:
            trainer: the trainer to detach from.
        """

        if trainer.has_event_handler(self._run, Events.STARTED):
            trainer.remove_event_handler(self._run, Events.STARTED)
        if trainer.has_event_handler(self._warning, Events.COMPLETED):
            trainer.remove_event_handler(self._warning, Events.COMPLETED)
        if trainer.has_event_handler(self._reset, Events.COMPLETED):
            trainer.remove_event_handler(self._reset, Events.COMPLETED)

    def get_results(self):
        """
        Returns: dictionary with loss and lr logs from the previous run
        """
        return self._history

    def plot(self, skip_start=10, skip_end=5, log_lr=True):
        """Plots the learning rate range test.

        This method requires `matplotlib` package to be installed:

        .. code-block:: bash

            pip install matplotlib

        Args:
            skip_start (int, optional): number of batches to trim from the start.
                Default: 10.
            skip_end (int, optional): number of batches to trim from the end.
                Default: 5.
            log_lr (bool, optional): True to plot the learning rate in a logarithmic
                scale; otherwise, plotted in a linear scale. Default: True.
        """
        try:
            from matplotlib import pyplot as plt
        except ImportError:
            raise RuntimeError(
                "This method requires matplotlib to be installed. "
                "Please install it with command: \n pip install matplotlib")

        if self._history is None:
            raise RuntimeError(
                "learning rate finder didn't run yet so results can't be plotted"
            )

        if skip_start < 0:
            raise ValueError("skip_start cannot be negative")
        if skip_end < 0:
            raise ValueError("skip_end cannot be negative")

        # Get the data to plot from the history dictionary. Also, handle skip_end=0
        # properly so the behaviour is as expected

        lrs = self._history["lr"]
        losses = self._history["loss"]
        if skip_end == 0:
            lrs = lrs[skip_start:]
            losses = losses[skip_start:]
        else:
            lrs = lrs[skip_start:-skip_end]
            losses = losses[skip_start:-skip_end]

        # Plot loss as a function of the learning rate
        plt.plot(lrs, losses)
        if log_lr:
            plt.xscale("log")
        plt.xlabel("Learning rate")
        plt.ylabel("Loss")
        plt.show()

    def lr_suggestion(self):
        """
        Returns: learning rate at the minimum numerical gradient
        """
        if self._history is None:
            raise RuntimeError(
                "learning rate finder didn't run yet so lr_suggestion can't be returned"
            )
        loss = self._history["loss"]
        grads = torch.tensor(
            [loss[i] - loss[i - 1] for i in range(1, len(loss))])
        min_grad_idx = grads.argmin() + 1
        return self._history["lr"][int(min_grad_idx)]

    @contextlib.contextmanager
    def attach(
        self,
        trainer,
        to_save,
        output_transform=lambda output: output,
        num_iter=None,
        end_lr=10.0,
        step_mode="exp",
        smooth_f=0.05,
        diverge_th=5.0,
    ):
        """Attaches lr_finder to a given trainer. It also resets model and optimizer at the end of the run.

        Usage:

        .. code-block:: python

            to_save = {"model": model, "optimizer": optimizer}
            with lr_finder.attach(trainer, to_save=to_save) as trainer_with_lr_finder:
                trainer_with_lr_finder.run(dataloader)

        Args:
            trainer (Engine): lr_finder is attached to this trainer. Please, keep in mind that all attached handlers
                will be executed.
            to_save (Mapping): dictionary with optimizer and other objects that need to be restored after running
                the LR finder. For example, `to_save={'optimizer': optimizer, 'model': model}`. All objects should
                implement `state_dict` and `load_state_dict` methods.
            output_transform (callable, optional): function that transforms the trainer's `state.output` after each
                iteration. It must return the loss of that iteration.
            num_iter (int, optional): number of iterations for lr schedule between base lr and end_lr. Default, it will
                run for `trainer.state.epoch_length * trainer.state.max_epochs`.
            end_lr (float, optional): upper bound for lr search. Default, 10.0.
            step_mode (str, optional): "exp" or "linear", which way should the lr be increased from optimizer's initial
                lr to `end_lr`. Default, "exp".
            smooth_f (float, optional): loss smoothing factor in range `[0, 1)`. Default, 0.05
            diverge_th (float, optional): Used for stopping the search when `current loss > diverge_th * best_loss`.
                Default, 5.0.

        Notes:
            lr_finder cannot be attached to more than one trainer at a time

        Returns:
            trainer_with_lr_finder: trainer used for finding the lr
        """
        if not isinstance(to_save, Mapping):
            raise TypeError(
                "Argument to_save should be a mapping, but given {}".format(
                    type(to_save)))

        Checkpoint._check_objects(to_save, "state_dict")
        Checkpoint._check_objects(to_save, "load_state_dict")

        if "optimizer" not in to_save:
            raise ValueError("Mapping to_save should contain 'optimizer' key")

        if not isinstance(to_save["optimizer"], torch.optim.Optimizer):
            raise ValueError(
                "Object to_save['optimizer'] should be torch optimizer, but given {}"
                .format(type(to_save["optimizer"])))

        if smooth_f < 0 or smooth_f >= 1:
            raise ValueError("smooth_f is outside the range [0, 1]")
        if diverge_th < 1:
            raise ValueError("diverge_th should be larger than 1")
        if step_mode not in ["exp", "linear"]:
            raise ValueError(
                "step_mode should be 'exp' or 'linear', but given {}".format(
                    step_mode))
        if num_iter is not None and (not isinstance(num_iter, int)
                                     or num_iter <= 0):
            raise ValueError(
                "if provided, num_iter should be a positive integer, but given {}"
                .format(num_iter))

        # store to_save
        with tempfile.TemporaryDirectory() as tmpdirname:
            obj = {k: o.state_dict() for k, o in to_save.items()}
            cache_filepath = Path(tmpdirname) / "ignite_lr_finder_cache.pt.tar"
            torch.save(obj, cache_filepath.as_posix())

            optimizer = to_save["optimizer"]
            # Attach handlers
            if not trainer.has_event_handler(self._run):
                trainer.add_event_handler(
                    Events.STARTED,
                    self._run,
                    optimizer,
                    output_transform,
                    num_iter,
                    end_lr,
                    step_mode,
                    smooth_f,
                    diverge_th,
                )
            if not trainer.has_event_handler(self._warning):
                trainer.add_event_handler(Events.COMPLETED, self._warning)
            if not trainer.has_event_handler(self._reset):
                trainer.add_event_handler(Events.COMPLETED, self._reset)

            yield trainer
            self._detach(trainer)
            # restore to_save and reset trainer's state
            obj = torch.load(cache_filepath.as_posix())
            trainer.state = None
            for k, o in obj.items():
                to_save[k].load_state_dict(o)
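Note on `_ExponentialLR` above: it is an ignite-internal schedule that is not part of this snippet. Below is a minimal sketch of an equivalent exponential range-test schedule built on torch's `_LRScheduler` (the class name and details are assumptions for illustration, not the library's implementation):

import torch
from torch.optim.lr_scheduler import _LRScheduler

class ExponentialRangeLR(_LRScheduler):  # hypothetical stand-in for the internal _ExponentialLR
    """Exponentially grow each param group's lr from its (positive) base value towards end_lr over num_iter steps."""

    def __init__(self, optimizer, end_lr, num_iter, last_epoch=-1):
        self.end_lr = end_lr
        self.num_iter = num_iter
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        r = self.last_epoch / self.num_iter  # fraction of the range test completed so far
        return [base_lr * (self.end_lr / base_lr) ** r for base_lr in self.base_lrs]

Wrapped in ignite's LRScheduler and attached to Events.ITERATION_COMPLETED, such a schedule plays the same role as the "exp" step_mode branch in `_run` above.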
Exemple #24
0
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3."
    )
    parser.add_argument(
        "--logdir", type=str, default=None, help="If provided, the model will be output to this folder."
    )
    parser.add_argument("--dataset_cache", type=str, default="./dataset_cache", help="Path or url of the dataset cache")
    parser.add_argument("--use_mlflow", action="store_true", help="If true we enable mlflow")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")

    parser.add_argument(
        "--tracking_uri", type=str, default="http://localhost:5000", help="url for mlflow tracking server"
    )
    parser.add_argument("--num_candidates", type=int, default=5, help="Number of candidates for training")

    parser.add_argument("--experiment", type=str, help="experiment name for mlflow")

    parser.add_argument("--task_config", type=str, help="Path to the tokenization config file")
    parser.add_argument("--special_tokens_file", type=str, default=None, help="Path to the special tokens file")
    parser.add_argument(
        "--model_checkpoint", type=str, default="distilgpt2", help="Path, url or short name of the model"
    )
    parser.add_argument("--model_type", type=str, default=None, help="gpt or gpt2")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=1, help="Batch size for validation")
    parser.add_argument(
        "--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps"
    )

    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--adam_epsilon", type=float, default=1e-6, help="Learning rate")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--patience", type=int, default=1, help="patience parameter for early stopping")
    parser.add_argument("--n_epochs", type=int, default=10, help="Number of training epochs")
    parser.add_argument("--max_data", type=int, default=0, help="Number of data items (0 includes everything)")
    parser.add_argument(
        "--val_max_data", type=int, default=0, help="Number of validation data items (0 includes everything)"
    )
    parser.add_argument(
        "--eval_before_start", action="store_true", help="If true start with a first evaluation before training"
    )
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_true",
        help="If true, and the logdir is explictly passed, it will be overwritten",
    )
    parser.add_argument("--ul", action="store_true", help="If true use unlikelihood sampling")
    parser.add_argument("--freeze", action="store_true", help="If true freeze layers")
    parser.add_argument("--smoothing", type=float, default=0.0, help="label smoothing epsilon")
    parser.add_argument("--ignore_cache", action="store_true", help="If true ignore the dataset cache")
    parser.add_argument(
        "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)"
    )
    parser.add_argument(
        "--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)"
    )
    parser.add_argument(
        "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)"
    )
    parser.add_argument("--warmup-steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    # custom training
    parser.add_argument("--sequence-tune-rate", type=float, default=0.5)
    parser.add_argument("--sequence-ngram-n", type=int, default=4)
    parser.add_argument(
        "--multitask", action="store_true", help="If true use multitask training with multiple choice loss"
    )
    parser.add_argument(
        "--retrain_base",
        type=str,
        default=None,
        help="JSON file with training parameters or MLflow run_id from which to get the parameters for retraining",
    )
    parser.add_argument(
        "--training_args_file",
        type=str,
        default=None,
        help="File with the training arguments generated by a previous run to use as parameters",
    )
    parser.add_argument("--scheduler", type=str, default="piecewiselinear", help="scheduler choice")
    parser.add_argument("--optimizer", type=str, default="AdamW", help="optimizer choice")
    parser.add_argument(
        "--max_block_size", type=int, default=None, help="If set, data is truncated to fit this max size"
    )

    args = parser.parse_args()

    if args.retrain_base:
        try:
            logger.info(f"reading the arguments from {args.retrain_base}")
            model_training_args = json.load(open(args.retrain_base))
        except Exception:
            # Not a readable JSON file: treat retrain_base as an MLflow run_id
            model_training_args = load_training_args(args.retrain_base)

        passed_args = [x[2:] for x in sys.argv if x.startswith("--")]
        # this is set by pytorch
        passed_args.extend(["ignore_cache", "local_rank"])

        for key, value in model_training_args.items():
            # we only update an argument if it's not passed explicitly
            if key not in passed_args:
                if value:
                    args.__setattr__(key, value)
        logger.info(vars(args))

    if args.logdir is None:
        args.logdir = Path(f"runs/{get_curr_time()}")
    else:
        args.logdir = Path(args.logdir)
        if not is_empty_or_absent_dir(args.logdir) and not args.overwrite_output_dir:
            logger.error(f"Error: {args.logdir} is not empty and you did not pass --overwrite_output_dir as True")
            exit()
        else:
            if args.local_rank in [-1, 0]:
                logger.info(f"deleting the existing folder {args.logdir}")
                try:
                    rmtree(args.logdir)
                except OSError:
                    # The folder may not exist yet
                    pass

    logger.info(f"outputting model to {args.logdir}")
    try:

        def finalize():

            if args.local_rank not in [-1, 0]:
                # Wait until the main process has finished finalizing the checkpoint
                torch.distributed.barrier()

            if args.local_rank in [-1, 0] and args.n_epochs > 0:
                try:
                    # On the main process: rename the last checkpoint
                    # (for easy re-loading with from_pretrained method)
                    os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(args.logdir, WEIGHTS_NAME))

                    if args.use_mlflow:
                        mlflow.log_artifact(args.logdir / WEIGHTS_NAME, "training")
                        logger.info("ending mlflow run")
                        logger.info(f"run_id: {mlflow.active_run().info.run_id}")
                        mlflow.end_run()

                        rmtree(args.logdir)

                except Exception:
                    logger.info("No checkpoint to finalize the model. Deleting run")
                    # TODO: fix issue in mlflow trying to delete the experiment multiple times
                    mlflow.delete_run(mlflow.active_run().info.run_id)
                    rmtree(args.logdir)

                if args.local_rank == 0:
                    torch.distributed.barrier()

        args.logdir.mkdir(parents=True, exist_ok=True)
        TRAINING_ARGS_FILE = args.logdir / "model_training_args.json"
        args_dict = deepcopy(vars(args))
        args_dict["logdir"] = str(args_dict["logdir"])
        json.dump(args_dict, open(TRAINING_ARGS_FILE, "w"), indent=2)

        if args.use_mlflow:
            if args.local_rank in [-1, 0]:
                assert args.tracking_uri
                assert args.experiment
                mlflow.set_tracking_uri(args.tracking_uri)
                mlflow.set_experiment(args.experiment)
                mlflow.start_run()

                # Log parameters
                mlflow.log_params(vars(args))
                # Log training arguments into a file
                mlflow.log_artifact(TRAINING_ARGS_FILE, "training")

        # The maximum number of validation items should not exceed the training maximum (used while debugging)
        if args.val_max_data == 0 and args.max_data > 0:
            args.val_max_data = args.max_data

        # Logging is set to INFO (resp. WARN) for main (resp. auxiliary)
        # process. logger.info => log main process only, logger.warning => log all processes
        logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
        # This is a logger.warning: it will be printed by all distributed processes
        logger.warning("Running process %d", args.local_rank)

        # Initialize distributed training if needed
        args.distributed = args.local_rank != -1

        if args.distributed:
            torch.cuda.set_device(args.local_rank)
            args.device = torch.device("cuda", args.local_rank)
            torch.distributed.init_process_group(backend="nccl", init_method="env://")

        logger.info(f"Reading the task configuration: {args.task_config}")
        copyfile(args.task_config, args.logdir / "task_config.json")
        task_config = load_task_config(args.task_config)

        logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")

        model_directory, is_local = get_model_directory(args.model_checkpoint)

        model, tokenizer = load_pretrained(
            model_directory,
            model_type=args.model_type,
            smoothing=args.smoothing,
            multitask=args.multitask,
            special_tokens_file=args.special_tokens_file,
            task_config=task_config,
            dataset_path=args.dataset_path,
        )

        special_tokens = read_special_tokens(
            task_config=task_config,
            special_tokens_file=args.special_tokens_file,
            dataset_path=args.dataset_path
        )
        logger.info(f"adding {len(special_tokens)}")
        tokenizer.add_tokens(special_tokens)

        model.resize_token_embeddings(len(tokenizer))

        model.to(args.device)

        if args.freeze:
            transformer = list(model.children())[0]
            i = 0
            for param in transformer.parameters():
                param.requires_grad = False
                i += 1
                if i >= len(list(transformer.parameters())) // 2:
                    break

        if args.optimizer.lower() == "rmsprop":
            optimizer = RMSprop(model.parameters(), lr=args.lr)
        elif args.optimizer.lower() == "adam":
            optimizer = Adam(model.parameters(), lr=args.lr)
        elif args.optimizer.lower() == "adafactor":
            optimizer = Adafactor(model.parameters(), lr=args.lr, warmup_init=False)
        elif args.optimizer.lower() == "sgd":
            optimizer = SGD(model.parameters(), lr=args.lr)
        elif args.optimizer.lower() == "novograd":
            optimizer = Novograd(model.parameters(), lr=args.lr)
        else:
            optimizer = AdamW(model.parameters(), lr=args.lr)

        # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
        if args.fp16:
            from apex import amp  # Apex is only required if we use fp16 training

            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)

        if args.distributed:
            model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

        logger.info("Prepare datasets")
        train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, task_config, tokenizer)

        def named_batch(batch, with_labels=True):
            """Helper function so that we get a dictionary with key as the input name and the value as the input value. 
            This makes it easier to pass parameters to the model by their name, without caring about the order
            """
            named_batch = {}
            # The components in the batch are ordered as in MODEL_INPUTS
            i = 0
            for input_name in MODEL_INPUTS:

                if not with_labels and "labels" in input_name:
                    continue

                key = input_name
                if not args.multitask:
                    if "mc_" in input_name:
                        continue
                    # the field is called `lm_labels` in the DoubleHeads and `labels` in single head model
                    if input_name == "lm_labels":
                        key = "labels"

                named_batch[key] = batch[i]
                i += 1
            return named_batch

        # Training function and trainer
        def update(engine, batch):
            model.train()

            n_batch = named_batch(tuple(input_tensor.to(args.device) for input_tensor in batch))

            outputs = model(**n_batch)

            lm_loss = outputs[0]
            if args.multitask:
                mc_loss = outputs[1]
            else:
                mc_loss = 0

            loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
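            # Scaling by gradient_accumulation_steps keeps gradients comparable to a
            # single large batch: optimizer.step() only runs every
            # gradient_accumulation_steps iterations, so the effective batch size is
            # roughly train_batch_size * gradient_accumulation_steps per process
            # (4 * 8 = 32 with the default arguments).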

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
            if engine.state.iteration % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            return loss.item()

        trainer = Engine(update)

        # Evaluation function and evaluator (evaluator output is the input of the metrics)
        def inference(engine, batch):
            model.eval()
            with torch.no_grad():
                n_batch = named_batch(tuple(input_tensor.to(args.device) for input_tensor in batch))
                outputs = model(**{key: n_batch[key] for key in n_batch if "labels" not in key})
                lm_logits = outputs[0]
                lm_labels = n_batch["lm_labels"] if args.multitask else n_batch["labels"]

                lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
                lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)

                if args.multitask:
                    mc_logits = outputs[1]
                    mc_labels = n_batch["mc_labels"]

                    return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
                else:
                    return lm_logits_flat_shifted, lm_labels_flat_shifted

        evaluator = Engine(inference)

        def checkpointing_score_function(engine):
            """Score for ModelCheckpoint: higher is better, so return the negated perplexity."""
            val_metric = engine.state.metrics["average_ppl"]
            logger.info(val_metric)
            return -val_metric

        def score_function(engine):
            """Score for EarlyStopping: higher is better, so return the negated perplexity."""
            val_ppl = engine.state.metrics["average_ppl"]
            return -val_ppl

        # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
        trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
        if args.n_epochs < 1:
            trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
        if args.eval_before_start:
            trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

        # Make sure distributed data samplers split the dataset nicely between the distributed processes
        if args.distributed:
            trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
            evaluator.add_event_handler(
                Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)
            )
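        # DistributedSampler only reshuffles its shard when set_epoch() changes, so
        # these handlers keep the data order different from one epoch to the next.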

        if args.scheduler.lower() == "piecewiselinear":
            # Linearly decrease the learning rate from lr to zero
            scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
        elif args.scheduler.lower() == "linearcyclical":
            scheduler = LinearCyclicalScheduler(optimizer, "lr", args.lr / 10, args.lr, len(train_loader))
        elif args.scheduler.lower() == "cosine":
            scheduler = CosineAnnealingLR(optimizer, args.n_epochs * len(train_loader), 1e-4)
        elif args.warmup_steps > 0:
            t_total = len(train_loader) // args.gradient_accumulation_steps * args.n_epochs
            scheduler = get_linear_schedule_with_warmup(optimizer, args.warmup_steps, t_total)
        else:
            raise ValueError(f"Unknown scheduler: {args.scheduler}")

        # NOTE: the "cosine" and warmup branches build torch schedulers, which are not
        # callable Ignite handlers; attaching them here assumes they are wrapped (e.g.
        # with ignite's LRScheduler) or stepped explicitly elsewhere.
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

        # Prepare metrics - note how we compute distributed metrics
        RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
        if args.multitask:
            metrics = {
                "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
                "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])),
            }
            metrics.update(
                {
                    "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args),
                }
            )
        else:
            metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1, reduction="mean"))}
            metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})

        metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])

        for name, metric in metrics.items():
            metric.attach(evaluator, name)

        # On the main process: add progress bar, tensorboard, checkpoints and save model,
        # configuration and tokenizer before we start to train

        if args.local_rank in [-1, 0]:
            pbar = ProgressBar(persist=True)
            pbar.attach(trainer, metric_names=["loss"])
            evaluator.add_event_handler(
                Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))
            )

            checkpoint_handler = ModelCheckpoint(
                args.logdir,
                filename_prefix="checkpoint",
                score_function=checkpointing_score_function,
                create_dir=True,
                n_saved=2,
            )

            evaluator.add_event_handler(
                Events.COMPLETED, checkpoint_handler, {"mymodel": getattr(model, "module", model)}
            )  # "getattr" takes care of distributed encapsulation

            getattr(model, "module", model).config.to_json_file(os.path.join(args.logdir, CONFIG_NAME))
            tokenizer.save_pretrained(args.logdir)

            early_handler = EarlyStopping(patience=args.patience, score_function=score_function, trainer=trainer)
            evaluator.add_event_handler(Events.COMPLETED, early_handler)

        if args.use_mlflow and args.local_rank in [-1, 0]:

            class MLflowTracker:
                def __init__(self):
                    self.iteration = 1

                def eval_metric_logger(self, engine):
                    mlflow.log_metric("last_epoch", self.iteration)
                    for metric in engine.state.metrics:
                        mlflow.log_metric(f"eval_{metric}", engine.state.metrics[metric], step=self.iteration)
                    self.iteration += 1

                def train_metric_logger(self, engine):
                    for metric in engine.state.metrics:
                        mlflow.log_metric(f"train_{metric}", engine.state.metrics[metric], step=engine.state.epoch)

                def finish_experiment(self, engine):
                    mlflow.log_metric("finished", True)

                def start_experiment(self, engine):
                    # log the initial artifacts in the dir
                    mlflow.log_artifacts(args.logdir, "training")
                    mlflow.log_metric("finished", False)

            mlflow_tracker = MLflowTracker()
            trainer.add_event_handler(Events.STARTED, mlflow_tracker.start_experiment)
            # Log the train and validation metrics
            trainer.add_event_handler(Events.EPOCH_COMPLETED, mlflow_tracker.train_metric_logger)
            evaluator.add_event_handler(Events.COMPLETED, mlflow_tracker.eval_metric_logger)
            # Log the model
            trainer.add_event_handler(Events.COMPLETED, mlflow_tracker.finish_experiment)

        # Run the training
        trainer.run(train_loader, max_epochs=args.n_epochs)
    except KeyboardInterrupt:
        finalize()

    logger.info("training about to finish")
    finalize()
    logger.info("finalized training")
Exemple #25
0
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--use_adapter",
                        default=False,
                        action='store_true',
                        help="Use adapter or not")
    parser.add_argument("--keyword_Module",
                        type=str,
                        default="",
                        help="add, attention, ")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="bertGpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=8,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=8,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--bert_model_path",
                        default="./",
                        type=str,
                        help="Bert pre-trained model path")
    parser.add_argument(
        "--vocab_file",
        default="./vocab.korean.rawtext.list",
        type=str,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    #tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
    #tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Load KoBERT model and tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)

    # Load KoGPT2 model and tokenizer
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_conkogpt2_model2(
        keyword_Module=args.keyword_Module, use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)

    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
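    # correct_bias=True keeps Adam's bias correction enabled (passing False would
    # mimic the original TensorFlow BERT optimizer, which skips it).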

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, bert_tokenizer, gpt_tokenizer, gpt_vocab)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        source_ids, target_ids, lm_labels = batch

        #(lm_loss), *_ = model(input_ids, token_type_ids=token_type_ids, labels=lm_labels)
        (lm_loss), *_ = model(source_ids, target_ids, lm_labels=lm_labels)
        loss = lm_loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            source_ids, target_ids, lm_labels = batch

            #lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids,)
            lm_logits, *_ = model(source_ids, target_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted), (lm_labels_flat_shifted)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0], x[1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint, args.dataset_path,
                              args.keyword_Module)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=2)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        #getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        #tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Exemple #26
0
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=64,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=64,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-4,
                        help="Learning rate")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=15,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--gpt2_model_name",
                        type=str,
                        default="gpt2",
                        help="Path, url or short name of the model")

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    args = parser.parse_args()
    args.d_word_vec = args.d_model

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")

    tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name)

    num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(
        ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there

    model = Transformer(
        num_tokens + num_added_tokens,
        num_tokens + num_added_tokens,
        src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_emb_prj_weight_sharing=args.proj_share_weight,
        emb_src_trg_weight_sharing=args.embs_share_weight,
        d_k=args.d_k,
        d_v=args.d_v,
        d_model=args.d_model,
        d_word_vec=args.d_word_vec,
        d_inner=args.d_inner_hid,
        n_layers=args.n_layers,
        n_head=args.n_head,
        dropout=args.dropout).to(args.device)

    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        source_ids, target_ids, lm_labels = batch

        (lm_loss), *_ = model(source_ids, target_ids, labels=lm_labels)

        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            source_ids, target_ids, lm_labels = batch
            #logger.info(tokenizer.decode(target_ids[0].tolist()))

            lm_logits, *_ = model(source_ids, target_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, ), (lm_labels_flat_shifted, )

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0]))
    }
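    # inference() returns one-element tuples, so the output_transform unpacks them
    # back into (logits, labels) for the CrossEntropyLoss-based metric.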
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.gpt2_model_name, args.dataset_path)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=4)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        #getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Exemple #27
0
def run(output_path, config):
    device = "cuda"

    local_rank = config['local_rank']
    distributed = backend is not None
    if distributed:
        torch.cuda.set_device(local_rank)
        device = "cuda"
    rank = dist.get_rank() if distributed else 0

    # Rescale batch_size and num_workers
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config['batch_size'] // ngpus
    num_workers = int(
        (config['num_workers'] + ngpus_per_node - 1) / ngpus_per_node)
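    # The configured batch_size is the global batch size: each process trains on
    # batch_size // world_size samples, and num_workers is split (rounded up)
    # across the GPUs of a node.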

    train_labelled_loader, test_loader = \
        get_train_test_loaders(path=config['data_path'],
                               batch_size=batch_size,
                               distributed=distributed,
                               num_workers=num_workers)

    model = get_model(config['model'])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[
                local_rank,
            ], output_device=local_rank)

    optimizer = optim.SGD(model.parameters(),
                          lr=config['learning_rate'],
                          momentum=config['momentum'],
                          weight_decay=config['weight_decay'],
                          nesterov=True)

    criterion = nn.CrossEntropyLoss().to(device)

    le = len(train_labelled_loader)
    milestones_values = [(0, 0.0),
                         (le * config['num_warmup_epochs'],
                          config['learning_rate']),
                         (le * config['num_epochs'], 0.0)]
    lr_scheduler = PiecewiseLinear(optimizer,
                                   param_name="lr",
                                   milestones_values=milestones_values)
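    # Warm up linearly from 0 to learning_rate over the first num_warmup_epochs,
    # then decay linearly back to 0 by the last epoch (milestones are expressed in
    # iterations, hence the multiplication by the epoch length `le`).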

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, labelled_batch):

        x, y = _prepare_batch(labelled_batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            'batch loss': loss.item(),
        }

    trainer = Engine(process_function)

    if not hasattr(lr_scheduler, "step"):
        trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
    else:
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  lambda engine: lr_scheduler.step())
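    # The scheduler is either an Ignite parameter scheduler (callable, no .step()),
    # attached as an ITERATION_STARTED handler, or a torch scheduler exposing
    # .step(), which is stepped once per completed iteration instead.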

    metric_names = [
        'batch loss',
    ]

    def output_transform(x, name):
        return x[name]

    for n in metric_names:
        # We compute running average values on the output (batch loss) across all devices
        RunningAverage(output_transform=partial(output_transform, name=n),
                       epoch_bound=False,
                       device=device).attach(trainer, n)

    if rank == 0:
        checkpoint_handler = ModelCheckpoint(dirname=output_path,
                                             filename_prefix="checkpoint")
        trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000),
                                  checkpoint_handler, {
                                      'model': model,
                                      'optimizer': optimizer
                                  })

        ProgressBar(persist=True,
                    bar_format="").attach(trainer,
                                          event_name=Events.EPOCH_STARTED,
                                          closing_event_name=Events.COMPLETED)
        if config['display_iters']:
            ProgressBar(persist=False,
                        bar_format="").attach(trainer,
                                              metric_names=metric_names)

        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(trainer,
                         log_handler=tbOutputHandler(
                             tag="train", metric_names=metric_names),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=tbOptimizerParamsHandler(optimizer,
                                                              param_name="lr"),
                         event_name=Events.ITERATION_STARTED)

    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None)
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        torch.cuda.synchronize()
        train_evaluator.run(train_labelled_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=3), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        if config['display_iters']:
            ProgressBar(persist=False,
                        desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False,
                        desc="Test evaluation").attach(evaluator)

        tb_logger.attach(train_evaluator,
                         log_handler=tbOutputHandler(tag="train",
                                                     metric_names=list(
                                                         metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)

        tb_logger.attach(evaluator,
                         log_handler=tbOutputHandler(tag="test",
                                                     metric_names=list(
                                                         metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)

        # Store the best model
        def default_score_fn(engine):
            score = engine.state.metrics['accuracy']
            return score

        score_function = default_score_fn if not hasattr(
            config, "score_function") else config.score_function

        best_model_handler = ModelCheckpoint(
            dirname=output_path,
            filename_prefix="best",
            n_saved=3,
            global_step_transform=global_step_from_engine(trainer),
            score_name="val_accuracy",
            score_function=score_function)
        evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {
            'model': model,
        })

    trainer.run(train_labelled_loader, max_epochs=config['num_epochs'])

    if rank == 0:
        tb_logger.close()
Exemple #28
0
def run(output_path, config):

    device = "cuda"
    batch_size = config['batch_size']

    train_loader, test_loader = get_train_test_loaders(
        dataset_name=config['dataset'],
        path=config['data_path'],
        batch_size=batch_size,
        num_workers=config['num_workers'])

    model = get_model(config['model'])
    model = model.to(device)

    optim_fn = optim.SGD
    if config['with_layca']:
        optim_fn = LaycaSGD

    optimizer = optim_fn(model.parameters(),
                         lr=0.0,
                         momentum=config['momentum'],
                         weight_decay=config['weight_decay'],
                         nesterov=True)
    criterion = nn.CrossEntropyLoss()

    le = len(train_loader)
    milestones_values = [(le * m, v)
                         for m, v in config['lr_milestones_values']]
    scheduler = PiecewiseLinear(optimizer,
                                "lr",
                                milestones_values=milestones_values)
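    # The optimizer starts with lr=0.0 because the schedule fully drives the
    # learning rate: lr_milestones_values holds (epoch, lr) pairs, converted to
    # iteration milestones by multiplying with len(train_loader).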

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, batch):

        x, y = _prepare_batch(batch, device=device, non_blocking=True)

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss.item()

    trainer = Engine(process_function)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    RunningAverage(output_transform=lambda x: x,
                   epoch_bound=False).attach(trainer, 'batchloss')

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    tb_logger = TensorboardLogger(log_dir=output_path)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all'),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=Events.ITERATION_STARTED)

    tb_logger.attach(trainer,
                     log_handler=LayerRotationStatsHandler(model),
                     event_name=Events.EPOCH_STARTED)

    metrics = {
        "accuracy": Accuracy(),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine, val_interval):
        if (engine.state.epoch - 1) % val_interval == 0:
            train_evaluator.run(train_loader)
            evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              run_validation,
                              val_interval=2)
    trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1)
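    # Evaluation runs every second epoch during training (starting with the first);
    # the COMPLETED handler uses val_interval=1 so a final evaluation always happens.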

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names='all',
                                                 another_engine=trainer),
                     event_name=Events.COMPLETED)

    def mlflow_batch_metrics_logging(engine, tag):
        step = trainer.state.iteration
        for name, value in engine.state.metrics.items():
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    def mlflow_val_metrics_logging(engine, tag):
        step = trainer.state.epoch
        for name in metrics.keys():
            value = engine.state.metrics[name]
            mlflow.log_metric("{} {}".format(tag, name), value, step=step)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train")
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train")
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test")

    trainer.run(train_loader, max_epochs=config['num_epochs'])
    tb_logger.close()
Exemple #29
0
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" in args.model_checkpoint else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(args.device)
    optimizer = OpenAIAdam(model.parameters(), lr=args.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        lm_loss, mc_loss = model(*batch)
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]  # So we can also use GPT2 outputs
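            # Shift so that tokens up to position i predict token i+1: drop the last logit and the first label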
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics 
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" take care of distributed encapsulation

        torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
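The example above drives the learning rate with PiecewiseLinear attached at ITERATION_STARTED (warm-up would simply add an extra milestone, as in the earlier examples). The sketch below shows how such a schedule evolves over iterations; the dummy_model, dummy_opt and the milestone values (10 warm-up iterations, peak 1e-3, decay to zero by iteration 100) are assumed here for illustration, not taken from the example.

import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import PiecewiseLinear

dummy_model = torch.nn.Linear(2, 2)
dummy_opt = torch.optim.SGD(dummy_model.parameters(), lr=0.0)

# (iteration, lr) milestones: ramp from 0 to 1e-3 over 10 iterations, then decay linearly to 0 by iteration 100
scheduler = PiecewiseLinear(dummy_opt, "lr",
                            milestones_values=[(0, 0.0), (10, 1e-3), (100, 0.0)])

toy_engine = Engine(lambda engine, batch: None)  # no-op update, only used to drive the events
toy_engine.add_event_handler(Events.ITERATION_STARTED, scheduler)

@toy_engine.on(Events.ITERATION_COMPLETED)
def show_lr(engine):
    # Print the learning rate at a few points of the schedule
    if engine.state.iteration in (1, 10, 55, 100):
        print(engine.state.iteration, dummy_opt.param_groups[0]["lr"])

toy_engine.run(range(100), max_epochs=1)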
Exemple #30
0
def train():
    parser = ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=-1)
    args = parser.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    DISTRIBUTED = args.local_rank != -1

    if DISTRIBUTED and torch.distributed.is_available():
        print("Distributed")
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        #BATCH_SIZE *= 2

    def average_distributed_scalar(scalar):
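        """ Average a scalar across all distributed processes (identity when not distributed). """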
        if not DISTRIBUTED:
            return scalar
        scalar_t = torch.tensor(
            scalar, dtype=torch.float,
            device=device) / torch.distributed.get_world_size()
        torch.distributed.all_reduce(scalar_t,
                                     op=torch.distributed.ReduceOp.SUM)
        return scalar_t.item()

    optimizer = AdamW(model.parameters(), lr=6.25e-5)

    ds = dataloader.Conv_GPT2_DataClass(tokenizer)
    v_ds = dataloader.Conv_GPT2_DataClass(tokenizer, dev=True)
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(
        dataloader.ATTR_SPECIAL_TOKENS)
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens +
                                      num_added_tokens)
    model = model.to(device)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        ds) if DISTRIBUTED else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        v_ds) if DISTRIBUTED else None

    dl = DataLoader(ds,
                    sampler=train_sampler,
                    batch_size=BATCH_SIZE,
                    shuffle=not DISTRIBUTED)
    v_dl = DataLoader(v_ds, sampler=valid_sampler, shuffle=False)

    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"]),
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])

    def update(engine, batch):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        lm_loss, *_ = model(batch[0],
                            token_type_ids=batch[1],
                            lm_labels=batch[2])
        loss = lm_loss / ITERATION_STEP
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        if engine.state.iteration % ITERATION_STEP == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(t.to(device) for t in batch)
            input_ids, token_type_ids, lm_labels = batch
            model_outputs = model(input_ids, token_type_ids=token_type_ids)
            lm_logits = model_outputs[0]
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted

    trainer = Engine(update)
    evaluator = Engine(inference)

    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, 6.25e-5),
                                 (EPOCHS * len(ds) // BATCH_SIZE, 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(v_dl))

    if DISTRIBUTED:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        #evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    if args.local_rank in [0, -1]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        #evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir='./logs')
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        #tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint('./checkpoint',
                                             '_checkpoint',
                                             n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'gpt2_qg': getattr(model, 'module', model)})

        getattr(model, 'module', model).config.to_json_file(
            os.path.join('./checkpoint', 'config'))
        tokenizer.save_pretrained('./checkpoint')

    trainer.run(dl, max_epochs=EPOCHS)

    if args.local_rank in [0, -1]:
        tb_logger.close()
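A common thread in the update() functions of the last two examples is gradient accumulation: the loss is divided by the number of accumulation steps, and optimizer.step()/optimizer.zero_grad() only run every N iterations. Below is a minimal, self-contained sketch of that pattern; ACCUM_STEPS, the linear model and the random regression data are assumptions for illustration only.

import torch
from ignite.engine import Engine

ACCUM_STEPS = 4
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
loss_fn = torch.nn.MSELoss()

def update(engine, batch):
    model.train()
    x, y = batch
    # Scale the loss so the accumulated gradients approximate one larger batch
    loss = loss_fn(model(x), y) / ACCUM_STEPS
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # Step and reset gradients only every ACCUM_STEPS iterations
    if engine.state.iteration % ACCUM_STEPS == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()

trainer = Engine(update)
data = [(torch.randn(8, 10), torch.randn(8, 1)) for _ in range(16)]
trainer.run(data, max_epochs=1)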