Example #1
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """
        Setup the optimizer and the learning rate scheduler.
        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        TFTrainer's init through :obj:`optimizers`, or subclass and override this method.
        """
        if not self.optimizer and not self.lr_scheduler:
            self.optimizer, self.lr_scheduler = create_optimizer(
                self.args.learning_rate,
                num_training_steps,
                self.args.warmup_steps,
                adam_beta1=self.args.adam_beta1,
                adam_beta2=self.args.adam_beta2,
                adam_epsilon=self.args.adam_epsilon,
                weight_decay_rate=self.args.weight_decay,
            )
Example #2
def get_optimizer(num_train_examples, options):
    # Imports needed to run this snippet stand-alone (the original relies on
    # module-level imports, as in Example #3 below).
    from math import ceil, floor
    from transformers.optimization_tf import create_optimizer

    steps_per_epoch = ceil(num_train_examples / options.batch_size)
    num_train_steps = steps_per_epoch * options.epochs
    num_warmup_steps = floor(num_train_steps * options.warmup_proportion)

    # Mostly defaults from transformers.optimization_tf
    optimizer, lr_scheduler = create_optimizer(
        options.lr,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        min_lr_ratio=0.0,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        weight_decay_rate=0.01,
        power=1.0,
    )
    return optimizer  # the schedule is already attached to the optimizer; the separate handle is discarded here
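For concreteness (illustrative numbers only): with 10,000 training examples, a batch size of 32, 3 epochs and a warmup proportion of 0.1, this gives steps_per_epoch = ceil(10000 / 32) = 313, num_train_steps = 313 * 3 = 939 and num_warmup_steps = floor(939 * 0.1) = 93, so the learning rate ramps up over the first 93 steps and then decays linearly (power=1.0) towards min_lr_ratio * lr = 0.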
Example #3
def get_optimizer(lr, epochs, batch_size, warmup_proportion,
                  num_train_examples):
    import math
    from transformers.optimization_tf import create_optimizer

    steps_per_epoch = math.ceil(num_train_examples / batch_size)
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = math.floor(num_train_steps * warmup_proportion)

    # Mostly defaults from transformers.optimization_tf
    optimizer, lr_schedule = create_optimizer(
        lr,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        min_lr_ratio=0.0,
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        weight_decay_rate=0.01,
        power=1.0,
    )
    return optimizer, lr_schedule
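A hedged usage sketch of the pair returned above (the tiny Keras model is just a stand-in, not part of the example): the optimizer is handed to model.compile, while the schedule handle is mostly useful for inspecting or logging the learning rate.

import tensorflow as tf

optimizer, lr_schedule = get_optimizer(lr=3e-5, epochs=3, batch_size=32,
                                       warmup_proportion=0.1,
                                       num_train_examples=10_000)
model = tf.keras.Sequential([tf.keras.layers.Dense(2)])  # stand-in for a real classifier
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
print(float(lr_schedule(0)), float(lr_schedule(100)))  # learning rate at steps 0 and 100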
Example #4
    def get_optimizers(
        self,
    ) -> Tuple[tf.keras.optimizers.Optimizer,
               tf.keras.optimizers.schedules.LearningRateSchedule]:
        """
        Setup the optimizer and the learning rate scheduler.
        We provide a reasonable default that works well.
        If you want to use something else, you can pass a tuple in the Trainer's init,
        or override this method in a subclass.
        """
        if self.optimizers is not None:
            return self.optimizers

        optimizer, scheduler = create_optimizer(
            self.args.learning_rate,
            self.train_steps * self.args.num_train_epochs,
            self.args.warmup_steps,
            adam_epsilon=self.args.adam_epsilon,
            weight_decay_rate=self.args.weight_decay,
        )

        return optimizer, scheduler
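The docstring also mentions overriding this method in a subclass; a hedged sketch of such an override (BaseTFTrainer is a placeholder for whatever trainer class this method belongs to):

import tensorflow as tf

class MyTrainer(BaseTFTrainer):  # placeholder base class
    def get_optimizers(self):
        # Hypothetical override: plain Adam on an exponential decay schedule,
        # skipping the warmup / weight-decay default above.
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=3e-5, decay_steps=10_000, decay_rate=0.9)
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
        return optimizer, lr_schedule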
Example #5
    def __init__(self, directory, settings, **kwargs):
        self.dir = directory
        self.model = None
        self.tokenizer = None

        self.settings = {
            "model_file": "model.tf",
            "vocab_file": "vocab",
            "model": 'jplu/tf-xlm-roberta-base',
            "batch_size": 16,
            "maxlen": 150,
            "n_classes": 2,
            "epochs": 10,
            "steps_per_epoch": 40000,
            "patience": 3,
            "dropout": 0.1,
            "n_hidden": 2048,
            "activation": 'relu',
            "loss": "binary_crossentropy",
            "lr": 2e-6,
            "decay_rate": 0.1,
            "warmup_steps": 1000,
            "clipnorm": 1.0,
            **settings,
        }
        # NOTE: this InverseTimeDecay schedule is stored first but is immediately
        # replaced below by the warmup schedule returned by create_optimizer.
        scheduler = InverseTimeDecay(self.settings["lr"],
                                     decay_steps=32.0,
                                     decay_rate=0.1)
        self.settings["scheduler"] = scheduler
        optimizer, scheduler = create_optimizer(
            self.settings["lr"],
            self.settings["steps_per_epoch"] * self.settings["epochs"],
            self.settings["warmup_steps"],
            # "decay_rate" from the settings is used here as the AdamW weight decay rate
            weight_decay_rate=self.settings["decay_rate"])
        self.settings["scheduler"] = scheduler
        self.settings["optimizer"] = optimizer
Example #6
def main():

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
                print(gpu)
        except RuntimeError as e:
            print(e)

    parser = argparse.ArgumentParser()

    parser.add_argument('--casing',
                        type=str,
                        default="bert-base-uncased",
                        help='BERT model')
    parser.add_argument('--bottleneck_size',
                        type=int,
                        default=64,
                        help='Bottleneck size of adapters')
    parser.add_argument('--non_linearity',
                        type=str,
                        default='gelu_new',
                        help='non_linearity function in adapters')
    parser.add_argument('--task', type=str, default='mrpc', help='GLUE task')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        help='The number of training epochs')
    parser.add_argument('--max_seq_length',
                        type=int,
                        default=128,
                        help='max sequence length')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=3e-4,
                        help='learning rate')
    parser.add_argument('--warmup_ratio',
                        type=float,
                        default=0.1,
                        help='warmup ratio')
    parser.add_argument('--saved_models_dir',
                        type=str,
                        default='saved_models',
                        help='save directory')

    args = parser.parse_args()
    if not os.path.isdir(args.saved_models_dir):
        os.mkdir(args.saved_models_dir)
    if not os.path.isdir(os.path.join(args.saved_models_dir, args.task)):
        os.mkdir(os.path.join(args.saved_models_dir, args.task))

    if args.task == "sst-2":
        TFDS_TASK = "sst2"
        postfix = ""
    elif args.task == "sts-b":
        TFDS_TASK = "stsb"
        postfix = ""
    elif args.task == "mnli":
        TFDS_TASK = "mnli"
        postfix = "_matched"
    elif args.task == "mnli-mm":
        TFDS_TASK = "mnli"
        postfix = "_mismatched"
    else:
        TFDS_TASK = args.task
        postfix = ""

    TFDS_TASK = "glue/" + TFDS_TASK
    num_labels = len(glue_processors[args.task]().get_labels())

    # Load Model, Tokenizer & Datasets
    config = BertConfig.from_pretrained(args.casing, num_labels=num_labels)
    config.bottleneck_size = args.bottleneck_size
    config.non_linearity = args.non_linearity

    tokenizer = BertTokenizer.from_pretrained(args.casing)
    # Load the pretrained weights with the adapter-augmented config.
    bert_model = TFBertModel.from_pretrained(args.casing, config=config)
    model = modeling_tf_adapter_bert.AdapterBertModel(bert_model, num_labels)

    data, info = tensorflow_datasets.load(TFDS_TASK, with_info=True)
    train_examples = info.splits["train"].num_examples
    eval_examples = info.splits["validation" + postfix].num_examples

    train_dataset = glue_convert_examples_to_features(
        data["train"],
        tokenizer,
        max_length=args.max_seq_length,
        task=args.task)
    eval_dataset = glue_convert_examples_to_features(
        data["validation" + postfix],
        tokenizer,
        max_length=args.max_seq_length,
        task=args.task)

    train_dataset = train_dataset.repeat().shuffle(buffer_size=100).batch(
        args.batch_size)
    eval_dataset = eval_dataset.batch(args.batch_size)

    train_steps = int(np.ceil(train_examples / args.batch_size))
    eval_steps = int(np.ceil(eval_examples / args.batch_size))

    # Add Adapters
    for i in range(config.num_hidden_layers):
        layer = model.bert.bert.encoder.layer[i]
        # Replace the attention output and feed-forward output sublayers with
        # adapter-augmented versions that reuse the pretrained dense/LayerNorm weights.
        layer.attention.dense_output = modeling_tf_adapter_bert.TFBertSelfOutput(
            layer.attention.dense_output.dense,
            layer.attention.dense_output.LayerNorm, config)
        layer.bert_output = modeling_tf_adapter_bert.TFBertOutput(
            layer.bert_output.dense,
            layer.bert_output.LayerNorm, config)

    # Freeze BERT
    model.bert.bert.embeddings.trainable = False
    model.bert.bert.pooler.trainable = False
    for i in range(config.num_hidden_layers):
        layer = model.bert.bert.encoder.layer[i]
        layer.attention.self_attention.trainable = False
        layer.attention.dense_output.dense.trainable = False
        # layer.attention.dense_output.LayerNorm.trainable = False
        layer.intermediate.trainable = False
        layer.bert_output.dense.trainable = False
        # layer.bert_output.LayerNorm.trainable = False

    # Loss & Optimizer
    if num_labels == 1:
        loss = tf.keras.losses.MeanSquaredError()
        monitor = 'val_spearmanr'
    else:
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        if args.task == 'cola':
            monitor = 'val_matthews_corrcoef'
        elif args.task in ['mrpc', 'qqp']:
            monitor = 'val_f1'
        else:
            monitor = 'val_accuracy'

    opt, scheduler = create_optimizer(
        init_lr=args.learning_rate,
        num_train_steps=train_steps * args.epochs,
        num_warmup_steps=int(train_steps * args.epochs * args.warmup_ratio),
        adam_epsilon=1e-6,
        weight_decay_rate=0)
    model.compile(optimizer=opt, loss=loss)

    # Callback to save the best model
    checkpoint = ModelCheckpoint(
        eval_dataset, args.batch_size, eval_steps, monitor,
        os.path.join(args.saved_models_dir, args.task))

    # Fine-tuning
    history = model.fit(train_dataset,
                        epochs=args.epochs,
                        steps_per_epoch=train_steps,
                        callbacks=[checkpoint])
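A quick sanity check after the freezing above (a hedged addition, not part of the original script) is to compare trainable versus total parameter counts before compiling, confirming that only the adapters, the LayerNorms left unfrozen, and the classification head remain trainable:

import tensorflow as tf

trainable = sum(int(tf.size(w)) for w in model.trainable_weights)
total = sum(int(tf.size(w)) for w in model.weights)
print(f"trainable parameters: {trainable:,} / {total:,}")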
Example #7
# train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
# valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
# test_dataset = test_dataset.batch(EVAL_BATCH_SIZE)

# train_steps = train_examples // BATCH_SIZE
# valid_steps = valid_examples // EVAL_BATCH_SIZE
# test_steps = test_examples // EVAL_BATCH_SIZE
len_train_features = len(train_features)
len_dev_features = len(dev_features)
total_train_steps = int(len_train_features * EPOCHS / BATCH_SIZE) + 1
train_dataset = get_dataset_from_features(train_features, BATCH_SIZE)
dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
# NOTE: in the transformers version targeted by the other examples here, create_optimizer
# returns an (optimizer, lr_schedule) pair; this snippet appears to assume an older
# release that returned only the optimizer, otherwise `opt` would need to be unpacked.
opt = create_optimizer(init_lr=3e-5,
                       num_train_steps=total_train_steps,
                       num_warmup_steps=int(0.1 * total_train_steps))
# opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        opt, "dynamic")

if num_labels == 1:
    loss = tf.keras.losses.MeanSquaredError()
else:
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=opt, loss=loss, metrics=[metric])
train_loss_results, train_accuracy_results = [], []
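The mixed-precision API used above is the old experimental one; under TF 2.4+ the stabilized equivalent would look roughly like this (a hedged sketch, not taken from the example):

import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("mixed_float16")
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)  # dynamic loss scaling by default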
Example #8
    def train(self,
              finetune=False,
              previous_epoch=10,
              do_eval_train=False) -> None:
        """
        Train method to train the model.
        """
        train_ds = self.get_train_tfdataset()

        if self.args.debug:
            tf.summary.trace_on(graph=True, profiler=True)

        self.gradient_accumulator.reset()

        with self.args.strategy.scope():
            optimizer, lr_scheduler = self.get_optimizers()
            iterations = optimizer.iterations
            folder = os.path.join(self.args.output_dir, PREFIX_CHECKPOINT_DIR)
            ckpt = tf.train.Checkpoint(optimizer=optimizer, model=self.model)
            self.model.ckpt_manager = tf.train.CheckpointManager(
                ckpt, folder, max_to_keep=self.args.save_total_limit)

            if self.model.ckpt_manager.latest_checkpoint:
                logger.info(
                    "Checkpoint file %s found and restoring from checkpoint",
                    self.model.ckpt_manager.latest_checkpoint)

                ckpt.restore(self.model.ckpt_manager.latest_checkpoint
                             ).expect_partial()
                if finetune:
                    logger.info("Reset Optimizer in finetune mode")
                    optimizer, lr_scheduler = create_optimizer(
                        self.args.learning_rate,
                        self.train_steps * self.args.num_train_epochs,
                        self.args.warmup_steps,
                        adam_epsilon=self.args.adam_epsilon,
                        weight_decay_rate=self.args.weight_decay,
                    )
                    iterations = optimizer.iterations

        if iterations.numpy() > 0:
            logger.info("Start the training from the last checkpoint")
            start_epoch = (iterations.numpy() // self.train_steps) + 1
        else:
            if finetune:
                logger.info("Start the finetune from the last checkpoint")
                start_epoch = 1 + previous_epoch
            else:
                start_epoch = 1

        tf.summary.experimental.set_step(iterations)

        epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs
        if finetune and self.args.max_steps <= 0:
            epochs += previous_epoch

        if self.args.fp16:
            policy = tf.keras.mixed_precision.experimental.Policy(
                "mixed_float16")
            tf.keras.mixed_precision.experimental.set_policy(policy)

        with self.tb_writer.as_default():
            tf.summary.text("args", self.args.to_json_string())

        self.tb_writer.flush()

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", self.num_train_examples)
        logger.info("  Num Epochs = %d", epochs)
        logger.info("  Total optimization steps = %d", self.train_steps)

        for epoch_iter in range(start_epoch, int(epochs + 1)):
            time_start = time.time()
            for step, training_loss in enumerate(
                    self._training_steps(train_ds, optimizer)):
                self.global_step = iterations.numpy()
                self.epoch_logging = epoch_iter - 1 + (
                    self.global_step % self.train_steps + 1) / self.train_steps

                if self.args.debug:
                    logs = {}
                    logs["loss"] = training_loss.numpy()
                    logs["epoch"] = self.epoch_logging
                    self._log(logs)

                if self.global_step == 1 and self.args.debug:
                    with self.tb_writer.as_default():
                        tf.summary.trace_export(
                            name="training",
                            step=self.global_step,
                            profiler_outdir=self.args.logging_dir)

                if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0:
                    if do_eval_train:
                        self.evaluate_train()
                    self.evaluate()

                if self.global_step % self.args.logging_steps == 0:
                    logs = {}
                    logs["loss"] = training_loss.numpy()
                    logs["learning_rate"] = lr_scheduler(
                        self.global_step).numpy()
                    logs["epoch"] = self.epoch_logging
                    self._log(logs)

                if self.global_step % self.args.save_steps == 0:
                    ckpt_save_path = self.model.ckpt_manager.save()
                    logger.info("Saving checkpoint for step {} at {}".format(
                        self.global_step, ckpt_save_path))

                if self.global_step % 1000 == 0:
                    time_elapse = time.time() - time_start
                    time_est = time_elapse / (step + 1) * self.train_steps
                    logger.info(
                        'Epoch: %d; Step: %d; Current Epoch Elapse/Estimate: %0.2fs/%0.2fs'
                        % (epoch_iter, step + 1, time_elapse, time_est))

                if self.global_step % self.train_steps == 0:
                    break