Example 1
    def fit(self, x, y, epochs=2):
        input_ids, attention_mask = self.__transform_x(x)
        y = self.fit_and_transform_on_y(y)

        steps_per_epoch = math.ceil(len(x) / self.batch_size)
        warmup_steps = steps_per_epoch // 3
        # schedule the learning rate over all epochs, not just part of the first one
        total_steps = steps_per_epoch * epochs
        # create_optimizer returns (optimizer, lr_schedule); only the optimizer is used here
        optimizer, _ = create_optimizer(init_lr=2e-5,
                                        num_train_steps=total_steps,
                                        num_warmup_steps=warmup_steps)

        for epoch in range(epochs):
            total_loss = 0

            for i in range(steps_per_epoch):
                start_idx = i * self.batch_size
                end_idx = min((i + 1) * self.batch_size, len(x))
                batch_input_ids = input_ids[start_idx:end_idx]
                batch_attention_mask = attention_mask[start_idx:end_idx]
                batch_y = y[start_idx:end_idx]
                batch_loss = self.__train_one_step(batch_input_ids,
                                                   batch_attention_mask,
                                                   batch_y, optimizer)
                total_loss += batch_loss
                print('Epoch {} Batch {} Batch Loss: {}'.format(
                    epoch, i, batch_loss))
            print('Epoch {} Loss: {}'.format(epoch,
                                             total_loss / steps_per_epoch))
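
The loop above delegates the actual gradient update to a private `__train_one_step` helper that is not shown. A minimal sketch of what such a step could look like, assuming `self.model` is a TF Hugging Face model whose output exposes `.logits`, that the loss is a sparse categorical crossentropy, and that `tensorflow` is imported as `tf` (all assumptions, not taken from the source):

    def __train_one_step(self, batch_input_ids, batch_attention_mask, batch_y, optimizer):
        # Hypothetical sketch; the original helper is not part of this example.
        with tf.GradientTape() as tape:
            logits = self.model(batch_input_ids,
                                attention_mask=batch_attention_mask,
                                training=True).logits
            loss = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(
                    batch_y, logits, from_logits=True))
        grads = tape.gradient(loss, self.model.trainable_variables)
        optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss
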
Example 2
    def compile_model(self, local_model, train_size):

        nb_train_steps = (train_size // self.training_config.batch_size
                          ) * self.training_config.max_epoch

        if self.model_config.transformer_name is not None:
            # we use a transformer layer in the architecture
            optimizer, lr_schedule = create_optimizer(
                init_lr=2e-5,
                num_train_steps=nb_train_steps,
                weight_decay_rate=0.01,
                num_warmup_steps=0.1 * nb_train_steps,
            )

            if local_model.config.use_chain_crf:
                local_model.compile(
                    optimizer=optimizer,
                    loss=local_model.crf.sparse_crf_loss_bert_masked)
            elif local_model.config.use_crf:
                # loss is calculated by the custom CRF wrapper
                local_model.compile(optimizer=optimizer)
            else:
                # we apply a mask on the predicted labels so that the weights
                # corresponding to special symbols are neutralized
                local_model.compile(optimizer=optimizer,
                                    loss=sparse_crossentropy_masked)
        else:

            lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=self.training_config.learning_rate,
                decay_steps=nb_train_steps,
                decay_rate=0.1)
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

            #optimizer = tf.keras.optimizers.Adam(self.training_config.learning_rate)
            if local_model.config.use_chain_crf:
                local_model.compile(optimizer=optimizer,
                                    loss=local_model.crf.loss)
            elif local_model.config.use_crf:
                if tf.executing_eagerly():
                    # loss is calculated by the custom CRF wrapper, no need to specify a loss function here
                    local_model.compile(optimizer=optimizer)
                else:
                    print("compile model, graph mode")
                    # a loss function is always expected here, but it is calculated internally by the CRF wrapper
                    # the following will fail in graph mode because
                    # '<tf.Variable 'chain_kernel:0' shape=(10, 10) dtype=float32> has `None` for gradient.'
                    # however this variable cannot be accessed, so there is no solution for the moment
                    # (probably requires replacing Keras fit with a custom training loop to get the gradient)
                    local_model.compile(optimizer=optimizer,
                                        loss='sparse_categorical_crossentropy')
                    #local_model.compile(optimizer=optimizer, loss=InnerLossPusher(local_model))
            else:
                # only sparse label encoding is used (no one-hot encoded labels as it was the case in DeLFT < 0.3)
                local_model.compile(optimizer=optimizer,
                                    loss='sparse_categorical_crossentropy')

        return local_model
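
The `sparse_crossentropy_masked` loss referenced above is defined elsewhere in DeLFT; a minimal sketch of the idea, assuming labels at padded/special positions carry index 0 (that convention is an assumption here, not taken from DeLFT):

import tensorflow as tf

# Hedged sketch of a masked sparse categorical crossentropy: positions whose label
# equals the assumed padding/special index (0) do not contribute to the loss.
def sparse_crossentropy_masked_sketch(y_true, y_pred):
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    loss = loss * mask
    return tf.reduce_sum(loss) / tf.maximum(tf.reduce_sum(mask), 1.0)
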
Example 3
    def testGradientAccumulatorDistributionStrategy(self):
        context._context = None
        ops.enable_eager_execution_internal()
        physical_devices = tf.config.list_physical_devices("CPU")
        if len(physical_devices) == 1:
            tf.config.set_logical_device_configuration(physical_devices[0], [
                tf.config.LogicalDeviceConfiguration(),
                tf.config.LogicalDeviceConfiguration()
            ])
        devices = tf.config.list_logical_devices(device_type="CPU")
        strategy = tf.distribute.MirroredStrategy(devices=devices[:2])

        with strategy.scope():
            accumulator = GradientAccumulator()
            variable = tf.Variable([4.0, 3.0])
            optimizer, _ = create_optimizer(5e-5, 10, 5)
            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

        def accumulate_on_replica(gradient):
            accumulator([gradient])

        def apply_on_replica():
            optimizer.apply_gradients(
                list(zip(accumulator.gradients, [variable])))

        @tf.function
        def accumulate(grad1, grad2):
            with strategy.scope():
                local_variables = strategy.experimental_local_results(
                    gradient_placeholder)
                local_variables[0].assign(grad1)
                local_variables[1].assign(grad2)
                strategy.experimental_run_v2(accumulate_on_replica,
                                             args=(gradient_placeholder, ))

        @tf.function
        def apply_grad():
            with strategy.scope():
                strategy.experimental_run_v2(apply_on_replica)

        def _check_local_values(grad1, grad2):
            values = strategy.experimental_local_results(
                accumulator._gradients[0])
            self.assertListAlmostEqual(values[0].value(), grad1, tol=1e-2)
            self.assertListAlmostEqual(values[1].value(), grad2, tol=1e-2)

        accumulate([1.0, 2.0], [-1.0, 1.0])
        accumulate([3.0, -1.0], [-1.0, -1.0])
        accumulate([-2.0, 2.0], [3.0, -2.0])
        self.assertEqual(accumulator.step, 3)
        _check_local_values([2.0, 3.0], [1.0, -2.0])
        apply_grad()
        self.assertListAlmostEqual(variable.value(), [4.0, 3.0], tol=1e-2)
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        _check_local_values([0.0, 0.0], [0.0, 0.0])
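
Outside a distribution strategy the same accumulate/apply pattern is much simpler; a minimal single-replica sketch, assuming `GradientAccumulator` and `create_optimizer` can be imported from `transformers` as in the test module above:

import tensorflow as tf
from transformers import GradientAccumulator, create_optimizer  # assumed import path

variable = tf.Variable([1.0, 2.0])
accumulator = GradientAccumulator()
optimizer, _ = create_optimizer(init_lr=5e-5, num_train_steps=10, num_warmup_steps=2)

# Accumulate gradients from two micro-batches, then apply them in a single optimizer step.
for micro_batch_grad in ([0.1, 0.2], [0.3, 0.1]):
    accumulator([tf.constant(micro_batch_grad)])
optimizer.apply_gradients(zip(accumulator.gradients, [variable]))
accumulator.reset()
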
Example 4
    def compile(self, train_size):
        #optimizer = Adam(learning_rate=2e-5, clipnorm=1)
        optimizer, lr_schedule = create_optimizer(
            init_lr=2e-5,
            num_train_steps=train_size,
            weight_decay_rate=0.01,
            num_warmup_steps=0.1 * train_size,
        )
        self.model.compile(optimizer=optimizer,
                           loss='binary_crossentropy',
                           metrics=["accuracy"])
Example 5
File: train.py Project: shifop/yolo
    n2s = read_json('./data/n2s.json')

    model = YOLOv3({
        'num_class':9
    })

    # print the variables
    model.get_loss(init_call[0], 
                ((init_call[1], init_call[4]), 
                (init_call[2], init_call[5]), 
                (init_call[3], init_call[6])))

    # for index, x in enumerate(model.variables):
    #         print('%s %s %s'%(str(index), str(x.name), str(x.shape)))

    optimizer1,_ = create_optimizer(1e-4, cfg.TRAIN.EPOCHS*size, size, 1e-7)
    optimizer2,_ = create_optimizer(1e-3, cfg.TRAIN.EPOCHS*size, size, 1e-6)
    optimizer = [optimizer1, optimizer2]

    # EPOCHS = 10000
    # optimizer1,_ = create_optimizer(1e-3, EPOCHS*size, 1000, 1e-6, weight_decay_rate=0.01)
    # optimizer2,_ = create_optimizer(1e-4, EPOCHS*size, 1000, 1e-7, weight_decay_rate=0.01)
    # optimizer = [optimizer1, optimizer2]

    ckpt = tf.train.Checkpoint(model=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              './model/'+save_f,
                                              checkpoint_name='model.ckpt',
                                              max_to_keep=300)

    # ckpt.restore(tf.train.latest_checkpoint('./model/%s/'%(save_f)))
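
Two optimizers with different learning rates are built above, presumably so that different variable groups (e.g. backbone vs. detection head) can be updated with different schedules. A hedged sketch of one way to apply them in a custom training step; the variable split below is illustrative and not taken from the project:

import tensorflow as tf

def train_step(model, optimizers, inputs, targets, split_index):
    # Assumed split: the first `split_index` trainable variables go to optimizers[0],
    # the rest to optimizers[1]; the real grouping is project-specific.
    with tf.GradientTape() as tape:
        loss = model.get_loss(inputs, targets)
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizers[0].apply_gradients(zip(grads[:split_index], variables[:split_index]))
    optimizers[1].apply_gradients(zip(grads[split_index:], variables[split_index:]))
    return loss
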
Example 6
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_qa",
                           model_args,
                           data_args,
                           framework="tensorflow")

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    checkpoint = None
    if len(os.listdir(training_args.output_dir)
           ) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (
                output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless.")
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if training_args.should_log:
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load Data
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]

        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(
            extension,
            data_files=data_files,
            field="data",
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
            " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
            " this requirement")
    # endregion

    # region Preprocessing the datasets
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    elif training_args.do_eval:
        column_names = datasets["validation"].column_names
    else:
        column_names = datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[
        0]
    context_column_name = "context" if "context" in column_names else column_names[
        1]
    answer_column_name = "answers" if "answers" in column_names else column_names[
        2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.pad_to_max_length or isinstance(training_args.strategy,
                                                 tf.distribute.TPUStrategy):
        logger.info(
            "Padding all batches to max length because argument was set or we're on TPU."
        )
        padding = "max_length"
    else:
        padding = False

    # Training preprocessing
    def prepare_train_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding=padding,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right
                                                          else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else
                                                        0):
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)

        return tokenized_examples

    processed_datasets = dict()
    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            # We will select samples from the whole data if the argument is specified
            max_train_samples = min(len(train_dataset),
                                    data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        # Create train feature from dataset
        train_dataset = train_dataset.map(
            prepare_train_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        if data_args.max_train_samples is not None:
            # The number of samples might increase during feature creation, so we select only the specified max samples
            max_train_samples = min(len(train_dataset),
                                    data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        processed_datasets["train"] = train_dataset

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding=padding,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_examples = datasets["validation"]
        if data_args.max_eval_samples is not None:
            # We will select samples from the whole data
            max_eval_samples = min(len(eval_examples),
                                   data_args.max_eval_samples)
            eval_examples = eval_examples.select(range(max_eval_samples))
        # Validation Feature Creation
        eval_dataset = eval_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        if data_args.max_eval_samples is not None:
            # During feature creation the number of samples might increase, so we select the required samples again
            max_eval_samples = min(len(eval_dataset),
                                   data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        processed_datasets["validation"] = eval_dataset

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_examples = datasets["test"]
        if data_args.max_predict_samples is not None:
            # We will select samples from the whole data
            predict_examples = predict_examples.select(
                range(data_args.max_predict_samples))
        # Predict Feature Creation
        predict_dataset = predict_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        if data_args.max_predict_samples is not None:
            # During feature creation the number of samples might increase, so we select the required samples again
            max_predict_samples = min(len(predict_dataset),
                                      data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(
                range(max_predict_samples))
        processed_datasets["test"] = predict_dataset
    # endregion

    # region Metrics and Post-processing:
    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions = postprocess_qa_predictions(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold,
            output_dir=training_args.output_dir,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v,
                "no_answer_probability": 0.0
            } for k, v in predictions.items()]
        else:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v
            } for k, v in predictions.items()]

        references = [{
            "id": ex["id"],
            "answers": ex[answer_column_name]
        } for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    metric = evaluate.load(
        "squad_v2" if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions,
                              references=p.label_ids)

    # endregion

    with training_args.strategy.scope():

        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
        num_replicas = training_args.strategy.num_replicas_in_sync

        # region Load model and prepare datasets
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        if training_args.do_train:

            training_dataset = model.prepare_tf_dataset(
                processed_datasets["train"],
                shuffle=True,
                batch_size=training_args.per_device_train_batch_size *
                num_replicas,
                tokenizer=tokenizer,
            )

            training_dataset = training_dataset.with_options(dataset_options)

            num_train_steps = len(
                training_dataset) * training_args.num_train_epochs
            if training_args.warmup_steps > 0:
                num_warmup_steps = training_args.warmup_steps
            elif training_args.warmup_ratio > 0:
                num_warmup_steps = int(num_train_steps *
                                       training_args.warmup_ratio)
            else:
                num_warmup_steps = 0

            optimizer, schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=len(training_dataset) *
                training_args.num_train_epochs,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )

            # no user-specified loss = will use the model internal loss
            model.compile(optimizer=optimizer,
                          jit_compile=training_args.xla,
                          metrics=["accuracy"])

        else:
            model.compile(optimizer=None,
                          jit_compile=training_args.xla,
                          metrics=["accuracy"])
            training_dataset = None

        if training_args.do_eval:
            eval_dataset = model.prepare_tf_dataset(
                processed_datasets["validation"],
                shuffle=False,
                batch_size=training_args.per_device_train_batch_size *
                num_replicas,
                tokenizer=tokenizer,
            )
            eval_dataset = eval_dataset.with_options(dataset_options)
        else:
            eval_dataset = None

        if training_args.do_predict:
            predict_dataset = model.prepare_tf_dataset(
                processed_datasets["test"],
                shuffle=False,
                batch_size=training_args.per_device_eval_batch_size *
                num_replicas,
                tokenizer=tokenizer,
            )
            predict_dataset = predict_dataset.with_options(dataset_options)
        else:
            predict_dataset = None

        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            if data_args.dataset_name is not None:
                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
            else:
                push_to_hub_model_id = f"{model_name}-finetuned-question-answering"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "question-answering"
        }
        if data_args.dataset_name is not None:
            model_card_kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                model_card_kwargs[
                    "dataset_args"] = data_args.dataset_config_name
                model_card_kwargs[
                    "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                model_card_kwargs["dataset"] = data_args.dataset_name

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training and Evaluation

        if training_args.do_train:
            # Note that the validation and test datasets have been processed in a different way to the
            # training datasets in this example, and so they don't have the same label structure.
            # As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
            # after training.
            model.fit(training_dataset,
                      epochs=int(training_args.num_train_epochs),
                      callbacks=callbacks)

        if training_args.do_eval:
            logger.info("*** Evaluation ***")

            # In this example, we compute advanced metrics at the end of training, but
            # if you'd like to compute metrics every epoch that are too complex to be written as
            # standard Keras metrics, you can use our KerasMetricCallback. See
            # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks

            eval_predictions = model.predict(eval_dataset)
            if isinstance(eval_predictions.start_logits, tf.RaggedTensor):
                # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
                # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
                # the highest probability in a sample. Instead, we use a large negative value, which ensures that the
                # padding positions are correctly masked.
                eval_start_logits = eval_predictions.start_logits.to_tensor(
                    default_value=-1000).numpy()
                eval_end_logits = eval_predictions.end_logits.to_tensor(
                    default_value=-1000).numpy()
            else:
                eval_start_logits = eval_predictions.start_logits
                eval_end_logits = eval_predictions.end_logits

            post_processed_eval = post_processing_function(
                datasets["validation"],
                processed_datasets["validation"],
                (eval_start_logits, eval_end_logits),
            )
            metrics = compute_metrics(post_processed_eval)
            logging.info("Evaluation metrics:")
            for metric, value in metrics.items():
                logging.info(f"{metric}: {value:.3f}")
            if training_args.output_dir is not None:
                output_eval_file = os.path.join(training_args.output_dir,
                                                "all_results.json")
                with open(output_eval_file, "w") as writer:
                    writer.write(json.dumps(metrics))
        # endregion

        # region Prediction
        if training_args.do_predict:
            logger.info("*** Predict ***")

            test_predictions = model.predict(predict_dataset)
            if isinstance(test_predictions.start_logits, tf.RaggedTensor):
                # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
                # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
                # the highest probability in a sample. Instead, we use a large negative value, which ensures that the
                # padding positions are correctly masked.
                test_start_logits = test_predictions.start_logits.to_tensor(
                    default_value=-1000).numpy()
                test_end_logits = test_predictions.end_logits.to_tensor(
                    default_value=-1000).numpy()
            else:
                test_start_logits = test_predictions.start_logits
                test_end_logits = test_predictions.end_logits
            post_processed_test = post_processing_function(
                datasets["test"],
                processed_datasets["test"],
                (test_start_logits, test_end_logits),
            )
            metrics = compute_metrics(post_processed_test)

            logging.info("Test metrics:")
            for metric, value in metrics.items():
                logging.info(f"{metric}: {value:.3f}")
        # endregion

    if training_args.output_dir is not None and not training_args.push_to_hub:
        # If we're not pushing to hub, at least save a local copy when we're done
        model.save_pretrained(training_args.output_dir)
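
A small self-contained illustration of the point made in the comments above: padding ragged logits with 0 lets the padding compete in the softmax, while a large negative default effectively masks it (toy values, unrelated to the QA model):

import tensorflow as tf

ragged_logits = tf.ragged.constant([[2.0, -1.0, 0.5], [1.0, 3.0]])
dense_bad = ragged_logits.to_tensor(default_value=0.0)     # padded 0 can get real probability mass
dense_ok = ragged_logits.to_tensor(default_value=-1000.0)  # padding is effectively masked
print(tf.nn.softmax(dense_bad, axis=-1).numpy())
print(tf.nn.softmax(dense_ok, axis=-1).numpy())
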
Example 7
def main():
    # region Argument Parsing
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sanity checks
    if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None:
        raise ValueError(
            "Need either a dataset name or a training/validation file.")
    else:
        if data_args.train_file is not None:
            extension = data_args.train_file.split(".")[-1]
            assert extension in [
                "csv", "json", "txt"
            ], "`train_file` should be a csv, json or txt file."
        if data_args.validation_file is not None:
            extension = data_args.validation_file.split(".")[-1]
            assert extension in [
                "csv", "json", "txt"
            ], "`validation_file` should be a csv, json or txt file."

    if training_args.output_dir is not None:
        training_args.output_dir = Path(training_args.output_dir)
        os.makedirs(training_args.output_dir, exist_ok=True)

    if isinstance(
            training_args.strategy,
            tf.distribute.TPUStrategy) and not data_args.pad_to_max_length:
        logger.warning("We are training on TPU - forcing pad_to_max_length")
        data_args.pad_to_max_length = True
    # endregion

    # region Checkpoints
    # Detecting last checkpoint.
    checkpoint = None
    if len(os.listdir(training_args.output_dir)
           ) > 0 and not training_args.overwrite_output_dir:
        config_path = training_args.output_dir / CONFIG_NAME
        weights_path = training_args.output_dir / TF2_WEIGHTS_NAME
        if config_path.is_file() and weights_path.is_file():
            checkpoint = training_args.output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless.")

    # endregion

    # region Setup logging
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
    # endregion

    # If passed along, set the training seed now.
    if training_args.seed is not None:
        set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    **dataset_args)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    # endregion

    # region Dataset preprocessing
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on dataset",
    )

    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warning(
            f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
            "Picking 1024 instead. You can reduce that value by passing --block_size xxx."
        )
        block_size = 1024

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {
            k: sum(examples[k], [])
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could pad instead of dropping if the model supported it. You can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k:
            [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
        desc=f"Grouping texts in chunks of {block_size}",
    )

    train_dataset = lm_datasets["train"]
    if data_args.validation_file is not None:
        eval_dataset = lm_datasets["validation"]
    else:
        logger.info(
            f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation as provided in data_args"
        )
        train_indices, val_indices = train_test_split(
            list(range(len(train_dataset))),
            test_size=data_args.validation_split_percentage / 100)

        eval_dataset = train_dataset.select(val_indices)
        train_dataset = train_dataset.select(train_indices)

    if data_args.max_train_samples is not None:
        train_dataset = train_dataset.select(range(
            data_args.max_train_samples))
    if data_args.max_eval_samples is not None:
        eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")
    # endregion

    with training_args.strategy.scope():
        # region Prepare model
        if checkpoint is not None:
            model = TFAutoModelForCausalLM.from_pretrained(checkpoint,
                                                           config=config)
        elif model_args.model_name_or_path:
            model = TFAutoModelForCausalLM.from_pretrained(
                model_args.model_name_or_path, config=config)
        else:
            logger.info("Training new model from scratch")
            model = TFAutoModelForCausalLM.from_config(config)

        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region TF Dataset preparation
        num_replicas = training_args.strategy.num_replicas_in_sync
        train_generator = partial(sample_generator, train_dataset, tokenizer)
        train_signature = {
            feature: tf.TensorSpec(shape=(None, ), dtype=tf.int64)
            for feature in train_dataset.features
            if feature != "special_tokens_mask"
        }
        train_sig = (train_signature, train_signature["labels"])
        options = tf.data.Options()
        options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
        tf_train_dataset = (tf.data.Dataset.from_generator(
            train_generator,
            output_signature=train_sig).with_options(options).batch(
                batch_size=num_replicas *
                training_args.per_device_train_batch_size,
                drop_remainder=True).repeat(int(
                    training_args.num_train_epochs)))
        eval_generator = partial(sample_generator, eval_dataset, tokenizer)
        eval_signature = {
            feature: tf.TensorSpec(shape=(None, ), dtype=tf.int64)
            for feature in eval_dataset.features
            if feature != "special_tokens_mask"
        }
        eval_sig = (eval_signature, eval_signature["labels"])
        tf_eval_dataset = (tf.data.Dataset.from_generator(
            eval_generator,
            output_signature=eval_sig).with_options(options).batch(
                batch_size=num_replicas *
                training_args.per_device_eval_batch_size,
                drop_remainder=True).repeat(int(
                    training_args.num_train_epochs)))
        # endregion

        # region Optimizer and loss
        batches_per_epoch = len(train_dataset) // (
            num_replicas * training_args.per_device_train_batch_size)
        # Bias and layernorm weights are automatically excluded from the decay
        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate,
            num_train_steps=int(training_args.num_train_epochs *
                                batches_per_epoch),
            num_warmup_steps=training_args.warmup_steps,
            adam_beta1=training_args.adam_beta1,
            adam_beta2=training_args.adam_beta2,
            adam_epsilon=training_args.adam_epsilon,
            weight_decay_rate=training_args.weight_decay,
        )

        def dummy_loss(y_true, y_pred):
            return tf.reduce_mean(y_pred)

        model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
        # endregion

        # region Training and validation
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_dataset)}")
        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
        logger.info(
            f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
        )
        logger.info(
            f"  Total train batch size = {training_args.per_device_train_batch_size * num_replicas}"
        )

        history = model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(training_args.num_train_epochs),
            steps_per_epoch=len(train_dataset) //
            (training_args.per_device_train_batch_size * num_replicas),
            callbacks=[
                SavePretrainedCallback(output_dir=training_args.output_dir)
            ],
        )
        try:
            train_perplexity = math.exp(history.history["loss"][-1])
        except OverflowError:
            train_perplexity = math.inf
        try:
            validation_perplexity = math.exp(history.history["val_loss"][-1])
        except OverflowError:
            validation_perplexity = math.inf
        logger.info(f"  Final train loss: {history.history['loss'][-1]:.3f}")
        logger.info(f"  Final train perplexity: {train_perplexity:.3f}")
        logger.info(
            f"  Final validation loss: {history.history['val_loss'][-1]:.3f}")
        logger.info(
            f"  Final validation perplexity: {validation_perplexity:.3f}")
        # endregion

        if training_args.output_dir is not None:
            model.save_pretrained(training_args.output_dir)

    if training_args.push_to_hub:
        # You'll probably want to include some of your own metadata here!
        model.push_to_hub()
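
The `sample_generator` passed to `functools.partial` above is not shown. A minimal sketch of a generator compatible with the `output_signature` built in the example, i.e. yielding one `(features, labels)` pair per example while dropping `special_tokens_mask` (this is an assumption about the helper, not its original implementation):

def sample_generator(dataset, tokenizer):
    # Hypothetical sketch; the original helper may also shuffle or use the tokenizer.
    for example in dataset:
        features = {
            key: value
            for key, value in example.items()
            if key != "special_tokens_mask"
        }
        yield features, features["labels"]
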
Example 8
def main():
    # region Argument Parsing
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_ner",
                           model_args,
                           data_args,
                           framework="tensorflow")
    # endregion

    # region Setup logging
    # we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # If passed along, set the training seed now.
    if training_args.seed is not None:
        set_seed(training_args.seed)
    # endregion

    # region Loading datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Infer the file format from whichever file was provided.
        if data_args.train_file is not None:
            extension = data_args.train_file.split(".")[-1]
        else:
            extension = data_args.validation_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if data_args.text_column_name is not None:
        text_column_name = data_args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if data_args.label_column_name is not None:
        label_column_name = data_args.label_column_name
    elif f"{data_args.task_name}_tags" in column_names:
        label_column_name = f"{data_args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)
    # endregion
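    # Illustration (not executed): with a ClassLabel feature whose names are
    # ["O", "B-PER", "I-PER"], label_list is that list and label_to_id is the identity
    # mapping {0: 0, 1: 1, 2: 2}, since the labels are already integer ids. With raw
    # string labels instead, get_label_list() returns a sorted list such as
    # ["B-PER", "I-PER", "O"] and label_to_id maps each string to its index.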

    # region Load config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            num_labels=num_labels)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True,
                                                  add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True)
    # endregion

    # region Preprocessing the raw datasets
    # First we tokenize all the texts.
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=data_args.max_length,
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )

        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        label_to_id[label[word_idx]]
                        if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
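    # Worked example (hypothetical tokenization): the words ["Julia", "runs"] with
    # word-level labels ["B-PER", "O"] might be split into [CLS] Ju ##lia runs [SEP],
    # giving word_ids [None, 0, 0, 1, None]. The aligned label ids are then
    # [-100, B-PER, -100 (or B-PER when --label_all_tokens is set), O, -100], so special
    # tokens and trailing sub-words are ignored by the loss.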

    processed_raw_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    if data_args.max_train_samples is not None:
        max_train_samples = min(len(train_dataset),
                                data_args.max_train_samples)
        train_dataset = train_dataset.select(range(max_train_samples))

    if data_args.max_eval_samples is not None:
        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
        eval_dataset = eval_dataset.select(range(max_eval_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")
    # endregion

    with training_args.strategy.scope():
        # region Initialize model
        if model_args.model_name_or_path:
            model = TFAutoModelForTokenClassification.from_pretrained(
                model_args.model_name_or_path,
                config=config,
            )
        else:
            logger.info("Training new model from scratch")
            model = TFAutoModelForTokenClassification.from_config(config)

        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region Create TF datasets

        # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
        # well as inputs.
        collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer,
                                                        return_tensors="tf")
        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas

        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
        # yourself if you use this method, whereas they are automatically inferred from the model input names when
        # using model.prepare_tf_dataset()
        # For more info see the docs:
        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset

        tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
            collate_fn=collate_fn,
            batch_size=total_train_batch_size,
            shuffle=True,
        ).with_options(dataset_options)
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        tf_eval_dataset = model.prepare_tf_dataset(
            eval_dataset,
            collate_fn=collate_fn,
            batch_size=total_eval_batch_size,
            shuffle=False,
        ).with_options(dataset_options)
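        # For reference (not executed): roughly the same pipeline can be built with the
        # lower-level Dataset.to_tf_dataset(), at the cost of listing the columns yourself
        # (the exact column names depend on the model's input names):
        #
        #   tf_train_dataset = train_dataset.to_tf_dataset(
        #       columns=["input_ids", "attention_mask", "labels"],
        #       shuffle=True,
        #       batch_size=total_train_batch_size,
        #       collate_fn=collate_fn,
        #   ).with_options(dataset_options)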

        # endregion

        # region Optimizer, loss and compilation
        num_train_steps = int(
            len(tf_train_dataset) * training_args.num_train_epochs)
        if training_args.warmup_steps > 0:
            num_warmup_steps = training_args.warmup_steps
        elif training_args.warmup_ratio > 0:
            num_warmup_steps = int(num_train_steps *
                                   training_args.warmup_ratio)
        else:
            num_warmup_steps = 0

        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            adam_beta1=training_args.adam_beta1,
            adam_beta2=training_args.adam_beta2,
            adam_epsilon=training_args.adam_epsilon,
            weight_decay_rate=training_args.weight_decay,
            adam_global_clipnorm=training_args.max_grad_norm,
        )
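        # For example, with 500 batches per epoch and 3 epochs, num_train_steps == 1500;
        # with --warmup_ratio 0.1 that gives num_warmup_steps == 150. create_optimizer()
        # then returns an AdamWeightDecay optimizer whose learning rate warms up linearly
        # to init_lr over those 150 steps and decays linearly afterwards.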

        model.compile(optimizer=optimizer, jit_compile=training_args.xla)
        # endregion

        # Metrics
        metric = evaluate.load("seqeval")

        def get_labels(y_pred, y_true):
            # Transform prediction and reference tensors to numpy arrays

            # Remove ignored index (special tokens)
            true_predictions = [[
                label_list[p] for (p, l) in zip(pred, gold_label) if l != -100
            ] for pred, gold_label in zip(y_pred, y_true)]
            true_labels = [[
                label_list[l] for (p, l) in zip(pred, gold_label) if l != -100
            ] for pred, gold_label in zip(y_pred, y_true)]
            return true_predictions, true_labels
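        # Toy example (not executed): with label_list == ["O", "B-PER"],
        # y_pred == [[1, 0, 0]] and y_true == [[-100, 1, 0]], the positions whose gold
        # label is -100 are dropped, giving true_predictions == [["O", "O"]] and
        # true_labels == [["B-PER", "O"]].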

        def compute_metrics():
            results = metric.compute()
            if data_args.return_entity_level_metrics:
                # Unpack nested dictionaries
                final_results = {}
                for key, value in results.items():
                    if isinstance(value, dict):
                        for n, v in value.items():
                            final_results[f"{key}_{n}"] = v
                    else:
                        final_results[key] = value
                return final_results
            else:
                return {
                    "precision": results["overall_precision"],
                    "recall": results["overall_recall"],
                    "f1": results["overall_f1"],
                    "accuracy": results["overall_accuracy"],
                }

        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            if data_args.dataset_name is not None:
                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
            else:
                push_to_hub_model_id = f"{model_name}-finetuned-token-classification"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "token-classification"
        }
        if data_args.dataset_name is not None:
            model_card_kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                model_card_kwargs["dataset"] = data_args.dataset_name

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_dataset)}")
        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
        logger.info(
            f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
        )
        logger.info(f"  Total train batch size = {total_train_batch_size}")
        # Only show the progress bar once on each machine.

        model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(training_args.num_train_epochs),
            callbacks=callbacks,
        )
        # endregion

        # region Predictions
        # If your batches have variable sequence lengths (i.e. you are not using pad_to_max_length),
        # then this bit might fail on older versions of TF (2.8.0 or earlier), because they cannot
        # concatenate outputs of varying sequence length from predict().

        try:
            predictions = model.predict(
                tf_eval_dataset,
                batch_size=training_args.per_device_eval_batch_size)["logits"]
        except tf.errors.InvalidArgumentError:
            raise ValueError(
                "Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
                "then you will need to use --pad_to_max_length to generate predictions, as older "
                "versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
            )
        if isinstance(predictions, tf.RaggedTensor):
            predictions = predictions.to_tensor(default_value=-100)
        predictions = tf.math.argmax(predictions, axis=-1).numpy()
        if "label" in eval_dataset:
            labels = eval_dataset.with_format("tf")["label"]
        else:
            labels = eval_dataset.with_format("tf")["labels"]
        if isinstance(labels, tf.RaggedTensor):
            labels = labels.to_tensor(default_value=-100)
        labels = labels.numpy()
        attention_mask = eval_dataset.with_format("tf")["attention_mask"]
        if isinstance(attention_mask, tf.RaggedTensor):
            attention_mask = attention_mask.to_tensor(default_value=-100)
        attention_mask = attention_mask.numpy()
        labels[attention_mask == 0] = -100
        preds, refs = get_labels(predictions, labels)
        metric.add_batch(
            predictions=preds,
            references=refs,
        )
        eval_metric = compute_metrics()
        logger.info("Evaluation metrics:")
        for key, val in eval_metric.items():
            logger.info(f"{key}: {val:.4f}")

        if training_args.output_dir is not None:
            output_eval_file = os.path.join(training_args.output_dir,
                                            "all_results.json")
            with open(output_eval_file, "w") as writer:
                writer.write(json.dumps(eval_metric))
        # endregion

    if training_args.output_dir is not None and not training_args.push_to_hub:
        # If we're not pushing to hub, at least save a local copy when we're done
        model.save_pretrained(training_args.output_dir)
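
Once the run above has finished, the saved directory can be reloaded with the standard from_pretrained() API and the metrics re-read from all_results.json. A small usage sketch (the path is an assumption; use whatever --output_dir pointed to):

import json
import os

from transformers import TFAutoModelForTokenClassification

output_dir = "path/to/output_dir"  # assumed: the directory passed as --output_dir
model = TFAutoModelForTokenClassification.from_pretrained(output_dir)

with open(os.path.join(output_dir, "all_results.json")) as f:
    eval_metrics = json.load(f)
print(eval_metrics)  # e.g. {"precision": ..., "recall": ..., "f1": ..., "accuracy": ...}
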
Example no. 9
0
def main():
    # region Argument Parsing
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # endregion

    # region Setup logging
    # We only want one process per machine to log things on the screen.
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # If passed along, set the training seed now.
    if training_args.seed is not None:
        set_seed(training_args.seed)
    # endregion

    # region Loading datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets for the token classification task available on the Hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Infer the file format from whichever file was provided.
        if data_args.train_file is not None:
            extension = data_args.train_file.split(".")[-1]
        else:
            extension = data_args.validation_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if data_args.text_column_name is not None:
        text_column_name = data_args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if data_args.label_column_name is not None:
        label_column_name = data_args.label_column_name
    elif f"{data_args.task_name}_tags" in column_names:
        label_column_name = f"{data_args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)
    # endregion

    # region Load config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            num_labels=num_labels)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True,
                                                  add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True)
    # endregion

    # region Preprocessing the raw datasets
    # First we tokenize all the texts.
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=data_args.max_length,
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )

        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        label_to_id[label[word_idx]]
                        if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    processed_raw_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")
    # endregion

    with training_args.strategy.scope():
        # region Initialize model
        if model_args.model_name_or_path:
            model = TFAutoModelForTokenClassification.from_pretrained(
                model_args.model_name_or_path,
                config=config,
            )
        else:
            logger.info("Training new model from scratch")
            model = TFAutoModelForTokenClassification.from_config(config)

        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region Create TF datasets
        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        train_batches_per_epoch = len(train_dataset) // total_train_batch_size
        tf_train_dataset = dataset_to_tf(
            train_dataset,
            tokenizer,
            total_batch_size=total_train_batch_size,
            num_epochs=training_args.num_train_epochs,
            shuffle=True,
        )
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size
        tf_eval_dataset = dataset_to_tf(
            eval_dataset,
            tokenizer,
            total_batch_size=total_eval_batch_size,
            num_epochs=training_args.num_train_epochs,
            shuffle=False,
        )

        # endregion

        # region Optimizer, loss and compilation
        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate,
            num_train_steps=int(training_args.num_train_epochs *
                                train_batches_per_epoch),
            num_warmup_steps=training_args.warmup_steps,
            adam_beta1=training_args.adam_beta1,
            adam_beta2=training_args.adam_beta2,
            adam_epsilon=training_args.adam_epsilon,
            weight_decay_rate=training_args.weight_decay,
        )

        def dummy_loss(y_true, y_pred):
            return tf.reduce_mean(y_pred)

        model.compile(loss={"loss": dummy_loss}, optimizer=optimizer)
        # endregion

        # Metrics
        metric = load_metric("seqeval")

        def get_labels(y_pred, y_true):
            # Transform prediction and reference tensors to numpy arrays

            # Remove ignored index (special tokens)
            true_predictions = [[
                label_list[p] for (p, l) in zip(pred, gold_label) if l != -100
            ] for pred, gold_label in zip(y_pred, y_true)]
            true_labels = [[
                label_list[l] for (p, l) in zip(pred, gold_label) if l != -100
            ] for pred, gold_label in zip(y_pred, y_true)]
            return true_predictions, true_labels

        def compute_metrics():
            results = metric.compute()
            if data_args.return_entity_level_metrics:
                # Unpack nested dictionaries
                final_results = {}
                for key, value in results.items():
                    if isinstance(value, dict):
                        for n, v in value.items():
                            final_results[f"{key}_{n}"] = v
                    else:
                        final_results[key] = value
                return final_results
            else:
                return {
                    "precision": results["overall_precision"],
                    "recall": results["overall_recall"],
                    "f1": results["overall_f1"],
                    "accuracy": results["overall_accuracy"],
                }

        # endregion

        # region Training
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_dataset)}")
        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
        logger.info(
            f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
        )
        logger.info(f"  Total train batch size = {total_train_batch_size}")
        # Only show the progress bar once on each machine.
        model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(training_args.num_train_epochs),
            steps_per_epoch=train_batches_per_epoch,
            validation_steps=eval_batches_per_epoch,
        )
        # endregion

        # region Predictions
        # For predictions, we preload the entire validation set - note that if you have a really giant validation
        # set, you might need to change this!
        eval_inputs = {
            key: tf.ragged.constant(eval_dataset[key]).to_tensor()
            for key in eval_dataset.features
        }
        predictions = model.predict(
            eval_inputs,
            batch_size=training_args.per_device_eval_batch_size)["logits"]
        predictions = tf.math.argmax(predictions, axis=-1)
        labels = np.array(eval_inputs["labels"])
        labels[np.array(eval_inputs["attention_mask"]) == 0] = -100
        preds, refs = get_labels(predictions, labels)
        metric.add_batch(
            predictions=preds,
            references=refs,
        )
        eval_metric = compute_metrics()
        logger.info("Evaluation metrics:")
        for key, val in eval_metric.items():
            logger.info(f"{key}: {val:.4f}")
        # endregion


    if training_args.output_dir is not None:
        model.save_pretrained(training_args.output_dir)
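
The dataset_to_tf helper called in the example above is defined elsewhere in that script and is not shown here. A minimal sketch of what such a helper might look like, assuming it pads batches with DataCollatorForTokenClassification and repeats the data for the requested number of epochs:

from transformers import DataCollatorForTokenClassification


def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle):
    # Hypothetical sketch, not the original helper: batch and pad a Hugging Face dataset
    # into a tf.data.Dataset that model.fit() can consume directly.
    collator = DataCollatorForTokenClassification(tokenizer=tokenizer,
                                                  return_tensors="tf")
    tf_dataset = dataset.to_tf_dataset(
        columns=dataset.column_names,
        shuffle=shuffle,
        batch_size=total_batch_size,
        collate_fn=collator,
        drop_remainder=True,
    )
    # Repeat so that steps_per_epoch * num_epochs batches are available to fit().
    return tf_dataset.repeat(int(num_epochs))
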
Example no. 10
0
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is only what is passed as arguments, along with your Python and framework versions.
    send_example_telemetry("run_translation",
                           model_args,
                           data_args,
                           framework="tensorflow")
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity(logging.INFO)
    transformers.utils.logging.set_verbosity(logging.INFO)

    # Log on each process the small summary:
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Detecting last checkpoint
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For translation, this script expects the datasets to have a 'translation' column containing
    # {source_lang: ..., target_lang: ...} dictionaries (see the preprocessing function below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load model config and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
    # endregion

    # region Dataset preprocessing
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    else:
        logger.info(
            "There is nothing to do. Please pass `do_train`, and/or `do_eval`."
        )
        return


    # For translation we set the codes of our source and target languages (only useful for mBART, the others will
    # ignore those attributes).
    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
        assert data_args.target_lang is not None and data_args.source_lang is not None, (
            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and "
            "--target_lang arguments.")
        tokenizer.src_lang = data_args.source_lang
        tokenizer.tgt_lang = data_args.target_lang
        forced_bos_token_id = (
            tokenizer.lang_code_to_id[data_args.forced_bos_token]
            if data_args.forced_bos_token is not None else None)

    # Get the language codes for input/target.
    source_lang = data_args.source_lang.split("_")[0]
    target_lang = data_args.target_lang.split("_")[0]

    padding = "max_length" if data_args.pad_to_max_length else False

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    def preprocess_function(examples):
        inputs = [ex[source_lang] for ex in examples["translation"]]
        targets = [ex[target_lang] for ex in examples["translation"]]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs,
                                 max_length=data_args.max_source_length,
                                 padding=padding,
                                 truncation=True)

        # Tokenize targets with the `text_target` keyword argument
        labels = tokenizer(text_target=targets,
                           max_length=max_target_length,
                           padding=padding,
                           truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [[
                (l if l != tokenizer.pad_token_id else -100) for l in label
            ] for label in labels["input_ids"]]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
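    # Illustration (hypothetical ids): if pad_token_id were 0, a padded target such as
    # [17, 42, 2, 0, 0] would become [17, 42, 2, -100, -100], so padding positions do not
    # contribute to the loss when --ignore_pad_token_for_loss is used with max_length padding.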

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset),
                                    data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
    else:
        train_dataset = None

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset),
                                   data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
    else:
        eval_dataset = None
    # endregion

    with training_args.strategy.scope():
        # region Prepare model
        model = TFAutoModelForSeq2SeqLM.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        model.resize_token_embeddings(len(tokenizer))
        if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
            model.config.forced_bos_token_id = forced_bos_token_id
        # endregion

        # region Set decoder_start_token_id
        if model.config.decoder_start_token_id is None and isinstance(
                tokenizer, (MBartTokenizer, MBartTokenizerFast)):
            assert (data_args.target_lang is not None
                    and data_args.source_lang is not None
                    ), "mBart requires --target_lang and --source_lang"
            if isinstance(tokenizer, MBartTokenizer):
                model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
                    data_args.target_lang]
            else:
                model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(
                    data_args.target_lang)

        if model.config.decoder_start_token_id is None:
            raise ValueError(
                "Make sure that `config.decoder_start_token_id` is correctly defined"
            )
        # endregion

        # region Prepare TF Dataset objects
        label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=64,  # Reduce the number of unique shapes for XLA, especially for generation
            return_tensors="tf",
        )
        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas

        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
        # yourself if you use this method, whereas they are automatically inferred from the model input names when
        # using model.prepare_tf_dataset()
        # For more info see the docs:
        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset

        tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
            collate_fn=data_collator,
            batch_size=total_train_batch_size,
            shuffle=True,
        ).with_options(dataset_options)
        tf_eval_dataset = model.prepare_tf_dataset(
            eval_dataset,
            collate_fn=data_collator,
            batch_size=total_eval_batch_size,
            shuffle=False).with_options(dataset_options)
        # endregion

        # region Optimizer and LR scheduling
        num_train_steps = int(
            len(tf_train_dataset) * training_args.num_train_epochs)
        if training_args.warmup_steps > 0:
            num_warmup_steps = training_args.warmup_steps
        elif training_args.warmup_ratio > 0:
            num_warmup_steps = int(num_train_steps *
                                   training_args.warmup_ratio)
        else:
            num_warmup_steps = 0
        if training_args.do_train:
            optimizer, lr_schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = None
        # endregion

        # region Metric and postprocessing
        if training_args.do_eval:
            metric = evaluate.load("sacrebleu")

            if data_args.val_max_target_length is None:
                data_args.val_max_target_length = data_args.max_target_length

            gen_kwargs = {
                "max_length": data_args.val_max_target_length,
                "num_beams": data_args.num_beams,
                "no_repeat_ngram_size":
                0,  # Not supported under XLA right now, and some models set it by default
            }

            def postprocess_text(preds, labels):
                preds = [pred.strip() for pred in preds]
                labels = [[label.strip()] for label in labels]

                return preds, labels
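            # sacrebleu expects one or more reference strings per prediction, which is why
            # each reference is wrapped in its own single-element list above.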

            def compute_metrics(preds):
                predictions, labels = preds
                if isinstance(predictions, tuple):
                    predictions = predictions[0]
                decoded_preds = tokenizer.batch_decode(
                    predictions, skip_special_tokens=True)
                labels = np.where(labels != -100, labels,
                                  tokenizer.pad_token_id)
                decoded_labels = tokenizer.batch_decode(
                    labels, skip_special_tokens=True)
                decoded_preds, decoded_labels = postprocess_text(
                    decoded_preds, decoded_labels)
                metrics = metric.compute(predictions=decoded_preds,
                                         references=decoded_labels)
                return {"bleu": metrics["score"]}

            # The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
            # to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
            # useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
            # For more information, see the docs at
            # https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback

            metric_callback = KerasMetricCallback(
                metric_fn=compute_metrics,
                eval_dataset=tf_eval_dataset,
                predict_with_generate=True,
                use_xla_generation=True,
                generate_kwargs=gen_kwargs,
            )
            callbacks = [metric_callback]
        else:
            callbacks = []

        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "translation"
        }
        if data_args.dataset_name is not None:
            model_card_kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                model_card_kwargs["dataset"] = data_args.dataset_name

        languages = [
            l for l in [data_args.source_lang, data_args.target_lang]
            if l is not None
        ]
        if len(languages) > 0:
            model_card_kwargs["language"] = languages

        if training_args.push_to_hub:
            # Because this training can be quite long, we save once per epoch.
            callbacks.append(
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                ))
        # endregion

        # region Training
        eval_metrics = None
        model.compile(optimizer=optimizer, jit_compile=training_args.xla)

        if training_args.do_train:
            logger.info("***** Running training *****")
            logger.info(f"  Num examples = {len(train_dataset)}")
            logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
            logger.info(
                f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
            )
            logger.info(f"  Total train batch size = {total_train_batch_size}")
            logger.info(f"  Total optimization steps = {num_train_steps}")

            if training_args.xla and not data_args.pad_to_max_length:
                logger.warning(
                    "XLA training may be slow at first when --pad_to_max_length is not set "
                    "until all possible shapes have been compiled.")

            history = model.fit(tf_train_dataset,
                                epochs=int(training_args.num_train_epochs),
                                callbacks=callbacks)
            eval_metrics = {
                key: val[-1]
                for key, val in history.history.items()
            }
        # endregion

        # region Validation
        if training_args.do_eval and not training_args.do_train:
            # Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
            @tf.function(jit_compile=True)
            def generate(**kwargs):
                return model.generate(**kwargs)
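            # Note: every new input shape triggers a fresh XLA compilation of the generate
            # graph, which is why the data collator above pads to a multiple of 64; it keeps
            # the number of distinct shapes (and therefore recompilations) small.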

            if training_args.do_eval:
                logger.info("Evaluation...")
                for batch, labels in tf_eval_dataset:
                    batch.update(gen_kwargs)
                    generated_tokens = generate(**batch)
                    if isinstance(generated_tokens, tuple):
                        generated_tokens = generated_tokens[0]
                    decoded_preds = tokenizer.batch_decode(
                        generated_tokens, skip_special_tokens=True)
                    labels = np.where(labels != -100, labels,
                                      tokenizer.pad_token_id)
                    decoded_labels = tokenizer.batch_decode(
                        labels, skip_special_tokens=True)
                    decoded_preds, decoded_labels = postprocess_text(
                        decoded_preds, decoded_labels)

                    metric.add_batch(predictions=decoded_preds,
                                     references=decoded_labels)

                eval_metrics = metric.compute()
                logger.info({"bleu": eval_metrics["score"]})
        # endregion

        if training_args.output_dir is not None and eval_metrics is not None:
            output_eval_file = os.path.join(training_args.output_dir,
                                            "all_results.json")
            with open(output_eval_file, "w") as writer:
                writer.write(json.dumps(eval_metrics))

        if training_args.output_dir is not None and not training_args.push_to_hub:
            # If we're not pushing to hub, at least save a local copy when we're done
            model.save_pretrained(training_args.output_dir)
Example no. 11
0
    dev = Dataset(read_json(cfg.DEV), batch_size=cfg.BATCH_SIZE)
    dev_count = dev.total
    dev = get_dataset_by_iter(dev)
    dev = get_data(dev, shuffle=0)

    model = RL()
    ckpt = tf.train.Checkpoint(model=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              './model/' + save_f,
                                              checkpoint_name='model.ckpt',
                                              max_to_keep=1)

    # Grab the first batch so that model.get_loss() below can build the model's variables.
    init_call = next(iter(train))

    model.get_loss(init_call)
    for i, x in enumerate(model.trainable_variables):
        print("%d:%s %s" % (i, x.name, str(x.shape)))

    EPOCHS = cfg.EPOCHS
    optimizer2, _ = create_optimizer(2e-5,
                                     count * EPOCHS,
                                     count,
                                     0,
                                     weight_decay_rate=0.01)
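    # In recent versions of transformers these positional arguments map to
    # create_optimizer(init_lr=2e-5, num_train_steps=count * EPOCHS, num_warmup_steps=count,
    # min_lr_ratio=0), i.e. presumably one epoch's worth of warmup followed by a linear decay.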
    optimizer2 = mixed_precision.LossScaleOptimizer(optimizer2,
                                                    loss_scale='dynamic')

    train_and_test(model, train, dev, EPOCHS, count, dev_count, optimizer2,
                   197, ckpt_manager)
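
The train_and_test helper invoked above is not shown. When a LossScaleOptimizer drives a custom training loop, the loss has to be scaled and the gradients unscaled by hand; a minimal sketch of one such training step, assuming the model exposes the get_loss() method used above:

import tensorflow as tf


@tf.function
def train_step(model, batch, optimizer):
    # Sketch of a single mixed-precision update with a LossScaleOptimizer.
    with tf.GradientTape() as tape:
        loss = model.get_loss(batch)                   # model-defined loss, as above
        scaled_loss = optimizer.get_scaled_loss(loss)  # scale up to avoid fp16 underflow
    scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
    grads = optimizer.get_unscaled_gradients(scaled_grads)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

Note also that the loss_scale='dynamic' keyword used above belongs to the older experimental mixed-precision API; in TF 2.4+ the non-experimental tf.keras.mixed_precision.LossScaleOptimizer uses dynamic scaling by default and no longer accepts that argument.
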
Example no. 12
0
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is only what is passed as arguments, along with your Python and framework versions.
    send_example_telemetry("run_text_classification",
                           model_args,
                           data_args,
                           framework="tensorflow")

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    # Detecting last checkpoint.
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (
                output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless.")

    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)

    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Loading data
    # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally
    # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two
    # columns are provided. Note that the term 'sentence' can be slightly misleading, as these fields often contain
    # more than a single grammatical sentence when the task requires it.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    data_files = {
        "train": data_args.train_file,
        "validation": data_args.validation_file,
        "test": data_args.test_file
    }
    data_files = {
        key: file
        for key, file in data_files.items() if file is not None
    }

    for key in data_files.keys():
        logger.info(f"Loading a local file for {key}: {data_files[key]}")

    if data_args.input_file_extension == "csv":
        # Loading a dataset from local csv files
        datasets = load_dataset(
            "csv",
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Loading a dataset from local json files
        datasets = load_dataset("json",
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Label preprocessing
    # If you've passed us a training set, we try to infer your labels from it
    if "train" in datasets:
        # By default we assume that if your label column looks like a float then you're doing regression,
        # and if not then you're doing classification. This is something you may want to change!
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)
    # If you haven't passed a training set, we read label info from the saved model (this happens later)
    else:
        num_labels = None
        label_list = None
        is_regression = None
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path
    if num_labels is not None:
        config = AutoConfig.from_pretrained(
            config_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = AutoConfig.from_pretrained(
            config_path,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    column_names = {
        col
        for cols in datasets.column_names.values() for col in cols
    }
    non_label_column_names = [name for name in column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    elif "sentence1" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", None
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Ensure that our labels match the model's, if it has some pre-specified
    if "train" in datasets:
        if not is_regression and config.label2id != PretrainedConfig(
                num_labels=num_labels).label2id:
            label_name_to_id = config.label2id
            if list(sorted(label_name_to_id.keys())) == list(
                    sorted(label_list)):
                label_to_id = label_name_to_id  # Use the model's labels
            else:
                logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the dataset: ",
                    f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels:"
                    f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.",
                )
                label_to_id = {v: i for i, v in enumerate(label_list)}
        elif not is_regression:
            label_to_id = {v: i for i, v in enumerate(label_list)}
        else:
            label_to_id = None
        # Now we've established our label2id, let's overwrite the model config with it.
        config.label2id = label_to_id
        if config.label2id is not None:
            config.id2label = {id: label for label, id in label_to_id.items()}
        else:
            config.id2label = None
    else:
        label_to_id = config.label2id  # Just load the data from the model

    if "validation" in datasets and config.label2id is not None:
        validation_label_list = datasets["validation"].unique("label")
        for val_label in validation_label_list:
            assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!"

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args, max_length=max_seq_length, truncation=True)

        # Map labels to IDs
        if config.label2id is not None and "label" in examples:
            result["label"] = [(config.label2id[l] if l != -1 else -1)
                               for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        # Set seed before initializing model
        set_seed(training_args.seed)
        #
        # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Convert data to a tf.data.Dataset
        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
        num_replicas = training_args.strategy.num_replicas_in_sync

        tf_data = dict()
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_val_samples,
            "test": data_args.max_test_samples,
        }
        for key in ("train", "validation", "test"):
            if key not in datasets:
                tf_data[key] = None
                continue
            if ((key == "train" and not training_args.do_train)
                    or (key == "validation" and not training_args.do_eval)
                    or (key == "test" and not training_args.do_predict)):
                tf_data[key] = None
                continue
            if key in ("train", "validation"):
                assert "label" in datasets[
                    key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size * num_replicas
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size * num_replicas
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))

            # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
            # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
            # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
            # yourself if you use this method, whereas they are automatically inferred from the model input names when
            # using model.prepare_tf_dataset()
            # For more info see the docs:
            # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
            # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
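            # For comparison, a rough lower-level equivalent using to_tf_dataset() might
            # look like the sketch below (illustrative only; DataCollatorWithPadding and
            # the column choices are assumptions, not part of this script):
            #   data = dataset.to_tf_dataset(
            #       columns=tokenizer.model_input_names,
            #       label_cols="label",
            #       shuffle=shuffle,
            #       batch_size=batch_size,
            #       collate_fn=DataCollatorWithPadding(tokenizer, return_tensors="tf"),
            #   )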

            data = model.prepare_tf_dataset(
                dataset,
                shuffle=shuffle,
                batch_size=batch_size,
                tokenizer=tokenizer,
            )
            data = data.with_options(dataset_options)
            tf_data[key] = data
        # endregion

        # region Optimizer, loss and compilation

        if training_args.do_train:
            num_train_steps = len(
                tf_data["train"]) * training_args.num_train_epochs
            if training_args.warmup_steps > 0:
                num_warmup_steps = training_args.warmup_steps
            elif training_args.warmup_ratio > 0:
                num_warmup_steps = int(num_train_steps *
                                       training_args.warmup_ratio)
            else:
                num_warmup_steps = 0

            optimizer, schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = None
        if is_regression:
            metrics = []
        else:
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, metrics=metrics)
        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            push_to_hub_model_id = f"{model_name}-finetuned-text-classification"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "text-classification"
        }

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training and validation
        if tf_data["train"] is not None:
            model.fit(
                tf_data["train"],
                validation_data=tf_data["validation"],
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        if tf_data["validation"] is not None:
            logger.info("Computing metrics on validation data...")
            if is_regression:
                loss = model.evaluate(tf_data["validation"])
                logger.info(f"Eval loss: {loss:.5f}")
            else:
                loss, accuracy = model.evaluate(tf_data["validation"])
                logger.info(
                    f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%"
                )
            if training_args.output_dir is not None:
                output_eval_file = os.path.join(training_args.output_dir,
                                                "all_results.json")
                eval_dict = {"eval_loss": loss}
                if not is_regression:
                    eval_dict["eval_accuracy"] = accuracy
                with open(output_eval_file, "w") as writer:
                    writer.write(json.dumps(eval_dict))
        # endregion

        # region Prediction
        if tf_data["test"] is not None:
            logger.info("Doing predictions on test dataset...")
            predictions = model.predict(tf_data["test"])["logits"]
            predicted_class = np.squeeze(
                predictions) if is_regression else np.argmax(predictions,
                                                             axis=1)
            output_test_file = os.path.join(training_args.output_dir,
                                            "test_results.txt")
            with open(output_test_file, "w") as writer:
                writer.write("index\tprediction\n")
                for index, item in enumerate(predicted_class):
                    if is_regression:
                        writer.write(f"{index}\t{item:3.3f}\n")
                    else:
                        item = config.id2label[item]
                        writer.write(f"{index}\t{item}\n")
            logger.info(f"Wrote predictions to {output_test_file}!")
        # endregion

        if training_args.output_dir is not None and not training_args.push_to_hub:
            # If we're not pushing to hub, at least save a local copy when we're done
            model.save_pretrained(training_args.output_dir)
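A minimal, self-contained sketch of the warmup-step arithmetic and create_optimizer call used above, with toy step counts instead of a real dataset:

from transformers import create_optimizer

# Toy numbers, for illustration only.
steps_per_epoch = 500
num_epochs = 3
warmup_ratio = 0.1

num_train_steps = steps_per_epoch * num_epochs
num_warmup_steps = int(num_train_steps * warmup_ratio)

# Linear warmup to init_lr over num_warmup_steps, then linear decay towards 0
# over the remaining steps; weight decay is applied as decoupled (AdamW-style) decay.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    weight_decay_rate=0.01,
)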
Example n. 13
0
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_glue",
                           model_args,
                           data_args,
                           framework="tensorflow")

    if not (training_args.do_train or training_args.do_eval
            or training_args.do_predict):
        exit(
            "Must specify at least one of --do_train, --do_eval or --do_predict!"
        )
    # endregion

    # region Checkpoints
    checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        checkpoint = get_last_checkpoint(training_args.output_dir)
        if checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Dataset and labels
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantees
    # that only one local process can concurrently download the dataset.
    datasets = load_dataset(
        "glue",
        data_args.task_name,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    is_regression = data_args.task_name == "stsb"
    if not is_regression:
        label_list = datasets["train"].features["label"].names
        num_labels = len(label_list)
    else:
        num_labels = 1

    if data_args.predict_file is not None:
        logger.info("Preparing user-supplied file for predictions...")

        data_files = {"data": data_args.predict_file}

        for key in data_files.keys():
            logger.info(f"Loading a local file for {key}: {data_files[key]}")

        if data_args.predict_file.endswith(".csv"):
            # Loading a dataset from local csv files
            user_dataset = load_dataset("csv",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
            user_dataset = load_dataset("json",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        needed_keys = task_to_keys[data_args.task_name]
        for key in needed_keys:
            assert key in user_dataset[
                "data"].features, f"Your supplied predict_file is missing the {key} key!"
        datasets["user_data"] = user_dataset["data"]
    # endregion

    # region Load model config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    sentence1_key, sentence2_key = task_to_keys[data_args.task_name]

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if config.label2id != PretrainedConfig(
            num_labels=num_labels).label2id and not is_regression:
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in config.label2id.items()}
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: int(label_name_to_id[label_list[i]])
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.",
            )
            label_to_id = {label: i for i, label in enumerate(label_list)}
    if label_to_id is not None:
        config.label2id = label_to_id
        config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        config.label2id = {l: i for i, l in enumerate(label_list)}
        config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
    # endregion

    # region Metric function
    metric = evaluate.load("glue", data_args.task_name)

    def compute_metrics(preds, label_ids):
        preds = preds["logits"]
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        result = metric.compute(predictions=preds, references=label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result

    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Convert data to a tf.data.Dataset
        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
        num_replicas = training_args.strategy.num_replicas_in_sync

        tf_data = dict()
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_eval_samples,
            "validation_matched": data_args.max_eval_samples,
            "validation_mismatched": data_args.max_eval_samples,
            "test": data_args.max_predict_samples,
            "test_matched": data_args.max_predict_samples,
            "test_mismatched": data_args.max_predict_samples,
            "user_data": None,
        }
        for key in datasets.keys():
            if key == "train" or key.startswith("validation"):
                assert "label" in datasets[
                    key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size * num_replicas
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size * num_replicas
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))

            # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
            # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
            # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
            # yourself if you use this method, whereas they are automatically inferred from the model input names when
            # using model.prepare_tf_dataset()
            # For more info see the docs:
            # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
            # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
            data = model.prepare_tf_dataset(
                dataset,
                shuffle=shuffle,
                batch_size=batch_size,
                collate_fn=data_collator,
                tokenizer=tokenizer,
            )
            data = data.with_options(dataset_options)
            tf_data[key] = data
        # endregion

        # region Optimizer, loss and compilation
        if training_args.do_train:
            num_train_steps = len(
                tf_data["train"]) * training_args.num_train_epochs
            if training_args.warmup_steps > 0:
                num_warmup_steps = training_args.warmup_steps
            elif training_args.warmup_ratio > 0:
                num_warmup_steps = int(num_train_steps *
                                       training_args.warmup_ratio)
            else:
                num_warmup_steps = 0

            optimizer, schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = "adam"  # Just write anything because we won't be using it
        if is_regression:
            metrics = []
        else:
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer,
                      metrics=metrics,
                      jit_compile=training_args.xla)
        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            push_to_hub_model_id = f"{model_name}-finetuned-glue"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "text-classification"
        }
        model_card_kwargs["task_name"] = data_args.task_name

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training and validation
        if training_args.do_train:
            if training_args.do_eval and not data_args.task_name == "mnli":
                # Do both evaluation and training in the Keras fit loop, unless the task is MNLI
                # because MNLI has two validation sets
                validation_data = tf_data["validation"]
            else:
                validation_data = None
            model.fit(
                tf_data["train"],
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        # endregion

        # region Evaluation
        if training_args.do_eval:
            # We normally do validation as part of the Keras fit loop, but we run it independently
            # if there was no fit() step (because we didn't train the model) or if the task is MNLI,
            # because MNLI has a separate validation-mismatched validation set

            # In this example, we compute advanced metrics only at the end of training, and only compute
            # loss and accuracy on the validation set each epoch, but
            # if you'd like to compute metrics every epoch that are too complex to be written as
            # standard Keras metrics, you can use our KerasMetricCallback. See
            # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            if data_args.task_name == "mnli":
                tasks = ["mnli", "mnli-mm"]
                tf_datasets = [
                    tf_data["validation_matched"],
                    tf_data["validation_mismatched"]
                ]
                raw_datasets = [
                    datasets["validation_matched"],
                    datasets["validation_mismatched"]
                ]
            else:
                tasks = [data_args.task_name]
                tf_datasets = [tf_data["validation"]]
                raw_datasets = [datasets["validation"]]

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                eval_predictions = model.predict(tf_dataset)
                eval_metrics = compute_metrics(eval_predictions,
                                               raw_dataset["label"])
                print(f"Evaluation metrics ({task}):")
                print(eval_metrics)
                if training_args.output_dir is not None:
                    output_eval_file = os.path.join(training_args.output_dir,
                                                    "all_results.json")
                    with open(output_eval_file, "w") as writer:
                        writer.write(json.dumps(eval_metrics))

        # endregion

        # region Prediction
        if training_args.do_predict or data_args.predict_file:
            logger.info("*** Predict ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            tasks = []
            tf_datasets = []
            raw_datasets = []
            if training_args.do_predict:
                if data_args.task_name == "mnli":
                    tasks.extend(["mnli", "mnli-mm"])
                    tf_datasets.extend(
                        [tf_data["test_matched"], tf_data["test_mismatched"]])
                    raw_datasets.extend([
                        datasets["test_matched"], datasets["test_mismatched"]
                    ])
                else:
                    tasks.append(data_args.task_name)
                    tf_datasets.append(tf_data["test"])
                    raw_datasets.append(datasets["test"])
            if data_args.predict_file:
                tasks.append("user_data")
                tf_datasets.append(tf_data["user_data"])
                raw_datasets.append(datasets["user_data"])

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                test_predictions = model.predict(tf_dataset)
                if "label" in raw_dataset:
                    test_metrics = compute_metrics(test_predictions,
                                                   raw_dataset["label"])
                    print(f"Test metrics ({task}):")
                    print(test_metrics)

                if is_regression:
                    predictions_to_write = np.squeeze(
                        test_predictions["logits"])
                else:
                    predictions_to_write = np.argmax(
                        test_predictions["logits"], axis=1)

                output_predict_file = os.path.join(
                    training_args.output_dir, f"predict_results_{task}.txt")
                with open(output_predict_file, "w") as writer:
                    logger.info(
                        f"***** Writing prediction results for {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions_to_write):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = model.config.id2label[item]
                            writer.write(f"{index}\t{item}\n")
        # endregion

        if training_args.output_dir is not None and not training_args.push_to_hub:
            # If we're not pushing to hub, at least save a local copy when we're done
            model.save_pretrained(training_args.output_dir)
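For reference, a small standalone sketch of the evaluate.load("glue", ...) metric used in the example above, with made-up predictions ("mrpc" is just an illustrative task name):

import numpy as np
import evaluate

metric = evaluate.load("glue", "mrpc")

logits = np.array([[0.2, 0.8], [1.5, -0.3], [0.1, 0.4]])
labels = [1, 0, 0]

preds = np.argmax(logits, axis=1)
result = metric.compute(predictions=preds, references=labels)
# For MRPC this returns accuracy and F1; when a task reports several metrics,
# they can be averaged into a combined score as done in compute_metrics above.
print(result)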
Example n. 14
0
class MyLoss(tf.keras.losses.Loss):
    def call(self, labels, logits):
        print(labels)
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        # make sure only labels that are not equal to -100
        # are taken into account as loss
        active_loss = tf.not_equal(tf.reshape(labels, (-1, )), -100)
        reduced_logits = tf.boolean_mask(
            tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
        labels = tf.boolean_mask(tf.reshape(labels, (-1, )), active_loss)
        return loss_fn(labels, reduced_logits)


#import tensorflow_addons as tfa
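# Note: the create_optimizer call below passes everything positionally. Which
# parameters the values 0.1, 1e-6 and 0.01 bind to depends on the installed
# transformers version (older releases ordered them as min_lr_ratio, adam_epsilon,
# weight_decay_rate, while newer ones insert adam_beta1/adam_beta2 before
# adam_epsilon), so passing them as keyword arguments would be safer.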
optimizer, lr = create_optimizer(1e-4, 1000000, 10000, 0.1, 1e-6, 0.01)
training_loss = TFMaskedLanguageModelingLoss.compute_loss
model.compile(optimizer=optimizer, loss=MyLoss())
model.fit(gen(),
          callbacks=[
              tf.keras.callbacks.TensorBoard(log_dir="./log",
                                             update_freq='batch',
                                             profile_batch=0),
              tf.keras.callbacks.ModelCheckpoint("./output/model_{epoch}",
                                                 save_freq=200000)
          ])
#model.summary()
#trainer = TFTrainer(
#    model=model,
#    args=training_args,
#    train_dataset=gen(),
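A small sanity-check sketch of the -100 masking used in MyLoss above, on toy tensors (the shapes and values are illustrative):

import tensorflow as tf

# Toy batch: 2 sequences of length 3, vocabulary of size 5.
labels = tf.constant([[1, -100, 3], [-100, 2, 0]])
logits = tf.random.normal((2, 3, 5))

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

# Keep only positions whose label is not -100, mirroring the masking in MyLoss.
active = tf.not_equal(tf.reshape(labels, (-1,)), -100)
flat_logits = tf.boolean_mask(tf.reshape(logits, (-1, 5)), active)
flat_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active)

per_token_loss = loss_fn(flat_labels, flat_logits)
print(per_token_loss.shape)  # (4,) -- one loss value per unmasked token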
Example n. 15
0
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # endregion

    # region Checkpoints
    checkpoint = None
    if len(os.listdir(training_args.output_dir)
           ) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (
                output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless.")
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    cache_dir=model_args.cache_dir)
    else:
        # Downloading and loading the swag dataset from the hub.
        raw_datasets = load_dataset("swag",
                                    "regular",
                                    cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        config_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(first_sentences,
                                       second_sentences,
                                       truncation=True,
                                       max_length=max_seq_length)
        # Un-flatten
        data = {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }
        return data

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        non_label_columns = [
            feature for feature in train_dataset.features
            if feature not in ("label", "labels")
        ]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if not training_args.do_train:
            non_label_columns = [
                feature for feature in eval_dataset.features
                if feature not in ("label", "labels")
            ]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        # custom class defined above, as HF has no data collator for multiple choice
        data_collator = DataCollatorForMultipleChoice(tokenizer)
    # endregion

    with training_args.strategy.scope():
        # region Build model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForMultipleChoice.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        if training_args.do_train:
            total_train_steps = (len(train_dataset) // total_train_batch_size
                                 ) * int(training_args.num_train_epochs)
            optimizer, lr_schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=int(total_train_steps),
                num_warmup_steps=0)
        else:
            optimizer = "adam"  # Just put anything in here, since we're not using it anyway
        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True),
            metrics=[
                tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")
            ],
        )
        # endregion

        # region Training
        if training_args.do_train:
            dataset_exclude_cols = set(non_label_columns + ["label"])
            tf_train_dataset = train_dataset.to_tf_dataset(
                columns=[
                    col for col in train_dataset.column_names
                    if col not in dataset_exclude_cols
                ],
                shuffle=True,
                batch_size=total_train_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label"
                if "label" in train_dataset.column_names else None,
            )

            if training_args.do_eval:
                validation_data = eval_dataset.to_tf_dataset(
                    columns=[
                        col for col in eval_dataset.column_names
                        if col not in dataset_exclude_cols
                    ],
                    shuffle=False,
                    batch_size=total_eval_batch_size,
                    collate_fn=data_collator,
                    drop_remainder=True,
                    # `label_cols` is needed for user-defined losses, such as in this example
                    label_cols="label"
                    if "label" in eval_dataset.column_names else None,
                )
            else:
                validation_data = None
            model.fit(
                tf_train_dataset,
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=[
                    SavePretrainedCallback(output_dir=training_args.output_dir)
                ],
            )
        # endregion

        # region Evaluation
        if training_args.do_eval and not training_args.do_train:
            dataset_exclude_cols = set(non_label_columns + ["label"])
            # Do a standalone evaluation pass
            tf_eval_dataset = eval_dataset.to_tf_dataset(
                columns=[
                    col for col in eval_dataset.column_names
                    if col not in dataset_exclude_cols
                ],
                shuffle=False,
                batch_size=total_eval_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label"
                if "label" in eval_dataset.column_names else None,
            )
            model.evaluate(tf_eval_dataset)
        # endregion

        # region Push to hub
        if training_args.push_to_hub:
            model.push_to_hub(
                finetuned_from=model_args.model_name_or_path,
                tasks="multiple-choice",
                dataset_tags="swag",
                dataset_args="regular",
                dataset="SWAG",
                language="en",
            )
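The DataCollatorForMultipleChoice referenced in the SWAG examples ("custom class defined above") is defined earlier in the full scripts; a condensed sketch of the usual pattern (flatten the choices, pad them like ordinary sequences, then un-flatten) might look like this:

from dataclasses import dataclass
from typing import Optional, Union

import tensorflow as tf
from transformers import PreTrainedTokenizerBase


@dataclass
class DataCollatorForMultipleChoice:
    """Pads multiple-choice features by flattening the choices, padding them,
    and reshaping back to (batch_size, num_choices, seq_len)."""

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str] = True
    max_length: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0] else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One entry per (example, choice) pair.
        flattened = [
            {k: v[i] for k, v in feature.items()}
            for feature in features
            for i in range(num_choices)
        ]
        batch = self.tokenizer.pad(
            flattened,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors="tf",
        )
        # Un-flatten back to one row per example with num_choices columns.
        batch = {
            k: tf.reshape(v, (batch_size, num_choices, -1))
            for k, v in batch.items()
        }
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch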
Example n. 16
0
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        optimizer, lr_scheduler = create_optimizer(
            self.args.learning_rate,
            num_training_steps,
            self.args.warmup_steps,
            adam_epsilon=self.args.adam_epsilon,
            weight_decay_rate=self.args.weight_decay,
        )
        return optimizer, lr_scheduler
Example n. 17
0
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_swag",
                           model_args,
                           data_args,
                           framework="tensorflow")

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # endregion

    # region Checkpoints
    checkpoint = None
    if len(os.listdir(training_args.output_dir)
           ) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (
                output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless.")
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Downloading and loading the swag dataset from the hub.
        raw_datasets = load_dataset(
            "swag",
            "regular",
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        config_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(first_sentences,
                                       second_sentences,
                                       truncation=True,
                                       max_length=max_seq_length)
        # Un-flatten
        data = {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }
        return data
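    # Rough illustration of the flatten/un-flatten above (shapes only): a batch of 2 SWAG
    # examples yields 2 * 4 = 8 (context, ending) pairs for the tokenizer, and the un-flatten
    # step regroups the tokenized lists so each feature becomes a list of 4 sequences per
    # example, i.e. (num_examples, 4, seq_len) once the data collator pads them.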

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset),
                                    data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset),
                                   data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        # custom class defined above, as HF has no data collator for multiple choice
        data_collator = DataCollatorForMultipleChoice(tokenizer)
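        # (Presumably this collator pads the flattened choices with the tokenizer and regroups
        # them into (batch_size, 4, seq_len) tensors; this description is an assumption based on
        # the usual multiple-choice collator pattern, since its definition is not shown here.)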
    # endregion

    with training_args.strategy.scope():
        # region Build model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForMultipleChoice.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas

        if training_args.do_train:
            num_train_steps = (len(train_dataset) // total_train_batch_size
                               ) * int(training_args.num_train_epochs)
            if training_args.warmup_steps > 0:
                num_warmup_steps = training_args.warmup_steps
            elif training_args.warmup_ratio > 0:
                num_warmup_steps = int(num_train_steps *
                                       training_args.warmup_ratio)
            else:
                num_warmup_steps = 0
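            # For example, with 10,000 optimizer steps and --warmup_ratio 0.1, num_warmup_steps
            # would be 1,000; create_optimizer then ramps the learning rate up over those warmup
            # steps and, by default, decays it linearly to zero over the remaining steps.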
            optimizer, lr_schedule = create_optimizer(
                init_lr=training_args.learning_rate,
                num_train_steps=num_train_steps,
                num_warmup_steps=num_warmup_steps,
                adam_beta1=training_args.adam_beta1,
                adam_beta2=training_args.adam_beta2,
                adam_epsilon=training_args.adam_epsilon,
                weight_decay_rate=training_args.weight_decay,
                adam_global_clipnorm=training_args.max_grad_norm,
            )
        else:
            optimizer = None
        model.compile(optimizer=optimizer,
                      metrics=["accuracy"],
                      jit_compile=training_args.xla)
        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "multiple-choice"
        }

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training
        eval_metrics = None
        if training_args.do_train:
            dataset_options = tf.data.Options()
            dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

            # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
            # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
            # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
            # yourself if you use this method, whereas they are automatically inferred from the model input names when
            # using model.prepare_tf_dataset()
            # For more info see the docs:
            # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
            # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
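            # As a rough, hypothetical illustration of the lower-level alternative mentioned above
            # (the column names here are assumptions and would need to match your tokenized dataset):
            #
            #   tf_train_dataset = train_dataset.to_tf_dataset(
            #       columns=["input_ids", "attention_mask"],
            #       label_cols=["label"],
            #       shuffle=True,
            #       batch_size=total_train_batch_size,
            #       collate_fn=data_collator,
            #   ).with_options(dataset_options)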

            tf_train_dataset = model.prepare_tf_dataset(
                train_dataset,
                shuffle=True,
                batch_size=total_train_batch_size,
                collate_fn=data_collator,
            ).with_options(dataset_options)

            if training_args.do_eval:
                validation_data = model.prepare_tf_dataset(
                    eval_dataset,
                    shuffle=False,
                    batch_size=total_eval_batch_size,
                    collate_fn=data_collator,
                    drop_remainder=True,
                ).with_options(dataset_options)
            else:
                validation_data = None
            history = model.fit(
                tf_train_dataset,
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
            eval_metrics = {
                key: val[-1]
                for key, val in history.history.items()
            }
        # endregion

        # region Evaluation
        if training_args.do_eval and not training_args.do_train:
            dataset_options = tf.data.Options()
            dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
            # Do a standalone evaluation pass
            tf_eval_dataset = model.prepare_tf_dataset(
                eval_dataset,
                shuffle=False,
                batch_size=total_eval_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
            ).with_options(dataset_options)
            eval_results = model.evaluate(tf_eval_dataset)
            eval_metrics = {
                "val_loss": eval_results[0],
                "val_accuracy": eval_results[1]
            }
        # endregion

        if eval_metrics is not None and training_args.output_dir is not None:
            output_eval_file = os.path.join(training_args.output_dir,
                                            "all_results.json")
            with open(output_eval_file, "w") as writer:
                writer.write(json.dumps(eval_metrics))

        # region Push to hub

        if training_args.output_dir is not None and not training_args.push_to_hub:
            # If we're not pushing to hub, at least save a local copy when we're done
            model.save_pretrained(training_args.output_dir)
Example no. 18
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity(logging.INFO)
    transformers.utils.logging.set_verbosity(logging.INFO)

    # Log on each process the small summary:
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region T5 special-casing
    if data_args.source_prefix is None and model_args.model_name_or_path in [
        "t5-small",
        "t5-base",
        "t5-large",
        "t5-3b",
        "t5-11b",
    ]:
        logger.warning(
            "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
            "`--source_prefix 'summarize: ' `"
        )
    # endregion

    # region Detecting last checkpoint
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files this script will use the first column for the full texts and the second column for the
    # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load model config and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
    # endregion

    # region Dataset preprocessing
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, and/or `do_eval`.")
        return

    # Get the column names for input/target.
    dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
    if data_args.text_column is None:
        text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
    else:
        text_column = data_args.text_column
        if text_column not in column_names:
            raise ValueError(
                f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
            )
    if data_args.summary_column is None:
        summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
    else:
        summary_column = data_args.summary_column
        if summary_column not in column_names:
            raise ValueError(
                f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
            )

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    def preprocess_function(examples):
        inputs = examples[text_column]
        targets = examples[summary_column]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
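    # For example, with padding="max_length" and ignore_pad_token_for_loss=True, target ids such
    # as [8774, 6, 1, 0, 0] (made-up ids, pad_token_id=0) become labels [8774, 6, 1, -100, -100],
    # so the padded positions do not contribute to the loss.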

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
    else:
        train_dataset = None

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
    else:
        eval_dataset = None
    # endregion

    # region Text preprocessing
    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels
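    # e.g. "the cat sat. the dog barked." -> "the cat sat.\nthe dog barked.", which is the
    # sentence-per-line format that rougeLsum expects.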

    # endregion

    with training_args.strategy.scope():
        # region Prepare model
        model = TFAutoModelForSeq2SeqLM.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region Prepare TF Dataset objects
        if model.config.decoder_start_token_id is None:
            raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        tf_train_dataset = dataset_to_tf(
            train_dataset,
            model,
            tokenizer,
            total_batch_size=total_train_batch_size,
            num_epochs=training_args.num_train_epochs,
            shuffle=True,
        )
        tf_eval_dataset = dataset_to_tf(
            eval_dataset,
            model,
            tokenizer,
            total_batch_size=total_eval_batch_size,
            num_epochs=1,
            shuffle=False,
        )
        # endregion

        # region Optimizer, loss and LR scheduling
        # Scheduler and math around the number of training steps.
        num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size
        num_train_steps = int(training_args.num_train_epochs * num_update_steps_per_epoch)
        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0
        )

        def masked_sparse_categorical_crossentropy(y_true, y_pred):
            # We clip the negative labels to 0 to avoid NaNs appearing in the output and
            # fouling up everything that comes afterwards. The loss values corresponding to clipped values
            # will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
            # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
            # event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
            # More pragmatically, consider redesigning your tokenizer.
            losses = tf.keras.losses.sparse_categorical_crossentropy(
                tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
            )
            # Compute the per-sample loss only over the unmasked tokens
            losses = tf.ragged.boolean_mask(losses, y_true != -100)
            losses = tf.reduce_mean(losses, axis=-1)
            return losses
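        # Worked example (made-up numbers): for a row with y_true = [21603, 10, -100, -100], only
        # the per-token losses at the first two positions survive the boolean mask, and the value
        # returned for that row is their mean.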

        # endregion

        # region Metric
        metric = load_metric("rouge")
        # endregion

        # region Training
        model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)

        if training_args.do_train:
            logger.info("***** Running training *****")
            logger.info(f"  Num examples = {len(train_dataset)}")
            logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
            logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
            logger.info(f"  Total train batch size = {total_train_batch_size}")
            logger.info(f"  Total optimization steps = {num_train_steps}")

            model.fit(
                tf_train_dataset,
                epochs=int(training_args.num_train_epochs),
                steps_per_epoch=num_update_steps_per_epoch,
            )
        # endregion

        # region Validation
        if data_args.val_max_target_length is None:
            data_args.val_max_target_length = data_args.max_target_length

        gen_kwargs = {
            "max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
            "num_beams": data_args.num_beams,
        }
        if training_args.do_eval:
            logger.info("Evaluation...")
            for batch, labels in tqdm(
                tf_eval_dataset, total=len(eval_dataset) // total_eval_batch_size
            ):
                batch.update(gen_kwargs)
                generated_tokens = model.generate(**batch)
                if isinstance(generated_tokens, tuple):
                    generated_tokens = generated_tokens[0]
                decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
                labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
                decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
                decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

                metric.add_batch(predictions=decoded_preds, references=decoded_labels)

            result = metric.compute(use_stemmer=True)
            # Extract a few results from ROUGE
            result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

            result = {k: round(v, 4) for k, v in result.items()}

            logger.info(result)
        # endregion

        if training_args.output_dir is not None:
            model.save_pretrained(training_args.output_dir)
Example no. 19
def main():
    # region Argument Parsing
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_clm", model_args, data_args, framework="tensorflow")

    # Sanity checks
    if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None:
        raise ValueError("Need either a dataset name or a training/validation file.")
    else:
        if data_args.train_file is not None:
            extension = data_args.train_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
        if data_args.validation_file is not None:
            extension = data_args.validation_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."

    if training_args.output_dir is not None:
        training_args.output_dir = Path(training_args.output_dir)
        os.makedirs(training_args.output_dir, exist_ok=True)
    # endregion

    # region Checkpoints
    # Detecting last checkpoint.
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        config_path = training_args.output_dir / CONFIG_NAME
        weights_path = training_args.output_dir / TF2_WEIGHTS_NAME
        if config_path.is_file() and weights_path.is_file():
            checkpoint = training_args.output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )

    # endregion

    # region Setup logging
    # Set the verbosity of this script and of the datasets/transformers libraries.
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
    # endregion

    # If passed along, set the training seed now.
    if training_args.seed is not None:
        set_seed(training_args.seed)

    # region Load datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    else:
        data_files = {}
        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = (
            data_args.train_file.split(".")[-1]
            if data_args.train_file is not None
            else data_args.validation_file.split(".")[-1]
        )
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
            **dataset_args,
        )
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
                **dataset_args,
            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    # endregion

    # region Dataset preprocessing
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on dataset",
    )

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
            block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could pad instead of dropping if the model supported it.
        # You can customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result
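    # Worked example (made-up ids): with block_size=4, input_ids batches [[1, 2, 3], [4, 5, 6, 7, 8]]
    # are concatenated to [1, ..., 8], total_length is truncated to 8, and the result is
    # {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]], "labels": <a copy of "input_ids">} (with the
    # attention_mask chunked the same way).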

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
        desc=f"Grouping texts in chunks of {block_size}",
    )

    train_dataset = lm_datasets["train"]
    if data_args.validation_file is not None:
        eval_dataset = lm_datasets["validation"]
    else:
        logger.info(
            f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation"
            " as provided in data_args"
        )
        train_indices, val_indices = train_test_split(
            list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100
        )

        eval_dataset = train_dataset.select(val_indices)
        train_dataset = train_dataset.select(train_indices)

    if data_args.max_train_samples is not None:
        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
        train_dataset = train_dataset.select(range(max_train_samples))
    if data_args.max_eval_samples is not None:
        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
        eval_dataset = eval_dataset.select(range(max_eval_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
    # endregion

    with training_args.strategy.scope():
        # region Prepare model
        if checkpoint is not None:
            model = TFAutoModelForCausalLM.from_pretrained(checkpoint, config=config)
        elif model_args.model_name_or_path:
            model = TFAutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config)
        else:
            logger.info("Training new model from scratch")
            model = TFAutoModelForCausalLM.from_config(config)

        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region TF Dataset preparation
        num_replicas = training_args.strategy.num_replicas_in_sync
        options = tf.data.Options()
        options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
        # yourself if you use this method, whereas they are automatically inferred from the model input names when
        # using model.prepare_tf_dataset()
        # For more info see the docs:
        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset

        tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
            shuffle=True,
            batch_size=num_replicas * training_args.per_device_train_batch_size,
        ).with_options(options)

        tf_eval_dataset = model.prepare_tf_dataset(
            eval_dataset,
            shuffle=False,
            batch_size=num_replicas * training_args.per_device_eval_batch_size,
            drop_remainder=True,
        ).with_options(options)
        # endregion

        # region Optimizer and loss
        num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
        if training_args.warmup_steps > 0:
            num_warmup_steps = training_args.warmup_steps
        elif training_args.warmup_ratio > 0:
            num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
        else:
            num_warmup_steps = 0

        # Bias and layernorm weights are automatically excluded from the decay
        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            adam_beta1=training_args.adam_beta1,
            adam_beta2=training_args.adam_beta2,
            adam_epsilon=training_args.adam_epsilon,
            weight_decay_rate=training_args.weight_decay,
            adam_global_clipnorm=training_args.max_grad_norm,
        )

        # No loss is specified here, so the model's internal loss will be used.
        model.compile(optimizer=optimizer, jit_compile=training_args.xla)
        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            if data_args.dataset_name is not None:
                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
            else:
                push_to_hub_model_id = f"{model_name}-finetuned-clm"

        model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
        if data_args.dataset_name is not None:
            model_card_kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                model_card_kwargs["dataset"] = data_args.dataset_name

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training and validation
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_dataset)}")
        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
        logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
        logger.info(f"  Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")

        # For long training runs, you may wish to use the PushToHubCallback here to save intermediate checkpoints
        # to the Hugging Face Hub rather than just pushing the finished model.
        # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback

        history = model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(training_args.num_train_epochs),
            callbacks=callbacks,
        )
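        # Perplexity is exp(cross-entropy loss); math.exp can overflow for very large losses,
        # hence the OverflowError guards below.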
        train_loss = history.history["loss"][-1]
        try:
            train_perplexity = math.exp(train_loss)
        except OverflowError:
            train_perplexity = math.inf
        logger.info(f"  Final train loss: {train_loss:.3f}")
        logger.info(f"  Final train perplexity: {train_perplexity:.3f}")
        validation_loss = history.history["val_loss"][-1]
        try:
            validation_perplexity = math.exp(validation_loss)
        except OverflowError:
            validation_perplexity = math.inf
        logger.info(f"  Final validation loss: {validation_loss:.3f}")
        logger.info(f"  Final validation perplexity: {validation_perplexity:.3f}")

        if training_args.output_dir is not None:
            output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
            results_dict = dict()
            results_dict["train_loss"] = train_loss
            results_dict["train_perplexity"] = train_perplexity
            results_dict["eval_loss"] = validation_loss
            results_dict["eval_perplexity"] = validation_perplexity
            with open(output_eval_file, "w") as writer:
                writer.write(json.dumps(results_dict))
        # endregion

    if training_args.output_dir is not None and not training_args.push_to_hub:
        # If we're not pushing to hub, at least save a local copy when we're done
        model.save_pretrained(training_args.output_dir)
Example no. 20
    def _custom_train(
        self,
        train_dataset,
        tokenizer,
        model,
        num_train_examples,
        train_batch_size,
    ):
        config = self.parent.config._asdict()
        config["strategy"] = self.parent.config.strategy
        config["n_device"] = self.parent.config.n_device
        labels = config["ner_tags"]
        if config["max_steps"] > 0:
            num_train_steps = (
                config["max_steps"] * config["gradient_accumulation_steps"]
            )
            config["epochs"] = 1
        else:
            num_train_steps = (
                math.ceil(num_train_examples / train_batch_size)
                // config["gradient_accumulation_steps"]
                * config["epochs"]
            )

        with config["strategy"].scope():
            loss_fct = self.tf.keras.losses.SparseCategoricalCrossentropy(
                reduction=self.tf.keras.losses.Reduction.NONE
            )
            optimizer = create_optimizer(
                config["learning_rate"],
                num_train_steps,
                config["warmup_steps"],
            )

            if config["use_fp16"]:
                optimizer = self.tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                    optimizer, "dynamic"
                )

            loss_metric = self.tf.keras.metrics.Mean(
                name="loss", dtype=self.tf.float32
            )
            gradient_accumulator = GradientAccumulator()

        self.logger.info("***** Running training *****")
        self.logger.info("  Num examples = %d", num_train_examples)
        self.logger.info("  Num Epochs = %d", config["epochs"])
        self.logger.info(
            "  Instantaneous batch size per device = %d",
            config["per_device_train_batch_size"],
        )
        self.logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            train_batch_size * config["gradient_accumulation_steps"],
        )
        self.logger.info(
            "  Gradient Accumulation steps = %d",
            config["gradient_accumulation_steps"],
        )
        self.logger.info("  Total training steps = %d", num_train_steps)

        self.logger.debug(model.summary())

        @self.tf.function
        def apply_gradients():
            grads_and_vars = []
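            # Scale the accumulated gradients by 1 / (n_device * gradient_accumulation_steps) so
            # the applied update corresponds to the mean gradient over the effective global batch.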

            for gradient, variable in zip(
                gradient_accumulator.gradients, model.trainable_variables
            ):
                if gradient is not None:
                    scaled_gradient = gradient / (
                        config["n_device"]
                        * config["gradient_accumulation_steps"]
                    )
                    grads_and_vars.append((scaled_gradient, variable))
                else:
                    grads_and_vars.append((gradient, variable))

            optimizer.apply_gradients(grads_and_vars, config["max_grad_norm"])
            gradient_accumulator.reset()

        @self.tf.function
        def train_step(train_features, train_labels):
            def step_fn(train_features, train_labels):
                inputs = {
                    "attention_mask": train_features["input_mask"],
                    "training": True,
                }

                if config["model_architecture_type"] != "distilbert":
                    inputs["token_type_ids"] = (
                        train_features["segment_ids"]
                        if config["model_architecture_type"]
                        in ["bert", "xlnet"]
                        else None
                    )

                with self.tf.GradientTape() as tape:
                    logits = model(train_features["input_ids"], **inputs)[0]
                    logits = self.tf.reshape(logits, (-1, len(labels) + 1))
                    active_loss = self.tf.reshape(
                        train_features["input_mask"], (-1,)
                    )
                    active_logits = self.tf.boolean_mask(logits, active_loss)
                    train_labels = self.tf.reshape(train_labels, (-1,))
                    active_labels = self.tf.boolean_mask(
                        train_labels, active_loss
                    )
                    cross_entropy = loss_fct(active_labels, active_logits)
                    loss = self.tf.reduce_sum(cross_entropy) * (
                        1.0 / train_batch_size
                    )
                    grads = tape.gradient(loss, model.trainable_variables)

                    gradient_accumulator(grads)

                return cross_entropy

            per_example_losses = config["strategy"].run(
                step_fn, args=(train_features, train_labels)
            )
            mean_loss = config["strategy"].reduce(
                self.tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0
            )

            return mean_loss

        current_time = datetime.datetime.now()
        train_iterator = master_bar(range(config["epochs"]))
        global_step = 0
        self.logger_loss = 0.0

        for epoch in train_iterator:
            epoch_iterator = progress_bar(
                train_dataset,
                total=num_train_steps,
                parent=train_iterator,
                display=config["n_device"] > 1,
            )
            step = 1

            with config["strategy"].scope():
                for train_features, train_labels in epoch_iterator:
                    loss = train_step(train_features, train_labels)

                    if step % config["gradient_accumulation_steps"] == 0:
                        config["strategy"].run(apply_gradients)
                        loss_metric(loss)
                        global_step += 1
                        if (
                            config["save_steps"] > 0
                            and global_step % config["save_steps"] == 0
                        ):
                            # Save model checkpoint
                            output_dir = os.path.join(
                                config["output_dir"],
                                "checkpoint-{}".format(global_step),
                            )

                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)

                            model.save_pretrained(output_dir)
                            self.logger.info(
                                "Saving model checkpoint to %s", output_dir
                            )

                    train_iterator.child.comment = (
                        f"loss : {loss_metric.result()}"
                    )
                    step += 1

            train_iterator.write(
                f"loss epoch {epoch + 1}: {loss_metric.result()}"
            )
            loss_metric.reset_states()
        self.logger.debug(
            "  Training took time = {}".format(
                datetime.datetime.now() - current_time
            )
        )
Example no. 21
import time
from transformers import create_optimizer

steps_per_epoch = train_size // BATCH_SIZE
validation_steps = validation_size // BATCH_SIZE

# | Loss Function
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)
train_loss = tf.keras.metrics.Mean(name='train_loss')
validation_loss = tf.keras.metrics.Mean(name='test_loss')

# | Optimizer (with 1-cycle-policy)
warmup_steps = steps_per_epoch // 3
total_steps = steps_per_epoch * NR_EPOCHS - warmup_steps
optimizer = create_optimizer(init_lr=2e-5, num_train_steps=total_steps, num_warmup_steps=warmup_steps)
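# Note: recent versions of transformers return an (optimizer, lr_schedule) tuple from
# create_optimizer (see the `optimizer, lr_schedule = create_optimizer(...)` calls in the
# examples above); on such a version you would need to unpack the tuple here as well.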

# | Metrics
train_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(label_cols))]
validation_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(label_cols))]

@tf.function
def train_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)
    with tf.GradientTape() as tape:
        predictions = model(token_ids, attention_mask=masks)
        loss = loss_object(labels, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
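
# The snippet defines `validation_loss` and `validation_auc_metrics` above but does not show the
# matching evaluation step. A minimal sketch of what it could look like, mirroring `train_step`
# (this function is an assumption, not part of the original snippet):
@tf.function
def validation_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)
    # No GradientTape here: only the forward pass is needed for evaluation.
    predictions = model(token_ids, attention_mask=masks, training=False)
    loss = loss_object(labels, predictions)
    validation_loss(loss)
    # Update one AUC metric per label column.
    for i, auc_metric in enumerate(validation_auc_metrics):
        auc_metric.update_state(labels[:, i], predictions[:, i])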
Example no. 22
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples,
          labels, train_batch_size, pad_token_label_id):
    if args["max_steps"] > 0:
        num_train_steps = args["max_steps"] * args[
            "gradient_accumulation_steps"]
        args["num_train_epochs"] = 1
    else:
        num_train_steps = (math.ceil(num_train_examples / train_batch_size) //
                           args["gradient_accumulation_steps"] *
                           args["num_train_epochs"])

    writer = tf.summary.create_file_writer("/tmp/mylogs")

    with strategy.scope():
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(args["learning_rate"], num_train_steps,
                                     args["warmup_steps"])

        if args["fp16"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, "dynamic")

        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()

    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", num_train_examples)
    logging.info("  Num Epochs = %d", args["num_train_epochs"])
    logging.info("  Instantaneous batch size per device = %d",
                 args["per_device_train_batch_size"])
    logging.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size * args["gradient_accumulation_steps"],
    )
    logging.info("  Gradient Accumulation steps = %d",
                 args["gradient_accumulation_steps"])
    logging.info("  Total training steps = %d", num_train_steps)

    model.summary()

    @tf.function
    def apply_gradients():
        grads_and_vars = []

        for gradient, variable in zip(gradient_accumulator.gradients,
                                      model.trainable_variables):
            if gradient is not None:
                scaled_gradient = gradient / (
                    args["n_device"] * args["gradient_accumulation_steps"])
                grads_and_vars.append((scaled_gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))

        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        def step_fn(train_features, train_labels):
            inputs = {
                "attention_mask": train_features["attention_mask"],
                "training": True
            }

            if "token_type_ids" in train_features:
                inputs["token_type_ids"] = train_features["token_type_ids"]

            with tf.GradientTape() as tape:
                logits = model(train_features["input_ids"], **inputs)[0]
                active_loss = tf.reshape(train_labels,
                                         (-1, )) != pad_token_label_id
                active_logits = tf.boolean_mask(
                    tf.reshape(logits, (-1, len(labels))), active_loss)
                active_labels = tf.boolean_mask(
                    tf.reshape(train_labels, (-1, )), active_loss)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
                grads = tape.gradient(loss, model.trainable_variables)

                gradient_accumulator(grads)

            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn,
                                                          args=(train_features,
                                                                train_labels))
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                    per_example_losses,
                                    axis=0)

        return mean_loss

    current_time = datetime.datetime.now()
    train_iterator = master_bar(range(args["num_train_epochs"]))
    global_step = 0
    logging_loss = 0.0

    for epoch in train_iterator:
        epoch_iterator = progress_bar(train_dataset,
                                      total=num_train_steps,
                                      parent=train_iterator,
                                      display=args["n_device"] > 1)
        step = 1

        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)

                if step % args["gradient_accumulation_steps"] == 0:
                    strategy.experimental_run_v2(apply_gradients)

                    loss_metric(loss)

                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args[
                            "logging_steps"] == 0:
                        # Log metrics
                        if (
                                args["n_device"] == 1
                                and args["evaluate_during_training"]
                        ):  # Only evaluate when single GPU otherwise metrics may not average well
                            y_true, y_pred, eval_loss = evaluate(
                                args,
                                strategy,
                                model,
                                tokenizer,
                                labels,
                                pad_token_label_id,
                                mode="dev")
                            report = metrics.classification_report(y_true,
                                                                   y_pred,
                                                                   digits=4)

                            logging.info("Eval at step " + str(global_step) +
                                         "\n" + report)
                            logging.info("eval_loss: " + str(eval_loss))

                            precision = metrics.precision_score(y_true, y_pred)
                            recall = metrics.recall_score(y_true, y_pred)
                            f1 = metrics.f1_score(y_true, y_pred)

                            with writer.as_default():
                                tf.summary.scalar("eval_loss", eval_loss,
                                                  global_step)
                                tf.summary.scalar("precision", precision,
                                                  global_step)
                                tf.summary.scalar("recall", recall,
                                                  global_step)
                                tf.summary.scalar("f1", f1, global_step)

                        lr = optimizer.learning_rate
                        learning_rate = lr(step)

                        with writer.as_default():
                            tf.summary.scalar("lr", learning_rate, global_step)
                            tf.summary.scalar(
                                "loss", (loss_metric.result() - logging_loss) /
                                args["logging_steps"], global_step)

                        logging_loss = loss_metric.result()

                    with writer.as_default():
                        tf.summary.scalar("loss",
                                          loss_metric.result(),
                                          step=step)

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(
                            args["output_dir"],
                            "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)

                        model.save_pretrained(output_dir)
                        logging.info("Saving model checkpoint to %s",
                                     output_dir)

                train_iterator.child.comment = f"loss : {loss_metric.result()}"
                step += 1

        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")

        loss_metric.reset_states()

    logging.info("  Training took time = {}".format(datetime.datetime.now() -
                                                    current_time))
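
The training function above implements gradient accumulation by hand: per-batch gradients are added into a GradientAccumulator and only applied, averaged over devices and accumulation steps, every gradient_accumulation_steps batches. Below is a minimal, strategy-free sketch of the same accumulate-then-apply pattern; the toy model, dataset and ACCUM_STEPS value are placeholders, not part of the original example.

import tensorflow as tf

ACCUM_STEPS = 4  # assumed accumulation factor

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
model.build(input_shape=(None, 8))
optimizer = tf.keras.optimizers.Adam(1e-3)
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# one zero-initialized accumulator per trainable variable
accumulated = [tf.Variable(tf.zeros_like(v), trainable=False)
               for v in model.trainable_variables]

@tf.function
def micro_step(x, y):
    # compute gradients for one micro-batch and add them to the accumulators
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss = loss_fct(y, logits)
    grads = tape.gradient(loss, model.trainable_variables)
    for acc, g in zip(accumulated, grads):
        if g is not None:
            acc.assign_add(g)
    return loss

@tf.function
def apply_accumulated():
    # average the accumulated gradients, apply them, then reset
    optimizer.apply_gradients(
        [(acc / ACCUM_STEPS, var)
         for acc, var in zip(accumulated, model.trainable_variables)])
    for acc in accumulated:
        acc.assign(tf.zeros_like(acc))

# toy data just to exercise the loop
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal((64, 8)),
     tf.random.uniform((64,), maxval=2, dtype=tf.int32))).batch(8)

for step, (x, y) in enumerate(dataset, start=1):
    micro_step(x, y)
    if step % ACCUM_STEPS == 0:
        apply_accumulated()

The division by ACCUM_STEPS here mirrors the scaling by n_device * gradient_accumulation_steps in apply_gradients() above.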
Example no. 23
0
    # assumed tail of encode_examples(): wrap the collected feature lists in a tf.data.Dataset
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list,
         label_list)).map(map_example_to_dict)


train_encoded = encode_examples(train_data).shuffle(BUFFER_SIZE).batch(
    BATCH_SIZE)
test_encoded = encode_examples(test_data).batch(BATCH_SIZE)

print('ENCODE SUCCESSFULLY')

# build the model by loading pretrained BERT weights directly
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

total_steps = int(train_size / BATCH_SIZE) * EPOCHS
optimizer, _ = transformers.create_optimizer(init_lr=ALPHA,
                                             num_train_steps=total_steps,
                                             num_warmup_steps=0,
                                             adam_epsilon=EPSILON)
optimizer.clipnorm = 1.0
'''
optimizer = transformers.AdamWeightDecay(
    learning_rate=ALPHA, 
    epsilon=EPSILON, 
    clipnorm=1.0)
'''

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.summary()
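
Example no. 23 is cut off mid-way through its encoding helper, so map_example_to_dict and the body of encode_examples are not shown. The sketch below illustrates what those helpers typically look like in this kind of pipeline; the tokenizer, MAX_LEN and padding settings are assumptions, not taken from the original snippet.

import tensorflow as tf
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 128  # assumed maximum sequence length


def map_example_to_dict(input_ids, attention_mask, token_type_ids, label):
    # regroup the flat tensors into the (features dict, label) pair Keras expects
    return ({"input_ids": input_ids,
             "attention_mask": attention_mask,
             "token_type_ids": token_type_ids}, label)


def encode_examples(examples):
    # examples: iterable of (text, label) pairs
    input_ids_list, attention_mask_list = [], []
    token_type_ids_list, label_list = [], []
    for text, label in examples:
        enc = tokenizer(text, max_length=MAX_LEN,
                        padding="max_length", truncation=True)
        input_ids_list.append(enc["input_ids"])
        attention_mask_list.append(enc["attention_mask"])
        token_type_ids_list.append(enc["token_type_ids"])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list,
         label_list)).map(map_example_to_dict)

With the model compiled as above, training would then proceed with something like model.fit(train_encoded, validation_data=test_encoded, epochs=EPOCHS).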