Example #1
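Fine-tunes XLNetForQuestionAnswering on a SQuAD-style dataset with beam-search answer post-processing, using HfArgumentParser and a QuestionAnsweringTrainer.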
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
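        # For example (illustrative values only): `python <this_script>.py args.json`, where args.json contains
        # {"model_name_or_path": "xlnet-base-cased", "output_dir": "./output", "do_train": true}.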
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = XLNetConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = XLNetTokenizerFast.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = XLNetForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens in the context get 0.0, the others get 1.0.
            # The cls token also gets 0.0 (it is used to predict impossible answers).
            tokenized_examples["p_mask"].append(
                [
                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                    for k, s in enumerate(sequence_ids)
                ]
            )

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples

    if training_args.do_train:
        train_dataset = datasets["train"].map(
            prepare_train_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens in the context get 0.0, the others 1.0.
            tokenized_examples["p_mask"].append(
                [
                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                    for k, s in enumerate(sequence_ids)
                ]
            )

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_idx else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        validation_dataset = datasets["validation"].map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
    data_collator = default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding(tokenizer)

    # Post-processing:
    def post_processing_function(examples, features, predictions):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=training_args.output_dir,
            is_world_process_zero=trainer.is_world_process_zero(),
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"]]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    # TODO: Once the fix lands in a Datasets release, remove the _local here and the squad_v2_local folder.
    current_dir = os.path.sep.join(os.path.join(__file__).split(os.path.sep)[:-1])
    metric = load_metric(os.path.join(current_dir, "squad_v2_local") if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=validation_dataset if training_args.do_eval else None,
        eval_examples=datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
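A separate example builds vocabulary and reverse-vocabulary lookup tables from the pretrained tokenizer of several model families (word-level, BERT, XLNet, T5, BART):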
    def _build_vocab(self, max_vocab_cnt):
        # build vocab
        if self.tokenizer_type.startswith('word'):
            self._build_vocab_manual(max_vocab_cnt)
        elif self.tokenizer_type.startswith('bert-'):
            # BERT tokenizers are WordPiece-based and have no sp_model, so use the generic tokenizer API.
            self.pad_id = self.tokenizer.pad_token_id
            # self.vocab_count = 30522  # fixed for pretrained BERT vocab (old version)
            config_pretrained = BertConfig.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.convert_ids_to_tokens(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

            self.vocab = map_vocab
            self.rev_vocab = inv_map

        elif self.tokenizer_type.startswith('xlnet-'):
            # self.vocab = self.tokenizer.vocab
            # self.rev_vocab = self.tokenizer.ids_to_tokens
            # self.pad_id = self.vocab["[PAD]"]
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000  # fixed for pretrained BERT vocab
            config_pretrained = XLNetConfig.from_pretrained(
                self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

            self.vocab = map_vocab
            self.rev_vocab = inv_map

        elif self.tokenizer_type.startswith('t5-'):
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000
            config_pretrained = T5Config.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}
            self.vocab = map_vocab
            self.rev_vocab = inv_map

        elif self.tokenizer_type.startswith('bart-'):
            # BART tokenizers are BPE-based and have no sp_model, so use the generic tokenizer API.
            self.pad_id = self.tokenizer.pad_token_id
            # self.vocab_count = 32000
            config_pretrained = BartConfig.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.convert_ids_to_tokens(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

            self.vocab = map_vocab
            self.rev_vocab = inv_map

        return
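Another example pretrains XLNetLMHeadModel with the permutation language modeling objective on plain-text or FASTA sequence files; the custom load_dataset_fasta and FastaDataset helpers come from that project (a minimal sketch of FastaDataset is given after the example).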
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "fasta":
            FASTA_DATASET = True

            datasets = load_dataset_fasta(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = XLNetTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel(config)

    # Make sure the model's token embedding matrix matches the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        # Tokenize
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length', # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False
        )
        
        torch_dataset = FastaDataset(encodings)
        tokenized_datasets[dataset_key] = torch_dataset


    # Data collator
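    # DataCollatorForPermutationLanguageModeling builds XLNet's permutation-LM targets: spans of at most
    # max_span_length tokens are masked, and plm_probability is the ratio of span length to surrounding context length.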
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
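The FastaDataset wrapper used above is project-specific and not shown; a minimal sketch, assuming it simply exposes the tokenizer output as a map-style PyTorch dataset (the real helper may differ), could look like this:

import torch
from torch.utils.data import Dataset


class FastaDataset(Dataset):
    # Wraps the dict returned by the tokenizer so the Trainer / data collator can index single examples.
    def __init__(self, encodings):
        self.encodings = encodings  # e.g. {"input_ids": [...], "special_tokens_mask": [...]}

    def __getitem__(self, idx):
        # One example as a dict of tensors, the format the data collator expects.
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])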
Example #4
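Fine-tunes or pretrains XLNetLMHeadModel with the permutation language modeling objective, including checkpoint detection and resumption, line-by-line or chunked tokenization, and an optional push to the Hugging Face Hub.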
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name,
                                    cache_dir=model_args.cache_dir)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    cache_dir=model_args.cache_dir)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = XLNetConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(examples["text"],
                             padding=padding,
                             truncation=True,
                             max_length=max_seq_length)

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset line_by_line",
            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on every text in dataset",
            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            if total_length >= max_seq_length:
                total_length = (total_length //
                                max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        with training_args.main_process_first(desc="grouping texts together"):
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
                desc=f"Grouping texts in chunks of {max_seq_length}",
            )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = (
            data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        )
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "language-modeling"
        }
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)
Example #5
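Builds SQuAD-style training, validation, and prediction features for XLNetForQuestionAnswering in a script driven by Accelerate instead of the Trainer.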
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        if args.test_file is not None:
            data_files["test"] = args.test_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = XLNetConfig.from_pretrained(args.model_name_or_path)
    tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path)
    model = XLNetForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config)

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    column_names = raw_datasets["train"].column_names

    question_column_name = "question" if "question" in column_names else column_names[
        0]
    context_column_name = "context" if "context" in column_names else column_names[
        1]
    answer_column_name = "answers" if "answers" in column_names else column_names[
        2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )

    max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special tokens in the context get 0.0, the others get 1.0.
            # The cls token also gets 0.0 (it is used to predict impossible answers).
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples

    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]
    if args.max_train_samples is not None:
        # We will select a sample from the whole data if the argument is specified
        train_dataset = train_dataset.select(range(args.max_train_samples))
    # Create training features from the dataset
    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on train dataset",
    )
    if args.max_train_samples is not None:
        # The number of samples might increase during feature creation, so we select only the specified max samples
        train_dataset = train_dataset.select(range(args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non-special context tokens (and the CLS token) get 0.0, the others 1.0.
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set to None the offset_mapping entries that are not part of the context, so it is easy to determine
            # whether a token position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_idx else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples
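
    # Quick sanity check: for a "[CLS] question [SEP] context [SEP]" pair,
    # p_mask comes out 1.0 on question and special tokens and 0.0 on context
    # tokens and on the CLS position (used by beam-search QA heads for "no answer").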

    if "validation" not in raw_datasets:
        raise ValueError("--do_eval requires a validation dataset")
    eval_examples = raw_datasets["validation"]
    if args.max_eval_samples is not None:
        # We will select samples from the whole data
        eval_examples = eval_examples.select(range(args.max_eval_samples))
    # Validation Feature Creation
    eval_dataset = eval_examples.map(
        prepare_validation_features,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on validation dataset",
    )

    if args.max_eval_samples is not None:
        # The number of samples might increase during feature creation, so we select the required number again
        eval_dataset = eval_dataset.select(range(args.max_eval_samples))

    if args.do_predict:
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_examples = raw_datasets["test"]
        if args.max_predict_samples is not None:
            # We will select samples from the whole data
            predict_examples = predict_examples.select(
                range(args.max_predict_samples))
        # Predict Feature Creation
        predict_dataset = predict_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on prediction dataset",
        )
        if args.max_predict_samples is not None:
            # The number of samples might increase during feature creation, so we select the required number again
            predict_dataset = predict_dataset.select(
                range(args.max_predict_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to a
        # multiple of 8, which enables the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
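
    # e.g. with fp16 enabled, a batch whose longest sequence is 53 tokens gets
    # padded up to 56 (the next multiple of 8) so Tensor Cores stay engaged.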

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)

    eval_dataset_for_model = eval_dataset.remove_columns(
        ["example_id", "offset_mapping"])
    eval_dataloader = DataLoader(eval_dataset_for_model,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    if args.do_predict:
        predict_dataset_for_model = predict_dataset.remove_columns(
            ["example_id", "offset_mapping"])
        predict_dataloader = DataLoader(
            predict_dataset_for_model,
            collate_fn=data_collator,
            batch_size=args.per_device_eval_batch_size)

    # Post-processing:
    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=args.version_2_with_negative,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=args.output_dir,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if args.version_2_with_negative:
            formatted_predictions = [{
                "id":
                k,
                "prediction_text":
                v,
                "no_answer_probability":
                scores_diff_json[k]
            } for k, v in predictions.items()]
        else:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v
            } for k, v in predictions.items()]

        references = [{
            "id": ex["id"],
            "answers": ex[answer_column_name]
        } for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    metric = load_metric(
        "squad_v2" if args.version_2_with_negative else "squad")

    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
        """
        Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor

        Args:
            start_or_end_logits(:obj:`tensor`):
                The output predictions of the model; pass either the start or the end logits, not both.
            dataset: The dataset (evaluation or prediction) whose features the logits correspond to.
            max_len(:obj:`int`):
                The maximum length of the output tensor (see the model.eval() part for more details).
        """

        step = 0
        # create a numpy array and fill it with -100.
        logits_concat = np.full((len(dataset), max_len),
                                -100,
                                dtype=np.float32)
        # Now that we have created the array, we populate it with the outputs gathered via accelerator.gather
        for i, output_logit in enumerate(
                start_or_end_logits):  # populate columns
            # Copy each whole batch tensor into the newly created array, then
            # advance the step offset after every iteration.

            batch_size = output_logit.shape[0]
            cols = output_logit.shape[1]
            if step + batch_size < len(dataset):
                logits_concat[step:step + batch_size, :cols] = output_logit
            else:
                logits_concat[step:, :cols] = output_logit[:len(dataset) -
                                                           step]

            step += batch_size

        return logits_concat
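
    # Usage sketch (made-up shapes): three gathered logit batches of shapes
    # (8, 30), (8, 25) and (5, 30) for a 21-feature dataset become one
    # (21, 30) array, with the short rows right-padded by -100.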

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
    # be shorter in a multi-process setup)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)
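
    # e.g. 1,000 batches with gradient_accumulation_steps=4 give 250 update
    # steps per epoch; with 3 epochs and no --max_train_steps, max_train_steps
    # becomes 750.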

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break


    # Evaluation: initialize all lists to collect the batch outputs
    all_start_top_log_probs = []
    all_start_top_index = []
    all_end_top_log_probs = []
    all_end_top_index = []
    all_cls_logits = []
    model.eval()  # make sure dropout is disabled during evaluation
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
            start_top_log_probs = outputs.start_top_log_probs
            start_top_index = outputs.start_top_index
            end_top_log_probs = outputs.end_top_log_probs
            end_top_index = outputs.end_top_index
            cls_logits = outputs.cls_logits

            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
                start_top_log_probs = accelerator.pad_across_processes(
                    start_top_log_probs, dim=1, pad_index=-100)
                start_top_index = accelerator.pad_across_processes(
                    start_top_index, dim=1, pad_index=-100)
                end_top_log_probs = accelerator.pad_across_processes(
                    end_top_log_probs, dim=1, pad_index=-100)
                end_top_index = accelerator.pad_across_processes(
                    end_top_index, dim=1, pad_index=-100)
                cls_logits = accelerator.pad_across_processes(cls_logits,
                                                              dim=1,
                                                              pad_index=-100)

            all_start_top_log_probs.append(
                accelerator.gather(start_top_log_probs).cpu().numpy())
            all_start_top_index.append(
                accelerator.gather(start_top_index).cpu().numpy())
            all_end_top_log_probs.append(
                accelerator.gather(end_top_log_probs).cpu().numpy())
            all_end_top_index.append(
                accelerator.gather(end_top_index).cpu().numpy())
            all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())

    max_len = max(x.shape[1] for x in all_end_top_log_probs)  # get the max length across the gathered batches

    # concatenate all numpy arrays collected above
    start_top_log_probs_concat = create_and_fill_np_array(
        all_start_top_log_probs, eval_dataset, max_len)
    start_top_index_concat = create_and_fill_np_array(all_start_top_index,
                                                      eval_dataset, max_len)
    end_top_log_probs_concat = create_and_fill_np_array(
        all_end_top_log_probs, eval_dataset, max_len)
    end_top_index_concat = create_and_fill_np_array(all_end_top_index,
                                                    eval_dataset, max_len)
    cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

    # free the per-batch tensors from the last iteration
    del start_top_log_probs
    del start_top_index
    del end_top_log_probs
    del end_top_index
    del cls_logits

    outputs_numpy = (
        start_top_log_probs_concat,
        start_top_index_concat,
        end_top_log_probs_concat,
        end_top_index_concat,
        cls_logits_concat,
    )
    prediction = post_processing_function(eval_examples, eval_dataset,
                                          outputs_numpy)
    eval_metric = metric.compute(predictions=prediction.predictions,
                                 references=prediction.label_ids)
    logger.info(f"Evaluation metrics: {eval_metric}")

    if args.do_predict:
        # initialize all lists to collect the batch outputs

        all_start_top_log_probs = []
        all_start_top_index = []
        all_end_top_log_probs = []
        all_end_top_index = []
        all_cls_logits = []
        model.eval()  # make sure dropout is disabled during prediction
        for step, batch in enumerate(predict_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
                start_top_log_probs = outputs.start_top_log_probs
                start_top_index = outputs.start_top_index
                end_top_log_probs = outputs.end_top_log_probs
                end_top_index = outputs.end_top_index
                cls_logits = outputs.cls_logits

                if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
                    start_top_log_probs = accelerator.pad_across_processes(
                        start_top_log_probs, dim=1, pad_index=-100)
                    start_top_index = accelerator.pad_across_processes(
                        start_top_index, dim=1, pad_index=-100)
                    end_top_log_probs = accelerator.pad_across_processes(
                        end_top_log_probs, dim=1, pad_index=-100)
                    end_top_index = accelerator.pad_across_processes(
                        end_top_index, dim=1, pad_index=-100)
                    cls_logits = accelerator.pad_across_processes(
                        cls_logits, dim=1, pad_index=-100)

                all_start_top_log_probs.append(
                    accelerator.gather(start_top_log_probs).cpu().numpy())
                all_start_top_index.append(
                    accelerator.gather(start_top_index).cpu().numpy())
                all_end_top_log_probs.append(
                    accelerator.gather(end_top_log_probs).cpu().numpy())
                all_end_top_index.append(
                    accelerator.gather(end_top_index).cpu().numpy())
                all_cls_logits.append(
                    accelerator.gather(cls_logits).cpu().numpy())

        max_len = max(x.shape[1] for x in all_end_top_log_probs)  # get the max length across the gathered batches

        # concatenate all numpy arrays collected above
        start_top_log_probs_concat = create_and_fill_np_array(
            all_start_top_log_probs, predict_dataset, max_len)
        start_top_index_concat = create_and_fill_np_array(
            all_start_top_index, predict_dataset, max_len)
        end_top_log_probs_concat = create_and_fill_np_array(
            all_end_top_log_probs, predict_dataset, max_len)
        end_top_index_concat = create_and_fill_np_array(
            all_end_top_index, predict_dataset, max_len)
        cls_logits_concat = np.concatenate(all_cls_logits, axis=0)

        # free the per-batch tensors from the last iteration
        del start_top_log_probs
        del start_top_index
        del end_top_log_probs
        del end_top_index
        del cls_logits

        outputs_numpy = (
            start_top_log_probs_concat,
            start_top_index_concat,
            end_top_log_probs_concat,
            end_top_index_concat,
            cls_logits_concat,
        )

        prediction = post_processing_function(predict_examples,
                                              predict_dataset, outputs_numpy)
        predict_metric = metric.compute(predictions=prediction.predictions,
                                        references=prediction.label_ids)
        logger.info(f"Predict metrics: {predict_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
Example #6
0
def main():

    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        #torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)
    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Indicate batch size")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--val_logging_step",
        default=100000,
        type=int,
        help="Number of steps in between logs of performance on validation set"
    )
    parser.add_argument(
        "--train_logging_step",
        default=1000,
        type=int,
        help="Number of steps in between logs of performance on training set")
    parser.add_argument("--save_step",
                        default=100000,
                        type=int,
                        help="Number of steps to save model parameters")
    parser.add_argument(
        "--model_id",
        type=str,
        help=
        "Model and optimizer will be saved at '/gpfs/data/razavianlab/capstone19/models/model_id'. "
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help=
        "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/feature_save_dir'. "
    )
    parser.add_argument(
        "--model_type",
        default="base",
        type=str,
        help="Whether to use the xlnet base model or the xlnet large model")
    parser.add_argument("--learning_rate",
                        default=4e-5,
                        type=float,
                        help="Learning rate for optimizer")
    args = parser.parse_args()

    # Set random seed
    set_seeds(seed=args.seed, n_gpu=n_gpu)

    # Load data
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/',
        args.feature_save_dir)
    logger.info("Loading train dataset")
    train_dataloader = load_featurized_examples(
        args.batch_size, set_type="train", feature_save_path=feature_save_path)
    logger.info("Loading validation dataset")
    val_dataloader = load_featurized_examples(
        args.batch_size, set_type="val", feature_save_path=feature_save_path)

    # Load pretrained model
    num_train_optimization_steps = args.num_train_epochs * len(
        train_dataloader)

    if args.model_type == "large":
        config = XLNetConfig.from_pretrained('xlnet-large-cased',
                                             num_labels=2292)
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-large-cased', config=config)
    else:
        config = XLNetConfig.from_pretrained(
            'xlnet-base-cased', num_labels=2292)  # TODO: check if we need this
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-base-cased', config=config)

    model.to(device)

    optimizer, scheduler, model = initialize_optimizer(model, train_dataloader,
                                                       args)

    logger.info("***** Running training *****")
    logger.info("  Num batches = %d", len(train_dataloader))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total train batch size  = %d", args.batch_size)
    logger.info("  Total optimization steps = %d",
                len(train_dataloader) * args.num_train_epochs)

    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))
    train(train_dataloader=train_dataloader,
          val_dataloader=val_dataloader,
          model=model,
          optimizer=optimizer,
          scheduler=scheduler,
          num_train_epochs=args.num_train_epochs,
          n_gpu=n_gpu,
          device=device,
          model_id=args.model_id,
          save_step=args.save_step,
          train_logging_step=args.train_logging_step,
          val_logging_step=args.val_logging_step)
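
The set_seeds helper called above is not shown in this example; a minimal
sketch of what it presumably does (assumed signature):

import random

import numpy as np
import torch


def set_seeds(seed, n_gpu):
    # Seed every RNG the training loop touches so runs are repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)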
Example #7
0
def main():

    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        #torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id",
        type=str,
        help=
        "Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'. "
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        help=
        "Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'. "
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help=
        "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. "
    )

    parser.add_argument("--set_type",
                        type=str,
                        help="Specify train/test file.")

    args = parser.parse_args()

    # Load training data
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/',
        args.feature_save_dir)
    logger.info("Loading test dataset")
    test_dataloader = load_featurized_examples(
        batch_size=32,
        set_type=args.set_type,
        feature_save_path=feature_save_path)

    # Load saved model
    model_path = os.path.join('/gpfs/data/razavianlab/capstone19/models/',
                              args.model_id,
                              'model_checkpoint_' + args.checkpoint)
    logger.info("Loading saved model from {}".format(model_path))
    config = XLNetConfig.from_pretrained(
        os.path.join(model_path, 'config.json'),
        num_labels=2292)  # TODO: check if we need this
    model = XLNetForSequenceClassification.from_pretrained(model_path,
                                                           config=config)
    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    summaries = torch.empty(0, config.d_model).to(device)
    all_doc_ids = torch.empty(0).to(device)
    all_label_ids = torch.empty(0, 2292).to(device)

    for i, batch in enumerate(test_dataloader):
        model.eval()
        with torch.no_grad():
            input_ids, input_mask, segment_ids, label_ids, doc_ids = batch

            input_ids = input_ids.to(device).long()
            input_mask = input_mask.to(device).long()
            segment_ids = segment_ids.to(device).long()
            doc_ids = doc_ids.to(device).float()
            label_ids = label_ids.to(device).float()
            transformer_outputs = model.module.transformer(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                input_mask=input_mask)

            output = transformer_outputs[0]
            # extracting the CLS token
            summary = output[:, 0]
            summary = summary.to(device)

            summaries = torch.cat([summaries, summary], dim=0)
            all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0)
            all_label_ids = torch.cat([all_label_ids, label_ids], dim=0)

    # Average the representation of the CLS token for all examples from the same document
    mask = torch.zeros(int(all_doc_ids.max().item()) + 1, len(summaries))
    mask[all_doc_ids.long(), torch.arange(len(summaries))] = 1
    averaging_matrix = torch.nn.functional.normalize(mask, p=1,
                                                     dim=1).to(device)
    mean_summaries = torch.mm(averaging_matrix, summaries)
    print("mean summaries.shape", mean_summaries.size())
    # Create an object storing one copy of the labels per document
    last_doc_id = -1
    label_ids = torch.empty(0, all_label_ids.size()[1]).to(device)
    for (i, doc_id) in enumerate(all_doc_ids):
        if doc_id.item() != last_doc_id:
            label_ids = torch.cat([label_ids, all_label_ids[i].unsqueeze(0)])
            last_doc_id = doc_id.item()

    print('label_ids shape', label_ids.size())
    # Save the embedded representations of the document, along with the labels
    torch.save(
        mean_summaries,
        os.path.join(feature_save_path, args.set_type + '_summaries.pt'))
    torch.save(
        label_ids,
        os.path.join(feature_save_path, args.set_type + '_doc_label_ids.pt')
    )  # label_ids.pt has one record per window (and thus multiple records per document)

    return
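
The mask-and-normalize averaging above is compact but easy to misread; a
self-contained toy check (made-up values) shows what it computes:

import torch

# Three window summaries; windows 0 and 1 belong to document 0, window 2 to document 1.
summaries = torch.tensor([[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]])
all_doc_ids = torch.tensor([0.0, 0.0, 1.0])

mask = torch.zeros(int(all_doc_ids.max().item()) + 1, len(summaries))
mask[all_doc_ids.long(), torch.arange(len(summaries))] = 1
averaging_matrix = torch.nn.functional.normalize(mask, p=1, dim=1)
print(torch.mm(averaging_matrix, summaries))  # tensor([[2., 2.], [5., 5.]])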
Example #8
0
configuration = XLNetConfig.from_dict({
    "_name_or_path": "xlnet-predict-middle-notes",
    "architectures": ["XLNetLMHeadModel"],
    "attn_type": "bi",
    "bi_data": False,
    "bos_token_id": 10000,
    "clamp_len": -1,
    # "d_head": 64,
    "d_inner": 3072,
    "d_model": 768,
    "dropout": 0.1,
    "end_n_top": 5,
    "eos_token_id": 2,
    "ff_activation": "gelu",
    "initializer_range": 0.02,
    "layer_norm_eps": 1e-12,
    "mem_len": None,  # null
    "model_type": "xlnet",
    "n_head": 8,  # 12 originally
    "n_layer": 12,
    "pad_token_id": 10000,
    "reuse_len": None,  # null,
    "same_length": False,
    "start_n_top": 5,
    "summary_activation": "tanh",
    "summary_last_dropout": 0.1,
    "summary_type": "last",
    "summary_use_proj": True,
    "untie_r": True,
    "use_mems_eval": True,
    "use_mems_train": True,
    # "vocab_size": 32000
})
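
A config assembled this way is typically used to build a fresh, randomly
initialized model rather than to load pretrained weights; a minimal sketch:

from transformers import XLNetLMHeadModel

model = XLNetLMHeadModel(configuration)  # weights are randomly initialized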
Example #9
0
    def init_model(self):
        basic_encoder = None
        if self.config['use_bert']:
            bert_config = BertConfig.from_pretrained(
                self.config['bert_model_name'],
                cache_dir=self.config['bert_dir'])
            if self.config['num_bert_layer'] is not None:
                bert_config.num_hidden_layers = self.config['num_bert_layer']
            bert = BertModel.from_pretrained(self.config['bert_model_name'],
                                             cache_dir=self.config['bert_dir'],
                                             config=bert_config)
            basic_encoder = bert
        elif self.config['use_xlnet']:
            xlnet_config = XLNetConfig.from_pretrained(
                'hfl/chinese-xlnet-base', cache_dir=self.config['xlnet_dir'])
            xlnet_config.n_layer = self.config['num_xlnet_layer']
            xlnet_config.mem_len = self.config['xlnet_mem_len']
            xlnet = XLNetModel.from_pretrained(
                'hfl/chinese-xlnet-base',
                cache_dir=self.config['xlnet_dir'],
                config=xlnet_config)
            basic_encoder = xlnet
        else:
            raise Exception('Basic encoders other than BERT and XLNet are not supported')

        self.model = DocEE(self.config, basic_encoder, self.tokenizer)
        if self.config['cuda']:
            self.model.cuda()
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.config['learning_rate'])

        if self.config['resume_model']:
            OUTPUT_DIR = self.config['output_dir']
            MODEL_SAVE_DIR = os.path.join(OUTPUT_DIR,
                                          self.config['model_save_dir'])
            if os.path.exists(MODEL_SAVE_DIR):
                cpt_file_names = os.listdir(MODEL_SAVE_DIR)
                if len(cpt_file_names) > 0:
                    epoch_record = []
                    for cpt_file_name in cpt_file_names:
                        epoch_record.append(
                            int(cpt_file_name.split('-')[-1].split('.')[0]))
                    epoch_record.sort()
                    latest_epoch = epoch_record[-1]
                    self.latest_epoch = latest_epoch + 1

                    latest_model_file_name = os.path.join(
                        MODEL_SAVE_DIR, self.config['model_file'] %
                        (self.config['ee_method'], latest_epoch))
                    if self.config['cuda']:
                        store_dict = torch.load(
                            latest_model_file_name,
                            map_location=torch.device('cuda'))
                    else:
                        store_dict = torch.load(latest_model_file_name,
                                                map_location='cpu')
                    self.model.load_state_dict(store_dict['model_state'])
                    self.optimizer.load_state_dict(
                        store_dict['optimizer_state'])
                    print('resuming training from %s' % latest_model_file_name)
        print('model init finished')
Example #10
0
else:
    my_collect = collate_fn
train_loader = DataLoader(train_dataset,
                          num_workers=2,
                          batch_size=args.batch_size,
                          shuffle=True,
                          collate_fn=my_collect)
test_loader = DataLoader(test_dataset,
                         num_workers=2,
                         batch_size=args.eval_batch_size,
                         shuffle=False,
                         collate_fn=my_collect)

# Make the model
device = torch.device(args.gpu_ids)
config = XLNetConfig.from_pretrained("xlnet-base-cased")
config.num_labels = 5
if args.dataset == "ag_news":
    config.num_labels = 4
pretrained_model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased", config=config)
model = scl_model_Xlnet(config,
                        device,
                        pretrained_model,
                        with_semi=args.with_mix,
                        with_sum=args.with_summary)

# Make the optimizer
optimizer = OpenAIAdam(model.parameters(),
                       lr=args.lr,
                       schedule='warmup_linear',
Example #11
0
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, BertConfig, BertModel, BertTokenizer, XLNetConfig, XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
import torch
'''
config = RobertaConfig.from_pretrained("./roberta-base/roberta-base-config.json")
tokenizer = RobertaTokenizer.from_pretrained("./roberta-base/roberta-base-vocab.json")
model = RobertaModel.from_pretrained("./roberta-base/roberta-base-pytorch_model.bin", config=config)
'''

config = XLNetConfig.from_pretrained(
    "./xlnet-base-cased/xlnet-base-cased-config.json")
tokenizer = XLNetTokenizer.from_pretrained(
    "./xlnet-base-cased/xlnet-base-cased-spiece.model")
model = XLNetModel.from_pretrained(
    "./xlnet-base-cased/xlnet-base-cased-pytorch_model.bin", config=config)

input_ids = torch.tensor(tokenizer.encode("toxicity")).unsqueeze(
    0)  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0]
print(last_hidden_states)
Example #12
0
    def __init__(self):
        super(Model, self).__init__()
        self.config = XLNetConfig.from_pretrained('./xlnet_pretrain/config.json')
        self.xlnet = XLNetModel.from_pretrained('./xlnet_pretrain/pytorch_model.bin', config=self.config)
        self.fc = nn.Linear(self.config.d_model, 2)
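
The snippet ends before the forward pass; a plausible continuation (a sketch,
assuming the final position is used as the sequence summary):

    def forward(self, input_ids, attention_mask=None):
        # outputs[0] has shape (batch, seq_len, d_model); take the hidden
        # state at the last position and project it to the 2 classes.
        outputs = self.xlnet(input_ids, attention_mask=attention_mask)
        return self.fc(outputs[0][:, -1])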
Example #13
0
def main(config, model_filename):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    if not os.path.exists(config.cache_dir):
        os.makedirs(config.cache_dir)
    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)

    model_file = os.path.join(config.output_dir, model_filename)

    # Prepare the device
    gpu_ids = [2]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    # Set Random Seeds
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    xlnet_config = XLNetConfig.from_pretrained(config.bert_config_path)

    cache_train_dataset = "cached_dataset_train_linear_512"
    cache_dev_dataset = "cached_dataset_dev_linear_512"
    if os.path.exists(config.cache_dir + '/' + cache_train_dataset):
        logger.info("Loading features from cached file %s",
                    config.cache_dir + '/' + cache_train_dataset)
        train_dataset = torch.load(config.cache_dir + '/' +
                                   cache_train_dataset)
        dev_dataset = torch.load(config.cache_dir + '/' + cache_dev_dataset)
    else:
        train_dataset, dev_dataset, test_dataset = load_data(
            config.data_path, device, tokenizer, config.cache_dir, 64, 960)
        logger.info("save cached file in  %s", config.cache_dir)
        torch.save(train_dataset, config.cache_dir + '/' + cache_train_dataset)
        torch.save(dev_dataset, config.cache_dir + '/' + cache_dev_dataset)
    # train_sampler = RandomSampler(train_dataset)
    # dev_sampler =RandomSampler(dev_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=config.train_batch_size,
                                  num_workers=8,
                                  pin_memory=False,
                                  drop_last=False)
    dev_dataloader = DataLoader(dev_dataset,
                                shuffle=True,
                                batch_size=config.dev_batch_size,
                                num_workers=8,
                                pin_memory=False,
                                drop_last=False)
    # train_iterator = trange(int(config.epoch_num))
    if config.model_name == "GAReader":
        from XLNet_Linear.GAReader.GAReader import GAReader
        model = GAReader(config.bert_word_dim, config.output_dim,
                         config.hidden_size, config.rnn_num_layers,
                         config.ga_layers, config.bidirectional,
                         config.dropout, xlnet_config)

    # optimizer_grouped_parameter = [
    #     {'params':[p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay) and 'embedding' not in n and 'bert' not in n]},
    #     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and 'embedding' not in n and 'bert' not in n]}
    # ]
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    # optimizer_parameter =[
    #     {'params':model.word_embedding.bert.parameters()},
    #     {'params':model.word_embedding.aggregation.parameters(),'lr':1e-4},
    #     # {'params':model.rnn.parameters(),'lr':1e-3},
    #     # {'params':model.ga_rnn.parameters(),'lr':1e-3},
    #     # {'params':model.mlp_att.parameters(),'lr':1e-2},
    #     # {'params':model.dot_layer.parameters(),'lr':1e-2},
    #     {'params':model.final_liear.parameters(),'lr':1e-4},
    # ]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and 'xlnet' not in n
        ],
        'name': [
            n for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and 'xlnet' not in n
        ],
        'weight_decay': 0.01,
        'lr': 3e-4
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and 'xlnet' not in n
        ],
        'name': [
            n for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and 'xlnet' not in n
        ],
        'weight_decay': 0.0,
        'lr': 3e-4
    }, {
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and 'xlnet' in n
        ],
        'name': [
            n for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and 'xlnet' in n
        ],
        'weight_decay': 0.01,
        'lr': config.lr
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and 'xlnet' in n
        ],
        'name': [
            n for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and 'xlnet' in n
        ],
        'weight_decay': 0.0,
        'lr': config.lr
    }]
    # print(optimizer_grouped_parameter)
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.lr,
                            eps=1e-6)
    # optimizer = optim.SGD(model.parameters(), lr=config.lr)
    # print(optimizer_grouped_parameter)
    # optimizer = optim.SGD(optimizer_parameter,lr=config.lr)
    # model,optimizer = amp.initialize(model,optimizer,opt_level="01")
    scheduler = get_linear_schedule_with_warmup(optimizer, 16000, 200000)
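
    # Aside: the 16000 warmup / 200000 total steps are hard-coded here; they
    # are more commonly derived from len(train_dataloader) and the epoch count.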

    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:
        train(config.epoch_num, model, train_dataloader, dev_dataloader,
              optimizer, criterion, ['0', '1', '2', '3', '4'], model_file,
              config.log_dir, config.print_step, config.clip, device,
              scheduler)
    # trained_file = './ga/output/2020-10-20-22_41_37best_model_linear'
    # tt = torch.load(trained_file)
    # model.load_state_dict(torch.load(trained_file,map_location={'cuda:2':'cuda:1'}))
    model.load_state_dict(torch.load(model_file))

    test_loss, test_acc, test_report = evaluate(model, train_dataloader,
                                                criterion,
                                                ['0', '1', '2', '3', '4'],
                                                device, config.log_dir)
    print("-------------- Test -------------")
    print("\t Loss: {} | Acc: {} | Macro avg F1: {} | Weighted avg F1: {}".
          format(test_loss, test_acc, test_report['macro avg']['f1-score'],
                 test_report['weighted avg']['f1-score']))
Example #14
0
def main():

    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        #torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id",
        type=str,
        help=
        "Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'. "
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        help=
        "Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'. "
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help=
        "Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. "
    )
    parser.add_argument("--set_type", type=str, help="Specify train/val/test")
    parser.add_argument("--model_type",
                        type=str,
                        default='xlnet',
                        help="Specify xlnet or classifier")
    parser.add_argument(
        '--num_hidden_layers',
        type=int,
        default=5,
        help=
        "Number of hidden layers for MLP classifier (not needed to evaluate a model with XLNet architecture)"
    )
    parser.add_argument(
        '--hidden_size',
        type=int,
        default=1024,
        help=
        "Hidden size for MLP classifier (not needed to evaluate a model with XLNet architecture)"
    )
    parser.add_argument(
        "--drop_rate",
        default=0.3,
        type=float,
        help=
        "Droprate in between hidden layers for MLP classifer (not needed to evaluate a model with XLNet architecture)"
    )
    parser.add_argument(
        "--activation_function",
        default='sigmoid',
        type=str,
        help=
        "Activation function for MLP classifer (not needed to evaluate a model with XLNet architecture)"
    )

    args = parser.parse_args()

    # Load training data
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/',
        args.feature_save_dir)
    logger.info("Loading {} dataset".format(args.set_type))
    test_dataloader = load_featurized_examples(
        batch_size=32,
        set_type=args.set_type,
        sliding_window=(args.model_type == "classifier"),
        feature_save_path=feature_save_path)

    # Load saved model
    model_path = os.path.join('/gpfs/data/razavianlab/capstone19/models/',
                              args.model_id,
                              'model_checkpoint_' + args.checkpoint)
    logger.info("Loading saved model from {}".format(model_path))
    if args.model_type == "xlnet":
        config = XLNetConfig.from_pretrained(
            os.path.join(model_path, 'config.json'),
            num_labels=2292)  # TODO: check if we need this
        model = XLNetForSequenceClassification.from_pretrained(model_path,
                                                               config=config)
    else:
        saved_model = torch.load(os.path.join(model_path, 'model.pt'))
        model = SlidingClassifier(num_layers=args.num_hidden_layers,
                                  hidden_size=args.hidden_size,
                                  p=args.drop_rate,
                                  activation_function=args.activation_function)
        model.load_state_dict(saved_model['model'])
    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    eval_folder = '/gpfs/data/razavianlab/capstone19/evals'
    val_file_name = os.path.join(
        eval_folder, args.model_id +
        "_{}_{}_metrics.p".format(args.checkpoint, args.set_type))
    # Create empty data frame to store evaluation results in (to be written to val_file_name)
    val_results = pd.DataFrame(columns=[
        'loss', 'micro_AUC', 'macro_AUC', 'top1_precision', 'top3_precision',
        'top5_precision', 'micro_f1', 'macro_f1', 'macro_AUC_list'
    ])
    # Run evaluation
    results = evaluate(dataloader=test_dataloader,
                       model=model,
                       model_id=args.model_id,
                       n_gpu=n_gpu,
                       device=device,
                       sliding_window=(args.model_type == "classifier"))
    # Save results
    val_results = pd.concat([val_results, pd.DataFrame(results, index=[0])], ignore_index=True)
    pickle.dump(val_results, open(val_file_name, "wb"))
    os.system("chgrp razavianlab {}".format(val_file_name))

    return
Example #15
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(examples["text"],
                             padding=padding,
                             truncation=True,
                             max_length=data_args.max_seq_length)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        if data_args.max_seq_length is None:
            max_seq_length = tokenizer.model_max_length
        else:
            if data_args.max_seq_length > tokenizer.model_max_length:
                logger.warning(
                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
                )
            max_seq_length = min(data_args.max_seq_length,
                                 tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result
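
        # For example, 2,347 concatenated tokens with max_seq_length=512 keep
        # total_length = 2048 and yield four 512-token chunks; the trailing
        # 299 tokens are dropped.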

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
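
As a quick point of reference, the sketch below (illustrative only, assuming a reasonably recent transformers release) shows what DataCollatorForPermutationLanguageModeling produces for a toy batch: padded input_ids plus the perm_mask, target_mapping and labels tensors that XLNet's permutation objective needs. The collator also requires an even number of tokens per sequence.

from transformers import XLNetTokenizerFast, DataCollatorForPermutationLanguageModeling

tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer, plm_probability=1.0 / 6, max_span_length=5)

# Two toy "documents" of even length (the collator rejects odd sequence lengths).
examples = [{"input_ids": list(range(1, 11))}, {"input_ids": list(range(11, 21))}]
batch = collator(examples)
print(batch["input_ids"].shape)       # expected: torch.Size([2, 10])
print(batch["perm_mask"].shape)       # expected: torch.Size([2, 10, 10])
print(batch["target_mapping"].shape)  # expected: torch.Size([2, 10, 10])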
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--encoder_model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected",
    )
    parser.add_argument(
        "--encoder_model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--decoder_model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected",
    )
    parser.add_argument(
        "--decoder_model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_predict",
        action="store_true",
        help="Whether to run predictions on the test set.",
    )
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--keep_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained with accents.",
    )
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.",
    )
    parser.add_argument(
        "--use_fast",
        action="store_const",
        const=True,
        help="Set this flag to use fast tokenization.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--optimizer",
        default="lamb",
        type=str,
        help="Optimizer (AdamW or lamb)",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_true",
        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    print('DEVICE : ' + str(args.device))

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.encoder_model_type = args.encoder_model_type.lower()
    args.decoder_model_type = args.decoder_model_type.lower()
    tokenizer_args = {
        k: v
        for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.encoder_model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )

    # ensure there's a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = "<PAD>"

    # Use the tokenizer's pad token id as the padding label id; the CrossEntropyLoss
    # ignore_index variant is left commented out below for reference.
    # pad_token_label_id = CrossEntropyLoss().ignore_index
    pad_token_label_id = tokenizer.pad_token_id

    if args.encoder_model_type == 'bert':
        config_encoder = BertConfig()
    elif args.encoder_model_type == 'gpt2':
        config_encoder = GPT2Config()
    elif args.encoder_model_type == 'xlnet':
        config_encoder = XLNetConfig()
    else:
        raise ValueError(f"Unsupported encoder_model_type: {args.encoder_model_type}")

    if args.decoder_model_type == 'bert':
        config_decoder = BertConfig()
    elif args.decoder_model_type == 'gpt2':
        config_decoder = GPT2Config()
    elif args.decoder_model_type == 'xlnet':
        config_decoder = XLNetConfig()
    else:
        raise ValueError(f"Unsupported decoder_model_type: {args.decoder_model_type}")

    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)

    logger.info('Defining model...')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        args.encoder_model_name_or_path,
        args.decoder_model_name_or_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # End of barrier: the first process has finished loading, release the others
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:

        # since the config is prefaced with `tokenizer_`, AutoTokenizer doesn't instantiate this correctly
        #config = AutoConfig.from_pretrained(os.path.join(args.output_dir, "tokenizer_config.json"))
        #config = {"do_lower_case": False, "model_max_length": 512}
        #tokenizer = AutoTokenizer.from_pretrained(args.output_dir, config=config, **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            #model = EncoderDecoderModel.from_pretrained(
            #    os.path.join(args.output_dir, "encoder"), os.path.join(args.output_dir, "decoder"),
            #)
            model.to(args.device)
            result, _ = evaluate(
                args,
                model,
                tokenizer,
                pad_token_label_id,
                mode="dev",
                prefix=global_step,
            )
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w", encoding="utf-8") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:

        # since the config is prefaced with `tokenizer_`, AutoTokenizer doesn't instantiate this correctly
        #config = AutoConfig.from_pretrained(os.path.join(args.output_dir, "tokenizer_config.json"))
        #config = {"do_lower_case": False, "model_max_length": 512}
        #tokenizer = AutoTokenizer.from_pretrained(args.output_dir, config=config, **tokenizer_args)
        #model = EncoderDecoderModel.from_pretrained(
        #    os.path.join(args.output_dir, "encoder"), os.path.join(args.output_dir, "decoder"),
        #)
        model.to(args.device)
        result, predictions = evaluate(args,
                                       model,
                                       tokenizer,
                                       pad_token_label_id,
                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w", encoding="utf-8") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w",
                  encoding="utf-8") as writer:
            for example in predictions:
                output_line = ("output: " + tokenizer.decode(
                    example,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True,
                ) + "\n")
                writer.write(output_line)

    return results
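
For orientation, a minimal sketch (paths and the sample input are placeholders) of reloading the encoder-decoder checkpoint saved by the script above and preparing it for generation:

from transformers import AutoTokenizer, EncoderDecoderModel

tokenizer = AutoTokenizer.from_pretrained("output_dir")   # the --output_dir used above
model = EncoderDecoderModel.from_pretrained("output_dir")
# decoder_start_token_id depends on the decoder; cls_token_id is a common choice for BERT-style decoders.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

inputs = tokenizer("An example input sentence.", return_tensors="pt")
generated = model.generate(inputs.input_ids, max_length=32)
print(tokenizer.decode(generated[0], skip_special_tokens=True))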
Example #17
    'batch_size': 64,
    'tenacity': 5,
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        default='xlnet-base-cased',
                        help='model name or path')
    args = parser.parse_args()

    config = XLNetConfig.from_pretrained(args.model)
    model = XLNetModel.from_pretrained(args.model, config=config)
    tokenizer = XLNetTokenizer.from_pretrained(args.model)

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
    ]
    results = se.eval(transfer_tasks)
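
The batcher and prepare callbacks are defined elsewhere in the full script; as a hedged illustration only, a SentEval batcher for this setup could mean-pool XLNet hidden states (SentEval passes a list of tokenized sentences and expects one embedding per sentence):

import torch

def batcher(params, batch):
    # SentEval hands over tokenized sentences; join them back into plain strings.
    sentences = [" ".join(tokens) if tokens else "." for tokens in batch]
    encoded = params['tokenizer'](sentences, padding=True, truncation=True,
                                  max_length=128, return_tensors="pt")
    encoded = {k: v.cuda() for k, v in encoded.items()}
    with torch.no_grad():
        hidden = params['model'](**encoded)[0]  # last hidden states
    # Mean-pool over the token dimension as a simple sentence embedding.
    return hidden.mean(dim=1).cpu().numpy()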
Example #18
    #
    # if args.language == 'english':
    nlp = spacy.load('en_core_web_sm')
    # nlp = spacy.load('en', parser=False, entity=False)
    # elif args.language == 'french':
    #     nlp = spacy.load('fr_core_news_sm')
    # elif args.language == 'german':
    #     nlp = spacy.load('de_core_news_sm')
    # Create a Tokenizer with the default settings for English
    # including punctuation rules and exceptions
    spacy_tokenizer = nlp.Defaults.create_tokenizer(nlp)
    vocab = dict()

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    config = XLNetConfig.from_pretrained('xlnet-large-cased', num_labels=3)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased',
                                                           config=config)
    device = 'cuda'
    model.to(device)
    model.train()

    df_vocab = pd.DataFrame(columns=['token', 'frequency'])

    # df = pd.read_csv(filename_train)
    df = pd.read_csv(filename_train, header=None, usecols=[0, 1])
    df.columns = ['text', 'label']
    print('columns', df.columns)

    test_train_perc = 0.80
Example #19
def main():
    from transformers import XLNetConfig

    config = XLNetConfig(
        vocab_size=21_128,
        d_model=768,
        n_head=12,
        n_layer=6,
    )

    from transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("./model/spbpe", max_len=512)

    from transformers import XLNetLMHeadModel

    model = XLNetLMHeadModel(config=config)
    model.resize_token_embeddings(len(tokenizer))
    print(model.num_parameters())

    from transformers import LineByLineTextDataset

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="./data/data_train.csv",
        block_size=128,
    )

    max_seq_length = 512

    from transformers import DataCollatorForPermutationLanguageModeling

    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer, plm_probability=1.0 / 6, max_span_length=5)

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir="./model/xlnet_v1",
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_gpu_train_batch_size=32,
        save_steps=10_000,
        save_total_limit=2,
        tpu_num_cores=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )

    trainer.train()

    if trainer.is_world_master():
        trainer.save_model("./model/spbpe")

    print('FIN')
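
A minimal follow-up sketch (same paths as in the example above) for reloading the pretrained checkpoint once training has finished:

from transformers import XLNetLMHeadModel, XLNetTokenizer

model = XLNetLMHeadModel.from_pretrained("./model/spbpe")
tokenizer = XLNetTokenizer.from_pretrained("./model/spbpe")
print(model.num_parameters())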
test = pd.read_csv('../tcdata/testB.csv', header=None)
model_path = '../model_weight/xlnet/'
output_model = '../tmp/xlnet.pth'
batch_size = 32
# Merge the training and test sets and build features
for i in range(1, 3):
    train[i] = train[i].apply(lambda x: x.replace('|', '').strip())
for i in range(1, 2):
    test[i] = test[i].apply(lambda x: x.replace('|', '').strip())
train.columns = ['idx', 'sentence', 'label1', 'label2']
test.columns = ['idx', 'sentence']
# test.columns = ['idx', 'sentence', 'label1', 'label2']

tokenizer = BertTokenizerFast.from_pretrained(model_path)

config = XLNetConfig.from_pretrained(model_path, num_labels=17, hidden_dropout_prob=0.2)  # config.output_attentions=True
config.hidden_dropout_prob = 0.2




def train_model(train_df, val_df, test_oof):
    early_stop = 0
    print("Reading training data...")
    train_set = CustomDataset(train_df, maxlen=128, tokenizer=tokenizer)
    train_loader = Data.DataLoader(train_set, batch_size=batch_size, num_workers=5, shuffle=True)

    print("Reading validation data...")
Example #21
def train():
    # Load the pretrained XLNet weights (converted from a TF checkpoint)
    config = XLNetConfig.from_pretrained('xlnet_config.json')
    model = XLNetForQuestionAnswering.from_pretrained('xlnet_model.ckpt.index',
                                                      from_tf=True,
                                                      config=config)
    device = args.device
    model.to(device)

    # Prepare the optimizer
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = adabound.AdaBound(optimizer_grouped_parameters,
                                  lr=1e-3,
                                  final_lr=0.1)
    # Prepare the data
    data = Dureader()
    train_dataloader, dev_dataloader = data.train_iter, data.dev_iter

    best_loss = 100000.0
    model.train()
    for i in range(args.num_train_epochs):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch")):
            input_ids, input_mask, segment_ids, start_positions, end_positions = (
                batch.input_ids, batch.input_mask, batch.segment_ids,
                batch.start_position, batch.end_position)
            input_ids, input_mask, segment_ids, start_positions, end_positions = (
                input_ids.to(device), input_mask.to(device), segment_ids.to(device),
                start_positions.to(device), end_positions.to(device))

            # Compute the loss
            outputs = model(input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            loss = outputs[0]
            loss = loss / args.gradient_accumulation_steps
            loss.backward()

            # Update the parameters
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Validation
            if step % args.log_step == 4:
                eval_loss = evaluate.evaluate(model, dev_dataloader)
                if eval_loss < best_loss:
                    best_loss = eval_loss
                    torch.save(model.state_dict(),
                               './model_dir/' + "best_model")
                    model.train()
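
After training, the best weights written by the loop above can be restored for inference (a minimal sketch, assuming the same globals as in the script):

def load_best_model():
    config = XLNetConfig.from_pretrained('xlnet_config.json')
    model = XLNetForQuestionAnswering(config)
    model.load_state_dict(torch.load('./model_dir/best_model', map_location=args.device))
    model.to(args.device)
    model.eval()
    return model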
Example #22
    def __init__(self):
        super(XlnetModelTest, self).__init__()
        config = XLNetConfig.from_pretrained('Saier/models/config.json')
        self.xlnet = XLNetForSequenceClassification(config)  # /bert_pretrain/
        self.device = torch.device("cuda")
Example #23
    def prepare_config_and_inputs(self):
        input_ids_1 = ids_tensor([self.batch_size, self.seq_length],
                                 self.vocab_size)
        input_ids_2 = ids_tensor([self.batch_size, self.seq_length],
                                 self.vocab_size)
        segment_ids = ids_tensor([self.batch_size, self.seq_length],
                                 self.type_vocab_size)
        input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()

        input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1],
                                 self.vocab_size)
        perm_mask = torch.zeros(
            self.batch_size,
            self.seq_length + 1,
            self.seq_length + 1,
            dtype=torch.float,
            device=torch_device,
        )
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = torch.zeros(
            self.batch_size,
            1,
            self.seq_length + 1,
            dtype=torch.float,
            device=torch_device,
        )
        target_mapping[:, 0, -1] = 1.0  # predict last token

        sequence_labels = None
        lm_labels = None
        is_impossible_labels = None
        token_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.seq_length],
                                   self.vocab_size)
            sequence_labels = ids_tensor([self.batch_size],
                                         self.type_sequence_label_size)
            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
            token_labels = ids_tensor([self.batch_size, self.seq_length],
                                      self.type_vocab_size)

        config = XLNetConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            n_head=self.num_attention_heads,
            d_inner=self.d_inner,
            n_layer=self.num_hidden_layers,
            untie_r=self.untie_r,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
            same_length=self.same_length,
            reuse_len=self.reuse_len,
            bi_data=self.bi_data,
            initializer_range=self.initializer_range,
            num_labels=self.type_sequence_label_size,
            bos_token_id=self.bos_token_id,
            pad_token_id=self.pad_token_id,
            eos_token_id=self.eos_token_id,
            return_dict=True,
        )

        return (
            config,
            input_ids_1,
            input_ids_2,
            input_ids_q,
            perm_mask,
            input_mask,
            target_mapping,
            segment_ids,
            lm_labels,
            sequence_labels,
            is_impossible_labels,
            token_labels,
        )

    def prepare_config_and_inputs(self):
        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
        input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)

        input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
        perm_mask = tf.zeros((self.batch_size, self.seq_length + 1, self.seq_length),
                             dtype=tf.float32)
        perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
        perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
        # perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
        target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
        target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
        # target_mapping[:, 0, -1] = 1.0  # predict last token

        sequence_labels = None
        lm_labels = None
        is_impossible_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)

        config = XLNetConfig(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,
            n_head=self.num_attention_heads,
            d_inner=self.d_inner,
            n_layer=self.num_hidden_layers,
            untie_r=self.untie_r,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
            same_length=self.same_length,
            reuse_len=self.reuse_len,
            bi_data=self.bi_data,
            initializer_range=self.initializer_range,
            num_labels=self.type_sequence_label_size,
        )

        return (
            config,
            input_ids_1,
            input_ids_2,
            input_ids_q,
            perm_mask,
            input_mask,
            target_mapping,
            segment_ids,
            lm_labels,
            sequence_labels,
            is_impossible_labels,
        )
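
The perm_mask / target_mapping tensors built above are how XLNet is told which position to predict; a small, self-contained sketch (tiny random config, illustration only) of a model consuming them:

import torch
from transformers import XLNetConfig, XLNetLMHeadModel

config = XLNetConfig(vocab_size=32, d_model=32, n_head=4, n_layer=2, d_inner=64)
model = XLNetLMHeadModel(config).eval()

input_ids = torch.randint(0, 32, (1, 6))
perm_mask = torch.zeros(1, 6, 6)
perm_mask[:, :, -1] = 1.0          # no token may attend to the last position
target_mapping = torch.zeros(1, 1, 6)
target_mapping[:, 0, -1] = 1.0     # ask for a prediction at the last position

with torch.no_grad():
    out = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
print(out[0].shape)  # expected: torch.Size([1, 1, 32]) -> one predicted position over the vocab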
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name,
                                    cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    field="data",
                                    cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = XLNetConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = XLNetTokenizerFast.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = XLNetForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    else:
        column_names = raw_datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[
        0]
    context_column_name = "context" if "context" in column_names else column_names[
        1]
    answer_column_name = "answers" if "answers" in column_names else column_names[
        2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space), so we remove that
        # left whitespace.
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")
        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossible"] = []
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0.
            # The cls token gets 1.0 too (for predictions of empty answers).
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != context_idx:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_idx:
                    token_end_index -= 1
                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossible"].append(1.0)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)
                    tokenized_examples["is_impossible"].append(0.0)

        return tokenized_examples

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            # Select a subset of the dataset; this helps decrease processing time
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        # Create Training Features
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                prepare_train_features,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        if data_args.max_train_samples is not None:
            # Select again, since feature creation might have increased the number of features
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            padding="max_length",
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
        special_tokens = tokenized_examples.pop("special_tokens_mask")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
        tokenized_examples["cls_index"] = []
        tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            # Find the CLS token in the input ids.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            tokenized_examples["cls_index"].append(cls_index)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples["token_type_ids"][i]
            for k, s in enumerate(special_tokens[i]):
                if s:
                    sequence_ids[k] = 3
            context_idx = 1 if pad_on_right else 0

            # Build the p_mask: non special tokens and context gets 0.0, the others 1.0.
            tokenized_examples["p_mask"].append([
                0.0 if (not special_tokens[i][k] and s == context_idx)
                or k == cls_index else 1.0 for k, s in enumerate(sequence_ids)
            ])

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_idx else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_examples = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            # Selecting Eval Samples from Dataset
            eval_examples = eval_examples.select(
                range(data_args.max_eval_samples))
        # Create Features from Eval Dataset
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_examples.map(
                prepare_validation_features,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )
        if data_args.max_eval_samples is not None:
            # Select again, since feature creation might have increased the number of samples
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))

    if training_args.do_predict:
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_examples = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            # Select a subset of the whole test data
            predict_examples = predict_examples.select(
                range(data_args.max_predict_samples))
        # Test Feature Creation
        with training_args.main_process_first(
                desc="prediction dataset map pre-processing"):
            predict_dataset = predict_examples.map(
                prepare_validation_features,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )
        if data_args.max_predict_samples is not None:
            # Feature creation might have increased the number of samples, so select the required subset again
            predict_dataset = predict_dataset.select(
                range(data_args.max_predict_samples))

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorWithPadding(
                         tokenizer,
                         pad_to_multiple_of=8 if training_args.fp16 else None))
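
    # Note: pad_to_multiple_of=8 keeps padded lengths aligned with shapes that are
    # friendly to fp16 tensor cores; with --pad_to_max_length the features were already
    # padded to max_seq_length during preprocessing, so the default collator suffices.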

    # Post-processing:
    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=training_args.output_dir,
            log_level=log_level,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [
                {"id": k, "prediction_text": v} for k, v in predictions.items()
            ]

        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    metric = load_metric(
        "squad_v2" if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions,
                              references=p.label_ids)

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        eval_examples=eval_examples if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()

        max_eval_samples = (data_args.max_eval_samples
                            if data_args.max_eval_samples is not None else len(eval_dataset))
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        results = trainer.predict(predict_dataset, predict_examples)
        metrics = results.metrics

        max_predict_samples = (data_args.max_predict_samples
                               if data_args.max_predict_samples is not None
                               else len(predict_dataset))
        metrics["predict_samples"] = min(max_predict_samples,
                                         len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "question-answering"
    }
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
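
For reference, a standalone sketch (illustrative values only) of the record format that the squad_v2 metric used above expects, matching what post_processing_function builds:

from datasets import load_metric

metric = load_metric("squad_v2")
predictions = [{"id": "0", "prediction_text": "Paris", "no_answer_probability": 0.0}]
references = [{"id": "0", "answers": {"text": ["Paris"], "answer_start": [0]}}]
print(metric.compute(predictions=predictions, references=references))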
Example #26
def main():

    # Set device for PyTorch
    if torch.cuda.is_available():
        # might need to update when using more than 1 GPU
        rank = 0
        torch.cuda.set_device(rank)
        device = torch.device("cuda", rank)
        #torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cpu")
        n_gpu = 0

    print("N GPU: ", n_gpu)

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_id",
        type=str,
        help="Model and optimizer should be saved at a folder inside '/gpfs/data/razavianlab/capstone19/models/{model_id}'. ")
    parser.add_argument(
        "--checkpoint",
        type=str,
        help="Checkpoint number. Model and optimizer should be saved at '/gpfs/data/razavianlab/capstone19/models/{model_id}/model_checkpoint_{checkpoint}'. ")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--feature_save_dir",
        type=str,
        help="Preprocessed data (features) should be saved at '/gpfs/data/razavianlab/capstone19/preprocessed_data/{feature_save_dir}'. ")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Specify the batch size for loading featurized examples.")
    parser.add_argument("--set_type",
                        type=str,
                        help="Specify train/val/test file.")

    parser.add_argument("--save_batch",
                        type=int,
                        help="Save files every save_batch batches.")
    args = parser.parse_args()
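
    # Example invocation (the script name and argument values are hypothetical;
    # the flags are the ones defined above):
    #   python run_xlnet_logits.py --model_id xlnet_base --checkpoint 10 \
    #       --feature_save_dir test_features --set_type test \
    #       --batch_size 32 --save_batch 1000 --fp16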

    # Load data
    feature_save_path = os.path.join(
        '/gpfs/data/razavianlab/capstone19/preprocessed_data/',
        args.feature_save_dir)
    logger.info("Loading dataset")
    dataloader = load_featurized_examples(batch_size=args.batch_size,
                                          set_type=args.set_type,
                                          feature_save_path=feature_save_path)

    # Load saved model
    model_path = os.path.join('/gpfs/data/razavianlab/capstone19/models/',
                              args.model_id,
                              'model_checkpoint_' + args.checkpoint)
    logger.info("Loading saved model from {}".format(model_path))
    config = XLNetConfig.from_pretrained(
        os.path.join(model_path, 'config.json'),
        num_labels=2292)  # TODO: check if we need this
    model = XLNetForSequenceClassification.from_pretrained(model_path,
                                                           config=config)
    model.to(device)
    model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu)))

    # Used to determine if the last document of the last batch was split up or not
    last_batch_doc_id = -1
    # Stores logits until we finish a batch where the last document was not split up
    stored_logits = torch.empty(0, 2292).to(device)
    # Stores the list of doc ids corresponding to the rows of stored_logits
    all_doc_ids = torch.empty(0).to(device)
    # For all documents, stores the elementwise max of all logits for that document
    all_combined_logits = torch.empty(0, 2292).to(device)
    all_label_ids = torch.empty(0, 2292).to(device)
    stored_label_ids = torch.empty(0, 2292).to(device)
    for i, batch in enumerate(dataloader):
        if i % 1000 == 0 and i > 0:
            logger.info('Entering batch {}'.format(i))
        model.eval()
        with torch.no_grad():

            input_ids, input_mask, segment_ids, label_ids, doc_ids = batch

            input_ids = input_ids.to(device).long()
            input_mask = input_mask.to(device).long()
            segment_ids = segment_ids.to(device).long()
            doc_ids = doc_ids.to(device).float()
            label_ids = label_ids.to(device).float()

            # Get logits for this batch
            logits = model(input_ids=input_ids,
                           attention_mask=input_mask,
                           token_type_ids=segment_ids)[0]

            # Check if any part of the last document in stored_logits is in this batch,
            # indicating that a document got split across stored_logits and this batch
            # If no part of that document appears here, the last document in
            # stored_logits was not split across batches
            if all(doc_ids != last_batch_doc_id) and last_batch_doc_id != -1:
                # If nothing was split, then we can combine the logits in stored_logits by document
                # and store the results in all_combined_logits

                # Combine logits by doc_id
                last_doc_id = all_doc_ids[0].item()
                to_combine = torch.empty(0, 2292).to(device)
                for (j, doc_id) in enumerate(all_doc_ids):
                    if doc_id.item() != last_doc_id:
                        # Get the pointwise max over all logits for the last document
                        combined_logits = torch.max(to_combine, dim=0)[0].reshape(1, -1)
                        all_combined_logits = torch.cat(
                            [all_combined_logits, combined_logits], dim=0)
                        # Create to_combine for the new document and update last_doc_id
                        to_combine = stored_logits[j, :].reshape(1, -1)
                        last_doc_id = doc_id.item()
                    else:
                        # Add these logits to to_combine with the other logits for this document
                        to_combine = torch.cat(
                            [to_combine, stored_logits[j, :].reshape(1, -1)],
                            dim=0)
                combined_logits = torch.max(to_combine,
                                            dim=0)[0].reshape(1, -1)
                all_combined_logits = torch.cat(
                    [all_combined_logits, combined_logits], dim=0)

                # Create an object storing one copy of the labels per document
                last_doc_id = -1
                for (j, doc_id) in enumerate(all_doc_ids):
                    if (doc_id.item() != last_doc_id) and last_doc_id != -1:
                        all_label_ids = torch.cat([
                            all_label_ids, stored_label_ids[j - 1].unsqueeze(0)
                        ])
                    last_doc_id = doc_id.item()
                all_label_ids = torch.cat(
                    [all_label_ids, stored_label_ids[j].unsqueeze(0)])

                all_doc_ids = torch.empty(0).to(device)
                stored_logits = torch.empty(0, 2292).to(device)
                stored_label_ids = torch.empty(0, 2292).to(device)
                stored_logits = torch.cat([stored_logits, logits], dim=0)
                all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0)
                stored_label_ids = torch.cat([stored_label_ids, label_ids],
                                             dim=0)
                last_batch_doc_id = doc_ids[-1]

            # If a doc was split, then save these logits until we find a batch where no doc was split
            else:
                stored_logits = torch.cat([stored_logits, logits], dim=0)
                all_doc_ids = torch.cat([all_doc_ids, doc_ids], dim=0)
                stored_label_ids = torch.cat([stored_label_ids, label_ids],
                                             dim=0)
                last_batch_doc_id = doc_ids[-1]

        # Save every args.save_batch batches and clear out the tensors to save memory
        if i % args.save_batch == 0 and i > 0:
            torch.save(
                all_label_ids,
                os.path.join(
                    feature_save_path,
                    "{}_label_ids_{}.pt".format(args.set_type,
                                                int(i / args.save_batch))))
            torch.save(
                all_combined_logits,
                os.path.join(
                    feature_save_path,
                    "{}_logits_{}.pt".format(args.set_type,
                                             int(i / args.save_batch))))
            all_combined_logits = torch.empty(0, 2292).to(device)
            all_label_ids = torch.empty(0, 2292).to(device)
            logger.info("Saved batch {}".format(int(i / args.save_batch)))
    # Store logits and labels for the final batch(es)
    last_doc_id = all_doc_ids[0].item()
    to_combine = torch.empty(0, 2292).to(device)
    for (j, doc_id) in enumerate(all_doc_ids):
        if doc_id.item() != last_doc_id:
            # Get the pointwise max over all logits for the last document
            combined_logits = torch.max(to_combine, dim=0)[0].reshape(1, -1)
            all_combined_logits = torch.cat(
                [all_combined_logits, combined_logits], dim=0)
            # Create to_combine for the new document and update last_doc_id
            to_combine = stored_logits[j, :].reshape(1, -1)
            last_doc_id = doc_id.item()
        else:
            # Add these logits to to_combine with the other logits for this document
            to_combine = torch.cat(
                [to_combine, stored_logits[j, :].reshape(1, -1)], dim=0)
    # Pointwise max of all logits for the last document
    combined_logits = torch.max(to_combine, dim=0)[0].reshape(1, -1)
    all_combined_logits = torch.cat([all_combined_logits, combined_logits],
                                    dim=0)

    # Create an object storing one copy of the labels per document
    last_doc_id = -1
    for (j, doc_id) in enumerate(all_doc_ids):
        if (doc_id.item() != last_doc_id) and last_doc_id != -1:
            all_label_ids = torch.cat(
                [all_label_ids, stored_label_ids[j - 1].unsqueeze(0)])
        last_doc_id = doc_id.item()
    all_label_ids = torch.cat(
        [all_label_ids, stored_label_ids[j].unsqueeze(0)])
    torch.save(
        all_label_ids,
        os.path.join(
            feature_save_path,
            "{}_label_ids_{}.pt".format(args.set_type,
                                        int(math.ceil(i / args.save_batch)))))
    torch.save(
        all_combined_logits,
        os.path.join(
            feature_save_path,
            "{}_logits_{}.pt".format(args.set_type,
                                     int(math.ceil(i / args.save_batch)))))
    logger.info("Saved batch {}".format(int(math.ceil(i / args.save_batch))))

    return
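
The per-document pooling that Example #26 performs inline (grouping the chunks of a split document by doc_id and taking the elementwise max of their logits) can be written as a small standalone helper. A minimal sketch, assuming chunks belonging to the same document are stored consecutively, as they are in stored_logits/all_doc_ids above; the function name is hypothetical:

import torch


def combine_logits_by_document(chunk_logits: torch.Tensor,
                               doc_ids: torch.Tensor) -> torch.Tensor:
    """Elementwise max over the rows of chunk_logits that share a doc_id.

    chunk_logits: (num_chunks, num_labels); doc_ids: (num_chunks,), with the
    chunks of each document stored consecutively. Returns (num_docs, num_labels).
    """
    combined = []
    for doc_id in torch.unique_consecutive(doc_ids):
        mask = doc_ids == doc_id
        combined.append(chunk_logits[mask].max(dim=0)[0])
    return torch.stack(combined, dim=0)

Under that assumption, each of the two inline combination loops above could be replaced by a call such as combine_logits_by_document(stored_logits, all_doc_ids).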