Example No. 1
def load_qa_from_pretrained(
    model: Optional[tf.keras.Model] = None,
    name: Optional[str] = None,
    path: Optional[str] = None,  # path to checkpoint from TF...ForPreTraining
    config: Optional[PretrainedConfig] = None,
) -> tf.keras.Model:
    """
    Load a TF...QuestionAnswering model by taking the main layer of a pretrained model.
    Preserves the model.config attribute.
    """
    assert (bool(name) ^ bool(model) ^ (bool(path) and bool(config))
            ), "Pass either name, model, or (path and config)"

    if name is not None:
        return TFAutoModelForQuestionAnswering.from_pretrained(name)
    elif model is not None:
        pretrained_model = model
    elif path is not None:
        pretrained_model = TFAutoModelForPreTraining.from_config(config)
        pretrained_model.load_weights(path)

    qa_model = TFAutoModelForQuestionAnswering.from_config(
        pretrained_model.config)
    pretrained_main_layer = getattr(pretrained_model,
                                    qa_model.base_model_prefix)
    assert (
        pretrained_main_layer is not None
    ), f"{pretrained_model} has no attribute '{model.base_model_prefix}'"
    # Generalized way of saying `model.albert = pretrained_model.albert`
    setattr(qa_model, qa_model.base_model_prefix, pretrained_main_layer)
    return qa_model
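A minimal usage sketch for the helper above (not part of the original listing): it assumes the function and its transformers/TensorFlow imports are in scope, and the checkpoint names are illustrative only.

# Hypothetical usage of load_qa_from_pretrained; checkpoint names are illustrative.
qa_from_hub = load_qa_from_pretrained(name="albert-base-v2")

# Reusing an already-instantiated pretraining model instead of a Hub name:
# pretraining_model = TFAutoModelForPreTraining.from_pretrained("albert-base-v2")
# qa_from_model = load_qa_from_pretrained(model=pretraining_model)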
Example No. 2
 def __init__(self, bert_squad_model='bert-large-uncased-whole-word-masking-finetuned-squad',
              bert_emb_model='bert-base-uncased'):
     self.model_name = bert_squad_model
     try:
         self.model = TFAutoModelForQuestionAnswering.from_pretrained(self.model_name)
     except Exception:
         self.model = TFAutoModelForQuestionAnswering.from_pretrained(self.model_name, from_pt=True)
     self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
     self.maxlen = 512
     self.te = tpp.TransformerEmbedding(bert_emb_model, layers=[-2])
Example No. 3
    def __init__(
        self,
        model_name=DEFAULT_MODEL,
        bert_squad_model=None,
        bert_emb_model="bert-base-uncased",
        framework="tf",
        device=None,
        quantize=False,
    ):
        model_name = bert_squad_model if bert_squad_model is not None else model_name
        if bert_squad_model:
            warnings.warn(
                "The bert_squad_model argument is deprecated - please use model_name instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        self.model_name = model_name
        self.framework = framework
        if framework == "tf":
            try:
                import tensorflow as tf
            except ImportError:
                raise Exception('If framework=="tf", TensorFlow must be installed.')
            try:
                self.model = TFAutoModelForQuestionAnswering.from_pretrained(
                    self.model_name
                )
            except Exception:
                warnings.warn(
                    "Could not load supplied model as TensorFlow checkpoint - attempting to load using from_pt=True"
                )
                self.model = TFAutoModelForQuestionAnswering.from_pretrained(
                    self.model_name, from_pt=True
                )
        else:
            bert_emb_model = (
                None  # set to None and ignore since we only want to use PyTorch
            )
            super().__init__(device=device, quantize=quantize)
            self.model = AutoModelForQuestionAnswering.from_pretrained(
                self.model_name
            ).to(self.torch_device)
            if quantize:
                self.model = self.quantize_model(self.model)

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.maxlen = 512
        self.te = (
            tpp.TransformerEmbedding(bert_emb_model, layers=[-2])
            if bert_emb_model is not None
            else None
        )
def semantic_score(sentence, reference):
    """
        return: the start and end logit scores of the most likely answer span
    """
    tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad",
        return_dict=True)
    inputs = tokenizer(sentence, reference, add_special_tokens=True,
                       return_tensors="tf")
    input_ids = inputs["input_ids"].numpy()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    output = model(inputs)
    answer_start = tf.argmax(
        output.start_logits, axis=1
    ).numpy()[0]
    start = output.start_logits[:, answer_start].numpy()[0]
    print('start:', answer_start, start)
    answer_end = (
            tf.argmax(output.end_logits, axis=1) + 1
    ).numpy()[0]
    end = output.end_logits[:, answer_end - 1].numpy()[0]
    print('end:', answer_end, end)
    return start, end
    def load(self):

        self.tokenizer = AutoTokenizer.from_pretrained(
            "bert-large-uncased-whole-word-masking-finetuned-squad")
        self.model = TFAutoModelForQuestionAnswering.from_pretrained(
            "bert-large-uncased-whole-word-masking-finetuned-squad")
        self.ready = True
Example No. 6
 def __init__(
         self,
         model_name='bert-large-uncased-whole-word-masking-finetuned-squad'
 ):
     self.model_name = model_name
     self.model = TFAutoModelForQuestionAnswering.from_pretrained(
         self.model_name)
     self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
Example No. 7
 def __init__(self,
              bert_squad_model=DEFAULT_MODEL,
              bert_emb_model='bert-base-uncased'):
     self.model_name = bert_squad_model
     try:
         self.model = TFAutoModelForQuestionAnswering.from_pretrained(
             self.model_name)
     except Exception:
         warnings.warn(
             'Could not load supplied model as TensorFlow checkpoint - attempting to load using from_pt=True'
         )
         self.model = TFAutoModelForQuestionAnswering.from_pretrained(
             self.model_name, from_pt=True)
     self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
     self.maxlen = 512
     self.te = tpp.TransformerEmbedding(bert_emb_model, layers=[
         -2
     ]) if bert_emb_model is not None else None
Example No. 8
def save_tf_model_from_transformers():
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "distilbert-base-cased-distilled-squad")
    callable = tf.function(model.call)
    concrete_function = callable.get_concrete_function([
        tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="input_ids"),
        tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="attention_mask")
    ])
    model.save('saved_model/distilbert_qa/1', signatures=concrete_function)
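As a hedged follow-up (not part of the original example), the exported SavedModel could be reloaded and queried through its serving signature; MAX_SEQ_LEN here is an assumed value and must match whatever the example used at export time.

# Sketch: reload the SavedModel exported above and call its serving signature.
import tensorflow as tf
from transformers import AutoTokenizer

MAX_SEQ_LEN = 384  # assumption for illustration; must match the TensorSpec used at export

loaded = tf.saved_model.load("saved_model/distilbert_qa/1")
serving_fn = loaded.signatures["serving_default"]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
encoded = tokenizer(
    "Who developed TensorFlow?",
    "TensorFlow is an open-source machine learning framework developed by Google.",
    padding="max_length", truncation=True, max_length=MAX_SEQ_LEN, return_tensors="tf")
outputs = serving_fn(
    input_ids=tf.cast(encoded["input_ids"], tf.int32),
    attention_mask=tf.cast(encoded["attention_mask"], tf.int32))
print(list(outputs.keys()))  # the signature exposes the start/end logit tensors by name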
Example No. 9
 def load_model(self, model_name, model_path, model_type):
     logger.info(">> Loading HF model " + model_name + " from " +
                 model_path)
     self.type = model_type
     self.name = model_name
     self.tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                    use_fast=True)
     self.model = TFAutoModelForQuestionAnswering.from_pretrained(
         model_path, from_pt=True)
Example No. 10
    def test_question_answering_model_from_pretrained(self):
        # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForQuestionAnswering)
    def test_question_answering_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForQuestionAnswering)
    def semantic_search(self, sentence):
        """ performs semantic search on a corpus of documents.
            Args:
                corpus_path: (str) the path to the corpus of reference
                documents on which to perform semantic search.
                sentence: (str) the sentence from which to
                        perform semantic search.
            Returns:
                (str) the reference text of the document most
                    similar to sentence.
        """
        url = 'bert-large-uncased-whole-word-masking-finetuned-squad'
        tokenizer = BertTokenizer.from_pretrained(url)
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            url, return_dict=True)

        filelist = os.listdir(self.corpus_path)
        maximo = None
        final_file = None
        for file in filelist:
            path_file = self.corpus_path + "/" + file
            if os.path.isfile(path_file):
                with open(path_file, 'rb') as f:
                    reference = f.read().decode(errors='replace')

                inputs = tokenizer(sentence,
                                   reference,
                                   add_special_tokens=True,
                                   return_tensors="tf")
                input_ids = inputs["input_ids"].numpy()[0]
                text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
                output = model(inputs)
                answer_start = tf.argmax(output.start_logits,
                                         axis=1).numpy()[0]
                answer_end = (tf.argmax(output.end_logits, axis=1) +
                              1).numpy()[0]
                first = output.start_logits[:, answer_start].numpy()[0]
                last = output.end_logits[:, answer_end - 1].numpy()[0]

                if maximo is None:
                    maximo = (first + last) / 2
                    final_file = path_file
                elif (maximo < ((first + last) / 2)):
                    maximo = (first + last) / 2
                    final_file = path_file

        with open(final_file, 'rb') as f:
            result = f.read().decode(errors='replace')

        return result
Example No. 13
    def load(self):
        #f = open('imagenet_classes.txt')
        #self.classes = [line.strip() for line in f.readlines()]

        #model = models.alexnet(pretrained=True)
        #model.eval()
        #self.model = model

        self.tokenizer = AutoTokenizer.from_pretrained(
            "bert-large-uncased-whole-word-masking-finetuned-squad")
        self.model = TFAutoModelForQuestionAnswering.from_pretrained(
            "bert-large-uncased-whole-word-masking-finetuned-squad")

        self.ready = True
Example No. 14
def camemBert(context, question):
    tokenizer = AutoTokenizer.from_pretrained('camembert-base')
    model = TFAutoModelForQuestionAnswering.from_pretrained("camembert-base")
    inputs = tokenizer.encode_plus(
        question, context, add_special_tokens=True, return_tensors="tf")
    # The .numpy() method explicitly converts a Tensor to a numpy array
    input_ids = inputs["input_ids"].numpy()[0]
    #text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_start_scores, answer_end_scores = model(inputs)
    # Get the most likely beginning of answer with the argmax of the score
    answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
    # Get the most likely end of answer with the argmax of the score; +1 because the upper bound is excluded when slicing
    answer_end = (tf.argmax(answer_end_scores, axis=1)+1).numpy()[0]
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer
Example No. 15
def distilBERT(context, question):
    #tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    #model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased', return_dict=True)
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    inputs = tokenizer.encode_plus(
        question, context, add_special_tokens=True, return_tensors="tf")
    # The .numpy() method explicitly converts a Tensor to a numpy array
    input_ids = inputs["input_ids"].numpy()[0]
    #text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_start_scores, answer_end_scores = model(inputs)
    # Get the most likely beginning of answer with the argmax of the score
    answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
    # Get the most likely end of answer with the argmax of the score; +1 because the upper bound is excluded when slicing
    answer_end = (tf.argmax(answer_end_scores, axis=1)+1).numpy()[0]
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer
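The two functions above unpack model(inputs) into a (start_logits, end_logits) tuple, which relies on the model returning a plain tuple. A hedged variant of the same span extraction, assuming a transformers release whose models return an output object with named fields, reads the logits by attribute instead; it reuses model, tokenizer, inputs and input_ids exactly as defined above.

# Variant: read named fields from the model output instead of tuple-unpacking.
# Reuses model, tokenizer, inputs and input_ids from the function above.
output = model(inputs)
answer_start = tf.argmax(output.start_logits, axis=1).numpy()[0]
answer_end = (tf.argmax(output.end_logits, axis=1) + 1).numpy()[0]
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))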
Example No. 16
def comprehesion(text, questions):
    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
    import tensorflow as tf
    tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    # text = r"""
    # 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
    # architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
    # Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
    # TensorFlow 2.0 and PyTorch.
    # """
    # questions = [
    #     "How many pretrained models are available in 🤗 Transformers?",
    #     "What does 🤗 Transformers provide?",
    #     "🤗 Transformers provides interoperability between which frameworks?",
    # ]
    for question in questions:
        inputs = tokenizer(question,
                           text,
                           add_special_tokens=True,
                           return_tensors="tf")
        input_ids = inputs["input_ids"].numpy()[0]

        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        outputs = model(inputs)
        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits

        answer_start = tf.argmax(answer_start_scores, axis=1).numpy(
        )[0]  # Get the most likely beginning of answer with the argmax of the score
        answer_end = (tf.argmax(answer_end_scores, axis=1) + 1).numpy(
        )[0]  # Get the most likely end of answer with the argmax of the score
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(
                input_ids[answer_start:answer_end]))

        print(f"Question: {question}")
        print(f"Answer: {answer}")
    return
def question_answer(question, reference):
    """
    question: string containing the question to answer
    reference: string containing the reference document from which to find the answer
    Returns: a string containing the answer
    """
    tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad",
        return_dict=True)
    inputs = tokenizer(question,
                       reference,
                       add_special_tokens=True,
                       return_tensors="tf")
    input_ids = inputs["input_ids"].numpy()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    output = model(inputs)
    answer_start = tf.argmax(output.start_logits, axis=1).numpy()[0]
    answer_end = (tf.argmax(output.end_logits, axis=1) + 1).numpy()[0]
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer
Example No. 18
  def __init__(self):
    """Possible states are 
    1. "await" (awaiting response)
    2. "proceed" (proceed with the conversation)- used to give the bot control over the converation"""
    self._state="await"
  
    """Possible Flags are 
    1. "Exec" (task Executed)
    2. "notExec" (proceed with the conversation)- used to give the bot control over the converation"""
    self._FLAG=None
    self._bert_base_case_mrpc_tokenizer=AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
    self._bert_base_case_mrpc_model=TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
    self._gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self._gpt2_model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=self._gpt2_tokenizer.eos_token_id)
    self.bert_large_uncased_whole_word_masking_finetuned_squad_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    self.bert_large_uncased_whole_word_masking_finetuned_squad_model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    self._DialoGP_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    self._DialoGP_model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium")



    self._conversation_started=False
    self._conversation_ended=True
Example No. 19
    def copy_model_files(self, force=False):
        modified = False

        src_path = self.checkpoint_path

        d = None
        try:
            if force or not (self.git_path / "tf_model.h5").exists() or not (
                    self.git_path / "pytorch_model.bin").exists():
                d = TemporaryDirectory()
                if self.task in self.QA_TASKS:
                    model = QASparseXP.compile_model(src_path,
                                                     dest_path=d.name)
                elif self.task in self.GLUE_TASKS:
                    model = GlueSparseXP.compile_model(src_path,
                                                       dest_path=d.name)
                elif self.task in self.SUMMARIZATION_TASKS:
                    model = SummarizationSparseXP.compile_model(
                        src_path, dest_path=d.name)
                else:
                    raise Exception(f"Unknown task {self.task}")

                model = optimize_model(model, "heads")
                model.save_pretrained(d.name)
                src_path = d.name
            if force or not (self.git_path / "tf_model.h5").exists():
                with TemporaryDirectory() as d2:
                    if self.task in self.QA_TASKS:
                        QASparseXP.final_fine_tune_bertarize(
                            src_path, d2, remove_head_pruning=True)
                        tf_model = TFAutoModelForQuestionAnswering.from_pretrained(
                            d2, from_pt=True)
                    elif self.task in self.GLUE_TASKS:
                        GlueSparseXP.final_fine_tune_bertarize(
                            src_path, d2, remove_head_pruning=True)
                        tf_model = TFAutoModelForSequenceClassification.from_pretrained(
                            d2, from_pt=True)
                    elif self.task in self.SUMMARIZATION_TASKS:
                        SummarizationSparseXP.final_fine_tune_bertarize(
                            src_path, d2, remove_head_pruning=True)
                        tf_model = TFAutoModelForSeq2SeqLM.from_pretrained(
                            d2, from_pt=True)
                    else:
                        raise Exception(f"Unknown task {self.task}")

                    tf_model.save_pretrained(self.git_path)
                    modified = True

            if force or not (self.git_path / "pytorch_model.bin").exists():
                if self.task in self.QA_TASKS:
                    model = AutoModelForQuestionAnswering.from_pretrained(
                        src_path)
                elif self.task in self.GLUE_TASKS:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        src_path)
                elif self.task in self.SUMMARIZATION_TASKS:
                    model = AutoModelForSeq2SeqLM.from_pretrained(src_path)
                else:
                    raise Exception(f"Unknown task {self.task}")
                model.save_pretrained(self.git_path)
                modified = True

            src_path = Path(src_path)
            to_copy = self.get_copy_list()

            for files, dest in to_copy:
                dest.mkdir(exist_ok=True)
                for file in files:
                    if force or not (dest / file).exists():
                        shutil.copyfile(str(src_path / file), str(dest / file))
                        modified = True
        finally:
            if d is not None:
                d.cleanup()

        # Reload the config, this may have been changed by compilation / optimization (pruned_heads, gelu_patch, layer_norm_patch)
        with (self.git_path / "config.json").open() as f:
            self.checkpoint_info["config"] = json.load(f)

        return modified
Example No. 20
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        assert False, "--dataset must be one of ['squadv1', 'squadv2', 'debug']"

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        print("Starting finetuning")
        pbar = tqdm.tqdm(total=total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model, data_dir=data_dir, filename=val_filename, batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}"
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step
                    )

        if is_final_step:
            break

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
        return results
    # pbar and results are only defined on rank 0
    return None


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--checkpoint", type=str, default=None)
    parser.add_argument("--pre_layer_norm", type=str, choices=["true"])
    args = parser.parse_args()

    # Load finetuned model from checkpoint
    config = AutoConfig.from_pretrained("albert-base-v2")
    config.pre_layer_norm = args.pre_layer_norm == "true"
    model = TFAutoModelForQuestionAnswering.from_config(config)

    # XLA, AMP, tf.function
    tf.config.optimizer.set_jit(True)
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
    model.call = tf.function(model.call)

    # Get validation dataset
    data_dir = "/fsx/squad_data"
    train_filename = "train-v2.0.json"
    val_filename = "dev-v2.0.json"

    results = get_evaluation_metrics(
        model=model, data_dir=data_dir, filename=val_filename, batch_size=args.batch_size
    )
    print(dict(results))
Example No. 22
 def __init__(self, log_path: str, base_model: str) -> None:
     self._wiki = MediaWiki()
     self._entity_recognizer = TFLiteNLU(log_path)
     self._tokenizer = AutoTokenizer.from_pretrained(base_model)
     self._answerer = TFAutoModelForQuestionAnswering.from_pretrained(
         base_model)
Example No. 23
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if training_args.should_log:
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load Data
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]

        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
            "requirement"
        )
    # endregion

    # region Preprocessing the datasets
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    elif training_args.do_eval:
        column_names = datasets["validation"].column_names
    else:
        column_names = datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace
        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    processed_datasets = dict()
    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            # We will select samples from the whole data if the argument is specified
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # Create train feature from dataset
        train_dataset = train_dataset.map(
            prepare_train_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        if data_args.max_train_samples is not None:
            # The number of samples might increase during feature creation, so we select only the specified max samples
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        processed_datasets["train"] = train_dataset

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace
        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_examples = datasets["validation"]
        if data_args.max_eval_samples is not None:
            # We will select samples from the whole data
            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
        # Validation Feature Creation
        eval_dataset = eval_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        if data_args.max_eval_samples is not None:
            # The number of samples might increase during feature creation, so we select the required samples again
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        processed_datasets["validation"] = eval_dataset

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_examples = datasets["test"]
        if data_args.max_predict_samples is not None:
            # We will select samples from the whole data
            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
        # Predict Feature Creation
        predict_dataset = predict_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        if data_args.max_predict_samples is not None:
            # The number of samples might increase during feature creation, so we select the required samples again
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        processed_datasets["test"] = predict_dataset
    # endregion

    # region Metrics and Post-processing:
    def post_processing_function(examples, features, predictions, stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions = postprocess_qa_predictions(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold,
            output_dir=training_args.output_dir,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    # endregion

    with training_args.strategy.scope():
        # region Load model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )

        def dummy_loss(y_true, y_pred):
            return tf.reduce_mean(y_pred)

        losses = {"loss": dummy_loss}
        model.compile(optimizer=optimizer, loss=losses)
        # endregion

        # region Training
        if training_args.do_train:
            # Make a tf.data.Dataset for this
            if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
                logger.info("Padding all batches to max length because argument was set or we're on TPU.")
                dataset_mode = "constant_batch"
            else:
                dataset_mode = "variable_batch"
            training_dataset = convert_dataset_for_tensorflow(
                processed_datasets["train"],
                batch_size=training_args.per_device_train_batch_size,
                dataset_mode=dataset_mode,
                drop_remainder=True,
                shuffle=True,
            )
            model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
        # endregion

        # region Evaluation
        if training_args.do_eval:
            logger.info("*** Evaluation ***")
            eval_inputs = {
                "input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
                "attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
            }
            eval_predictions = model.predict(eval_inputs)

            post_processed_eval = post_processing_function(
                datasets["validation"],
                processed_datasets["validation"],
                (eval_predictions.start_logits, eval_predictions.end_logits),
            )
            metrics = compute_metrics(post_processed_eval)
            logging.info("Evaluation metrics:")
            for metric, value in metrics.items():
                logging.info(f"{metric}: {value:.3f}")
        # endregion

        # region Prediction
        if training_args.do_predict:
            logger.info("*** Predict ***")
            predict_inputs = {
                "input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
                "attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
            }
            test_predictions = model.predict(predict_inputs)
            post_processed_test = post_processing_function(
                datasets["test"],
                processed_datasets["test"],
                (test_predictions.start_logits, test_predictions.end_logits),
            )
            metrics = compute_metrics(post_processed_test)

            logging.info("Test metrics:")
            for metric, value in metrics.items():
                logging.info(f"{metric}: {value:.3f}")
        # endregion

    if training_args.push_to_hub:
        model.push_to_hub()
# -*- coding: utf-8 -*-

import pandas as pd
data_bert_df = pd.read_csv(r"data_bert.csv")

from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
import tensorflow as tf
tokenizer = AutoTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")
model = TFAutoModelForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad", return_dict=True)


def tester(text, question, answer):

    questions = [question]
    for question in questions:
        inputs = tokenizer(question,
                           text,
                           add_special_tokens=True,
                           return_tensors="tf")
        input_ids = inputs["input_ids"].numpy()[0]
        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer_scores = model(inputs)
        answer_start = tf.argmax(answer_scores["start_logits"], axis=1).numpy(
        )[0]  # Get the most likely beginning of answer with the argmax of the score
        answer_end = (
            tf.argmax(answer_scores["end_logits"], axis=1) + 1
        ).numpy(
        )[0]  # Get the most likely end of answer with the argmax of the score
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            logger.warn(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically"
            )

        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            )

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = SquadV2Processor(
        ) if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(
            data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(
            data_args.data_dir) if training_args.do_eval else None

    train_dataset = (squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=True,
        return_dataset="tf",
    ) if training_args.do_train else None)

    train_dataset = train_dataset.apply(
        tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = (squad_convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=False,
        return_dataset="tf",
    ) if training_args.do_eval else None)

    eval_dataset = eval_dataset.apply(
        tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
Example No. 26
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments, PathArguments)
    )
    model_args, data_args, train_args, log_args, path_args = parser.parse_args_into_dataclasses()

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    level = logging.INFO
    format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
    handlers = [
        TqdmLoggingHandler(),
    ]
    logging.basicConfig(level=level, format=format, handlers=handlers)

    # Horovod init
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    # XLA, AMP, AutoGraph
    parse_bool = lambda arg: arg == "true"
    tf.config.optimizer.set_jit(not parse_bool(train_args.skip_xla))
    tf.config.experimental_run_functions_eagerly(parse_bool(train_args.eager))

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "eks" if path_args.filesystem_prefix == "/fsx" else "sm"
        if log_args.run_name is None:
            run_name = f"{current_time}-{platform}-{model_args.model_type}-{model_args.model_size}-{data_args.squad_version}-{model_args.load_from}-{hvd.size()}gpus-{train_args.name}"
        else:
            run_name = log_args.run_name
    else:
        # We only use run_name on rank 0, but need all ranks to pass a value in function args
        run_name = None

    if model_args.load_from == "huggingface":
        logger.info(f"Loading weights from Huggingface {model_args.model_desc}")
        model = TFAutoModelForQuestionAnswering.from_pretrained(model_args.model_desc)
    else:
        model = create_model(model_class=TFAutoModelForQuestionAnswering, model_args=model_args)

    model.call = rewrap_tf_function(model.call)
    tokenizer = create_tokenizer(model_args.model_type)

    loaded_optimizer_weights = None
    if model_args.load_from == "checkpoint":
        if hvd.rank() == 0:
            checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
            logger.info(f"Loading weights from {checkpoint_path}.ckpt")
            model.load_weights(f"{checkpoint_path}.ckpt").expect_partial()

    results = run_squad_and_get_results(
        model=model,
        tokenizer=tokenizer,
        run_name=run_name,
        filesystem_prefix=path_args.filesystem_prefix,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
        checkpoint_frequency=log_args.checkpoint_frequency,
        validate_frequency=log_args.validation_frequency,
        evaluate_frequency=log_args.evaluate_frequency,
        learning_rate=train_args.learning_rate,
        warmup_steps=train_args.warmup_steps,
        total_steps=train_args.total_steps,
        dataset=data_args.squad_version,
    )
    if hvd.rank() == 0:
        logger.info(results)