Example #1
# Imports assumed for this snippet (FARM / transformers import paths as in the FARM library)
import pprint
from pathlib import Path

from transformers import pipeline

from farm.conversion.transformers import Converter
from farm.data_handler.processor import Processor
from farm.modeling.adaptive_model import AdaptiveModel


def convert_to_transformers():
    farm_input_dir = Path(
        "../saved_models/farm-bert-base-german-cased-hatespeech-GermEval18Coarse"
    )
    transformers_output_dir = "../saved_models/bert-base-german-cased-hatespeech-GermEval18Coarse"
    # load from FARM format
    model = AdaptiveModel.load(farm_input_dir, device="cpu")
    processor = Processor.load_from_dir(farm_input_dir)
    model.connect_heads_with_processor(processor.tasks)

    # convert to transformers
    transformer_model = Converter.convert_to_transformers(model)[0]
    # Alternative way to convert to transformers:
    # transformer_model = model.convert_to_transformers()[0]

    # save it (note: transformers use str instead of Path objects)
    Path(transformers_output_dir).mkdir(parents=True, exist_ok=True)
    transformer_model.save_pretrained(transformers_output_dir)
    processor.tokenizer.save_pretrained(transformers_output_dir)

    # run predictions (using transformers)
    nlp = pipeline('sentiment-analysis',
                   model=str(transformers_output_dir),
                   tokenizer=str(transformers_output_dir))
    res = nlp("Was ein scheiß Nazi!")
    pprint.pprint(res)
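For reference, a minimal sketch (not part of the original example) showing how the converted checkpoint can be loaded back with the generic transformers Auto classes; the directory is the transformers_output_dir written above:

from transformers import AutoModelForSequenceClassification, AutoTokenizer

converted_dir = "../saved_models/bert-base-german-cased-hatespeech-GermEval18Coarse"
hf_model = AutoModelForSequenceClassification.from_pretrained(converted_dir)
hf_tokenizer = AutoTokenizer.from_pretrained(converted_dir)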
Example #2
    def load(cls, load_dir, batch_size=4, gpu=False, embedder_only=False):
        """
        Initializes inferencer from directory with saved model.

        :param load_dir: Directory where the saved model is located.
        :type load_dir: str
        :param batch_size: Number of samples to process per batch
        :type batch_size: int
        :param gpu: Whether to use a GPU
        :type gpu: bool
        :param embedder_only: If True, a faster processor (InferenceProcessor) is loaded. This should only be used
                              for extracting embeddings (no downstream predictions).
        :type embedder_only: bool
        :return: An instance of the Inferencer.
        """

        device, n_gpu = initialize_device_settings(
            use_cuda=gpu, local_rank=-1, fp16=False
        )

        model = AdaptiveModel.load(load_dir, device)
        if embedder_only:
            # model.prediction_heads = []
            processor = InferenceProcessor.load_from_dir(load_dir)
        else:
            processor = Processor.load_from_dir(load_dir)

        name = os.path.basename(load_dir)
        return cls(model, processor, batch_size=batch_size, gpu=gpu, name=name)
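A brief usage sketch of this classmethod (the directory name is hypothetical; inference_from_dicts is the call used in the later test examples):

inferencer = Inferencer.load("saved_models/bert-german-doc-classification", batch_size=8, gpu=False)
result = inferencer.inference_from_dicts(dicts=[{"text": "Ein Beispielsatz."}])
print(result)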
Example #3
    def __init__(self, load_dir, batch_size=4, gpu=False):
        """
        Initializes inferencer from directory with saved model.
        :param load_dir: Directory containing a saved AdaptiveModel
        :type load_dir: str
        :param batch_size: Number of samples to process per batch
        :type batch_size: int
        :param gpu: Whether to use a GPU
        :type gpu: bool
        """
        # Init device and distributed settings
        device, n_gpu = initialize_device_settings(
            use_cuda=gpu, local_rank=-1, fp16=False
        )

        self.processor = Processor.load_from_dir(load_dir)
        self.model = AdaptiveModel.load(load_dir, device)
        self.model.eval()
        self.batch_size = batch_size
        self.device = device
        self.language = self.model.language_model.language
        # TODO adjust for multiple prediction heads
        if len(self.model.prediction_heads) == 1:
            self.prediction_type = self.model.prediction_heads[0].model_type
            self.label_map = self.processor.label_maps[0]
        elif len(self.model.prediction_heads) == 0:
            self.prediction_type = "embedder"
        self.name = os.path.basename(load_dir)
        set_all_seeds(42, n_gpu)
Example #4
# Imports assumed for this snippet (FARM / transformers import paths as in the FARM library)
import os
import pprint
from pathlib import Path

from transformers import pipeline

from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.tokenization import Tokenizer


def convert_to_transformers():
    farm_model_dir = Path("../saved_models/bert-english-qa-large")

    # load from FARM format
    model = AdaptiveModel.load(farm_model_dir, device="cpu")
    tokenizer = Tokenizer.load(farm_model_dir)

    # convert to transformers
    transformer_model = model.convert_to_transformers()

    # save it (Note: transformers uses strings rather than Path objects)
    model_dir = "../saved_models/bert-large-uncased-whole-word-masking-squad2"
    os.makedirs(model_dir, exist_ok=True)
    transformer_model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    # run predictions (using transformers)
    nlp = pipeline('question-answering', model=model_dir, tokenizer=model_dir)
    res = nlp({
        'question':
        'Why is model conversion important?',
        'context':
        'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
    })
    pprint.pprint(res)
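The reverse direction is also available; a hedged sketch based on the AdaptiveModel.convert_from_transformers call shown in Example #17 (model name taken from Example #5):

# convert a transformers QA checkpoint into a FARM AdaptiveModel
farm_model = AdaptiveModel.convert_from_transformers(
    "deepset/bert-base-cased-squad2", "cpu", "question_answering")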
Example #5
def test_conversion_inferencer_qa():
    # input
    question = "Why is model conversion important?"
    text = "The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks."

    # Load from model hub
    model = "deepset/bert-base-cased-squad2"
    nlp = Inferencer.load(model, task_type="question_answering", num_processes=0)

    assert nlp.processor.tokenizer.do_lower_case is False
    assert nlp.processor.tokenizer.is_fast is True

    QA_input = [{"questions": [question], "text": text}]
    result_farm = nlp.inference_from_dicts(dicts=QA_input)
    answer_farm = result_farm[0]["predictions"][0]["answers"][0]["answer"]
    assert answer_farm == 'gives freedom to the user'

    # save it
    farm_model_dir = Path("testsave/bert-conversion-test")
    nlp.save(farm_model_dir)

    # free RAM
    del nlp

    # load from disk in FARM format
    model = AdaptiveModel.load(farm_model_dir, device="cpu")
    tokenizer = Tokenizer.load(farm_model_dir)

    # convert to transformers
    transformer_model = Converter.convert_to_transformers(model)[0]

    # free RAM
    del model

    # save it (Note: transformers uses strings rather than Path objects)
    model_dir = "testsave/bert-conversion-test-hf"
    os.makedirs(model_dir, exist_ok=True)
    transformer_model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    del transformer_model
    del tokenizer

    # run predictions (using transformers)
    nlp = pipeline('question-answering', model=model_dir, tokenizer=model_dir)
    result_transformers = nlp({
        'question': question,
        'context': text
    })
    answer_transformers = result_transformers["answer"]
    assert answer_farm == answer_transformers
    del nlp
Example #6
def test_prediction_head_load_save_class_weights(tmp_path, caplog=None):
    """This is a regression test for #428 and #422."""
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 1
    lang_model = "bert-base-german-cased"
    data_dir_path = "samples/doc_class"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    tcp_params = dict(tokenizer=tokenizer,
                      max_seq_len=8,
                      data_dir=Path(data_dir_path),
                      train_filename="train-sample.tsv",
                      label_list=["OTHER", "OFFENSE"],
                      metric="f1_macro",
                      dev_filename="test-sample.tsv",
                      test_filename=None,
                      dev_split=0.0,
                      label_column_name="coarse_label")

    processor = TextClassificationProcessor(**tcp_params)

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        num_labels=2,
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model.save(tmp_path)
    model_loaded = AdaptiveModel.load(tmp_path, device='cpu')
    assert model_loaded is not None
Example #7
    def load(
        cls,
        load_dir,
        batch_size=4,
        gpu=False,
        embedder_only=False,
        return_class_probs=False,
        strict=True
    ):
        """
        Initializes Inferencer from directory with saved model.

        :param load_dir: Directory where the saved model is located.
        :type load_dir: str
        :param batch_size: Number of samples to process per batch
        :type batch_size: int
        :param gpu: Whether to use a GPU
        :type gpu: bool
        :param embedder_only: If True, a faster processor (InferenceProcessor) is loaded. This should only be used for extracting embeddings (no downstream predictions).
        :type embedder_only: bool
        :param return_class_probs: If True, return the probabilities for all classes instead of only the predicted label
        :type return_class_probs: bool
        :param strict: Whether to strictly enforce that the keys loaded from the saved model match the ones in
                       the PredictionHead (see torch.nn.Module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older versions of FARM.
        :type strict: bool
        :return: An instance of the Inferencer.

        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, fp16=False)

        model = AdaptiveModel.load(load_dir, device, strict=strict)
        if embedder_only:
            # model.prediction_heads = []
            processor = InferenceProcessor.load_from_dir(load_dir)
        else:
            processor = Processor.load_from_dir(load_dir)

        name = os.path.basename(load_dir)
        return cls(
            model,
            processor,
            batch_size=batch_size,
            gpu=gpu,
            name=name,
            return_class_probs=return_class_probs,
        )
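A usage sketch (directory name hypothetical) for loading a checkpoint saved with an older FARM version, where strict=False relaxes the state-dict key check described in the docstring:

inferencer = Inferencer.load("saved_models/old-farm-checkpoint",
                             batch_size=8,
                             gpu=False,
                             strict=False)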
Example #8
def test_conversion_inferencer_ner():
    # input
    text = "Paris is a town in France."

    # Load from model hub
    model = "dslim/bert-base-NER"
    nlp = Inferencer.load(model, task_type="ner", num_processes=0)

    assert nlp.processor.tokenizer.do_lower_case is False
    assert nlp.processor.tokenizer.is_fast is True

    input = [{"text": text}]
    result_farm = nlp.inference_from_dicts(dicts=input)
    pred_farm = result_farm[0]["predictions"]
    assert pred_farm[0][0]["label"] == 'LOC'
    assert pred_farm[0][1]["label"] == 'LOC'
    assert len(pred_farm[0]) == 2

    # save it
    farm_model_dir = Path("testsave/bert-conversion-test-hf")
    nlp.save(farm_model_dir)
    del nlp

    # load from disk in FARM format
    model = AdaptiveModel.load(farm_model_dir, device="cpu")
    tokenizer = Tokenizer.load(farm_model_dir)

    # convert to transformers
    transformer_model = Converter.convert_to_transformers(model)[0]
    del model

    # save it (Note: transformers uses strings rather than Path objects)
    model_dir = "testsave/bert-conversion-test-hf"
    os.makedirs(model_dir, exist_ok=True)
    transformer_model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    del transformer_model
    del tokenizer

    # run predictions (using transformers)
    nlp = pipeline('ner', model=model_dir, tokenizer=model_dir)
    result_transformers = nlp(text)
    assert result_transformers[0]["entity"] == 'B-LOC'
    assert result_transformers[1]["entity"] == 'B-LOC'
    assert len(result_transformers) == 2
    del nlp
Example #9
    def __init__(self, load_dir, batch_size=4, gpu=False):
        # Init device and distributed settings
        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   fp16=False)

        self.processor = Processor.load_from_dir(load_dir)
        self.model = AdaptiveModel.load(load_dir, device)
        self.model.eval()
        self.batch_size = batch_size
        self.device = device
        self.language = self.model.language_model.language
        # TODO adjust for multiple prediction heads
        self.prediction_type = self.model.prediction_heads[0].model_type
        self.name = os.path.basename(load_dir)
        self.label_map = self.processor.label_maps[0]
        set_all_seeds(42, 1)
Example #10
def test_conversion_inferencer_classification():
    # input
    text = "Das ist blöd."

    # Load from model hub
    model = "deepset/bert-base-german-cased-hatespeech-GermEval18Coarse"
    nlp = Inferencer.load(model,
                          task_type="text_classification",
                          num_processes=0)

    assert nlp.processor.tokenizer.do_lower_case is False
    assert nlp.processor.tokenizer.is_fast is True

    input = [{"text": text}]
    result_farm = nlp.inference_from_dicts(dicts=input)
    pred_farm = result_farm[0]["predictions"][0]["label"]
    assert pred_farm == 'OFFENSE'

    # save it
    farm_model_dir = Path("testsave/bert-conversion-test-hf")
    nlp.save(farm_model_dir)
    del nlp

    # load from disk in FARM format
    model = AdaptiveModel.load(farm_model_dir, device="cpu")
    tokenizer = Tokenizer.load(farm_model_dir)

    # convert to transformers
    transformer_model = Converter.convert_to_transformers(model)[0]
    del model

    # save it (Note: transformers uses strings rather than Path objects)
    model_dir = "testsave/bert-conversion-test-hf"
    os.makedirs(model_dir, exist_ok=True)
    transformer_model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    del transformer_model
    del tokenizer

    # run predictions (using transformers)
    nlp = pipeline('sentiment-analysis', model=model_dir, tokenizer=model_dir)
    result_transformers = nlp(text)
    pred_transformers = result_transformers[0]["label"]
    assert pred_farm == pred_transformers
    del nlp
Example #11
    def load(cls, load_dir, batch_size=4, gpu=False):
        """
        Initializes inferencer from directory with saved model.
        :param load_dir: Directory where the saved model is located.
        :type load_dir: str
        :param batch_size: Number of samples to process per batch
        :type batch_size: int
        :param gpu: Whether to use a GPU
        :type gpu: bool
        :return: An instance of the Inferencer.
        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   fp16=False)

        model = AdaptiveModel.load(load_dir, device)
        processor = Processor.load_from_dir(load_dir)
        name = os.path.basename(load_dir)
        return cls(model, processor, batch_size=batch_size, gpu=gpu, name=name)
Example #12
File: train.py Project: wwmmqq/FARM
    def train(self):
        """ Perform the training procedure. """

        # connect the prediction heads with the right output from processor
        self.model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                                require_labels=True)

        # Check that the tokenizer fits the language model
        self.model.verify_vocab_size(
            vocab_size=len(self.data_silo.processor.tokenizer))

        logger.info(f"\n {GROWING_TREE}")
        self.model.train()

        do_stopping = False
        evalnr = 0
        loss = 0

        resume_from_step = self.from_step

        for epoch in range(self.from_epoch + 1, self.epochs + 1):
            train_data_loader = self.data_silo.get_data_loader("train")
            progress_bar = tqdm(train_data_loader)
            for step, batch in enumerate(progress_bar, start=1):
                # when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
                if resume_from_step and step <= resume_from_step:
                    if resume_from_step == step:
                        resume_from_step = None
                    continue

                if self.sigterm_handler and self.sigterm_handler.kill_now:  # save the current state as a checkpoint
                    logger.info(
                        "Received a SIGTERM signal. Saving the current train state as a checkpoint ..."
                    )
                    self._save()
                    sys.exit(0)

                # save a checkpoint and continue training (do not create a new checkpoint if just resumed from a checkpoint)
                # guard against resume_from_step being None once the resumed steps have been skipped
                if self.checkpoint_every and step % self.checkpoint_every == 0 \
                        and (resume_from_step is None or resume_from_step + 1 != step):
                    self._save()

                progress_bar.set_description(
                    f"Train epoch {epoch}/{self.epochs} (Cur. train loss: {loss:.4f})"
                )

                # Move batch of samples to device
                batch = {key: batch[key].to(self.device) for key in batch}

                # Forward pass through model
                logits = self.model.forward(**batch)
                per_sample_loss = self.model.logits_to_loss(
                    logits=logits, global_step=self.global_step, **batch)

                loss = self.backward_propagate(per_sample_loss, step)

                # Perform  evaluation
                if self.global_step % self.evaluate_every == 0 and self.global_step != 0:
                    # When using StreamingDataSilo, each evaluation creates a new instance of
                    # dev_data_loader. In cases like training from scratch, this could cause
                    # some variance across evaluators due to the randomness in word masking.
                    dev_data_loader = self.data_silo.get_data_loader("dev")
                    if dev_data_loader is not None:
                        evaluator_dev = Evaluator(
                            data_loader=dev_data_loader,
                            tasks=self.data_silo.processor.tasks,
                            device=self.device)
                        evalnr += 1
                        result = evaluator_dev.eval(self.model)
                        evaluator_dev.log_results(result, "Dev",
                                                  self.global_step)
                        if self.early_stopping:
                            do_stopping, save_model, eval_value = self.early_stopping.check_stopping(
                                result)
                            if save_model:
                                logger.info(
                                    "Saving current best model to {}, eval={}".
                                    format(self.early_stopping.save_dir,
                                           eval_value))
                                self.model.save(self.early_stopping.save_dir)
                                self.data_silo.processor.save(
                                    self.early_stopping.save_dir)
                            if do_stopping:
                                # log the stopping
                                logger.info(
                                    "STOPPING EARLY AT EPOCH {}, STEP {}, EVALUATION {}"
                                    .format(epoch, step, evalnr))
                if do_stopping:
                    break
                self.global_step += 1
                self.from_step = step
            self.from_epoch = epoch
            if do_stopping:
                break

        # With early stopping we want to restore the best model
        if self.early_stopping and self.early_stopping.save_dir:
            logger.info("Restoring best model so far from {}".format(
                self.early_stopping.save_dir))
            lm_name = self.model.language_model.name
            # assign to self.model so that the restored best model is evaluated and returned below
            self.model = AdaptiveModel.load(self.early_stopping.save_dir,
                                            self.device,
                                            lm_name=lm_name)
            self.model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                                    require_labels=True)

        # Eval on test set
        test_data_loader = self.data_silo.get_data_loader("test")
        if test_data_loader is not None:
            evaluator_test = Evaluator(data_loader=test_data_loader,
                                       tasks=self.data_silo.processor.tasks,
                                       device=self.device)
            result = evaluator_test.eval(self.model)
            evaluator_test.log_results(result, "Test", self.global_step)
        return self.model
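A hedged sketch of how this method is typically driven, reusing the Trainer construction pattern from Example #14 (model, optimizer, lr_schedule, data_silo, device and n_gpu are assumed to be set up as shown there):

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=2,
                  n_gpu=n_gpu,
                  lr_schedule=lr_schedule,
                  evaluate_every=100,
                  device=device)
best_model = trainer.train()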
Example #13
    def train(self):
        """
        Perform the training procedure.

        The training is visualized by a progress bar. Epochs are counted in a zero-based manner:
        for example, when you specify ``epochs=20``, it counts from 0 to 19.

        If the trainer evaluates the model on a test set, the result of the
        evaluation is stored in ``test_result``.

        :return: The model after training. When you use ``early_stopping``
            with a ``save_dir``, the best model is loaded and returned.
        """

        # connect the prediction heads with the right output from processor
        self.model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                                require_labels=True)
        # Check that the tokenizer(s) fits the language model(s)
        if hasattr(self.model, "language_model2"):
            self.model.verify_vocab_size(
                vocab_size1=len(self.data_silo.processor.query_tokenizer),
                vocab_size2=len(self.data_silo.processor.passage_tokenizer))
        else:
            self.model.verify_vocab_size(
                vocab_size=len(self.data_silo.processor.tokenizer))
        self.model.train()

        do_stopping = False
        evalnr = 0
        loss = 0
        resume_from_step = self.from_step

        if self.local_rank in [0, -1]:
            logger.info(f"\n {GROWING_TREE}")

        for epoch in range(self.from_epoch, self.epochs):
            early_break = False
            self.from_epoch = epoch
            train_data_loader = self.data_silo.get_data_loader("train")
            progress_bar = tqdm(train_data_loader,
                                disable=self.local_rank not in [0, -1]
                                or self.disable_tqdm)
            for step, batch in enumerate(progress_bar):
                # when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
                if resume_from_step and step <= resume_from_step:
                    # TODO: Improve skipping for StreamingDataSilo
                    # The seeds before and within the loop are currently needed, if you need full reproducibility
                    # of runs with vs. without checkpointing using StreamingDataSilo. Reason: While skipping steps in StreamingDataSilo,
                    # we update the state of the random number generator (e.g. due to masking words), which can impact the model behaviour (e.g. dropout)
                    if step % 10000 == 0:
                        logger.info(
                            f"Skipping {step} out of {resume_from_step} steps ..."
                        )
                    if resume_from_step == step:
                        logger.info(
                            f"Finished skipping {resume_from_step} steps ...")
                        resume_from_step = None
                    else:
                        continue

                progress_bar.set_description(
                    f"Train epoch {epoch}/{self.epochs-1} (Cur. train loss: {loss:.4f})"
                )

                # Only for distributed training: we need to ensure that all ranks still have a batch left for training
                if self.local_rank != -1:
                    if not self._all_ranks_have_data(has_data=1, step=step):
                        early_break = True
                        break

                # Move batch of samples to device
                batch = {key: batch[key].to(self.device) for key in batch}
                # Forward & backward pass through model
                logits = self.model.forward(**batch)
                per_sample_loss = self.model.logits_to_loss(
                    logits=logits, global_step=self.global_step, **batch)
                loss = self.backward_propagate(per_sample_loss, step)

                # Perform  evaluation
                if self.evaluate_every != 0 \
                        and self.global_step % self.evaluate_every == 0 \
                        and self.global_step != 0\
                        and self.local_rank in [0,-1]:
                    # When using StreamingDataSilo, each evaluation creates a new instance of
                    # dev_data_loader. In cases like training from scratch, this could cause
                    # some variance across evaluators due to the randomness in word masking.
                    dev_data_loader = self.data_silo.get_data_loader("dev")
                    if dev_data_loader is not None:
                        evaluator_dev = Evaluator(
                            data_loader=dev_data_loader,
                            tasks=self.data_silo.processor.tasks,
                            device=self.device,
                            report=self.eval_report)
                        evalnr += 1
                        result = evaluator_dev.eval(self.model)
                        evaluator_dev.log_results(result, "Dev",
                                                  self.global_step)
                        if self.early_stopping:
                            do_stopping, save_model, eval_value = self.early_stopping.check_stopping(
                                result)
                            if save_model:
                                logger.info(
                                    "Saving current best model to {}, eval={}".
                                    format(self.early_stopping.save_dir,
                                           eval_value))
                                self.model.save(self.early_stopping.save_dir)
                                self.data_silo.processor.save(
                                    self.early_stopping.save_dir)
                            if do_stopping:
                                # log the stopping
                                logger.info(
                                    "STOPPING EARLY AT EPOCH {}, STEP {}, EVALUATION {}"
                                    .format(epoch, step, evalnr))
                if do_stopping:
                    break

                self.global_step += 1
                self.from_step = step + 1

                # save the current state as a checkpoint before exiting if a SIGTERM signal is received
                if self.sigterm_handler and self.sigterm_handler.kill_now:
                    logger.info(
                        "Received a SIGTERM signal. Saving the current train state as a checkpoint ..."
                    )
                    if self.local_rank in [0, -1]:
                        self._save()
                        torch.distributed.destroy_process_group()
                        sys.exit(0)

                # save a checkpoint and continue training
                if self.checkpoint_every and step % self.checkpoint_every == 0:
                    if self.local_rank in [0, -1]:
                        self._save()
                    # Let other ranks wait until rank 0 has finished saving
                    if self.local_rank != -1:
                        torch.distributed.barrier()

            if do_stopping:
                break

            # Only for distributed training: we need to ensure that all ranks still have a batch left for training
            if self.local_rank != -1 and not early_break:
                self._all_ranks_have_data(has_data=False)

        # With early stopping we want to restore the best model
        if self.early_stopping and self.early_stopping.save_dir:
            logger.info("Restoring best model so far from {}".format(
                self.early_stopping.save_dir))
            lm_name = self.model.language_model.name
            self.model = AdaptiveModel.load(self.early_stopping.save_dir,
                                            self.device,
                                            lm_name=lm_name)
            self.model.connect_heads_with_processor(
                self.data_silo.processor.tasks, require_labels=True)

        # Eval on test set
        if self.evaluator_test and self.local_rank in [0, -1]:
            test_data_loader = self.data_silo.get_data_loader("test")
            if test_data_loader is not None:
                evaluator_test = Evaluator(
                    data_loader=test_data_loader,
                    tasks=self.data_silo.processor.tasks,
                    device=self.device)
                self.test_result = evaluator_test.eval(self.model)
                evaluator_test.log_results(self.test_result, "Test",
                                           self.global_step)
        return self.model
Example #14
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string name
    # is then passed as the metric.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us to use the best model of
        # each fold in an ensemble afterwards!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="micro")
    xval_f1_macro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="macro")
    xval_f1_offense = f1_score(all_labels,
                               all_preds,
                               labels=label_list,
                               pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info("XVAL F1 MICRO:   ", xval_f1_micro)
    logger.info("XVAL F1 MACRO:   ", xval_f1_macro)
    logger.info("XVAL F1 OFFENSE: ", xval_f1_offense)
    logger.info("XVAL F1 OTHER:   ", xval_f1_other)
    logger.info("XVAL MCC:        ", xval_mcc)

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO:   ", result[0]["f1_micro"])
    logger.info("TEST F1 MACRO:   ", result[0]["f1_macro"])
    logger.info("TEST F1 OFFENSE: ", result[0]["f1_offense"])
    logger.info("TEST F1 OTHER:   ", result[0]["f1_other"])
    logger.info("TEST MCC:        ", result[0]["mcc"])
Example #15
xval_mcc = matthews_corrcoef(all_labels, all_preds)

logger.info("XVAL F1 MICRO:   ", xval_f1_micro)
logger.info("XVAL F1 MACRO:   ", xval_f1_macro)
logger.info("XVAL F1 OFFENSE: ", xval_f1_offense)
logger.info("XVAL F1 OTHER:   ", xval_f1_other)
logger.info("XVAL MCC:        ", xval_mcc)

# -----------------------------------------------------
# Just for illustration, use the best model from the best xval fold for evaluation on
# the original (still unseen) test set.
logger.info("###### Final Eval on hold out test set using best model #####")
evaluator_origtest = Evaluator(data_loader=data_silo.get_data_loader("test"),
                               tasks=data_silo.processor.tasks,
                               device=device)
# restore model from the best fold
lm_name = model.language_model.name
save_dir = "saved_models/bert-german-doc-tutorial-es-{}".format(bestfold)
model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
model.connect_heads_with_processor(data_silo.processor.tasks,
                                   require_labels=True)

result = evaluator_origtest.eval(model)
logger.info("TEST F1 MICRO:   ", result[0]["f1_micro"])
logger.info("TEST F1 MACRO:   ", result[0]["f1_macro"])
logger.info("TEST F1 OFFENSE: ", result[0]["f1_offense"])
logger.info("TEST F1 OTHER:   ", result[0]["f1_other"])
logger.info("TEST MCC:        ", result[0]["mcc"])

# fmt: on
Example #16
    def train(self, model):
        """ Perform the training procedure. """

        # connect the prediction heads with the right output from processor
        model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                           require_labels=True)

        # Check that the tokenizer fits the language model
        model.verify_vocab_size(
            vocab_size=len(self.data_silo.processor.tokenizer))

        logger.info(f"\n {GROWING_TREE}")
        model.train()

        do_stopping = False
        evalnr = 0
        loss = 0
        for epoch in range(1, self.epochs + 1):
            progress_bar = tqdm(self.data_loader_train)
            for step, batch in enumerate(progress_bar):
                progress_bar.set_description(
                    f"Train epoch {epoch}/{self.epochs} (Cur. train loss: {loss:.4f})"
                )

                # Move batch of samples to device
                batch = {key: batch[key].to(self.device) for key in batch}

                # Forward pass through model
                logits = model.forward(**batch)
                per_sample_loss = model.logits_to_loss(logits=logits, **batch)

                loss = self.backward_propagate(per_sample_loss, step)

                # Perform  evaluation
                if self.evaluator_dev:
                    if self.global_step != 0 and (self.global_step %
                                                  self.evaluate_every == 0):
                        evalnr += 1
                        result = self.evaluator_dev.eval(model)
                        self.evaluator_dev.log_results(result, "Dev",
                                                       self.global_step)
                        if self.early_stopping:
                            do_stopping, save_model, eval_value = self.early_stopping.check_stopping(
                                result)
                            if save_model:
                                logger.info(
                                    "Saving current best model to {}, eval={}".
                                    format(self.early_stopping.save_dir,
                                           eval_value))
                                model.save(self.early_stopping.save_dir)
                                self.data_silo.processor.save(
                                    self.early_stopping.save_dir)
                            if do_stopping:
                                # log the stopping
                                logger.info(
                                    "STOPPING EARLY AT EPOCH {}, STEP {}, EVALUATION {}"
                                    .format(epoch, step, evalnr))
                if do_stopping:
                    break
                self.global_step += 1
            if do_stopping:
                break

        # With early stopping we want to restore the best model
        if self.early_stopping and self.early_stopping.save_dir:
            logger.info("Restoring best model so far from {}".format(
                self.early_stopping.save_dir))
            lm_name = model.language_model.name
            model = AdaptiveModel.load(self.early_stopping.save_dir,
                                       self.device,
                                       lm_name=lm_name)
            model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                               require_labels=True)

        # Eval on test set
        if self.evaluator_test:
            result = self.evaluator_test.eval(model)
            self.evaluator_test.log_results(result, "Test", self.global_step)
        return model
Example #17
File: infer.py Project: wwmmqq/FARM
    def load(cls,
             model_name_or_path,
             batch_size=4,
             gpu=False,
             task_type=None,
             return_class_probs=False,
             strict=True,
             max_seq_len=256):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param batch_size: Number of samples to process per batch
        :type batch_size: int
        :param gpu: Whether to use a GPU
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supported:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param return_class_probs: If True, return the probabilities for all classes instead of only the predicted label
        :type return_class_probs: bool
        :param strict: Whether to strictly enforce that the keys loaded from the saved model match the ones in
                       the PredictionHead (see torch.nn.Module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older versions of FARM.
        :type strict: bool
        :param max_seq_len: Maximum number of tokens handed to the processor per sample
        :type max_seq_len: int
        :return: An instance of the Inferencer.

        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = AdaptiveModel.load(model_name_or_path,
                                       device,
                                       strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

        # b) or from remote transformers model hub
        else:
            logger.info(
                f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
            )
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path, device, task_type)
            config = AutoConfig.from_pretrained(model_name_or_path)
            tokenizer = Tokenizer.load(model_name_or_path)

            # TODO infer task_type automatically from config (if possible)
            if task_type == "question_answering":
                processor = SquadProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    label_list=["start_token", "end_token"],
                    metric="squad",
                    data_dir=None,
                )
            elif task_type == "embeddings":
                processor = InferenceProcessor(tokenizer=tokenizer,
                                               max_seq_len=max_seq_len)

            elif task_type == "text_classification":
                label_list = list(config.id2label[id]
                                  for id in range(len(config.id2label)))
                processor = TextClassificationProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    data_dir=None,
                    label_list=label_list,
                    label_column_name="label",
                    metric="acc",
                    quote_char='"',
                )

            # elif task_type == "multilabel-classification":
            #     # label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
            #     label_list = list(config.label2id.keys())
            #
            #     processor = TextClassificationProcessor(tokenizer=tokenizer,
            #                                             max_seq_len=max_seq_len,
            #                                             data_dir=None,
            #                                             label_list=label_list,
            #                                             label_column_name="label",
            #                                             metric="acc",
            #                                             quote_char='"',
            #                                             multilabel=True,
            #                                             )

            elif task_type == "ner":
                label_list = list(config.label2id.keys())
                processor = NERProcessor(tokenizer=tokenizer,
                                         max_seq_len=max_seq_len,
                                         data_dir=None,
                                         metric="seq_f1",
                                         label_list=label_list)
            else:
                raise ValueError(
                    f"`task_type` {task_type} is not supported yet. "
                    f"Valid options for arg `task_type`: 'question_answering', 'embeddings', 'text_classification'"
                )

        return cls(
            model,
            processor,
            batch_size=batch_size,
            gpu=gpu,
            name=name,
            return_class_probs=return_class_probs,
        )
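A usage sketch of this load method with a public hub model (model name taken from Example #10):

nlp = Inferencer.load("deepset/bert-base-german-cased-hatespeech-GermEval18Coarse",
                      task_type="text_classification",
                      gpu=False)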
    def perform_fine_tuning(current_info_need,
                            bert_model,
                            label_list,
                            num_epochs,
                            condition,
                            folds=10,
                            stratified=True,
                            learning_rate=2e-5,
                            batch_size=32,
                            embeds_dropout_prob=.1):

        ## Define evaluation metrics ##
        def evaluation_metrics(preds, labels):
            acc = simple_accuracy(preds, labels).get("acc")
            f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
            f1infoneed = f1_score(y_true=labels,
                                  y_pred=preds,
                                  pos_label=current_info_need)
            recall_infoneed = recall_score(y_true=labels,
                                           y_pred=preds,
                                           pos_label=current_info_need)
            precision_infoneed = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 pos_label=current_info_need)
            recall_other = recall_score(y_true=labels,
                                        y_pred=preds,
                                        pos_label="Other")
            precision_other = precision_score(y_true=labels,
                                              y_pred=preds,
                                              pos_label="Other")
            recall_macro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="macro")
            precision_macro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="macro")
            recall_micro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="micro")
            precision_micro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="micro")
            recall_weighted = recall_score(y_true=labels,
                                           y_pred=preds,
                                           average="weighted")
            precision_weighted = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 average="weighted")
            f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
            f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
            mcc = matthews_corrcoef(labels, preds)
            f1weighted = f1_score(y_true=labels,
                                  y_pred=preds,
                                  average="weighted")

            return {
                "info_need": current_info_need,
                "model": bert_model,
                "num_epochs": num_epochs,
                "condition": condition,
                "acc": acc,
                "f1_other": f1other,
                "f1_infoneed": f1infoneed,
                "precision_infoneed": precision_infoneed,
                "recall_infoneed": recall_infoneed,
                "recall_other": recall_other,
                "precision_other": precision_other,
                "recall_macro": recall_macro,
                "precision_macro": precision_macro,
                "recall_micro": recall_micro,
                "precision_micro": precision_micro,
                "recall_weighted": recall_weighted,
                "precision_weighted": precision_weighted,
                "f1_weighted": f1weighted,
                "f1_macro": f1macro,
                "f1_micro": f1micro,
                "f1_weighted": f1weighted,
                "mcc": mcc
            }

        register_metrics(
            f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs',
            evaluation_metrics)
        metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'
        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        logger, ml_logger = init_logging()
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                                   do_lower_case=False)

        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            max_seq_len=256,
            train_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
            test_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
            data_dir="data/",
            label_list=label_list,
            metric=metric,
            text_column_name="utterance",
            label_column_name=level,  # NOTE: 'level' is not defined within this snippet
            delimiter=";")

        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        silos = DataSiloForCrossVal.make(data_silo,
                                         n_splits=folds,
                                         sets=['train', 'test'])

        # the following steps should be run for each of the folds of the cross validation, so we put them
        # into a function
        def train_on_split(silo_to_use, n_fold, save_dir):
            logger.info(
                f"############ Crossvalidation: Fold {n_fold} ############")
            # Create an AdaptiveModel
            # a) which consists of a pretrained language model as a basis
            language_model = LanguageModel.load(bert_model)
            # b) and a prediction head on top that is suited for our task => Text classification
            prediction_head = TextClassificationHead(
                class_weights=data_silo.calculate_class_weights(
                    task_name="text_classification"),
                num_labels=len(label_list))

            model = AdaptiveModel(language_model=language_model,
                                  prediction_heads=[prediction_head],
                                  embeds_dropout_prob=embeds_dropout_prob,
                                  lm_output_types=["per_sequence"],
                                  device=device)

            # Create an optimizer
            model, optimizer, lr_schedule = initialize_optimizer(
                model=model,
                learning_rate=learning_rate,
                device=device,
                n_batches=len(silo_to_use.loaders["train"]),
                n_epochs=num_epochs,
                use_amp=None)

            # Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
            # Also create an EarlyStopping instance and pass it on to the trainer

            # An early stopping instance can be used to save the model that performs best on the dev set
            # according to some metric and stop training when no improvement is happening for some iterations.
            # NOTE: Using a different save directory for each fold allows us to ensemble the
            # best models from all folds afterwards!
            save_dir = Path(str(save_dir) + f"-{n_fold}")
            earlystopping = EarlyStopping(
                metric="f1_infoneed",  # use the metric from our own metrics function instead of loss
                mode="max",
                save_dir=save_dir,  # where to save the best model
                patience=5  # number of evaluations to wait for improvement before terminating the training
            )

            trainer = Trainer(model=model,
                              optimizer=optimizer,
                              data_silo=silo_to_use,
                              epochs=num_epochs,
                              n_gpu=n_gpu,
                              lr_schedule=lr_schedule,
                              evaluate_every=100,
                              device=device,
                              early_stopping=earlystopping,
                              evaluator_test=False)

            # train it
            trainer.train()

            return trainer.model

        # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
        # on the test set of each fold
        # Remember all the results for overall metrics over all predictions of all folds and for averaging
        allresults = []
        all_preds = []
        all_labels = []
        bestfold = None
        bestf1_info_need = -1
        language_model_name = bert_model
        if language_model_name.find("/") != -1:
            language_model_name = language_model_name.replace("/", "_")
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
        )
        for num_fold, silo in enumerate(silos):
            model = train_on_split(silo, num_fold, save_dir)

            # do eval on test set here (and not in Trainer),
            #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
            evaluator_test = Evaluator(
                data_loader=silo.get_data_loader("test"),
                tasks=silo.processor.tasks,
                device=device)
            result = evaluator_test.eval(model, return_preds_and_labels=True)
            evaluator_test.log_results(result,
                                       "Test",
                                       steps=len(silo.get_data_loader("test")),
                                       num_fold=num_fold)

            allresults.append(result)
            all_preds.extend(result[0].get("preds"))
            all_labels.extend(result[0].get("labels"))

            # keep track of best fold
            f1_info_need = result[0]["f1_infoneed"]
            if f1_info_need > bestf1_info_need:
                bestf1_info_need = f1_info_need
                bestfold = num_fold

            # empty the cache to avoid memory leaks and CUDA OOM across multiple folds
            model.cpu()
            torch.cuda.empty_cache()

        # Save the per-fold results to json for a separate, more detailed analysis
        with open(
                f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json",
                "wt") as fp:
            json.dump(allresults, fp)

        # calculate overall metrics across all folds
        xval_f1_other = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 pos_label="Other")
        xval_f1_info_need = f1_score(all_labels,
                                     all_preds,
                                     labels=label_list,
                                     pos_label=current_info_need)
        xval_f1_micro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="micro")
        xval_f1_macro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="macro")
        xval_mcc = matthews_corrcoef(all_labels, all_preds)

        xval_overall_results = {
            "xval_f1_other": xval_f1_other,
            "xval_f1_infoneed": xval_f1_info_need,
            "xval_f1_micro": xval_f1_micro,
            "xval_f1_macro": xval_f1_macro,
            "xval_f1_mcc": xval_mcc
        }

        logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
        logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
        logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
        logger.info(
            f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs:   {xval_f1_info_need}"
        )
        logger.info(f"XVAL MCC: {xval_mcc}")

        # -----------------------------------------------------
        # Just for illustration, use the model from the best cross-validation fold for evaluation on
        # the original (still unseen) test set.
        logger.info(
            "###### Final Eval on hold out test set using best model #####")
        evaluator_origtest = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)
        # restore model from the best fold
        lm_name = model.language_model.name
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}"
        )
        model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        result = evaluator_origtest.eval(model)
        logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
        logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
        logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
        logger.info("TEST F1 {0}: {1}".format(current_info_need,
                                              result[0]["f1_infoneed"]))
        logger.info("TEST MCC:  {}".format(result[0]["mcc"]))

        test_set_results = {
            "test_f1_other": result[0]["f1_other"],
            "test_f1_infoneed": result[0]["f1_infoneed"],
            "test_f1_micro": result[0]["f1_micro"],
            "test_f1_macro": result[0]["f1_macro"],
            "test_f1_mcc": result[0]["mcc"]
        }
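
        # A minimal sketch for persisting the hold-out results; this assumes that ml_logger
        # returned by init_logging() above is a FARM MLFlowLogger (or exposes the same
        # log_metrics interface as used elsewhere in these examples).
        ml_logger.log_metrics(test_set_results, step=None)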
Ejemplo n.º 19
0
def outcome_pretraining(task_config,
                        model_name,
                        cache_dir,
                        run_name="0",
                        lr=1e-05,
                        warmup_steps=5000,
                        embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20,
                        grad_acc_steps=1,
                        early_stopping_metric="loss",
                        early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert",
                        tokenizer_class="BertTokenizer",
                        do_lower_case=True,
                        do_train=True,
                        do_eval=True,
                        do_hpo=False,
                        max_seq_len=512,
                        seed=11,
                        eval_every=500,
                        use_amp=False,
                        use_cuda=True,
                        ):
    # Load task config
    with open(task_config) as f:
        task_config = yaml.safe_load(f)

    data_dir = Path(task_config["data"]["data_dir"])

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name, tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = OutcomePretrainingProcessor(tokenizer=tokenizer,
                                            max_seq_len=max_seq_len,
                                            data_dir=data_dir,
                                            train_filename=task_config["data"]["train_filename"],
                                            dev_filename=task_config["data"]["dev_filename"],
                                            seed=seed,
                                            max_size_admission=50,
                                            max_size_discharge=50,
                                            cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(
        processor=processor,
        caching=True,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_multiprocessing_chunksize=200)

    if do_train:

        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # Use HPO config args if config is passed
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir
        os.makedirs(save_dir, exist_ok=True)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis

        language_model = LanguageModel.load(model_name, language_model_class=model_class)

        # b) and a NextSentenceHead prediction head, or a TextClassificationHead if it is not a BERT model
        if model_class == "Bert":
            next_sentence_head = NextSentenceHead.load(model_name)
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup",
                         "num_warmup_steps": warmup_steps}

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=save_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience
            )

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it
        # from time to time

        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval
        )

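        # When running hyper-parameter optimisation with Ray Tune, the callback reports the dev
        # score and the training loss back to Tune after each evaluation so trials can be compared and pruned.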
        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device
        )

        # Load trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results, dataset_name="test", steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")
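
# A minimal invocation sketch (hypothetical paths; the YAML must provide the keys read above,
# i.e. data.data_dir, data.train_filename, data.dev_filename, output_dir, experiment_name, log_dir):
# outcome_pretraining(task_config="configs/outcome_pretraining.yaml",
#                     model_name="bert-base-uncased",
#                     cache_dir="cache/",
#                     do_train=True,
#                     do_eval=True)
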
def doc_classification_crossvalidation():
    # the code for this function is partially taken from:
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_multilabel.py and
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_crossvalidation.py

    # for local logging:
    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="covid-document-classification",
                              run_name=RUNNAME)

    # model settings
    xval_folds = FOLDS
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    if RUNLOCAL:
        device = "cpu"
    n_epochs = NEPOCHS
    batch_size = BATCHSIZE
    evaluate_every = EVALEVERY
    lang_model = MODELTYPE
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    metric = "f1_macro"

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # The processor wants to know the possible labels ...
    label_list = LABELS
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=MAXLEN,
                                            data_dir=DATADIR,
                                            train_filename=TRAIN,
                                            test_filename=TEST,
                                            dev_split=0.1,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="Categories",
                                            # confusing parameter name: it should be called multiCLASS
                                            # not multiLABEL
                                            multilabel=True
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir, dev):
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = MultiLabelTextClassificationHead(
            # there is still an error with class weights ...
            # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=dev)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=dev,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        # unfortunately, early stopping is still not working
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5 # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model, optimizer=optimizer,
                          data_silo=silo_to_use, epochs=n_epochs,
                          n_gpu=n_gpu, lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=dev, evaluator_test=False,
                          #early_stopping=earlystopping)
                          )
        # train it
        trainer.train()
        trainer.model.save(save_dir)
        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_macro = -1
    save_dir = Path("saved_models/covid-classification-v1")

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir, device)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device,
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)

        os.makedirs(os.path.dirname(BESTMODEL + "/classification_report.txt"), exist_ok=True)
        with open(BESTMODEL + "/classification_report.txt", "a+") as file:
            file.write("Evaluation on withheld split for fold no. {} \n".format(num_fold))
            file.write(result[0]["report"])
            file.write("\n\n")

        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_macro = result[0]["f1_macro"]
        if f1_macro > bestf1_macro:
            bestf1_macro = f1_macro
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("../data/predictions/covid-classification-xval.results.json", "wt") as fp:
        json.dump(allresults, fp, cls=NumpyArrayEncoder)

    # calculate overall f1 score across all folds
    xval_f1_macro = f1_score(all_labels, all_preds, average="macro")
    ml_logger.log_metrics({"f1 macro across all folds": xval_f1_macro}, step=None)

    # test performance
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/covid-classification-v1-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    ml_logger.log_metrics({"f1 macro on final test set": result[0]["f1_macro"]}, step=None)

    with open(BESTMODEL + "/classification_report.txt", "a+") as file:
        file.write("Final result of the best model \n")
        file.write(result[0]["report"])
        file.write("\n\n")

    ml_logger.log_artifacts(BESTMODEL + "/")

    # save model for later use
    processor.save(BESTMODEL)
    model.save(BESTMODEL)
    return model
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratification = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    dev_split = 0.1
    # For xval, the dev_stratification parameter must not be None: with None, the dev set cannot be created
    # by the default method of splitting only by the available chunks, because the initial train set for
    # each fold is just a single chunk!
    dev_stratification = True
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only provides a train.tsv and a test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_split=dev_split,
        dev_stratification=dev_stratification,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo,
                                     sets=["train", "dev"],
                                     n_splits=xval_folds,
                                     stratification=xval_stratification)
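    # With stratification enabled, the folds are built so that the label distribution is roughly
    # the same in every fold, which helps with the imbalanced OFFENSE/OTHER labels.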

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} of {xval_folds} ############"
        )
        logger.info(
            f"Fold training   samples: {len(silo_to_use.data['train'])}")
        logger.info(f"Fold dev        samples: {len(silo_to_use.data['dev'])}")
        logger.info(
            f"Fold testing    samples: {len(silo_to_use.data['test'])}")
        logger.info(
            "Total number of samples: "
            f"{len(silo_to_use.data['train'])+len(silo_to_use.data['dev'])+len(silo_to_use.data['test'])}"
        )

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us to ensemble the best
        # models from all folds afterwards (see the ensembling sketch after this example)!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",  # use the metric from our own metrics function instead of loss
            mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold

    # remember all individual evaluation results
    allresults = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        mlflow.start_run(run_name=f"fold-{num_fold + 1}-of-{len(silos)}",
                         nested=True)
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold
        mlflow.end_run()
        # empty the cache to avoid memory leaks and CUDA OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # log the best fold metric and fold
    logger.info(f"Best fold f1_offense: {bestf1_offense} in fold {bestfold}")

    # calculate overall metrics across all folds: we only have one head so we do this only for the first head
    # information in each of the per-fold results

    # First create a dict where for each metric, we have a list of values from each fold
    xval_metric_lists_head0 = defaultdict(list)
    for results in allresults:
        head0results = results[0]
        for name in head0results.keys():
            if name not in ["preds", "labels"] and not name.startswith("_") and \
                    isinstance(head0results[name], numbers.Number):
                xval_metric_lists_head0[name].append(head0results[name])
    # Now calculate the mean and stdev for each metric, also copy over the task name
    xval_metric = {}
    xval_metric["task_name"] = allresults[0][0].get("task_name",
                                                    "UNKNOWN TASKNAME")
    for name in xval_metric_lists_head0.keys():
        values = xval_metric_lists_head0[name]
        vmean = statistics.mean(values)
        vstdev = statistics.stdev(values)
        xval_metric[name + "_mean"] = vmean
        xval_metric[name + "_stdev"] = vstdev
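    # NOTE: statistics.stdev needs at least two values, so this requires xval_folds >= 2.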

    logger.info(
        f"XVAL Accuracy:   mean {xval_metric['acc_mean']} stdev {xval_metric['acc_stdev']}"
    )
    logger.info(
        f"XVAL F1 MICRO:   mean {xval_metric['f1_micro_mean']} stdev {xval_metric['f1_micro_stdev']}"
    )
    logger.info(
        f"XVAL F1 MACRO:   mean {xval_metric['f1_macro_mean']} stdev {xval_metric['f1_macro_stdev']}"
    )
    logger.info(
        f"XVAL F1 OFFENSE: mean {xval_metric['f1_offense_mean']} stdev {xval_metric['f1_offense_stdev']}"
    )
    logger.info(
        f"XVAL F1 OTHER:   mean {xval_metric['f1_other_mean']} stdev {xval_metric['f1_other_stdev']}"
    )
    logger.info(
        f"XVAL MCC:        mean {xval_metric['mcc_mean']} stdev {xval_metric['mcc_stdev']}"
    )

    # -----------------------------------------------------
    # Just for illustration, use the model from the best cross-validation fold for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST Accuracy:   {result[0]['acc']}")
    logger.info(f"TEST F1 MICRO:   {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO:   {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER:   {result[0]['f1_other']}")
    logger.info(f"TEST MCC:        {result[0]['mcc']}")
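
# A minimal ensembling sketch for the per-fold models saved above. Assumptions: the save
# directories follow the "saved_models/bert-german-doc-tutorial-es-<fold>" pattern, the same
# Inferencer class used in these examples is importable here, and its results use the default
# FARM layout ({"predictions": [{"label": ...}, ...]}).
def ensemble_predict(texts, n_folds=5, base_dir="saved_models/bert-german-doc-tutorial-es"):
    from collections import Counter

    dicts = [{"text": t} for t in texts]
    per_fold_labels = []
    for fold in range(n_folds):
        inferencer = Inferencer.load(f"{base_dir}-{fold}", gpu=False)
        res = inferencer.inference_from_dicts(dicts=dicts)
        per_fold_labels.append([pred["label"] for pred in res[0]["predictions"]])
    # majority vote per document across the fold models
    return [Counter(labels).most_common(1)[0][0] for labels in zip(*per_fold_labels)]
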
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 32
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    We do not ship a sample dataset for text pair classification here; add your own dataset to run the example
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="acc",
        label_column_name="label",
        max_seq_len=64,
        train_filename="train.tsv",
        test_filename="test.tsv",
        dev_filename="dev.tsv",
        #dev_split = 0.5,
        data_dir=Path("/mnt/data/datasets/patents/patent_matching"),
        tasks={"text_classification"},
        delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = StreamingDataSilo(processor=processor, batch_size=batch_size)
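    # StreamingDataSilo streams batches lazily from disk instead of preprocessing the full
    # dataset upfront, which keeps memory usage manageable for large corpora.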

    # Old version, before StreamingDataSilo:
    #data_silo = DataSilo(
    #    processor=processor,
    #    batch_size=batch_size, max_processes=4)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=[0.56, 0.44]  # TODO: check the order of the class weights
    )

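    # NOTE: the freshly built language_model / prediction_head above are not used here; instead a
    # previously fine-tuned AdaptiveModel is restored from disk and fine-tuned further below.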
    model = AdaptiveModel.load("saved_models/text_pair_classification_model",
                               device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(model=model,
                                                         learning_rate=5e-6,
                                                         device=device,
                                                         n_batches=1466,
                                                         n_epochs=n_epochs)

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        # metric="f1_weighted", mode="max",  # alternative: use f1_weighted from the dev evaluator of the trainer
        metric="loss",
        mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/text_pair_classification_model_Tuned"),  # where to save the best model
        patience=2  # number of evaluations to wait for improvement before terminating the training
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text":
            "<claim-text>The method of claim 10, wherein the indium metal layer is 10 nm to 100 µm thick.</claim-text>",
            "text_b":
            "<p id="
            "p0001"
            " num="
            "0001"
            ">The present invention is directed to metal plating compositions and methods. More specifically, the present invention is directed to metal plating compositions and methods which provide improved leveling and throwing power.</p <p id="
            "p0039"
            " num="
            "0039"
            ">One or more conventional surfactants may be used. Typically, surfactants include, but are not limited to, nonionic surfactants such as alkyl phenoxy polyethoxyethanols. Other suitable surfactants containing multiple oxyethylene groups also may be used. Such surfactants include compounds of polyoxyethylene polymers having from as many as 20 to 150 repeating units. Such compounds also may perform as suppressors. Also included in the class of polymers are both block and random copolymers of polyoxyethylene (EO) and polyoxypropylene (PO). Surfactants may be added in conventional amounts, such as from 0.05 g/L to 20 g/L or such as from 0.5 g/L to 5 g/L.</p <p id="
            "p0040"
            " num="
            "0040"
            ">Conventional levelers include, but are not limited to, one or more of alkylated polyalkyleneimines and organic sulfo sulfonates. Examples of such compounds include, 4-mercaptopyridine, 2-mercaptothiazoline, ethylene thiourea, thiourea, 1-(2-hydroxyethyl)-2-imidazolidinethion (HIT) and alkylated polyalkyleneimines. Such levelers are included in conventional amounts. Typically, such levelers are included in amounts of 1ppb to 1 g/L, or such as from 10ppb to 500ppm.</p <p id="
            "p0042"
            " num="
            "0042"
            ">Alkali metal salts which may be included in the plating compositions include, but are not limited to, sodium and potassium salts of halogens, such as chloride, fluoride and bromide. Typically chloride is used. Such alkali metal salts are used in conventional amounts.</p <p id="
            "p0053"
            " num="
            "0053"
            ">The metal plating compositions may be used to plate a metal or metal alloy on a substrate by any method known in the art and literature. Typically, the metal or metal alloy is electroplated using conventional electroplating processes with conventional apparatus. A soluble or insoluble anode may be used with the electroplating compositions.</p <p id="
            "p0022"
            " num="
            "0022"
            ">One or more sources of metal ions are included in metal plating compositions to plate metals. The one or more sources of metal ions provide metal ions which include, but are not limited to, copper, tin, nickel, gold, silver, palladium, platinum and indium. Alloys include, but are not limited to, binary and ternary alloys of the foregoing metals. Typically, metals chosen from copper, tin, nickel, gold, silver or indium are plated with the metal plating compositions. More typically, metals chosen from copper, tin, silver or indium are plated. Most typically, copper is plated.</p <p id="
            "p0030"
            " num="
            "0030"
            ">Indium salts which may be used include, but are not limited to, one or more of indium salts of alkane sulfonic acids and aromatic sulfonic acids, such as methanesulfonic acid, ethanesulfonic acid, butane sulfonic acid, benzenesulfonic acid and toluenesulfonic acid, salts of sulfamic acid, sulfate salts, chloride and bromide salts of indium, nitrate salts, hydroxide salts, indium oxides, fluoroborate salts, indium salts of carboxylic acids, such as citric acid, acetoacetic acid, glyoxylic acid, pyruvic acid, glycolic acid, malonic acid, hydroxamic acid, iminodiacetic acid, salicylic acid, glyceric acid, succinic acid, malic acid, tartaric acid, hydroxybutyric acid, indium salts of amino acids, such as arginine, aspartic acid, asparagine, glutamic acid, glycine, glutamine, leucine, lysine, threonine, isoleucine, and valine.</p"
        },
        {
            "text":
            "<claim-text>A toner comprising: <claim-text>toner base particles; and</claim-text> <claim-text>an external additive,</claim-text> <claim-text>the toner base particles each comprising a binder resin and a colorant,</claim-text> <claim-text>wherein the external additive comprises coalesced particles,</claim-text> <claim-text>wherein the coalesced particles are each a non-spherical secondary particle in which primary particles are coalesced together, and</claim-text> <claim-text>wherein an index of a particle size distribution of the coalesced particles is expressed by the following Formula (1): <maths id="
            "math0004"
            " num="
            "(formula (1)"
            "><math display="
            "block"
            "><mfrac><msub><mi>Db</mi><mn>50</mn></msub><msub><mi>Db</mi><mn>10</mn></msub></mfrac><mo>≦</mo><mn>1.20</mn></math><img id="
            "ib0008"
            " file="
            "imgb0008.tif"
            " wi="
            "93"
            " he="
            "21"
            " img-content="
            "math"
            " img-format="
            "tif"
            "/></maths><br/> where, in a distribution diagram in which particle diameters in nm of the coalesced particles are on a horizontal axis and cumulative percentages in % by number of the coalesced particles are on a vertical axis and in which the coalesced particles are accumulated from the coalesced particles having smaller particle diameters to the coalesced particles having larger particle diameters, Db<sub>50</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 50% by number, and Db<sub>10</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 10% by number.</claim-text></claim-text>",
            "text_b":
            "<p id="
            "p0177"
            " num="
            "0177"
            ">For a similar reason, it is preferred that the electroconductive fine powder has a volume-average particle size of 0.5 - 5 µm, more preferably 0.8 - 5 µm, further preferably 1.1 - 5 µm and has a particle size distribution such that particles of 0.5 µm or smaller occupy at most 70 % by volume and particles of 5.0 µm or larger occupy at most 5 % by number.</p <p id="
            "p0189"
            " num="
            "0189"
            ">The volume-average particle size and particle size distribution of the electroconductive fine powder described herein are based on values measured in the following manner. A laser diffraction-type particle size distribution measurement apparatus ("
            "Model LS-230"
            ", available from Coulter Electronics Inc.) is equipped with a liquid module, and the measurement is performed in a particle size range of 0.04 - 2000 µm to obtain a volume-basis particle size distribution. For the measurement, a minor amount of surfactant is added to 10 cc of pure water and 10 mg of a sample electroconductive fine powder is added thereto, followed by 10 min. of dispersion by means of an ultrasonic disperser (ultrasonic homogenizer) to obtain a sample dispersion liquid, which is subjected to a single time of measurement for 90 sec.</p <p id="
            "p0191"
            " num="
            "0191"
            ">In the case where the electroconductive fine powder is composed of agglomerate particles, the particle size of the electroconductive fine powder is determined as the particle size of the agglomerate. The electroconductive fine powder in the form of agglomerated secondary particles can be used as well as that in the form of primary particles. Regardless of its agglomerated form, the electroconductive fine powder can exhibit its desired function of charging promotion by presence in the form of the agglomerate in the charging section at the contact position<!-- EPO <DP n="
            "85"
            "> --> between the charging member and the image-bearing member or in a region in proximity thereto.</p"
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
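
    # A sketch of drilling into the output (assumption: the default FARM result layout with one
    # entry per prediction head applies): the label predicted for the first text pair would be
    # result[0]["predictions"][0]["label"].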