Example #1
    def log_results(results, dataset_name, steps, logging=True, print=True):
        # Print a header
        header = "\n\n"
        header += BUSH_SEP + "\n"
        header += "***************************************************\n"
        header += f"***** EVALUATION | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n"
        header += "***************************************************\n"
        header += BUSH_SEP + "\n"
        logger.info(header)

        for head_num, head in enumerate(results):
            logger.info("\n _________ {} _________".format(head['task_name']))
            for metric_name, metric_val in head.items():
                # log with ML framework (e.g. Mlflow)
                if logging:
                    if isinstance(metric_val, numbers.Number):
                        MlLogger.log_metrics(
                            metrics={
                                f"{dataset_name}_{metric_name}_{head['task_name']}": metric_val
                            },
                            step=steps,
                        )
                # print via standard python logger
                if print:
                    if metric_name == "report":
                        if isinstance(metric_val, str) and len(metric_val) > 8000:
                            metric_val = metric_val[:7500] + "\n ............................. \n" + metric_val[-500:]
                        logger.info("{}: \n {}".format(metric_name, metric_val))
                    else:
                        logger.info("{}: {}".format(metric_name, metric_val))
Example #2
    def _calculate_statistics(self):
        self.counts = {
            "train": len(self.data["train"]),
            "dev": len(self.data["dev"]),
            "test": len(self.data.get("test", [])),
        }

        train_input_numpy = self.data["train"][:][0].numpy()
        seq_lens = np.sum(train_input_numpy != 0, axis=1)
        self.ave_len = np.mean(seq_lens)
        max_seq_len = self.data["train"][:][0].shape[1]
        self.clipped = np.mean(seq_lens == max_seq_len)

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))
        logger.info("")
        logger.info("Max sequence length:     {}".format(max(seq_lens)))
        logger.info("Average sequence length: {}".format(self.ave_len))
        logger.info("Proportion clipped:      {}".format(self.clipped))

        MlLogger.log_params({
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["train"],
            "n_samples_test": self.counts["train"],
            "ave_seq_len": self.ave_len,
            "clipped": self.clipped
        })
Example #3
    def build_task_data(params: Params, data_supplier) -> SeqTagTaskData:
        dataset_dict: Dict[str, List[TaggedSequence]] = data_supplier()
        ner_labels = ["[PAD]", NIT] + list(
            set(tag for taggedseqs in dataset_dict.values()
                for taggedseq in taggedseqs for tok, tag in taggedseq))

        ml_logger = MLFlowLogger(tracking_uri=os.environ["HOME"] +
                                 "/data/mlflow_experiments/mlruns")
        ml_logger.init_experiment(experiment_name="Sequence_Tagging",
                                  run_name="Run_ner")

        lang_model = "bert-base-cased"
        do_lower_case = False

        tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                                   do_lower_case=do_lower_case)

        processor = NERProcessor(
            tokenizer=tokenizer,
            max_seq_len=128,
            data_dir=None,  # noqa
            metric="seq_f1",
            label_list=ner_labels,
        )

        task_data = {
            "num_labels": len(ner_labels),
            "lang_model": lang_model,
            "ml_logger": ml_logger,
            "processor": processor,
            "params": params,
        }
        return SeqTagTaskData(data=dataset_dict, task_data=task_data)
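A hedged sketch of a `data_supplier` compatible with what `build_task_data` expects: a dict of dataset splits, each a list of tagged sequences of `(token, tag)` pairs, as implied by the comprehension that builds `ner_labels`. Split names, tokens, and tags below are illustrative.

# Hedged sketch of a compatible data_supplier (tokens/tags are illustrative)
def toy_data_supplier():
    return {
        "train": [
            [("John", "B-PER"), ("lives", "O"), ("in", "O"), ("Berlin", "B-LOC")],
        ],
        "dev": [
            [("Acme", "B-ORG"), ("hired", "O"), ("Mary", "B-PER")],
        ],
    }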
Example #4
    def logits_to_loss(self, logits, global_step=None, **kwargs):
        """
        Get losses from all prediction heads & reduce to single loss *per sample*.

        :param logits: logits, can vary in shape and type, depending on task
        :type logits: object
        :param global_step: number of current training step
        :type global_step: int
        :param kwargs: placeholder for passing generic parameters.
                       Note: Contains the batch (as dict of tensors), when called from Trainer.train().
        :type kwargs: object
        :return loss: torch.tensor that is the per sample loss (len: batch_size)
        """
        all_losses = self.logits_to_loss_per_head(logits, **kwargs)
        # This aggregates the loss per sample across multiple prediction heads
        # Default is sum(), but you can configure any fn that takes [Tensor, Tensor ...] and returns [Tensor]

        # Log the loss per task
        for i, per_sample_loss in enumerate(all_losses):
            task_name = self.prediction_heads[i].task_name
            task_loss = per_sample_loss.mean()
            MlLogger.log_metrics(
                {
                    f"train_loss_{task_name}":
                    float(task_loss.detach().cpu().numpy())
                },
                step=global_step)

        loss = self.loss_aggregation_fn(all_losses,
                                        global_step=global_step,
                                        batch=kwargs)
        return loss
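The comment above notes that the per-sample losses from all heads are reduced with `loss_aggregation_fn`, which defaults to a sum but can be any callable taking the list of per-head loss tensors (plus `global_step` and `batch`) and returning a single per-sample tensor. A minimal sketch of such a custom aggregation, here a fixed weighted sum; the weights and the assumption of exactly two heads are illustrative.

import torch

# Hedged sketch of a custom loss aggregation following the contract described above
def weighted_loss_aggregation(all_losses, global_step=None, batch=None):
    weights = [1.0, 0.5]  # one weight per prediction head (two heads assumed)
    weighted = [w * loss for w, loss in zip(weights, all_losses)]
    # stack to (n_heads, batch_size), then sum over heads -> per-sample loss of shape (batch_size,)
    return torch.sum(torch.stack(weighted), dim=0)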
Example #5
 def log_params(self):
     """
     Logs parameters to generic logger MlLogger
     """
     params = {
         "lm1_type": self.language_model1.__class__.__name__,
         "lm1_name": self.language_model1.name,
         "lm1_output_types": ",".join(self.lm1_output_types),
         "lm2_type": self.language_model2.__class__.__name__,
         "lm2_name": self.language_model2.name,
         "lm2_output_types": ",".join(self.lm2_output_types),
         "prediction_heads": ",".join(
             [head.__class__.__name__ for head in self.prediction_heads]),
     }
     try:
         MlLogger.log_params(params)
     except Exception as e:
         logger.warning(f"ML logging didn't work: {e}")
Example #6
def main():
    config_files = [
        "experiments/ner/conll2003_de_config.json",
        "experiments/ner/germEval14_config.json",
        "experiments/text_classification/germEval18Fine_config.json",
        "experiments/text_classification/germEval18Coarse_config.json",
        "experiments/text_classification/gnad_config.json",
        "experiments/qa/squad20_config.json",
    ]

    for conf_file in config_files:
        experiments = load_experiments(conf_file)
        for args in experiments:
            logger.info(
                "\n***********************************************"
                f"\n************* Experiment: {args.task.name} ************"
                "\n************************************************"
            )
            ml_logger = MLFlowLogger(tracking_uri=args.logging.mlflow_url)
            ml_logger.init_experiment(
                experiment_name=args.logging.mlflow_experiment,
                run_name=args.logging.mlflow_run_name,
                nested=args.logging.mlflow_nested,
            )
            run_experiment(args)
Example #7
File: train.py Project: wwmmqq/FARM
    def backward_propagate(self, loss, step):
        loss = self.adjust_loss(loss)
        if self.global_step % 10 == 1:
            MlLogger.log_metrics(
                {"Train_loss_total": float(loss.detach().cpu().numpy())},
                step=self.global_step,
            )
        if self.use_amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if self.log_learning_rate:
            MlLogger.log_metrics(
                {"learning_rate": self.lr_schedule.get_lr()[0]},
                step=self.global_step)

        if step % self.grad_acc_steps == 0:
            # TODO We might wanna add gradient clipping here
            self.optimizer.step()
            self.optimizer.zero_grad()
            if self.lr_schedule:
                self.lr_schedule.step()
        return loss
Example #8
File: train.py Project: Wkryst/FARM
 def log_params(self):
     params = {
         "epochs": self.epochs,
         "n_gpu": self.n_gpu,
         "device": self.device
     }
     MlLogger.log_params(params)
Example #9
 def log_results(results, dataset_name, steps, logging=True, print=True):
     logger.info(
         "\n***** Evaluation Results on {} data after {} steps *****".format(
             dataset_name, steps
         )
     )
     for head_num, head in enumerate(results):
         logger.info("\n _________ Prediction Head {} _________".format(head_num))
         for metric_name, metric_val in head.items():
             # log with ML framework (e.g. Mlflow)
             if logging:
                 if isinstance(metric_val, numbers.Number):
                     MlLogger.log_metrics(
                         metrics={
                             f"{dataset_name}_{metric_name}_head{head_num}": metric_val
                         },
                         step=steps,
                     )
             # print via standard python logger
             if print:
                 if metric_name == "report":
                     if isinstance(metric_val, str) and len(metric_val) > 8000:
                         metric_val = metric_val[:7500] + "\n ............................. \n" + metric_val[-500:]
                     logger.info("{}: \n {}".format(metric_name, metric_val))
                 else:
                     logger.info("{}: {}".format(metric_name, metric_val))
Example #10
    def _calculate_statistics(self):
        self.counts = {
            "train": len(self.data["train"]),
            "dev": len(self.data["dev"]),
            "test": len(self.data.get("test", [])),
        }

        train_input_numpy = self.data["train"][:][0].numpy()
        seq_lens = np.sum(train_input_numpy != 0, axis=1)
        self.ave_len = np.mean(seq_lens)
        max_seq_len = self.data["train"][:][0].shape[1]
        self.clipped = np.mean(seq_lens == max_seq_len)

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))
        logger.info("")
        logger.info("Max sequence length:     {}".format(max(seq_lens)))
        logger.info("Average sequence length: {}".format(self.ave_len))
        logger.info("Proportion clipped:      {}".format(self.clipped))
        if self.clipped > 0.5:
            logger.info(
                "[Farmer's Tip] {}% of your samples got cut down to {} tokens. "
                "Consider increasing max_seq_len. "
                "This will lead to higher memory consumption but is likely to "
                "improve your model performance".format(
                    round(self.clipped * 100, 1), max_seq_len))

        MlLogger.log_params({
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["train"],
            "n_samples_test": self.counts["train"],
            "ave_seq_len": self.ave_len,
            "clipped": self.clipped
        })
Example #11
    def backward_propagate(self, loss, step):
        loss = self.adjust_loss(loss)
        if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
            MlLogger.log_metrics(
                {"Train_loss_total": float(loss.detach().cpu().numpy())},
                step=self.global_step,
            )
            if self.log_learning_rate:
                MlLogger.log_metrics(
                    {"learning_rate": self.lr_schedule.get_last_lr()[0]},
                    step=self.global_step)
        if self.use_amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if step % self.grad_acc_steps == 0:
            if self.max_grad_norm is not None:
                if self.use_amp:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self.optimizer), self.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            if self.lr_schedule:
                self.lr_schedule.step()
        return loss
Example #12
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")
Example #13
def _get_optim(model, opts):
    """ Get the optimizer based on dictionary with options. Options are passed to the optimizer constructor.

    :param model: model to optimize
    :param opts: config dictionary that will be passed to optimizer together with the params
    (e.g. lr, weight_decay, correct_bias ...). 'no_decay' can be given - parameters containing any of those strings
    will have weight_decay set to 0.
    :return: created optimizer
    """

    optimizer_name = opts.pop('name', None)

    # Logging
    logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'")
    MlLogger.log_params(opts)
    MlLogger.log_params({"optimizer_name": optimizer_name})

    weight_decay = opts.pop('weight_decay', None)
    no_decay = opts.pop('no_decay', None)

    if no_decay:
        optimizable_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad],
             **opts},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
             'weight_decay': 0.0,
             **opts}
        ]
    else:
        optimizable_parameters = [{'params': [p for p in model.parameters() if p.requires_grad], **opts}]

    # default weight decay is not the same for all optimizers, so we can't use default value
    # only explicitly add weight decay if it's given
    if weight_decay is not None:
        optimizable_parameters[0]['weight_decay'] = weight_decay

    # Import optimizer by checking in order: torch, transformers, apex and local imports
    try:
        optim_constructor = getattr(import_module('torch.optim'), optimizer_name)
    except AttributeError:
        try:
            optim_constructor = getattr(import_module('transformers.optimization'), optimizer_name)
        except AttributeError:
            try:
                optim_constructor = getattr(import_module('apex.optimizers'), optimizer_name)
            except (AttributeError, ImportError):
                try:
                    # Workaround to allow loading AdamW from transformers
                    # pytorch > 1.2 has now also a AdamW (but without the option to set bias_correction = False,
                    # which is done in the original BERT implementation)
                    optim_constructor = getattr(sys.modules[__name__], optimizer_name)
                except (AttributeError, ImportError):
                    raise AttributeError(f"Optimizer '{optimizer_name}' not found in 'torch', 'transformers', 'apex' or 'local imports'")

    return optim_constructor(optimizable_parameters)
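A hedged usage sketch for `_get_optim`; it assumes `model` is an existing `torch.nn.Module`, and the optimizer name, hyperparameters, and `no_decay` substrings are illustrative rather than values from the source (`AdamW` resolves via `torch.optim` or `transformers.optimization`).

# Hedged usage sketch (model is assumed to exist; values are illustrative)
opts = {
    "name": "AdamW",
    "lr": 2e-5,
    "weight_decay": 0.01,
    "no_decay": ["bias", "LayerNorm.weight"],
}
optimizer = _get_optim(model, opts)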
Example #14
    def _calculate_statistics(self):
        """ Calculate and log simple summary statistics of the datasets """

        self.counts = {}

        if self.data["train"]:
            self.counts["train"] = len(self.data["train"])
        else:
            self.counts["train"] = 0

        if self.data["dev"]:
            self.counts["dev"] = len(self.data["dev"])
        else:
            self.counts["dev"] = 0

        if self.data["test"]:
            self.counts["test"] = len(self.data["test"])
        else:
            self.counts["test"] = 0

        seq_lens = []
        if self.data["train"]:
            for dataset in self.data["train"].datasets:
                train_input_numpy = dataset[:][0].numpy()
                seq_lens.extend(np.sum(train_input_numpy != self.processor.tokenizer.pad_token_id, axis=1))
            max_seq_len = dataset[:][0].shape[1]

        self.clipped = np.mean(np.array(seq_lens) == max_seq_len) if seq_lens else 0
        self.ave_len = np.mean(seq_lens) if seq_lens else 0

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))
        logger.info("")
        if self.data["train"]:
            logger.info("Longest sequence length observed after clipping:     {}".format(max(seq_lens)))
            logger.info("Average sequence length after clipping: {}".format(self.ave_len))
            logger.info("Proportion clipped:      {}".format(self.clipped))
            if self.clipped > 0.5:
                logger.info("[Farmer's Tip] {}% of your samples got cut down to {} tokens. "
                            "Consider increasing max_seq_len. "
                            "This will lead to higher memory consumption but is likely to "
                            "improve your model performance".format(round(self.clipped * 100, 1), max_seq_len))

        MlLogger.log_params(
            {
                "n_samples_train": self.counts["train"],
                "n_samples_dev": self.counts["dev"],
                "n_samples_test": self.counts["test"],
                "batch_size": self.batch_size,
                "ave_seq_len": self.ave_len,
                "clipped": self.clipped,
            }
        )
Example #15
 def _log_params(self):
     params = {
         "processor": self.__class__.__name__,
         "tokenizer": self.tokenizer.__class__.__name__,
     }
     names = ["max_seq_len", "dev_split"]
     for name in names:
         value = getattr(self, name)
         params.update({name: str(value)})
     try:
         MlLogger.log_params(params)
     except Exception as e:
         logger.warning(f"ML logging didn't work: {e}")
Example #16
def log_results(results, dataset_name, steps, logging=True, print=True, save_path=None, num_fold=None):
    logger = get_logger(__name__)

    # Print a header
    header = "\n\n"
    header += BUSH_SEP + "\n"
    header += "***************************************************\n"
    if num_fold:
        header += f"***** EVALUATION | FOLD: {num_fold} | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n"
    else:
        header += f"***** EVALUATION | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n"
    header += "***************************************************\n"
    header += BUSH_SEP + "\n"
    logger.info(header)

    save_log = header

    for head_num, head in enumerate(results):
        logger.info("\n _________ {} _________".format(head['task_name']))
        for metric_name, metric_val in head.items():
            metric_log = None

            # log with ML framework (e.g. Mlflow)
            if logging:
                if metric_name not in ["preds", "probs", "labels"] and not metric_name.startswith("_"):
                    if isinstance(metric_val, numbers.Number):
                        MlLogger.log_metrics(
                            metrics={
                                f"{dataset_name}_{metric_name}_{head['task_name']}": metric_val
                            },
                            step=steps,
                        )

            # print via standard python logger
            if print:
                if metric_name == "report":
                    if isinstance(metric_val, str) and len(metric_val) > 8000:
                        metric_val = metric_val[:7500] + "\n ............................. \n" + metric_val[-500:]
                    metric_log = "{}: \n {}".format(metric_name, metric_val)
                    logger.info(metric_log)
                else:
                    if metric_name not in ["preds", "probs", "labels"] and not metric_name.startswith("_"):
                        metric_log = "{}: {};".format(metric_name, metric_val)
                        logger.info(metric_log)

            if save_path and metric_log:
                save_log += "\n" + metric_log

    if save_path:
        with open(save_path, "w", encoding="utf-8") as log_file:
            log_file.write(save_log)
Example #17
def eval_question_similarity(y_true,
                             y_pred,
                             lang,
                             model_name,
                             params,
                             user=None,
                             log_to_mlflow=True,
                             run_name="default"):
    # basic metrics
    mean_diff = np.mean(np.abs(y_true - y_pred))
    roc_auc = roc_auc_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred.round(0))
    metrics = {"roc_auc": roc_auc, "mean_abs_diff": mean_diff, "f1_score": f1}
    print(metrics)

    # log experiment results to MLFlow (visit https://public-mlflow.deepset.ai/)
    if log_to_mlflow:
        params["lang"] = lang
        params["model_name"] = model_name
        if user:
            params["user"] = user

        ml_logger = MLFlowLogger(
            tracking_uri="https://public-mlflow.deepset.ai/")
        ml_logger.init_experiment(experiment_name="COVID-question-sim",
                                  run_name=run_name)
        ml_logger.log_params(params)
        ml_logger.log_metrics(metrics, step=0)
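A hedged usage sketch with toy labels and scores; all values are illustrative, and `log_to_mlflow=False` keeps the call local so nothing is sent to the public MLflow server.

import numpy as np

# Hedged usage sketch (toy data, MLflow logging disabled)
y_true = np.array([1, 0, 1, 1, 0])
y_pred = np.array([0.92, 0.13, 0.78, 0.41, 0.35])
eval_question_similarity(y_true, y_pred, lang="en", model_name="toy-model",
                         params={"epochs": 1}, log_to_mlflow=False)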
Example #18
 def log_params(self):
     """
     Logs parameters to generic logger MlLogger
     :return: just log into the void
     """
     params = {
         "lm": self.language_model.__class__.__name__,
         "prediction_heads": ",".join(
             [head.__class__.__name__ for head in self.prediction_heads]
         ),
         "lm_output_types": ",".join(self.lm_output_types),
     }
     try:
         MlLogger.log_params(params)
     except Exception as e:
         logger.warning(f"ML logging didn't work: {e}")
Example #19
    def _calculate_statistics(self):
        self.counts = {
            "train": len(self.data["train"]),
            "dev": len(self.data["dev"]),
            "test": len(self.data.get("test", [])),
        }

        logger.info("Examples in train: {}".format(self.counts["train"]))
        logger.info("Examples in dev  : {}".format(self.counts["dev"]))
        logger.info("Examples in test : {}".format(self.counts["test"]))

        MlLogger.log_params({
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["train"],
            "n_samples_test": self.counts["train"],
        })
Example #20
 def init_logging():
     logger = logging.getLogger(__name__)
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
         datefmt="%m/%d/%Y %H:%M:%S",
         level=logging.INFO)
     # reduce verbosity from transformers library
     logging.getLogger('transformers').setLevel(logging.WARNING)
     ml_logger = MLFlowLogger(tracking_uri="logs")
     return logger, ml_logger
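A hedged sketch of how the two returned loggers might be used together; the experiment, run, and metric names are illustrative, and everything is written to the local `logs` tracking directory configured above.

# Hedged usage sketch (experiment/run/metric names are illustrative)
logger, ml_logger = init_logging()
ml_logger.init_experiment(experiment_name="Local_Debug", run_name="Run_smoke_test")
ml_logger.log_metrics({"dummy_metric": 1.0}, step=0)
logger.info("Logged a dummy metric to the local MLflow store")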
Example #21
    def dataset_from_file(self, file, log_time=True):
        """
        Contains all the functionality to turn a data file into a PyTorch Dataset and a
        list of tensor names. This is used for training and evaluation.

        :param file: Name of the file containing the data.
        :type file: str
        :return: a Pytorch dataset and a list of tensor names.
        """
        if log_time:
            a = time.time()
            self._init_baskets_from_file(file)
            b = time.time()
            MlLogger.log_metrics(metrics={"t_from_file": (b - a) / 60}, step=0)
            self._init_samples_in_baskets()
            c = time.time()
            MlLogger.log_metrics(metrics={"t_init_samples": (c - b) / 60}, step=0)
            self._featurize_samples()
            d = time.time()
            MlLogger.log_metrics(metrics={"t_featurize_samples": (d - c) / 60}, step=0)
            self._log_samples(3)
        else:
            self._init_baskets_from_file(file)
            self._init_samples_in_baskets()
            self._featurize_samples()
            self._log_samples(3)
        dataset, tensor_names = self._create_dataset()
        return dataset, tensor_names
Example #22
def get_scheduler(optimizer, opts):
    """ Get the scheduler based on dictionary with options. Options are passed to the scheduler constructor.

    :param optimizer: optimizer whose learning rate to control
    :param opts: dictionary of args to be passed to constructor of schedule
    :return: created scheduler
    """
    schedule_name = opts.get('name')
    try:
        sched_constructor = getattr(import_module('torch.optim.lr_scheduler'),
                                    schedule_name)
    except AttributeError:
        try:
            # The method names in transformers became quite long and unhandy.
            # for convenience we offer usage of shorter alias (e.g. "LinearWarmup")
            scheduler_translations = {
                "LinearWarmup": "get_linear_schedule_with_warmup",
                "ConstantWarmup": "get_constant_schedule_with_warmup",
                "Constant": "get_constant_schedule",
                "CosineWarmup": "get_cosine_schedule_with_warmup",
                "CosineWarmupWithRestarts": "get_cosine_with_hard_restarts_schedule_with_warmup",
            }
            if schedule_name in scheduler_translations.keys():
                schedule_name = scheduler_translations[schedule_name]
            # in contrast to torch, we actually get here a method and not a class
            sched_constructor = getattr(
                import_module('transformers.optimization'), schedule_name)
        except AttributeError:
            raise AttributeError(
                f"Scheduler '{schedule_name}' not found in 'torch' or 'transformers'"
            )

    logger.info(f"Using scheduler '{schedule_name}'")

    # get supported args of constructor
    allowed_args = inspect.signature(sched_constructor).parameters.keys()

    # convert from warmup proportion to steps if required
    if 'num_warmup_steps' in allowed_args and 'num_warmup_steps' not in opts and 'warmup_proportion' in opts:
        opts['num_warmup_steps'] = int(opts["warmup_proportion"] *
                                       opts["num_training_steps"])
        MlLogger.log_params({"warmup_proportion": opts["warmup_proportion"]})

    # only pass args that are supported by the constructor
    constructor_opts = {k: v for k, v in opts.items() if k in allowed_args}

    # Logging
    logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'")
    MlLogger.log_params(constructor_opts)
    MlLogger.log_params({"schedule_name": schedule_name})

    scheduler = sched_constructor(optimizer, **constructor_opts)
    scheduler.opts = opts  # save the opts with the scheduler to use in load/save
    return scheduler
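A hedged usage sketch for `get_scheduler`; it assumes `optimizer` is an already-initialized torch optimizer, uses the `LinearWarmup` alias from the translation table above, and picks an illustrative warmup proportion and step count (the function converts the proportion into `num_warmup_steps`).

# Hedged usage sketch (optimizer is assumed to exist; numbers are illustrative)
opts = {
    "name": "LinearWarmup",
    "warmup_proportion": 0.1,
    "num_training_steps": 10000,
}
lr_schedule = get_scheduler(optimizer, opts)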
Example #23
File: train.py Project: Wkryst/FARM
    def backward_propagate(self, loss, step):
        loss = self.adjust_loss(loss)
        if self.global_step % 10 == 1:
            MlLogger.log_metrics(
                {"Train_loss_total": float(loss.detach().cpu().numpy())},
                step=self.global_step,
            )
        if self.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        if (step + 1) % self.grad_acc_steps == 0:
            if self.fp16:
                # modify learning rate with special warm up BERT uses
                # if args.fp16 is False, BertAdam is used that handles this automatically
                lr_this_step = self.learning_rate * self.warmup_linear.get_lr(
                    self.global_step, self.warmup_proportion)
                for param_group in self.optimizer.param_groups:
                    param_group["lr"] = lr_this_step
                # MlLogger.write_metrics({"learning_rate": lr_this_step}, step=self.global_step)
            self.optimizer.step()
            self.optimizer.zero_grad()
Example #24
def doc_classification(
    task_config,
    model_name_or_path,
    cache_dir,
    data_dir,
    save_dir,
    model_dir,
    run_name="0",
    lr=1e-05,
    warmup_steps=5000,
    balance_classes=True,
    embeds_dropout=0.1,
    epochs=200,  # large because we use early stopping by default
    batch_size=20,
    grad_acc_steps=1,
    early_stopping_metric="roc_auc",
    early_stopping_mode="max",
    early_stopping_patience=10,
    model_class="Bert",
    tokenizer_class="BertTokenizer",
    do_lower_case=False,
    do_train=True,
    do_eval=True,
    do_hpo=False,
    print_preds=False,
    print_dev_preds=False,
    max_seq_len=512,
    seed=11,
    eval_every=500,
    use_amp=False,
    use_cuda=True,
):
    # Load task config
    task_config = yaml.safe_load(open(task_config))

    data_dir = data_dir
    save_dir = save_dir
    model_dir = model_dir

    # Create label list from args list or (for large label lists) create from file by splitting by space
    if isinstance(task_config["data"]["label_list"], list):
        label_list = task_config["data"]["label_list"]
    else:
        with open(data_dir / 'labels' /
                  task_config["data"]["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        dev_split=task_config["data"]["dev_split"]
        if "dev_split" in task_config["data"] else None,
        test_filename=task_config["data"]["test_filename"],
        delimiter=task_config["data"]["parsing"]["delimiter"],
        quote_char=task_config["data"]["parsing"]["quote_char"],
        label_column_name=task_config["data"]["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task

        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:

            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it
        # from time to time

        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(model_dir + "/final_model")
        processor.save(model_dir + "/final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = model_dir
        else:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir + "/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model,
                                             return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=model_dir + "/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
Example #25
def finetune_sentence_level(args):
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s  %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO)
    args.logger = logging.getLogger(__name__)
    if args.do_logfile:
        filehandler = logging.FileHandler(
            os.path.join(args.log_dir, f"{args.run_name}.log"))
        args.logger.addHandler(filehandler)
    args.logger.info(vars(args))
    # Setup MLFlow
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name=args.experiment_name,
                              run_name=args.run_name)
    set_all_seeds(seed=args.seed)
    args.device, args.n_gpu = initialize_device_settings(use_cuda=True)
    # Create a tokenizer
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    tokenizer = CustomTokenizer.load(
        pretrained_model_name_or_path=args.model_name,
        do_lower_case=args.do_lower_case,
        tokenizer_class=tok_class)
    # Create a processor for the dataset
    processor = load_processor(args, tokenizer)
    # Create a DataSilo that loads several datasets (train/dev/test)
    # provides DataLoaders and calculates descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=args.batch_size)
    if args.do_feat_embeds:
        args.feat_size = processor.feat_size
    # We do cross-validation
    if args.folds > 1:
        evaluate_kfold(args, data_silo, processor)
    else:
        adapt_model = train_on_split(args, data_silo, processor)
        evaluator_test = MultitaskEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=args.device)
        result = evaluator_test.eval(adapt_model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(
                                       data_silo.get_data_loader("test")))
        pred_tsv = pd.DataFrame()
        args.logger.info("Test results:")
        for res in result[1:]:
            args.logger.info(f"__{res['task_name']}__")
            if args.train_mode == "classification":
                metrics = classification_metrics(res.get("preds"),
                                                 res.get("labels"))
                args.logger.info(metrics)
            else:
                metrics = regression_metrics(res.get("preds"),
                                             res.get("labels"))
                for metric in metrics.keys():
                    args.logger.info(f"{metric}: {metrics[metric]}")
            if args.save_predictions:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")[0]
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")[0]
        if args.save_predictions:
            save_tsv(pred_tsv,
                     os.path.join(args.out_dir, f"{args.run_name}.tsv"))
        # Load trained model and perform inference
        dicts = [
            {
                "text":
                "The intense interest aroused in the public has now somewhat subsided."
            },
            {
                "text": "The quick brown fox jumped over the lazy dog."
            },
        ]
        model = MultitaskInferencer.load(args.save_dir,
                                         gpu=True,
                                         level="sentence")
        result = model.inference_from_dicts(dicts=dicts)
        args.logger.info("Inference example:")
        args.logger.info(result)
Example #26
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",   # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
Example #27
import logging

from farm.data_handler.processor import RegressionProcessor
from farm.experiment import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import RegressionHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM",
                          run_name="Run_doc_regression")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 32
evaluate_every = 30
lang_model = "bert-base-cased"

# 1.Create a tokenizer
tokenizer = Tokenizer.from_pretrained(pretrained_model_name_or_path=lang_model,
Example #28
File: ner.py Project: tholor/FARM
import logging

from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import Bert
from farm.modeling.prediction_head import TokenClassificationHead
from farm.modeling.tokenization import BertTokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM",
                          run_name="Run_minimal_example_ner")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 4
batch_size = 32
evaluate_every = 50
lang_model = "bert-base-german-cased"

# 1.Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(
Example #29
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Question Answering
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    QA_input = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)[0]

    pprint.pprint(result)

    # 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk
    filename = os.path.join(processor.data_dir, processor.dev_filename)
    result = model.inference_from_file(file=filename, return_json=False)
    result_squad = [x.to_squad_eval() for x in result]

    write_squad_predictions(predictions=result_squad,
                            predictions_filename=filename,
                            out_filename="predictions.json")
Example #30
def question_answering_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    #ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    #ml_logger.init_experiment(experiment_name="QA_X-Validation", run_name="Squad_Roberta_Base")

    ##########################
    ########## Settings
    ##########################
    save_per_fold_results = False  # unsupported for now
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    n_epochs = 2
    batch_size = 80
    learning_rate = 3e-5

    data_dir = Path("../data/covidqa")
    filename = "COVID-QA.json"
    xval_folds = 5
    dev_split = 0
    evaluate_every = 0
    no_ans_boost = -100  # use large negative values to disable giving "no answer" option
    accuracy_at = 3  # accuracy at n is useful for answers inside long documents
    use_amp = None

    ##########################
    ########## k fold Cross validation
    ##########################

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=filename,
        dev_filename=None,
        dev_split=dev_split,
        test_filename=None,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")

        # fine-tune pre-trained question-answering model
        model = AdaptiveModel.convert_from_transformers(
            lang_model, device=device, task_type="question_answering")
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)
        # If positive, this will boost "No Answer" as prediction.
        # If negative, this will prevent the model from giving "No Answer" as prediction.
        model.prediction_heads[0].no_ans_boost = no_ans_boost
        # Number of predictions the model will make per Question.
        # The multiple predictions are used for evaluating top n recall.
        model.prediction_heads[0].n_best = accuracy_at

        # # or train question-answering models from scratch
        # # Create an AdaptiveModel
        # # a) which consists of a pretrained language model as a basis
        # language_model = LanguageModel.load(lang_model)
        # # b) and a prediction head on top that is suited for our task => Question-answering
        # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=accuracy_at)
        # model = AdaptiveModel(
        #    language_model=language_model,
        #    prediction_heads=[prediction_head],
        #    embeds_dropout_prob=0.1,
        #    lm_output_types=["per_token"],
        #    device=device,)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, then evaluate the model on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    all_results = []
    all_preds = []
    all_labels = []
    all_f1 = []
    all_em = []
    all_topnaccuracy = []

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   logging=False,
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        all_results.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))
        all_f1.append(result[0]["f1"])
        all_em.append(result[0]["EM"])
        all_topnaccuracy.append(result[0]["top_n_accuracy"])

        # empty cache to avoid memory leak and cuda OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    # TODO currently not supported - adjust to QAPred and QACandidate objects
    # if save_per_fold_results:
    #     def convert_numpy_dtype(obj):
    #         if type(obj).__module__ == "numpy":
    #             return obj.item()
    #
    #         raise TypeError("Unknown type:", type(obj))
    #
    #     with open("qa_xval.results.json", "wt") as fp:
    #          json.dump(all_results, fp, default=convert_numpy_dtype)

    # calculate overall metrics across all folds
    xval_score = squad(preds=all_preds, labels=all_labels)

    logger.info(f"Single EM-Scores:   {all_em}")
    logger.info(f"Single F1-Scores:   {all_f1}")
    logger.info(
        f"Single top_{accuracy_at}_accuracy Scores:   {all_topnaccuracy}")
    logger.info(f"XVAL EM:   {xval_score['EM']}")
    logger.info(f"XVAL f1:   {xval_score['f1']}")
    logger.info(
        f"XVAL top_{accuracy_at}_accuracy:   {xval_score['top_n_accuracy']}")
    ml_logger.log_metrics({"XVAL EM": xval_score["EM"]}, 0)
    ml_logger.log_metrics({"XVAL f1": xval_score["f1"]}, 0)
    ml_logger.log_metrics(
        {f"XVAL top_{accuracy_at}_accuracy": xval_score["top_n_accuracy"]}, 0)