Example #1
    def eval_on_file(self, data_dir: str, test_filename: str, device: str):
        """
        Performs evaluation on a SQuAD-formatted file.
        Returns a dict containing the following metrics:
            - "EM": exact match score
            - "f1": F1-Score
            - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer

        :param data_dir: The directory in which the test set can be found
        :type data_dir: Path or str
        :param test_filename: The name of the file containing the test data in SQuAD format.
        :type test_filename: str
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :type device: str
        """
        eval_processor = SquadProcessor(
            tokenizer=self.inferencer.processor.tokenizer,
            max_seq_len=self.inferencer.processor.max_seq_len,
            label_list=self.inferencer.processor.tasks["question_answering"]["label_list"],
            metric=self.inferencer.processor.tasks["question_answering"]["metric"],
            train_filename=None,
            dev_filename=None,
            dev_split=0,
            test_filename=test_filename,
            data_dir=Path(data_dir),
        )

        data_silo = DataSilo(processor=eval_processor,
                             batch_size=self.inferencer.batch_size,
                             distributed=False)
        data_loader = data_silo.get_data_loader("test")

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=eval_processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_accuracy": eval_results[0]["top_n_accuracy"]
        }
        return results
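# A minimal usage sketch (not part of the original example): `reader` is assumed to be an
# instance of the class that defines eval_on_file above (a FARMReader-style wrapper holding
# an `inferencer`); the directory and filename are hypothetical placeholders.
metrics = reader.eval_on_file(data_dir="data/squad20",
                              test_filename="dev-v2.0.json",
                              device="cuda")
print(f"EM: {metrics['EM']:.4f}  F1: {metrics['f1']:.4f}  top_n_accuracy: {metrics['top_n_accuracy']:.4f}")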
Example #2
def setup_evaluator(dataset_name, data_silo, device):
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader(dataset_name),
        label_maps=data_silo.processor.label_maps,
        device=device,
        metrics=data_silo.processor.metrics,
        classification_report=False,
    )
    return evaluator
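# A minimal usage sketch (not part of the original example): builds an Evaluator for the dev
# split with the helper above and runs it on an already trained `model`; `data_silo`, `model`
# and `device` are assumed to exist as in the surrounding examples.
evaluator_dev = setup_evaluator("dev", data_silo, device)
dev_results = evaluator_dev.eval(model)
print(dev_results)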
Example #3
    def perform_fine_tuning(current_info_need,
                            bert_model,
                            label_list,
                            num_epochs,
                            condition,
                            folds=10,
                            stratified=True,
                            learning_rate=2e-5,
                            batch_size=32,
                            embeds_dropout_prob=.1):

        ## Define evaluation metrics ##
        def evaluation_metrics(preds, labels):
            acc = simple_accuracy(preds, labels).get("acc")
            f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
            f1infoneed = f1_score(y_true=labels,
                                  y_pred=preds,
                                  pos_label=current_info_need)
            recall_infoneed = recall_score(y_true=labels,
                                           y_pred=preds,
                                           pos_label=current_info_need)
            precision_infoneed = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 pos_label=current_info_need)
            recall_other = recall_score(y_true=labels,
                                        y_pred=preds,
                                        pos_label="Other")
            precision_other = precision_score(y_true=labels,
                                              y_pred=preds,
                                              pos_label="Other")
            recall_macro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="macro")
            precision_macro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="macro")
            recall_micro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="micro")
            precision_micro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="micro")
            recall_weighted = recall_score(y_true=labels,
                                           y_pred=preds,
                                           average="weighted")
            precision_weighted = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 average="weighted")
            f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
            f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
            mcc = matthews_corrcoef(labels, preds)
            f1weighted = f1_score(y_true=labels,
                                  y_pred=preds,
                                  average="weighted")

            return {
                "info_need": current_info_need,
                "model": bert_model,
                "num_epochs": num_epochs,
                "condition": condition,
                "acc": acc,
                "f1_other": f1other,
                "f1_infoneed": f1infoneed,
                "precision_infoneed": precision_infoneed,
                "recall_infoneed": recall_infoneed,
                "recall_other": recall_other,
                "precision_other": precision_other,
                "recall_macro": recall_macro,
                "precision_macro": precision_macro,
                "recall_micro": recall_micro,
                "precision_micro": precision_micro,
                "recall_weighted": recall_weighted,
                "precision_weighted": precision_weighted,
                "f1_weighted": f1weighted,
                "f1_macro": f1macro,
                "f1_micro": f1micro,
                "f1_weighted": f1weighted,
                "mcc": mcc
            }

        metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'
        register_metrics(metric, evaluation_metrics)
        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        logger, ml_logger = init_logging()
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                                   do_lower_case=False)

        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            max_seq_len=256,
            train_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
            test_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
            data_dir="data/",
            label_list=label_list,
            metric=metric,
            text_column_name="utterance",
            label_column_name=level,  # NOTE: `level` is assumed to be defined in the enclosing scope
            delimiter=";")

        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        silos = DataSiloForCrossVal.make(data_silo,
                                         n_splits=folds,
                                         sets=['train', 'test'])

        # the following steps should be run for each of the folds of the cross validation, so we put them
        # into a function
        def train_on_split(silo_to_use, n_fold, save_dir):
            logger.info(
                f"############ Crossvalidation: Fold {n_fold} ############")
            # Create an AdaptiveModel
            # a) which consists of a pretrained language model as a basis
            language_model = LanguageModel.load(bert_model)
            # b) and a prediction head on top that is suited for our task => Text classification
            prediction_head = TextClassificationHead(
                class_weights=data_silo.calculate_class_weights(
                    task_name="text_classification"),
                num_labels=len(label_list))

            model = AdaptiveModel(language_model=language_model,
                                  prediction_heads=[prediction_head],
                                  embeds_dropout_prob=embeds_dropout_prob,
                                  lm_output_types=["per_sequence"],
                                  device=device)

            # Create an optimizer
            model, optimizer, lr_schedule = initialize_optimizer(
                model=model,
                learning_rate=learning_rate,
                device=device,
                n_batches=len(silo_to_use.loaders["train"]),
                n_epochs=num_epochs,
                use_amp=None)

            # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
            # Also create an EarlyStopping instance and pass it on to the trainer

            # An early stopping instance can be used to save the model that performs best on the dev set
            # according to some metric and stop training when no improvement is happening for some iterations.
            # NOTE: Using a different save directory for each fold, allows us afterwards to use the
            # nfolds best models in an ensemble!
            save_dir = Path(str(save_dir) + f"-{n_fold}")
            earlystopping = EarlyStopping(
                metric="f1_infoneed",
                mode="max",  # use the metric from our own metrics function instead of loss
                save_dir=save_dir,  # where to save the best model
                patience=5  # number of evaluations to wait for improvement before terminating the training
            )

            trainer = Trainer(model=model,
                              optimizer=optimizer,
                              data_silo=silo_to_use,
                              epochs=num_epochs,
                              n_gpu=n_gpu,
                              lr_schedule=lr_schedule,
                              evaluate_every=100,
                              device=device,
                              early_stopping=earlystopping,
                              evaluator_test=False)

            # train it
            trainer.train()

            return trainer.model

        # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
        # on the test set of each fold
        # Remember all the results for overall metrics over all predictions of all folds and for averaging
        allresults = []
        all_preds = []
        all_labels = []
        bestfold = None
        bestf1_info_need = -1
        language_model_name = bert_model
        if language_model_name.find("/") != -1:
            language_model_name = language_model_name.replace("/", "_")
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
        )
        for num_fold, silo in enumerate(silos):
            model = train_on_split(silo, num_fold, save_dir)

            # do eval on test set here (and not in Trainer),
            #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
            evaluator_test = Evaluator(
                data_loader=silo.get_data_loader("test"),
                tasks=silo.processor.tasks,
                device=device)
            result = evaluator_test.eval(model, return_preds_and_labels=True)
            evaluator_test.log_results(result,
                                       "Test",
                                       steps=len(silo.get_data_loader("test")),
                                       num_fold=num_fold)

            allresults.append(result)
            all_preds.extend(result[0].get("preds"))
            all_labels.extend(result[0].get("labels"))

            # keep track of best fold
            f1_info_need = result[0]["f1_infoneed"]
            if f1_info_need > bestf1_info_need:
                bestf1_info_need = f1_info_need
                bestfold = num_fold

            # empty the cache to avoid memory leaks and CUDA OOM across multiple folds
            model.cpu()
            torch.cuda.empty_cache()

        # Save the per-fold results to json for a separate, more detailed analysis
        with open(
                f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json",
                "wt") as fp:
            json.dump(allresults, fp)

        # calculate overall metrics across all folds
        xval_f1_other = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 pos_label="Other")
        xval_f1_info_need = f1_score(all_labels,
                                     all_preds,
                                     labels=label_list,
                                     pos_label=current_info_need)
        xval_f1_micro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="micro")
        xval_f1_macro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="macro")
        xval_mcc = matthews_corrcoef(all_labels, all_preds)

        xval_overall_results = {
            "xval_f1_other": xval_f1_other,
            f"xval_f1_infoneed": xval_f1_info_need,
            "xval_f1_micro": xval_f1_micro,
            "xval_f1_macro": xval_f1_macro,
            "xval_f1_mcc": xval_mcc
        }

        logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
        logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
        logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
        logger.info(
            f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs:   {xval_f1_info_need}"
        )
        logger.info(f"XVAL MCC: {xval_mcc}")

        # -----------------------------------------------------
        # Just for illustration, use the best model from the best xval val for evaluation on
        # the original (still unseen) test set.
        logger.info(
            "###### Final Eval on hold out test set using best model #####")
        evaluator_origtest = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)
        # restore model from the best fold
        lm_name = model.language_model.name
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}"
        )
        model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        result = evaluator_origtest.eval(model)
        logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
        logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
        logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
        logger.info("TEST F1 {0}: {1}".format(current_info_need,
                                              result[0]["f1_infoneed"]))
        logger.info("TEST MCC:  {}".format(result[0]["mcc"]))

        test_set_results = {
            "test_f1_other": result[0]["f1_other"],
            "test_f1_infoneed": result[0][f"f1_infoneed"],
            "test_f1_micro": result[0]["f1_micro"],
            "test_f1_macro": result[0]["f1_macro"],
            "test_f1_mcc": result[0]["mcc"]
        }
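# A hedged call sketch (not part of the original example), treating perform_fine_tuning as a
# standalone function; the information need, model name, label list and condition below are
# hypothetical placeholders chosen only to illustrate the signature.
perform_fine_tuning(current_info_need="RecipeSteps",
                    bert_model="bert-base-german-cased",
                    label_list=["Other", "RecipeSteps"],
                    num_epochs=3,
                    condition="A",
                    folds=10)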
Example #4
# for each fold, run the whole training, earlystopping to get a model, then evaluate the model
# on the test set of each fold
# Remember all the results for overall metrics over all predictions of all folds and for averaging
allresults = []
all_preds = []
all_labels = []
bestfold = None
bestf1_offense = -1
save_dir = "saved_models/bert-german-doc-tutorial-es"
for num_fold, silo in enumerate(silos):
    model = train_on_split(silo, num_fold, save_dir)

    # do eval on test set here (and not in Trainer),
    #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
    evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                               tasks=silo.processor.tasks,
                               device=device)
    result = evaluator_test.eval(model, return_preds_and_labels=True)
    evaluator_test.log_results(result,
                               "Test",
                               steps=len(silo.get_data_loader("test")),
                               num_fold=num_fold)

    allresults.append(result)
    all_preds.extend(result[0].get("preds"))
    all_labels.extend(result[0].get("labels"))

    # keep track of best fold
    f1_offense = result[0]["f1_offense"]
    if f1_offense > bestf1_offense:
        bestf1_offense = f1_offense
        bestfold = num_fold
Example #5
def outcome_pretraining(task_config,
                        model_name,
                        cache_dir,
                        run_name="0",
                        lr=1e-05,
                        warmup_steps=5000,
                        embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20,
                        grad_acc_steps=1,
                        early_stopping_metric="loss",
                        early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert",
                        tokenizer_class="BertTokenizer",
                        do_lower_case=True,
                        do_train=True,
                        do_eval=True,
                        do_hpo=False,
                        max_seq_len=512,
                        seed=11,
                        eval_every=500,
                        use_amp=False,
                        use_cuda=True,
                        ):
    # Load task config
    task_config = yaml.safe_load(open(task_config))

    data_dir = Path(task_config["data"]["data_dir"])

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name, tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = OutcomePretrainingProcessor(tokenizer=tokenizer,
                                            max_seq_len=max_seq_len,
                                            data_dir=data_dir,
                                            train_filename=task_config["data"]["train_filename"],
                                            dev_filename=task_config["data"]["dev_filename"],
                                            seed=seed,
                                            max_size_admission=50,
                                            max_size_discharge=50,
                                            cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(
        processor=processor,
        caching=True,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_multiprocessing_chunksize=200)

    if do_train:

        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # Use HPO config args if config is passed
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis

        language_model = LanguageModel.load(model_name, language_model_class=model_class)

        # b) and NextSentenceHead prediction head or TextClassificationHead if it's not a Bert Model
        if model_class == "Bert":
            next_sentence_head = NextSentenceHead.load(model_class)
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup",
                         "num_warmup_steps": warmup_steps}

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=save_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience
            )

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it
        # from time to time

        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval
        )

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device
        )

        # Load trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results, dataset_name="test", steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")
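# A minimal sketch (not part of the original example) of the YAML task config that
# outcome_pretraining() above reads: only the keys the function actually accesses are shown
# (experiment_name, output_dir, log_dir, data.data_dir, data.train_filename, data.dev_filename);
# every path and filename here is a hypothetical placeholder.
import yaml

example_task_config = {
    "experiment_name": "outcome_pretraining_demo",
    "output_dir": "saved_models",
    "log_dir": "logs",
    "data": {
        "data_dir": "data/outcome_pretraining",
        "train_filename": "train.jsonl",
        "dev_filename": "dev.jsonl",
    },
}
with open("example_task_config.yml", "w") as f:
    yaml.safe_dump(example_task_config, f)

outcome_pretraining(task_config="example_task_config.yml",
                    model_name="bert-base-uncased",
                    cache_dir="cache/")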
Example #6
    def eval(
        self,
        document_store: BaseDocumentStore,
        device: str,
        label_index: str = "label",
        doc_index: str = "eval_document",
        label_origin: str = "gold_label",
    ):
        """
        Performs evaluation on evaluation documents in the DocumentStore.

        Returns a dict containing the following metrics:
            - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
            - "f1": Average overlap between predicted answers and their corresponding correct answers
            - "top_n_accuracy": Proportion of predicted answers that match with correct answer

        :param document_store: DocumentStore containing the evaluation documents
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :param label_index: Index/Table name where labeled questions are stored
        :param doc_index: Index/Table name where documents that are used for evaluation are stored
        :param label_origin: Origin of the labels to evaluate on; only labels whose "origin" field matches this value are retrieved
        """

        # extract all questions for evaluation
        filters = {"origin": [label_origin]}

        labels = document_store.get_all_labels(index=label_index,
                                               filters=filters)

        # Aggregate all answer labels per question
        aggregated_per_doc = defaultdict(list)
        for label in labels:
            if not label.document_id:
                logger.error(f"Label does not contain a document_id")
                continue
            aggregated_per_doc[label.document_id].append(label)

        # Create squad style dicts
        d: Dict[str, Any] = {}
        for doc_id in aggregated_per_doc.keys():
            doc = document_store.get_document_by_id(doc_id, index=doc_index)
            if not doc:
                logger.error(
                    f"Document with the ID '{doc_id}' is not present in the document store."
                )
                continue
            d[str(doc_id)] = {"context": doc.text}
            # get all questions / answers
            aggregated_per_question: Dict[str, Any] = defaultdict(list)
            for label in aggregated_per_doc[doc_id]:
                # add to existing answers
                if label.question in aggregated_per_question.keys():
                    aggregated_per_question[label.question]["answers"].append({
                        "text":
                        label.answer,
                        "answer_start":
                        label.offset_start_in_doc
                    })
                # create new one
                else:
                    aggregated_per_question[label.question] = {
                        "id":
                        str(hash(str(doc_id) + label.question)),
                        "question":
                        label.question,
                        "answers": [{
                            "text": label.answer,
                            "answer_start": label.offset_start_in_doc
                        }]
                    }
            # Get rid of the question key again (after we aggregated we don't need it anymore)
            d[str(doc_id)]["qas"] = [
                v for v in aggregated_per_question.values()
            ]

        # Convert input format for FARM
        farm_input = [v for v in d.values()]

        # Create DataLoader that can be passed to the Evaluator
        indices = range(len(farm_input))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
            farm_input, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset,
                                      batch_size=self.inferencer.batch_size,
                                      tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=self.inferencer.processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_accuracy": eval_results[0]["top_n_accuracy"]
        }
        return results
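# For reference (not part of the original example): each entry of `farm_input` built above has
# this SQuAD-style shape; the context, question, answer and id below are illustrative
# placeholders only.
example_farm_input_entry = {
    "context": "Berlin is the capital of Germany.",
    "qas": [{
        "id": "1234567890",  # str(hash(str(doc_id) + label.question)) in the code above
        "question": "What is the capital of Germany?",
        "answers": [{"text": "Berlin", "answer_start": 0}],
    }],
}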
Example #7
def question_answering_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    #ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    #ml_logger.init_experiment(experiment_name="QA_X-Validation", run_name="Squad_Roberta_Base")

    ##########################
    ########## Settings
    ##########################
    save_per_fold_results = False  # unsupported for now
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    n_epochs = 2
    batch_size = 80
    learning_rate = 3e-5

    data_dir = Path("../data/covidqa")
    filename = "COVID-QA.json"
    xval_folds = 5
    dev_split = 0
    evaluate_every = 0
    no_ans_boost = -100  # use large negative values to disable giving "no answer" option
    accuracy_at = 3  # accuracy at n is useful for answers inside long documents
    use_amp = None

    ##########################
    ########## k fold Cross validation
    ##########################

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=filename,
        dev_filename=None,
        dev_split=dev_split,
        test_filename=None,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")

        # fine-tune pre-trained question-answering model
        model = AdaptiveModel.convert_from_transformers(
            lang_model, device=device, task_type="question_answering")
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)
        # If positive, this will boost "No Answer" as prediction.
        # If negative, this will prevent the model from giving "No Answer" as prediction.
        model.prediction_heads[0].no_ans_boost = no_ans_boost
        # Number of predictions the model will make per Question.
        # The multiple predictions are used for evaluating top n recall.
        model.prediction_heads[0].n_best = accuracy_at

        # # or train question-answering models from scratch
        # # Create an AdaptiveModel
        # # a) which consists of a pretrained language model as a basis
        # language_model = LanguageModel.load(lang_model)
        # # b) and a prediction head on top that is suited for our task => Question-answering
        # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=accuracy_at)
        # model = AdaptiveModel(
        #    language_model=language_model,
        #    prediction_heads=[prediction_head],
        #    embeds_dropout_prob=0.1,
        #    lm_output_types=["per_token"],
        #    device=device,)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, then evaluate the model on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    all_results = []
    all_preds = []
    all_labels = []
    all_f1 = []
    all_em = []
    all_topnaccuracy = []

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold)

        # do eval on test set here (and not in Trainer),
        # so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   logging=False,
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        all_results.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))
        all_f1.append(result[0]["f1"])
        all_em.append(result[0]["EM"])
        all_topnaccuracy.append(result[0]["top_n_accuracy"])

        # empty the cache to avoid memory leaks and CUDA OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    # TODO currently not supported - adjust to QAPred and QACandidate objects
    # if save_per_fold_results:
    #     def convert_numpy_dtype(obj):
    #         if type(obj).__module__ == "numpy":
    #             return obj.item()
    #
    #         raise TypeError("Unknown type:", type(obj))
    #
    #     with open("qa_xval.results.json", "wt") as fp:
    #          json.dump(all_results, fp, default=convert_numpy_dtype)

    # calculate overall metrics across all folds
    xval_score = squad(preds=all_preds, labels=all_labels)

    logger.info(f"Single EM-Scores:   {all_em}")
    logger.info(f"Single F1-Scores:   {all_f1}")
    logger.info(
        f"Single top_{accuracy_at}_accuracy Scores:   {all_topnaccuracy}")
    logger.info(f"XVAL EM:   {xval_score['EM']}")
    logger.info(f"XVAL f1:   {xval_score['f1']}")
    logger.info(
        f"XVAL top_{accuracy_at}_accuracy:   {xval_score['top_n_accuracy']}")
    ml_logger.log_metrics({"XVAL EM": xval_score["EM"]}, 0)
    ml_logger.log_metrics({"XVAL f1": xval_score["f1"]}, 0)
    ml_logger.log_metrics(
        {f"XVAL top_{accuracy_at}_accuracy": xval_score["top_n_accuracy"]}, 0)
Example #8
def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = True

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=50)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 77.7478
    gold_f1 = 82.1557
    gold_tnrecall = 84.0646  # top 1 recall
    gold_elapsed = 70  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnrecall,
            gold_tnrecall,
            rtol=0.001,
            err_msg=f"FARM Eval changed for top 1 recall by: {tnrecall - gold_tnrecall}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=50,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename)

    elapsed = time() - starttime

    os.makedirs("../testsave", exist_ok=True)
    write_squad_predictions(predictions=result,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 78.4890
    gold_f1 = 81.7104
    gold_elapsed = 66  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
Example #9
    def train(self):
        """
        Perform the training procedure.

        The training is visualized by a progress bar. It counts the epochs in a zero based manner.
        For example, when you specify ``epochs=20`` it starts to count from 0 to 19.

        If trainer evaluates the model with a test set the result of the
        evaluation is stored in ``test_result``.

        :return: Returns the model after training. When you do ``early_stopping``
            with a ``save_dir`` the best model is loaded and returned.
        """

        # connect the prediction heads with the right output from processor
        self.model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                                require_labels=True)
        # Check that the tokenizer(s) fits the language model(s)
        if hasattr(self.model, "language_model2"):
            self.model.verify_vocab_size(
                vocab_size1=len(self.data_silo.processor.query_tokenizer),
                vocab_size2=len(self.data_silo.processor.passage_tokenizer))
        else:
            self.model.verify_vocab_size(
                vocab_size=len(self.data_silo.processor.tokenizer))
        self.model.train()

        do_stopping = False
        evalnr = 0
        loss = 0
        resume_from_step = self.from_step

        if self.local_rank in [0, -1]:
            logger.info(f"\n {GROWING_TREE}")

        for epoch in range(self.from_epoch, self.epochs):
            early_break = False
            self.from_epoch = epoch
            train_data_loader = self.data_silo.get_data_loader("train")
            progress_bar = tqdm(
                train_data_loader,
                disable=self.local_rank not in [0, -1] or self.disable_tqdm)
            for step, batch in enumerate(progress_bar):
                # when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
                if resume_from_step and step <= resume_from_step:
                    # TODO: Improve skipping for StreamingDataSilo
                    # The seeds before and within the loop are currently needed, if you need full reproducibility
                    # of runs with vs. without checkpointing using StreamingDataSilo. Reason: While skipping steps in StreamingDataSilo,
                    # we update the state of the random number generator (e.g. due to masking words), which can impact the model behaviour (e.g. dropout)
                    if step % 10000 == 0:
                        logger.info(
                            f"Skipping {step} out of {resume_from_step} steps ..."
                        )
                    if resume_from_step == step:
                        logger.info(
                            f"Finished skipping {resume_from_step} steps ...")
                        resume_from_step = None
                    else:
                        continue

                progress_bar.set_description(
                    f"Train epoch {epoch}/{self.epochs-1} (Cur. train loss: {loss:.4f})"
                )

                # Only for distributed training: we need to ensure that all ranks still have a batch left for training
                if self.local_rank != -1:
                    if not self._all_ranks_have_data(has_data=1, step=step):
                        early_break = True
                        break

                # Move batch of samples to device
                batch = {key: batch[key].to(self.device) for key in batch}
                # Forward & backward pass through model
                logits = self.model.forward(**batch)
                per_sample_loss = self.model.logits_to_loss(
                    logits=logits, global_step=self.global_step, **batch)
                loss = self.backward_propagate(per_sample_loss, step)

                # Perform evaluation
                if self.evaluate_every != 0 \
                        and self.global_step % self.evaluate_every == 0 \
                        and self.global_step != 0\
                        and self.local_rank in [0,-1]:
                    # When using StreamingDataSilo, each evaluation creates a new instance of
                    # dev_data_loader. In cases like training from scratch, this could cause
                    # some variance across evaluators due to the randomness in word masking.
                    dev_data_loader = self.data_silo.get_data_loader("dev")
                    if dev_data_loader is not None:
                        evaluator_dev = Evaluator(
                            data_loader=dev_data_loader,
                            tasks=self.data_silo.processor.tasks,
                            device=self.device,
                            report=self.eval_report)
                        evalnr += 1
                        result = evaluator_dev.eval(self.model)
                        evaluator_dev.log_results(result, "Dev",
                                                  self.global_step)
                        if self.early_stopping:
                            do_stopping, save_model, eval_value = self.early_stopping.check_stopping(
                                result)
                            if save_model:
                                logger.info(
                                    "Saving current best model to {}, eval={}".
                                    format(self.early_stopping.save_dir,
                                           eval_value))
                                self.model.save(self.early_stopping.save_dir)
                                self.data_silo.processor.save(
                                    self.early_stopping.save_dir)
                            if do_stopping:
                                # log the stopping
                                logger.info(
                                    "STOPPING EARLY AT EPOCH {}, STEP {}, EVALUATION {}"
                                    .format(epoch, step, evalnr))
                if do_stopping:
                    break

                self.global_step += 1
                self.from_step = step + 1

                # save the current state as a checkpoint before exiting if a SIGTERM signal is received
                if self.sigterm_handler and self.sigterm_handler.kill_now:
                    logger.info(
                        "Received a SIGTERM signal. Saving the current train state as a checkpoint ..."
                    )
                    if self.local_rank in [0, -1]:
                        self._save()
                        torch.distributed.destroy_process_group()
                        sys.exit(0)

                # save a checkpoint and continue train
                if self.checkpoint_every and step % self.checkpoint_every == 0:
                    if self.local_rank in [0, -1]:
                        self._save()
                    # Let other ranks wait until rank 0 has finished saving
                    if self.local_rank != -1:
                        torch.distributed.barrier()

            if do_stopping:
                break

            # Only for distributed training: we need to ensure that all ranks still have a batch left for training
            if self.local_rank != -1 and not early_break:
                self._all_ranks_have_data(has_data=False)

        # With early stopping we want to restore the best model
        if self.early_stopping and self.early_stopping.save_dir:
            logger.info("Restoring best model so far from {}".format(
                self.early_stopping.save_dir))
            lm_name = self.model.language_model.name
            self.model = AdaptiveModel.load(self.early_stopping.save_dir,
                                            self.device,
                                            lm_name=lm_name)
            self.model.connect_heads_with_processor(
                self.data_silo.processor.tasks, require_labels=True)

        # Eval on test set
        if self.evaluator_test and self.local_rank in [0, -1]:
            test_data_loader = self.data_silo.get_data_loader("test")
            if test_data_loader is not None:
                evaluator_test = Evaluator(
                    data_loader=test_data_loader,
                    tasks=self.data_silo.processor.tasks,
                    device=self.device)
                self.test_result = evaluator_test.eval(self.model)
                evaluator_test.log_results(self.test_result, "Test",
                                           self.global_step)
        return self.model
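# A minimal usage sketch (not part of the original example): with a Trainer built as in the
# other examples on this page, train() returns the trained (or, with early stopping, the best)
# model, and the test-set evaluation is stored on trainer.test_result when evaluator_test is
# enabled and a test set exists.
model = trainer.train()
if getattr(trainer, "test_result", None) is not None:
    print(trainer.test_result)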
Example #10
    def eval(
        self,
        document_store: BaseDocumentStore,
        device: str,
        label_index: str = "label",
        doc_index: str = "eval_document",
        label_origin: str = "gold_label",
    ):
        """
        Performs evaluation on evaluation documents in the DocumentStore.
        Returns a dict containing the following metrics:
              - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
              - "f1": Average overlap between predicted answers and their corresponding correct answers
              - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer

        :param document_store: DocumentStore containing the evaluation documents
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :param label_index: Index/Table name where labeled questions are stored
        :param doc_index: Index/Table name where documents that are used for evaluation are stored
        :param label_origin: Origin of the labels to evaluate on; only labels whose "origin" field matches this value are retrieved
        """

        if self.top_k_per_candidate != 4:
            logger.info(
                f"Performing Evaluation using top_k_per_candidate = {self.top_k_per_candidate} \n"
                f"and consequently, QuestionAnsweringPredictionHead.n_best = {self.top_k_per_candidate + 1}. \n"
                f"This deviates from FARM's default where QuestionAnsweringPredictionHead.n_best = 5"
            )

        # extract all questions for evaluation
        filters = {"origin": [label_origin]}

        labels = document_store.get_all_labels(index=label_index,
                                               filters=filters)

        # Aggregate all answer labels per question
        aggregated_per_doc = defaultdict(list)
        for label in labels:
            if not label.document_id:
                logger.error(f"Label does not contain a document_id")
                continue
            aggregated_per_doc[label.document_id].append(label)

        # Create squad style dicts
        d: Dict[str, Any] = {}
        all_doc_ids = [
            x.id for x in document_store.get_all_documents(doc_index)
        ]
        for doc_id in all_doc_ids:
            doc = document_store.get_document_by_id(doc_id, index=doc_index)
            if not doc:
                logger.error(
                    f"Document with the ID '{doc_id}' is not present in the document store."
                )
                continue
            d[str(doc_id)] = {"context": doc.text}
            # get all questions / answers
            aggregated_per_question: Dict[str, Any] = defaultdict(list)
            if doc_id in aggregated_per_doc:
                for label in aggregated_per_doc[doc_id]:
                    # add to existing answers
                    if label.question in aggregated_per_question.keys():
                        # Hack to fix problem where duplicate questions are merged by doc_store processing creating a QA example with 8 annotations > 6 annotation max
                        if len(aggregated_per_question[label.question]["answers"]) >= 6:
                            continue
                        aggregated_per_question[label.question]["answers"].append({
                            "text": label.answer,
                            "answer_start": label.offset_start_in_doc
                        })
                    # create new one
                    else:
                        aggregated_per_question[label.question] = {
                            "id": str(hash(str(doc_id) + label.question)),
                            "question": label.question,
                            "answers": [{
                                "text": label.answer,
                                "answer_start": label.offset_start_in_doc
                            }]
                        }
            # Get rid of the question key again (after we aggregated we don't need it anymore)
            d[str(doc_id)]["qas"] = [
                v for v in aggregated_per_question.values()
            ]

        # Convert input format for FARM
        farm_input = [v for v in d.values()]
        n_queries = len([y for x in farm_input for y in x["qas"]])

        # Create DataLoader that can be passed to the Evaluator
        tic = perf_counter()
        indices = range(len(farm_input))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
            farm_input, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset,
                                      batch_size=self.inferencer.batch_size,
                                      tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=self.inferencer.processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        toc = perf_counter()
        reader_time = toc - tic
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_accuracy": eval_results[0]["top_n_accuracy"],
            "top_n": self.inferencer.model.prediction_heads[0].n_best,
            "reader_time": reader_time,
            "seconds_per_query": reader_time / n_queries
        }
        return results
Example #11
    def __init__(
        self,
        model,
        optimizer,
        data_silo,
        epochs,
        n_gpu,
        device,
        lr_schedule=None,
        evaluate_every=100,
        evaluator_dev=None,
        evaluator_test=None,
        use_amp=None,
        grad_acc_steps=1,
        local_rank=-1,
        early_stopping=None,
        log_learning_rate=False,
        checkpoint_on_sigterm=False,
        checkpoint_every=None,
        checkpoint_root_dir=None,
        checkpoints_to_keep=3,
        from_epoch=0,
        from_step=0,
    ):
        """
        :param optimizer: An optimizer object that determines the learning strategy to be used during training
        :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
        :type data_silo: DataSilo
        :param epochs: How many times the training procedure will loop through the train dataset
        :type epochs: int
        :param n_gpu: The number of gpus available for training and evaluation.
        :type n_gpu: int
        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
        :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
        :param evaluate_every: Perform dev set evaluation after this many steps of training.
        :type evaluate_every: int
        :param evaluator_dev: Evaluator for dev set. Options:
                              `None` (Default) => will init a new evaluator, if there's a dev set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_dev: Evaluator, None or False
        :param evaluator_test: Evaluator for test set. Options:
                              `None` (Default) => will init a new evaluator, if there's a test set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_test: Evaluator, None or False
        :param use_amp: Whether to use automatic mixed precision with Apex. One of the optimization levels must be chosen.
                        "O1" is recommended in almost all cases.
        :type use_amp: str
        :param grad_acc_steps: Number of training steps over which gradients are accumulated before an optimizer update is performed.
        :type grad_acc_steps: int
        :param local_rank: The local rank of the process when using distributed training (-1 if distributed training is not used).
        :type local_rank: int
        :param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
        :type early_stopping: EarlyStopping
        :param log_learning_rate: Whether to log learning rate to Mlflow
        :type log_learning_rate: bool
        :param checkpoint_on_sigterm: save a checkpoint for the Trainer when a SIGTERM signal is sent. The checkpoint
               can be used to resume training. It is useful in frameworks like AWS SageMaker with Spot instances where
               a SIGTERM notifies to save the training state and subsequently the instance is terminated.
        :type checkpoint_on_sigterm: bool
        :param checkpoint_every: save a train checkpoint after this many steps of training.
        :type checkpoint_every: int
        :param checkpoint_root_dir: the Path of directory where all train checkpoints are saved. For each individual
               checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created.
        :type checkpoint_root_dir: Path
        :param checkpoints_to_keep: maximum number of train checkpoints to save.
        :type checkpoints_to_keep: int
        :param from_epoch: the epoch number to start the training from. In the case when training resumes from a saved
               checkpoint, it is used to fast-forward training to the last epoch in the checkpoint.
        :type from_epoch: int
        :param from_step: the step number to start the training from. In the case when training resumes from a saved
               checkpoint, it is used to fast-forward training to the last step in the checkpoint.
        :type from_step: int
        """

        self.model = model
        self.data_silo = data_silo
        self.epochs = int(epochs)
        self.optimizer = optimizer
        self.evaluate_every = evaluate_every
        self.n_gpu = n_gpu
        self.grad_acc_steps = grad_acc_steps
        self.use_amp = use_amp
        self.lr_schedule = lr_schedule
        self.data_loader_train = data_silo.get_data_loader("train")
        self.device = device
        self.local_rank = local_rank
        self.log_params()
        self.early_stopping = early_stopping
        self.log_learning_rate = log_learning_rate

        if use_amp and not AMP_AVAILABLE:
            raise ImportError(f'Got use_amp = {use_amp}, but cannot find apex. '
                              'Please install Apex if you want to make use of automatic mixed precision. '
                              'https://github.com/NVIDIA/apex')
        self.checkpoint_on_sigterm = checkpoint_on_sigterm
        if checkpoint_on_sigterm:
            self.sigterm_handler = GracefulKiller()
        else:
            self.sigterm_handler = None
        self.checkpoint_root_dir = checkpoint_root_dir
        self.checkpoints_to_keep = checkpoints_to_keep
        self.checkpoint_every = checkpoint_every
        if self.checkpoint_every and not checkpoint_root_dir:
            raise Exception("checkpoint_path needs to be supplied when using checkpoint_every.")
        if checkpoint_on_sigterm and not checkpoint_root_dir:
            raise Exception("checkpoint_path needs to be supplied when using checkpoint_on_sigterm.")

        self.from_epoch = from_epoch
        self.from_step = from_step
        self.global_step = (from_epoch * from_step) - 1

        # evaluator on dev set
        if evaluator_dev is None and self.data_silo.get_data_loader("dev"):
            evaluator_dev = Evaluator(
                data_loader=self.data_silo.get_data_loader("dev"),
                tasks=self.data_silo.processor.tasks,
                device=device,
            )
        self.evaluator_dev = evaluator_dev

        # evaluator on test set
        if evaluator_test is None and self.data_silo.get_data_loader("test"):
            evaluator_test = Evaluator(
                data_loader=self.data_silo.get_data_loader("test"),
                tasks=self.data_silo.processor.tasks,
                device=device
            )
        self.evaluator_test = evaluator_test
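A minimal sketch of how the checkpointing and resume parameters documented in the constructor above might be used. It relies only on the parameters listed in the docstring and assumes that model, optimizer, lr_schedule, data_silo, device and n_gpu are already set up as in the surrounding examples; the directory name and step counts are hypothetical.

from pathlib import Path

checkpoint_dir = Path("saved_models/trainer_checkpoints")  # hypothetical location

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=3,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    evaluate_every=100,
    device=device,
    checkpoint_on_sigterm=True,          # save a checkpoint when a SIGTERM is received
    checkpoint_every=500,                # additionally save a checkpoint every 500 steps
    checkpoint_root_dir=checkpoint_dir,  # one subdirectory per checkpoint is created here
    checkpoints_to_keep=3,
)
trainer.train()

# When training is resumed from a saved checkpoint, from_epoch / from_step fast-forward
# the Trainer to the recorded position (the values below are hypothetical).
resumed_trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=3,
    n_gpu=n_gpu,
    device=device,
    checkpoint_root_dir=checkpoint_dir,
    from_epoch=1,
    from_step=500,
)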
Example #12
def dense_passage_retrieval():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="FARM-dense_passage_retrieval",
                              run_name="Run_dpr")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    batch_size = 4
    n_epochs = 3
    distributed = False  # enable for multi GPU training via DDP
    evaluate_every = 1000
    question_lang_model = "facebook/dpr-question_encoder-single-nq-base"
    passage_lang_model = "facebook/dpr-ctx_encoder-single-nq-base"
    do_lower_case = True
    use_fast = True
    embed_title = True
    num_hard_negatives = 1
    similarity_function = "dot_product"
    train_filename = "nq-train.json"
    dev_filename = "nq-dev.json"
    test_filename = "nq-dev.json"
    max_samples = None  # load a smaller dataset (e.g. for debugging)

    # For multi GPU Training via DDP we need to get the local rank
    args = parse_arguments()
    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args.local_rank)

    # 1.Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # data_dir "data/retriever" should contain DPR training and dev files downloaded from https://github.com/facebookresearch/DPR
    # i.e., nq-train.json, nq-dev.json or trivia-train.json, trivia-dev.json
    label_list = ["hard_negative", "positive"]
    metric = "text_similarity_metric"
    processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
                                        passage_tokenizer=passage_tokenizer,
                                        max_seq_len_query=64,
                                        max_seq_len_passage=256,
                                        label_list=label_list,
                                        metric=metric,
                                        data_dir="../data/retriever",
                                        train_filename=train_filename,
                                        dev_filename=dev_filename,
                                        test_filename=test_filename,
                                        embed_title=embed_title,
                                        num_hard_negatives=num_hard_negatives,
                                        max_samples=max_samples)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=distributed)

    # 4. Create a BiAdaptiveModel
    # a) which consists of 2 pretrained language models as a basis
    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder")
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder")

    # b) and a prediction head on top that is suited for our task => passage retrieval via text similarity
    prediction_head = TextSimilarityHead(
        similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
                        "eps": 1e-08},
        schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/dpr-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Evaluate
    test_data_loader = data_silo.get_data_loader("test")
    if test_data_loader is not None:
        evaluator_test = Evaluator(data_loader=test_data_loader,
                                   tasks=data_silo.processor.tasks,
                                   device=device)
        model.connect_heads_with_processor(processor.tasks)
        test_result = evaluator_test.eval(model)
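        # A small, hedged sketch of what one might do with the result above: the Evaluator
        # returns one dict per prediction head, so the numeric entries of the first head can
        # be logged without assuming the exact metric key names of the text-similarity head.
        import numbers
        for metric_name, metric_value in test_result[0].items():
            if isinstance(metric_value, numbers.Number):
                ml_logger.log_metrics({f"test_{metric_name}": metric_value}, step=None)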
def doc_classification_crossvalidation():
    # the code for this function is partially taken from:
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_multilabel.py and
    # https://github.com/deepset-ai/FARM/blob/master/examples/doc_classification_crossvalidation.py

    # for local logging:
    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="covid-document-classification",
                              run_name=RUNNAME)

    # model settings
    xval_folds = FOLDS
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    if RUNLOCAL:
        device = "cpu"
    n_epochs = NEPOCHS
    batch_size = BATCHSIZE
    evaluate_every = EVALEVERY
    lang_model = MODELTYPE
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    metric = "f1_macro"

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # The processor wants to know the possible labels ...
    label_list = LABELS
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=MAXLEN,
                                            data_dir=DATADIR,
                                            train_filename=TRAIN,
                                            test_filename=TEST,
                                            dev_split=0.1,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="Categories",
                                            # confusing parameter name: it should be called multiCLASS
                                            # not multiLABEL
                                            multilabel=True
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir, dev):
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = MultiLabelTextClassificationHead(
            # there is still an error with class weights ...
            # class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=dev)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=dev,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        # unfortunately, early stopping is still not working
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5 # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model, optimizer=optimizer,
                          data_silo=silo_to_use, epochs=n_epochs,
                          n_gpu=n_gpu, lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=dev, evaluator_test=False,
                          # early_stopping=earlystopping,  # disabled for now, see note above
                          )
        # train it
        trainer.train()
        trainer.model.save(save_dir)
        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_macro = -1
    save_dir = Path("saved_models/covid-classification-v1")

    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir, device)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device,
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)

        os.makedirs(os.path.dirname(BESTMODEL + "/classification_report.txt"), exist_ok=True)
        with open(BESTMODEL + "/classification_report.txt", "a+") as file:
            file.write("Evaluation on withheld split for numfold no. {} \n".format(num_fold))
            file.write(result[0]["report"])
            file.write("\n\n")

        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_macro = result[0]["f1_macro"]
        if f1_macro > bestf1_macro:
            bestf1_macro = f1_macro
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("../data/predictions/covid-classification-xval.results.json", "wt") as fp:
        json.dump(allresults, fp, cls=NumpyArrayEncoder)

    # calculate overall f1 score across all folds
    xval_f1_macro = f1_score(all_labels, all_preds, average="macro")
    ml_logger.log_metrics({"f1 macro across all folds": xval_f1_macro}, step=None)

    # test performance
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/covid-classification-v1-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    ml_logger.log_metrics({"f1 macro on final test set": result[0]["f1_macro"]}, step=None)

    with open(BESTMODEL + "/classification_report.txt", "a+") as file:
        file.write("Final result of the best model \n")
        file.write(result[0]["report"])
        file.write("\n\n")

    ml_logger.log_artifacts(BESTMODEL + "/")

    # save model for later use
    processor.save(BESTMODEL)
    model.save(BESTMODEL)
    return model
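# The json.dump call above relies on NumpyArrayEncoder, which is not defined in this snippet.
# A minimal sketch of such an encoder, assuming it only needs to turn NumPy arrays and scalars
# into plain Python types (the original implementation may differ):
import json

import numpy as np


class NumpyArrayEncoder(json.JSONEncoder):
    def default(self, obj):
        # Convert NumPy containers and scalars into JSON-serializable equivalents.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)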
def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = False

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1
    model.prediction_heads[0].n_best_per_sample = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=40 * n_gpu_factor)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    model, _ = optimize_model(model=model,
                              device=device,
                              local_rank=-1,
                              optimizer=None,
                              distributed=False,
                              use_amp=None)

    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 78.4721
    gold_f1 = 82.6671
    gold_tnacc = 84.3594  # top 1 recall
    gold_elapsed = 40  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnacc,
            rtol=0.001,
            err_msg=
            f"FARM Eval changed for top 1 accuracy by: {tnacc-gold_tnacc}")
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for f1 score by: {round(f1_score - gold_f1, 4)}"
        )
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnacc, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for top 1 accuracy by: {round(tnacc-gold_tnacc, 4)}"
        )
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Eval speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds"
        )

    benchmark_result = [{
        "run": "FARM internal evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnacc, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnacc,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result[0]) + "\n")

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=40 * n_gpu_factor,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename,
                                       return_json=False,
                                       multiprocessing_chunksize=80)
    results_squad = [x.to_squad_eval() for x in result]
    model.close_multiprocessing_pool()

    elapsed = time() - starttime

    os.makedirs("../testsave", exist_ok=True)
    write_squad_predictions(predictions=results_squad,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 79.878
    gold_f1 = 82.917
    gold_elapsed = 27  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for f1 score by: {round(f1_score - gold_f1, 4)}"
        )
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for EM by: {round(em_score - gold_EM, 4)}"
        )
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"Inference speed changed significantly by: {round(elapsed - gold_elapsed,4)} seconds"
        )

    benchmark_result.append({
        "run": "outside eval script",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": "-",
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": "-",
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": "-",
        "elapsed_gold": gold_elapsed
    })
    logger.info("\n\n" + pformat(benchmark_result[1]) + "\n")
    return benchmark_result
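# The benchmark functions above and below use a few module-level names that are not part of
# this snippet (logger, error_messages, n_gpu_factor). A minimal sketch of how they might be
# set up; the concrete values are assumptions, not the original configuration:
import logging

logger = logging.getLogger(__name__)

# Collects deviations from the gold scores across all benchmark runs so they can be
# reported together at the end.
error_messages = []

# Scales the batch size with the number of available GPUs (e.g. 4 on the 4x V100 machine
# that the gold timings refer to).
n_gpu_factor = 4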
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * n_gpu_factor
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    test_assertions = False
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 78.6575  #77.714
    gold_tnrecall = 97.3721
    gold_elapsed = 1135
    if test_assertions:
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.01,
            err_msg=
            f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.01,
            err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnrecall,
            rtol=0.01,
            err_msg=
            f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
    if not np.allclose(f1_score, gold_f1, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for f1 score by: {round(f1_score - gold_f1, 4)}"
        )
    if not np.allclose(em_score, gold_EM, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnrecall, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for top 5 accuracy by: {round(tnacc - gold_tnrecall, 4)}"
        )
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Training speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds"
        )

    benchmark_result = [{
        "run": "train evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnrecall, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnrecall,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result) + "\n")
    return benchmark_result
Example #16
    def __init__(self,
                 optimizer,
                 data_silo,
                 epochs,
                 n_gpu,
                 device,
                 lr_schedule=None,
                 evaluate_every=100,
                 evaluator_dev=None,
                 evaluator_test=None,
                 use_amp=None,
                 grad_acc_steps=1,
                 local_rank=-1,
                 early_stopping=None,
                 log_learning_rate=False):
        """
        :param optimizer: An optimizer object that determines the learning strategy to be used during training
        :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
        :type data_silo: DataSilo
        :param epochs: How many times the training procedure will loop through the train dataset
        :type epochs: int
        :param n_gpu: The number of gpus available for training and evaluation.
        :type n_gpu: int
        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
        :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
        :param evaluate_every: Perform dev set evaluation after this many steps of training.
        :type evaluate_every: int
        :param evaluator_dev: Evaluator for dev set. Options:
                              `None` (Default) => will init a new evaluator, if there's a dev set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_dev: Evaluator, None or False
        :param evaluator_test: Evaluator for test set. Options:
                              `None` (Default) => will init a new evaluator, if there's a test set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_test: Evaluator, None or False
        :param use_amp: Whether to use automatic mixed precision with Apex. One of the optimization levels must be chosen.
                        "O1" is recommended in almost all cases.
        :type use_amp: str
        :param grad_acc_steps: Number of training steps over which gradients are accumulated before an optimizer update is performed.
        :type grad_acc_steps: int
        :param local_rank: The local rank of the process when using distributed training (-1 if distributed training is not used).
        :type local_rank: int
        :param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
        :type early_stopping: EarlyStopping
        :param log_learning_rate: Whether to log learning rate to Mlflow
        :type log_learning_rate: bool
        """
        self.data_silo = data_silo
        self.epochs = int(epochs)
        self.optimizer = optimizer
        self.evaluate_every = evaluate_every
        self.n_gpu = n_gpu
        self.grad_acc_steps = grad_acc_steps
        self.use_amp = use_amp
        self.lr_schedule = lr_schedule
        self.global_step = 0
        self.data_loader_train = data_silo.get_data_loader("train")
        self.device = device
        self.local_rank = local_rank
        self.log_params()
        self.early_stopping = early_stopping
        self.log_learning_rate = log_learning_rate

        if use_amp and not AMP_AVAILABLE:
            raise ImportError(
                f'Got use_amp = {use_amp}, but cannot find apex. '
                'Please install Apex if you want to make use of automatic mixed precision. '
                'https://github.com/NVIDIA/apex')

        # evaluator on dev set
        if evaluator_dev is None and self.data_silo.get_data_loader("dev"):
            evaluator_dev = Evaluator(
                data_loader=self.data_silo.get_data_loader("dev"),
                tasks=self.data_silo.processor.tasks,
                device=device,
            )
        self.evaluator_dev = evaluator_dev

        # evaluator on test set
        if evaluator_test is None and self.data_silo.get_data_loader("test"):
            evaluator_test = Evaluator(
                data_loader=self.data_silo.get_data_loader("test"),
                tasks=self.data_silo.processor.tasks,
                device=device)
        self.evaluator_test = evaluator_test
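A brief sketch of the evaluator_dev / evaluator_test options described in the docstring above: a manually created dev Evaluator is passed in instead of the one the Trainer would build itself, and test-set evaluation is switched off. It assumes optimizer, data_silo, n_epochs, n_gpu, lr_schedule and device are set up as in the surrounding examples; note that this older constructor variant does not take the model itself.

dev_evaluator = Evaluator(
    data_loader=data_silo.get_data_loader("dev"),
    tasks=data_silo.processor.tasks,
    device=device,
)

trainer = Trainer(
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    device=device,
    lr_schedule=lr_schedule,
    evaluate_every=100,
    evaluator_dev=dev_evaluator,   # use the manually supplied evaluator
    evaluator_test=False,          # skip test-set evaluation entirely
)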
Example #17
    def __init__(
        self,
        optimizer,
        data_silo,
        epochs,
        n_gpu,
        device,
        warmup_linear=0.1,
        evaluate_every=100,
        evaluator_dev=None,
        evaluator_test=None,
        fp16=False,
        grad_acc_steps=1,
        local_rank=-1,
        early_stopping=None,
    ):
        """
        :param optimizer: An optimizer object that determines the learning strategy to be used during training
        :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
        :type data_silo: DataSilo
        :param epochs: How many times the training procedure will loop through the train dataset
        :type epochs: int
        :param n_gpu: The number of gpus available for training and evaluation.
        :type n_gpu: int
        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
        :param warmup_linear: Proportion of the training steps used for a linear warmup of the learning rate.
        :param evaluate_every: Perform dev set evaluation after this many steps of training.
        :type evaluate_every: int
        :param evaluator_dev: Evaluator for dev set. Options:
                              `None` (Default) => will init a new evaluator, if there's a dev set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_dev: Evaluator, None or False
        :param evaluator_test: Evaluator for test set. Options:
                              `None` (Default) => will init a new evaluator, if there's a test set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_test: Evaluator, None or False
        :param fp16: Whether to use floating point 16 mode.
        :type fp16: bool
        :param grad_acc_steps: Number of training steps over which gradients are accumulated before an optimizer update is performed.
        :type grad_acc_steps: int
        :param local_rank: The local rank of the process when using distributed training (-1 if distributed training is not used).
        :type local_rank: int
        :param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
        :type early_stopping: EarlyStopping
        """
        self.data_silo = data_silo
        self.epochs = int(epochs)
        self.optimizer = optimizer
        self.evaluate_every = evaluate_every
        self.n_gpu = n_gpu
        self.grad_acc_steps = grad_acc_steps
        self.fp16 = fp16
        self.learning_rate = self.optimizer.get_lr()
        self.warmup_linear = warmup_linear
        self.global_step = 0
        self.data_loader_train = data_silo.get_data_loader("train")
        self.device = device
        self.local_rank = local_rank
        self.log_params()
        self.early_stopping = early_stopping

        # evaluator on dev set
        if evaluator_dev is None and self.data_silo.get_data_loader("dev"):
            evaluator_dev = Evaluator(
                data_loader=self.data_silo.get_data_loader("dev"),
                tasks=self.data_silo.processor.tasks,
                device=device,
            )
        self.evaluator_dev = evaluator_dev

        # evaluator on test set
        if evaluator_test is None and self.data_silo.get_data_loader("test"):
            evaluator_test = Evaluator(
                data_loader=self.data_silo.get_data_loader("test"),
                tasks=self.data_silo.processor.tasks,
                device=device)
        self.evaluator_test = evaluator_test
Example #18
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold, allows us afterwards to use the
        # nfolds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="micro")
    xval_f1_macro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="macro")
    xval_f1_offense = f1_score(all_labels,
                               all_preds,
                               labels=label_list,
                               pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info("XVAL F1 MICRO:   ", xval_f1_micro)
    logger.info("XVAL F1 MACRO:   ", xval_f1_macro)
    logger.info("XVAL F1 OFFENSE: ", xval_f1_offense)
    logger.info("XVAL F1 OTHER:   ", xval_f1_other)
    logger.info("XVAL MCC:        ", xval_mcc)

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval val for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO:   ", result[0]["f1_micro"])
    logger.info("TEST F1 MACRO:   ", result[0]["f1_macro"])
    logger.info("TEST F1 OFFENSE: ", result[0]["f1_offense"])
    logger.info("TEST F1 OTHER:   ", result[0]["f1_other"])
    logger.info("TEST MCC:        ", result[0]["mcc"])
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratification = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    dev_split = 0.1
    # For xval, dev_stratification must be set: with the default method, the dev set is split off
    # in whole chunks, which fails here because the initial train set of each fold is just a
    # single chunk.
    dev_stratification = True
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data; it is downloaded automatically if not yet available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_split=dev_split,
        dev_stratification=dev_stratification,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo,
                                     sets=["train", "dev"],
                                     n_splits=xval_folds,
                                     stratification=xval_stratification)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} of {xval_folds} ############"
        )
        logger.info(
            f"Fold training   samples: {len(silo_to_use.data['train'])}")
        logger.info(f"Fold dev        samples: {len(silo_to_use.data['dev'])}")
        logger.info(
            f"Fold testing    samples: {len(silo_to_use.data['test'])}")
        logger.info(
            "Total number of samples: "
            f"{len(silo_to_use.data['train'])+len(silo_to_use.data['dev'])+len(silo_to_use.data['test'])}"
        )

        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold, allows us afterwards to use the
        # nfolds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",
            mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold

    # remember all individual evaluation results
    allresults = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        mlflow.start_run(run_name=f"fold-{num_fold + 1}-of-{len(silos)}",
                         nested=True)
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold
        mlflow.end_run()
        # empty cache to avoid memory leaks and CUDA OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # log the best fold metric and fold
    logger.info(f"Best fold f1_offense: {bestf1_offense} in fold {bestfold}")

    # calculate overall metrics across all folds: we only have one head, so we do this only for the
    # first head's results in each of the per-fold entries

    # First create a dict where for each metric, we have a list of values from each fold
    xval_metric_lists_head0 = defaultdict(list)
    for results in allresults:
        head0results = results[0]
        for name in head0results.keys():
            if name not in ["preds", "labels"] and not name.startswith("_") and \
                    isinstance(head0results[name], numbers.Number):
                xval_metric_lists_head0[name].append(head0results[name])
    # Now calculate the mean and stdev for each metric, also copy over the task name
    xval_metric = {
        "task_name": allresults[0][0].get("task_name", "UNKNOWN TASKNAME")
    }
    for name in xval_metric_lists_head0.keys():
        values = xval_metric_lists_head0[name]
        vmean = statistics.mean(values)
        vstdev = statistics.stdev(values)
        xval_metric[name + "_mean"] = vmean
        xval_metric[name + "_stdev"] = vstdev

    logger.info(
        f"XVAL Accuracy:   mean {xval_metric['acc_mean']} stdev {xval_metric['acc_stdev']}"
    )
    logger.info(
        f"XVAL F1 MICRO:   mean {xval_metric['f1_micro_mean']} stdev {xval_metric['f1_micro_stdev']}"
    )
    logger.info(
        f"XVAL F1 MACRO:   mean {xval_metric['f1_macro_mean']} stdev {xval_metric['f1_macro_stdev']}"
    )
    logger.info(
        f"XVAL F1 OFFENSE: mean {xval_metric['f1_offense_mean']} stdev {xval_metric['f1_offense_stdev']}"
    )
    logger.info(
        f"XVAL F1 OTHER:   mean {xval_metric['f1_other_mean']} stdev {xval_metric['f1_other_stdev']}"
    )
    logger.info(
        f"XVAL MCC:        mean {xval_metric['mcc_mean']} stdev {xval_metric['mcc_stdev']}"
    )

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST Accuracy:   {result[0]['acc']}")
    logger.info(f"TEST F1 MICRO:   {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO:   {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER:   {result[0]['f1_other']}")
    logger.info(f"TEST MCC:        {result[0]['mcc']}")
Example #20
    def eval(self,
             document_store: ElasticsearchDocumentStore,
             device: str,
             label_index: str = "feedback",
             doc_index: str = "eval_document",
             label_origin: str = "gold_label"):
        """
        Performs evaluation on evaluation documents in Elasticsearch DocumentStore.

        Returns a dict containing the following metrics:
            - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
            - "f1": Average overlap between predicted answers and their corresponding correct answers
            - "top_n_recall": Proportion of predicted answers that overlap with correct answer

        :param document_store: The ElasticsearchDocumentStore containing the evaluation documents
        :type document_store: ElasticsearchDocumentStore
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :type device: str
        :param label_index: Elasticsearch index where labeled questions are stored
        :type label_index: str
        :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
        :type doc_index: str
        :param label_origin: Value that labels must have in their "origin" field to be used for evaluation (e.g. "gold_label")
        :type label_origin: str
        """

        # extract all questions for evaluation
        filters = {"origin": label_origin}
        questions = document_store.get_all_documents_in_index(
            index=label_index, filters=filters)

        # mapping from doc_id to questions
        doc_questions_dict = {}
        for question_id, question in enumerate(questions):
            doc_id = question["_source"]["doc_id"]
            question_dict = {
                "id": question_id,
                "question": question["_source"]["question"],
                "answers": question["_source"]["answers"],
                "is_impossible": not question["_source"]["answers"],
            }
            doc_questions_dict.setdefault(doc_id, []).append(question_dict)

        # extract eval documents and convert data back to SQuAD-like format
        documents = document_store.get_all_documents_in_index(index=doc_index)
        dicts = []
        for document in documents:
            doc_id = document["_source"]["doc_id"]
            text = document["_source"]["text"]
            questions = doc_questions_dict[doc_id]
            dicts.append({"qas": questions, "context": text})

        # Create DataLoader that can be passed to the Evaluator
        indices = range(len(dicts))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
            dicts, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset,
                                      batch_size=self.inferencer.batch_size,
                                      tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=self.inferencer.processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_recall": eval_results[0]["top_n_recall"]
        }
        return results
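# --- Hedged illustration (not part of the original snippet) ----------------
# The `dicts` assembled in eval() above follow a SQuAD-like layout, one entry
# per document. The exact shape of each answer depends on what was stored in
# Elasticsearch, but a single entry roughly looks like this (values are
# placeholders):
example_eval_dict = {
    "context": "Berlin is the capital of Germany.",
    "qas": [
        {
            "id": 0,
            "question": "What is the capital of Germany?",
            "answers": [{"text": "Berlin", "answer_start": 0}],
            "is_impossible": False,
        },
    ],
}
# ---------------------------------------------------------------------------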
Example #21
def question_answering_confidence():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False
    batch_size = 80

    data_dir = Path("../data/squad20")
    # We use the same file for dev and test set only for demo purposes
    dev_filename = "dev-v2.0.json"
    test_filename = "dev-v2.0.json"
    accuracy_at = 3 # accuracy at n is useful for answers inside long documents


    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=dev_filename,
        test_filename=test_filename,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)


    # 4. Load pre-trained question-answering model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    # Number of predictions the model will make per Question.
    # The multiple predictions are used for evaluating top n recall.
    model.prediction_heads[0].n_best = accuracy_at

    # 5. The calibration of model confidence scores sets one parameter, which is called temperature and can be accessed through the prediction_head.
    # This temperature is applied to each logit in the forward pass, where each logit is divided by the temperature.
    # A softmax function is applied to the logits afterward to get confidence scores in the range [0,1].
    # A temperature larger than 1 decreases the model’s confidence scores.
    logger.info(f"Parameter used for temperature scaling of model confidence scores: {model.prediction_heads[0].temperature_for_confidence}")

    # 6a. We can either manually set the temperature (default value is 1.0)...
    model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter((torch.ones(1) * 1.0).to(device=device))

    # 6b. ...or we can run the evaluator on the dev set and use it to calibrate confidence scores with a technique called temperature scaling.
    # It will align the confidence scores with the model's accuracy based on the dev set data by tuning the temperature parameter.
    # During the calibration, this parameter is automatically set internally as an attribute of the prediction head.
    evaluator_dev = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_dev = evaluator_dev.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True)
    # evaluator_dev.log_results(result_dev, "Dev", logging=False, steps=len(data_silo.get_data_loader("dev")))

    # 7. Optionally, run the evaluator on the test set to see how well the confidence scores are aligned with the model's accuracy
    evaluator_test = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_test = evaluator_test.eval(model, return_preds_and_labels=True)[0]
    logger.info("Grouping predictions by confidence score and calculating metrics for each bin.")
    em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(result_test["preds"], result_test["labels"], num_bins=10)
    for bin_number in range(10):
        logger.info(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}")

    # 8. Hooray! You have a model with calibrated confidence scores.
    # Store the model and the temperature parameter will be stored automatically as an attribute of the prediction head.
    save_dir = Path("../saved_models/qa-confidence-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. When making a prediction with the calibrated model, we could filter out predictions where the model is not confident enough
    # To this end, load the stored model, which will automatically load the stored temperature parameter.
    # The confidence scores are automatically adjusted based on this temperature parameter.
    # For each prediction, we can check the model's confidence and decide whether to output the prediction or not.
    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}")

    QA_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }]
    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
    if result.prediction[0].confidence > 0.9:
        print(result.prediction[0].answer)
    else:
        print("The confidence is not high enough to give an answer.")
Example #22
    def __init__(
        self,
        optimizer,
        data_silo,
        epochs,
        n_gpu,
        device,
        warmup_linear=0.1,
        evaluate_every=100,
        evaluator_dev=None,
        evaluator_test=None,
        fp16=False,
        grad_acc_steps=1,
    ):
        """
        :param optimizer: An optimizer object that determines the learning strategy to be used during training
        :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
        :type data_silo: DataSilo
        :param epochs: How many times the training procedure will loop through the train dataset
        :type epochs: int
        :param n_gpu: The number of gpus available for training and evaluation.
        :type n_gpu: int
        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
        :param warmup_linear: Proportion of training steps during which the learning rate is linearly warmed up
        :param evaluate_every: Perform dev set evaluation after this many steps of training.
        :type evaluate_every: int
        :param evaluator_dev: The dev set Evaluator object.
        :type evaluator_dev: Evaluator
        :param evaluator_test: The test set Evaluator object.
        :type evaluator_test: Evaluator
        :param fp16: Whether to use floating point 16 mode.
        :type fp16: bool
        :param grad_acc_steps: Number of training steps over which gradients are accumulated before an optimizer step is performed
        """
        self.data_silo = data_silo
        self.epochs = int(epochs)
        self.optimizer = optimizer
        self.evaluate_every = evaluate_every
        self.n_gpu = n_gpu
        self.grad_acc_steps = grad_acc_steps
        self.fp16 = fp16
        self.learning_rate = self.optimizer.get_lr()
        self.warmup_linear = warmup_linear
        self.global_step = 0
        self.data_loader_train = data_silo.get_data_loader("train")
        self.device = device
        self.log_params()

        # evaluator on dev set
        if evaluator_dev is None and self.data_silo.get_data_loader("dev"):
            evaluator_dev = Evaluator(
                data_loader=self.data_silo.get_data_loader("dev"),
                label_maps=self.data_silo.processor.label_maps,
                device=device,
                metrics=self.data_silo.processor.metrics,
            )
        self.evaluator_dev = evaluator_dev

        # evaluator on test set
        if evaluator_test is None and self.data_silo.get_data_loader("test"):
            evaluator_test = Evaluator(
                data_loader=self.data_silo.get_data_loader("test"),
                label_maps=self.data_silo.processor.label_maps,
                device=device,
                metrics=self.data_silo.processor.metrics,
            )
        self.evaluator_test = evaluator_test
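# --- Hedged usage sketch (not part of the original snippet) ----------------
# Based purely on the signature above, this older Trainer variant would be
# constructed roughly as follows; when evaluator_dev/evaluator_test are left
# as None, they are built automatically from the silo's dev/test loaders.
# All values are placeholders:
#
#   trainer = Trainer(
#       optimizer=optimizer,
#       data_silo=data_silo,
#       epochs=2,
#       n_gpu=1,
#       device="cuda",
#       evaluate_every=100,
#   )
# ---------------------------------------------------------------------------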
Example #23
def evaluate_question_answering():
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = True

    data_dir = Path("../data/squad20")
    evaluation_filename = "dev-v2.0.json"

    batch_size = 50
    no_ans_boost = 0
    recall_at = 3 # recall at n is only useful for answers inside long documents

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    # 3. Create a DataSilo that loads dataset, provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an Evaluator
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )

    # 5. Load model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
    # use "load" if you want to use a local model that was trained with FARM
    #model = AdaptiveModel.load(lang_model, device=device)
    model.prediction_heads[0].no_ans_boost = no_ans_boost
    model.prediction_heads[0].n_best = recall_at
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    # 6. Run the Evaluator
    results = evaluator.eval(model)
    f1_score = results[0]["f1"]
    em_score = results[0]["EM"]
    tnrecall = results[0]["top_n_recall"]
    print("F1-Score:", f1_score)
    print("Exact Match Score:", em_score)
    print(f"top_{recall_at}_recall:", tnrecall)
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 32 * 4  # 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721
    gold_elapsed = 1286.30
    np.testing.assert_allclose(
        f1_score,
        gold_f1,
        rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score,
        gold_EM,
        rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    np.testing.assert_allclose(
        tnrecall,
        gold_tnrecall,
        rtol=0.01,
        err_msg=f"FARM Training changed for top_n_recall by: {tnrecall - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed,
        gold_elapsed,
        rtol=0.1,
        err_msg=f"FARM training speed changed significantly by: {elapsed - gold_elapsed} seconds"
    )
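# --- Hedged note (not part of the original snippet) ------------------------
# The regression checks above rely on np.testing.assert_allclose, which passes
# while |actual - desired| <= atol + rtol * |desired| (atol defaults to 0),
# i.e. rtol=0.01 allows roughly a 1% relative deviation from the gold value.
import numpy as np

np.testing.assert_allclose(82.9, 82.155, rtol=0.01)    # within ~1% -> passes
# np.testing.assert_allclose(84.0, 82.155, rtol=0.01)  # ~2.2% off -> would raise
# ---------------------------------------------------------------------------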
Example #25
    def train(self):
        """ Perform the training procedure. """

        # connect the prediction heads with the right output from processor
        self.model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                                require_labels=True)

        # Check that the tokenizer fits the language model
        self.model.verify_vocab_size(
            vocab_size=len(self.data_silo.processor.tokenizer))

        logger.info(f"\n {GROWING_TREE}")
        self.model.train()

        do_stopping = False
        evalnr = 0
        loss = 0

        resume_from_step = self.from_step
        resumed_step = None  # remembers the step we resumed at, so we do not immediately checkpoint again

        for epoch in range(self.from_epoch + 1, self.epochs + 1):
            train_data_loader = self.data_silo.get_data_loader("train")
            progress_bar = tqdm(train_data_loader)
            for step, batch in enumerate(progress_bar, start=1):
                # when resuming training from a checkpoint, we want to fast forward to the step of the checkpoint
                if resume_from_step and step <= resume_from_step:
                    if resume_from_step == step:
                        resumed_step = resume_from_step
                        resume_from_step = None
                    continue

                if self.sigterm_handler and self.sigterm_handler.kill_now:  # save the current state as a checkpoint
                    logger.info(
                        "Received a SIGTERM signal. Saving the current train state as a checkpoint ..."
                    )
                    self._save()
                    sys.exit(0)

                # save a checkpoint and continue training (do not create a new checkpoint if we just resumed from one)
                if self.checkpoint_every and step % self.checkpoint_every == 0 \
                        and (resumed_step is None or step != resumed_step + 1):
                    self._save()

                progress_bar.set_description(
                    f"Train epoch {epoch}/{self.epochs} (Cur. train loss: {loss:.4f})"
                )

                # Move batch of samples to device
                batch = {key: batch[key].to(self.device) for key in batch}

                # Forward pass through model
                logits = self.model.forward(**batch)
                per_sample_loss = self.model.logits_to_loss(
                    logits=logits, global_step=self.global_step, **batch)

                loss = self.backward_propagate(per_sample_loss, step)

                # Perform evaluation
                if self.global_step % self.evaluate_every == 0 and self.global_step != 0:
                    # When using StreamingDataSilo, each evaluation creates a new instance of
                    # dev_data_loader. In cases like training from scratch, this could cause
                    # some variance across evaluators due to the randomness in word masking.
                    dev_data_loader = self.data_silo.get_data_loader("dev")
                    if dev_data_loader is not None:
                        evaluator_dev = Evaluator(
                            data_loader=dev_data_loader,
                            tasks=self.data_silo.processor.tasks,
                            device=self.device)
                        evalnr += 1
                        result = evaluator_dev.eval(self.model)
                        evaluator_dev.log_results(result, "Dev",
                                                  self.global_step)
                        if self.early_stopping:
                            do_stopping, save_model, eval_value = self.early_stopping.check_stopping(
                                result)
                            if save_model:
                                logger.info(
                                    "Saving current best model to {}, eval={}".
                                    format(self.early_stopping.save_dir,
                                           eval_value))
                                self.model.save(self.early_stopping.save_dir)
                                self.data_silo.processor.save(
                                    self.early_stopping.save_dir)
                            if do_stopping:
                                # log the stopping
                                logger.info(
                                    "STOPPING EARLY AT EPOCH {}, STEP {}, EVALUATION {}"
                                    .format(epoch, step, evalnr))
                if do_stopping:
                    break
                self.global_step += 1
                self.from_step = step
            self.from_epoch = epoch
            if do_stopping:
                break

        # With early stopping we want to restore the best model
        if self.early_stopping and self.early_stopping.save_dir:
            logger.info("Restoring best model so far from {}".format(
                self.early_stopping.save_dir))
            lm_name = self.model.language_model.name
            model = AdaptiveModel.load(self.early_stopping.save_dir,
                                       self.device,
                                       lm_name=lm_name)
            model.connect_heads_with_processor(self.data_silo.processor.tasks,
                                               require_labels=True)

        # Eval on test set
        test_data_loader = self.data_silo.get_data_loader("test")
        if test_data_loader is not None:
            evaluator_test = Evaluator(data_loader=test_data_loader,
                                       tasks=self.data_silo.processor.tasks,
                                       device=self.device)
            result = evaluator_test.eval(self.model)
            evaluator_test.log_results(result, "Test", self.global_step)
        return self.model
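# --- Hedged sketch (not part of the original snippet) ----------------------
# The train() method above only relies on early_stopping.check_stopping(result)
# returning a (stop_now, save_model, eval_value) tuple, where eval_value is the
# tracked metric read from the first head's results. A minimal stand-in (not
# FARM's actual EarlyStopping) could look like this:
class SimpleEarlyStopping:
    def __init__(self, metric="loss", mode="min", patience=5, save_dir=None):
        self.metric = metric
        self.mode = mode
        self.patience = patience
        self.save_dir = save_dir
        self.best = None
        self.evals_since_best = 0

    def check_stopping(self, eval_result):
        value = eval_result[0][self.metric]
        improved = (
            self.best is None
            or (self.mode == "max" and value > self.best)
            or (self.mode == "min" and value < self.best)
        )
        if improved:
            self.best = value
            self.evals_since_best = 0
        else:
            self.evals_since_best += 1
        # stop when no improvement was seen for more than `patience` evaluations
        return self.evals_since_best > self.patience, improved, value
# ---------------------------------------------------------------------------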
def doc_classification(
    task_config,
    model_name_or_path,
    cache_dir,
    data_dir,
    save_dir,
    model_dir,
    run_name="0",
    lr=1e-05,
    warmup_steps=5000,
    balance_classes=True,
    embeds_dropout=0.1,
    epochs=200,  # large because we use early stopping by default
    batch_size=20,
    grad_acc_steps=1,
    early_stopping_metric="roc_auc",
    early_stopping_mode="max",
    early_stopping_patience=10,
    model_class="Bert",
    tokenizer_class="BertTokenizer",
    do_lower_case=False,
    do_train=True,
    do_eval=True,
    do_hpo=False,
    print_preds=False,
    print_dev_preds=False,
    max_seq_len=512,
    seed=11,
    eval_every=500,
    use_amp=False,
    use_cuda=True,
):
    # Load task config
    with open(task_config) as config_file:
        task_config = yaml.safe_load(config_file)

    data_dir = data_dir
    save_dir = save_dir
    model_dir = model_dir

    # Create label list from args list or (for large label lists) create from file by splitting by space
    if isinstance(task_config["data"]["label_list"], list):
        label_list = task_config["data"]["label_list"]
    else:
        with open(data_dir / 'labels' /
                  task_config["data"]["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        dev_split=task_config["data"]["dev_split"]
        if "dev_split" in task_config["data"] else None,
        test_filename=task_config["data"]["test_filename"],
        delimiter=task_config["data"]["parsing"]["delimiter"],
        quote_char=task_config["data"]["parsing"]["quote_char"],
        label_column_name=task_config["data"]["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task

        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:

            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it
        # from time to time

        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(model_dir + "/final_model")
        processor.save(model_dir + "/final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = model_dir
        else:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir + "/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model,
                                             return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=model_dir + "/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
Example #27
    processor=processor,
    batch_size=4,
    gpu=True,
    # TODO: how to mix for multihead?
    task_type="classification")
basic_texts = [
    {
        "text": "Some text you want to classify"
    },
    {
        "text": "A second sample"
    },
]

ret = inferencer.inference_from_dicts(basic_texts)
logger.info(f"Result of inference: {ret}")

logger.info(f"Evaluating on training set...")
evaluator = Evaluator(data_loader=data_silo.get_data_loader("train"),
                      tasks=processor.tasks,
                      device=device)

result = evaluator.eval(inferencer.model, return_preds_and_labels=True)

evaluator.log_results(result,
                      "Train",
                      steps=len(data_silo.get_data_loader("train")))

inferencer.close_multiprocessing_pool()
logger.info("PROCESSING FINISHED")