Code example #1
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        label_list=None,
        metric=None,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        delimiter="\t",
        quote_char=csv.QUOTE_NONE,
        skiprows=None,
        label_column_names=[],
        label_names=[],
        multilabel=False,
        header=0,
        proxies=None,
        max_samples=None,
        text_column_name="text",
        **kwargs,
    ):
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.skiprows = skiprows
        self.header = header
        self.max_samples = max_samples
        self.text_column_name = text_column_name

        super(TextClassificationProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )
        if metric is None:
            metric = "classification_metrics"
            register_metrics(metric, classification_metrics)
        if multilabel:
            task_type = "multilabel_classification"
        else:
            task_type = "classification"
        data = read_tsv(os.path.join(data_dir, train_filename))
        if label_column_names and label_names:
            for col_name, l_name in zip(label_column_names, label_names):
                self.add_task(
                    name=l_name,
                    metric=metric,
                    label_list=list(set(data[col_name])),
                    label_column_name=col_name,
                    task_type=task_type,
                    label_name=l_name,
                )
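
The constructor above registers one classification task per (label column, label name) pair, deriving each task's label list from the training file. A minimal, hypothetical instantiation might look like this (file, column, and task names are placeholders, and `tokenizer` is assumed to be a loaded FARM tokenizer):

# Hypothetical usage sketch of the processor above; all names and paths are placeholders.
processor = TextClassificationProcessor(
    tokenizer=tokenizer,            # e.g. Tokenizer.load("bert-base-cased")
    max_seq_len=128,
    data_dir="data/",
    train_filename="train.tsv",
    test_filename="test.tsv",
    label_column_names=["sentiment", "topic"],     # one task is added per column ...
    label_names=["sentiment_task", "topic_task"],  # ... under the matching task name
    multilabel=False,
    text_column_name="text",
)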
Code example #2
def register_task_metrics(label_list):
    register_metrics('binary_classification_metrics',
                     binary_classification_metrics)
    register_metrics('multiclass_classification_metrics',
                     multiclass_classification_metrics)

    register_multilabel_classification_metrics_3_digits_only(
        'binary_classification_metrics_3_digits_only', label_list)
    register_multilabel_classification_metrics_i2b2_only(
        'binary_classification_metrics_i2b2_only', label_list)
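
register_metrics merely maps a string name to a metric implementation, so the registered names can later be referenced by a processor. A hedged sketch of how one of the names registered above might be consumed (the processor arguments are placeholders):

# Sketch: a registered metric is later referenced by its string name (placeholder arguments).
register_task_metrics(label_list)
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="data/",
    label_list=label_list,
    metric="multiclass_classification_metrics",   # one of the names registered above
    label_column_name="label",
)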
Code example #3
def register_multilabel_classification_metrics_i2b2_only(
        metric_name, label_list):
    def multilabel_classification_metrics_i2b2_only(preds, probs, labels,
                                                    multilabel):
        mask = list(map(utils.is_i2b2_code, label_list))
        logger.info(f"Evaluate on {mask.count(True)} i2b2 codes.")

        return binary_classification_metrics(preds[:, mask],
                                             [prob[mask] for prob in probs],
                                             labels[:, mask], multilabel)

    register_metrics(metric_name, multilabel_classification_metrics_i2b2_only)
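
The wrapper restricts evaluation to i2b2 labels via boolean-mask column selection on the prediction and label arrays. A small self-contained NumPy illustration of that slicing pattern (toy data, not from the original project):

# Toy illustration of the boolean-mask column selection used above.
import numpy as np

preds = np.array([[1, 0, 1],
                  [0, 1, 1]])
mask = [True, False, True]   # e.g. "is this label an i2b2 code?"
print(preds[:, mask])        # keeps only the columns where mask is True
# [[1 1]
#  [0 1]]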
Code example #4
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5,  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
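
As the comments above explain, a custom metric is just a function from (preds, labels) to a dict of values, registered under a string name. Assuming `mymetrics` were accessible at module level, it could be sanity-checked in isolation with toy data before training:

# Sketch: exercising the custom metric directly with made-up predictions (assumes module-level access).
import numpy as np

toy_labels = np.array(["OTHER", "OFFENSE", "OTHER", "OFFENSE"])
toy_preds = np.array(["OTHER", "OTHER", "OTHER", "OFFENSE"])
print(mymetrics(toy_preds, toy_labels))
# -> {'acc': ..., 'f1_other': ..., 'f1_offense': ..., 'f1_macro': ..., 'f1_micro': ...}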
Code example #5
                    sep='\t',
                    index=False)
    eval_df.to_csv(os.path.join(multitransquest_config['cache_dir'],
                                "eval.tsv"),
                   header=True,
                   sep='\t',
                   index=False)

    set_all_seeds(seed=SEED * i)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = multitransquest_config['num_train_epochs']
    batch_size = multitransquest_config['train_batch_size']
    evaluate_every = multitransquest_config['evaluate_during_training_steps']
    lang_model = MODEL_NAME

    register_metrics(name="pearson_correlation", implementation=pearson_corr)

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairRegressionProcessor(
        tokenizer=tokenizer,
        label_list=None,
        metric="pearson_correlation",
        max_seq_len=multitransquest_config['max_seq_length'],
        train_filename="train.tsv",
        dev_filename="eval.tsv",
        test_filename=None,
        data_dir=Path(multitransquest_config['cache_dir']),
        delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
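
The snippet registers a metric called "pearson_correlation" backed by a `pearson_corr` function that is not shown. Under the (preds, labels) contract used by the other custom metrics in these examples, a plausible implementation could be as simple as the following sketch (an assumption, not the project's actual code):

# Assumed sketch of a pearson_corr metric; not taken from the original project.
from scipy.stats import pearsonr

def pearson_corr(preds, labels):
    # pearsonr returns (correlation, p-value); report only the correlation
    return {"pearson_correlation": pearsonr(preds, labels)[0]}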
Code example #6
    def perform_fine_tuning(current_info_need,
                            bert_model,
                            label_list,
                            num_epochs,
                            condition,
                            folds=10,
                            stratified=True,
                            learning_rate=2e-5,
                            batch_size=32,
                            embeds_dropout_prob=.1):

        ## Define evaluation metrics ##
        def evaluation_metrics(preds, labels):
            acc = simple_accuracy(preds, labels).get("acc")
            f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
            f1infoneed = f1_score(y_true=labels,
                                  y_pred=preds,
                                  pos_label=current_info_need)
            recall_infoneed = recall_score(y_true=labels,
                                           y_pred=preds,
                                           pos_label=current_info_need)
            precision_infoneed = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 pos_label=current_info_need)
            recall_other = recall_score(y_true=labels,
                                        y_pred=preds,
                                        pos_label="Other")
            precision_other = precision_score(y_true=labels,
                                              y_pred=preds,
                                              pos_label="Other")
            recall_macro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="macro")
            precision_macro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="macro")
            recall_micro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="micro")
            precision_micro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="micro")
            recall_weighted = recall_score(y_true=labels,
                                           y_pred=preds,
                                           average="weighted")
            precision_weighted = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 average="weighted")
            f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
            f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
            mcc = matthews_corrcoef(labels, preds)
            f1weighted = f1_score(y_true=labels,
                                  y_pred=preds,
                                  average="weighted")

            return {
                "info_need": current_info_need,
                "model": bert_model,
                "num_epochs": num_epochs,
                "condition": condition,
                "acc": acc,
                "f1_other": f1other,
                "f1_infoneed": f1infoneed,
                "precision_infoneed": precision_infoneed,
                "recall_infoneed": recall_infoneed,
                "recall_other": recall_other,
                "precision_other": precision_other,
                "recall_macro": recall_macro,
                "precision_macro": precision_macro,
                "recall_micro": recall_micro,
                "precision_micro": precision_micro,
                "recall_weighted": recall_weighted,
                "precision_weighted": precision_weighted,
                "f1_weighted": f1weighted,
                "f1_macro": f1macro,
                "f1_micro": f1micro,
                "f1_weighted": f1weighted,
                "mcc": mcc
            }

        register_metrics(
            f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs',
            evaluation_metrics)
        metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'
        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        logger, ml_logger = init_logging()
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                                   do_lower_case=False)

        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            max_seq_len=256,
            train_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
            test_filename=f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
            data_dir="data/",
            label_list=label_list,
            metric=metric,
            text_column_name="utterance",
            label_column_name=level,
            delimiter=";")

        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        silos = DataSiloForCrossVal.make(data_silo,
                                         n_splits=folds,
                                         sets=['train', 'test'])

        # the following steps should be run for each of the folds of the cross validation, so we put them
        # into a function
        def train_on_split(silo_to_use, n_fold, save_dir):
            logger.info(
                f"############ Crossvalidation: Fold {n_fold} ############")
            # Create an AdaptiveModel
            # a) which consists of a pretrained language model as a basis
            language_model = LanguageModel.load(bert_model)
            # b) and a prediction head on top that is suited for our task => Text classification
            prediction_head = TextClassificationHead(
                class_weights=data_silo.calculate_class_weights(
                    task_name="text_classification"),
                num_labels=len(label_list))

            model = AdaptiveModel(language_model=language_model,
                                  prediction_heads=[prediction_head],
                                  embeds_dropout_prob=embeds_dropout_prob,
                                  lm_output_types=["per_sequence"],
                                  device=device)

            # Create an optimizer
            model, optimizer, lr_schedule = initialize_optimizer(
                model=model,
                learning_rate=learning_rate,
                device=device,
                n_batches=len(silo_to_use.loaders["train"]),
                n_epochs=num_epochs,
                use_amp=None)

            # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
            # Also create an EarlyStopping instance and pass it on to the trainer

            # An early stopping instance can be used to save the model that performs best on the dev set
            # according to some metric and stop training when no improvement is happening for some iterations.
            # NOTE: Using a different save directory for each fold, allows us afterwards to use the
            # nfolds best models in an ensemble!
            save_dir = Path(str(save_dir) + f"-{n_fold}")
            earlystopping = EarlyStopping(
                metric="f1_infoneed", mode="max",  # use the metric from our own metrics function instead of loss
                save_dir=save_dir,  # where to save the best model
                patience=5,  # number of evaluations to wait for improvement before terminating the training
            )

            trainer = Trainer(model=model,
                              optimizer=optimizer,
                              data_silo=silo_to_use,
                              epochs=num_epochs,
                              n_gpu=n_gpu,
                              lr_schedule=lr_schedule,
                              evaluate_every=100,
                              device=device,
                              early_stopping=earlystopping,
                              evaluator_test=False)

            # train it
            trainer.train()

            return trainer.model

        # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
        # on the test set of each fold
        # Remember all the results for overall metrics over all predictions of all folds and for averaging
        allresults = []
        all_preds = []
        all_labels = []
        bestfold = None
        bestf1_info_need = -1
        language_model_name = bert_model
        if language_model_name.find("/") != -1:
            language_model_name = language_model_name.replace("/", "_")
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
        )
        for num_fold, silo in enumerate(silos):
            model = train_on_split(silo, num_fold, save_dir)

            # do eval on test set here (and not in Trainer),
            #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
            evaluator_test = Evaluator(
                data_loader=silo.get_data_loader("test"),
                tasks=silo.processor.tasks,
                device=device)
            result = evaluator_test.eval(model, return_preds_and_labels=True)
            evaluator_test.log_results(result,
                                       "Test",
                                       steps=len(silo.get_data_loader("test")),
                                       num_fold=num_fold)

            allresults.append(result)
            all_preds.extend(result[0].get("preds"))
            all_labels.extend(result[0].get("labels"))

            # keep track of best fold
            f1_info_need = result[0]["f1_infoneed"]
            if f1_info_need > bestf1_info_need:
                bestf1_info_need = f1_info_need
                bestfold = num_fold

            # empty cache to avoid memory leak and cuda OOM across multiple folds
            model.cpu()
            torch.cuda.empty_cache()

        # Save the per-fold results to json for a separate, more detailed analysis
        with open(
                f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json",
                "wt") as fp:
            json.dump(allresults, fp)

        # calculate overall metrics across all folds
        xval_f1_other = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 pos_label="Other")
        xval_f1_info_need = f1_score(all_labels,
                                     all_preds,
                                     labels=label_list,
                                     pos_label=current_info_need)
        xval_f1_micro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="micro")
        xval_f1_macro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="macro")
        xval_mcc = matthews_corrcoef(all_labels, all_preds)

        xval_overall_results = {
            "xval_f1_other": xval_f1_other,
            f"xval_f1_infoneed": xval_f1_info_need,
            "xval_f1_micro": xval_f1_micro,
            "xval_f1_macro": xval_f1_macro,
            "xval_f1_mcc": xval_mcc
        }

        logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
        logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
        logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
        logger.info(
            f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs:   {xval_f1_info_need}"
        )
        logger.info(f"XVAL MCC: {xval_mcc}")

        # -----------------------------------------------------
        # Just for illustration, use the best model from the best xval fold for evaluation on
        # the original (still unseen) test set.
        logger.info(
            "###### Final Eval on hold out test set using best model #####")
        evaluator_origtest = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)
        # restore model from the best fold
        lm_name = model.language_model.name
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}"
        )
        model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        result = evaluator_origtest.eval(model)
        logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
        logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
        logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
        logger.info("TEST F1 {0}: {1}".format(current_info_need,
                                              result[0]["f1_infoneed"]))
        logger.info("TEST MCC:  {}".format(result[0]["mcc"]))

        test_set_results = {
            "test_f1_other": result[0]["f1_other"],
            "test_f1_infoneed": result[0][f"f1_infoneed"],
            "test_f1_micro": result[0]["f1_micro"],
            "test_f1_macro": result[0]["f1_macro"],
            "test_f1_mcc": result[0]["mcc"]
        }
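
The method above bundles metric registration, k-fold training with early stopping, and a final hold-out evaluation for a single information need; note that it also relies on variables such as `level` from its enclosing scope. A hypothetical invocation (all argument values are placeholders) would look like:

# Hypothetical invocation; argument values are placeholders.
perform_fine_tuning(current_info_need="Amount",
                    bert_model="bert-base-uncased",
                    label_list=["Amount", "Other"],
                    num_epochs=3,
                    condition="baseline",
                    folds=10,
                    learning_rate=2e-5,
                    batch_size=32)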
Code example #7
def doc_classification(task,
                       model_type,
                       n_epochs,
                       batch_size,
                       embeds_dropout,
                       evaluate_every,
                       use_cuda,
                       max_seq_len,
                       learning_rate,
                       do_lower_case,
                       register_model,
                       save_model=True,
                       early_stopping=False):

    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir',
                              header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except Exception:
        pass

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except Exception:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    ## Prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        learning_rate=learning_rate,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
            # metric="loss", mode="min",    # use loss from the dev evaluator of the trainer
            save_dir=save_dir,  # where to save the best model
            patience=2,  # number of evaluations to wait for improvement before terminating the training
        )
    else:
        earlystopping = None

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

        if register_model:
            dt_task.upload('model_dir', destination='model')
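
A hypothetical call of `doc_classification` with typical hyperparameters (all values are placeholders; the task name depends on the surrounding project configuration):

# Hypothetical invocation; values are placeholders.
doc_classification(task="my_task",
                   model_type="bert",
                   n_epochs=3,
                   batch_size=32,
                   embeds_dropout=0.1,
                   evaluate_every=100,
                   use_cuda=True,
                   max_seq_len=128,
                   learning_rate=3e-5,
                   do_lower_case=False,
                   register_model=False,
                   save_model=True,
                   early_stopping=True)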
Code example #8
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string name
    # must be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold, allows us afterwards to use the
        # nfolds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5,  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="micro")
    xval_f1_macro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="macro")
    xval_f1_offense = f1_score(all_labels,
                               all_preds,
                               labels=label_list,
                               pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info("XVAL F1 MICRO:   ", xval_f1_micro)
    logger.info("XVAL F1 MACRO:   ", xval_f1_macro)
    logger.info("XVAL F1 OFFENSE: ", xval_f1_offense)
    logger.info("XVAL F1 OTHER:   ", xval_f1_other)
    logger.info("XVAL MCC:        ", xval_mcc)

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO:   ", result[0]["f1_micro"])
    logger.info("TEST F1 MACRO:   ", result[0]["f1_macro"])
    logger.info("TEST F1 OFFENSE: ", result[0]["f1_offense"])
    logger.info("TEST F1 OTHER:   ", result[0]["f1_other"])
    logger.info("TEST MCC:        ", result[0]["mcc"])
Code example #9
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        metric=None,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        delimiter="\t",
        label_column_names=[],
        label_names=[],
        scaler_mean=None,
        scaler_scale=None,
        proxies=None,
        **kwargs,
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
        :type data_dir: str
        :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
            Alternatively you can also supply a custom function that takes preds and labels as args and returns a numerical value.
            For using multiple metrics supply them as a list, e.g. ["acc", my_custom_metric_fn].
        :type metric: str, function, or list
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: The name of the file containing the test data. Set to None if there is no test set.
        :type test_filename: str or None
        :param dev_split: The proportion of the train set that will be sliced off as dev set. Only works if dev_filename is set to None.
        :type dev_split: float
        :param delimiter: Separator used in the input tsv / csv file. German version of Conll03 uses a whitespace. GermEval 2014 is tab separated \t
        :type delimiter: str
        :param label_column_names: names of the columns in the input csv/tsv that shall be used as training labels
        :type label_column_names: list
        :param label_names: names for the internal label variables in FARM (only needed to adjust in rare cases)
        :type label_names: list
        :param scaler_mean: Value to subtract from the label for normalization
        :type scaler_mean: float
        :param scaler_scale: Value to divide the label by for normalization
        :type scaler_scale: float
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """
        # Custom processor attributes
        self.delimiter = delimiter

        super(TokenRegressionProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )
        if metric is None:
            metric = "token_level_regression_metrics"
            register_metrics(metric, token_level_regression_metrics)
        if label_column_names and label_names:
            for col_name, l_name in zip(label_column_names, label_names):
                self.add_task(
                    name=l_name,
                    metric=metric,
                    label_list=[scaler_mean, scaler_scale],
                    label_column_name=col_name,
                    task_type="token_regression",
                    label_name=l_name,
                )
        else:
            logger.info(
                "Initialized processor without tasks. Supply `label_names` and `label_column_names` to the constructor for "
                "using the default task or add a custom task later via processor.add_task()"
            )
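
Here each task's label_list is repurposed to carry the normalization constants [scaler_mean, scaler_scale] described in the docstring. Those would typically be the mean and standard deviation of the training labels; a short sketch of how they might be computed beforehand (an assumption, with placeholder file and column names):

# Assumed pre-processing sketch: deriving scaler_mean / scaler_scale from the training labels.
import pandas as pd

train_df = pd.read_csv("data/train.tsv", sep="\t")
scaler_mean = float(train_df["label"].mean())
scaler_scale = float(train_df["label"].std())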
Code example #10
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        metric=None,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        delimiter="\t",
        quote_char=csv.QUOTE_NONE,
        skiprows=None,
        label_column_names=[],
        label_names=[],
        scaler_mean=None,
        scaler_scale=None,
        proxies=None,
        start_feat_col=None,
        text_column_name="text",
        **kwargs,
    ):
        """
        :param tokenizer: Used to split a sentence (str) into tokens.
        :param max_seq_len: Samples are truncated after this many tokens.
        :type max_seq_len: int
        :param data_dir: The directory in which the train and dev files can be found.
        :type data_dir: str
        :param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
                 Alternatively you can also supply a custom function that takes preds and labels as args and returns a numerical value.
                 For using multiple metrics supply them as a list, e.g. ["acc", my_custom_metric_fn].
        :type metric: str, function, or list
        :param train_filename: The name of the file containing training data.
        :type train_filename: str
        :param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
                             will be a slice of the train set.
        :type dev_filename: str or None
        :param test_filename: The name of the file containing the test data. Set to None if there is no test set.
        :type test_filename: str or None
        :param dev_split: The proportion of the train set that will be sliced off as dev set. Only works if dev_filename is set to None.
        :type dev_split: float
        :param delimiter: Separator used in the input tsv / csv file
        :type delimiter: str
        :param quote_char: Character used for quoting strings in the input tsv/ csv file
        :type quote_char: str
        :param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers)
        :type skiprows: int
        :param label_column_names: names of the columns in the input csv/tsv that shall be used as training labels
        :type label_column_names: list
        :param label_names: names for the internal label variables in FARM (only needed to adjust in rare cases)
        :type label_names: list
        :param scaler_mean: Value to subtract from the label for normalization
        :type scaler_mean: float
        :param scaler_scale: Value to divide the label by for normalization
        :type scaler_scale: float
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param text_column_name: name of the column in the input csv/tsv that shall be used as training text
        :type text_column_name: str
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """

        # Custom processor attributes
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.skiprows = skiprows
        self.text_column_name = text_column_name
        self.features = start_feat_col
        self.feat_size = None

        super(CustomRegressionProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )
        if metric is None:
            metric = "regression_metrics"
            register_metrics(metric, regression_metrics)
        if label_column_names and label_names:
            for col_name, l_name in zip(label_column_names, label_names):
                self.add_task(
                    name=l_name,
                    metric=metric,
                    label_list=[scaler_mean, scaler_scale],
                    label_column_name=col_name,
                    task_type="regression",
                    label_name=l_name,
                )
Code example #11
def doc_classification_holdout():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # local logging into directory "logs"
    mlflogger = MLFlowLogger(tracking_uri="logs")
    mlflogger.init_experiment(experiment_name="Example-docclass-xval",
                              run_name="testrun1")

    ##########################
    ########## Settings
    ##########################
    holdout_splits = 5
    holdout_train_split = 0.8
    holdout_stratification = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    dev_split = 0.1
    # For holdout, the dev_stratification parameter must not be None: with None, the dev set cannot be created
    # using the default method of only splitting by the available chunks, since the initial train set for each
    # fold is just a single chunk!
    dev_stratification = True
    do_lower_case = False
    use_amp = None

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string name
    # must be used.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_split=dev_split,
        dev_stratification=dev_stratification,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForHoldout.make(data_silo,
                                    sets=["train", "dev"],
                                    n_splits=holdout_splits,
                                    train_split=holdout_train_split,
                                    stratification=holdout_stratification)

    # the following steps should be run for each of the folds of the holdout evaluation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_eval, save_dir):
        logger.info(
            f"############ Holdout: Evaluation {n_eval} of {holdout_splits} ############"
        )
        logger.info(
            f"Fold training   samples: {len(silo_to_use.data['train'])}")
        logger.info(f"Fold dev        samples: {len(silo_to_use.data['dev'])}")
        logger.info(
            f"Fold testing    samples: {len(silo_to_use.data['test'])}")
        logger.info(
            "Total number of samples: "
            f"{len(silo_to_use.data['train'])+len(silo_to_use.data['dev'])+len(silo_to_use.data['test'])}"
        )
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold, allows us afterwards to use the
        # nfolds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_eval}")
        earlystopping = EarlyStopping(
            metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5,  # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold

    # remember all individual evaluation results
    allresults = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        mlflow.start_run(run_name=f"split-{num_fold + 1}-of-{len(silos)}",
                         nested=True)
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold
        mlflow.end_run()
        # empty cache to avoid memory leak and cuda OOM across multiple folds
        model.cpu()
        torch.cuda.empty_cache()

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_holdout.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # log the best fold metric and fold
    logger.info(f"Best fold f1_offense: {bestf1_offense} in fold {bestfold}")

    # calculate overall metrics across all folds: we only have one head, so we do this only using the
    # first head's information in each of the per-fold results

    # First create a dict where for each metric, we have a list of values from each fold
    eval_metric_lists_head0 = defaultdict(list)
    for results in allresults:
        head0results = results[0]
        for name in head0results.keys():
            if name not in ["preds", "labels"] and not name.startswith("_") and \
                    isinstance(head0results[name], numbers.Number):
                eval_metric_lists_head0[name].append(head0results[name])
    # Now calculate the mean and stdev for each metric, also copy over the task name
    eval_metric = {}
    eval_metric["task_name"] = allresults[0][0].get("task_name",
                                                    "UNKNOWN TASKNAME")
    for name in eval_metric_lists_head0.keys():
        values = eval_metric_lists_head0[name]
        vmean = statistics.mean(values)
        vstdev = statistics.stdev(values)
        eval_metric[name + "_mean"] = vmean
        eval_metric[name + "_stdev"] = vstdev

    logger.info(
        f"HOLDOUT Accuracy:   mean {eval_metric['acc_mean']} stdev {eval_metric['acc_stdev']}"
    )
    logger.info(
        f"HOLDOUT F1 MICRO:   mean {eval_metric['f1_micro_mean']} stdev {eval_metric['f1_micro_stdev']}"
    )
    logger.info(
        f"HOLDOUT F1 MACRO:   mean {eval_metric['f1_macro_mean']} stdev {eval_metric['f1_macro_stdev']}"
    )
    logger.info(
        f"HOLDOUT F1 OFFENSE: mean {eval_metric['f1_offense_mean']} stdev {eval_metric['f1_offense_stdev']}"
    )
    logger.info(
        f"HOLDOUT F1 OTHER:   mean {eval_metric['f1_other_mean']} stdev {eval_metric['f1_other_stdev']}"
    )
    logger.info(
        f"HOLDOUT MCC:        mean {eval_metric['mcc_mean']} stdev {eval_metric['mcc_stdev']}"
    )

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval fold for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info(f"TEST Accuracy:   {result[0]['acc']}")
    logger.info(f"TEST F1 MICRO:   {result[0]['f1_micro']}")
    logger.info(f"TEST F1 MACRO:   {result[0]['f1_macro']}")
    logger.info(f"TEST F1 OFFENSE: {result[0]['f1_offense']}")
    logger.info(f"TEST F1 OTHER:   {result[0]['f1_other']}")
    logger.info(f"TEST MCC:        {result[0]['mcc']}")
Code example #12
File: multitask_learning.py  Project: imdiptanu/FARM
TRIGGER_LABELS = ["X", "0", "1"]
LABEL_LIST = ["not sw", "sw"]

processor = MTLProcessor(data_dir=".",
                         tokenizer=tokenizer,
                         max_seq_len=128,
                         train_filename=TRAIN_FILE,
                         test_filename=TEST_FILE,
                         delimiter=",")

from farm.evaluation.metrics import register_metrics
register_metrics('f1_weighted', custom_f1_score)

metric = 'f1_weighted'
processor.add_task(name="document_level_task", label_list=LABEL_LIST, metric="acc", text_column_name="text", label_column_name="label", task_type="classification")
processor.add_task(name="token_level_task", label_list=TRIGGER_LABELS, metric=metric, text_column_name="text", label_column_name="tokens", task_type="ner")


data_silo = DataSilo(processor=processor, batch_size=BATCH_SIZE)

language_model = LanguageModel.load(LANG_MODEL)

document_level_task_head = TextClassificationHead(num_labels=len(LABEL_LIST), task_name="document_level_task")
token_level_task_head = TokenClassificationHead(num_labels=len(TRIGGER_LABELS), task_name="token_level_task")
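
The snippet ends after creating the two prediction heads. Modeled on the single-task examples above, the remaining multitask wiring would presumably combine both heads in a single AdaptiveModel, with one lm_output_type per head (a sketch, not the original file's code; `device` is assumed to come from initialize_device_settings):

# Sketch of the remaining multitask wiring, modeled on the single-task examples above.
model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[document_level_task_head, token_level_task_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence", "per_token"],  # sequence-level + token-level head
                      device=device)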