Example #1
def onnx_runtime_example():
    """
    This example converts a Question Answering FARM AdaptiveModel
    to ONNX format and uses ONNX Runtime for inference.
    """

    device = "cpu"
    model_name_or_path = "deepset/bert-base-cased-squad2"
    onnx_model_export_path = Path("./onnx-export")

    model = AdaptiveModel.convert_from_transformers(model_name_or_path, device=device, task_type="question_answering")
    model.convert_to_onnx(onnx_model_export_path)

    inferencer = Inferencer.load(model_name_or_path=onnx_model_export_path)

    qa_input = [
        {
            "qas": ["Who counted the game among the best ever made?"],
            "context": "Twilight Princess was released to universal critical acclaim and commercial success. "
            "It received perfect scores from major publications such as 1UP.com, Computer and Video Games, "
            "Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators "
            "GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii "
            "version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called "
            "it one of the greatest games ever created.",
        }
    ]

    results = inferencer.inference_from_dicts(qa_input)
    print(results)
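The example above is self-contained apart from its imports. A minimal sketch of the scaffolding needed to run it as a script; the module paths are assumed from FARM's public API (the AdaptiveModel import path is confirmed in Example #10 below):

from pathlib import Path

from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel


if __name__ == "__main__":
    # export the model to ONNX and run a sample question through ONNX Runtime
    onnx_runtime_example()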
Example #2
def convert_from_transformers():
    transformers_input_name = "deepset/bert-base-german-cased-hatespeech-GermEval18Coarse"
    farm_output_dir = Path(
        "../saved_models/farm-bert-base-german-cased-hatespeech-GermEval18Coarse"
    )

    # # CASE 1: MODEL
    # # Load model from transformers model hub (-> continue training / compare models / ...)
    model = AdaptiveModel.convert_from_transformers(
        transformers_input_name, device="cpu", task_type="text_classification")
    # # ... continue as in the other examples e.g. to fine-tune this text classification model on your own data
    #
    # # CASE 2: INFERENCER
    # # Load Inferencer from transformers, incl. model & tokenizer (-> just get predictions)
    nlp = Inferencer.load(transformers_input_name,
                          task_type="text_classification")
    #
    # # run predictions
    result = nlp.inference_from_dicts(dicts=[{
        "text": "Was ein scheiß Nazi!"
    }],
                                      rest_api_schema=True)
    pprint.pprint(result)

    # save it
    nlp.save(farm_output_dir)
Example #3
def convert_from_transformers():
    # CASE 1: MODEL
    # Load model from transformers model hub (-> continue training / compare models / ...)
    model = AdaptiveModel.convert_from_transformers(
        "deepset/bert-large-uncased-whole-word-masking-squad2",
        device="cpu",
        task_type="question_answering")
    # ... continue as in the other examples e.g. to fine-tune this QA model on your own data

    # CASE 2: INFERENCER
    # Load Inferencer from transformers, incl. model & tokenizer (-> just get predictions)
    nlp = Inferencer.load(
        "deepset/bert-large-uncased-whole-word-masking-squad2",
        task_type="question_answering")

    # run predictions
    QA_input = [{
        "questions": ["Why is model conversion important?"],
        "text":
        "The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks."
    }]
    result = nlp.inference_from_dicts(dicts=QA_input, rest_api_schema=True)
    pprint.pprint(result)

    # save it
    farm_model_dir = Path("../saved_models/bert-english-qa-large")
    nlp.save(farm_model_dir)
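The conversion also works in the opposite direction via convert_to_transformers(), which is exactly what the test in Example #8 below exercises. A minimal round-trip sketch; note that, depending on the FARM version, convert_to_transformers() returns either a single transformers model or a list with one model per prediction head:

# transformers -> FARM -> transformers round trip
farm_model = AdaptiveModel.convert_from_transformers(
    "deepset/bert-large-uncased-whole-word-masking-squad2",
    device="cpu",
    task_type="question_answering")
hf_model = farm_model.convert_to_transformers()
# hf_model (or hf_model[0] on newer FARM versions) can then be saved via save_pretrained()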
Example #4
def evaluate_question_answering():
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = True

    data_dir = Path("../data/squad20")
    evaluation_filename = "dev-v2.0.json"

    batch_size = 50
    no_ans_boost = 0
    accuracy_at = 3  # accuracy at n is useful for answers inside long documents

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    # 3. Create a DataSilo that loads the dataset, provides DataLoaders for it and calculates a few descriptive statistics of our dataset
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 5. Load model
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    # use "load" if you want to use a local model that was trained with FARM
    #model = AdaptiveModel.load(lang_model, device=device)
    model.prediction_heads[0].no_ans_boost = no_ans_boost
    model.prediction_heads[0].n_best = accuracy_at
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    # 6. Run the Evaluator
    results = evaluator.eval(model)
    f1_score = results[0]["f1"]
    em_score = results[0]["EM"]
    tnacc = results[0]["top_n_accuracy"]
    print("F1-Score:", f1_score)
    print("Exact Match Score:", em_score)
    print(f"top_{accuracy_at}_accuracy:", tnacc)
Example #5
def evaluate_classification():
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/bert-base-german-cased-sentiment-Germeval17"
    do_lower_case = False
    batch_size = 100

    data_dir = Path("../data/germeval17")
    evaluation_filename = "test_TIMESTAMP1.tsv"
    label_list = ["negative", "neutral", "positive"]
    metric = "f1_macro"

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2017 data automatically if it is not available locally.

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
    )

    # 3. Create a DataSilo that loads the dataset, provides DataLoaders for it and calculates a few descriptive statistics of our dataset
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an Evaluator
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )

    # 5. Load model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="text_classification")
    # use "load" if you want to use a local model that was trained with FARM
    # model = AdaptiveModel.load(lang_model, device=device)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    # 6. Run the Evaluator
    results = evaluator.eval(model)
    f1_score = results[0]["f1_macro"]
    print("Macro-averaged F1-Score:", f1_score)
Example #6
    def train_on_split(silo_to_use, n_fold):
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")

        # fine-tune pre-trained question-answering model
        model = AdaptiveModel.convert_from_transformers(
            lang_model, device=device, task_type="question_answering")
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)
        # If positive, this will boost "No Answer" as a prediction.
        # If negative, this will prevent the model from giving "No Answer" as a prediction.
        model.prediction_heads[0].no_ans_boost = no_ans_boost
        # Number of predictions the model will make per question.
        # The multiple predictions are used for evaluating top n recall.
        model.prediction_heads[0].n_best = accuracy_at

        # # or train question-answering models from scratch
        # # Create an AdaptiveModel
        # # a) which consists of a pretrained language model as a basis
        # language_model = LanguageModel.load(lang_model)
        # # b) and a prediction head on top that is suited for our task => Question-answering
        # prediction_head = QuestionAnsweringHead(no_ans_boost=no_ans_boost, n_best=accuracy_at)
        # model = AdaptiveModel(
        #    language_model=language_model,
        #    prediction_heads=[prediction_head],
        #    embeds_dropout_prob=0.1,
        #    lm_output_types=["per_token"],
        #    device=device,)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=learning_rate,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # An EarlyStopping instance could also be created here and passed on to the Trainer (not shown in this snippet)

        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model
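train_on_split is meant to be driven by an outer cross-validation loop that produces one data silo per fold. A minimal sketch of that loop, assuming FARM's DataSiloForCrossVal helper and the names (data_silo, xval_folds) defined elsewhere in the original example:

    # assumed import path: from farm.data_handler.data_silo import DataSiloForCrossVal
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo_to_use=silo, n_fold=num_fold)
        # ... evaluate on silo.get_data_loader("test") and aggregate the per-fold results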
Example #7
def onnx_adaptive_model_qa(use_gpu, num_processes):
    model_name_or_path = "deepset/bert-base-cased-squad2"
    onnx_model_export_path = Path("benchmarks/onnx-export")
    if not (onnx_model_export_path / "model.onnx").is_file():
        model = AdaptiveModel.convert_from_transformers(
            model_name_or_path, device="cpu", task_type="question_answering")
        model.convert_to_onnx(onnx_model_export_path)

    model = Inferencer.load(onnx_model_export_path,
                            task_type="question_answering",
                            batch_size=1,
                            num_processes=num_processes,
                            gpu=use_gpu)

    return model
Example #8
def test_conversion_adaptive_model(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    model = AdaptiveModel.convert_from_transformers(
        "deepset/bert-base-cased-squad2",
        device="cpu",
        task_type="question_answering")
    transformer_model = model.convert_to_transformers()
    transformer_model2 = AutoModelForQuestionAnswering.from_pretrained(
        "deepset/bert-base-cased-squad2")
    # compare weights
    for p1, p2 in zip(transformer_model.parameters(),
                      transformer_model2.parameters()):
        assert (p1.data.ne(p2.data).sum() == 0)
Example #9
def onnx_adaptive_model_qa(
        use_gpu,
        num_processes,
        model_name_or_path="deepset/bert-base-cased-squad2"):
    if (Path(model_name_or_path) /
            "model.onnx").is_file():  # load model directly if in ONNX format
        onnx_model_path = model_name_or_path
    else:  # convert to ONNX format
        onnx_model_path = Path("benchmarks/onnx-export")
        model = AdaptiveModel.convert_from_transformers(
            model_name_or_path, device="cpu", task_type="question_answering")
        model.convert_to_onnx(onnx_model_path)

    model = Inferencer.load(onnx_model_path,
                            task_type="question_answering",
                            batch_size=1,
                            num_processes=num_processes,
                            gpu=use_gpu)

    return model
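A short usage sketch for the helper above: the returned Inferencer behaves like any other FARM Inferencer, so QA inputs can be passed to inference_from_dicts (the question/context pair below reuses the Twilight Princess sample from Example #1):

# load (and, if needed, first convert) the ONNX model, with multiprocessing disabled
inferencer = onnx_adaptive_model_qa(use_gpu=False, num_processes=0)
qa_input = [{
    "questions": ["Who counted the game among the best ever made?"],
    "text": "GameTrailers in their review called it one of the greatest games ever created.",
}]
print(inferencer.inference_from_dicts(dicts=qa_input))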
Example #10
config = None
with open(configFile) as file:
    config = yaml.safe_load(file)

modelName = config["model"]
sqlUrl = config["sqlUrl"]
modelDir = config["modelDir"]
doc_dir = os.path.join(dirname, config["docDir"])

# download and save the model
from farm.modeling.adaptive_model import AdaptiveModel
from farm.data_handler.processor import Processor

model = AdaptiveModel.convert_from_transformers(modelName,
                                                device="cpu",
                                                task_type="question_answering")
processor = Processor.convert_from_transformers(modelName,
                                                task_type="question_answering",
                                                max_seq_len=384,
                                                doc_stride=128)
model.save(modelDir)
processor.save(modelDir)

# try:
# or configure the cache_dir in from_pretrained; don't rely on the cache, the model must be saved explicitly with save()
# model = TransformersReader(model_name_or_path=modelName, use_gpu=-1)
# model = FARMReader(model_name_or_path=modelName,
#                                use_gpu=False, no_ans_boost=0)
# except:
#   pass;
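Once model and processor are stored in FARM format, they can be loaded again straight from modelDir without repeating the conversion. A minimal sketch, with batch_size and gpu picked arbitrarily for illustration:

from farm.infer import Inferencer

# load model, tokenizer and processor from the directory written by model.save()/processor.save()
reader = Inferencer.load(modelDir,
                         task_type="question_answering",
                         batch_size=16,
                         gpu=False)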
Example #11
def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = True

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=50)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 77.7478
    gold_f1 = 82.1557
    gold_tnrecall = 84.0646  # top 1 recall
    gold_elapsed = 70  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnrecall,
            gold_tnrecall,
            rtol=0.001,
            err_msg=f"FARM Eval changed for top 1 recall by: {em_score-gold_EM}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=50,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename)

    elapsed = time() - starttime

    os.makedirs("../testsave", exist_ok=True)
    write_squad_predictions(predictions=result,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 78.4890
    gold_f1 = 81.7104
    gold_elapsed = 66  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
Example #12
    def load(cls,
             model_name_or_path,
             batch_size=4,
             gpu=False,
             task_type=None,
             return_class_probs=False,
             strict=True,
             max_seq_len=256):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification". More coming soon...
        :type task_type: str
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :return: An instance of the Inferencer.

        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = AdaptiveModel.load(model_name_or_path,
                                       device,
                                       strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

        # b) or from remote transformers model hub
        else:
            logger.info(
                f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
            )
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path, device, task_type)
            config = AutoConfig.from_pretrained(model_name_or_path)
            tokenizer = Tokenizer.load(model_name_or_path)

            # TODO infer task_type automatically from config (if possible)
            if task_type == "question_answering":
                processor = SquadProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    label_list=["start_token", "end_token"],
                    metric="squad",
                    data_dir=None,
                )
            elif task_type == "embeddings":
                processor = InferenceProcessor(tokenizer=tokenizer,
                                               max_seq_len=max_seq_len)

            elif task_type == "text_classification":
                label_list = list(config.id2label[id]
                                  for id in range(len(config.id2label)))
                processor = TextClassificationProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    data_dir=None,
                    label_list=label_list,
                    label_column_name="label",
                    metric="acc",
                    quote_char='"',
                )

            # elif task_type == "multilabel-classification":
            #     # label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
            #     label_list = list(config.label2id.keys())
            #
            #     processor = TextClassificationProcessor(tokenizer=tokenizer,
            #                                             max_seq_len=max_seq_len,
            #                                             data_dir=None,
            #                                             label_list=label_list,
            #                                             label_column_name="label",
            #                                             metric="acc",
            #                                             quote_char='"',
            #                                             multilabel=True,
            #                                             )

            elif task_type == "ner":
                label_list = list(config.label2id.keys())
                processor = NERProcessor(tokenizer=tokenizer,
                                         max_seq_len=max_seq_len,
                                         data_dir=None,
                                         metric="seq_f1",
                                         label_list=label_list)
            else:
                raise ValueError(
                    f"`task_type` {task_type} is not supported yet. "
                    f"Valid options for arg `task_type`: 'question_answering', 'embeddings', 'text_classification'"
                )

        return cls(
            model,
            processor,
            batch_size=batch_size,
            gpu=gpu,
            name=name,
            return_class_probs=return_class_probs,
        )
Example #13
    def load(
        cls,
        model_name_or_path,
        batch_size=4,
        gpu=False,
        task_type=None,
        return_class_probs=False,
        strict=True,
        max_seq_len=256,
        doc_stride=128,
        extraction_layer=None,
        extraction_strategy=None,
        s3e_stats=None,
        num_processes=None,
        disable_tqdm=False,
        tokenizer_class=None,
        use_fast=False,
        tokenizer_args=None,
        dummy_ph=False,
        benchmarking=False,
    ):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample
        :type max_seq_len: int
        :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you want to
                              debug the Language Model, you might need to disable multiprocessing!
                              **Warning!** If you use multiprocessing you have to close the
                              `multiprocessing.Pool` again! To do so call
                              :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                              done using this class. The garbage collector will not do this for you!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
        :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
            See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer documentation
            on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
        :type tokenizer_args: dict
        :type use_fast: bool
        :param dummy_ph: If True, methods of the prediction head will be replaced
                             with a dummy method. This is used to isolate lm run time from ph run time.
        :type dummy_ph: bool
        :param benchmarking: If True, a benchmarking object will be initialised within the class and
                             certain parts of the code will be timed for benchmarking. Should be kept
                             False if not benchmarking since these timing checkpoints require synchronization
                             of the asynchronous Pytorch operations and may slow down the model.
        :type benchmarking: bool
        :return: An instance of the Inferencer.

        """
        if tokenizer_args is None:
            tokenizer_args = {}

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = BaseAdaptiveModel.load(load_dir=model_name_or_path,
                                           device=device,
                                           strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

            # override processor attributes loaded from config file with inferencer params
            processor.max_seq_len = max_seq_len
            if hasattr(processor, "doc_stride"):
                processor.doc_stride = doc_stride

        # b) or from remote transformers model hub
        else:
            logger.info(
                f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
            )
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path, device, task_type)
            config = AutoConfig.from_pretrained(model_name_or_path)
            tokenizer = Tokenizer.load(
                model_name_or_path,
                tokenizer_class=tokenizer_class,
                use_fast=use_fast,
                **tokenizer_args,
            )

            # TODO infer task_type automatically from config (if possible)
            if task_type == "question_answering":
                processor = SquadProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    label_list=["start_token", "end_token"],
                    metric="squad",
                    data_dir="data",
                    doc_stride=doc_stride)
            elif task_type == "embeddings":
                processor = InferenceProcessor(tokenizer=tokenizer,
                                               max_seq_len=max_seq_len)

            elif task_type == "text_classification":
                label_list = list(config.id2label[id]
                                  for id in range(len(config.id2label)))
                processor = TextClassificationProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    data_dir="data",
                    label_list=label_list,
                    label_column_name="label",
                    metric="acc",
                    quote_char='"',
                )
            elif task_type == "ner":
                label_list = list(config.label2id.keys())
                processor = NERProcessor(tokenizer=tokenizer,
                                         max_seq_len=max_seq_len,
                                         data_dir="data",
                                         metric="seq_f1",
                                         label_list=label_list)
            else:
                raise ValueError(
                    f"`task_type` {task_type} is not supported yet. "
                    f"Valid options for arg `task_type`: 'question_answering', "
                    f"'embeddings', 'text_classification', 'ner'")

        if not isinstance(model, ONNXAdaptiveModel):
            model, _ = optimize_model(model=model,
                                      device=device,
                                      local_rank=-1,
                                      optimizer=None)
        return cls(model,
                   processor,
                   task_type=task_type,
                   batch_size=batch_size,
                   gpu=gpu,
                   name=name,
                   return_class_probs=return_class_probs,
                   extraction_strategy=extraction_strategy,
                   extraction_layer=extraction_layer,
                   s3e_stats=s3e_stats,
                   num_processes=num_processes,
                   disable_tqdm=disable_tqdm,
                   benchmarking=benchmarking,
                   dummy_ph=dummy_ph)
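The docstring's multiprocessing warning deserves a concrete pattern: unless num_processes=0, the Inferencer keeps a multiprocessing.Pool open that the garbage collector will not close for you. A minimal sketch, reusing a model name and file from the other examples (the file path is assumed to point at a local copy of the SQuAD 2.0 dev set):

nlp = Inferencer.load("deepset/roberta-base-squad2",
                      task_type="question_answering",
                      num_processes=2,
                      gpu=False)
try:
    result = nlp.inference_from_file(file="dev-v2.0.json")
finally:
    # explicitly release the worker pool (see the warning in the docstring above)
    nlp.close_multiprocessing_pool()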
Example #14
    def load(
        cls,
        model_name_or_path,
        revision=None,
        batch_size=4,
        gpu=False,
        task_type=None,
        return_class_probs=False,
        strict=True,
        max_seq_len=256,
        doc_stride=128,
        extraction_layer=None,
        extraction_strategy=None,
        s3e_stats=None,
        num_processes=None,
        disable_tqdm=False,
        tokenizer_class=None,
        use_fast=True,
        tokenizer_args=None,
        multithreading_rust=True,
        dummy_ph=False,
        benchmarking=False,
    ):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample
        :type max_seq_len: int
        :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you want to
                              debug the Language Model, you might need to disable multiprocessing!
                              **Warning!** If you use multiprocessing you have to close the
                              `multiprocessing.Pool` again! To do so call
                              :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                              done using this class. The garbage collector will not do this for you!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
        :type use_fast: bool
        :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
            See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer documentation
            on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
        :type tokenizer_args: dict
        :param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
                                    Note: Enabling multithreading in Rust AND multiprocessing in python might cause
                                    deadlocks.
        :type multithreading_rust: bool
        :param dummy_ph: If True, methods of the prediction head will be replaced
                             with a dummy method. This is used to isolate lm run time from ph run time.
        :type dummy_ph: bool
        :param benchmarking: If True, a benchmarking object will be initialised within the class and
                             certain parts of the code will be timed for benchmarking. Should be kept
                             False if not benchmarking since these timing checkpoints require synchronization
                             of the asynchronous Pytorch operations and may slow down the model.
        :type benchmarking: bool
        :return: An instance of the Inferencer.

        """
        if tokenizer_args is None:
            tokenizer_args = {}

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = BaseAdaptiveModel.load(load_dir=model_name_or_path,
                                           device=device,
                                           strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

        # b) or from remote transformers model hub
        else:
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path,
                revision=revision,
                device=device,
                task_type=task_type)
            processor = Processor.convert_from_transformers(
                model_name_or_path,
                revision=revision,
                task_type=task_type,
                max_seq_len=max_seq_len,
                doc_stride=doc_stride,
                tokenizer_class=tokenizer_class,
                tokenizer_args=tokenizer_args,
                use_fast=use_fast)

        # override processor attributes loaded from config or HF with inferencer params
        processor.max_seq_len = max_seq_len
        processor.multithreading_rust = multithreading_rust
        if hasattr(processor, "doc_stride"):
            assert doc_stride < max_seq_len, "doc_stride is longer than max_seq_len. This means that there will be gaps " \
                                             "as the passage windows slide, causing the model to skip over parts of the document. " \
                                             "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384) "
            processor.doc_stride = doc_stride

        return cls(model,
                   processor,
                   task_type=task_type,
                   batch_size=batch_size,
                   gpu=gpu,
                   name=name,
                   return_class_probs=return_class_probs,
                   extraction_strategy=extraction_strategy,
                   extraction_layer=extraction_layer,
                   s3e_stats=s3e_stats,
                   num_processes=num_processes,
                   disable_tqdm=disable_tqdm,
                   benchmarking=benchmarking,
                   dummy_ph=dummy_ph)
Example #15
def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = False

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1
    model.prediction_heads[0].n_best_per_sample = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=40 * n_gpu_factor)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    model, _ = optimize_model(model=model,
                              device=device,
                              local_rank=-1,
                              optimizer=None,
                              distributed=False,
                              use_amp=None)

    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 78.4721
    gold_f1 = 82.6671
    gold_tnacc = 84.3594  # top 1 accuracy
    gold_elapsed = 40  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnacc,
            rtol=0.001,
            err_msg=
            f"FARM Eval changed for top 1 accuracy by: {tnacc-gold_tnacc}")
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for f1 score by: {round(f1_score - gold_f1, 4)}"
        )
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnacc, rtol=0.001):
        error_messages.append(
            f"FARM Eval changed for top 1 accuracy by: {round(tnacc-gold_tnacc, 4)}"
        )
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Eval speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds"
        )

    benchmark_result = [{
        "run": "FARM internal evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnacc, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnacc,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result[0]) + "\n")

    # # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=40 * n_gpu_factor,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename,
                                       return_json=False,
                                       multiprocessing_chunksize=80)
    results_squad = [x.to_squad_eval() for x in result]
    model.close_multiprocessing_pool()

    elapsed = time() - starttime

    os.makedirs("../testsave", exist_ok=True)
    write_squad_predictions(predictions=results_squad,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 79.878
    gold_f1 = 82.917
    gold_elapsed = 27  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
    if not np.allclose(f1_score, gold_f1, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for f1 score by: {round(f1_score - gold_f1, 4)}"
        )
    if not np.allclose(em_score, gold_EM, rtol=0.001):
        error_messages.append(
            f"Eval with official script changed for EM by: {round(em_score - gold_EM, 4)}"
        )
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"Inference speed changed significantly by: {round(elapsed - gold_elapsed,4)} seconds"
        )

    benchmark_result.append({
        "run": "outside eval script",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": "-",
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": "-",
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": "-",
        "elapsed_gold": gold_elapsed
    })
    logger.info("\n\n" + pformat(benchmark_result[1]) + "\n")
    return benchmark_result
Example #16
    def load(cls,
             model_name_or_path,
             batch_size=4,
             gpu=False,
             task_type=None,
             return_class_probs=False,
             strict=True,
             max_seq_len=256,
             doc_stride=128,
             extraction_layer=None,
             extraction_strategy=None,
             s3e_stats=None,
             num_processes=None,
             disable_tqdm=False):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample
        :type max_seq_len: int
        :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer use all CPU cores. If you want to
                              debug the Language Model, you might need to disable multiprocessing!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :return: An instance of the Inferencer.

        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = BaseAdaptiveModel.load(load_dir=model_name_or_path,
                                           device=device,
                                           strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

            # override processor attributes loaded from config file with inferencer params
            processor.max_seq_len = max_seq_len
            if hasattr(processor, "doc_stride"):
                processor.doc_stride = doc_stride

        # b) or from remote transformers model hub
        else:
            logger.info(
                f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
            )
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path, device, task_type)
            config = AutoConfig.from_pretrained(model_name_or_path)
            tokenizer = Tokenizer.load(model_name_or_path)

            # TODO infer task_type automatically from config (if possible)
            if task_type == "question_answering":
                processor = SquadProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    label_list=["start_token", "end_token"],
                    metric="squad",
                    data_dir="data",
                    doc_stride=doc_stride)
            elif task_type == "embeddings":
                processor = InferenceProcessor(tokenizer=tokenizer,
                                               max_seq_len=max_seq_len)

            elif task_type == "text_classification":
                label_list = list(config.id2label[id]
                                  for id in range(len(config.id2label)))
                processor = TextClassificationProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    data_dir="data",
                    label_list=label_list,
                    label_column_name="label",
                    metric="acc",
                    quote_char='"',
                )
            elif task_type == "ner":
                label_list = list(config.label2id.keys())
                processor = NERProcessor(tokenizer=tokenizer,
                                         max_seq_len=max_seq_len,
                                         data_dir="data",
                                         metric="seq_f1",
                                         label_list=label_list)
            else:
                raise ValueError(
                    f"`task_type` {task_type} is not supported yet. "
                    f"Valid options for arg `task_type`: 'question_answering', "
                    f"'embeddings', 'text_classification', 'ner'")

        return cls(model,
                   processor,
                   task_type=task_type,
                   batch_size=batch_size,
                   gpu=gpu,
                   name=name,
                   return_class_probs=return_class_probs,
                   extraction_strategy=extraction_strategy,
                   extraction_layer=extraction_layer,
                   s3e_stats=s3e_stats,
                   num_processes=num_processes,
                   disable_tqdm=disable_tqdm)
Example #17
def question_answering_confidence():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False
    batch_size = 80

    data_dir = Path("../data/squad20")
    # We use the same file for dev and test set only for demo purposes
    dev_filename = "dev-v2.0.json"
    test_filename = "dev-v2.0.json"
    accuracy_at = 3 # accuracy at n is useful for answers inside long documents


    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=dev_filename,
        test_filename=test_filename,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)


    # 4. Load pre-trained question-answering model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    # Number of predictions the model will make per question.
    # The multiple predictions are used for evaluating top-n recall.
    model.prediction_heads[0].n_best = accuracy_at

    # 5. Calibrating the model's confidence scores tunes a single parameter called temperature,
    # which can be accessed through the prediction_head.
    # In the forward pass, each logit is divided by this temperature, and a softmax is applied to
    # the rescaled logits afterward to get confidence scores in the range [0, 1].
    # A temperature larger than 1 decreases the model's confidence scores.
    logger.info(f"Parameter used for temperature scaling of model confidence scores: {model.prediction_heads[0].temperature_for_confidence}")

    # 6a. We can either manually set the temperature (default value is 1.0)...
    model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter((torch.ones(1) * 1.0).to(device=device))

    # 6b. ...or we can run the evaluator on the dev set and use it to calibrate confidence scores with a technique called temperature scaling.
    # It will align the confidence scores with the model's accuracy based on the dev set data by tuning the temperature parameter.
    # During the calibration, this parameter is automatically set internally as an attribute of the prediction head.
    evaluator_dev = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_dev = evaluator_dev.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True)
    # evaluator_dev.log_results(result_dev, "Dev", logging=False, steps=len(data_silo.get_data_loader("dev")))

    # 7. Optionally, run the evaluator on the test set to see how well the confidence scores are aligned with the model's accuracy
    evaluator_test = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_test = evaluator_test.eval(model, return_preds_and_labels=True)[0]
    logger.info("Grouping predictions by confidence score and calculating metrics for each bin.")
    em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(result_test["preds"], result_test["labels"], num_bins=10)
    for bin_number in range(10):
        logger.info(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}")

    # 8. Hooray! You have a model with calibrated confidence scores.
    # When you store the model, the temperature parameter is saved automatically as an attribute of the prediction head.
    save_dir = Path("../saved_models/qa-confidence-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. When making a prediction with the calibrated model, we can filter out predictions where the model is not confident enough.
    # To this end, load the stored model, which will automatically load the stored temperature parameter.
    # The confidence scores are automatically adjusted based on this temperature parameter.
    # For each prediction, we can check the model's confidence and decide whether to output the prediction or not.
    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}")

    QA_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }]
    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
    if result.prediction[0].confidence > 0.9:
        print(result.prediction[0].answer)
    else:
        print("The confidence is not high enough to give an answer.")
Beispiel #18
0
import os
import shutil

import yaml

# get documents that we want to query
dirname = os.path.dirname(__file__)
configFile = os.path.join(dirname, 'config.yaml')

with open(configFile) as file:
    config = yaml.safe_load(file)

sqlUrlFAQ = config["sqlUrlFAQ"]


model = AdaptiveModel.convert_from_transformers(
    "deepset/sentence_bert", device="cpu", task_type="embeddings")
processor = Processor.convert_from_transformers(
    "deepset/sentence_bert", task_type="embeddings", max_seq_len=384, doc_stride=128)
model.save("sentence_bert-saved")
processor.save("sentence_bert-saved")


document_store = FAISSDocumentStore(sql_url=sqlUrlFAQ)


# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#                                   max_seq_len_query=64,
#                                   max_seq_len_passage=256,