def import_downstream_models():
    """Store a SQuAD-finetuned BERT checkpoint as a FARM AdaptiveModel.

    The language model and the question-answering head are both loaded from
    the same pretrained checkpoint name, combined into an ``AdaptiveModel``
    and saved. A ``SquadProcessor`` built around the matching tokenizer is
    saved to the same directory so the result can be used for inference.
    """
    checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
    target_dir = "saved_models/FARM-bert-large-uncased-whole-word-masking-finetuned-squad"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Language model + QA head from the same checkpoint, wrapped and saved.
    adaptive_model = AdaptiveModel(
        language_model=Bert.load(checkpoint),
        prediction_heads=[QuestionAnsweringHead.load(checkpoint)],
        embeds_dropout_prob=0.1,
        lm_output_types="per_token",
        device=device,
    )
    adaptive_model.save(target_dir)

    # Save the processor next to the model.
    # TODO load HF's tokenizer_config.json and adjust settings
    squad_processor = SquadProcessor(
        tokenizer=BertTokenizer.from_pretrained(
            pretrained_model_name_or_path=checkpoint),
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        data_dir="../data/squad20",
    )
    squad_processor.save(target_dir)
Example #2
0
def test_qa(caplog):
    """Train a small BERT QA model on the sample data and save it.

    Runs a single epoch over the files in ``samples/qa`` with very short
    sequence lengths, then writes the trained model and its processor to
    ``testsave/qa``.

    :param caplog: pytest log-capture fixture; its level is raised to CRITICAL.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lm_name = "bert-base-cased"
    squad_labels = ["start_token", "end_token"]
    epochs = 1

    # Data pipeline: tokenizer -> processor -> data silo.
    processor = SquadProcessor(
        tokenizer=Tokenizer.load(pretrained_model_name_or_path=lm_name,
                                 do_lower_case=False),
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=squad_labels,
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2)

    # Model: pretrained language model with a QA head on top.
    model = AdaptiveModel(
        language_model=LanguageModel.load(lm_name),
        prediction_heads=[
            QuestionAnsweringHead(layer_dims=[768, len(squad_labels)])
        ],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
        device=device,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=4,
        device=device,
    )
    model = trainer.train(model)

    # Persist model and processor side by side.
    out_dir = "testsave/qa"
    model.save(out_dir)
    processor.save(out_dir)
Example #3
0
def test_qa(caplog):
    """Train a small BERT QA model on CPU, save it, and run one inference.

    After one epoch on the ``samples/qa`` files the model and processor are
    written to ``testsave/qa``, reloaded through ``Inferencer`` and queried
    with a single question. The test asserts that the first answer's
    ``offset_start`` is an ``int``.

    :param caplog: pytest log-capture fixture; its level is raised to CRITICAL.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    lm_name = "bert-base-cased"
    squad_labels = ["start_token", "end_token"]
    epochs = 1

    # Data pipeline: tokenizer -> processor -> data silo.
    processor = SquadProcessor(
        tokenizer=Tokenizer.load(pretrained_model_name_or_path=lm_name,
                                 do_lower_case=False),
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=squad_labels,
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2)

    # Model: pretrained language model with a QA head on top.
    model = AdaptiveModel(
        language_model=LanguageModel.load(lm_name),
        prediction_heads=[
            QuestionAnsweringHead(layer_dims=[768, len(squad_labels)])
        ],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=4,
        device=device,
    )
    model = trainer.train(model)

    # Persist model and processor side by side.
    out_dir = "testsave/qa"
    model.save(out_dir)
    processor.save(out_dir)

    # Reload through the Inferencer and ask one question.
    question_dicts = [{
        "questions": ["In what country is Normandy"],
        "text": "The Normans gave their name to Normandy, a region in France.",
    }]
    inferencer = Inferencer.load(out_dir)
    result = inferencer.inference_from_dicts(dicts=question_dicts,
                                             use_multiprocessing=False)

    first_answer = result[0]["predictions"][0]["answers"][0]
    assert isinstance(first_answer["offset_start"], int)
Example #4
0
def question_answering():
    """Fine-tune roberta-base on SQuAD 2.0, save it, and run inference.

    Steps: configure logging and MLflow tracking, build the tokenizer,
    processor and data silo, train an ``AdaptiveModel`` with a
    question-answering head, save model and processor, answer one sample
    question, and finally predict on the whole dev file and write the
    predictions to ``predictions.json``.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM",
                            run_name="Run_question_answering")

    # ---- Settings ----
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "roberta-base"
    epochs = 2

    # Tokenizer (do_lower_case=False: roberta is a cased model).
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # Processor: converts raw SQuAD files into a pytorch Dataset.
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename="train-v2.0.json",
        dev_filename="dev-v2.0.json",
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # DataSilo: loads the datasets and provides DataLoaders for them.
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that
    # they are calculated on a token level instead of a word level.
    data_silo = DataSilo(processor=processor,
                         batch_size=24,
                         distributed=False)

    # AdaptiveModel: pretrained language model plus a QA prediction head.
    model = AdaptiveModel(
        language_model=LanguageModel.load(lang_model),
        prediction_heads=[QuestionAnsweringHead()],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # Optimizer with a linear warmup learning-rate schedule.
    warmup_schedule = {"name": "LinearWarmup", "warmup_proportion": 0.2}
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts=warmup_schedule,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
        device=device,
    )

    # Trainer: runs the training loop and evaluates periodically.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=2000,
        device=device,
    )
    # Tracked metrics can be watched on https://public-mlflow.deepset.ai
    trainer.train()

    # Store the trained model together with its processor.
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Inference on a single question.
    sample_input = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
    }]
    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    single_result = inferencer.inference_from_dicts(dicts=sample_input)[0]
    pprint.pprint(single_result)

    # Inference on the whole dev file; write the predictions to disk.
    dev_path = os.path.join(processor.data_dir, processor.dev_filename)
    dev_predictions = inferencer.inference_from_file(file=dev_path,
                                                     return_json=False)
    squad_formatted = [pred.to_squad_eval() for pred in dev_predictions]

    write_squad_predictions(predictions=squad_formatted,
                            predictions_filename=dev_path,
                            out_filename="predictions.json")
def train_evaluation_single(seed=42):
    """Train roberta-base on SQuAD 2.0 once and compare results to gold values.

    Trains for two epochs, saves the model and processor, evaluates on the
    dev data loader, and asserts that F1, EM, top-n recall and the measured
    training time stay close to the recorded gold values.

    :param seed: random seed passed to ``set_all_seeds``.
    :type seed: int
    :raises AssertionError: if a score deviates from its gold value by more
        than 1% (relative), or the elapsed time by more than 10%.
    """
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 32 * 4  # 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # Only the training call itself is timed.
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    # Scores are scaled by 100 to match the gold values below.
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721  #
    gold_elapsed = 1286.30
    np.testing.assert_allclose(
        f1_score,
        gold_f1,
        rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score,
        gold_EM,
        rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    # Report the recall delta here; the original message printed the EM delta.
    np.testing.assert_allclose(
        tnrecall,
        gold_tnrecall,
        rtol=0.01,
        err_msg=
        f"FARM Training changed for top 1 recall by: {tnrecall - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed,
        gold_elapsed,
        rtol=0.1,
        err_msg=
        f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
    )
Example #6
0
def test_qa(caplog):
    """Train a small BERT QA model on CPU, save it, and run one inference.

    Uses ``BertTokenizer`` and ``Bert`` directly. After one epoch on the
    ``samples/qa`` files the model and processor are saved to
    ``testsave/qa``, reloaded through ``Inferencer`` and queried once. The
    test asserts that the first prediction's ``end`` is an ``int``.

    :param caplog: pytest log-capture fixture; its level is raised to CRITICAL.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    lm_name = "bert-base-cased"
    squad_labels = ["start_token", "end_token"]
    epochs = 1

    # Data pipeline: tokenizer -> processor -> data silo.
    processor = SquadProcessor(
        tokenizer=BertTokenizer.from_pretrained(
            pretrained_model_name_or_path=lm_name, do_lower_case=False),
        max_seq_len=16,
        max_query_length=4,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        labels=squad_labels,
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2)

    # Model: pretrained BERT with a QA head on top.
    model = AdaptiveModel(
        language_model=Bert.load(lm_name),
        prediction_heads=[
            QuestionAnsweringHead(layer_dims=[768, len(squad_labels)])
        ],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=4,
        device=device,
    )
    model = trainer.train(model)

    # Persist model and processor side by side.
    out_dir = "testsave/qa"
    model.save(out_dir)
    processor.save(out_dir)

    # Reload through the Inferencer and ask one question.
    question_dicts = [{
        "questions": ["In what country is Normandy located?"],
        "text": 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
    }]
    inferencer = Inferencer.load(out_dir)
    result = inferencer.run_inference(dicts=question_dicts)

    first_prediction = result[0]["predictions"][0]
    assert isinstance(first_prediction["end"], int)
Example #7
0
    def convert_to_onnx(self,
                        output_path,
                        opset_version=11,
                        optimize_for=None):
        """
        Convert a PyTorch AdaptiveModel to ONNX.

        The conversion is trace-based by performing a forward pass on the model with a input batch.
        The exported graph is written to ``output_path / "model.onnx"``; the prediction head
        configs, the processor and a small ``model_config.json`` are stored next to it.

        :param output_path: model dir to write the model and config files
        :type output_path: Path
        :param opset_version: ONNX opset version
        :type opset_version: int
        :param optimize_for: optimize the exported model for a target device. Available options
                             are "gpu_tensor_core" (GPUs with tensor core like V100 or T4),
                             "gpu_without_tensor_core" (most other GPUs), and "cpu".
        :type optimize_for: str
        :raises NotImplementedError: if the first prediction head is not a QuestionAnsweringHead,
                                     or if ``optimize_for`` is not one of the supported options.
        :return:
        """
        # Imported locally so the method does not depend on a module-level import of Path.
        from pathlib import Path

        if type(self.prediction_heads[0]) is not QuestionAnsweringHead:
            raise NotImplementedError(
                "ONNX conversion is only implemented for models whose first prediction head "
                "is a QuestionAnsweringHead.")

        # Accept plain strings as well; all paths below are derived from this one directory.
        output_path = Path(output_path)
        onnx_model_path = output_path / 'model.onnx'

        # NOTE(review): the tokenizer is always loaded from this fixed checkpoint, independent
        # of the model being exported -- confirm this is intended for non-BERT models.
        tokenizer = Tokenizer.load(
            pretrained_model_name_or_path="deepset/bert-base-cased-squad2")

        label_list = ["start_token", "end_token"]
        metric = "squad"
        max_seq_len = 384
        batch_size = 1
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename=
            "stub-file",  # the data is loaded from dicts instead of file.
            dev_filename=None,
            test_filename=None,
            data_dir="stub-dir",
        )

        data_silo = DataSilo(processor=processor,
                             batch_size=batch_size,
                             distributed=False,
                             automatic_loading=False)
        # A single sample in SQuAD format, used only to produce one batch for tracing.
        sample_dict = [{
            "context":
            'The Normans were the people who in the 10th and 11th centuries gave their name to Normandy, '
            'a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders '
            'and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear '
            'fealty to King Charles III of West Francia.',
            "qas": [{
                "question": "In what country is Normandy located?",
                "id": "56ddde6b9a695914005b9628",
                "answers": [{
                    "text": "France",
                    "answer_start": 159
                }],
                "is_impossible": False,
            }],
        }]

        data_silo._load_data(train_dicts=sample_dict)
        data_loader = data_silo.get_data_loader("train")
        data = next(iter(data_loader))
        data = list(data.values())

        # The first three batch entries are used as input_ids, padding_mask and segment_ids.
        inputs = {
            'input_ids':
            data[0].to(self.device).reshape(batch_size, max_seq_len),
            'padding_mask':
            data[1].to(self.device).reshape(batch_size, max_seq_len),
            'segment_ids':
            data[2].to(self.device).reshape(batch_size, max_seq_len)
        }

        # The method argument passing in torch.onnx.export is different to AdaptiveModel's forward().
        # To resolve that, an ONNXWrapper instance is used.
        model = ONNXWrapper.load_from_adaptive_model(self)

        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(output_path, exist_ok=True)

        with torch.no_grad():
            symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
            torch.onnx.export(
                model,
                args=tuple(inputs.values()),
                f=onnx_model_path,
                opset_version=opset_version,
                do_constant_folding=True,
                input_names=['input_ids', 'padding_mask', 'segment_ids'],
                output_names=['logits'],
                dynamic_axes={
                    'input_ids': symbolic_names,
                    'padding_mask': symbolic_names,
                    'segment_ids': symbolic_names,
                    'logits': symbolic_names,
                })

        if optimize_for:
            # Optimize the file that was just exported, in place. Previously a fixed
            # 'onnx-export/model.onnx' path was used here regardless of output_path.
            # NOTE(review): hidden_size, num_heads and model_type are hard-coded; presumably
            # they must match the exported model -- confirm before using other architectures.
            optimize_args = Namespace(disable_attention=False,
                                      disable_bias_gelu=False,
                                      disable_embed_layer_norm=False,
                                      opt_level=99,
                                      disable_skip_layer_norm=False,
                                      disable_bias_skip_layer_norm=False,
                                      hidden_size=768,
                                      verbose=False,
                                      input=str(onnx_model_path),
                                      model_type='bert',
                                      num_heads=12,
                                      output=str(onnx_model_path))

            if optimize_for == "gpu_tensor_core":
                optimize_args.float16 = True
                optimize_args.input_int32 = True
            elif optimize_for == "gpu_without_tensor_core":
                optimize_args.float16 = False
                optimize_args.input_int32 = True
            elif optimize_for == "cpu":
                optimize_args.float16 = False
                optimize_args.input_int32 = False
            else:
                raise NotImplementedError(
                    f"ONNXRuntime model optimization is not available for {optimize_for}. Choose "
                    f"one of 'gpu_tensor_core'(V100 or T4), 'gpu_without_tensor_core' or 'cpu'."
                )

            optimize_onnx_model(optimize_args)
        else:
            logger.info(
                "Exporting unoptimized ONNX model. To enable optimization, supply "
                "'optimize_for' parameter with the target device.'")

        # PredictionHead contains functionalities like logits_to_preds() that would still be needed
        # for Inference with ONNX models. Only the config of the PredictionHead is stored.
        for i, ph in enumerate(self.prediction_heads):
            ph.save_config(output_path, i)

        processor.save(output_path)

        onnx_model_config = {
            "onnx_opset_version": opset_version,
            "language": self.get_language(),
        }
        with open(output_path / "model_config.json", "w") as f:
            json.dump(onnx_model_config, f)

        logger.info(f"Model exported at path {output_path}")
Example #8
0
def xlmr_qa_demo():
    """Fine-tune xlm-roberta-large on SQuAD 2.0 and optionally predict on MLQA.

    Two local switches control what runs: ``train`` builds, trains and saves
    the model and processor; ``inference`` loads the saved model, predicts on
    the inference file and writes the top prediction per question as well as
    the full prediction lists to JSON files inside the save directory.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xmlr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8
    train = True
    inference = False

    if train:
        # 1.Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)
        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False, max_processes=1)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )
        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)

    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        for x in full_result:
            print(x)
            print()

        # Top prediction per question id, and the complete prediction list per id.
        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        # Context managers close the files deterministically. UTF-8 is set
        # explicitly because ensure_ascii=False writes non-ASCII characters as-is.
        with open(predictions_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4, ensure_ascii=False)
        with open(full_predictions_file, "w", encoding="utf-8") as f:
            json.dump(full_result, f, indent=4, ensure_ascii=False)
Example #9
0
def test_qa(caplog=None):
    """Train a small distilbert QA model and compare two inference input formats.

    After one epoch on the ``samples/qa`` files the model and processor are
    saved to ``testsave/qa`` and reloaded through ``Inferencer``. The same
    question and passage are then submitted once with ``questions``/``text``
    keys and once with ``qas``/``context`` keys; both results must be equal.

    :param caplog: optional pytest log-capture fixture; when given, its level
        is raised to CRITICAL.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    lm_name = "distilbert-base-uncased"
    epochs = 1

    # Data pipeline: tokenizer -> processor -> data silo.
    processor = SquadProcessor(
        tokenizer=Tokenizer.load(pretrained_model_name_or_path=lm_name,
                                 do_lower_case=True),
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir=Path("samples/qa"),
        label_list=["start_token", "end_token"],
        metric="squad",
    )
    data_silo = DataSilo(processor=processor, batch_size=2, max_processes=1)

    # Model: pretrained language model with a QA head on top.
    model = AdaptiveModel(
        language_model=LanguageModel.load(lm_name),
        prediction_heads=[QuestionAnsweringHead()],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=epochs,
        device=device,
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=4,
        device=device,
    )
    trainer.train()

    # Persist model and processor, then reload them for inference.
    out_dir = Path("testsave/qa")
    model.save(out_dir)
    processor.save(out_dir)

    inferencer = Inferencer.load(out_dir,
                                 batch_size=2,
                                 gpu=False,
                                 num_processes=0)

    # Same question and passage, expressed with two different key sets.
    question = "Who counted the game among the best ever made?"
    passage = "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    qa_format_1 = [{"questions": [question], "text": passage}]
    qa_format_2 = [{"qas": [question], "context": passage}]

    result1 = inferencer.inference_from_dicts(dicts=qa_format_1)
    result2 = inferencer.inference_from_dicts(dicts=qa_format_2)
    assert result1 == result2
Example #10
0
    # 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    model = trainer.train(model)

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)

if inference:
    model = Inferencer.load(save_dir, batch_size=32, gpu=True)
    full_result = model.inference_from_file(
        file=inference_file, use_multiprocessing=inference_multiprocessing)

    for x in full_result:
        print(x)
        print()

    result = {r["id"]: r["preds"][0][0] for r in full_result}
    full_result = {r["id"]: r["preds"] for r in full_result}

    json.dump(result,
              open(predictions_file, "w"),
Example #11
0
def question_answering_confidence():
    """Walk through calibrating and using confidence scores of a QA model.

    The steps are: configure logging, load the "deepset/roberta-base-squad2"
    tokenizer and model, build a SquadProcessor/DataSilo over "dev-v2.0.json"
    (used as both dev and test set), calibrate the confidence temperature on
    the dev set, log exact match and average confidence per bin on the test
    set, save model and processor, and finally reload them through
    QAInferencer and print the top answer only if its confidence exceeds 0.9.

    Takes no arguments and returns nothing; results are logged / printed.
    """
    # ---------------------------------------------------------------- logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    # The transformers library is chatty; only let warnings through.
    logging.getLogger("transformers").setLevel(logging.WARNING)

    # --------------------------------------------------------------- settings
    set_all_seeds(seed=42)
    device, _n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    squad_dir = Path("../data/squad20")
    # Dev and test deliberately point at the same file (demo purposes only).
    dev_file = "dev-v2.0.json"
    test_file = "dev-v2.0.json"
    # Accuracy at n is useful for answers inside long documents.
    top_n = 3

    # 1. Tokenizer matching the language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    # 2. Processor that converts raw text into a pytorch Dataset.
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        doc_stride=192,
        label_list=["start_token", "end_token"],
        metric="squad",
        data_dir=squad_dir,
        train_filename=None,
        dev_filename=dev_file,
        test_filename=test_file,
    )

    # 3. DataSilo that loads the datasets and hands out DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=80)

    def build_evaluator(split):
        # One Evaluator per dataset split, sharing tasks and device.
        return Evaluator(
            data_loader=data_silo.get_data_loader(split),
            tasks=data_silo.processor.tasks,
            device=device,
        )

    # 4. Pre-trained question-answering model.
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering"
    )
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    qa_head = model.prediction_heads[0]
    # Number of predictions per question; several are needed for top-n recall.
    qa_head.n_best = top_n

    # 5. Calibration tunes a single parameter on the prediction head, the
    # temperature. Each logit is divided by it in the forward pass before a
    # softmax maps the logits to confidence scores in [0, 1]; a temperature
    # above 1 lowers the model's confidence scores.
    logger.info(f"Parameter used for temperature scaling of model confidence scores: {qa_head.temperature_for_confidence}")

    # 6a. The temperature can be set by hand (1.0 is the default value) ...
    unit_temperature = (torch.ones(1) * 1.0).to(device=device)
    qa_head.temperature_for_confidence = torch.nn.Parameter(unit_temperature)

    # 6b. ... or tuned by evaluating on the dev set with calibration enabled
    # (temperature scaling). The evaluation is run for this effect only: the
    # tuned value is stored on the prediction head, so the returned results
    # are not kept.
    build_evaluator("dev").eval(
        model, return_preds_and_labels=True, calibrate_conf_scores=True
    )

    # 7. Optional: check on the test set how well confidence tracks accuracy.
    test_metrics = build_evaluator("test").eval(model, return_preds_and_labels=True)[0]
    logger.info("Grouping predictions by confidence score and calculating metrics for each bin.")
    em_by_bin, confidence_by_bin, _count_by_bin = metrics_per_bin(
        test_metrics["preds"], test_metrics["labels"], num_bins=10
    )
    for idx in range(10):
        logger.info(f"Bin {idx} - exact match: {em_by_bin[idx]}, average confidence score: {confidence_by_bin[idx]}")

    # 8. Persist the calibrated model; the temperature is saved with the
    # prediction head.
    save_dir = Path("../saved_models/qa-confidence-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Reload for inference. The stored temperature is loaded as well, so
    # confidence scores come out calibrated and can be used to suppress
    # answers the model is unsure about.
    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}")

    qa_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
        }
    ]
    first_result = inferencer.inference_from_dicts(dicts=qa_input, return_json=False)[0]
    top_prediction = first_result.prediction[0]
    if top_prediction.confidence > 0.9:
        print(top_prediction.answer)
    else:
        print("The confidence is not high enough to give an answer.")
def train_evaluation_single(seed=42, *, test_assertions=False):
    """Train a QA model once and compare its scores and runtime to gold values.

    Trains "roberta-base" with a QuestionAnsweringHead for 2 epochs on
    "train-v2.0.json", saves model and processor to "testsave/roberta-qa-dev",
    evaluates on "dev-v2.0.json" and compares f1, EM, top-n accuracy and the
    training wall-clock time against hard-coded gold values.

    Deviations beyond the tolerance are always appended to ``error_messages``.
    The names ``n_gpu_factor``, ``error_messages`` and ``logger`` are not
    defined in this function and are expected to exist at module level.

    :param seed: seed handed to ``set_all_seeds``.
    :param test_assertions: if True, every comparison is additionally enforced
        with ``np.testing.assert_allclose`` so that a deviation raises
        immediately. Defaults to False, which matches the previous hard-coded
        behavior of only recording deviations.
    :return: a one-element list holding a dict with the measured values, the
        gold values and their differences.
    :raises AssertionError: only when ``test_assertions`` is True and a
        measured value is outside its tolerance.
    """
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * n_gpu_factor
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # Only the training call itself is timed; setup and evaluation are not.
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    head_metrics = results[0]
    # Scores are scaled by 100 to be comparable with the gold values below.
    f1_score = head_metrics["f1"] * 100
    em_score = head_metrics["EM"] * 100
    tnacc = head_metrics["top_n_accuracy"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 78.6575  # NOTE(review): the original comment also lists 77.714 here - confirm which is current
    gold_tnrecall = 97.3721
    gold_elapsed = 1135

    # (measured, gold, relative tolerance, message template) for each check.
    # The template placeholder receives the difference measured - gold.
    checks = [
        (f1_score, gold_f1, 0.01,
         "FARM Training changed for f1 score by: {}"),
        (em_score, gold_EM, 0.01,
         "FARM Training changed for EM by: {}"),
        (tnacc, gold_tnrecall, 0.01,
         "FARM Training changed for top 5 accuracy by: {}"),
        (elapsed, gold_elapsed, 0.1,
         "FARM Training speed changed significantly by: {} seconds"),
    ]

    if test_assertions:
        # Strict mode: the first deviation aborts the run with an AssertionError.
        for measured, gold, rtol, template in checks:
            np.testing.assert_allclose(
                measured,
                gold,
                rtol=rtol,
                err_msg=template.format(measured - gold))

    # Lenient mode always runs: deviations are collected in the shared
    # error_messages list with the difference rounded to 4 decimals.
    for measured, gold, rtol, template in checks:
        if not np.allclose(measured, gold, rtol=rtol):
            error_messages.append(template.format(round(measured - gold, 4)))

    benchmark_result = [{
        "run": "train evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnrecall, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnrecall,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result) + "\n")
    return benchmark_result