Example #1
def test_dataset_from_dicts_qa_inference(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = [
        "answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"
    ]

    for model in models:
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=model,
                                   use_fast=True)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(f"samples/qa/{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids, baskets = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=True)
            assert tensor_names == [
                'input_ids', 'padding_mask', 'segment_ids', 'passage_start_t',
                'start_of_word', 'labels', 'id', 'seq_2_start_t', 'span_mask'
            ], f"Processing for {model} has changed."
            assert len(problematic_sample_ids
                       ) == 0, f"Processing for {model} has changed."
            assert baskets[
                0].id_external == '5ad3d560604f3c001a3ff2c8', f"Processing for {model} has changed."
            assert baskets[
                0].id_internal == '1-0', f"Processing for {model} has changed."

            # roberta
            if model == "deepset/roberta-base-squad2":
                assert len(baskets[0].samples[0].tokenized["passage_tokens"]
                           ) == 6, f"Processing for {model} has changed."
                assert len(baskets[0].samples[0].tokenized["question_tokens"]
                           ) == 7, f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == \
                           [0, 6179, 171, 82, 697, 11, 2201, 116, 2, 2, 26795, 2614, 34], \
                        f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == \
                           [0, 6179, 171, 82, 697, 11, 5459, 116, 2, 2, 26795, 2614, 34], \
                        f"Processing for {model} and {sample_type}-testsample has changed."

            # bert
            if model == "deepset/bert-base-cased-squad2":
                assert len(baskets[0].samples[0].tokenized["passage_tokens"]
                           ) == 5, f"Processing for {model} has changed."
                assert len(baskets[0].samples[0].tokenized["question_tokens"]
                           ) == 7, f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == \
                           [101, 1731, 1242, 1234, 1686, 1107, 2123, 136, 102, 3206], \
                        f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == \
                           [101, 1731, 1242, 1234, 1686, 1107, 3206, 136, 102, 3206], \
                        f"Processing for {model} and {sample_type}-testsample has changed."

            # xlm-roberta
            if model == "deepset/xlm-roberta-large-squad2":
                assert len(baskets[0].samples[0].tokenized["passage_tokens"]
                           ) == 7, f"Processing for {model} has changed."
                assert len(baskets[0].samples[0].tokenized["question_tokens"]
                           ) == 7, f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == \
                           [0, 11249, 5941, 3395, 6867, 23, 7270, 32, 2, 2, 10271, 1556], \
                        f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == \
                           [0, 11249, 5941, 3395, 6867, 23, 10271, 32, 2, 2, 10271, 1556], \
                        f"Processing for {model} and {sample_type}-testsample has changed."

            # minilm and electra have same vocab + tokenizer
            if model == "deepset/minilm-uncased-squad2" or model == "deepset/electra-base-squad2":
                assert len(baskets[0].samples[0].tokenized["passage_tokens"]
                           ) == 5, f"Processing for {model} has changed."
                assert len(baskets[0].samples[0].tokenized["question_tokens"]
                           ) == 7, f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == \
                           [101, 2129, 2116, 2111, 2444, 1999, 3000, 1029, 102, 4068], \
                        f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == \
                           [101, 2129, 2116, 2111, 2444, 1999, 4068, 1029, 102, 4068], \
                        f"Processing for {model} and {sample_type}-testsample has changed."
Example #2
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 100
evaluate_every = 20
lang_model = "bert-base-cased"
do_lower_case = False

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the CoLA (Corpus of Linguistic Acceptability) data.

label_list = ["0", "1"]
metric = "mcc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/cola",
                                        dev_filename="dev.tsv",
                                        dev_split=None,
                                        test_filename=None,
                                        label_list=label_list,
                                        metric=metric)
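
# The remaining steps are not part of this snippet. The following sketch mirrors the other
# text-classification examples in this collection (DataSilo -> AdaptiveModel -> optimizer ->
# Trainer) and is an illustrative assumption rather than the verbatim continuation.

# 3. Create a DataSilo that loads the datasets and provides DataLoaders
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel: a pretrained language model plus a text classification head
language_model = LanguageModel.load(lang_model)
prediction_head = TextClassificationHead(num_labels=len(label_list))
model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence"],
                      device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)

# 6. Feed everything to the Trainer and train
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=n_epochs,
                  n_gpu=n_gpu,
                  lr_schedule=lr_schedule,
                  evaluate_every=evaluate_every,
                  device=device)
trainer.train()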
Example #3
def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast,
                       num_hard_negatives, labels):
    dicts = [{
        'query':
        'where is castle on the hill based on',
        'answers': ['Framlingham Castle'],
        'passages': [
            {
                "text":
                'Castle on the Hill "Castle on the Hill" is a song by English singer-songwriter Ed Sheeran. It was released as a digital download on 6 January 2017 as one of the double lead singles from his third studio album "÷" (2017), along with "Shape of You". "Castle on the Hill" was written and produced by Ed Sheeran and Benny Blanco. The song refers to Framlingham Castle in Sheeran\'s home town. Released on the same day as "Shape of You", "Castle on the Hill" reached number two in a number of countries, including the UK, Australia and Germany, while "Shape of',
                "title": 'Castle on the Hill',
                "label": "positive",
                "external_id": '19930582'
            },
            {
                "text":
                'crops so as to feed a struggling infant colony. Governor King began Government Farm 3 there on 8 July 1801, referring to it as "Castle Hill" on 1 March 1802. The majority of the convicts who worked the prison farm were Irish Catholics, many having been transported for seditious activity in 1798. The most notorious incident being the Battle of Vinegar Hill where around 39 were slaughtered. They were branded "politicals" and exiled for life, never to return. The first free settler in Castle Hill, a Frenchman Baron Verincourt de Clambe, in unusual circumstances received a grant of 200 acres',
                "title": 'Castle Hill, New South Wales',
                "label": "hard_negative",
                "external_id": '1977568'
            },
            {
                "text":
                'Tom Gleeson, proposed ""high on the peak of Castle Hill, overlooking the harbour"" would be a suitable location for the monument. Having arrived in Townsville, the monument was then placed in storage for a number of years. It was not until October 1947 that the Council discussed where to place the monument. A number of locations were considered: Castle Hill, the Botanic Gardens, in front of the Queens Hotel, the Anzac Memorial Park and the Railway Oval, but Castle Hill was ultimately the council\'s choice. In February 1948, the Queensland Government gave its approval to the council to place the',
                "title": 'Castle Hill, Townsville',
                "label": "hard_negative",
                "external_id": '3643705'
            },
        ]
    }, {
        'query':
        'when did the royal family adopt the name windsor',
        'answers': ['in 1917'],
        'passages': [{
            "text":
            'House of Windsor The House of Windsor is the reigning royal house of the United Kingdom and the other Commonwealth realms. The dynasty is of German paternal descent and was originally a branch of the House of Saxe-Coburg and Gotha, itself derived from the House of Wettin, which succeeded the House of Hanover to the British monarchy following the death of Queen Victoria, wife of Albert, Prince Consort. The name was changed from "Saxe-Coburg and Gotha" to the English "Windsor" (from "Windsor Castle") in 1917 because of anti-German sentiment in the British Empire during World War I. There have been',
            "title": 'House of Windsor',
            "label": "positive",
            "external_id": '1478954'
        }, {
            "text":
            "2005, and was to take place in a civil ceremony at Windsor Castle, with a subsequent religious service of blessing at St George's Chapel. However, to conduct a civil marriage at Windsor Castle would oblige the venue to obtain a licence for civil marriages, which it did not have. A condition of such a licence is that the licensed venue must be available for a period of one year to anyone wishing to be married there, and as the royal family did not wish to make Windsor Castle available to the public for civil marriages, even just for one year,",
            "title": 'Camilla, Duchess of Cornwall',
            "label": "hard_negative",
            "external_id": '1399730'
        }]
    }, {
        'query':
        'what is a cat?',
        'answers': ['animal', 'feline'],
        'passages': [{
            "text": 'This is a <mask> sentence. Cats are good pets.',
            "title": 'title with "special characters" ',
            "label": "positive",
            "external_id": '0'
        }, {
            "text": "2nd text => More text about cats is good",
            "title": '2nd title \n',
            "label": "positive",
            "external_id": '1'
        }]
    }]

    query_tok = "facebook/dpr-question_encoder-single-nq-base"
    query_tokenizer = Tokenizer.load(query_tok, use_fast=use_fast)
    passage_tok = "facebook/dpr-ctx_encoder-single-nq-base"
    context_tokenizer = Tokenizer.load(passage_tok, use_fast=use_fast)
    processor = TextSimilarityProcessor(
        tokenizer=query_tokenizer,
        passage_tokenizer=context_tokenizer,
        max_seq_len=256,
        data_dir="data/retriever",
        train_filename="nq-train.json",
        test_filename="nq-dev.json",
        embed_title=embed_title,
        num_hard_negatives=num_hard_negatives,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        shuffle_negatives=False)

    for i, d in enumerate(dicts):
        sample = processor._dict_to_samples(d)
        feat = processor._sample_to_features(sample[0])
        assert (torch.all(
            torch.eq(torch.tensor(feat[0]["query_input_ids"][:10]),
                     query_input_ids[i])))
        assert (len(torch.tensor(feat[0]["query_segment_ids"]).nonzero()) == 0)
        assert (torch.all(
            torch.eq(
                torch.tensor(feat[0]["query_attention_mask"]).nonzero(),
                query_attention_mask[i])))

        positive_indices = np.where(
            np.array(feat[0]["label_ids"]) == 1)[0].item()
        assert (torch.all(
            torch.eq(
                torch.tensor(
                    feat[0]["passage_input_ids"])[positive_indices, :10],
                passage_ids[i][positive_indices])))
        for j in range(num_hard_negatives + 1):
            assert (torch.all(
                torch.eq(
                    torch.tensor(
                        feat[0]["passage_attention_mask"][j]).nonzero(),
                    passage_attns[i][j])))
        assert (torch.all(
            torch.eq(torch.tensor(feat[0]["label_ids"]),
                     torch.tensor(labels[i])[:num_hard_negatives + 1])))
        assert (len(torch.tensor(
            feat[0]["passage_segment_ids"]).nonzero()) == 0)
Example #4
def lm_finetuning():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    set_all_seeds(seed=42)
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_minimal_example_lm")
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(data_dir=Path("../data/lm_finetune_nips"),
                                     tokenizer=tokenizer,
                                     max_seq_len=128,
                                     max_docs=20)
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_multiprocessing_chunksize=20)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-english-lm-tutorial")
    model.save(save_dir)
    processor.save(save_dir)
Beispiel #5
0
    def load(
        cls,
        model_name_or_path,
        batch_size=4,
        gpu=False,
        task_type=None,
        return_class_probs=False,
        strict=True,
        max_seq_len=256,
        doc_stride=128,
        extraction_layer=None,
        extraction_strategy=None,
        num_processes=None,
    ):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param batch_size: Number of samples to process per batch
        :type batch_size: int
        :param gpu: Whether to use a GPU for inference
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample
        :type max_seq_len: int
        :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), 'reduce_max' (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: Number of the layer from which the embeddings shall be extracted. Default: -1 (very last layer).
        :type extraction_layer: int
        :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer use all CPU cores. If you want to
                              debug the Language Model, you might need to disable multiprocessing!
        :type num_processes: int
        :return: An instance of the Inferencer.

        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = BaseAdaptiveModel.load(load_dir=model_name_or_path,
                                           device=device,
                                           strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

            # override processor attributes loaded from config file with inferencer params
            processor.max_seq_len = max_seq_len
            if hasattr(processor, "doc_stride"):
                processor.doc_stride = doc_stride

        # b) or from remote transformers model hub
        else:
            logger.info(
                f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
            )
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path, device, task_type)
            config = AutoConfig.from_pretrained(model_name_or_path)
            tokenizer = Tokenizer.load(model_name_or_path)

            # TODO infer task_type automatically from config (if possible)
            if task_type == "question_answering":
                processor = SquadProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    label_list=["start_token", "end_token"],
                    metric="squad",
                    data_dir="data",
                    doc_stride=doc_stride)
            elif task_type == "embeddings":
                processor = InferenceProcessor(tokenizer=tokenizer,
                                               max_seq_len=max_seq_len)

            elif task_type == "text_classification":
                label_list = list(config.id2label[id]
                                  for id in range(len(config.id2label)))
                processor = TextClassificationProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    data_dir="data",
                    label_list=label_list,
                    label_column_name="label",
                    metric="acc",
                    quote_char='"',
                )
            elif task_type == "ner":
                label_list = list(config.label2id.keys())
                processor = NERProcessor(tokenizer=tokenizer,
                                         max_seq_len=max_seq_len,
                                         data_dir="data",
                                         metric="seq_f1",
                                         label_list=label_list)
            else:
                raise ValueError(
                    f"`task_type` {task_type} is not supported yet. "
                    f"Valid options for arg `task_type`: 'question_answering', "
                    f"'embeddings', 'text_classification', 'ner'")

        return cls(
            model,
            processor,
            task_type=task_type,
            batch_size=batch_size,
            gpu=gpu,
            name=name,
            return_class_probs=return_class_probs,
            extraction_strategy=extraction_strategy,
            extraction_layer=extraction_layer,
            num_processes=num_processes,
        )
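
# Usage sketch (hedged): the two loading paths described in the docstring above; model names
# and paths are only examples, not part of this snippet.
#
# a) local FARM checkpoint
# inferencer = Inferencer.load("saved_models/bert-english-qa", batch_size=8, gpu=False,
#                              task_type="question_answering")
# b) public model from the transformers model hub (task_type is then required)
# inferencer = Inferencer.load("deepset/roberta-base-squad2", batch_size=8, gpu=False,
#                              task_type="question_answering")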
Example #6
    def __init__(self,
                 document_store: BaseDocumentStore,
                 query_embedding_model: Union[
                     Path,
                     str] = "facebook/dpr-question_encoder-single-nq-base",
                 passage_embedding_model: Union[
                     Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
                 max_seq_len_query: int = 64,
                 max_seq_len_passage: int = 256,
                 use_gpu: bool = True,
                 batch_size: int = 16,
                 embed_title: bool = True,
                 use_fast_tokenizers: bool = True,
                 similarity_function: str = "dot_product"):
        """
        Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
        The checkpoint format matches huggingface transformers' model format.

        **Example:**

                ```python
                # remote model from FAIR
                DensePassageRetriever(document_store=your_doc_store,
                                      query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                      passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
                # or from local path
                DensePassageRetriever(document_store=your_doc_store,
                                      query_embedding_model="model_directory/question-encoder",
                                      passage_embedding_model="model_directory/context-encoder")
                ```

        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                      one used by hugging-face transformers' modelhub models
                                      Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
        :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                        one used by hugging-face transformers' modelhub models
                                        Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
        :param max_seq_len_query: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
        :param max_seq_len_passage: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
        :param use_gpu: Whether to use gpu or not
        :param batch_size: Number of questions or passages to encode at once
        :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                            This is the approach used in the original paper and is likely to improve performance if your
                            titles contain meaningful information for retrieval (topic, entities etc.).
                            The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                            before writing them to the DocumentStore like this:
                            {"text": "my text", "meta": {"name": "my title"}}.
        """

        self.document_store = document_store
        self.batch_size = batch_size
        self.max_seq_len_passage = max_seq_len_passage
        self.max_seq_len_query = max_seq_len_query

        if document_store.similarity != "dot_product":
            logger.warning(
                f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                "We recommend you use dot_product instead. "
                "This can be set when initializing the DocumentStore")

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.embed_title = embed_title

        # Init & Load Encoders
        self.query_tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=query_embedding_model,
            do_lower_case=True,
            use_fast=use_fast_tokenizers)
        self.query_encoder = LanguageModel.load(
            pretrained_model_name_or_path=query_embedding_model,
            language_model_class="DPRQuestionEncoder")

        self.passage_tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=passage_embedding_model,
            do_lower_case=True,
            use_fast=use_fast_tokenizers)
        self.passage_encoder = LanguageModel.load(
            pretrained_model_name_or_path=passage_embedding_model,
            language_model_class="DPRContextEncoder")

        self.processor = TextSimilarityProcessor(
            tokenizer=self.query_tokenizer,
            passage_tokenizer=self.passage_tokenizer,
            max_seq_len_passage=self.max_seq_len_passage,
            max_seq_len_query=self.max_seq_len_query,
            label_list=["hard_negative", "positive"],
            metric="text_similarity_metric",
            embed_title=self.embed_title,
            num_hard_negatives=0,
            num_negatives=0)

        prediction_head = TextSimilarityHead(
            similarity_function=similarity_function)
        self.model = BiAdaptiveModel(
            language_model1=self.query_encoder,
            language_model2=self.passage_encoder,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm1_output_types=["per_sequence"],
            lm2_output_types=["per_sequence"],
            device=self.device,
        )
        self.model.connect_heads_with_processor(self.processor.tasks,
                                                require_labels=False)
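
# Usage sketch (hedged): how such a retriever is typically wired up. `document_store`,
# `write_documents` and `retrieve` are assumed objects/methods, not defined in this snippet.
#
# document_store.write_documents([{"text": "my text", "meta": {"name": "my title"}}])
# retriever = DensePassageRetriever(document_store=document_store,
#                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
# top_docs = retriever.retrieve(query="where is castle on the hill based on", top_k=5)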
Example #7
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2"  # start with a model that can already extract answers
    do_lower_case = False  # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15  # downsample negative examples after data conversion
    downsample_context_size = 300  # reduce length of wikipedia articles to relevant part around the answer

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
        "<Th>",
        "</Th>",
        "<Td>",
        "</Td>",
        "<Tr>",
        "</Tr>",
        "<Li>",
        "</Li>",
        "<P>",
        "</P>",
        "<Ul>",
        "</Ul>",
        "<H1>",
        "</H1>",
        "<H2>",
        "</H2>",
        "<H3>",
        "</H3>",
        "<H4>",
        "</H4>",
        "<H5>",
        "</H5>",
        "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)
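    # Quick sanity check (illustrative addition, not in the original example): after add_tokens,
    # the HTML tags should come back as single tokens instead of being split into word pieces.
    print(tokenizer.tokenize("<Td> cell </Td>"))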

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_is_impossible=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model,
                                        n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list)
    )  # answer_type_list = ["is_impossible", "span", "yes", "no"]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources, we trained a model and uploaded it to S3
    fetch_archive_from_http(
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
        output_dir="../saved_models/farm")
    QA_input = [{
        "qas": [
            "Did GameTrailers rated Twilight Princess as one of the best games ever created?"
        ],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = Inferencer.load(
        model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
        batch_size=batch_size,
        gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)

    pprint.pprint(result)

    print(
        f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
        f"\nAnswer from model: {result[0]['predictions'][0]['answers'][0]['classification']}"
    )
Example #8
def doc_classification_crossvalidation():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must be registered under a string name, and that string
    # name is then passed as the metric.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }
    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.2,
            lm_output_types=["per_sequence"],
            device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        # Also create an EarlyStopping instance and pass it on to the trainer

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some iterations.
        # NOTE: Using a different save directory for each fold allows us to use the n best
        # fold models in an ensemble afterwards (see the ensembling sketch at the end of this example)!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense", mode="max",   # use the metric from our own metrics function instead of loss
            save_dir=save_dir,  # where to save the best model
            patience=5    # number of evaluations to wait for improvement before terminating the training
        )

        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=silo_to_use,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
            early_stopping=earlystopping,
            evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=device
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels, all_preds, labels=label_list, average="micro")
    xval_f1_macro = f1_score(all_labels, all_preds, labels=label_list, average="macro")
    xval_f1_offense = f1_score(all_labels, all_preds, labels=label_list, pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels, all_preds, labels=label_list, pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    logger.info("XVAL F1 MICRO:   ", xval_f1_micro)
    logger.info("XVAL F1 MACRO:   ", xval_f1_macro)
    logger.info("XVAL F1 OFFENSE: ", xval_f1_offense)
    logger.info("XVAL F1 OTHER:   ", xval_f1_other)
    logger.info("XVAL MCC:        ", xval_mcc)

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best cross-validation fold for evaluation on
    # the original (still unseen) test set.
    logger.info("###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    # restore model from the best fold
    lm_name = model.language_model.name
    save_dir = Path(f"saved_models/bert-german-doc-tutorial-es-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO:   ", result[0]["f1_micro"])
    logger.info("TEST F1 MACRO:   ", result[0]["f1_macro"])
    logger.info("TEST F1 OFFENSE: ", result[0]["f1_offense"])
    logger.info("TEST F1 OTHER:   ", result[0]["f1_other"])
    logger.info("TEST MCC:        ", result[0]["mcc"])
Example #9
# ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
# ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_minimal_example_ner")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                           do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
ner_labels = [
    "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG",
    "B-LOC", "I-LOC", "B-OTH", "I-OTH"
]

processor = NERProcessor(tokenizer=tokenizer,
                         max_seq_len=128,
                         data_dir="../data/conll03-de",
                         metric="seq_f1",
                         label_list=ner_labels)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)
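
# The remaining steps are not part of this snippet. The following sketch mirrors the NER test
# further down in this collection (TokenClassificationHead on top of the language model, then
# an optimizer and a Trainer) and is an illustrative assumption.

# 4. Create an AdaptiveModel with a token classification head
language_model = LanguageModel.load(lang_model)
prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])
model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_token"],
                      device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device)

# 6. Feed everything to the Trainer and train
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=n_epochs,
                  n_gpu=n_gpu,
                  lr_schedule=lr_schedule,
                  evaluate_every=evaluate_every,
                  device=device)
trainer.train()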
Example #10
def evaluate_question_answering():
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False  # roberta is a cased model

    data_dir = Path("../data/squad20")
    evaluation_filename = "dev-v2.0.json"

    batch_size = 50
    no_ans_boost = 0
    recall_at = 3 # recall at n is only useful for answers inside long documents

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    # 3. Create a DataSilo that loads dataset, provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an Evaluator
    evaluator = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )

    # 5. Load model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
    # use "load" if you want to use a local model that was trained with FARM
    #model = AdaptiveModel.load(lang_model, device=device)
    model.prediction_heads[0].no_ans_boost = no_ans_boost
    model.prediction_heads[0].n_best = recall_at
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

    # 6. Run the Evaluator
    results = evaluator.eval(model)
    f1_score = results[0]["f1"]
    em_score = results[0]["EM"]
    tnrecall = results[0]["top_n_recall"]
    print("F1-Score:", f1_score)
    print("Exact Match Score:", em_score)
    print(f"top_{recall_at}_recall:", tnrecall)
Example #11
def test_ner(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH",
                  "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1}
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Albrecht Lehman ist eine Person"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    print(result)
Example #12
def question_answering_confidence():
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False
    batch_size = 80

    data_dir = Path("../data/squad20")
    # We use the same file for dev and test set only for demo purposes
    dev_filename = "dev-v2.0.json"
    test_filename = "dev-v2.0.json"
    accuracy_at = 3 # accuracy at n is useful for answers inside long documents


    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=dev_filename,
        test_filename=test_filename,
        data_dir=data_dir,
        doc_stride=192,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)


    # 4. Load pre-trained question-answering model
    model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
    model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
    # Number of predictions the model will make per Question.
    # The multiple predictions are used for evaluating top n recall.
    model.prediction_heads[0].n_best = accuracy_at

    # 5. The calibration of model confidence scores sets one parameter, which is called temperature and can be accessed through the prediction_head.
    # This temperature is applied to each logit in the forward pass, where each logit is divided by the temperature.
    # A softmax function is applied to the logits afterward to get confidence scores in the range [0,1].
    # A temperature larger than 1 decreases the model’s confidence scores.
    logger.info(f"Parameter used for temperature scaling of model confidence scores: {model.prediction_heads[0].temperature_for_confidence}")

    # 6a. We can either manually set the temperature (default value is 1.0)...
    model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter((torch.ones(1) * 1.0).to(device=device))

    # 6b. ...or we can run the evaluator on the dev set and use it to calibrate confidence scores with a technique called temperature scaling.
    # It will align the confidence scores with the model's accuracy based on the dev set data by tuning the temperature parameter.
    # During the calibration, this parameter is automatically set internally as an attribute of the prediction head.
    evaluator_dev = Evaluator(
        data_loader=data_silo.get_data_loader("dev"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_dev = evaluator_dev.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True)
    # evaluator_dev.log_results(result_dev, "Dev", logging=False, steps=len(data_silo.get_data_loader("dev")))

    # 7. Optionally, run the evaluator on the test set to see how well the confidence scores are aligned with the model's accuracy
    evaluator_test = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device
    )
    result_test = evaluator_test.eval(model, return_preds_and_labels=True)[0]
    logger.info("Grouping predictions by confidence score and calculating metrics for each bin.")
    em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(result_test["preds"], result_test["labels"], num_bins=10)
    for bin_number in range(10):
        logger.info(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}")

    # 8. Hooray! You have a model with calibrated confidence scores.
    # Store the model and the temperature parameter will be stored automatically as an attribute of the prediction head.
    save_dir = Path("../saved_models/qa-confidence-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. When making a prediction with the calibrated model, we could filter out predictions where the model is not confident enough
    # To this end, load the stored model, which will automatically load the stored temperature parameter.
    # The confidence scores are automatically adjusted based on this temperature parameter.
    # For each prediction, we can check the model's confidence and decide whether to output the prediction or not.
    inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    logger.info(f"Loaded model with stored temperature: {inferencer.model.prediction_heads[0].temperature_for_confidence}")

    QA_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }]
    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
    if result.prediction[0].confidence > 0.9:
        print(result.prediction[0].answer)
    else:
        print("The confidence is not high enough to give an answer.")
Example #13
def doc_classification_multilabel_roberta():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32

    evaluate_every = 500
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the Toxic Comments data automatically if it is not available locally.

    label_list = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]
    metric = "acc"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/toxic-comments"),
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
        train_filename=Path("train.tsv"),
        dev_filename=Path("val.tsv"),
        test_filename=None,
        dev_split=0,
        max_samples=1000)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => multi-label text classification
    #    (a short thresholding sketch follows this example)
    prediction_head = MultiLabelTextClassificationHead(
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-multi-doc-roberta")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text": "You f*****g bastards"
        },
        {
            "text": "What a lovely world"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    print(result)
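The MultiLabelTextClassificationHead used above scores every label independently, so an example can carry several labels at once. A small sketch of the thresholding step that turns per-label probabilities into a label set (the scores below are invented and do not reflect FARM's exact output format):

import numpy as np

label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
probs = np.array([0.91, 0.12, 0.78, 0.05, 0.64, 0.02])  # hypothetical per-label probabilities
threshold = 0.5
predicted = [label for label, p in zip(label_list, probs) if p >= threshold]
print(predicted)  # ['toxic', 'obscene', 'insult']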
Beispiel #14
0
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH",
                  "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts,use_multiprocessing=False)
    assert result[0]["predictions"][0]["context"] == "sagte"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Beispiel #15
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # 0. Preprocess and save MSMARCO data in a format that can be ingested by FARM models. Only needs to be done once!
    # The final format is a tsv file with 3 columns (text, text_b and label) - a short sketch of this format follows this example.
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             data_dir / qrels_filename,
                             data_dir / top1000_filename,
                             data_dir / dev_filename)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Evaluation during training will be performed on a slice of the train set
    #    We will be using the msmarco dev set as our final evaluation set
    processor = TextPairClassificationProcessor(tokenizer=tokenizer,
                                                label_list=label_list,
                                                train_filename=train_filename,
                                                test_filename=None,
                                                dev_split=0.001,
                                                max_seq_len=128,
                                                data_dir=data_dir,
                                                delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
    )

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    model = Inferencer.load(save_dir,
                            gpu=True,
                            max_seq_len=128,
                            batch_size=128)
    result = model.inference_from_file(data_dir / dev_filename)

    write_msmarco_results(result, save_dir / predictions_raw_filename)

    msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                       dev_file=data_dir / dev_filename,
                       qrels_file=data_dir / qrels_filename,
                       output_file=save_dir / predictions_filename)
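For reference, the preprocessing step above produces a tab-separated file with the three columns text, text_b and label, which TextPairClassificationProcessor then reads. A minimal sketch of that format with invented rows:

import csv

rows = [
    {"text": "what is the capital of france", "text_b": "Paris is the capital of France.", "label": "1"},
    {"text": "what is the capital of france", "text_b": "Berlin is a city in Germany.", "label": "0"},
]
with open("train.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["text", "text_b", "label"], delimiter="\t")
    writer.writeheader()  # header row with the expected column names
    writer.writerows(rows)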
Beispiel #16
0
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    #############################################
    # CUSTOM OPTIMIZER & LR SCHEDULE
    #############################################
    # learning rate schedules from transformers
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "Constant"}
    # schedule_opts = {"name": "CosineWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}

    # or from native pytorch (see https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html for all options)
    # schedule_opts = {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    # schedule_opts = {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience":10}

    # optimizers from pytorch (see https://pytorch.org/docs/stable/optim.html for all options;
    # a plain-PyTorch sketch of these options follows this example)
    optimizer_opts = {"name": "SGD", "momentum": 0.0}

    # or from apex (see https://github.com/NVIDIA/apex/tree/master/apex/optimizers for all options)
    # optimizer_opts = {"name": "FusedLAMB", "bias_correction": True}

    # or from transformers (default in FARM)
    #optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    #############################################

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
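The optimizer_opts and schedule_opts dictionaries in this example name classes and keyword arguments; for the commented-out native PyTorch options they map onto ordinary torch objects. A short sketch of that correspondence (the mapping shown here is an assumption based on the linked PyTorch docs, not FARM's internal code):

import torch

params = [torch.nn.Parameter(torch.zeros(2, 2))]  # stand-in for model parameters
optimizer = torch.optim.SGD(params, lr=5e-3, momentum=0.0)  # optimizer_opts = {"name": "SGD", "momentum": 0.0}
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)  # schedule_opts = {"name": "StepLR", ...}
for _ in range(3):
    optimizer.step()
    scheduler.step()  # StepLR decays the learning rate by gamma every step_size calls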
Beispiel #17
0
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available locally.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metric values. The function must be registered under a string name and that name is then
    # passed as the metric (a standalone sketch of calling such a function follows this example).
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",   # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
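Because mymetrics() is an ordinary function from (preds, labels) to a dict, it can be sanity-checked outside of training. A standalone sketch with scikit-learn and toy inputs (the values are invented):

from sklearn.metrics import accuracy_score, f1_score

preds = ["OTHER", "OFFENSE", "OTHER", "OFFENSE"]
labels = ["OTHER", "OFFENSE", "OFFENSE", "OFFENSE"]
print({
    "acc": accuracy_score(labels, preds),
    "f1_offense": f1_score(labels, preds, pos_label="OFFENSE"),
    "f1_macro": f1_score(labels, preds, average="macro"),
    "f1_micro": f1_score(labels, preds, average="micro"),
})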
Beispiel #18
0
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 32 * 4  # 4x V100
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721  #
    gold_elapsed = 1286.30
    np.testing.assert_allclose(
        f1_score,
        gold_f1,
        rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score,
        gold_EM,
        rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    np.testing.assert_allclose(
        tnrecall,
        gold_tnrecall,
        rtol=0.01,
        err_msg=f"FARM Training changed for top 1 recall by: {tnrecall - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed,
        gold_elapsed,
        rtol=0.1,
        err_msg=
        f"FARM Eval speed changed significantly: {elapsed - gold_elapsed}")
Beispiel #19
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # The TextPairClassificationProcessor expects a tsv file with columns called "text", "text_b" and "label"
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        train_filename="train.tsv",
        dev_filename="dev.tsv",
        test_filename=None,
        data_dir=Path("../data/asnq_binary"),
        delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    # For correct Text Pair Classification on raw dictionaries (inference mode), we need to put both
    # texts (text, text_b) into a tuple.
    # See corresponding conversion in the file_to_dicts() method of TextPairClassificationProcessor: https://github.com/deepset-ai/FARM/blob/5ab5b1620cb51ceb874d4b30c887e377ad1a6e9a/farm/data_handler/processor.py#L744
    basic_texts = [
        {
            "text":
            ("how many times have real madrid won the champions league in a row",
             "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
             )
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
    model.close_multiprocessing_pool()
Beispiel #20
0
def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = True

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=50)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnrecall = results[0]["top_n_recall"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 77.7478
    gold_f1 = 82.1557
    gold_tnrecall = 84.0646  # top 1 recall
    gold_elapsed = 78  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnrecall,
            gold_tnrecall,
            rtol=0.001,
            err_msg=f"FARM Eval changed for top 1 recall by: {em_score-gold_EM}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly: {elapsed - gold_elapsed}")

    # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=50,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename)

    elapsed = time() - starttime

    os.makedirs("../testsave", exist_ok=True)
    write_squad_predictions(predictions=result,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 78.489
    gold_f1 = 81.7104
    gold_elapsed = 74  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly: {elapsed - gold_elapsed}")
Beispiel #21
0
def test_ner_amp(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH",
                  "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/ner"),
        train_filename=Path("train-sample.txt"),
        dev_filename=Path("dev-sample.txt"),
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "1980 kam der Crown von Toyota"},
    ]
    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32)
    assert np.isclose(result[0]["predictions"][0][0]["probability"], 0.124, rtol=0.05)
    assert result[0]["predictions"][0][0]["label"] == "OTH"
Beispiel #22
0
def ner(task: str, lm: str):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=use_cuda)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)
    n_epochs = 10
    batch_size = 32
    evaluate_every = 1000
    model_dir = MODEL_DIR
    if lm == 'bert-hgcrw':
        lang_model = "redewiedergabe/bert-base-historical-german-rw-cased"
        model_dir += '_bert-hgcrw'
    elif lm == 'lmgot01':
        lang_model = Path(
            "/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_01")
        model_dir += '_lmgot01'
    elif lm == 'lmgot02':
        lang_model = Path(
            "/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_02")
        model_dir += '_lmgot02'
    else:
        lang_model = "bert-base-german-cased"
    if task != 'all':
        model_dir += '_' + task
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    # (a short sketch of this token-per-line layout follows this example)
    if task == 'direct':
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR"]
    elif task == 'indirect':
        ner_labels = ["[PAD]", "X", "O", "B-IND", "I-IND"]
    elif task == 'reported':
        ner_labels = ["[PAD]", "X", "O", "B-REP", "I-REP"]
    else:
        ner_labels = [
            "[PAD]", "X", "O", "B-DIR", "I-DIR", "B-IND", "I-IND", "B-REP",
            "I-REP"
        ]

    data_dir = DATA_DIR
    if task != 'all':
        data_dir += task + '/'
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=64,
                             data_dir=Path(data_dir),
                             delimiter="\t",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 1
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = model_dir
    model.save(save_dir)
    processor.save(save_dir)
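The NERProcessor in these examples reads a CoNLL-style, token-per-line file: one token and its label per line, separated by the configured delimiter, with a blank line between sentences. A sketch of that layout using the "direct speech" tag set from above (the sentence and its tags are invented):

lines = [
    "Er\tO",
    "sagte\tO",
    ":\tO",
    '"\tB-DIR',
    "Ich\tI-DIR",
    "komme\tI-DIR",
    '"\tI-DIR',
    "",  # blank line marks the sentence boundary
]
with open("train.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")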
Beispiel #23
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    We do not ship a sample dataset for this example yet; add your own dataset to run it
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="acc",
        label_column_name="label",
        max_seq_len=64,
        train_filename=training_filename,
        dev_filename=test_filename,
        test_filename=test_filename,
        data_dir=Path("../data"),
        tasks={"text_classification"},
        delimiter="\t")

    # train_filename = training_filename,
    # test_filename = test_filename,
    # dev_filename = test_filename,
    # dev_split = 0.5,

    # data_dir=Path("../data/asnq_binary"),

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    # Old version, before StreamingDataSilo
    # data_silo = DataSilo(
    #    processor=processor,
    #    batch_size=batch_size, max_processes=4)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    now = datetime.now()  # current date and time

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        # metric="f1_weighted", mode="max",  # use f1_macro from the dev evaluator of the trainer
        metric="loss", mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/earlystopping/" + now.strftime("%m%d%Y%H%M%S")),  # where to save the best model
        patience=8  # number of evaluations to wait for improvement before terminating the training
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)
    # model=model,
    # optimizer=optimizer,
    # data_silo=data_silo,
    # epochs=n_epochs,
    # n_gpu=n_gpu,
    # lr_schedule=lr_schedule,
    # evaluate_every=evaluate_every,
    # device=device)

    # 7. Let it grow
    # comment out trainer.train() if you are going to use a stored model instead
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    # When a new model is being trained and needs to be saved
    save_dir = Path("saved_models/text_pair_classification_model" +
                    now.strftime("%m%d%Y%H%M%S"))
    model.save(save_dir)
    processor.save(save_dir)

    # When only loading an existing model, change the path below to the model you need
    # save_dir = Path("saved_models/text_pair_classification_model" + "01272021103548")

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text":
            "<claim-text>The method of claim 10, wherein the indium metal layer is 10 nm to 100 µm thick.</claim-text>",
            "text_b":
            "<p id="
            "p0001"
            " num="
            "0001"
            ">The present invention is directed to metal plating compositions and methods. More specifically, the present invention is directed to metal plating compositions and methods which provide improved leveling and throwing power.</p <p id="
            "p0039"
            " num="
            "0039"
            ">One or more conventional surfactants may be used. Typically, surfactants include, but are not limited to, nonionic surfactants such as alkyl phenoxy polyethoxyethanols. Other suitable surfactants containing multiple oxyethylene groups also may be used. Such surfactants include compounds of polyoxyethylene polymers having from as many as 20 to 150 repeating units. Such compounds also may perform as suppressors. Also included in the class of polymers are both block and random copolymers of polyoxyethylene (EO) and polyoxypropylene (PO). Surfactants may be added in conventional amounts, such as from 0.05 g/L to 20 g/L or such as from 0.5 g/L to 5 g/L.</p <p id="
            "p0040"
            " num="
            "0040"
            ">Conventional levelers include, but are not limited to, one or more of alkylated polyalkyleneimines and organic sulfo sulfonates. Examples of such compounds include, 4-mercaptopyridine, 2-mercaptothiazoline, ethylene thiourea, thiourea, 1-(2-hydroxyethyl)-2-imidazolidinethion (HIT) and alkylated polyalkyleneimines. Such levelers are included in conventional amounts. Typically, such levelers are included in amounts of 1ppb to 1 g/L, or such as from 10ppb to 500ppm.</p <p id="
            "p0042"
            " num="
            "0042"
            ">Alkali metal salts which may be included in the plating compositions include, but are not limited to, sodium and potassium salts of halogens, such as chloride, fluoride and bromide. Typically chloride is used. Such alkali metal salts are used in conventional amounts.</p <p id="
            "p0053"
            " num="
            "0053"
            ">The metal plating compositions may be used to plate a metal or metal alloy on a substrate by any method known in the art and literature. Typically, the metal or metal alloy is electroplated using conventional electroplating processes with conventional apparatus. A soluble or insoluble anode may be used with the electroplating compositions.</p <p id="
            "p0022"
            " num="
            "0022"
            ">One or more sources of metal ions are included in metal plating compositions to plate metals. The one or more sources of metal ions provide metal ions which include, but are not limited to, copper, tin, nickel, gold, silver, palladium, platinum and indium. Alloys include, but are not limited to, binary and ternary alloys of the foregoing metals. Typically, metals chosen from copper, tin, nickel, gold, silver or indium are plated with the metal plating compositions. More typically, metals chosen from copper, tin, silver or indium are plated. Most typically, copper is plated.</p <p id="
            "p0030"
            " num="
            "0030"
            ">Indium salts which may be used include, but are not limited to, one or more of indium salts of alkane sulfonic acids and aromatic sulfonic acids, such as methanesulfonic acid, ethanesulfonic acid, butane sulfonic acid, benzenesulfonic acid and toluenesulfonic acid, salts of sulfamic acid, sulfate salts, chloride and bromide salts of indium, nitrate salts, hydroxide salts, indium oxides, fluoroborate salts, indium salts of carboxylic acids, such as citric acid, acetoacetic acid, glyoxylic acid, pyruvic acid, glycolic acid, malonic acid, hydroxamic acid, iminodiacetic acid, salicylic acid, glyceric acid, succinic acid, malic acid, tartaric acid, hydroxybutyric acid, indium salts of amino acids, such as arginine, aspartic acid, asparagine, glutamic acid, glycine, glutamine, leucine, lysine, threonine, isoleucine, and valine.</p"
        },
        {
            "text":
            "<claim-text>A toner comprising: <claim-text>toner base particles; and</claim-text> <claim-text>an external additive,</claim-text> <claim-text>the toner base particles each comprising a binder resin and a colorant,</claim-text> <claim-text>wherein the external additive comprises coalesced particles,</claim-text> <claim-text>wherein the coalesced particles are each a non-spherical secondary particle in which primary particles are coalesced together, and</claim-text> <claim-text>wherein an index of a particle size distribution of the coalesced particles is expressed by the following Formula (1): <maths id="
            "math0004"
            " num="
            "(formula (1)"
            "><math display="
            "block"
            "><mfrac><msub><mi>Db</mi><mn>50</mn></msub><msub><mi>Db</mi><mn>10</mn></msub></mfrac><mo>≦</mo><mn>1.20</mn></math><img id="
            "ib0008"
            " file="
            "imgb0008.tif"
            " wi="
            "93"
            " he="
            "21"
            " img-content="
            "math"
            " img-format="
            "tif"
            "/></maths><br/> where, in a distribution diagram in which particle diameters in nm of the coalesced particles are on a horizontal axis and cumulative percentages in % by number of the coalesced particles are on a vertical axis and in which the coalesced particles are accumulated from the coalesced particles having smaller particle diameters to the coalesced particles having larger particle diameters, Db<sub>50</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 50% by number, and Db<sub>10</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 10% by number.</claim-text></claim-text>",
            "text_b":
            "<p id="
            "p0177"
            " num="
            "0177"
            ">For a similar reason, it is preferred that the electroconductive fine powder has a volume-average particle size of 0.5 - 5 µm, more preferably 0.8 - 5 µm, further preferably 1.1 - 5 µm and has a particle size distribution such that particles of 0.5 µm or smaller occupy at most 70 % by volume and particles of 5.0 µm or larger occupy at most 5 % by number.</p <p id="
            "p0189"
            " num="
            "0189"
            ">The volume-average particle size and particle size distribution of the electroconductive fine powder described herein are based on values measured in the following manner. A laser diffraction-type particle size distribution measurement apparatus ("
            "Model LS-230"
            ", available from Coulter Electronics Inc.) is equipped with a liquid module, and the measurement is performed in a particle size range of 0.04 - 2000 µm to obtain a volume-basis particle size distribution. For the measurement, a minor amount of surfactant is added to 10 cc of pure water and 10 mg of a sample electroconductive fine powder is added thereto, followed by 10 min. of dispersion by means of an ultrasonic disperser (ultrasonic homogenizer) to obtain a sample dispersion liquid, which is subjected to a single time of measurement for 90 sec.</p <p id="
            "p0191"
            " num="
            "0191"
            ">In the case where the electroconductive fine powder is composed of agglomerate particles, the particle size of the electroconductive fine powder is determined as the particle size of the agglomerate. The electroconductive fine powder in the form of agglomerated secondary particles can be used as well as that in the form of primary particles. Regardless of its agglomerated form, the electroconductive fine powder can exhibit its desired function of charging promotion by presence in the form of the agglomerate in the charging section at the contact position<!-- EPO <DP n="
            "85"
            "> --> between the charging member and the image-bearing member or in a region in proximity thereto.</p"
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
    model.close_multiprocessing_pool()
Beispiel #24
0
    def convert_to_onnx(self,
                        output_path,
                        opset_version=11,
                        optimize_for=None):
        """
        Convert a PyTorch AdaptiveModel to ONNX.

        The conversion is trace-based: it performs a forward pass on the model with an input batch.

        :param output_path: model dir to write the model and config files
        :type output_path: Path
        :param opset_version: ONNX opset version
        :type opset_version: int
        :param optimize_for: optimize the exported model for a target device. Available options
                             are "gpu_tensor_core" (GPUs with tensor core like V100 or T4),
                             "gpu_without_tensor_core" (most other GPUs), and "cpu".
        :type optimize_for: str
        :return:
        """
        if type(self.prediction_heads[0]) is not QuestionAnsweringHead:
            raise NotImplementedError

        tokenizer = Tokenizer.load(
            pretrained_model_name_or_path="deepset/bert-base-cased-squad2")

        label_list = ["start_token", "end_token"]
        metric = "squad"
        max_seq_len = 384
        batch_size = 1
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename="stub-file",  # the data is loaded from dicts instead of a file
            dev_filename=None,
            test_filename=None,
            data_dir="stub-dir",
        )

        data_silo = DataSilo(processor=processor,
                             batch_size=1,
                             distributed=False,
                             automatic_loading=False)
        sample_dict = [{
            "context":
            'The Normans were the people who in the 10th and 11th centuries gave their name to Normandy, '
            'a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders '
            'and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear '
            'fealty to King Charles III of West Francia.',
            "qas": [{
                "question": "In what country is Normandy located?",
                "id": "56ddde6b9a695914005b9628",
                "answers": [{
                    "text": "France",
                    "answer_start": 159
                }],
                "is_impossible": False,
            }],
        }]

        data_silo._load_data(train_dicts=sample_dict)
        data_loader = data_silo.get_data_loader("train")
        data = next(iter(data_loader))
        data = list(data.values())

        inputs = {
            'input_ids':
            data[0].to(self.device).reshape(batch_size, max_seq_len),
            'padding_mask':
            data[1].to(self.device).reshape(batch_size, max_seq_len),
            'segment_ids':
            data[2].to(self.device).reshape(batch_size, max_seq_len)
        }

        # Argument passing in torch.onnx.export differs from AdaptiveModel's forward(),
        # so an ONNXWrapper instance is used to adapt the interface.
        model = ONNXWrapper.load_from_adaptive_model(self)

        if not os.path.exists(output_path):
            os.makedirs(output_path)

        with torch.no_grad():
            symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
            torch.onnx.export(
                model,
                args=tuple(inputs.values()),
                f=output_path / 'model.onnx',
                opset_version=opset_version,
                do_constant_folding=True,
                input_names=['input_ids', 'padding_mask', 'segment_ids'],
                output_names=['logits'],
                dynamic_axes={
                    'input_ids': symbolic_names,
                    'padding_mask': symbolic_names,
                    'segment_ids': symbolic_names,
                    'logits': symbolic_names,
                })

        if optimize_for:
            optimize_args = Namespace(disable_attention=False,
                                      disable_bias_gelu=False,
                                      disable_embed_layer_norm=False,
                                      opt_level=99,
                                      disable_skip_layer_norm=False,
                                      disable_bias_skip_layer_norm=False,
                                      hidden_size=768,
                                      verbose=False,
                                      input=str(output_path / 'model.onnx'),
                                      model_type='bert',
                                      num_heads=12,
                                      output=str(output_path / 'model.onnx'))

            if optimize_for == "gpu_tensor_core":
                optimize_args.float16 = True
                optimize_args.input_int32 = True
            elif optimize_for == "gpu_without_tensor_core":
                optimize_args.float16 = False
                optimize_args.input_int32 = True
            elif optimize_for == "cpu":
                logger.info("")
                optimize_args.float16 = False
                optimize_args.input_int32 = False
            else:
                raise NotImplementedError(
                    f"ONNXRuntime model optimization is not available for {optimize_for}. Choose "
                    f"one of 'gpu_tensor_core'(V100 or T4), 'gpu_without_tensor_core' or 'cpu'."
                )

            optimize_onnx_model(optimize_args)
        else:
            logger.info(
                "Exporting unoptimized ONNX model. To enable optimization, supply the "
                "'optimize_for' parameter with the target device.")

        # PredictionHead contains functionality like logits_to_preds() that is still needed
        # for inference with ONNX models. Only the config of the PredictionHead is stored.
        for i, ph in enumerate(self.prediction_heads):
            ph.save_config(output_path, i)

        processor.save(output_path)

        onnx_model_config = {
            "onnx_opset_version": opset_version,
            "language": self.get_language(),
        }
        with open(output_path / "model_config.json", "w") as f:
            json.dump(onnx_model_config, f)

        logger.info(f"Model exported at path {output_path}")
Beispiel #25
0
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
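For reference, a hedged sketch of unpacking the Inferencer output: the nesting (result[0]["predictions"][0][...]) mirrors what the test examples further below assert on, while the exact key for the predicted class ("label") is an assumption.

# Hedged sketch: reading the top prediction per document from the result above.
# "predictions" and "probability" follow the structure asserted in the tests
# below; the "label" key is an assumption.
for doc in result:
    top = doc["predictions"][0]
    print(top.get("label"), top.get("probability"))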
Beispiel #26
0
def test_doc_regression(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=8,
                                    data_dir="samples/doc_regr",
                                    train_filename="train-sample.tsv",
                                    dev_filename="test-sample.tsv",
                                    test_filename=None,
                                    label_column_name="label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead(layer_dims=[768, 1])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'CosineWarmup',
            'warmup_proportion': 0.1
        })

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_regr"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text":
            "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."
        },
        {
            "text":
            "it just did not fit right. The top is very thin showing everything."
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
Beispiel #27
0
def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1.Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=
        "facebook/dpr-question_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)
    context_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)

    processor = TextSimilarityProcessor(
        tokenizer=query_tokenizer,
        passage_tokenizer=context_tokenizer,
        max_seq_len=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1)

    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model.connect_heads_with_processor(processor.tasks)

    assert type(model) == BiAdaptiveModel
    assert type(processor) == TextSimilarityProcessor
    assert type(question_language_model) == DPRQuestionEncoder
    assert type(passage_language_model) == DPRContextEncoder

    # check embedding layer weights
    assert abs(list(model.named_parameters())[0][1][0, 0].item() -
               (-0.010200000368058681)) < 0.0001

    d = {
        'query':
        'big little lies season 2 how many episodes',
        'passages': [{
            'title': 'Big Little Lies (TV series)',
            'text':
            'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
            'label': 'positive',
            'external_id': '18768923'
        }, {
            'title': 'Little People, Big World',
            'text':
            'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
            'label': 'hard_negative',
            'external_id': '7459116'
        }, {
            'title': 'Cormac McCarthy',
            'text':
            'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
            'label': 'negative',
            'passage_id': '2145653'
        }]
    }

    sample = processor._dict_to_samples(d)
    feats = processor._sample_to_features(sample[0])
    dataset, tensor_names = convert_features_to_dataset(feats)
    features = {
        key: val.unsqueeze(0).to(device)
        for key, val in zip(tensor_names, dataset[0])
    }

    # test features
    assert torch.all(
        torch.eq(
            features["query_input_ids"][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102])))
    assert torch.all(
        torch.eq(
            features["passage_input_ids"][0][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186])))
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(
        torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(),
                 torch.tensor(list(range(10)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(),
            torch.tensor(list(range(127)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(),
            torch.tensor(list(range(143)))))

    # test model encodings
    query_vector = model.language_model1(**features)[0]
    passage_vector = model.language_model2(**features)[0]
    assert torch.all(
        torch.le(
            query_vector[0, :10].cpu() - torch.tensor([
                -0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638,
                0.1405, 0.2285, 0.0893
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[0, :10].cpu() - torch.tensor([
                0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969,
                -0.0555, 0.3405, -0.8691
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[1, :10].cpu() - torch.tensor([
                -0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306,
                0.1156, 0.3350, -0.3412
            ]),
            torch.ones((1, 10)) * 0.0001))

    # test logits and loss
    embeddings = model(**features)
    query_emb, passage_emb = embeddings[0]
    assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu()))
    assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu()))

    loss = model.logits_to_loss_per_head(embeddings, **features)
    similarity_scores = model.prediction_heads[0]._embeddings_to_scores(
        query_emb, passage_emb).cpu()
    assert torch.all(
        torch.le(
            similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]]),
            torch.ones((1, 2)) * 0.0001))
    assert abs(loss[0].item() - 0.0018) <= 0.0001
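The similarity scores and loss checked above follow the usual dot-product retrieval setup: the query vector is scored against every passage vector, and a negative log-likelihood over those scores favours the positive passage. A minimal sketch of that computation, independent of the TextSimilarityHead internals (which may differ in detail):

# Hedged sketch of dot-product similarity scoring and NLL loss over passages.
# Shapes and the exact loss formulation are assumptions for illustration.
import torch
import torch.nn.functional as F

query_emb = torch.randn(1, 768)    # one query vector
passage_emb = torch.randn(2, 768)  # positive + hard-negative passage vectors

scores = query_emb @ passage_emb.T         # shape (1, 2); higher = more similar
log_probs = F.log_softmax(scores, dim=1)
labels = torch.tensor([0])                 # the positive passage sits at index 0
loss = F.nll_loss(log_probs, labels)
print(scores, loss)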
Beispiel #28
0
def test_doc_classification(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir=Path("samples/doc_class"),
                                            train_filename=Path("train-sample.tsv"),
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = DistilBert.load(lang_model)
    prediction_head = TextClassificationHead(num_labels=2)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    trainer.train()

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Malte liebt Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."}
    ]

    inf = Inferencer.load(save_dir, batch_size=2, num_processes=0)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Beispiel #29
0
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = BertStyleLMProcessor(data_dir="samples/lm_finetuning",
                                     train_filename="train-sample.txt",
                                     test_filename="test-sample.txt",
                                     dev_filename=None,
                                     tokenizer=tokenizer,
                                     max_seq_len=12,
                                     next_sent_pred=True)
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight,
                 model.prediction_heads[0].decoder.weight))

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Farmer's life is great."
        },
        {
            "text": "It's nothing for big city kids though."
        },
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == [
        'Farmer', "'", 's', 'life', 'is', 'great', '.'
    ]
    assert result[0]["vec"].shape == (768, )
    # TODO check why results vary across runs with the same seed
    assert isinstance(result[0]["vec"][0], np.float32)
Beispiel #30
0
def test_dataset_from_dicts_qa_labelconversion(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = [
        "answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"
    ]

    for model in models:
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=model,
                                   use_fast=True)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(f"samples/qa/{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=False)

            if sample_type == "answer-wrong" or sample_type == "answer-offset-wrong":
                assert len(
                    problematic_sample_ids
                ) == 1, f"Processing labels for {model} has changed."

            if sample_type == "noanswer":
                assert list(dataset.tensors[tensor_names.index(
                    "labels")].numpy()[0, 0, :]) == [
                        0, 0
                    ], f"Processing labels for {model} has changed."
                assert list(dataset.tensors[
                    tensor_names.index("labels")].numpy()[0, 1, :]) == [
                        -1, -1
                    ], f"Processing labels for {model} has changed."

            if sample_type == "vanilla":
                # roberta
                if model == "deepset/roberta-base-squad2":
                    assert list(dataset.tensors[
                        tensor_names.index("labels")].numpy()[0, 0, :]) == [
                            13, 13
                        ], f"Processing labels for {model} has changed."
                    assert list(dataset.tensors[
                        tensor_names.index("labels")].numpy()[0, 1, :]) == [
                            13, 14
                        ], f"Processing labels for {model} has changed."
                # bert, minilm, electra
                if model == "deepset/bert-base-cased-squad2" or model == "deepset/minilm-uncased-squad2" or model == "deepset/electra-base-squad2":
                    assert list(dataset.tensors[
                        tensor_names.index("labels")].numpy()[0, 0, :]) == [
                            11, 11
                        ], f"Processing labels for {model} has changed."
                # xlm-roberta
                if model == "deepset/xlm-roberta-large-squad2":
                    assert list(dataset.tensors[
                        tensor_names.index("labels")].numpy()[0, 0, :]) == [
                            12, 12
                        ], f"Processing labels for {model} has changed."