def build_task_data(params: Params, data_supplier) -> SeqTagTaskData:
    dataset_dict: Dict[str, List[TaggedSequence]] = data_supplier()
    ner_labels = ["[PAD]", NIT] + list(
        set(tag for taggedseqs in dataset_dict.values()
            for taggedseq in taggedseqs
            for tok, tag in taggedseq))

    ml_logger = MLFlowLogger(tracking_uri=os.environ["HOME"] + "/data/mlflow_experiments/mlruns")
    ml_logger.init_experiment(experiment_name="Sequence_Tagging", run_name="Run_ner")

    lang_model = "bert-base-cased"
    do_lower_case = False
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=None,  # noqa
        metric="seq_f1",
        label_list=ner_labels,
    )

    task_data = {
        "num_labels": len(ner_labels),
        "lang_model": lang_model,
        "ml_logger": ml_logger,
        "processor": processor,
        "params": params,
    }
    return SeqTagTaskData(data=dataset_dict, task_data=task_data)
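# A minimal sketch of how build_task_data might be called. The demo data supplier and the
# default-constructed Params() below are illustrative assumptions, not part of the original code:
# the supplier is expected to return a dict mapping split names to lists of tagged sequences,
# where each tagged sequence yields (token, tag) pairs.
def demo_data_supplier():
    train = [[("Berlin", "B-LOC"), ("ist", "O"), ("schön", "O")]]
    return {"train": train, "dev": train, "test": train}

task_data = build_task_data(params=Params(), data_supplier=demo_data_supplier)
# assuming SeqTagTaskData exposes its constructor fields as attributes
print(task_data.task_data["num_labels"])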
########## Settings ##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 4
batch_size = 32
evaluate_every = 50
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = NERProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(layer_dims=[768, len(processor.label_list)])

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
def test_ner_amp(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "1980 kam der Crown von Toyota"},
    ]
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    # print(result)
    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
              "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor = NERProcessor(tokenizer=tokenizer,
                         max_seq_len=128,
                         data_dir="../data/conll03-de",
                         metric="seq_f1",
                         label_list=ner_labels)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(
    task_name="ner",
    layer_dims=[768, len(processor.tasks["ner"]["label_list"])])

model = AdaptiveModel(
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_ner")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=Path("../data/conll03-de"),
                             delimiter=" ",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
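# The NERProcessor above expects CoNLL-style text files: one "token<delimiter>label" pair per line,
# sentences separated by blank lines (here delimiter=" "). A minimal, illustrative sketch of such a
# file follows; the directory name and sample sentences are assumptions, not the official conll03-de
# download.
from pathlib import Path

sample_conll = """Schartau B-PER
sagte O
dem O
Tagesspiegel B-ORG

Martin B-PER
Müller I-PER
spielt O
Handball O
in O
Berlin B-LOC
"""
demo_dir = Path("demo_data/conll03-de-sample")  # hypothetical location used only for this sketch
demo_dir.mkdir(parents=True, exist_ok=True)
(demo_dir / "train.txt").write_text(sample_conll, encoding="utf-8")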
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "sagte"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def load(cls, model_name_or_path, batch_size=4, gpu=False, task_type=None,
         return_class_probs=False, strict=True, max_seq_len=256):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...)
    either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                   the PredictionHead (see torch.nn.module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older version of FARM.
    :type strict: bool
    :return: An instance of the Inferencer.
    """
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = AdaptiveModel.load(model_name_or_path, device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

    # b) or from remote transformers model hub
    else:
        logger.info(
            f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
        )
        if not task_type:
            raise ValueError(
                "Please specify the 'task_type' of the model you want to load from transformers. "
                "Valid options for arg `task_type`: "
                "'question_answering', 'embeddings', 'text_classification', 'ner'"
            )

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(model_name_or_path)

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad",
                data_dir=None,
            )
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                data_dir=None,
                label_list=label_list,
                label_column_name="label",
                metric="acc",
                quote_char='"',
            )
        # elif task_type == "multilabel-classification":
        #     # label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        #     label_list = list(config.label2id.keys())
        #     processor = TextClassificationProcessor(tokenizer=tokenizer,
        #                                             max_seq_len=max_seq_len,
        #                                             data_dir=None,
        #                                             label_list=label_list,
        #                                             label_column_name="label",
        #                                             metric="acc",
        #                                             quote_char='"',
        #                                             multilabel=True,
        #                                             )
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     data_dir=None,
                                     metric="seq_f1",
                                     label_list=label_list)
        else:
            raise ValueError(
                f"`task_type` {task_type} is not supported yet. "
                f"Valid options for arg `task_type`: 'question_answering', "
                f"'embeddings', 'text_classification', 'ner'"
            )

    return cls(
        model,
        processor,
        batch_size=batch_size,
        gpu=gpu,
        name=name,
        return_class_probs=return_class_probs,
    )
def execML(self, job):
    start_time = time.time()

    if job.task == 'analyse':
        basic_texts = []
        # Will download and store dataset...
        sample = self.downloadAndConvertText(job, job.data_sample)
        for text in sample.encode('utf-8').splitlines():
            basic_texts.append({'text': text.decode('utf-8')})

        # Will download and store model...
        self.downloadAndStoreZIPModel(job, job.model)

        self.updateJobStatus(job, 'analysing')
        save_dir = 'tmp/' + job.model['id']
        model = Inferencer.load(save_dir)
        result = model.inference_from_dicts(dicts=basic_texts)
        self.persistResult(job, result)
        model.close_multiprocessing_pool()
        self.updateJobStatus(job, 'completed')

    elif job.task == 'train':
        self.updateJobStatus(job, 'training')
        # Will download and store dataset...
        self.downloadAndStoreZIPDataset(job, job.data_source)
        # Will download and store model...
        self.downloadAndStoreZIPModel(job, job.model)

        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        n_epochs = 4
        evaluate_every = 400
        do_lower_case = False
        batch_size = 32
        lang_model = os.path.join(Path.cwd(), 'tmp', job.model['id'])
        ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                      "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=lang_model,
            do_lower_case=do_lower_case,
            tokenizer_class='BertTokenizer'
        )

        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        processor = NERProcessor(
            tokenizer=tokenizer,
            max_seq_len=128,
            data_dir=str(os.path.join(Path.cwd(), 'tmp', job.data_source['id'])),
            delimiter=' ',
            metric='seq_f1',
            label_list=ner_labels
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        #    and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

        # 4. Create an AdaptiveModel
        # 4.1 which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # 4.2 and a prediction head on top that is suited for our task => NER
        prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=['per_token'],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=1e-5,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow
        trainer.train()

        # 8. Hooray! You have a model. Store it:
        newModelId = str(uuid.uuid4())
        save_dir = 'tmp/' + newModelId
        model.save(save_dir)
        processor.save(save_dir)
        model.close_multiprocessing_pool()
        self.persistZIPModel(newModelId, job)
        self.updateJobStatus(job, 'completed')

    elapsed_time = time.time() - start_time
    print('Execution time max: ', elapsed_time, 'for job.id:', job.id, flush=True)
    return {'status': True, 'code': 'ok', 'msg': 'success'}
def test_ner(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Paris is a town in France."},
    ]

    model = Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
                            num_processes=0,
                            task_type="ner")
    # Labels aren't correctly inserted from transformers; they are converted to LABEL_1 ... LABEL_N.
    # For the inference result to contain predictions we need them in IOB NER format.
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"

    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        # optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1})

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Albrecht Lehman ist eine Person"},
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    # print(result)
    # assert result[0]["predictions"][0]["context"] == "sagte"
    # assert isinstance(result[0]["predictions"][0]["probability"], np.float32)

    result2 = model.inference_from_dicts(dicts=basic_texts, rest_api_schema=True)
    assert result == result2
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = NERProcessor(tokenizer=tokenizer, max_seq_len=128, data_dir="../data/conll03-de")
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
              "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
processor.add_task("ner", "seq_f1", ner_labels)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
#    and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(task_name="ner",
                                          layer_dims=[768, len(processor.tasks["ner"]["label_list"])])

model = AdaptiveModel(
def load(cls, model_name_or_path, batch_size=4, gpu=False, task_type=None,
         return_class_probs=False, strict=True, max_seq_len=256, doc_stride=128,
         extraction_layer=None, extraction_strategy=None, s3e_stats=None,
         num_processes=None, disable_tqdm=False):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...)
    either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                   the PredictionHead (see torch.nn.module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older version of FARM.
    :type strict: bool
    :param max_seq_len: maximum length of one text sample
    :type max_seq_len: int
    :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
    :type doc_stride: int
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector),
                                'reduce_mean' (sentence vector), 'reduce_max' (sentence vector),
                                'per_token' (individual token vectors)
    :type extraction_strategy: str
    :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy="s3e")
    :type s3e_stats: dict
    :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                          multiprocessing. Set to None to let Inferencer use all CPU cores. If you want to
                          debug the Language Model, you might need to disable multiprocessing!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :return: An instance of the Inferencer.
    """
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = BaseAdaptiveModel.load(load_dir=model_name_or_path, device=device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

        # override processor attributes loaded from config file with inferencer params
        processor.max_seq_len = max_seq_len
        if hasattr(processor, "doc_stride"):
            processor.doc_stride = doc_stride

    # b) or from remote transformers model hub
    else:
        logger.info(
            f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
        )
        if not task_type:
            raise ValueError(
                "Please specify the 'task_type' of the model you want to load from transformers. "
                "Valid options for arg `task_type`: "
                "'question_answering', 'embeddings', 'text_classification', 'ner'"
            )

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(model_name_or_path)

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad",
                data_dir="data",
                doc_stride=doc_stride)
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                data_dir="data",
                label_list=label_list,
                label_column_name="label",
                metric="acc",
                quote_char='"',
            )
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     data_dir="data",
                                     metric="seq_f1",
                                     label_list=label_list)
        else:
            raise ValueError(
                f"`task_type` {task_type} is not supported yet. "
                f"Valid options for arg `task_type`: 'question_answering', "
                f"'embeddings', 'text_classification', 'ner'")

    return cls(model, processor, task_type=task_type, batch_size=batch_size, gpu=gpu,
               name=name, return_class_probs=return_class_probs,
               extraction_strategy=extraction_strategy, extraction_layer=extraction_layer,
               s3e_stats=s3e_stats, num_processes=num_processes, disable_tqdm=disable_tqdm)
def ner(self, task, model_type, n_epochs, batch_size, evaluate_every, use_cuda):
    aml_run = he.get_context()

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'ner':
        raise Exception('NOT A NER TASK')
    language = cu.params.get('language')

    # Data
    dt_task = dt.Data(task=task)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    # ner_labels = dt_task.load('fn_label', header=None)[0].to_list()
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    # n_epochs = 4
    # batch_size = 32
    # evaluate_every = 750
    # lang_model = "xlm-roberta-large"

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', ner_labels)
    except:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=dt_task.data_dir,
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
def ner(task: str, lm: str):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=use_cuda)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
    n_epochs = 10
    batch_size = 32
    evaluate_every = 1000

    model_dir = MODEL_DIR
    if lm == 'bert-hgcrw':
        lang_model = "redewiedergabe/bert-base-historical-german-rw-cased"
        model_dir += '_bert-hgcrw'
    elif lm == 'lmgot01':
        lang_model = Path("/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_01")
        model_dir += '_lmgot01'
    elif lm == 'lmgot02':
        lang_model = Path("/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_02")
        model_dir += '_lmgot02'
    else:
        lang_model = "bert-base-german-cased"
    if task != 'all':
        model_dir += '_' + task
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    if task == 'direct':
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR"]
    elif task == 'indirect':
        ner_labels = ["[PAD]", "X", "O", "B-IND", "I-IND"]
    elif task == 'reported':
        ner_labels = ["[PAD]", "X", "O", "B-REP", "I-REP"]
    else:
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR", "B-IND", "I-IND", "B-REP", "I-REP"]

    data_dir = DATA_DIR
    if task != 'all':
        data_dir += task + '/'
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=64,
                             data_dir=Path(data_dir),
                             delimiter="\t",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_loader_worker = 1
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = model_dir
    model.save(save_dir)
    processor.save(save_dir)
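# Illustrative invocation of the training function above; the chosen task and language-model keys
# are examples of the values handled by the if/elif branches, not a prescribed configuration.
if __name__ == "__main__":
    ner(task="direct", lm="bert-hgcrw")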
def load(
    cls,
    model_name_or_path,
    batch_size=4,
    gpu=False,
    task_type=None,
    return_class_probs=False,
    strict=True,
    max_seq_len=256,
    doc_stride=128,
    extraction_layer=None,
    extraction_strategy=None,
    s3e_stats=None,
    num_processes=None,
    disable_tqdm=False,
    tokenizer_class=None,
    use_fast=False,
    tokenizer_args=None,
    dummy_ph=False,
    benchmarking=False,
):
    """
    Load an Inferencer incl. all relevant components (model, tokenizer, processor ...)
    either by

    1. specifying a public name from transformers' model hub (https://huggingface.co/models)
    2. or pointing to a local directory it is saved in.

    :param model_name_or_path: Local directory or public name of the model to load.
    :type model_name_or_path: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param task_type: Type of task the model should be used for. Currently supporting:
                      "embeddings", "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                   the PredictionHead (see torch.nn.module.load_state_dict()).
                   Set to `False` for backwards compatibility with PHs saved with older version of FARM.
    :type strict: bool
    :param max_seq_len: maximum length of one text sample
    :type max_seq_len: int
    :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
    :type doc_stride: int
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector),
                                'reduce_mean' (sentence vector), 'reduce_max' (sentence vector),
                                'per_token' (individual token vectors)
    :type extraction_strategy: str
    :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy="s3e")
    :type s3e_stats: dict
    :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                          multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you
                          want to debug the Language Model, you might need to disable multiprocessing!
                          **Warning!** If you use multiprocessing you have to close the `multiprocessing.Pool` again!
                          To do so call :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                          done using this class. The garbage collector will not do this for you!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the
                     tokenizer (True) or use the Python one (False).
    :type use_fast: bool
    :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
                           See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed
                           tokenizer documentation on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
    :type tokenizer_args: dict
    :param dummy_ph: If True, methods of the prediction head will be replaced with a dummy method.
                     This is used to isolate lm run time from ph run time.
    :type dummy_ph: bool
    :param benchmarking: If True, a benchmarking object will be initialised within the class and certain parts
                         of the code will be timed for benchmarking. Should be kept False if not benchmarking since
                         these timing checkpoints require synchronization of the asynchronous Pytorch operations and
                         may slow down the model.
    :type benchmarking: bool
    :return: An instance of the Inferencer.
    """
    if tokenizer_args is None:
        tokenizer_args = {}

    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)
    name = os.path.basename(model_name_or_path)

    # a) either from local dir
    if os.path.exists(model_name_or_path):
        model = BaseAdaptiveModel.load(load_dir=model_name_or_path, device=device, strict=strict)
        if task_type == "embeddings":
            processor = InferenceProcessor.load_from_dir(model_name_or_path)
        else:
            processor = Processor.load_from_dir(model_name_or_path)

        # override processor attributes loaded from config file with inferencer params
        processor.max_seq_len = max_seq_len
        if hasattr(processor, "doc_stride"):
            assert doc_stride < max_seq_len, \
                "doc_stride is longer than max_seq_len. This means that there will be gaps " \
                "as the passage windows slide, causing the model to skip over parts of the document. " \
                "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384) "
            processor.doc_stride = doc_stride

    # b) or from remote transformers model hub
    else:
        logger.info(
            f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
        )
        if not task_type:
            raise ValueError(
                "Please specify the 'task_type' of the model you want to load from transformers. "
                "Valid options for arg `task_type`: "
                "'question_answering', 'embeddings', 'text_classification', 'ner'"
            )

        model = AdaptiveModel.convert_from_transformers(model_name_or_path, device, task_type)
        config = AutoConfig.from_pretrained(model_name_or_path)
        tokenizer = Tokenizer.load(
            model_name_or_path,
            tokenizer_class=tokenizer_class,
            use_fast=use_fast,
            **tokenizer_args,
        )

        # TODO infer task_type automatically from config (if possible)
        if task_type == "question_answering":
            processor = SquadProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                label_list=["start_token", "end_token"],
                metric="squad",
                data_dir="data",
                doc_stride=doc_stride)
        elif task_type == "embeddings":
            processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
        elif task_type == "text_classification":
            label_list = list(config.id2label[id] for id in range(len(config.id2label)))
            processor = TextClassificationProcessor(
                tokenizer=tokenizer,
                max_seq_len=max_seq_len,
                data_dir="data",
                label_list=label_list,
                label_column_name="label",
                metric="acc",
                quote_char='"',
            )
        elif task_type == "ner":
            label_list = list(config.label2id.keys())
            processor = NERProcessor(tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     data_dir="data",
                                     metric="seq_f1",
                                     label_list=label_list)
        else:
            raise ValueError(
                f"`task_type` {task_type} is not supported yet. "
                f"Valid options for arg `task_type`: 'question_answering', "
                f"'embeddings', 'text_classification', 'ner'")

    if not isinstance(model, ONNXAdaptiveModel):
        model, _ = optimize_model(model=model, device=device, local_rank=-1, optimizer=None)

    return cls(model, processor, task_type=task_type, batch_size=batch_size, gpu=gpu,
               name=name, return_class_probs=return_class_probs,
               extraction_strategy=extraction_strategy, extraction_layer=extraction_layer,
               s3e_stats=s3e_stats, num_processes=num_processes, disable_tqdm=disable_tqdm,
               benchmarking=benchmarking, dummy_ph=dummy_ph)
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=True)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path(DATA_DIR),
        delimiter=" ",
        metric="seq_f1",
        label_list=ner_labels
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
    #    and calculates a few descriptive statistics of our datasets
    data_loader_worker = 15
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = MODEL_DIR
    model.save(save_dir)
    processor.save(save_dir)
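# A minimal sketch of reloading the model stored above for inference, assuming MODEL_DIR points at
# the save directory written by this script; the example text is illustrative.
from farm.infer import Inferencer

inferencer = Inferencer.load(MODEL_DIR)
predictions = inferencer.inference_from_dicts(dicts=[{"text": "Angela Merkel visited Paris."}])
print(predictions)
inferencer.close_multiprocessing_pool()  # close the pool when multiprocessing is used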