Example #1
trainer = Trainer(
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    warmup_linear=warmup_linear,
    evaluate_every=evaluate_every,
    device=device,
)

# 7. Let it grow
model = trainer.train(model)

# 8. Hooray! You have a model. Store it:
save_dir = "saved_models/bert-german-ner-tutorial"
model.save(save_dir)
processor.save(save_dir)

# 9. Load it & harvest your fruits (Inference)
basic_texts = [
    {
        "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
    },
    {
        "text": "Martin Müller spielt Handball in Berlin"
    },
]
model = Inferencer.load(save_dir)
result = model.run_inference(dicts=basic_texts)
print(result)
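
Both inference entry points on this page (run_inference in the older API above, inference_from_dicts in the later examples) return a list with one entry per input dict, and the test assertions in Examples #3 and #5 show each entry carrying a "predictions" list whose items include "context" and "probability". A minimal inspection sketch under that assumption (the "label" key is likewise assumed, not asserted):

def show_entities(result):
    # One entry per input text; each holds a list of span predictions.
    for doc in result:
        for pred in doc.get("predictions", []):
            # "context"/"probability" appear in the asserts below; "label" is assumed.
            print(pred.get("label"), pred.get("context"), pred.get("probability"))

show_entities(result)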
Example #2
import logging
from pathlib import Path

# FARM imports (module paths assume FARM's standard layout; adjust if your version differs)
from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import NERProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import TokenClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import MLFlowLogger, initialize_device_settings, set_all_seeds


def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_ner")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=Path("../data/conll03-de"),
                             delimiter=" ",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
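
The delimiter=" " passed to the NERProcessor implies a CoNLL-style file with one "token label" pair per line and a blank line between sentences (see the sample file referenced in the comment above). An illustrative fragment built from the demo sentences; the tags here are plausible guesses, not the shipped sample:

example_train_fragment = """Schartau B-PER
sagte O
dem O
Tagesspiegel B-ORG
, O
dass O
Fischer B-PER
ein O
Idiot O
sei O

Martin B-PER
Müller I-PER
spielt O
Handball O
in O
Berlin B-LOC
"""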
Example #3
def test_ner_amp(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "1980 kam der Crown von Toyota"
        },
    ]
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    #print(result)
    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
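
The AMP_AVAILABLE flag above guards NVIDIA Apex mixed-precision training ('O1' is an Apex optimization level). A minimal sketch of how such a flag can be derived, assuming Apex is the backend (FARM exposes a comparable flag from its optimization module):

try:
    from apex import amp  # noqa: F401  -- provides the 'O0'..'O3' opt levels
    AMP_AVAILABLE = True
except ImportError:
    AMP_AVAILABLE = False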
Example #4
    def execML(self, job):
        start_time = time.time()
        if job.task == 'analyse':
            basic_texts = []
            # Will download and store dataset...
            sample = self.downloadAndConvertText(job, job.data_sample)
            for text in sample.encode('utf-8').splitlines():
                basic_texts.append({'text': text.decode('utf-8')})
            # Will download and store model...
            self.downloadAndStoreZIPModel(job, job.model)
            self.updateJobStatus(job, 'analysing')
            save_dir = 'tmp/' + job.model['id']
            model = Inferencer.load(save_dir)
            result = model.inference_from_dicts(dicts=basic_texts)
            self.persistResult(job, result)
            model.close_multiprocessing_pool()
            self.updateJobStatus(job, 'completed')
        elif job.task == 'train':
            self.updateJobStatus(job, 'training')
            # Will download and store dataset...
            self.downloadAndStoreZIPDataset(job, job.data_source)
            # Will download and store model...
            self.downloadAndStoreZIPModel(job, job.model)
            set_all_seeds(seed=42)
            device, n_gpu = initialize_device_settings(use_cuda=True)
            n_epochs = 4
            evaluate_every = 400
            do_lower_case = False
            batch_size = 32
            lang_model = os.path.join(Path.cwd(), 'tmp', job.model['id'])
            ner_labels = [
                "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
            ]
            # 1. Create a tokenizer
            tokenizer = Tokenizer.load(
                pretrained_model_name_or_path=lang_model,
                do_lower_case=do_lower_case,
                tokenizer_class='BertTokenizer'
            )
            # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
            processor = NERProcessor(tokenizer=tokenizer,
                                     max_seq_len=128,
                                     data_dir=str(
                                         os.path.join(Path.cwd(), 'tmp',
                                                      job.data_source['id'])),
                                     delimiter=' ',
                                     metric='seq_f1',
                                     label_list=ner_labels)
            # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
            data_silo = DataSilo(processor=processor,
                                 batch_size=batch_size,
                                 max_processes=1)
            # 4. Create an AdaptiveModel
            # 4.1 which consists of a pretrained language model as a basis
            language_model = LanguageModel.load(lang_model)
            # 4.2 and a prediction head on top that is suited for our task => NER
            prediction_head = TokenClassificationHead(
                num_labels=len(ner_labels))
            model = AdaptiveModel(
                language_model=language_model,
                prediction_heads=[prediction_head],
                embeds_dropout_prob=0.1,
                lm_output_types=['per_token'],
                device=device,
            )
            # 5. Create an optimizer
            model, optimizer, lr_schedule = initialize_optimizer(
                model=model,
                learning_rate=1e-5,
                n_batches=len(data_silo.loaders["train"]),
                n_epochs=n_epochs,
                device=device,
            )
            # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
            trainer = Trainer(
                model=model,
                optimizer=optimizer,
                data_silo=data_silo,
                epochs=n_epochs,
                n_gpu=n_gpu,
                lr_schedule=lr_schedule,
                evaluate_every=evaluate_every,
                device=device,
            )
            # 7. Let it grow
            trainer.train()
            # 8. Hooray! You have a model. Store it:
            newModelId = str(uuid.uuid4())
            save_dir = 'tmp/' + newModelId
            model.save(save_dir)
            processor.save(save_dir)
            model.close_multiprocessing_pool()
            self.persistZIPModel(newModelId, job)
            self.updateJobStatus(job, 'completed')
        elapsed_time = time.time() - start_time
        print('Execution time:',
              elapsed_time,
              'for job.id:',
              job.id,
              flush=True)
        return {'status': True, 'code': 'ok', 'msg': 'success'}
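
The job object consumed by execML is project-specific and never defined on this page; its shape can only be inferred from the attribute accesses above. A hypothetical invocation sketch, with every field name mirroring those accesses:

from types import SimpleNamespace

# Hypothetical job; all fields inferred from execML's attribute accesses.
job = SimpleNamespace(
    id="job-123",                                  # used in the timing printout
    task="analyse",                                # or "train"
    data_sample="https://example.org/sample.txt",  # hypothetical sample location
    data_source={"id": "dataset-abc"},             # only read when task == "train"
    model={"id": "model-xyz"},
)
# worker.execML(job)  # where `worker` is an instance of the (unshown) surrounding class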
Example #5
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH",
                  "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "sagte"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
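
This example sizes the head with layer_dims=[768, len(ner_labels)], 768 being the hidden size of bert-base encoders; the other examples on this page spell the same head as num_labels=len(ner_labels) and let the input dimension be inferred. A side-by-side sketch (import path as used by FARM):

from farm.modeling.prediction_head import TokenClassificationHead

ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
              "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

head_explicit = TokenClassificationHead(layer_dims=[768, len(ner_labels)])  # older spelling
head_implicit = TokenClassificationHead(num_labels=len(ner_labels))         # newer spelling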
Example #6
def test_ner(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Paris is a town in France."
        },
    ]
    model = Inferencer.load(
        model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
        num_processes=0,
        task_type="ner")
    # Labels aren't correctly inserted from transformers:
    # they are converted to LABEL_1 ... LABEL_N.
    # For the inference result to contain predictions we need them in IOB NER format.
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
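
The one-element patch above is just enough for the Paris assertion. When a converted checkpoint only reports generic LABEL_0 ... LABEL_N names, the fuller fix is to overwrite the entire list with the IOB tags the model was trained on; the CoNLL-03 ordering below is an assumption and must be checked against the checkpoint's id2label mapping:

# Assumed CoNLL-03 tag set; verify the ordering against the model's config.
conll03_labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
model.processor.tasks["ner"]["label_list"] = conll03_labels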
Example #7
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Albrecht Lehman ist eine Person"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    #print(result)
    #assert result[0]["predictions"][0]["context"] == "sagte"
    #assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
    result2 = model.inference_from_dicts(dicts=basic_texts,
                                         rest_api_schema=True)
    assert result == result2
Example #8
    def ner(self, task, model_type, n_epochs, batch_size, evaluate_every,
            use_cuda):
        aml_run = he.get_context()
        # Check task
        if cu.tasks.get(str(task)).get('type') != 'ner':
            raise Exception('NOT A NER TASK')
        language = cu.params.get('language')

        # Data
        dt_task = dt.Data(task=task)

        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=use_cuda)
        lang_model = he.get_farm_model(model_type, language)
        save_dir = dt_task.get_path('model_dir')
        # ner_labels = dt_task.load('fn_label', header=None)[0].to_list()
        ner_labels = [
            "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
            "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
        ]

        # n_epochs = 4
        # batch_size = 32
        # evaluate_every = 750
        # lang_model =  "xlm-roberta-large"

        # AML log
        try:
            aml_run.log('task', task)
            aml_run.log('language', language)
            aml_run.log('n_epochs', n_epochs)
            aml_run.log('batch_size', batch_size)
            aml_run.log('lang_model', lang_model)
            aml_run.log_list('label_list', ner_labels)
        except Exception:
            pass

        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                                   do_lower_case=False)

        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        processor = NERProcessor(tokenizer=tokenizer,
                                 max_seq_len=128,
                                 data_dir=dt_task.data_dir,
                                 metric="seq_f1",
                                 label_list=ner_labels)

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => NER
        prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=1e-5,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow
        trainer.train()

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)
Example #9
def ner(task: str, lm: str):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    use_cuda = True
    set_all_seeds(seed=42, deterministic_cudnn=use_cuda)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)
    n_epochs = 10
    batch_size = 32
    evaluate_every = 1000
    model_dir = MODEL_DIR
    if lm == 'bert-hgcrw':
        lang_model = "redewiedergabe/bert-base-historical-german-rw-cased"
        model_dir += '_bert-hgcrw'
    elif lm == 'lmgot01':
        lang_model = Path(
            "/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_01")
        model_dir += '_lmgot01'
    elif lm == 'lmgot02':
        lang_model = Path(
            "/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_02")
        model_dir += '_lmgot02'
    else:
        lang_model = "bert-base-german-cased"
    if task != 'all':
        model_dir += '_' + task
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    if task == 'direct':
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR"]
    elif task == 'indirect':
        ner_labels = ["[PAD]", "X", "O", "B-IND", "I-IND"]
    elif task == 'reported':
        ner_labels = ["[PAD]", "X", "O", "B-REP", "I-REP"]
    else:
        ner_labels = [
            "[PAD]", "X", "O", "B-DIR", "I-DIR", "B-IND", "I-IND", "B-REP",
            "I-REP"
        ]

    data_dir = DATA_DIR
    if task != 'all':
        data_dir += task + '/'
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=64,
                             data_dir=Path(data_dir),
                             delimiter="\t",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 1
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = model_dir
    model.save(save_dir)
    processor.save(save_dir)
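
The task-to-labels branching in step 2 could equally be table-driven, which keeps all tag sets in one place; a sketch using only the lists already defined above:

TASK_LABELS = {
    "direct": ["[PAD]", "X", "O", "B-DIR", "I-DIR"],
    "indirect": ["[PAD]", "X", "O", "B-IND", "I-IND"],
    "reported": ["[PAD]", "X", "O", "B-REP", "I-REP"],
}
ALL_LABELS = ["[PAD]", "X", "O", "B-DIR", "I-DIR", "B-IND", "I-IND", "B-REP", "I-REP"]

def labels_for(task: str) -> list:
    # 'all' (or any unknown task) falls back to the combined tag set.
    return TASK_LABELS.get(task, ALL_LABELS)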
Example #10
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=True)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir=Path(DATA_DIR), delimiter=" ", metric="seq_f1", label_list=ner_labels
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 15
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = MODEL_DIR
    model.save(save_dir)
    processor.save(save_dir)
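
Example #10 stops after saving. Loading the trained model back for predictions follows the same Inferencer pattern as the earlier examples; a minimal sketch (MODEL_DIR is the constant used above, the sentence is illustrative):

from farm.infer import Inferencer

model = Inferencer.load(MODEL_DIR)
result = model.inference_from_dicts(dicts=[{"text": "Angela Merkel met Emmanuel Macron in Berlin."}])
print(result)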