Beispiel #1
0
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=64,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)
Beispiel #2
0
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text regression
prediction_head = RegressionHead(layer_dims=[768, 1])

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence_continuous"],
                      device=device)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)

# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
trainer = Trainer(optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=n_epochs,
                  n_gpu=n_gpu,
                  warmup_linear=warmup_linear,
                  evaluate_every=evaluate_every,
                  device=device)

# 7. Let it grow
model = trainer.train(model)
Beispiel #3
0
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text regression
prediction_head = RegressionHead(layer_dims=[768, 1])

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence_continuous"],
                      device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)

# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
trainer = Trainer(optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=n_epochs,
                  n_gpu=n_gpu,
                  lr_schedule=lr_schedule,
                  evaluate_every=evaluate_every,
                  device=device)

# 7. Let it grow
model = trainer.train(model)
Beispiel #4
0
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 30
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = GermEval18CoarseProcessor(tokenizer=tokenizer,
                                          max_seq_len=64,
                                          data_dir="samples/doc_class",
                                          train_filename="train-sample.tsv",
                                          test_filename=None)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.label_list)])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_examples=data_silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] -
               0.5358161) <= 0.0001
Beispiel #5
0
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
        never_split_chars=["-", "_"])

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Farmer's life is great."
        },
        {
            "text": "It's nothing for big city kids though."
        },
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == [
        'Farmer', "'", 's', 'life', 'is', 'great', '.'
    ]
    assert result[0]["vec"].shape == (768, )
    # TODO check why results vary accross runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
Beispiel #6
0
    layer_dims=[language_model.model.config.hidden_size, 2],
    loss_ignore_index=-1)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_examples=data_silo.n_samples("train"),
    batch_size=16,
    n_epochs=1,
)

# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
trainer = Trainer(
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=10,
    n_gpu=1,
    warmup_linear=warmup_linear,
    evaluate_every=100,
    device=device,
)
Beispiel #7
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_text_pair_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # 0. Preprocess and save MSMarco data in a format that can be ingested by FARM models. Only needs to be done once!
    # The final format is a tsv file with 3 columns (text, text_b and label)
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             data_dir / qrels_filename,
                             data_dir / top1000_filename,
                             data_dir / dev_filename)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Evaluation during training will be performed on a slice of the train set
    #    We will be using the msmarco dev set as our final evaluation set
    processor = TextPairClassificationProcessor(tokenizer=tokenizer,
                                                label_list=label_list,
                                                metric="f1_macro",
                                                train_filename=train_filename,
                                                test_filename=None,
                                                dev_split=0.001,
                                                max_seq_len=128,
                                                data_dir=data_dir,
                                                delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(num_labels=len(label_list),
                                             class_weights=data_silo.calculate_class_weights(
                                                 task_name="text_classification"),
                                             )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    result = model.inference_from_file(data_dir / dev_filename)

    write_msmarco_results(result, save_dir / predictions_raw_filename)

    msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                       dev_file=data_dir / dev_filename,
                       qrels_file=data_dir / qrels_filename,
                       output_file=save_dir / predictions_filename)

    model.close_multiprocessing_pool()
Beispiel #8
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # The TextPairClassificationProcessor expects a csv with columns called "text', "text_b" and "label"
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        dev_filename="dev.tsv",
        test_filename=None,
        data_dir=Path("../data/asnq_binary"),
        delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text":
            "how many times have real madrid won the champions league in a row",
            "text_b":
            "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
        },
        {
            "text": "how many seasons of the blacklist are there on netflix",
            "text_b": "Retrieved March 27 , 2018 ."
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
Beispiel #9
0
def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=False
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight, model.prediction_heads[0].decoder.weight))

    save_dir = "testsave/lm_finetuning_no_nsp"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary accross runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
Beispiel #10
0
def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 6
    n_epochs = 2
    evaluate_every = 100
    base_LM_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
    )

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)
    language_model = Bert.load(base_LM_model)
    prediction_head = QuestionAnsweringHead(
        layer_dims=[768, len(processor.label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_examples=data_silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)
    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)

    QA_input = [{
        "questions": ["In what country is Normandy located?"],
        "text":
        "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
    }]

    model = Inferencer(save_dir)
    result = model.run_inference(dicts=QA_input)
    assert result[0]["predictions"][0]["label"] == 'The'
Beispiel #11
0
def test_text_pair_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "microsoft/MiniLM-L12-H384-uncased"
    label_list = ["0", "1", "2"]

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        train_filename="sample.tsv",
        dev_filename="sample.tsv",
        test_filename=None,
        data_dir=Path("samples/text_pair"),
        delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # For correct Text Pair Classification on raw dictionaries, we need to put both texts (text, text_b) into a tuple
    # See corresponding operation in the file_to_dicts method of TextPairClassificationProcessor here: https://github.com/deepset-ai/FARM/blob/5ab5b1620cb51ceb874d4b30c887e377ad1a6e9a/farm/data_handler/processor.py#L744
    basic_texts = [
        {
            "text":
            ("how many times have real madrid won the champions league in a row",
             "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
             )
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["label"] == "1"
    assert np.isclose(result[0]["predictions"][0]["probability"],
                      0.3781,
                      rtol=0.05)
    model.close_multiprocessing_pool()
Beispiel #12
0
def test_text_pair_regression(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "microsoft/MiniLM-L12-H384-uncased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairRegressionProcessor(tokenizer=tokenizer,
                                            label_list=None,
                                            metric="f1_macro",
                                            max_seq_len=128,
                                            train_filename="sample.tsv",
                                            dev_filename="sample.tsv",
                                            test_filename=None,
                                            data_dir=Path("samples/text_pair"),
                                            delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/text_pair_regression_model")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text":
            ("how many times have real madrid won the champions league in a row",
             "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
             )
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert np.isclose(result[0]["predictions"][0]["pred"], 0.7976, rtol=0.05)
    model.close_multiprocessing_pool()
Beispiel #13
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    We do not have a sample dataset for regression yet, add your own dataset to run the example
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="acc",
        label_column_name="label",
        max_seq_len=64,
        train_filename=training_filename,
        dev_filename=test_filename,
        test_filename=test_filename,
        data_dir=Path("../data"),
        tasks={"text_classification"},
        delimiter="\t")

    # train_filename = training_filename,
    # test_filename = test_filename,
    # dev_filename = test_filename,
    # dev_split = 0.5,

    # data_dir=Path("../data/asnq_binary"),

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    # Alte Version vor StreamingDataSilo
    # data_silo = DataSilo(
    #    processor=processor,
    #    batch_size=batch_size, max_processes=4)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    now = datetime.now()  # current date and time

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        #metric="f1_weighted", mode="max",  # use f1_macro from the dev evaluator of the trainer
        metric="loss",
        mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path(
            "saved_models/earlystopping/" +
            now.strftime("%m%d%Y%H%M%S")),  # where to save the best model
        patience=
        8  # number of evaluations to wait for improvement before terminating the training
    )

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)
    # model=model,
    # optimizer=optimizer,
    # data_silo=data_silo,
    # epochs=n_epochs,
    # n_gpu=n_gpu,
    # lr_schedule=lr_schedule,
    # evaluate_every=evaluate_every,
    # device=device)

    # 7. Let it grow
    #comment if going to use a stored model
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    # When a new model is being trained and need to be saved
    save_dir = Path("saved_models/text_pair_classification_model" +
                    now.strftime("%m%d%Y%H%M%S"))
    model.save(save_dir)
    processor.save(save_dir)

    # When only a model needs to be loaded change the details to load the needed model
    # save_dir = Path("saved_models/text_pair_classification_model" + "01272021103548")

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text":
            "<claim-text>The method of claim 10, wherein the indium metal layer is 10 nm to 100 µm thick.</claim-text>",
            "text_b":
            "<p id="
            "p0001"
            " num="
            "0001"
            ">The present invention is directed to metal plating compositions and methods. More specifically, the present invention is directed to metal plating compositions and methods which provide improved leveling and throwing power.</p <p id="
            "p0039"
            " num="
            "0039"
            ">One or more conventional surfactants may be used. Typically, surfactants include, but are not limited to, nonionic surfactants such as alkyl phenoxy polyethoxyethanols. Other suitable surfactants containing multiple oxyethylene groups also may be used. Such surfactants include compounds of polyoxyethylene polymers having from as many as 20 to 150 repeating units. Such compounds also may perform as suppressors. Also included in the class of polymers are both block and random copolymers of polyoxyethylene (EO) and polyoxypropylene (PO). Surfactants may be added in conventional amounts, such as from 0.05 g/L to 20 g/L or such as from 0.5 g/L to 5 g/L.</p <p id="
            "p0040"
            " num="
            "0040"
            ">Conventional levelers include, but are not limited to, one or more of alkylated polyalkyleneimines and organic sulfo sulfonates. Examples of such compounds include, 4-mercaptopyridine, 2-mercaptothiazoline, ethylene thiourea, thiourea, 1-(2-hydroxyethyl)-2-imidazolidinethion (HIT) and alkylated polyalkyleneimines. Such levelers are included in conventional amounts. Typically, such levelers are included in amounts of 1ppb to 1 g/L, or such as from 10ppb to 500ppm.</p <p id="
            "p0042"
            " num="
            "0042"
            ">Alkali metal salts which may be included in the plating compositions include, but are not limited to, sodium and potassium salts of halogens, such as chloride, fluoride and bromide. Typically chloride is used. Such alkali metal salts are used in conventional amounts.</p <p id="
            "p0053"
            " num="
            "0053"
            ">The metal plating compositions may be used to plate a metal or metal alloy on a substrate by any method known in the art and literature. Typically, the metal or metal alloy is electroplated using conventional electroplating processes with conventional apparatus. A soluble or insoluble anode may be used with the electroplating compositions.</p <p id="
            "p0022"
            " num="
            "0022"
            ">One or more sources of metal ions are included in metal plating compositions to plate metals. The one or more sources of metal ions provide metal ions which include, but are not limited to, copper, tin, nickel, gold, silver, palladium, platinum and indium. Alloys include, but are not limited to, binary and ternary alloys of the foregoing metals. Typically, metals chosen from copper, tin, nickel, gold, silver or indium are plated with the metal plating compositions. More typically, metals chosen from copper, tin, silver or indium are plated. Most typically, copper is plated.</p <p id="
            "p0030"
            " num="
            "0030"
            ">Indium salts which may be used include, but are not limited to, one or more of indium salts of alkane sulfonic acids and aromatic sulfonic acids, such as methanesulfonic acid, ethanesulfonic acid, butane sulfonic acid, benzenesulfonic acid and toluenesulfonic acid, salts of sulfamic acid, sulfate salts, chloride and bromide salts of indium, nitrate salts, hydroxide salts, indium oxides, fluoroborate salts, indium salts of carboxylic acids, such as citric acid, acetoacetic acid, glyoxylic acid, pyruvic acid, glycolic acid, malonic acid, hydroxamic acid, iminodiacetic acid, salicylic acid, glyceric acid, succinic acid, malic acid, tartaric acid, hydroxybutyric acid, indium salts of amino acids, such as arginine, aspartic acid, asparagine, glutamic acid, glycine, glutamine, leucine, lysine, threonine, isoleucine, and valine.</p"
        },
        {
            "text":
            "<claim-text>A toner comprising: <claim-text>toner base particles; and</claim-text> <claim-text>an external additive,</claim-text> <claim-text>the toner base particles each comprising a binder resin and a colorant,</claim-text> <claim-text>wherein the external additive comprises coalesced particles,</claim-text> <claim-text>wherein the coalesced particles are each a non-spherical secondary particle in which primary particles are coalesced together, and</claim-text> <claim-text>wherein an index of a particle size distribution of the coalesced particles is expressed by the following Formula (1): <maths id="
            "math0004"
            " num="
            "(formula (1)"
            "><math display="
            "block"
            "><mfrac><msub><mi>Db</mi><mn>50</mn></msub><msub><mi>Db</mi><mn>10</mn></msub></mfrac><mo>≦</mo><mn>1.20</mn></math><img id="
            "ib0008"
            " file="
            "imgb0008.tif"
            " wi="
            "93"
            " he="
            "21"
            " img-content="
            "math"
            " img-format="
            "tif"
            "/></maths><br/> where, in a distribution diagram in which particle diameters in nm of the coalesced particles are on a horizontal axis and cumulative percentages in % by number of the coalesced particles are on a vertical axis and in which the coalesced particles are accumulated from the coalesced particles having smaller particle diameters to the coalesced particles having larger particle diameters, Db<sub>50</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 50% by number, and Db<sub>10</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 10% by number.</claim-text></claim-text>",
            "text_b":
            "<p id="
            "p0177"
            " num="
            "0177"
            ">For a similar reason, it is preferred that the electroconductive fine powder has a volume-average particle size of 0.5 - 5 µm, more preferably 0.8 - 5 µm, further preferably 1.1 - 5 µm and has a particle size distribution such that particles of 0.5 µm or smaller occupy at most 70 % by volume and particles of 5.0 µm or larger occupy at most 5 % by number.</p <p id="
            "p0189"
            " num="
            "0189"
            ">The volume-average particle size and particle size distribution of the electroconductive fine powder described herein are based on values measured in the following manner. A laser diffraction-type particle size distribution measurement apparatus ("
            "Model LS-230"
            ", available from Coulter Electronics Inc.) is equipped with a liquid module, and the measurement is performed in a particle size range of 0.04 - 2000 µm to obtain a volume-basis particle size distribution. For the measurement, a minor amount of surfactant is added to 10 cc of pure water and 10 mg of a sample electroconductive fine powder is added thereto, followed by 10 min. of dispersion by means of an ultrasonic disperser (ultrasonic homogenizer) to obtain a sample dispersion liquid, which is subjected to a single time of measurement for 90 sec.</p <p id="
            "p0191"
            " num="
            "0191"
            ">In the case where the electroconductive fine powder is composed of agglomerate particles, the particle size of the electroconductive fine powder is determined as the particle size of the agglomerate. The electroconductive fine powder in the form of agglomerated secondary particles can be used as well as that in the form of primary particles. Regardless of its agglomerated form, the electroconductive fine powder can exhibit its desired function of charging promotion by presence in the form of the agglomerate in the charging section at the contact position<!-- EPO <DP n="
            "85"
            "> --> between the charging member and the image-bearing member or in a region in proximity thereto.</p"
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
    model.close_multiprocessing_pool()
Beispiel #14
0
def doc_regression():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    We do not have a sample dataset for regression yet, add your own dataset to run the example
    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=128,
                                    data_dir=Path("../data/<YOUR-DATASET>"),
                                    label_column_name="label"
                                    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text regression
    prediction_head = RegressionHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-regression-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {"text": ""},
        {"text": ""},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)