Example 1
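The examples in this listing omit their imports. A plausible import header for Example 1, assuming the usual FARM package layout (exact module paths may differ between FARM releases), would be:

import logging
import os
import pprint
from pathlib import Path

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import SquadProcessor
from farm.data_handler.utils import write_squad_predictions
from farm.infer import QAInferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import QuestionAnsweringHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings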
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Question Answering
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    QA_input = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)[0]

    pprint.pprint(result)

    # 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk
    filename = os.path.join(processor.data_dir, processor.dev_filename)
    result = model.inference_from_file(file=filename, return_json=False)
    result_squad = [x.to_squad_eval() for x in result]

    write_squad_predictions(predictions=result_squad,
                            predictions_filename=filename,
                            out_filename="predictions.json")
Example 2
def xlmr_qa_demo():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xlmr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
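    # With gradient accumulation, the effective batch size per optimizer step
    # is batch_size * grad_acc_steps = 3 * 8 = 24 examples.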
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8
    train = True
    inference = False

    if train:
        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)
        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False, max_processes=1)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )
        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)


    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        for x in full_result:
            print(x)
            print()

        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        json.dump(result,
                  open(predictions_file, "w"),
                  indent=4,
                  ensure_ascii=False)
        json.dump(full_result,
                  open(full_predictions_file, "w"),
                  indent=4,
                  ensure_ascii=False)
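As a quick sanity check, the flat predictions file can be reloaded and inspected. This is a sketch assuming the predictions_file written above, where each value is presumably the top prediction for that question id:

import json

with open(predictions_file) as f:
    top_preds = json.load(f)
for qid, pred in list(top_preds.items())[:3]:
    print(qid, "->", pred)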
Example 3
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    model = trainer.train(model)

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)

if inference:
    model = Inferencer.load(save_dir, batch_size=32, gpu=True)
    full_result = model.inference_from_file(
        file=inference_file, use_multiprocessing=inference_multiprocessing)

    for x in full_result:
        print(x)
        print()

    result = {r["id"]: r["preds"][0][0] for r in full_result}
    full_result = {r["id"]: r["preds"] for r in full_result}

    json.dump(result,
              open(predictions_file, "w"),
              indent=4,
              ensure_ascii=False)
    json.dump(full_result,
              open(full_predictions_file, "w"),
              indent=4,
              ensure_ascii=False)
Example 4
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_text_pair_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # 0. Preprocess and save MSMarco data in a format that can be ingested by FARM models. Only needs to be done once!
    # The final format is a tsv file with 3 columns (text, text_b and label)
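    # A hypothetical row (tab-separated) might look like:
    #   what is the capital of france?<TAB>Paris is the capital and most populous city of France.<TAB>1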
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             data_dir / qrels_filename,
                             data_dir / top1000_filename,
                             data_dir / dev_filename)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Evaluation during training will be performed on a slice of the train set
    #    We will be using the msmarco dev set as our final evaluation set
    processor = TextPairClassificationProcessor(tokenizer=tokenizer,
                                                label_list=label_list,
                                                metric="f1_macro",
                                                train_filename=train_filename,
                                                test_filename=None,
                                                dev_split=0.001,
                                                max_seq_len=128,
                                                data_dir=data_dir,
                                                delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(num_labels=len(label_list),
                                             class_weights=data_silo.calculate_class_weights(
                                                 task_name="text_classification"),
                                             )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    result = model.inference_from_file(data_dir / dev_filename)

    write_msmarco_results(result, save_dir / predictions_raw_filename)

    msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                       dev_file=data_dir / dev_filename,
                       qrels_file=data_dir / qrels_filename,
                       output_file=save_dir / predictions_filename)

    model.close_multiprocessing_pool()
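For ad-hoc scoring of individual query/passage pairs, the same saved model can also be fed dicts directly. A minimal sketch, assuming the Inferencer accepts the text / text_b fields used in the training tsv (the query/passage pair below is hypothetical):

model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=32)
pairs = [{"text": "how long should pasta be boiled",
          "text_b": "Dried pasta is typically boiled for 8 to 12 minutes."}]
result = model.inference_from_dicts(dicts=pairs)
print(result)
model.close_multiprocessing_pool()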
Example 5
model = trainer.train(model)

# 8. Hooray! You have a model. Store it:
save_dir = "../saved_models/bert-english-qa-tutorial"
model.save(save_dir)
processor.save(save_dir)

# 9. Load it & harvest your fruits (Inference)
QA_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text":  "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }]

model = Inferencer.load(save_dir, batch_size=40, gpu=True)
result = model.inference_from_dicts(dicts=QA_input)

for x in result:
    pprint.pprint(x)

# 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk
filename = os.path.join(processor.data_dir, processor.dev_filename)
result = model.inference_from_file(file=filename)
write_squad_predictions(
    predictions=result,
    predictions_filename=filename,
    out_filename="predictions.json"
)