Code example #1
def test_dpr_training():
    batch_size = 1
    n_epochs = 1
    distributed = False  # enable for multi GPU training via DDP
    evaluate_every = 1
    question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
    passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
    do_lower_case = True
    use_fast = True
    similarity_function = "dot_product"

    device, n_gpu = initialize_device_settings(use_cuda=False)

    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    label_list = ["hard_negative", "positive"]

    processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
                                        passage_tokenizer=passage_tokenizer,
                                        max_seq_len_query=10,
                                        max_seq_len_passage=10,
                                        label_list=label_list,
                                        metric="text_similarity_metric",
                                        data_dir="samples/dpr/",
                                        train_filename="sample.json",
                                        dev_filename="sample.json",
                                        test_filename=None,
                                        embed_title=True,
                                        num_hard_negatives=1,
                                        dev_split=0,
                                        max_samples=2)

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)

    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path=question_lang_model,
        language_model_class="DPRQuestionEncoder")
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path=passage_lang_model,
        language_model_class="DPRContextEncoder")

    prediction_head = TextSimilarityHead(
        similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
                        "eps": 1e-08},
        schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    trainer.train()
Code example #2
##########################
set_all_seeds(seed=42)
batch_size = 32
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                           do_lower_case=False)

# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

# 4. Create an AdaptiveModel with a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)

adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Extract embeddings with model in inference mode
basic_texts = [
    {
        "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"
    },
    {
        "text": "Martin Müller spielt Handball in Berlin"  # illustrative second input
    },
]
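
The snippet stops before the embeddings are actually computed. A minimal sketch of the missing extraction step, assuming the Inferencer / extract_vectors API that code example #10 uses (the constructor arguments are an assumption, not taken from the original file):

from farm.infer import Inferencer

# Wrap the AdaptiveModel for inference and pull one pooled vector per input text
inferencer = Inferencer(adaptive_model, processor, task_type="embeddings", gpu=use_gpu)
result = inferencer.extract_vectors(dicts=basic_texts)
print(result[0]["vec"].shape)  # (768,) for a base-sized BERT model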
Code example #3
def train_from_scratch(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    #TODO prettify this loading of params from two sources (cmd + json)
    cmd_args = parse_arguments()
    args["local_rank"] = cmd_args.local_rank
    logging.info(f'local_rank: {args["local_rank"]}')

    next_sent_task = bool(int(args.get("next_sent_task", 1)))
    distributed = True
    use_amp = args.get("use_amp", None)
    use_amp = None if use_amp == "" else use_amp

    # Only the main process should log here
    if args["local_rank"] in [-1, 0]:
        ml_logger = StdoutLogger(tracking_uri=None)
        ml_logger.init_experiment(experiment_name="train_from_scratch",
                                  run_name="run")

    set_all_seeds(seed=39)

    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args["local_rank"],
                                               use_amp=use_amp)
    effective_batch_size = int(args["per_gpu_batch_size"]) * int(
        args["gradient_accumulation_steps"]
    ) * torch.distributed.get_world_size()

    logging.info(
        f'Training with effective batch size of {effective_batch_size} '
        f'(per_gpu_batch_size = {int(args["per_gpu_batch_size"])}, gradient_accumulation_steps={int(args["gradient_accumulation_steps"])}, n_gpus = {torch.distributed.get_world_size()} )'
    )

    save_dir = Path("/opt/ml/model")
    data_dir = Path("/opt/ml/input/data/input_channel")

    # Split and shuffle training data
    if args["local_rank"] in [-1, 0]:
        split_file(data_dir / args["train_file"],
                   output_dir=data_dir / "split_files")
    # let the other processes wait for the split files from rank 0
    torch.distributed.barrier()

    args["train_file"] = data_dir / "split_files"

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / args["vocab_file"],
                              do_lower_case=bool(int(args["do_lower_case"])))

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(data_dir=data_dir,
                                     tokenizer=tokenizer,
                                     max_seq_len=int(args["max_seq_len"]),
                                     train_filename=args.get("train_file"),
                                     dev_filename=args.get("dev_file", None),
                                     test_filename=args.get("test_file", None),
                                     next_sent_pred_style=args.get(
                                         "next_sent_pred_style", "bert-style"),
                                     max_docs=args.get("max_docs", None),
                                     next_sent_pred=next_sent_task)

    # 3. Create a DataSilo that loads several datasets (train/dev/test) and provides DataLoaders for them
    data_silo = StreamingDataSilo(processor=processor,
                                  batch_size=int(args["per_gpu_batch_size"]),
                                  dataloader_workers=int(
                                      args.get("data_loader_workers", 8)),
                                  distributed=distributed)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    if next_sent_task:
        next_sentence_head = NextSentenceHead(num_labels=2,
                                              task_name="nextsentence")
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head, next_sentence_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token", "per_sequence"],
            device=device,
        )
    else:
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=float(args["learning_rate"]),
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": float(args["warmup_proportion"])
        },
        n_batches=len(data_silo.get_data_loader("train")),
        n_epochs=int(args["n_epochs"]),
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args["local_rank"])

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    if args.get("checkpoint_every"):
        checkpoint_every = int(args["checkpoint_every"])
        checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    else:
        checkpoint_every = None
        checkpoint_root_dir = None

    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=int(args["n_epochs"]),
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=int(args["evaluate_every"]),
        log_loss_every=int(args.get("log_loss_every", 500)),
        log_learning_rate=bool(int(args.get("log_learning_rate", 0))),
        device=device,
        local_rank=args["local_rank"],
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=int(args.get("checkpoints_to_keep", 10)),
        disable_tqdm=True,
        use_amp=use_amp,
    )

    # 7. Let it grow!
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
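
The whole pipeline is driven by the args dict (the TODO comment notes it is merged from a JSON config and command-line flags). A configuration covering the keys the function actually reads could look roughly like this; all values are illustrative only:

args = {
    "next_sent_task": 1,
    "use_amp": "",                          # e.g. "O1" to enable mixed-precision training
    "per_gpu_batch_size": 32,
    "gradient_accumulation_steps": 1,
    "train_file": "train.txt",
    "dev_file": None,
    "test_file": None,
    "vocab_file": "vocab.txt",
    "do_lower_case": 0,
    "max_seq_len": 128,
    "next_sent_pred_style": "bert-style",
    "max_docs": None,
    "data_loader_workers": 8,
    "learning_rate": 1e-4,
    "warmup_proportion": 0.01,
    "n_epochs": 3,
    "evaluate_every": 5000,
    "log_loss_every": 500,
    "log_learning_rate": 0,
    "checkpoint_every": 10000,
    "checkpoints_to_keep": 10,
}
# "local_rank" is filled in from the command line inside train_from_scratch itself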
Code example #4
def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[
        768, len(processor.tasks["text_classification"]["label_list"])
    ])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [{
        "text": "Martin Müller spielt Handball in Berlin."
    }, {
        "text":
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."
    }]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Code example #5
File: natural_questions.py  Project: rohanag/FARM
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2" # start with a model that can already extract answers
    do_lower_case = False # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15 # downsample negative examples after data conversion
    downsample_context_size = 300 # reduce length of wikipedia articles to relevant part around the answer

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
                "<Th>","</Th>",
                "<Td>","</Td>",
                "<Tr>","</Tr>",
                "<Li>","</Li>",
                "<P>" ,"</P>",
                "<Ul>","</Ul>",
                "<H1>","</H1>",
                "<H2>","</H2>",
                "<H3>","</H3>",
                "<H4>","</H4>",
                "<H5>", "</H5>",
                "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_no_answer=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model,n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(num_labels=len(processor.answer_type_list)) # answer_type_list = ["is_impossible", "span", "yes", "no"]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources, we trained a model and uploaded it to S3
    fetch_archive_from_http("https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip", output_dir="../saved_models/farm")
    QA_input = [
        {
            "qas": ["Did GameTrailers rated Twilight Princess as one of the best games ever created?"],
            "context":  "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }
    ]

    model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq", batch_size=batch_size, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input, return_json=False) # result is a list of QAPred objects

    print(f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
          f"\nAnswer from model: {result[0].prediction[0].answer}")
Code example #6
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * n_gpu  # 40 per GPU, scaled by the number of available GPUs
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    test_assertions = False
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 78.4385
    gold_tnrecall = 97.3721
    gold_elapsed = 1135
    if test_assertions:
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.01,
            err_msg=
            f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.01,
            err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnrecall,
            rtol=0.01,
            err_msg=
            f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
    if not np.allclose(f1_score, gold_f1, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for f1 score by: {round(f1_score - gold_f1, 4)}"
        )
    if not np.allclose(em_score, gold_EM, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for EM by: {round(em_score - gold_EM, 4)}")
    if not np.allclose(tnacc, gold_tnrecall, rtol=0.01):
        error_messages.append(
            f"FARM Training changed for top 5 accuracy by: {round(tnacc - gold_tnrecall, 4)}"
        )
    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
        error_messages.append(
            f"FARM Training speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds"
        )

    benchmark_result = [{
        "run": "train evaluation",
        "f1_change": round(f1_score - gold_f1, 4),
        "em_change": round(em_score - gold_EM, 4),
        "tnacc_change": round(tnacc - gold_tnrecall, 4),
        "elapsed_change": round(elapsed - gold_elapsed, 4),
        "f1": f1_score,
        "em": em_score,
        "tnacc": round(tnacc, 4),
        "elapsed": elapsed,
        "f1_gold": gold_f1,
        "em_gold": gold_EM,
        "tnacc_gold": gold_tnrecall,
        "elapsed_gold": gold_elapsed
    }]
    logger.info("\n\n" + pformat(benchmark_result) + "\n")
    return benchmark_result
Code example #7
File: dense.py  Project: vaibhavsagar9/haystack
    def __init__(self,
                 document_store: BaseDocumentStore,
                 query_embedding_model: Union[
                     Path,
                     str] = "facebook/dpr-question_encoder-single-nq-base",
                 passage_embedding_model: Union[
                     Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
                 max_seq_len_query: int = 64,
                 max_seq_len_passage: int = 256,
                 use_gpu: bool = True,
                 batch_size: int = 16,
                 embed_title: bool = True,
                 use_fast_tokenizers: bool = True,
                 similarity_function: str = "dot_product"):
        """
        Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
        The checkpoint format matches Hugging Face Transformers' model format.

        **Example:**

                ```python
                |    # remote model from FAIR
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
                |    # or from local path
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="model_directory/question-encoder",
                |                          passage_embedding_model="model_directory/context-encoder")
                ```

        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                      one used by hugging-face transformers' modelhub models
                                      Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
        :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                        one used by hugging-face transformers' modelhub models
                                        Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
        :param max_seq_len_query: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
        :param max_seq_len_passage: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
        :param use_gpu: Whether to use gpu or not
        :param batch_size: Number of questions or passages to encode at once
        :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                            This is the approach used in the original paper and is likely to improve performance if your
                            titles contain meaningful information for retrieval (topic, entities etc.) .
                            The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                            before writing them to the DocumentStore like this:
                            {"text": "my text", "meta": {"name": "my title"}}.
        """

        self.document_store = document_store
        self.batch_size = batch_size
        self.max_seq_len_passage = max_seq_len_passage
        self.max_seq_len_query = max_seq_len_query

        if document_store is None:
            logger.warning(
                "DensePassageRetriever initialized without a document store. "
                "This is fine if you are performing DPR training. "
                "Otherwise, please provide a document store in the constructor."
            )
        elif document_store.similarity != "dot_product":
            logger.warning(
                f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                "We recommend you use dot_product instead. "
                "This can be set when initializing the DocumentStore")

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.embed_title = embed_title

        # Init & Load Encoders
        self.query_tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=query_embedding_model,
            do_lower_case=True,
            use_fast=use_fast_tokenizers,
            tokenizer_class="DPRQuestionEncoderTokenizer")
        self.query_encoder = LanguageModel.load(
            pretrained_model_name_or_path=query_embedding_model,
            language_model_class="DPRQuestionEncoder")

        self.passage_tokenizer = Tokenizer.load(
            pretrained_model_name_or_path=passage_embedding_model,
            do_lower_case=True,
            use_fast=use_fast_tokenizers,
            tokenizer_class="DPRContextEncoderTokenizer")
        self.passage_encoder = LanguageModel.load(
            pretrained_model_name_or_path=passage_embedding_model,
            language_model_class="DPRContextEncoder")

        self.processor = TextSimilarityProcessor(
            tokenizer=self.query_tokenizer,
            passage_tokenizer=self.passage_tokenizer,
            max_seq_len_passage=self.max_seq_len_passage,
            max_seq_len_query=self.max_seq_len_query,
            label_list=["hard_negative", "positive"],
            metric="text_similarity_metric",
            embed_title=self.embed_title,
            num_hard_negatives=0,
            num_positives=1)

        prediction_head = TextSimilarityHead(
            similarity_function=similarity_function)
        self.model = BiAdaptiveModel(
            language_model1=self.query_encoder,
            language_model2=self.passage_encoder,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm1_output_types=["per_sequence"],
            lm2_output_types=["per_sequence"],
            device=self.device,
        )
        self.model.connect_heads_with_processor(self.processor.tasks,
                                                require_labels=False)
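
For context, once constructed the retriever is typically used through the common retriever interface. A short usage sketch, assuming haystack's usual BaseRetriever.retrieve signature and an already populated document store (the query text and top_k are illustrative):

retriever = DensePassageRetriever(
    document_store=document_store,  # an initialized DocumentStore that already holds passages
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)
# Embeds the query with the question encoder and returns the most similar passages
candidate_docs = retriever.retrieve(query="Who wrote the novel Dune?", top_k=5)
for doc in candidate_docs:
    print(doc.meta.get("name"), doc.text[:80])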
Code example #8
File: multitask_learning.py  Project: imdiptanu/FARM


from farm.evaluation.metrics import register_metrics
register_metrics('f1_weighted', custom_f1_score)

metric = 'f1_weighted'
processor.add_task(name="document_level_task", label_list=LABEL_LIST, metric="acc", text_column_name="text", label_column_name="label", task_type="classification")
processor.add_task(name="token_level_task", label_list=TRIGGER_LABELS, metric=metric, text_column_name="text", label_column_name="tokens", task_type="ner")


data_silo = DataSilo(processor=processor,
                    batch_size=BATCH_SIZE
                    )

language_model = LanguageModel.load(LANG_MODEL)

document_level_task_head = TextClassificationHead(num_labels=len(LABEL_LIST), task_name="document_level_task")
token_level_task_head = TokenClassificationHead(num_labels=len(TRIGGER_LABELS), task_name="token_level_task")

model = AdaptiveModel(
  language_model=language_model,
  prediction_heads=[document_level_task_head, token_level_task_head],
  embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
  lm_output_types=["per_sequence", "per_token"],
  device=DEVICE,
  loss_aggregation_fn=my_loss_agg)

model, optimizer, lr_schedule = initialize_optimizer(
  model=model,
  device=DEVICE,
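
register_metrics and loss_aggregation_fn expect two helpers (custom_f1_score, my_loss_agg) that the excerpt does not show. Purely to illustrate what these hooks receive and return, hypothetical implementations could look like this (not the project's originals):

from sklearn.metrics import f1_score

def custom_f1_score(preds, labels):
    # A registered metric gets flat lists of predictions and gold labels
    # and returns a dict of named scores.
    return {"f1_weighted": f1_score(labels, preds, average="weighted")}

def my_loss_agg(losses_per_head, global_step=None, batch=None):
    # The loss aggregation hook gets one per-sample loss tensor per prediction head.
    # This version down-weights the token-level head before summing.
    doc_loss, token_loss = losses_per_head
    return doc_loss + 0.5 * token_loss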
Code example #9
File: lm_from_scratch.py  Project: vyaslkv/FARM
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = BertStyleLMProcessor(data_dir="../data/lm_finetune_nips",
                                 tokenizer=tokenizer,
                                 max_seq_len=max_seq_len,
                                 train_filename="train_small.txt",
                                 dev_filename="train_small.txt",
                                 test_filename=None)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor,
                     batch_size=batch_size,
                     distributed=False)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.from_scratch("bert", vocab_size)

# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead(768, vocab_size)
next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
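
The optimizer call above is cut off in the excerpt. Judging by the older two-value initialize_optimizer API used in code example #10, a complete call plausibly looks like the following; learning rate and warmup proportion are assumed values, and n_epochs comes from the settings block the excerpt omits:

optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,        # assumed value
    warmup_proportion=0.1,     # assumed value
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
)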
Code example #10
File: test_lm_finetuning.py  Project: jucho2725/FARM
def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = BertStyleLMProcessor(data_dir="samples/lm_finetuning",
                                     train_filename="train-sample.txt",
                                     test_filename="test-sample.txt",
                                     dev_filename=None,
                                     tokenizer=tokenizer,
                                     max_seq_len=12,
                                     next_sent_pred=False)
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning_no_nsp"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Farmer's life is great."
        },
        {
            "text": "It's nothing for big city kids though."
        },
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == [
        'Farmer', "'", 's', 'life', 'is', 'great', '.'
    ]
    assert result[0]["vec"].shape == (768, )
    # TODO check why results vary across runs with the same seed
    assert isinstance(result[0]["vec"][0], np.float32)
Code example #11
File: dpr_encoder.py  Project: skiran252/FARM
def dense_passage_retrieval():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="FARM-dense_passage_retrieval",
                              run_name="Run_dpr")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    batch_size = 4
    n_epochs = 3
    distributed = False  # enable for multi GPU training via DDP
    evaluate_every = 1000
    question_lang_model = "facebook/dpr-question_encoder-single-nq-base"
    passage_lang_model = "facebook/dpr-ctx_encoder-single-nq-base"
    do_lower_case = True
    use_fast = True
    embed_title = True
    num_hard_negatives = 1
    similarity_function = "dot_product"
    train_filename = "nq-train.json"
    dev_filename = "nq-dev.json"
    test_filename = "nq-dev.json"
    max_samples = None  # load a smaller dataset (e.g. for debugging)

    # For multi GPU Training via DDP we need to get the local rank
    args = parse_arguments()
    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args.local_rank)

    # 1. Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # data_dir "data/retriever" should contain DPR training and dev files downloaded from https://github.com/facebookresearch/DPR
    # i.e., nq-train.json, nq-dev.json or trivia-train.json, trivia-dev.json
    label_list = ["hard_negative", "positive"]
    metric = "text_similarity_metric"
    processor = TextSimilarityProcessor(tokenizer=query_tokenizer,
                                        passage_tokenizer=passage_tokenizer,
                                        max_seq_len_query=64,
                                        max_seq_len_passage=256,
                                        label_list=label_list,
                                        metric=metric,
                                        data_dir="../data/retriever",
                                        train_filename=train_filename,
                                        dev_filename=dev_filename,
                                        test_filename=test_filename,
                                        embed_title=embed_title,
                                        num_hard_negatives=num_hard_negatives,
                                        max_samples=max_samples)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=distributed)

    # 4. Create a BiAdaptiveModel
    # a) which consists of two pretrained language models as a basis
    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder")
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder")

    # b) and a prediction head on top that is suited for our task => Text Similarity
    prediction_head = TextSimilarityHead(
        similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
                        "eps": 1e-08},
        schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/dpr-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Evaluate
    test_data_loader = data_silo.get_data_loader("test")
    if test_data_loader is not None:
        evaluator_test = Evaluator(data_loader=test_data_loader,
                                   tasks=data_silo.processor.tasks,
                                   device=device)
        model.connect_heads_with_processor(processor.tasks)
        test_result = evaluator_test.eval(model)
Code example #12
def train_from_scratch(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri=args.get(
        "mlflow_tracking_uri", "file:/opt/ml/model/mlflow"))
    ml_logger.init_experiment(experiment_name="train_from_scratch",
                              run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    evaluate_every = int(args["evaluate_every"])

    save_dir = Path("/opt/ml/model")
    data_dir = Path("/opt/ml/input/data/input_channel")

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / args["vocab_file"],
                              do_lower_case=args["do_lower_case"])

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=int(args["max_seq_len"]),
        train_filename=args["train_file"],
        dev_filename=args.get("dev_file", None),
        test_filename=args.get("test_file", None),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    #    calculates a few descriptive statistics of our datasets
    stream_data_silo = StreamingDataSilo(processor=processor,
                                         batch_size=int(args["batch_size"]))

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=float(args["learning_rate"]),
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": float(args["warmup_proportion"])
        },
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=int(args["n_epochs"]),
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    if args.get("checkpoint_every"):
        checkpoint_every = int(args["checkpoint_every"])
        checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    else:
        checkpoint_every = None
        checkpoint_root_dir = None

    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=int(args["n_epochs"]),
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
Code example #13
def outcome_pretraining(task_config,
                        model_name,
                        cache_dir,
                        run_name="0",
                        lr=1e-05,
                        warmup_steps=5000,
                        embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20,
                        grad_acc_steps=1,
                        early_stopping_metric="loss",
                        early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert",
                        tokenizer_class="BertTokenizer",
                        do_lower_case=True,
                        do_train=True,
                        do_eval=True,
                        do_hpo=False,
                        max_seq_len=512,
                        seed=11,
                        eval_every=500,
                        use_amp=False,
                        use_cuda=True,
                        ):
    # Load task config
    task_config = yaml.safe_load(open(task_config))

    data_dir = Path(task_config["data"]["data_dir"])

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name, tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = OutcomePretrainingProcessor(tokenizer=tokenizer,
                                            max_seq_len=max_seq_len,
                                            data_dir=data_dir,
                                            train_filename=task_config["data"]["train_filename"],
                                            dev_filename=task_config["data"]["dev_filename"],
                                            seed=seed,
                                            max_size_admission=50,
                                            max_size_discharge=50,
                                            cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(
        processor=processor,
        caching=True,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_multiprocessing_chunksize=200)

    if do_train:

        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # Use HPO config args if config is passed
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis

        language_model = LanguageModel.load(model_name, language_model_class=model_class)

        # b) and a NextSentenceHead prediction head, or a TextClassificationHead if it is not a BERT model
        if model_class == "Bert":
            next_sentence_head = NextSentenceHead.load(model_name)  # load the pretrained NSP head weights from the base model
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup",
                         "num_warmup_steps": warmup_steps}

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=save_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience
            )

        # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it
        # from time to time

        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval
        )

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device
        )

        # Load trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results, dataset_name="test", steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")
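
outcome_pretraining reads its settings from a YAML task config. Based on the keys accessed above, the loaded config corresponds to a dict of roughly this shape (all values are placeholders):

task_config = {
    "experiment_name": "outcome_pretraining",
    "output_dir": "/path/to/output",
    "log_dir": "/path/to/mlflow_tracking",
    "data": {
        "data_dir": "/path/to/data",
        "train_filename": "train.jsonl",
        "dev_filename": "dev.jsonl",
    },
}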
Code example #14
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=True)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir=Path(DATA_DIR), delimiter=" ", metric="seq_f1", label_list=ner_labels
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 15
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = MODEL_DIR
    model.save(save_dir)
    processor.save(save_dir)
Code example #15
def test_ner_amp(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "1980 kam der Crown von Toyota"
        },
    ]
    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0][0]["probability"],
                      np.float32)
Code example #16
def doc_regression():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_regression")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    We do not have a sample dataset for regression yet; add your own dataset to run this example
    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=128,
                                    data_dir=Path("../data/<YOUR-DATASET>"),
                                    label_column_name="label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text regression
    prediction_head = RegressionHead()

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    model = trainer.train(model)

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-regression-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text": ""
        },
        {
            "text": ""
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
Code example #17
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # custom metric: a function from (preds, labels) to a dict containing the metric values.
    # The function must be registered under a string name, and that name is then passed to the
    # processor as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[
            768,
            len(processor.tasks["text_classification"]["label_list"])
        ],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    model = trainer.train(model)

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
Code example #18
File: test_doc_regression.py  Project: vumichien/FARM
def test_doc_regression(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=8,
                                    data_dir=Path("samples/doc_regr"),
                                    train_filename="train-sample.tsv",
                                    dev_filename="test-sample.tsv",
                                    test_filename=None,
                                    label_column_name="label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device
    )

    trainer.train()

    save_dir = Path("testsave/doc_regr")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."},
        {"text": "it just did not fit right. The top is very thin showing everything."},
    ]

    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
Code example #19
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    #############################################
    # CUSTOM OPTIMIZER & LR SCHEDULE
    #############################################
    # learning rate schedules from transformers
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "Constant"}
    # schedule_opts = {"name": "CosineWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}

    # or from native pytorch (see https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html for all options)
    # schedule_opts = {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    # schedule_opts = {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience":10}

    # optimizers from pytorch (see https://pytorch.org/docs/stable/optim.html for all options)
    optimizer_opts = {"name": "SGD", "momentum": 0.0}

    # or from apex (see https://github.com/NVIDIA/apex/tree/master/apex/optimizers for all options)
    # optimizer_opts = {"name": "FusedLAMB", "bias_correction": True}

    # or from transformers (default in FARM)
    #optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    #############################################


    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
Code example #20
File: test_dpr.py  Project: sherlocked27/FARM
def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1.Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base",
                                     do_lower_case=True, use_fast=True)
    context_tokenizer = Tokenizer.load(pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
                                       do_lower_case=True, use_fast=True)

    processor = TextSimilarityProcessor(
        tokenizer=query_tokenizer,
        passage_tokenizer=context_tokenizer,
        max_seq_len=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1
    )

    question_language_model = LanguageModel.load(pretrained_model_name_or_path="bert-base-uncased",
                                                 language_model_class="DPRQuestionEncoder",
                                                 hidden_dropout_prob=0, attention_probs_dropout_prob=0)
    passage_language_model = LanguageModel.load(pretrained_model_name_or_path="bert-base-uncased",
                                                language_model_class="DPRContextEncoder",
                                                hidden_dropout_prob=0, attention_probs_dropout_prob=0)

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model.connect_heads_with_processor(processor.tasks)

    assert type(model) == BiAdaptiveModel
    assert type(processor) == TextSimilarityProcessor
    assert type(question_language_model) == DPRQuestionEncoder
    assert type(passage_language_model) == DPRContextEncoder

    # check embedding layer weights
    assert abs(list(model.named_parameters())[0][1][0, 0].item() + 0.010200000368058681) < 0.0001

    d = {'query': 'big little lies season 2 how many episodes',
         'passages': [
                         {'title': 'Big Little Lies (TV series)',
                          'text': 'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
                          'label': 'positive',
                          'external_id': '18768923'},
                         {'title': 'Little People, Big World',
                          'text': 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
                          'label': 'hard_negative',
                          'external_id': '7459116'},
                         {'title': 'Cormac McCarthy',
                          'text': 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
                          'label': 'negative',
                          'passage_id': '2145653'}
                     ]
         }

    sample = processor._dict_to_samples(d)
    feats = processor._sample_to_features(sample[0])
    dataset, tensor_names = convert_features_to_dataset(feats)
    features = {key: val.unsqueeze(0).to(device) for key, val in zip(tensor_names, dataset[0])}

    # test features
    assert torch.all(torch.eq(features["query_input_ids"][0][:10].cpu(),
                              torch.tensor([101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102])))
    assert torch.all(torch.eq(features["passage_input_ids"][0][0][:10].cpu(),
                              torch.tensor([101,  2502,  2210,  3658,  1006,  2694,  2186,  1007,   102,  2186])))
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(), torch.tensor(list(range(10)))))
    assert torch.all(torch.eq(features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(), torch.tensor(list(range(127)))))
    assert torch.all(torch.eq(features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(), torch.tensor(list(range(143)))))

    # test model encodings
    query_vector = model.language_model1(**features)[0]
    passage_vector = model.language_model2(**features)[0]
    assert torch.all(torch.le(query_vector[0, :10].cpu() - torch.tensor([-0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638, 0.1405,
                                                                         0.2285, 0.0893]), torch.ones((1, 10))*0.0001))
    assert torch.all(torch.le(passage_vector[0, :10].cpu() - torch.tensor([0.0557, -0.6836, -0.3645, -0.5566,  0.2034, -0.3656,  0.2969, -0.0555,
                                                                          0.3405, -0.8691]), torch.ones((1, 10))*0.0001))
    assert torch.all(torch.le(passage_vector[1, :10].cpu() - torch.tensor([-0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306,  0.1156,
                                                                           0.3350, -0.3412]), torch.ones((1, 10)) * 0.0001))

    # test logits and loss
    logits = model(**features)
    loss = model.logits_to_loss_per_head(logits, **features)
    similarity_scores = logits[0].cpu()
    assert torch.all(torch.le(similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]]), torch.ones((1, 2)) * 0.0001))
    assert abs(loss[0].item() - 0.0018) <= 0.0001
Code example #21
def lm_finetuning():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    set_all_seeds(seed=42)
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(
        experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
    )
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=Path("../data/lm_finetune_nips"), tokenizer=tokenizer, max_seq_len=128, max_docs=20
    )
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-english-lm-tutorial")
    model.save(save_dir)
    processor.save(save_dir)
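
    # 9. (Sketch) Reload the finetuned model for embedding extraction, mirroring the
    #    custom-vocab LM finetuning example in this collection. The sample text is illustrative.
    basic_texts = [{"text": "Farmer's life is great."}]
    inferencer = Inferencer.load(save_dir, task_type="embeddings", num_processes=0)
    result = inferencer.extract_vectors(dicts=basic_texts)
    print(result[0]["vec"].shape)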
Code example #22
def test_lm_finetuning_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])

    processor = BertStyleLMProcessor(
        data_dir=Path("samples/lm_finetuning"),
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=True
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.get_added_vocab()))
    lm_prediction_head = BertLMHead.load(lang_model, n_added_tokens=len(tokenizer.get_added_vocab()))
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    trainer.train()

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight, model.prediction_heads[0].decoder.weight))

    save_dir = Path("testsave/lm_finetuning")
    model.save(save_dir)
    processor.save(save_dir)

    del model
    del processor
    del optimizer
    del data_silo
    del trainer

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, task_type="embeddings", num_processes=0)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
Code example #23
File: adaptive_model.py  Project: gokberkyar/FARM
    def convert_from_transformers(cls,
                                  model_name_or_path,
                                  device,
                                  task_type,
                                  processor=None):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :param processor: populates prediction head with information coming from tasks
        :type processor: Processor
        :return: AdaptiveModel
        """
        lm = LanguageModel.load(model_name_or_path)
        #TODO Infer type of head automatically from config

        if task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_token",
                                 device=device)
        elif task_type == "text_classification":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = TextClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_sequence",
                                 device=device)
        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_token",
                                 device=device)
        elif task_type == "embeddings":
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types=["per_token", "per_sequence"],
                                 device=device)
        else:
            raise NotImplementedError(
                f"Huggingface's transformer models of type {task_type} are not supported yet"
            )

        if processor:
            adaptive_model.connect_heads_with_processor(processor.tasks)

        return adaptive_model
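
A minimal usage sketch for the classmethod above. The checkpoint name is taken from the docstring; the import path is an assumption (the usual FARM layout) and may differ across versions.

from farm.modeling.adaptive_model import AdaptiveModel  # assumed import path

# Convert a transformers QA checkpoint into a FARM AdaptiveModel for further training or inference (sketch)
model = AdaptiveModel.convert_from_transformers(
    model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2",
    device="cpu",
    task_type="question_answering")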
Code example #24
    def convert_from_transformers(model_name_or_path,
                                  device,
                                  task_type=None,
                                  processor=None):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :param processor: populates prediction head with information coming from tasks
        :type processor: Processor
        :return: AdaptiveModel
        """

        lm = LanguageModel.load(model_name_or_path)
        if task_type is None:
            # Infer task type from config
            architecture = lm.model.config.architectures[0]
            if "MaskedLM" in architecture:
                task_type = "lm"
            elif "QuestionAnswering" in architecture:
                task_type = "question_answering"
            elif "SequenceClassification" in architecture:
                if lm.model.config.num_labels == 1:
                    task_type = "regression"
                else:
                    task_type = "text_classification"
            elif "TokenClassification" in architecture:
                task_type = "ner"
            else:
                logger.error(
                    "Could not infer task type from model config. Please provide task type manually. "
                    "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')"
                )

        if task_type == "lm":
            ph = BertLMHead.load(model_name_or_path)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "regression":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Regression with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = RegressionHead.load(model_name_or_path)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "text_classification":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = TextClassificationHead.load(model_name_or_path)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "embeddings":
            adaptive_model = am.AdaptiveModel(
                language_model=lm,
                prediction_heads=[],
                embeds_dropout_prob=0.1,
                lm_output_types=["per_token", "per_sequence"],
                device=device)

        if processor:
            adaptive_model.connect_heads_with_processor(processor.tasks)

        return adaptive_model
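
A small sketch of the task-type inference above, assuming the function is exposed in the same way as the classmethod in the previous example. With task_type=None, the prediction head is chosen from the checkpoint's config.architectures.

from farm.modeling.adaptive_model import AdaptiveModel  # assumed import path

# "distilbert-base-uncased-distilled-squad" reports a *ForQuestionAnswering architecture,
# so the task type is inferred as "question_answering" (sketch).
model = AdaptiveModel.convert_from_transformers(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    device="cpu",
    task_type=None)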
Code example #25
File: adaptive_model.py  Project: sherlocked27/FARM
    def convert_to_onnx(cls,
                        model_name,
                        output_path,
                        task_type,
                        convert_to_float16=False,
                        quantize=False,
                        opset_version=11):
        """
        Convert a PyTorch model from transformers hub to an ONNX Model.

        :param model_name: transformers model name
        :type model_name: str
        :param output_path: output Path to write the converted model to
        :type output_path: Path
        :param task_type: Type of task for the model. Available options: "embeddings", "question_answering",
                          "text_classification", "ner".
        :param convert_to_float16: By default, the model uses float32 precision. With half precision (float16), inference
                                   should be faster on Nvidia GPUs with Tensor Cores, like T4 or V100. On older GPUs, float32
                                   might be more performant.
        :type convert_to_float16: bool
        :param quantize: convert floating point number to integers
        :type quantize: bool
        :param opset_version: ONNX opset version
        :type opset_version: int
        :return:
        """
        language_model_class = LanguageModel.get_language_model_class(
            model_name)
        if language_model_class not in ["Bert", "Roberta", "XLMRoberta"]:
            raise Exception(
                "The current ONNX conversion only support 'BERT', 'RoBERTa', and 'XLMRoberta' models."
            )

        task_type_to_pipeline_map = {
            "question_answering": "question-answering",
            "embeddings": "feature-extraction",
            "ner": "ner"
        }

        convert(pipeline_name=task_type_to_pipeline_map[task_type],
                framework="pt",
                model=model_name,
                output=output_path / "model.onnx",
                opset=opset_version,
                use_external_format=True
                if language_model_class == "XLMRoberta" else False)

        # save processor & model config files that are needed when loading the model with the FARM Inferencer
        processor = Processor.convert_from_transformers(
            tokenizer_name_or_path=model_name,
            task_type=task_type,
            max_seq_len=256,
            doc_stride=128,
            use_fast=True)
        processor.save(output_path)
        model = AdaptiveModel.convert_from_transformers(model_name,
                                                        device="cpu",
                                                        task_type=task_type)
        model.save(output_path)
        os.remove(
            output_path / "language_model.bin"
        )  # remove the actual PyTorch model (only the configs are required)

        onnx_model_config = {
            "task_type": task_type,
            "onnx_opset_version": opset_version,
            "language_model_class": language_model_class,
            "language": model.language_model.language
        }
        with open(output_path / "onnx_model_config.json", "w") as f:
            json.dump(onnx_model_config, f)

        if convert_to_float16:
            from onnxruntime_tools import optimizer
            config = AutoConfig.from_pretrained(model_name)
            optimized_model = optimizer.optimize_model(
                input=str(output_path / "model.onnx"),
                model_type='bert',
                num_heads=config.num_hidden_layers,
                hidden_size=config.hidden_size)
            optimized_model.convert_model_float32_to_float16()
            optimized_model.save_model_to_file("model.onnx")

        if quantize:
            quantize_model(output_path / "model.onnx")
Code example #26
def doc_classification_multilabel():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 32

    evaluate_every = 500
    lang_model = "bert-base-uncased"
    do_lower_case = True

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the Toxic Comments data automatically if it is not available.

    label_list = [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ]
    metric = "acc"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/toxic-comments"),
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
        train_filename="train.tsv",
        dev_filename="val.tsv",
        test_filename=None,
        dev_split=0,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = MultiLabelTextClassificationHead(
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-german-multi-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text": "You f*****g bastards"
        },
        {
            "text": "What a lovely world"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
Code example #27
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH",
                  "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/ner"),
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1}
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Albrecht Lehman ist eine Person"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    #print(result)
    #assert result[0]["predictions"][0]["context"] == "sagte"
    #assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
    result2 = model.inference_from_dicts(dicts=basic_texts, rest_api_schema=True)
    assert result == result2
Code example #28
File: test_s3e_pooling.py  Project: vumichien/FARM
def test_s3e_fit():
    # small test data
    language_model = Path("samples/s3e/tiny_fasttext_model")
    corpus_path = Path("samples/s3e/tiny_corpus.txt")
    save_dir = Path("testsave/fitted_s3e/")
    do_lower_case = False
    batch_size = 2
    use_gpu = False

    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create a InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=[],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=3,
                                                    pca_n_components=30,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings",
                            gpu=use_gpu,
                            batch_size=batch_size,
                            extraction_strategy="s3e",
                            extraction_layer=-1,
                            s3e_stats=s3e_stats,
                            num_processes=0)

    # Input
    basic_texts = [
        {
            "text": "a man is walking on the street."
        },
        {
            "text": "a woman is walking on the street."
        },
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    assert result[0]["context"] == [
        'a', 'man', 'is', 'walking', 'on', 'the', 'street', '.'
    ]
    assert result[0]["vec"][0] - 0.00527727306941057 < 1e-6
    assert result[0]["vec"][-2] + 0.21376857861379997 < 1e-6
Code example #29
    def __init__(self,
                 document_store: BaseDocumentStore,
                 query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base",
                 passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
                 single_model_path: Optional[Union[Path, str]] = None,
                 model_version: Optional[str] = None,
                 max_seq_len_query: int = 64,
                 max_seq_len_passage: int = 256,
                 top_k: int = 10,
                 use_gpu: bool = True,
                 batch_size: int = 16,
                 embed_title: bool = True,
                 use_fast_tokenizers: bool = True,
                 infer_tokenizer_classes: bool = False,
                 similarity_function: str = "dot_product",
                 progress_bar: bool = True
                 ):
        """
        Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
        The checkpoint format matches huggingface transformers' model format.

        **Example:**

                ```python
                |    # remote model from FAIR
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
                |    # or from local path
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="model_directory/question-encoder",
                |                          passage_embedding_model="model_directory/context-encoder")
                ```

        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                      one used by hugging-face transformers' modelhub models
                                      Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
        :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                        one used by hugging-face transformers' modelhub models
                                        Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
        :param single_model_path: Local path or remote name of a query and passage embedder in one single model. Those
                                  models are typically trained within FARM.
                                  Currently available remote names: TODO add FARM DPR model to HF modelhub
        :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :param max_seq_len_query: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
        :param max_seq_len_passage: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
        :param top_k: How many documents to return per query.
        :param use_gpu: Whether to use a GPU or not.
        :param batch_size: Number of questions or passages to encode at once.
        :param embed_title: Whether to concatenate the title and the passage into a text pair that is then used to create the embedding.
                            This is the approach used in the original paper and is likely to improve performance if your
                            titles contain meaningful information for retrieval (topic, entities etc.).
                            The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                            before writing them to the DocumentStore like this:
                            {"text": "my text", "meta": {"name": "my title"}}.
        :param use_fast_tokenizers: Whether to use fast Rust tokenizers
        :param infer_tokenizer_classes: Whether to infer tokenizer class from the model config / name. 
                                        If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. 
        :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training. 
                                    Options: `dot_product` (Default) or `cosine`
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        """

        self.document_store = document_store
        self.batch_size = batch_size
        self.progress_bar = progress_bar
        self.top_k = top_k

        if document_store is None:
           logger.warning("DensePassageRetriever initialized without a document store. "
                          "This is fine if you are performing DPR training. "
                          "Otherwise, please provide a document store in the constructor.")
        elif document_store.similarity != "dot_product":
            logger.warning(f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                           "We recommend you use dot_product instead. "
                           "This can be set when initializing the DocumentStore")

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.infer_tokenizer_classes = infer_tokenizer_classes
        tokenizers_default_classes = {
            "query": "DPRQuestionEncoderTokenizer",
            "passage": "DPRContextEncoderTokenizer"
        }
        if self.infer_tokenizer_classes:
            tokenizers_default_classes["query"] = None   # type: ignore
            tokenizers_default_classes["passage"] = None # type: ignore

        # Init & Load Encoders
        if single_model_path is None:
            self.query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=query_embedding_model,
                                                  revision=model_version,
                                                  do_lower_case=True,
                                                  use_fast=use_fast_tokenizers,
                                                  tokenizer_class=tokenizers_default_classes["query"])
            self.query_encoder = LanguageModel.load(pretrained_model_name_or_path=query_embedding_model,
                                                    revision=model_version,
                                                    language_model_class="DPRQuestionEncoder")
            self.passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_embedding_model,
                                                    revision=model_version,
                                                    do_lower_case=True,
                                                    use_fast=use_fast_tokenizers,
                                                    tokenizer_class=tokenizers_default_classes["passage"])
            self.passage_encoder = LanguageModel.load(pretrained_model_name_or_path=passage_embedding_model,
                                                      revision=model_version,
                                                      language_model_class="DPRContextEncoder")

            self.processor = TextSimilarityProcessor(query_tokenizer=self.query_tokenizer,
                                                     passage_tokenizer=self.passage_tokenizer,
                                                     max_seq_len_passage=max_seq_len_passage,
                                                     max_seq_len_query=max_seq_len_query,
                                                     label_list=["hard_negative", "positive"],
                                                     metric="text_similarity_metric",
                                                     embed_title=embed_title,
                                                     num_hard_negatives=0,
                                                     num_positives=1)
            prediction_head = TextSimilarityHead(similarity_function=similarity_function)
            self.model = BiAdaptiveModel(
                language_model1=self.query_encoder,
                language_model2=self.passage_encoder,
                prediction_heads=[prediction_head],
                embeds_dropout_prob=0.1,
                lm1_output_types=["per_sequence"],
                lm2_output_types=["per_sequence"],
                device=self.device,
            )
        else:
            self.processor = TextSimilarityProcessor.load_from_dir(single_model_path)
            self.processor.max_seq_len_passage = max_seq_len_passage
            self.processor.max_seq_len_query = max_seq_len_query
            self.processor.embed_title = embed_title
            self.processor.num_hard_negatives = 0
            self.processor.num_positives = 1  # during indexing of documents only one embedding is created
            self.model = BiAdaptiveModel.load(single_model_path, device=self.device)

        self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False)
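
For reference, below is a minimal usage sketch of the retriever initialized above. It is not part of the original snippet: the `InMemoryDocumentStore`, the import paths, and the sample documents are assumptions that may differ across Haystack versions.

from haystack.document_store.memory import InMemoryDocumentStore
from haystack.retriever.dense import DensePassageRetriever

# Use dot_product similarity, as recommended by the warning in __init__
document_store = InMemoryDocumentStore(similarity="dot_product")

# embed_title=True expects the title in doc.meta["name"]
document_store.write_documents([
    {"text": "Paris is the capital of France.", "meta": {"name": "Paris"}},
    {"text": "Berlin is the capital of Germany.", "meta": {"name": "Berlin"}},
])

retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  use_gpu=False,
                                  embed_title=True)

# Compute passage embeddings for the stored documents, then retrieve
document_store.update_embeddings(retriever)
docs = retriever.retrieve(query="What is the capital of France?", top_k=5)
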
Code example #30
import pickle

# Imports as used by FARM's S3E pooling example (module paths may differ across FARM versions)
from farm.data_handler.processor import InferenceProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer
from farm.modeling.wordembedding_utils import fit_s3e_on_corpus
from farm.utils import initialize_device_settings, set_all_seeds


def fit(language_model,
        corpus_path,
        save_dir,
        do_lower_case,
        batch_size=4,
        use_gpu=False):
    # Fit S3E pooling on a corpus (save_dir is expected to be a pathlib.Path)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=10,
                                                    pca_n_components=300,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings",
                            gpu=use_gpu,
                            batch_size=batch_size,
                            extraction_strategy="s3e",
                            extraction_layer=-1,
                            s3e_stats=s3e_stats)

    # Input
    basic_texts = [
        {
            "text": "a man is walking on the street."
        },
        {
            "text": "a woman is walking on the street."
        },
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
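
A hedged example of calling fit(): the model name is a public Hugging Face checkpoint, while corpus_path and save_dir are placeholder paths for illustration. Both are passed as pathlib.Path objects, since the function joins save_dir with "/" above.

from pathlib import Path

# Placeholder paths: replace with a real plain-text corpus and output directory
fit(language_model="bert-base-uncased",
    corpus_path=Path("data/my_corpus.txt"),
    save_dir=Path("saved_models/s3e_fitted"),
    do_lower_case=True,
    batch_size=4,
    use_gpu=False)
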