コード例 #1
0
ファイル: test_lm_finetuning.py プロジェクト: tholor/FARM
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=64,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)
コード例 #2
0
ファイル: lm_finetuning.py プロジェクト: svmihar/FARM
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = BertStyleLMProcessor(data_dir="../data/lm_finetune_nips",
                                 tokenizer=tokenizer,
                                 max_seq_len=128,
                                 max_docs=30)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor,
                     batch_size=batch_size,
                     max_multiprocessing_chunksize=20)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    device=device,
コード例 #3
0
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
        never_split_chars=["-", "_"])

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Farmer's life is great."
        },
        {
            "text": "It's nothing for big city kids though."
        },
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == [
        'Farmer', "'", 's', 'life', 'is', 'great', '.'
    ]
    assert result[0]["vec"].shape == (768, )
    # TODO check why results vary accross runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
コード例 #4
0
ファイル: test_lm_finetuning.py プロジェクト: svmihar/FARM
def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=False
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight, model.prediction_heads[0].decoder.weight))

    save_dir = "testsave/lm_finetuning_no_nsp"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary accross runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
コード例 #5
0
ファイル: transformers.py プロジェクト: skirdey/FARM
    def convert_from_transformers(model_name_or_path,
                                  device,
                                  revision=None,
                                  task_type=None,
                                  processor=None,
                                  **kwargs):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :param processor: populates prediction head with information coming from tasks
        :type processor: Processor
        :return: AdaptiveModel
        """

        lm = LanguageModel.load(model_name_or_path,
                                revision=revision,
                                **kwargs)
        if task_type is None:
            # Infer task type from config
            architecture = lm.model.config.architectures[0]
            if "MaskedLM" in architecture:
                task_type = "lm"
            elif "QuestionAnswering" in architecture:
                task_type = "question_answering"
            elif "SequenceClassification" in architecture:
                if lm.model.config.num_labels == 1:
                    task_type = "regression"
                else:
                    task_type = "text_classification"
            elif "TokenClassification" in architecture:
                task_type = "ner"
            else:
                logger.error(
                    "Could not infer task type from model config. Please provide task type manually. "
                    "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')"
                )

        if task_type == "lm":
            ph = BertLMHead.load(model_name_or_path,
                                 revision=revision,
                                 **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path,
                                            revision=revision,
                                            **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "regression":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Regression with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = RegressionHead.load(model_name_or_path, **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "text_classification":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = TextClassificationHead.load(model_name_or_path,
                                             revision=revision,
                                             **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path,
                                              revision=revision,
                                              **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "embeddings":
            adaptive_model = am.AdaptiveModel(
                language_model=lm,
                prediction_heads=[],
                embeds_dropout_prob=0.1,
                lm_output_types=["per_token", "per_sequence"],
                device=device)

        if processor:
            adaptive_model.connect_heads_with_processor(processor.tasks)

        return adaptive_model
コード例 #6
0
def lm_finetuning():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    set_all_seeds(seed=42)
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_minimal_example_lm")
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(data_dir=Path("../data/lm_finetune_nips"),
                                     tokenizer=tokenizer,
                                     max_seq_len=128,
                                     max_docs=20)
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_multiprocessing_chunksize=20)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-english-lm-tutorial")
    model.save(save_dir)
    processor.save(save_dir)