Example #1
def train_on_split(silo_to_use, n_fold, save_dir):
    logger.info(f"############ Crossvalidation: Fold {n_fold} ############")
    # Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[
            768,
            len(processor.tasks["text_classification"]["label_list"])
        ],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(silo_to_use.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    # NOTE: Using a different save directory for each fold allows us to use the
    # n_folds best models in an ensemble afterwards!
    save_dir += f"-{n_fold}"
    earlystopping = EarlyStopping(
        metric="f1_offense",
        mode="max",  # use the metric from our own metrics function instead of loss
        save_dir=save_dir,  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(optimizer=optimizer,
                      data_silo=silo_to_use,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping,
                      evaluator_test=False)

    # train it
    model = trainer.train(model)

    return model
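
# A minimal driver sketch (not part of the snippet above), assuming a `data_silo` built on the
# full training data and variables `xval_folds` and `save_dir` from the surrounding script.
# DataSiloForCrossVal is FARM's helper for producing per-fold train/dev silos.
from farm.data_handler.data_silo import DataSiloForCrossVal

silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)
for num_fold, silo in enumerate(silos):
    trained_model = train_on_split(silo, num_fold, save_dir)
    del trained_model  # free memory between folds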
Example #2
def distilbert_nq(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)


    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=True
    )
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train_sample.jsonl",
        dev_filename="dev_sample.jsonl",
        data_dir=Path("samples/nq")
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    language_model = LanguageModel.load(base_LM_model)
    qa_head = QuestionAnsweringHead()
    classification_head = TextClassificationHead(num_labels=len(processor.answer_type_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device
    )
    trainer.train()
    return model, processor
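
# Usage sketch (an assumption, not part of the original snippet): the fixture-style function above
# returns the trained model together with its processor, which can be persisted with FARM's
# standard save() calls. The target directory is illustrative; Path is assumed to be imported.
model, processor = distilbert_nq()
save_dir = Path("testsave/distilbert_nq")
model.save(save_dir)
processor.save(save_dir)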
Example #3
    def convert_from_transformers(cls, model_name_or_path, device, task_type):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :return: AdaptiveModel
        """
        lm = LanguageModel.load(model_name_or_path)
        #TODO Infer type of head automatically from config

        if task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_token",
                                 device=device)
        elif task_type == "text_classification":
            ph = TextClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_sequence",
                                 device=device)
        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_token",
                                 device=device)
        elif task_type == "embeddings":
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types=["per_token", "per_sequence"],
                                 device=device)
        else:
            raise NotImplementedError(
                f"Huggingface's transformer models of type {task_type} are not supported yet"
            )

        return adaptive_model
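
# Usage sketch, assuming this classmethod is exposed on AdaptiveModel as in FARM:
# convert a public SQuAD model from the Hugging Face hub for question answering inference.
# The model name is taken from the docstring above.
squad_model = AdaptiveModel.convert_from_transformers(
    "deepset/bert-large-uncased-whole-word-masking-squad2",
    device="cpu",
    task_type="question_answering")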
Example #4
    def convert_from_transformers(cls, model_name_or_path, device, task_type, processor=None):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :param processor: populates prediction head with information coming from tasks
        :type processor: Processor
        :return: AdaptiveModel
        """
        lm = LanguageModel.load(model_name_or_path)
        # TODO Infer type of head automatically from config

        if task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                 lm_output_types="per_token", device=device)
        elif task_type == "text_classification":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment.")
                raise NotImplementedError
            ph = TextClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                 lm_output_types="per_sequence", device=device)
        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                 lm_output_types="per_token", device=device)
        elif task_type == "embeddings":
            adaptive_model = cls(language_model=lm, prediction_heads=[], embeds_dropout_prob=0.1,
                                 lm_output_types=["per_token", "per_sequence"], device=device)
        else:
            raise NotImplementedError(f"Huggingface's transformer models of type {task_type} are not supported yet")

        if processor:
            adaptive_model.connect_heads_with_processor(processor.tasks)

        return adaptive_model
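
# Usage sketch (names are assumptions): passing a Processor wires the loaded head to its task,
# so label names and task settings from the processor are used at inference time.
# `squad_processor` stands for a SquadProcessor built beforehand.
qa_model = AdaptiveModel.convert_from_transformers(
    "distilbert-base-uncased-distilled-squad",
    device="cpu",
    task_type="question_answering",
    processor=squad_processor)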
Example #5
def test_multiple_prediction_heads():
    model = "bert-base-german-cased"
    lm = LanguageModel.load(model)
    ph1 = TextClassificationHead(num_labels=3, label_list=["negative", "neutral", "positive"])
    ph2 = TokenClassificationHead(num_labels=3, label_list=["PER", "LOC", "ORG"])
    adaptive_model = AdaptiveModel(language_model=lm, prediction_heads=[ph1, ph2], embeds_dropout_prob=0.1,
                                   lm_output_types="per_token", device="cpu")
    transformer_models = Converter.convert_to_transformers(adaptive_model)
    assert isinstance(transformer_models[0], BertForSequenceClassification)
    assert isinstance(transformer_models[1], BertForTokenClassification)
    del lm
    del transformer_models
    del adaptive_model
Example #6
def test_prediction_head_load_save_class_weights(tmp_path, caplog=None):
    """This is a regression test for #428 and #422."""
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 1
    lang_model = "bert-base-german-cased"
    data_dir_path = "samples/doc_class"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    tcp_params = dict(tokenizer=tokenizer,
                      max_seq_len=8,
                      data_dir=Path(data_dir_path),
                      train_filename="train-sample.tsv",
                      label_list=["OTHER", "OFFENSE"],
                      metric="f1_macro",
                      dev_filename="test-sample.tsv",
                      test_filename=None,
                      dev_split=0.0,
                      label_column_name="coarse_label")

    processor = TextClassificationProcessor(**tcp_params)

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        num_labels=2,
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model.save(tmp_path)
    model_loaded = AdaptiveModel.load(tmp_path, device='cpu')
    assert model_loaded is not None
Example #7
def load_prediction_heads(args, silo):
    if args.recycle_heads:
        args.logger.info("Recycling heads of the loaded model")
        # Model name should be a directory in this case
        _, ph_configs = CustomAdaptiveModel._get_prediction_head_files(
            args.model_name)
        prediction_heads = [
            PredictionHead.load(config_file) for config_file in ph_configs
        ]
        # Ensure that the label_columns order matches the (ascending) order of the prediction heads,
        # otherwise heads will be misaligned with tasks.
        for idx in range(len(prediction_heads)):
            args.logger.info(
                f"Renaming head task {prediction_heads[idx].task_name} to {args.label_columns[idx]}"
            )
            prediction_heads[idx].task_name = args.label_columns[idx]
        out_types = [head.ph_output_type for head in prediction_heads]
    elif args.train_mode == "classification":
        prediction_heads = [
            TextClassificationHead(
                layer_dims=[
                    args.heads_dim,
                    len(get_labels(args.data_dir, task))
                ],
                task_name=task,
            ) for task in args.label_columns
        ]
        out_types = ["per_sequence" for _ in args.label_columns]
    else:  # Regression from raw heads
        if args.do_feat_embeds:
            args.logger.info(f"feat_size: {args.feat_size}")
            prediction_heads = [
                FeaturesRegressionHead(
                    layer_dims=[args.heads_dim + args.feat_size, 1],
                    task_name=task) for task in args.label_columns
            ]
        else:
            prediction_heads = [
                RegressionHead(layer_dims=[args.heads_dim, 1], task_name=task)
                for task in args.label_columns
            ]
        out_types = ["per_sequence_continuous" for _ in args.label_columns]
    return prediction_heads, out_types
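
# Consumption sketch: the returned heads and output types plug into the project-specific
# CustomAdaptiveModel referenced above; `args`, `silo`, `language_model` and `device` are
# assumed to exist in the calling script, and the constructor arguments mirror AdaptiveModel.
prediction_heads, out_types = load_prediction_heads(args, silo)
model = CustomAdaptiveModel(
    language_model=language_model,
    prediction_heads=prediction_heads,
    embeds_dropout_prob=0.1,
    lm_output_types=out_types,
    device=device)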
Example #8
processor = BertStyleLMProcessor(data_dir="../data/finetune_sample",
                                 tokenizer=tokenizer,
                                 max_seq_len=128)
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=32)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load("bert-base-german-cased")
# b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
lm_prediction_head = BertLMHead(
    embeddings=language_model.model.embeddings,
    hidden_size=language_model.model.config.hidden_size,
)
next_sentence_head = TextClassificationHead(
    layer_dims=[language_model.model.config.hidden_size, 2],
    loss_ignore_index=-1)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)
Example #9
def test_text_pair_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "microsoft/MiniLM-L12-H384-uncased"
    label_list = ["0", "1", "2"]

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        train_filename="sample.tsv",
        dev_filename="sample.tsv",
        test_filename=None,
        data_dir=Path("samples/text_pair"),
        delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # For correct Text Pair Classification on raw dictionaries, we need to put both texts (text, text_b) into a tuple
    # See corresponding operation in the file_to_dicts method of TextPairClassificationProcessor here: https://github.com/deepset-ai/FARM/blob/5ab5b1620cb51ceb874d4b30c887e377ad1a6e9a/farm/data_handler/processor.py#L744
    basic_texts = [
        {
            "text":
            ("how many times have real madrid won the champions league in a row",
             "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
             )
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["label"] == "1"
    assert np.isclose(result[0]["predictions"][0]["probability"],
                      0.3781,
                      rtol=0.05)
    model.close_multiprocessing_pool()
Example #10
    def convert_from_transformers(model_name_or_path,
                                  device,
                                  revision=None,
                                  task_type=None,
                                  processor=None,
                                  **kwargs):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :param processor: populates prediction head with information coming from tasks
        :type processor: Processor
        :return: AdaptiveModel
        """

        lm = LanguageModel.load(model_name_or_path,
                                revision=revision,
                                **kwargs)
        if task_type is None:
            # Infer task type from config
            architecture = lm.model.config.architectures[0]
            if "MaskedLM" in architecture:
                task_type = "lm"
            elif "QuestionAnswering" in architecture:
                task_type = "question_answering"
            elif "SequenceClassification" in architecture:
                if lm.model.config.num_labels == 1:
                    task_type = "regression"
                else:
                    task_type = "text_classification"
            elif "TokenClassification" in architecture:
                task_type = "ner"
            else:
                logger.error(
                    "Could not infer task type from model config. Please provide task type manually. "
                    "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')"
                )

        if task_type == "lm":
            ph = BertLMHead.load(model_name_or_path,
                                 revision=revision,
                                 **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path,
                                            revision=revision,
                                            **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "regression":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Regression with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = RegressionHead.load(model_name_or_path, **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "text_classification":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = TextClassificationHead.load(model_name_or_path,
                                             revision=revision,
                                             **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path,
                                              revision=revision,
                                              **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "embeddings":
            adaptive_model = am.AdaptiveModel(
                language_model=lm,
                prediction_heads=[],
                embeds_dropout_prob=0.1,
                lm_output_types=["per_token", "per_sequence"],
                device=device)

        else:
            raise NotImplementedError(
                f"Huggingface's transformer models of type {task_type} are not supported yet")

        if processor:
            adaptive_model.connect_heads_with_processor(processor.tasks)

        return adaptive_model
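
# Usage sketch, assuming this function is exposed as Converter.convert_from_transformers
# (as in recent FARM versions): with task_type=None the head is inferred from the
# transformers config, e.g. a hub NER model ends up with a TokenClassificationHead.
# The model name below is illustrative.
ner_model = Converter.convert_from_transformers(
    "dslim/bert-base-NER",
    device="cpu",
    task_type=None)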
Example #11
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_text_pair_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # 0. Preprocess and save MSMarco data in a format that can be ingested by FARM models. Only needs to be done once!
    # The final format is a tsv file with 3 columns (text, text_b and label)
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             data_dir / qrels_filename,
                             data_dir / top1000_filename,
                             data_dir / dev_filename)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Evaluation during training will be performed on a slice of the train set
    #    We will be using the msmarco dev set as our final evaluation set
    processor = TextPairClassificationProcessor(tokenizer=tokenizer,
                                                label_list=label_list,
                                                metric="f1_macro",
                                                train_filename=train_filename,
                                                test_filename=None,
                                                dev_split=0.001,
                                                max_seq_len=128,
                                                data_dir=data_dir,
                                                delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(num_labels=len(label_list),
                                             class_weights=data_silo.calculate_class_weights(
                                                 task_name="text_classification"),
                                             )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    result = model.inference_from_file(data_dir / dev_filename)

    write_msmarco_results(result, save_dir / predictions_raw_filename)

    msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                       dev_file=data_dir / dev_filename,
                       qrels_file=data_dir / qrels_filename,
                       output_file=save_dir / predictions_filename)

    model.close_multiprocessing_pool()
Example #12
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification_fasttext")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load fasttext from a local path:
    #fasttext_model = "../saved_models/fasttext-german-uncased"
    # or through s3
    fasttext_model = "fasttext-german-uncased"
    do_lower_case = True
    max_features = 10_000  # maximum number of unique words we will transform
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. To make Fasttext work within FARM and with advanced aggregation strategies, we need a fixed vocabulary and associated word embeddings
    ft_converter = Fasttext_converter(
        pretrained_model_name_or_path=fasttext_model,
        do_lower_case=do_lower_case,
        data_path=Path("../data/germeval18"),
        train_filename="train.tsv",
        output_path=Path("../saved_models/fasttext-german-uncased-converted"),
        language="German",
        max_features=max_features)
    # We convert the data to have fixed size vocab and embeddings
    vocab_counts = ft_converter.convert_on_data()

    # 2. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=ft_converter.output_path,
        do_lower_case=do_lower_case)

    # 3. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=ft_converter.data_path,
        label_list=label_list,
        train_filename=ft_converter.train_filename,
        dev_split=0,
        test_filename="test.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 4. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor, batch_size=batch_size, max_processes=1
    )  # multiprocessing with WordembeddingTokenizer is not optimal - so disable it

    # 5. Create an AdaptiveModel
    # a) which consists of the newly created embedding model as a basis.
    language_model = LanguageModel.load(ft_converter.output_path)
    # b) and a prediction head on top that is suited for our task => Text classification
    # Since we do not have a powerful Transformer-based language model, we need a slightly deeper NN
    # for doing the classification
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 6. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-3,
        device=device,
        n_batches=len(data_silo.get_data_loader("train")),  # with a streaming DataSilo use get_data_loader("train") instead of loaders["train"]
        n_epochs=n_epochs)

    # 7. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 8. Let it grow
    trainer.train()
Example #13
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metric values. The function must be registered under a string name, and that name is then
    # passed as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense",
        mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
Example #14
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/germeval18",
                                        labels=label_list,
                                        metric=metric,
                                        source_field="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(layer_dims=[
    768, len(processor.tasks["text_classification"]["label_list"])
])

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence"],
                      device=device)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)
Example #15
def doc_classification():
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    tokenizer = AutoTokenizer.from_pretrained(lang_model, strip_accents=False)
    #tokenizer = Tokenizer.load(
    #    pretrained_model_name_or_path=lang_model,
    #    do_lower_case=do_lower_case)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("./data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            dev_filename="test.tsv",  # we want to evaluate against test
                                            label_column_name="coarse_label",
                                            )

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    earlystopping = EarlyStopping(
        metric=metric, mode="max",
        #save_dir=Path("./saved_models"),
        patience=3
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        early_stopping=earlystopping,
        device=device)

    trainer.train()

    return earlystopping.best_so_far
Example #16
def doc_classification_cola():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 100
    evaluate_every = 20
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the CoLA 2018 data.

    label_list = ["0", "1"]
    metric = "mcc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("../data/cola"),
                                            dev_filename=Path("dev.tsv"),
                                            dev_split=None,
                                            test_filename=None,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    # language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text": "The box contained the ball from the tree."
        },
        {
            "text": "I'll fix you a drink."
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
Example #17
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification_glove")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load from a local path:
    lang_model = Path("../saved_models/glove-german-uncased")
    # or through s3
    #lang_model = "glove-german-uncased"
    do_lower_case = True

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        dev_split=0,
        test_filename="test.tsv",
        train_filename="train.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)

    # 4. Create an AdaptiveModel
    # a) which consists of an embedding model as a basis.
    # A word embedding model only converts words it has seen during training into embedding vectors.
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()
Example #18
def doc_classification(task,
                       model_type,
                       n_epochs,
                       batch_size,
                       embeds_dropout,
                       evaluate_every,
                       use_cuda,
                       max_seq_len,
                       learning_rate,
                       do_lower_case,
                       register_model,
                       save_model=True,
                       early_stopping=False):

    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir',
                              header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except:
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metric values. The function must be registered under a string name, and that name is then
    # passed as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    ## Prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        learning_rate=learning_rate,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro",
            mode="max",  # use f1_macro from the dev evaluator of the trainer
            # metric="loss", mode="min",   # use loss from the dev evaluator of the trainer
            save_dir=save_dir,  # where to save the best model
            patience=2  # number of evaluations to wait for improvement before terminating the training
        )
    else:
        earlystopping = None

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

        if register_model:
            dt_task.upload('model_dir', destination='model')
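The training function above saves the model and processor but does not show how to query them afterwards. A minimal inference sketch, assuming the same FARM Inferencer API that the later examples in this collection use (Inferencer.load / inference_from_dicts); the save directory and input text are placeholders:

# Inference sketch (assumption: FARM's Inferencer API as used in the other examples in this collection)
from farm.infer import Inferencer

save_dir = "path/to/saved_model"  # placeholder: the directory the function above saved to
basic_texts = [
    {"text": "Replace this with a document from your own task."},  # hypothetical input
]
inferencer = Inferencer.load(save_dir, batch_size=2)
result = inferencer.inference_from_dicts(dicts=basic_texts)
print(result)
inferencer.close_multiprocessing_pool()  # free worker processes, as done in Example #25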
Example #19
0
def main(args):
    print(f"[INFO] PyTorch Version: {torch.__version__}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("[INFO] Devices available: {}".format(device))
    checkpoint_path = Path(args.ckpt_path) / args.run_name
    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(experiment_name=args.experiment_name,
                              run_name=args.run_name)
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False)
    # Processor
    if args.task_name == "text_classification":
        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=None,
            test_filename=args.test_filename,
            header=0,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            label_column_name=args.label_column_name,
            text_column_name=args.text_column_name)
    elif args.task_name == "question_answering":
        processor = SquadProcessor(tokenizer=tokenizer,
                                   train_filename=args.train_filename,
                                   dev_filename=args.test_filename,
                                   test_filename=args.test_filename,
                                   max_seq_len=args.max_seq_len,
                                   data_dir=args.data_dir,
                                   label_list=args.label_list,
                                   metric=args.metric,
                                   max_query_length=64,
                                   doc_stride=128,
                                   max_answers=1)
    else:
        raise ValueError("task name error")
    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(processor=processor,
                         batch_size=args.batch_size,
                         eval_batch_size=args.eval_batch_size,
                         caching=True,
                         cache_path=checkpoint_path)
    # LanguageModel: Build pretrained language model
    language_model = LanguageModel.load(args.pretrained_model_name_or_path,
                                        language="korean")

    # PredictionHead: Build predictor layer
    if args.task_name == "text_classification":
        # If you do classification on imbalanced classes, consider using class weights.
        # They change the loss function to down-weight frequent classes.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(
                task_name=args.task_name))
    elif args.task_name == "question_answering":
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=args.task_name,
        )
    else:
        raise ValueError("task name error")

    # AdaptiveModel: Combine all
    if args.task_name == "text_classification":
        lm_output_types = ["per_sequence"]
    elif args.task_name == "question_answering":
        lm_output_types = ["per_token"]
    else:
        raise ValueError("task name error")

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=args.embeds_dropout_prob,
                          lm_output_types=lm_output_types,
                          device=device)

    # Initialize Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=device,
        learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=args.n_epochs)
    # EarlyStopping
    earlymetric = "f1" if args.task_name == "question_answering" else "acc"
    mode = "max" if args.task_name in [
        "text_classification", "question_answering"
    ] else "min"
    earlystop = EarlyStopping(save_dir=checkpoint_path,
                              metric=earlymetric,
                              mode=mode,
                              patience=5)

    # Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_schedule=lr_schedule,
        data_silo=data_silo,
        early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs,
        n_gpu=args.n_gpu,
        device=device,
    )
    # now train!
    model = trainer.train()
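main(args) above reads everything from a flat args namespace. A hypothetical argparse wiring that could supply those values is sketched below; the flag names mirror the attributes accessed in main, while every default is illustrative only:

# Hypothetical CLI wiring for main(args); defaults are illustrative assumptions, not values from the original script.
import argparse

def build_parser():
    parser = argparse.ArgumentParser(description="Train a FARM model for text classification or question answering.")
    parser.add_argument("--task_name", choices=["text_classification", "question_answering"], default="text_classification")
    parser.add_argument("--pretrained_model_name_or_path", default="bert-base-multilingual-cased")
    parser.add_argument("--data_dir", default="data")
    parser.add_argument("--train_filename", default="train.tsv")
    parser.add_argument("--test_filename", default="test.tsv")
    parser.add_argument("--label_list", nargs="+", default=["0", "1"])
    parser.add_argument("--label_column_name", default="label")
    parser.add_argument("--text_column_name", default="text")
    parser.add_argument("--metric", default="acc")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=32)
    parser.add_argument("--embeds_dropout_prob", type=float, default=0.1)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    parser.add_argument("--n_epochs", type=int, default=1)
    parser.add_argument("--n_gpu", type=int, default=1)
    parser.add_argument("--evaluate_every", type=int, default=100)
    parser.add_argument("--checkpoint_every", type=int, default=500)
    parser.add_argument("--checkpoints_to_keep", type=int, default=3)
    parser.add_argument("--ckpt_path", default="checkpoints")
    parser.add_argument("--run_name", default="run_1")
    parser.add_argument("--experiment_name", default="farm_experiment")
    parser.add_argument("--tracking_uri", default="http://localhost:5000")
    return parser

if __name__ == "__main__":
    main(build_parser().parse_args())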
Example #20
0
def test_doc_classification():
    #caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = Roberta.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class_roberta"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."}
    ]


    inf = Inferencer.load(save_dir,batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"],np.float32)
Example #21
0
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
Example #22
0
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    #############################################
    # CUSTOM OPTIMIZER & LR SCHEDULE
    #############################################
    # learning rate schedules from transformers
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "Constant"}
    # schedule_opts = {"name": "CosineWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}

    # or from native pytorch (see https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html for all options)
    # schedule_opts = {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    # schedule_opts = {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience":10}

    # optimizers from pytorch (see https://pytorch.org/docs/stable/optim.html for all options)
    optimizer_opts = {"name": "SGD", "momentum": 0.0}

    # or from apex (see https://github.com/NVIDIA/apex/tree/master/apex/optimizers for all options)
    # optimizer_opts = {"name": "FusedLAMB", "bias_correction": True}

    # or from transformers (default in FARM)
    #optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    #############################################

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
Example #23
0
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 30
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = GermEval18CoarseProcessor(tokenizer=tokenizer,
                                          max_seq_len=64,
                                          data_dir="samples/doc_class",
                                          train_filename="train-sample.tsv",
                                          test_filename=None)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.label_list)])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_examples=data_silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] -
               0.5358161) <= 0.0001
Example #24
0
def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/doc_class"),
        train_filename=Path("train-sample.tsv"),
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = DistilBert.load(lang_model)
    prediction_head = TextClassificationHead()
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [{
        "text": "Malte liebt Berlin."
    }, {
        "text":
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."
    }]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Example #25
0
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2" # start with a model that can already extract answers
    do_lower_case = False # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15 # downsample negative examples after data conversion
    downsample_context_size = 300 # reduce length of wikipedia articles to relevant part around the answer

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
                "<Th>","</Th>",
                "<Td>","</Td>",
                "<Tr>","</Tr>",
                "<Li>","</Li>",
                "<P>" ,"</P>",
                "<Ul>","</Ul>",
                "<H1>","</H1>",
                "<H2>","</H2>",
                "<H3>","</H3>",
                "<H4>","</H4>",
                "<H5>", "</H5>",
                "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)
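    # Note (added for clarity): add_tokens only enlarges the tokenizer vocabulary. The language model's
    # embedding matrix is resized to match in step 4 below via LanguageModel.load(..., n_added_tokens=len(html_tags)).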

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_no_answer=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model,n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(num_labels=len(processor.answer_type_list)) # answer_type_list = ["is_impossible", "span", "yes", "no"]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources we trained and uploaded a model on s3
    fetch_archive_from_http("https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip", output_dir="../saved_models/farm")
    QA_input = [
        {
            "qas": ["Did GameTrailers rated Twilight Princess as one of the best games ever created?"],
            "context":  "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }
    ]

    model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq", batch_size=batch_size, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input, return_json=False) # result is a list of QAPred objects

    print(f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
          f"\nAnswer from model: {result[0].prediction[0].answer}")
    model.close_multiprocessing_pool()
Example #26
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # The TextPairClassificationProcessor expects a csv/tsv with columns called "text", "text_b" and "label"
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        dev_filename="dev.tsv",
        test_filename=None,
        data_dir=Path("../data/asnq_binary"),
        delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text":
            "how many times have real madrid won the champions league in a row",
            "text_b":
            "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
        },
        {
            "text": "how many seasons of the blacklist are there on netflix",
            "text_b": "Retrieved March 27 , 2018 ."
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
Example #27
0
                                        max_seq_len=128,
                                        data_dir="../data/germeval18",
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(
    layer_dims=[
        768, len(processor.tasks["text_classification"]["label_list"])
    ],
    class_weights=data_silo.calculate_class_weights(
        task_name="text_classification"))

model = AdaptiveModel(language_model=language_model,
                      prediction_heads=[prediction_head],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence"],
                      device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
Example #28
0
def test_nq(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model,
                               do_lower_case=True)
    processor = NaturalQuestionsProcessor(tokenizer=tokenizer,
                                          max_seq_len=20,
                                          doc_stride=10,
                                          max_query_length=6,
                                          train_filename="train_sample.jsonl",
                                          dev_filename="dev_sample.jsonl",
                                          data_dir=Path("samples/nq"))

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(base_LM_model)
    qa_head = QuestionAnsweringHead()
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()
    save_dir = Path("testsave/nq")
    model.save(save_dir)
    processor.save(save_dir)

    inferencer = Inferencer.load(save_dir,
                                 batch_size=2,
                                 gpu=False,
                                 num_processes=0)

    qa_format_1 = [{
        "questions": ["Who counted the game among the best ever made?"],
        "text":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]
    qa_format_2 = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
    }]

    result1 = inferencer.inference_from_dicts(dicts=qa_format_1)
    result2 = inferencer.inference_from_dicts(dicts=qa_format_2)
    assert result1 == result2
Example #29
0
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 5
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[
        768, len(processor.tasks["text_classification"]["label_list"])
    ])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [{
        "text": "Martin Müller spielt Handball in Berlin."
    }, {
        "text":
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."
    }, {
        "text":
        "Franzosen verteidigen 2:1-Führung – Kritische Stimmen zu Schwedens Superstar"
    }, {
        "text": "Neues Video von Designern macht im Netz die Runde"
    }, {
        "text":
        "23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden"
    }, {
        "text":
        "Aufständische verwendeten Chemikalie bei Gefechten im August."
    }, {
        "text":
        "Bewährungs- und Geldstrafe für 26-Jährigen wegen ausländerfeindlicher Äußerung"
    }, {
        "text":
        "ÖFB-Teamspieler nur sechs Minuten nach seinem Tor beim 1:1 gegen Sunderland verletzt ausgewechselt"
    }, {
        "text":
        "Ein 31-jähriger Polizist soll einer 42-Jährigen den Knöchel gebrochen haben"
    }, {
        "text":
        "18 Menschen verschleppt. Kabul – Nach einem Hubschrauber-Absturz im Norden Afghanistans haben Sicherheitskräfte am Mittwoch versucht"
    }]
    #TODO enable loading here again after we have finished migration towards "processor.tasks"
    #inf = Inferencer.load(save_dir)
    inf = Inferencer(model=model, processor=processor)
    result = inf.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] - 0.7) <= 0.1

    loaded_processor = TextClassificationProcessor.load_from_dir(save_dir)
    inf2 = Inferencer(model=model, processor=loaded_processor)
    result_2 = inf2.run_inference(dicts=basic_texts)
    pprint(list(zip(result, result_2)))
    for r1, r2 in list(zip(result, result_2)):
        assert r1 == r2


# if(__name__=="__main__"):
#     test_doc_classification()
Example #30
0
def doc_classification_cola():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 8
    evaluate_every = 450
    lang_model = "/bert-base-chinese" #BERT中文模型的路径
    #模型下载地址https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load a Chinese message classification dataset (the "cola" naming is left over from the template this example was adapted from).

    label_list =["城乡建设","卫生计生","商贸旅游","劳动和社会保障","教育文体","交通运输","环境保护"]
    metric = "acc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=507,
                                            data_dir=Path("/BERT留言分类数据集"), #存放文本分类数据的文件夹路径,数据格式:第一列按字符分隔的text,第二列label,之间用制表符分隔。第一行需要有"text"与"label"
                                            dev_filename=None, #Path("dev.tsv"),
                                            dev_split=0.1,
                                            test_filename="/BERT留言分类数据集/test.tsv",
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    # language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("/BERT文本分类输出的模型")
    model.save(save_dir)
    processor.save(save_dir)
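    # 9. Load it & run inference (a sketch added for symmetry with the earlier tutorials; it assumes
    #    the same Inferencer API used above, and the input text is only a placeholder for a real message)
    basic_texts = [
        {"text": "placeholder: replace with a message to classify"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)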