Example #1
def test_text_classifier_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=TransformerDocumentEmbeddings(
            "distilbert-base-uncased"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=False,
    )

    trainer = ModelTrainer(model, corpus)
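    # fine_tune uses transformer-style training defaults in current Flair
    # releases (AdamW optimizer, linear LR schedule with warmup), so a few
    # epochs at a small learning rate suffice here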
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-5,
        num_workers=2,
    )

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict([sentence, sentence_empty])

    values = []
    for label in sentence.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example #2
def prepare_resources(self):
    # turn off INFO and DEBUG logging
    import flair  # KEEP THIS IMPORT HERE! (it initialises the 'flair' logger)
    import logging
    logger = logging.getLogger('flair')
    logger.setLevel(logging.WARNING)
    if self.seed:
        flair.set_seed(self.seed)
Example #3
def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_multi",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([turian_embeddings],
                                                   fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
        multi_label=True,
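        # with multi_label=True each label is scored independently
        # (per-label sigmoid with a decision threshold, rather than a softmax)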
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  mini_batch_size=2,
                  max_epochs=50,
                  shuffle=True)

    # check if model can predict
    sentence = Sentence("this is Berlin")
    sentence_empty = Sentence("       ")

    model.predict(sentence)
    model.predict([sentence, sentence_empty])
    model.predict([sentence_empty])

    # load model
    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    # check if model predicts correct label
    sentence = Sentence("this is Berlin")
    sentence_double = Sentence("this is Berlin and pizza")

    loaded_model.predict([sentence, sentence_double])

    values = []
    for label in sentence_double.labels:
        assert label.value is not None
        assert 0.0 <= label.score <= 1.0
        assert type(label.score) is float
        values.append(label.value)

    assert "Berlin" in values
    assert "pizza" in values

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="city")
    print(result.classification_report)
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example #4
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    # Replace this with a more Pythonic solution later!
    word_embedding = json_config["word_embedding"]
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")
        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name,
                          language=language,
                          add_document_separator=True))

    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # FastText Embeddings
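    # FastText composes word vectors from character n-grams, so even
    # out-of-vocabulary tokens receive subword-based embeddings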
    embeddings = FastTextEmbeddings(embeddings=word_embedding)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    trainer.train(
        f"hipe2022-flert-we-trainer-{datasets}-{word_embedding}-bs{batch_size}-wsFalse-e{epoch}-lr{learning_rate}-crf{use_crf}-{seed}",
        mini_batch_size=batch_size,
        mini_batch_chunk_size=2,
        patience=3,
        max_epochs=epoch,
        shuffle=True,
        learning_rate=learning_rate,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #5
def test_sequence_tagger_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    # load dataset
    corpus: Corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={
            0: "text",
            1: "ner"
        },
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    # tagger without CRF
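    # use_rnn=False and reproject_embeddings=False reduce the model to a linear
    # classification head on top of the fine-tuned transformer (FLERT-style)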
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=TransformerWordEmbeddings("distilbert-base-uncased",
                                             fine_tune=True),
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # train
    trainer = ModelTrainer(tagger, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-4,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # check if loaded model can predict
    entities = [span.text for span in sentence.get_spans("ner")]
    assert "New York" in entities

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="ner")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example #6
def run_experiment(data_folder, task_name, model_name, split):
    # Adjust logging level
    logging.getLogger("flair").setLevel(level="ERROR")

    set_seed(1)

    if task_name in ["lft", "onb"]:
        # Configuration
        column_format = {0: "token", 1: "ner"}

        # We use official data from Riedl and Padó
        train_file = f"enp_DE.{task_name}.mr.tok.train.bio"
        dev_file = f"enp_DE.{task_name}.mr.tok.dev.bio"
        test_file = f"enp_DE.{task_name}.mr.tok.test.bio"

        # Corpus
        corpus = ColumnCorpus(
            data_folder=data_folder,
            column_format=column_format,
            train_file=train_file,
            dev_file=dev_file,
            test_file=test_file,
            tag_to_bioes="ner",
        )

    tagger: SequenceTagger = SequenceTagger.load(model_name)

    ds = corpus.test if split == "test" else corpus.dev

    for sentence in ds:
        tagger.predict(sentence, label_name="predicted")

        gold_spans = sentence.get_spans("ner")

        pred_spans = sentence.get_spans("predicted")

        for token in sentence:
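            # reconstruct a BIO tag for this token: B- on the first token of a
            # span, I- inside a span, O outside any span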
            gold_tag = "O"

            for span in gold_spans:
                if token in span:
                    gold_tag = "B-" + span.tag if token == span[
                        0] else "I-" + span.tag

            pred_tag = "O"

            for span in pred_spans:
                if token in span:
                    pred_tag = "B-" + span.tag if token == span[
                        0] else "I-" + span.tag

            print(f"{token.text} {gold_tag} {pred_tag}")

        print("")
Example #7
def test_sequence_tagger_with_crf(results_base_path, tasks_base_path):
    flair.set_seed(123)

    # load dataset
    corpus: Corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={0: "text", 1: "ner"},
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    # tagger with CRF
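    # the CRF layer additionally models label-transition scores, which
    # encourages span-consistent BIOES predictions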
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=True,
    )

    # train
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=False,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # check if loaded model can predict
    entities = [span.text for span in sentence.get_spans('ner')]
    assert "New York" in entities

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type='ner')
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
Example #8
def test_text_classifier_multi(results_base_path, tasks_base_path):
    flair.set_seed(123)

    flair_embeddings = FlairEmbeddings("news-forward-fast")

    corpus = ClassificationCorpus(
        tasks_base_path / "trivial" / "trivial_text_classification_single",
        label_type="city",
    )
    label_dict = corpus.make_label_dictionary(label_type="city")

    model: TextClassifier = TextClassifier(
        document_embeddings=DocumentPoolEmbeddings([flair_embeddings], fine_tune_mode="linear"),
        label_dictionary=label_dict,
        label_type="city",
    )

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, mini_batch_size=2, max_epochs=1, shuffle=True)

    del model
    train_log_file = results_base_path / "training.log"
    assert train_log_file.exists()
    lines = train_log_file.read_text(encoding="utf-8").split("\n")
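    # spot-check that the log contains the sections Flair writes during a run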
    expected_substrings = [
        "Device: ",
        "Corpus: ",
        "Parameters:",
        "- learning_rate: ",
        "- patience: ",
        "Embeddings storage mode:",
        "epoch 1 - iter",
        "EPOCH 1 done: loss",
        "Results:",
    ]
    for expected_substring in expected_substrings:
        assert any(expected_substring in line for line in lines), expected_substring
Example #9
import sys
import os
sys.path.insert(0, "/vol/fob-vol7/mi19/harnisph/flair")

import flair
import torch
from flair.models import TARSSequenceTagger2
from flair.data import Sentence
from flair.datasets import CONLL_03

tagger = TARSSequenceTagger2.load(
    "resources/v3/moviecomplex-long/final-model.pt")

flair.set_seed(3)

label_name_map = {
    "LOC": "Location",
    "PER": "Person",
    "ORG": "Organization",
    "MISC": "Miscellaneous"
}
print(label_name_map)
corpus = CONLL_03(
    tag_to_bioes=None,
    tag_to_bio2="ner",
    label_name_map=label_name_map,
    base_path="/vol/fob-vol7/mi19/harnisph/studienprojekt-dokumentation")
corpus = corpus.downsample(0.1)
tag_type = "ner"
tag_dictionary = corpus.make_label_dictionary(tag_type)
# assumption: add_and_switch_to_new_task follows the standard TARS signature
# (task name, label dictionary, label type)
tagger.add_and_switch_to_new_task("zeroshot-moviecomplex-long-to-conll3",
                                  tag_dictionary, tag_type)
Example #10
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    # Replace this with a more Pythonic solution later!
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")

        current_corpus = NER_HIPE_2022(dataset_name=dataset_name,
                                       language=language,
                                       add_document_separator=True)

        for split in ["train", "dev"]:
            kb_data = []

            print(f"Loading KB contexts for {dataset}...")

            with open(f"kb_data/ajmc/{language}/{language}_{split}.jsonl",
                      "rt") as f_p:
                for line in f_p:
                    kb_data.append(line)

            corpus_split = current_corpus.train if split == "train" else current_corpus.dev

            for index, sent in enumerate(corpus_split):
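                # attach the KB context tokens to each sentence so the custom
                # KBTransformerEmbeddings class can consume them during training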
                jsonl = json.loads(kb_data[index])

                kb_context = " ".join(jsonl["contexts"]).split(" ")

                sent.kb_context = kb_context

        corpus_list.append(current_corpus)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # Embeddings
    embeddings = KBTransformerEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-kb-{datasets}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #11
from flair.training_utils import EvaluationMetric
from flair.visual.training_curves import Plotter

# get the corpus
#flair.set_seed(1)
import flair
# flair.set_seed(2)
# flair.set_seed(3)
from flair.datasets import CONLL_03
from mapping import (twitter_ner_mapped, onto_ner_mapped, wikigold_ner_mapped,
                     webpages_ner_mapped)

dataset_name = "conll3"

for seed in [1, 2, 3]:
    flair.set_seed(123)

    if dataset_name == "onto_ner":

        corpus = onto_ner_mapped()
    elif dataset_name == "conll3":
        corpus = CONLL_03()
    elif dataset_name == "wikipedia":
        corpus = wikigold_ner_mapped()
    elif dataset_name == "webpages":
        corpus = webpages_ner_mapped()
    elif dataset_name == "twitter":
        corpus = twitter_ner_mapped()

    flair.set_seed(seed)
Example #12
import sys
import os
#sys.path.insert(0, os.path.join('C:/', 'Users', 'pharn', 'flair'))
#sys.path.insert(0, os.path.join('C:/', 'Users', 'pharn', 'AppData', 'Local', 'Packages', 'PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0', 'LocalCache', 'local-packages', 'Python38', 'site-packages'))
sys.path.insert(0, "/vol/fob-vol7/mi19/harnisph/flair")
sys.path.insert(0, os.path.join("vol", "fob-vol7", "mi19", "harnisph",
                                "flair"))

import flair
from flair.models import TARSSequenceTagger2
from flair.data import Sentence
#from flair.datasets import CONLL_3, MIT_MOVIE_NER_COMPLEX
from flair.datasets import WNUT_2020_NER

flair.set_seed(1)

tagger = TARSSequenceTagger2.load(
    "resources/testfaelle-studproj/conll_3/final-model.pt")

sentences = [
    Sentence(
        "The Parlament of the United Kingdom is discussing a variety of topics."
    ),
    Sentence(
        "A man fell in love with a woman. This takes place in the last century. The film received the Golden Love Film Award."
    ),
    Sentence("The Company of Coca Cola was invented in 1901."),
    Sentence("This is very frustrating! I was smiling since I saw you."),
    Sentence("The Green Party received only a small percentage of the vote."),
    Sentence(
        "Bayern Munich won the german soccer series the sixth time in a row.")
Example #13
def main():
    parser = HfArgumentParser((ModelArguments, TrainingArguments, FlertArguments, DataArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        (
            model_args,
            training_args,
            flert_args,
            data_args,
        ) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        (
            model_args,
            training_args,
            flert_args,
            data_args,
        ) = parser.parse_args_into_dataclasses()

    set_seed(training_args.seed)

    flair.device = training_args.device

    corpus = get_flair_corpus(data_args)

    logger.info(corpus)

    tag_type: str = "ner"
    tag_dictionary = corpus.make_label_dictionary(tag_type)
    logger.info(tag_dictionary)

    embeddings = TransformerWordEmbeddings(
        model=model_args.model_name_or_path,
        layers=model_args.layers,
        subtoken_pooling=model_args.subtoken_pooling,
        fine_tune=True,
        use_context=flert_args.context_size,
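        # use_context enables FLERT-style embeddings: each sentence is encoded
        # together with surrounding document context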
        respect_document_boundaries=flert_args.respect_document_boundaries,
    )

    tagger = SequenceTagger(
        hidden_size=model_args.hidden_size,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=model_args.use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    trainer = ModelTrainer(tagger, corpus)

    trainer.fine_tune(
        data_args.output_dir,
        learning_rate=training_args.learning_rate,
        mini_batch_size=training_args.batch_size,
        mini_batch_chunk_size=training_args.mini_batch_chunk_size,
        max_epochs=training_args.num_epochs,
        embeddings_storage_mode=training_args.embeddings_storage_mode,
        weight_decay=training_args.weight_decay,
    )

    torch.save(model_args, os.path.join(data_args.output_dir, "model_args.bin"))
    torch.save(training_args, os.path.join(data_args.output_dir, "training_args.bin"))

    # finally, print model card for information
    tagger.print_model_card()
Example #14
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    # Replace this with a more Pythonic solution later!
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)
    additional_hipe_datasets = json_config.get("additional_hipe_datasets")
    label_name_map = json_config.get("label_name_map")

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")

        preproc_fn = None

        if dataset_name == "ajmc":
            preproc_fn = prepare_ajmc_corpus

        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name,
                          language=language,
                          preproc_fn=preproc_fn,
                          add_document_separator=True))

    if additional_hipe_datasets and label_name_map:
        # Special case: do not use dev data from the additional datasets.
        # This makes evaluation and comparison much easier!

        for dataset in additional_hipe_datasets:
            dataset_name, language = dataset.split("/")

            preproc_fn = None

            if dataset_name == "hipe2020":
                print("Using own HIPE-2020 Preprocessing function.")
                print(
                    "Please make sure that Flair Datasets folder was cleaned before!"
                )
                preproc_fn = prepare_clef_2020_corpus

            additional_corpus = NER_HIPE_2022(dataset_name=dataset_name,
                                              label_name_map=label_name_map,
                                              language=language,
                                              add_document_separator=True,
                                              preproc_fn=preproc_fn)
            additional_corpus._dev = []
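            # dropping the dev split ensures model selection only uses the
            # primary datasets' dev data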
            corpus_list.append(additional_corpus)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # Embeddings
    embeddings = TransformerWordEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-{datasets}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #15
def run_experiment(data_folder, task_name, model_name, run_id, use_context):
    # Set seed for reproducibility
    set_seed(int(run_id))

    if use_context == 0:
        use_context = False

    print("FLERT Context:", use_context)

    if task_name in ["lft", "onb"]:

        # Configuration
        column_format = {0: "token", 1: "ner"}

        # We use official data from Riedl and Padó
        train_file = f"enp_DE.{task_name}.mr.tok.train.bio"
        dev_file = f"enp_DE.{task_name}.mr.tok.dev.bio"
        test_file = f"enp_DE.{task_name}.mr.tok.test.bio"

        # Corpus
        corpus = ColumnCorpus(
            data_folder=data_folder,
            column_format=column_format,
            train_file=train_file,
            dev_file=dev_file,
            test_file=test_file,
            tag_to_bioes="ner",
        )

    # Corpus configuration
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # Embeddings
    embedding_types: List[TokenEmbeddings] = [
        TransformerWordEmbeddings(model=model_name,
                                  layers="all",
                                  layer_mean=True,
                                  use_context=use_context)
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
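    # StackedEmbeddings concatenates its inputs; a single-element stack keeps
    # the setup easy to extend with further embedding types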

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        f"resources/taggers/ner-{task_name}-{model_name}-context{use_context}-{run_id}",
        learning_rate=0.1,
        mini_batch_size=16,
        max_epochs=200,
        shuffle=True,
    )
Example #16
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets,
                   json_config):
    # Config values
    # Replace this with a more Pythonic solution later!
    best_model = json_config["best_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")
        preproc_fn = None

        if dataset_name == "ajmc":
            preproc_fn = prepare_ajmc_corpus

        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name,
                          language=language,
                          preproc_fn=preproc_fn,
                          add_document_separator=True))

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list,
                                       sample_missing_splits=False)
    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    print("Loading model from stage 1:", best_model)
    tagger: SequenceTagger = SequenceTagger.load(best_model)
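    # stage 2 of multi-stage training: the model fine-tuned in stage 1 is
    # loaded and fine-tuned further on the corpora defined above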

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join([dataset for dataset in hipe_datasets])

    best_model_name = best_model.replace("/", "_")

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-multistage-{datasets}-{best_model_name}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
Example #17
def run_experiment(seed, batch_size, epoch, learning_rate, json_config):
    # Config values
    # Replace this with a more Pythonic solution later!
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)
    task_name = json_config["task_name"]

    # Dataset-related
    data_folder = json_config["data_folder"]
    train_file = json_config["train_file"]
    dev_file = json_config["dev_file"]
    test_file = json_config["test_file"]

    # Set seed for reproducibility
    set_seed(seed)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    # Configuration
    column_format = {0: "text", 1: "ner"}

    # Corpus
    corpus = ColumnCorpus(data_folder=data_folder,
                          column_format=column_format,
                          train_file=train_file,
                          dev_file=dev_file,
                          test_file=test_file,
                          tag_to_bioes="ner",
                         )

    # Corpus configuration
    tag_type = "ner"
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # Embeddings
    embeddings = TransformerWordEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.fine_tune(
        f"histo-flert-fine-tuning-{task_name}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode='none',
        weight_decay=0.,
        use_final_model_for_eval=False,
    )
Example #18
                        help="Model name (such as Hugging Face model hub name")
    parser.add_argument(
        "-d",
        "--dataset",
        type=str,
        help="Defines dataset, choose between imst, boun or xtreme")

    # Parse experimental arguments
    args = parser.parse_args()

    # use cuda device as passed
    flair.device = f'cuda:{str(args.cuda)}'
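    # PyTorch accepts device strings such as 'cuda:0', so this selects the GPU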

    # for each passed seed, do one experimental run
    for seed in args.seeds:
        flair.set_seed(seed)

        # model
        hf_model = args.model

        # initialize embeddings
        embeddings = TransformerWordEmbeddings(
            model=hf_model,
            layers="-1",
            subtoken_pooling="first",
            fine_tune=True,
            use_context=False,
            respect_document_boundaries=False,
        )

        # select dataset depending on which language variable is passed