def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # train for 2 epochs and write a checkpoint
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # load the checkpoint and continue training
    checkpoint = SequenceTagger.load_checkpoint(results_base_path / "checkpoint.pt")
    trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    # clean up results directory
    shutil.rmtree(results_base_path)
def run_evaluator(model_name: str, dataset_names: str):
    corpus_list = []

    # Dataset-related: dataset_names is a comma-separated list of "name/language" pairs
    for dataset in dataset_names.split(","):
        dataset_name, language = dataset.split("/")

        preproc_fn = None
        if dataset_name == "ajmc":
            preproc_fn = prepare_ajmc_corpus

        corpus_list.append(
            NER_HIPE_2022(
                dataset_name=dataset_name,
                language=language,
                preproc_fn=preproc_fn,
                add_document_separator=True,
            )
        )

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list, sample_missing_splits=False)

    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    model = SequenceTagger.load(model_name)

    dev_result = model.evaluate(corpora.dev, gold_label_type="ner", mini_batch_size=8)
    print(dev_result)
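# Illustrative invocation of run_evaluator (a sketch only; the model path is
# hypothetical, while the dataset spec follows the "name/language" format parsed above):
run_evaluator("path/to/fine-tuned-ner-model.pt", "ajmc/en,ajmc/de")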
def multi_corpus(self, output: Union[str, Path], first_corpus: str = "germeval"):
    data_dir = Path(Path(self.directory).parent, first_corpus)
    first = self._load_corpus(data_dir)
    second = self._load_corpus()
    corpus = MultiCorpus([first, second])
    tagger = self._train(output, corpus)
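# Hedged usage sketch for the method above. The enclosing class is not shown in
# this snippet, so the class name and constructor argument here are hypothetical:
#
#   experiment = TaggerExperiment(directory="data/second_corpus")
#   experiment.multi_corpus(output="models/multi", first_corpus="germeval")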
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
    )
    corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path).downsample(0.1)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_label_dictionary("ner")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        allow_unk_predictions=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

    del trainer, tagger, corpus

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    del loaded_model
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=2,
        shuffle=False,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_multi_corpus(tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        tasks_base_path / "germeval_14", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.ColumnCorpus(
        tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )

    # get two corpora as one
    corpus = MultiCorpus([corpus_1, corpus_2])

    assert len(corpus.train) == 8
    assert len(corpus.dev) == 2
    assert len(corpus.test) == 2
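# A minimal sketch (not part of the test above) of the property those assertions
# rely on: MultiCorpus concatenates the splits of its member corpora, so each
# combined split size is the sum of the member split sizes.
def check_split_sizes(corpus_1, corpus_2):
    combined = MultiCorpus([corpus_1, corpus_2])
    assert len(combined.train) == len(corpus_1.train) + len(corpus_2.train)
    assert len(combined.dev) == len(corpus_1.dev) + len(corpus_2.dev)
    assert len(combined.test) == len(corpus_1.test) + len(corpus_2.test)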
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets, json_config):
    # Config values
    word_embedding = json_config["word_embedding"]
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")
        corpus_list.append(
            NER_HIPE_2022(dataset_name=dataset_name, language=language, add_document_separator=True)
        )

    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list, sample_missing_splits=False)

    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # FastText Embeddings
    embeddings = FastTextEmbeddings(embeddings=word_embedding)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join(hipe_datasets)

    trainer.train(
        f"hipe2022-flert-we-trainer-{datasets}-{word_embedding}-bs{batch_size}-wsFalse-e{epoch}-lr{learning_rate}-crf{use_crf}-{seed}",
        mini_batch_size=batch_size,
        mini_batch_chunk_size=2,
        patience=3,
        max_epochs=epoch,
        shuffle=True,
        learning_rate=learning_rate,
    )

    # Finally, print model card for information
    tagger.print_model_card()
def test_train_resume_tagger(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
    )
    corpus_2 = flair.datasets.NER_GERMAN_GERMEVAL(base_path=tasks_base_path).downsample(0.1)
    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_label_dictionary("ner")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )

    # train model for 2 epochs
    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path, max_epochs=2, shuffle=False, checkpoint=True)

    del model

    # load the checkpoint model and train until epoch 4
    checkpoint_model = SequenceTagger.load(results_base_path / "checkpoint.pt")
    trainer.resume(model=checkpoint_model, max_epochs=4)

    # clean up
    del trainer
# 2. what tag do we want to predict?
tag_type = 'pos'

# combine the English, German and Dutch corpora into one multi-corpus
multi_corpus = MultiCorpus([english_corpus, german_corpus, dutch_corpus])

# 3. make the tag dictionary from the corpus
tag_dictionary = multi_corpus.make_tag_dictionary(tag_type=tag_type)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('extvec'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

trainer: ModelTrainer = ModelTrainer(tagger, multi_corpus)

result = trainer.train('resources/taggers/example-pos',
                       train_with_dev=True,
                       embeddings_storage_mode='cpu',
                       mini_batch_size=256,
                       max_epochs=max_epochs)
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets, json_config):
    # Config values
    hf_model = json_config["hf_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")
        current_corpus = NER_HIPE_2022(
            dataset_name=dataset_name, language=language, add_document_separator=True
        )

        # attach pre-computed knowledge-base contexts to each train/dev sentence
        for split in ["train", "dev"]:
            kb_data = []
            print(f"Loading KB contexts for {dataset}...")
            with open(f"kb_data/ajmc/{language}/{language}_{split}.jsonl", "rt") as f_p:
                for line in f_p:
                    kb_data.append(line)

            corpus_split = current_corpus.train if split == "train" else current_corpus.dev

            for index, sent in enumerate(corpus_split):
                jsonl = json.loads(kb_data[index])
                kb_context = " ".join(jsonl["contexts"]).split(" ")
                sent.kb_context = kb_context

        corpus_list.append(current_corpus)

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list, sample_missing_splits=False)

    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    # Embeddings
    embeddings = KBTransformerEmbeddings(
        model=hf_model,
        layers=layers,
        subtoken_pooling="first",
        fine_tune=True,
        use_context=context_size,
    )

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=use_crf,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join(hipe_datasets)

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-kb-{datasets}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode="none",
        weight_decay=0.0,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
# 2. initialize embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

embedding_types = [
    WordEmbeddings("pubmed"),
    FlairEmbeddings("pubmed-forward"),
    FlairEmbeddings("pubmed-backward"),
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 3. Initialize corpus
# We also train on the test portions of the corpora, because we evaluate on held-out corpora
from flair.data import MultiCorpus
from torch.utils.data import ConcatDataset

corpus = MultiCorpus(GENE_CORPORA)
corpus._train = ConcatDataset([corpus._train, corpus._test])

# 4. Initialize sequence tagger
from flair.models import SequenceTagger

tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type="ner",
    use_crf=True,
    locked_dropout=0.5,
)
def load_corpora(tasks: List[Union[NLPTask, str]], base_path: Path = None) -> MultiCorpus:
    return MultiCorpus([NLPTaskDataFetcher.load_corpus(task, base_path) for task in tasks])
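# A minimal usage sketch for load_corpora, assuming the NLPTask enum from
# flair.data_fetcher is in scope (the chosen tasks are illustrative only):
corpora = load_corpora([NLPTask.UD_ENGLISH, NLPTask.UD_GERMAN])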
print('-' * 150, flush=True)
print(" - Domain: ", domain, flush=True)
print(" - Corpus Train Size: ", len(corpus.train), flush=True)
print(" - Corpus Dev Size: ", len(corpus.dev), flush=True)
print(" - Corpus Test Size: ", len(corpus.test), flush=True)
print('-' * 150, flush=True)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    WordEmbeddings('extvec'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

multi_all_corpus = MultiCorpus(list(ALL_POS_ALL_CORPUS.values()))

# 3. make the tag dictionary from the corpus
tag_dictionary = multi_all_corpus.make_tag_dictionary(tag_type=tag_type)
print('-' * 50, '\nTag_dictionary size: ', len(tag_dictionary), '\n', '-' * 50, flush=True)

# initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type)

for target_domain in ALL_POS_DOMAINS:
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets, json_config): # Config values # Replace it with more Pythonic solutions later! hf_model = json_config["hf_model"] context_size = json_config["context_size"] layers = json_config["layers"] if "layers" in json_config else "-1" use_crf = json_config["use_crf"] if "use_crf" in json_config else False additional_hipe_datasets = json_config[ "additional_hipe_datasets"] if "additional_hipe_datasets" in json_config else None label_name_map = json_config[ "label_name_map"] if "label_name_map" in json_config else None # Set seed for reproducibility set_seed(seed) corpus_list = [] # Dataset-related for dataset in hipe_datasets: dataset_name, language = dataset.split("/") preproc_fn = None if dataset_name == "ajmc": preproc_fn = prepare_ajmc_corpus corpus_list.append( NER_HIPE_2022(dataset_name=dataset_name, language=language, preproc_fn=preproc_fn, add_document_separator=True)) if additional_hipe_datasets and label_name_map: # Special case: do not use Dev data from additional datasets # This makes evaluation and comparison much more easier! for dataset in additional_hipe_datasets: dataset_name, language = dataset.split("/") preproc_fn = None if dataset_name == "hipe2020": print("Using own HIPE-2020 Preprocessing function.") print( "Please make sure that Flair Datasets folder was cleaned before!" ) preproc_fn = prepare_clef_2020_corpus additional_corpus = NER_HIPE_2022(dataset_name=dataset_name, label_name_map=label_name_map, language=language, add_document_separator=True, preproc_fn=preproc_fn) additional_corpus._dev = [] corpus_list.append(additional_corpus) if context_size == 0: context_size = False print("FLERT Context:", context_size) print("Layers:", layers) print("Use CRF:", use_crf) corpora: MultiCorpus = MultiCorpus(corpora=corpus_list, sample_missing_splits=False) label_dictionary = corpora.make_label_dictionary(label_type="ner") print("Label Dictionary:", label_dictionary.get_items()) # Embeddings embeddings = TransformerWordEmbeddings( model=hf_model, layers=layers, subtoken_pooling="first", fine_tune=True, use_context=context_size, ) tagger: SequenceTagger = SequenceTagger( hidden_size=256, embeddings=embeddings, tag_dictionary=label_dictionary, tag_type="ner", use_crf=use_crf, use_rnn=False, reproject_embeddings=False, ) # Trainer trainer: ModelTrainer = ModelTrainer(tagger, corpora) datasets = "-".join([dataset for dataset in hipe_datasets]) trainer.fine_tune( f"hipe2022-flert-fine-tune-{datasets}-{hf_model}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}", learning_rate=learning_rate, mini_batch_size=batch_size, max_epochs=epoch, shuffle=True, embeddings_storage_mode='none', weight_decay=0., use_final_model_for_eval=False, ) # Finally, print model card for information tagger.print_model_card()
def run_experiment(seed, batch_size, epoch, learning_rate, hipe_datasets, json_config):
    # Config values
    best_model = json_config["best_model"]
    context_size = json_config["context_size"]
    layers = json_config.get("layers", "-1")
    use_crf = json_config.get("use_crf", False)

    # Set seed for reproducibility
    set_seed(seed)

    corpus_list = []

    # Dataset-related
    for dataset in hipe_datasets:
        dataset_name, language = dataset.split("/")

        preproc_fn = None
        if dataset_name == "ajmc":
            preproc_fn = prepare_ajmc_corpus

        corpus_list.append(
            NER_HIPE_2022(
                dataset_name=dataset_name,
                language=language,
                preproc_fn=preproc_fn,
                add_document_separator=True,
            )
        )

    if context_size == 0:
        context_size = False

    print("FLERT Context:", context_size)
    print("Layers:", layers)
    print("Use CRF:", use_crf)

    corpora: MultiCorpus = MultiCorpus(corpora=corpus_list, sample_missing_splits=False)

    label_dictionary = corpora.make_label_dictionary(label_type="ner")
    print("Label Dictionary:", label_dictionary.get_items())

    print("Loading model from stage 1:", best_model)
    tagger: SequenceTagger = SequenceTagger.load(best_model)

    # Trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpora)

    datasets = "-".join(hipe_datasets)
    best_model_name = best_model.replace("/", "_")

    trainer.fine_tune(
        f"hipe2022-flert-fine-tune-multistage-{datasets}-{best_model_name}-bs{batch_size}-ws{context_size}-e{epoch}-lr{learning_rate}-layers{layers}-crf{use_crf}-{seed}",
        learning_rate=learning_rate,
        mini_batch_size=batch_size,
        max_epochs=epoch,
        shuffle=True,
        embeddings_storage_mode="none",
        weight_decay=0.0,
        use_final_model_for_eval=False,
    )

    # Finally, print model card for information
    tagger.print_model_card()
import flair
from flair.data import Corpus
#from flair.datasets import TREC_6
from flair.models import SimpleSequenceTagger
from flair.trainers import ModelTrainer
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
from flair.data import MultiCorpus
from flair.datasets import UD_ENGLISH, UD_GERMAN

# 1. get the corpora - English and German UD
corpus: MultiCorpus = MultiCorpus([UD_ENGLISH(), UD_GERMAN()]).downsample(0.1)

# 2. what tag do we want to predict?
tag_type = 'upos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. embedding
glove_embedding = WordEmbeddings('glove')

# 5. sequence tagger
tagger = SimpleSequenceTagger(
columns = {0: 'text', 1: 'ner'}
data_folder = '../'

corpus1: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")

corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")

corpus = MultiCorpus([corpus1, corpus2])

custom_embedding = WordEmbeddings('../../glove/GloVe/vectors_converted_to_gensim.gensim')
#bert_embedding = BertEmbeddings('bert-embedding-files/')
word_embeddings = StackedEmbeddings([custom_embedding, WordEmbeddings('tr')])

search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[word_embeddings])
#search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128, 256, 512])
search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[2])
#search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2, 0.25])
from flair.data import TaggedCorpus, MultiCorpus
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
from flair.data import Dictionary
import flair, torch

flair.device = torch.device('cpu')

columns = {0: 'text', 1: 'ner'}
data_folder = '../'

corpus1: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")

corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder,
    columns,
    train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
    test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
    dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")

corpus = MultiCorpus([corpus1, corpus2])

tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
#tag_dictionary: Dictionary = Dictionary.load('../vocab/m.model')

glove_embedding = WordEmbeddings('../../glove/GLOVE/GloVe/vectors.gensim')
word2vec_embedding = WordEmbeddings('../../huawei_w2v/vector.gensim')
#bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')

embedding_types: List[TokenEmbeddings] = [WordEmbeddings('tr'), glove_embedding, word2vec_embedding]
#embedding_types: List[TokenEmbeddings] = [custom_embedding]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True,
                                        use_rnn=True,
                                        rnn_layers=2)

from flair.trainers import ModelTrainer