def test_loading_sequence_labeling_data():
    """Fetch the FASHION sequence-labeling corpus and verify its split sizes."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)

    # split -> expected number of sentences
    expected_sizes = {'train': 6, 'dev': 1, 'test': 1}
    assert len(corpus.train) == expected_sizes['train']
    assert len(corpus.dev) == expected_sizes['dev']
    assert len(corpus.test) == expected_sizes['test']
def test_loading_imdb_data():
    """Fetch the IMDB text-classification corpus and verify its split sizes."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)

    # all three splits of the IMDB sample carry five documents each
    for split in (corpus.train, corpus.dev, corpus.test):
        assert len(split) == 5
def test_load_ag_news_data(tasks_base_path):
    """Fetch AG_NEWS from the local tasks directory and verify its split sizes."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS, tasks_base_path)

    # all three splits of the AG_NEWS sample carry ten documents each
    for split in (corpus.train, corpus.dev, corpus.test):
        assert len(split) == 10
def test_loading_ag_news_data():
    """Fetch the AG_NEWS corpus from the default location and verify split sizes."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)

    expected = 10
    assert len(corpus.train) == expected
    assert len(corpus.dev) == expected
    assert len(corpus.test) == expected
def test_train_charlm__nocache_load_use_classifier():
    """Train a classifier on uncached CharLM embeddings, then reload and predict."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    # contextual string embeddings with the embedding cache disabled
    charlm_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [charlm_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    # every predicted label must be non-empty with a float score in [0, 1]
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # the serialized model must load from disk and predict without errors
    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
def test_train_charlm_changed_chache_load_use_tagger():
    """Train a tagger with a custom embedding cache dir; reload after the cache is gone.

    NOTE(review): 'chache' in the name looks like a typo for 'cache'; kept so the
    test id stays stable.
    """
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    # make a temporary cache directory that we remove afterwards
    os.makedirs('./results/cache/', exist_ok=True)
    embeddings = CharLMEmbeddings('news-forward-fast', cache_directory='./results/cache/')

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus, test_mode=True)
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)

    # the model must still load once its embedding cache has been removed
    shutil.rmtree('./results/cache')
    loaded_model: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    # single sentence, mixed batch, and empty-only batch must all predict cleanly
    for batch in (sentence, [sentence, sentence_empty], [sentence_empty]):
        loaded_model.predict(batch)

    # clean up results directory
    shutil.rmtree('./results')
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    """Train a classifier on IMDB, check its predictions, then reload it from disk."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    classifier = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(classifier, corpus, label_dict, test_mode=True)
    trainer.train(str(results_base_path), max_epochs=2)

    # every predicted label must be non-empty with a float score in [0, 1]
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in classifier.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # the serialized model must load from disk and predict without errors
    reloaded = TextClassifier.load_from_file(results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    reloaded.predict(sentence)
    reloaded.predict([sentence, sentence_empty])
    reloaded.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
def test_train_load_use_tagger():
    """Train a small NER tagger on FASHION, then reload it and run predictions."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    glove = WordEmbeddings('glove')

    ner_tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=glove,
                                                tag_dictionary=tag_dictionary,
                                                tag_type='ner',
                                                use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(ner_tagger, corpus, test_mode=True)
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=3)

    reloaded: SequenceTagger = SequenceTagger.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    # single sentence, mixed batch, and empty-only batch must all predict cleanly
    for batch in (sentence, [sentence, sentence_empty], [sentence_empty]):
        reloaded.predict(batch)

    # clean up results directory
    shutil.rmtree('./results')
def test_load_ud_english_data(tasks_base_path):
    """Fetch UD_ENGLISH from the local tasks directory and verify its split sizes."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.UD_ENGLISH, tasks_base_path)

    assert (len(corpus.train), len(corpus.test), len(corpus.dev)) == (6, 4, 2)
def test_load_germeval_data(tasks_base_path):
    """Fetch GERMEVAL from the local tasks directory and verify its split sizes."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.GERMEVAL, tasks_base_path)

    assert (len(corpus.train), len(corpus.dev), len(corpus.test)) == (2, 1, 1)
def test_load_imdb_data(tasks_base_path):
    """Fetch IMDB from the local tasks directory and verify its split sizes."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)

    # all three splits of the IMDB sample carry five documents each
    for split in (corpus.train, corpus.dev, corpus.test):
        assert len(split) == 5
def test_training():
    """Smoke-test sequence-tagger training end to end on the FASHION corpus."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    glove = WordEmbeddings('glove')

    ner_tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=glove,
                                                tag_dictionary=tag_dictionary,
                                                tag_type='ner',
                                                use_crf=False)

    # train deterministically (test_mode) for a handful of epochs
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(ner_tagger, corpus, test_mode=True)
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)

    # clean up results directory
    shutil.rmtree('./results')
def init() -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    """Build the AG_NEWS corpus, its label dictionary, and an untrained classifier."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)
    label_dict = corpus.make_label_dictionary()

    embedding: WordEmbeddings = WordEmbeddings('en-glove')
    doc_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    classifier = TextClassifier(doc_embeddings, label_dict, False)
    return corpus, label_dict, classifier
def test_text_classifier_single_label():
    """Smoke-test single-label text-classifier training on the IMDB corpus."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    classifier = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(classifier, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    # clean up results directory
    shutil.rmtree('./results')
def test_sentence_to_real_string():
    """Verify tokenized vs. detokenized string output for Sentence objects."""
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)
    assert 'I love Berlin.' == sentence.to_plain_string()

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.GERMEVAL)

    # (corpus index, expected tokenized form, expected detokenized form)
    cases = [
        (0,
         'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .',
         'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".'),
        (1,
         'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf .',
         'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf.'),
    ]
    for index, tokenized, plain in cases:
        sentence = corpus.train[index]
        assert tokenized == sentence.to_tokenized_string()
        assert plain == sentence.to_plain_string()
def text_classification():
    """Train a multi-label text classifier on the AG_NEWS corpus end to end."""
    corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)

    # drop empty sentences from every split before training
    corpus.train = [sentence for sentence in corpus.train if len(sentence) > 0]
    corpus.test = [sentence for sentence in corpus.test if len(sentence) > 0]
    corpus.dev = [sentence for sentence in corpus.dev if len(sentence) > 0]
    print("corpus created")

    label_dict = corpus.make_label_dictionary()
    print("created label dict")

    # stack static word embeddings with forward/backward contextual LM embeddings
    word_embeddings = [WordEmbeddings('glove'),
                       CharLMEmbeddings('news-forward'),
                       CharLMEmbeddings('news-backward')]
    print("loaded word embeddings")

    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_states=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    print("loaded document embeddings")

    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=True)
    print("created classifier")

    # 6. initialize the text classifier trainer
    trainer = TextClassifierTrainer(classifier, corpus, label_dict)
    print("starting training")

    # 7. start the training
    trainer.train('results',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=50)
    print("training finished")
def test_text_classifier_mulit_label():
    """Train a multi-label text classifier on IMDB and sanity-check its predictions.

    NOTE(review): 'mulit' in the test name looks like a typo for 'multi'; kept so
    the test id stays stable for any tooling that selects it by name.
    """
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings([glove_embedding], True)

    # multi_label=True: each sentence may receive several labels
    model = TextClassifier(document_embeddings, label_dict, True)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            # Fixed: Label exposes .value / .score — the .name / .confidence
            # accessors used here previously do not match the API exercised by
            # every other classifier test in this file and would raise
            # AttributeError.
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    # clean up results directory
    shutil.rmtree('./results')
def test_text_classifier_single_label(tasks_base_path):
    """Train a single-label classifier on IMDB and sanity-check its predictions."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [embedding], 128, 1, False, 64, False, False)

    classifier = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(classifier, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    # every predicted label must be non-empty with a float score in [0, 1]
    for predicted in classifier.predict(sentence):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
            assert type(label.score) is float

    # clean up results directory
    shutil.rmtree('./results')
def test_train_charlm_nochache_load_use_tagger(results_base_path, tasks_base_path):
    """Train a tagger on uncached CharLM embeddings, then reload it and predict.

    NOTE(review): 'nochache' in the name looks like a typo for 'nocache'; kept so
    the test id stays stable.
    """
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, base_path=tasks_base_path)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    # contextual string embeddings with the embedding cache disabled
    embeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)

    ner_tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type='ner',
                                                use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(ner_tagger, corpus, test_mode=True)
    trainer.train(str(results_base_path), learning_rate=0.1, mini_batch_size=2, max_epochs=2)

    reloaded: SequenceTagger = SequenceTagger.load_from_file(
        results_base_path / 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence(' ')

    # single sentence, mixed batch, and empty-only batch must all predict cleanly
    for batch in (sentence, [sentence, sentence_empty], [sentence_empty]):
        reloaded.predict(batch)

    # clean up results directory
    shutil.rmtree(results_base_path)
# Training-script preamble: part-of-speech tagging on the UD_ENGLISH corpus.
# NOTE(review): this chunk appears to be the top of a longer script —
# embedding_types is presumably consumed further down, outside this view.
from typing import List
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.data import TaggedCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, CharacterEmbeddings
from flair.visual.training_curves import Plotter

# 1. get the corpus
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(NLPTask.UD_ENGLISH)
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use contextual string embeddings
    #
    # CharLMEmbeddings('news-forward'),
    #
    # CharLMEmbeddings('news-backward'),
]
# Training-script preamble: NER on a 10% downsample of the CoNLL-03 corpus.
# NOTE(review): this chunk is truncated mid-list (the embedding_types literal
# is closed further down, outside this view).
from typing import List
import torch
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.data import TaggedCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, CharacterEmbeddings

# 1. get the corpus
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(
    NLPTask.CONLL_03).downsample(0.1)
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use contextual string embeddings
    #
    # CharLMEmbeddings('news-forward'),
    #
# Training-script preamble: NER on the Chinese (ZH) corpus.
# NOTE(review): this chunk ends just before the tagger is constructed —
# the script presumably continues outside this view.
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings
from typing import List
import os

# 1. get the corpus
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data(NLPTask.ZH)
print(corpus)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    WordEmbeddings('zh'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use contextual string embeddings
    #CharLMEmbeddings('news-forward'),
    # CharLMEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger