def test_load_ud_english_data():
    """Fetch the UD_ENGLISH corpus and check the expected split sizes."""
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.UD_ENGLISH)

    assert len(corpus.train) == 6
    assert len(corpus.test) == 4
    assert len(corpus.dev) == 2
def test_load_germeval_data():
    """Fetch the GERMEVAL corpus and check the expected split sizes."""
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.GERMEVAL)

    assert len(corpus.train) == 2
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1
def test_load_imdb_data():
    """Fetch the IMDB corpus and check the expected split sizes."""
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 5
    assert len(corpus.test) == 5
def test_load_sequence_labeling_data():
    """Fetch the FASHION sequence-labeling corpus and check the split sizes."""
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)

    assert len(corpus.train) == 6
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1
def test_load_ag_news_data():
    """Fetch the AG_NEWS corpus and check the expected split sizes."""
    # get training, test and dev data
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)

    assert len(corpus.train) == 10
    assert len(corpus.dev) == 10
    assert len(corpus.test) == 10
def init() -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    """Build the shared test fixture: AG_NEWS corpus, its label dictionary,
    and a single-label TextClassifier over mean-pooled LSTM doc embeddings.

    Returns:
        A (corpus, label_dict, model) triple for use by classifier tests.
    """
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)
    label_dict = corpus.make_label_dictionary()

    # glove word embeddings feed a small document-level LSTM encoder
    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    # single-label (multi_label=False) classifier head
    model = TextClassifier(document_embeddings, label_dict, False)

    return corpus, label_dict, model
def test_sentence_to_real_string():
    """Check tokenized vs. plain-string round-tripping of sentences."""
    # a tokenized English sentence detokenizes back to the original text
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)
    assert ('I love Berlin.' == sentence.to_plain_string())

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.GERMEVAL)

    # first GermEval sentence: tokenized form keeps spaces around punctuation,
    # plain form reattaches punctuation to the neighboring words
    sentence = corpus.train[0]
    assert (
        'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .' == sentence.to_tokenized_string())
    assert (
        'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".' == sentence.to_plain_string())

    # second GermEval sentence: same check without quote characters
    sentence = corpus.train[1]
    assert (
        'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf .' == sentence.to_tokenized_string())
    assert (
        'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf.' == sentence.to_plain_string())
def test_training():
    """Smoke-test SequenceTagger training on the small FASHION NER corpus."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION)
    tag_dictionary = corpus.make_tag_dictionary('ner')

    embeddings = WordEmbeddings('glove')
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type='ner',
                                            use_crf=False)

    # initialize trainer
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger, corpus,
                                                           test_mode=True)
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2,
                  max_epochs=10)

    # clean up results directory
    shutil.rmtree('./results')
def test_text_classifier_mulit_label():
    """Train a multi-label TextClassifier briefly and sanity-check predictions."""
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings(
        [glove_embedding])

    # multi_label=True classifier head
    model = TextClassifier(document_embeddings, label_dict, True)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    # every predicted label must carry a name and a float confidence in [0, 1]
    sentence = Sentence("Berlin is a really nice city.")
    for predicted in model.predict(sentence):
        for label in predicted.labels:
            assert (label.name is not None)
            assert (0.0 <= label.confidence <= 1.0)
            assert (type(label.confidence) is float)

    # clean up results directory
    shutil.rmtree('./results')
from typing import List import torch from flairrelex.data_fetcher import NLPTaskDataFetcher, NLPTask from flairrelex.data import TaggedCorpus from flairrelex.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, CharacterEmbeddings # 1. get the corpus corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_data( NLPTask.CONLL_03).downsample(0.1) print(corpus) # 2. what tag do we want to predict? tag_type = 'ner' # 3. make the tag dictionary from the corpus tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) print(tag_dictionary.idx2item) # initialize embeddings embedding_types: List[TokenEmbeddings] = [ WordEmbeddings('glove'), # comment in this line to use character embeddings # CharacterEmbeddings(), # comment in these lines to use contextual string embeddings # # CharLMEmbeddings('news-forward'), #