Example #1
def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.", use_tokenizer=True)
    dev_sentence = Sentence("I'm a dev sentence.", use_tokenizer=True)
    test_sentence = Sentence("I will be only used for testing.", use_tokenizer=True)

    corpus: TaggedCorpus = TaggedCorpus([train_sentence], [dev_sentence], [test_sentence])

    all_sentences = corpus.get_all_sentences()

    assert (3 == len(all_sentences))
Example #2
def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence('sentence 1', labels=['class_1'])
    sentence_2 = Sentence('sentence 2', labels=['class_2'])
    sentence_3 = Sentence('sentence 3', labels=['class_1'])

    corpus: TaggedCorpus = TaggedCorpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert (2 == len(label_dict))
    assert ('<unk>' not in label_dict.get_items())
    assert ('class_1' in label_dict.get_items())
    assert ('class_2' in label_dict.get_items())
Example #3
    def read_column_data(path_to_column_file: str,
                         column_name_map: Dict[int, str],
                         infer_whitespace_after: bool = True):
        """
        Reads a file in column format and produces a list of Sentences with token-level annotation as specified in the
        column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you
        specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third
        the chunk and the fourth the NER tag.
        :param path_to_column_file: the path to the column file
        :param column_name_map: a map of column number to token annotation name
        :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_column_file).read().strip().split('\n')

        # most datasets have the token text in the first column; if not, map the correct column number to 'text' in column_name_map
        text_column: int = 0
        for column in column_name_map:
            if column_name_map[column] == 'text':
                text_column = column

        sentence: Sentence = Sentence()
        for line in lines:

            if line.startswith('#'):
                continue

            if line == '':
                if len(sentence) > 0:
                    sentence._infer_space_after()
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            else:
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[text_column])
                for column in column_name_map:
                    if len(fields) > column:
                        if column != text_column:
                            token.add_tag(column_name_map[column], fields[column])
                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentence._infer_space_after()
            sentences.append(sentence)

        return sentences
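A minimal usage sketch for read_column_data, assuming it is exposed as a static method of NLPTaskDataFetcher and that the import path mirrors flair's data_fetcher module; the column file path and its contents are hypothetical:

from flairrelex.data_fetcher import NLPTaskDataFetcher  # assumed module path

# Hypothetical whitespace-separated column file; a blank line ends a sentence:
#   George      NNP  B-NP  B-PER
#   Washington  NNP  I-NP  I-PER
#   went        VBD  B-VP  O
# Column 0 holds the token text, column 1 the PoS tag, column 2 the chunk, column 3 the NER tag.
sentences = NLPTaskDataFetcher.read_column_data(
    'resources/tasks/example_ner.txt',  # hypothetical path
    column_name_map={0: 'text', 1: 'pos', 2: 'np', 3: 'ner'})

print(len(sentences))
print(sentences[0].to_tagged_string())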
Example #4
def test_sentence_infer_tokenization():
    sentence: Sentence = Sentence()
    sentence.add_token(Token('xyz'))
    sentence.add_token(Token('"'))
    sentence.add_token(Token('abc'))
    sentence.add_token(Token('"'))
    sentence._infer_space_after()

    assert ('xyz " abc "' == sentence.to_tokenized_string())
    assert ('xyz "abc"' == sentence.to_plain_string())

    sentence: Sentence = Sentence('xyz " abc "')
    sentence._infer_space_after()
    assert ('xyz " abc "' == sentence.to_tokenized_string())
    assert ('xyz "abc"' == sentence.to_plain_string())
Example #5
def test_training():
    # get default dictionary
    dictionary: Dictionary = Dictionary.load('chars')

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus('resources/corpora/lorem_ipsum',
                                    dictionary,
                                    language_model.is_forward_lm,
                                    character_level=True)

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(
        language_model, corpus)
    trainer.train('./results',
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=5)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings = CharLMEmbeddings('./results/best-lm.pt')
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    print(sentence[1].embedding.size())

    # clean up results directory
    shutil.rmtree('./results')
Example #6
def test_create_sentence_without_tokenizer():
    sentence: Sentence = Sentence('I love Berlin.')

    assert (3 == len(sentence.tokens))
    assert ('I' == sentence.tokens[0].text)
    assert ('love' == sentence.tokens[1].text)
    assert ('Berlin.' == sentence.tokens[2].text)
Example #7
def init_document_embeddings():
    text = 'I love Berlin. Berlin is a great place to live.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    return sentence, glove, charlm
Example #8
def test_sentence_whitespace_tokenization():
    sentence: Sentence = Sentence('I  love Berlin .')

    assert (4 == len(sentence.tokens))
    assert ('I' == sentence.get_token(1).text)
    assert ('love' == sentence.get_token(2).text)
    assert ('Berlin' == sentence.get_token(3).text)
    assert ('.' == sentence.get_token(4).text)
Example #9
def test_sentence_get_item():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert (sentence.get_token(1) == sentence[0])
    assert (sentence.get_token(3) == sentence[2])

    with pytest.raises(IndexError):
        token = sentence[4]
Example #10
def test_create_sentence_with_tokenizer():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert (4 == len(sentence.tokens))
    assert ('I' == sentence.tokens[0].text)
    assert ('love' == sentence.tokens[1].text)
    assert ('Berlin' == sentence.tokens[2].text)
    assert ('.' == sentence.tokens[3].text)
Example #11
    def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
        """
        Reads a file in CoNLL-U format and produces a list of Sentences with full morphosyntactic annotation
        :param path_to_conll_file: the path to the CoNLL-U file
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            fields: List[str] = re.split(r"\s+", line)
            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            elif line.startswith('#'):
                continue
            elif '.' in fields[0]:
                continue
            elif '-' in fields[0]:
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_tag('lemma', str(fields[2]))
                token.add_tag('upos', str(fields[3]))
                token.add_tag('pos', str(fields[4]))
                token.add_tag('dependency', str(fields[7]))

                for morph in str(fields[5]).split('|'):
                    if "=" not in morph: continue
                    token.add_tag(morph.split('=')[0].lower(), morph.split('=')[1])

                if len(fields) > 10 and str(fields[10]) == 'Y':
                    token.add_tag('frame', str(fields[11]))

                sentence.add_token(token)

        if len(sentence.tokens) > 0: sentences.append(sentence)

        return sentences
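A minimal usage sketch for read_conll_ud, again assuming it is a static method of NLPTaskDataFetcher; the .conllu path is hypothetical:

from flairrelex.data_fetcher import NLPTaskDataFetcher  # assumed module path

# Parse a CoNLL-U file; each Token carries lemma, upos, pos, dependency and
# morphological feature tags, plus a head_id used by get_head().
sentences = NLPTaskDataFetcher.read_conll_ud('resources/tasks/example.conllu')  # hypothetical path

first = sentences[0]
print(first.to_tagged_string())
for token in first.tokens:
    print(token.text, token.get_head())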
Example #12
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence('I love Berlin.', labels=['class_1'], use_tokenizer=True)
    dev_sentence = Sentence('The sun is shining.', labels=['class_2'], use_tokenizer=True)
    test_sentence = Sentence('Berlin is sunny.', labels=['class_1', 'class_2'], use_tokenizer=True)

    class_to_count_dict = TaggedCorpus._get_classes_to_count([train_sentence, dev_sentence, test_sentence])

    assert ('class_1' in class_to_count_dict)
    assert ('class_2' in class_to_count_dict)
    assert (2 == class_to_count_dict['class_1'])
    assert (2 == class_to_count_dict['class_2'])

    tokens_in_sentences = TaggedCorpus._get_tokens_per_sentence([train_sentence, dev_sentence, test_sentence])

    assert (3 == len(tokens_in_sentences))
    assert (4 == tokens_in_sentences[0])
    assert (5 == tokens_in_sentences[1])
    assert (4 == tokens_in_sentences[2])
Example #13
def test_tagged_corpus_downsample():
    sentence = Sentence('I love Berlin.', labels=[Label('class_1')], use_tokenizer=True)

    corpus: TaggedCorpus = TaggedCorpus(
        [sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence], [], [])

    assert (10 == len(corpus.train))

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert (3 == len(corpus.train))
Example #14
def load_and_apply_char_lm_embeddings(emb_type: str):
    text = 'I love Berlin.'
    sentence: Sentence = Sentence(text)
    embeddings: TokenEmbeddings = CharLMEmbeddings(emb_type)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert (len(token.get_embedding()) != 0)

        token.clear_embeddings()

        assert (len(token.get_embedding()) == 0)
Example #15
def test_sentence_to_tagged_string():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)
    token3.add_tag('ner', 'LOC')

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert ('I love Berlin <LOC>' == sentence.to_tagged_string())
Example #16
def test_get_head():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert (token2 == token3.get_head())
    assert (token1 == token2.get_head())
    assert (None == token1.get_head())
Example #17
def test_tag_sentence():

    # test tagging
    sentence = Sentence('I love Berlin')

    tagger = SequenceTagger.load('ner')

    tagger.predict(sentence)

    # test re-tagging
    tagger = SequenceTagger.load('pos')

    tagger.predict(sentence)
Example #18
def test_document_mean_embeddings():
    text = 'I love Berlin. Berlin is a great place to live.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings(
        [glove, charlm])

    embeddings.embed(sentence)

    assert (len(sentence.get_embedding()) != 0)

    sentence.clear_embeddings()

    assert (len(sentence.get_embedding()) == 0)
Example #19
def test_sentence_to_real_string():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)
    assert ('I love Berlin.' == sentence.to_plain_string())

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.GERMEVAL)

    sentence = corpus.train[0]
    assert (
                'Schartau sagte dem " Tagesspiegel " vom Freitag , Fischer sei " in einer Weise aufgetreten , die alles andere als überzeugend war " .' == sentence.to_tokenized_string())
    assert (
                'Schartau sagte dem "Tagesspiegel" vom Freitag, Fischer sei "in einer Weise aufgetreten, die alles andere als überzeugend war".' == sentence.to_plain_string())

    sentence = corpus.train[1]
    assert (
                'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter , als er einen fliegenden Händler aus dem Libanon traf .' == sentence.to_tokenized_string())
    assert (
                'Firmengründer Wolf Peter Bree arbeitete Anfang der siebziger Jahre als Möbelvertreter, als er einen fliegenden Händler aus dem Libanon traf.' == sentence.to_plain_string())
Example #20
def test_stacked_embeddings():
    text = 'I love Berlin.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    news: TokenEmbeddings = WordEmbeddings('en-news')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    embeddings: StackedEmbeddings = StackedEmbeddings([glove, news, charlm])

    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert (len(token.get_embedding()) != 0)

        token.clear_embeddings()

        assert (len(token.get_embedding()) == 0)
Example #21
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence('used in training. training is cool.', use_tokenizer=True)

    corpus: TaggedCorpus = TaggedCorpus([train_sentence], [], [])

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert (7 == len(vocab))

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())
Example #22
def test_text_classifier_mulit_label():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings(
        [glove_embedding])

    model = TextClassifier(document_embeddings, label_dict, True)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.name is not None)
            assert (0.0 <= l.confidence <= 1.0)
            assert (type(l.confidence) is float)

    # clean up results directory
    shutil.rmtree('./results')
Example #23
    def read_text_classification_file(path_to_file):
        """
        Reads a data file for text classification. The file should contain one document/text per line.
        The line should have the following format:
        __label__<class_name> <text>
        If you have a multi-label task, you can have as many labels as you want at the beginning of the line, e.g.,
        __label__<class_name_1> __label__<class_name_2> <text>
        :param path_to_file: the path to the data file
        :return: list of sentences
        """
        label_prefix = '__label__'
        sentences = []

        with open(path_to_file) as f:
            lines = f.readlines()

            for line in lines:
                words = line.split()

                labels = []
                l_len = 0

                for i in range(len(words)):
                    if words[i].startswith(label_prefix):
                        l_len += len(words[i]) + 1
                        label = words[i].replace(label_prefix, "")
                        labels.append(label)
                    else:
                        break

                text = line[l_len:].strip()

                if text and labels:
                    sentences.append(Sentence(text, labels=labels, use_tokenizer=True))

        return sentences
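A minimal usage sketch for read_text_classification_file, assuming it too is a static method of NLPTaskDataFetcher; the file path and its contents are hypothetical:

from flairrelex.data_fetcher import NLPTaskDataFetcher  # assumed module path

# Hypothetical FastText-style file, one document per line:
#   __label__positive I love Berlin.
#   __label__negative __label__rant The weather ruined the whole trip.
sentences = NLPTaskDataFetcher.read_text_classification_file(
    'resources/tasks/example_classification.txt')  # hypothetical path

for sentence in sentences:
    print(sentence.labels, sentence.to_plain_string())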
Example #24
from flairrelex.data import Sentence
from flairrelex.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger.load('ner')

sentence: Sentence = Sentence('George Washington went to Washington .')
tagger.predict(sentence)

print('Analysing %s' % sentence)
print('\nThe following NER tags are found: \n')
print(sentence.to_tagged_string())
Example #25
def test_sentence_to_plain_string():
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert ('I love Berlin .' == sentence.to_tokenized_string())