def test_example1(self):
     """Check transform_sentence on a four-token sentence that ends in a URL.

     The sentence id is 'p339_s003' and use_sentence_length is enabled.
     """
     # One (text, spacy_is_punct, spacy_like_url) triple per token, in order.
     token_specs = [
         (b'Kiten', False, False),
         (b'Master', False, False),
         (b':', True, False),
         (b'http://youtu.be/jVVD0OZk-6g', False, True),
     ]
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_specs, start=1)
     ]
     sentence = THFSentenceExport(
         'p339_s003', None, 'Kiten Master: http://youtu.be/jVVD0OZk-6g',
         tokens, None, 1)
     actual = structural_features_spacy.transform_sentence(sentence, True)
     token_count = len(tokens)
     expected = [
         3, 0.0 / token_count, 0.0 / token_count, 4, 1, 1, 0.0, 0.0, 0.0,
         1.0
     ]
     self.assertEqual(actual, expected)
예제 #2
0
 def test_extract_average_polarity_example1(self):
     """Check extract_average_polarity on three tokens, one without a value."""
     # Eighth positional Token argument per token; presumably the polarity
     # score -- confirm against the Token constructor.
     last_arguments = (0.5, None, 1.5)
     tokens = [
         Token(position, None, None, None, None, None, None, value)
         for position, value in enumerate(last_arguments, start=1)
     ]
     sentence = THFSentenceExport(None, None, None, tokens, None, 1)
     actual = polarity_sentiws_feature.extract_average_polarity(sentence)
     self.assertEqual(actual, [1.0])
예제 #3
0
 def test_count_polarity_bearing_tokens_example1(self):
     """Check count_polarity_bearing_tokens on three tokens, one without a value."""
     # Eighth positional Token argument per token; presumably the polarity
     # score -- confirm against the Token constructor.
     last_arguments = (0.5, None, 1.5)
     tokens = [
         Token(position, None, None, None, None, None, None, value)
         for position, value in enumerate(last_arguments, start=1)
     ]
     sentence = THFSentenceExport(None, None, None, tokens, None, 1)
     actual = (sentiws_polarity_bearing_tokens_feature
               .count_polarity_bearing_tokens(sentence))
     self.assertEqual(actual, [2])
 def test_example6(self):
     """Check transform_sentence on a nine-token sentence without any URL.

     The sentence id is 'p339_s003' and use_sentence_length is enabled.
     """
     # One (text, spacy_is_punct, spacy_like_url) triple per token, in order.
     token_specs = [
         (b'kleine', False, False),
         (b'Elektrodrohnen', False, False),
         (b'just', False, False),
         (b'for', False, False),
         (b'fun', False, False),
         (b',', True, False),
         (b'warum', False, False),
         (b'nicht', False, False),
         (b'.', True, False),
     ]
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_specs, start=1)
     ]
     sentence = THFSentenceExport(
         'p339_s003', None,
         'kleine Elektrodrohnen just for fun, warum nicht.',
         tokens, None, 1)
     actual = structural_features_spacy.transform_sentence(sentence, True)
     token_count = len(tokens)
     expected = [
         3, 1.0 / token_count, 1.0 / token_count, 9, 0, 2, 1.0, 0.0, 0.0,
         0.0
     ]
     self.assertEqual(actual, expected)
예제 #5
0
 def test_exclamation_mark_end(self):
     """Check transform_sentence on 'Test !' with use_sentence_length disabled."""
     # Only index and text are set; the remaining six positional Token
     # arguments are None.
     tokens = [
         Token(position, word, None, None, None, None, None, None)
         for position, word in enumerate(('Test', '!'), start=1)
     ]
     sentence = THFSentenceExport('p339_s007', None, 'Test !', tokens, None,
                                  1)
     actual = structural_features.transform_sentence(sentence, False)
     token_count = len(tokens)
     expected = [
         7, 0.0 / token_count, 0.0 / token_count, 0.0, 0.0, 1.0, 0.0, 0.0
     ]
     self.assertEqual(actual, expected)
 def test_example4(self):
     """Check transform_sentence on an eight-token sentence ending in a URL.

     Token 7 is a single space that is not flagged as punctuation. The
     sentence id is 'p339_s003' and use_sentence_length is enabled.
     """
     # One (text, spacy_is_punct, spacy_like_url) triple per token, in order.
     token_specs = [
         (b'Hier', False, False),
         (b'eine', False, False),
         (b'Konzept', False, False),
         (b'-', True, False),
         (b'Grafik', False, False),
         (b':', True, False),
         (b' ', False, False),
         (b'http://i.imgur.com/JGlqExO.jpg', False, True),
     ]
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_specs, start=1)
     ]
     sentence = THFSentenceExport(
         'p339_s003', None,
         'Hier eine Konzept-Grafik:  http://i.imgur.com/JGlqExO.jpg',
         tokens, None, 1)
     actual = structural_features_spacy.transform_sentence(sentence, True)
     token_count = len(tokens)
     expected = [
         3, 0.0 / token_count, 0.0 / token_count, 8, 1, 2, 0.0, 0.0, 0.0,
         1.0
     ]
     self.assertEqual(actual, expected)
예제 #7
0
def load_v3(file_path='data/THF/sentence/subtaskA_train.json',
            group_claims=True):
    """Load sentences from a THF JSON export that uses the "tokens" layout.

    Each entry of the JSON array is expected to provide "UniqueID", "Label",
    "Text", "TextDepth" and an "NLP" object holding "tokens" and
    "dependencies" lists. Every token/dependency dict is forwarded as keyword
    arguments to ``Token`` / ``Dependency``.

    :param file_path: path to the JSON file, opened as UTF-8
    :param group_claims: when true, the labels 'ClaimContra' and 'ClaimPro'
        are both replaced by 'Claim'
    :return: list of THFSentenceExport objects, one per entry in the file
    """
    logger.debug(u'Parsing JSON File: {}'.format(file_path))
    # json.load consumes the whole file, so it can be closed before the
    # models are built.
    with open(file_path, encoding='utf-8') as data_file:
        data = json.load(data_file)

    sentences = []
    for sentence in data:
        tokens = []
        for token in sentence["NLP"]["tokens"]:
            # The embedding is not passed on to Token. Popping with a default
            # keeps files whose tokens carry no embedding loadable instead of
            # failing with a KeyError.
            token.pop("embedding", None)
            tokens.append(Token(**token))

        dependencies = [
            Dependency(**dependency)
            for dependency in sentence["NLP"]["dependencies"]
        ]

        label = sentence["Label"]
        if group_claims and label in ('ClaimContra', 'ClaimPro'):
            label = 'Claim'

        sentences.append(
            THFSentenceExport(sentence["UniqueID"],
                              label,
                              sentence["Text"],
                              tokens,
                              dependencies,
                              textdepth=sentence["TextDepth"]))
    logger.info('Parsed {} sentences'.format(len(sentences)))
    return sentences
예제 #8
0
 def process_sentence(self, sentence):
     """Run ``self.nlp`` on ``sentence`` and build token and dependency models.

     For every token yielded by ``self.nlp(sentence)`` a lemma is requested
     from ``self.lemmatizer`` and a polarity value from ``self.sentiws``;
     both lookups receive the token text and its ``pos_`` tag. Token and
     dependency indices are the token's ``i`` attribute shifted by one.

     :param sentence: input passed unchanged to ``self.nlp``
     :return: dict with the keys 'tokens' (list of Token) and
         'dependencies' (list of Dependency)
     """
     token_models = []
     dependency_models = []
     for parsed in self.nlp(sentence):
         lemma = self.lemmatizer.lemmatize(parsed.text,
                                           pos_universal_google=parsed.pos_)
         polarity = self.sentiws.determine(parsed.text,
                                           pos_universal_google=parsed.pos_)
         token_models.append(
             Token(parsed.i + 1,
                   text=parsed.text,
                   spacy_pos_stts=parsed.tag_,
                   spacy_pos_universal_google=parsed.pos_,
                   iwnlp_lemma=lemma,
                   spacy_ner_type=parsed.ent_type_,
                   spacy_ner_iob=parsed.ent_iob_,
                   spacy_is_punct=parsed.is_punct,
                   spacy_is_space=parsed.is_space,
                   spacy_like_num=parsed.like_num,
                   spacy_like_url=parsed.like_url,
                   spacy_shape=parsed.shape_,
                   polarity_sentiws=polarity))
         # The dependency links this token to its head; both indices are
         # shifted by one, like the token index above.
         dependency_models.append(
             Dependency(parsed.i + 1, parsed.dep_, parsed.head.i + 1))
     return {'tokens': token_models, 'dependencies': dependency_models}
예제 #9
0
 def test_link(self):
     """Check transform_sentence on a word followed by a URL string.

     The sentence id is 'p339_s021' and use_sentence_length is disabled.
     """
     words = ('Test', 'http://umap.openstreetmap.fr')
     # Only index and text are set; the remaining six positional Token
     # arguments are None.
     tokens = [
         Token(position, word, None, None, None, None, None, None)
         for position, word in enumerate(words, start=1)
     ]
     sentence = THFSentenceExport('p339_s021', None,
                                  'Test http://umap.openstreetmap.fr',
                                  tokens, None, 1)
     actual = structural_features.transform_sentence(sentence, False)
     token_count = len(tokens)
     expected = [
         21, 0.0 / token_count, 0.0 / token_count, 1.0, 0.0, 0.0, 0.0, 1.0
     ]
     self.assertEqual(actual, expected)
예제 #10
0
 def test_count_different_ner_labels_example5(self):
     """Check count_different_ner_labels on a sentence with one LOC span.

     'New York' is tagged LOC (B, I); all other tokens have an empty NER
     type. The meaning of the three positions in the expected vector is not
     visible here -- presumably one slot per NER label; confirm against
     ner_feature.
     """
     # (text, universal POS tag, NER type, NER IOB tag) per token, in order.
     token_specs = [
         (b'Vorbild', 'NOUN', '', 'O'),
         (b'New', 'PROPN', 'LOC', 'B'),
         (b'York', 'PROPN', 'LOC', 'I'),
         (b':', 'PUNCT', '', 'O'),
         (b'http://www.houndsandpeople.com/de/magazin/kultur/new-york-city-dogs-teil-und-seele-der-weltmetropole/',
          'AUX', '', 'O'),
     ]
     tokens = [
         Token(position,
               raw,
               spacy_pos_universal_google=pos,
               spacy_ner_type=ner_type,
               spacy_ner_iob=iob)
         for position, (raw, pos, ner_type, iob) in enumerate(token_specs,
                                                              start=1)
     ]
     sentence = THFSentenceExport(
         None, None,
         "Vorbild New York: http://www.houndsandpeople.com/de/magazin/kultur/new-york-city-dogs-teil-und-seele-der-weltmetropole/",
         tokens, None, 1)
     actual = ner_feature.count_different_ner_labels(sentence.tokens)
     expected = np.array([0, 1, 0], dtype=np.float64)
     arrays_match = np.array_equal(actual, expected)
     self.assertEqual(arrays_match, True)
예제 #11
0
def load(file_path='data/THF/sentence/subtaskA_train.json', group_claims=True):
    """
    Loads the THF corpus from a JSON file.

    Only the first element of each entry's ``NLP.Sentences`` list is read.

    :param file_path: relative path to the JSON file, opened as UTF-8
    :param group_claims: when true, the labels 'ClaimContra' and 'ClaimPro'
        are both replaced by 'Claim'
    :return: list of THFSentenceExport objects, one per entry in the file
    """
    logger.debug(u'Parsing JSON File: {}'.format(file_path))
    parsed = []
    with open(file_path, encoding='utf-8') as handle:
        for entry in json.load(handle):
            # Token models come first, then the dependencies, both taken
            # from the first NLP sentence of the entry.
            tokens = [
                Token(raw["TokenIndexInSentence"],
                      raw["Text"],
                      pos_tag=raw["POSTag"],
                      mate_tools_pos_tag=raw["MateToolsPPOS"],
                      mate_tools_lemma=raw["MateToolsPLemma"],
                      # The three optional fields default to None when the
                      # key is absent and are handed to their parse helper.
                      tree_tagger_lemma=parse_tree_tagger_lemma(
                          raw.get("TreeTaggerLemma", None)),
                      iwnlp_lemma=parse_IWNLP_lemma(
                          raw.get("IWNLPLemma", None)),
                      polarity=parse_polarity(raw.get("Polarity", None)))
                for raw in entry["NLP"]["Sentences"][0]["Tokens"]
            ]
            dependencies = [
                Dependency(dep["TokenID"], dep["DependencyRelation"],
                           dep["DependencyHeadTokenID"])
                for dep in entry["NLP"]["Sentences"][0]["Dependencies"]
            ]
            label = entry["Label"]
            if group_claims and (label == 'ClaimContra'
                                 or label == 'ClaimPro'):
                label = 'Claim'
            parsed.append(
                THFSentenceExport(entry["UniqueID"], label, entry["Text"],
                                  tokens, dependencies))
    logger.info('Parsed {} sentences'.format(len(parsed)))
    return parsed
예제 #12
0
 def test_example2_without_sentence_length(self):
     """Check transform_sentence on six tokens with use_sentence_length off.

     The sentence id is 'p339_s021'.
     """
     words = ('Das', 'ist', '.', ',', 'Test', '!')
     # Only index and text are set; the remaining six positional Token
     # arguments are None.
     tokens = [
         Token(position, word, None, None, None, None, None, None)
         for position, word in enumerate(words, start=1)
     ]
     sentence = THFSentenceExport('p339_s021', None, 'Das ist . , Test!',
                                  tokens, None, 1)
     actual = structural_features.transform_sentence(sentence, False)
     token_count = len(tokens)
     expected = [
         21, 1.0 / token_count, 1.0 / token_count, 0.0, 0.0, 1.0, 0.0, 0.0
     ]
     self.assertEqual(actual, expected)
예제 #13
0
 def test_example1(self):
     """Check transform_sentence on six tokens with use_sentence_length on.

     The sentence id is 'c0331_s003'.
     """
     words = ('Das', 'ist', 'ein', ',', 'Test', '!')
     # Only index and text are set; the remaining six positional Token
     # arguments are None.
     tokens = [
         Token(position, word, None, None, None, None, None, None)
         for position, word in enumerate(words, start=1)
     ]
     sentence = THFSentenceExport('c0331_s003', None, 'Das ist ein , Test!',
                                  tokens, None, 1)
     actual = structural_features.transform_sentence(sentence, True)
     token_count = len(tokens)
     expected = [
         3, 1.0 / token_count, 0.0 / token_count, 1.0 * token_count, 0.0,
         0.0, 1.0, 0.0, 0.0
     ]
     self.assertEqual(actual, expected)
 def test_example5(self):
     """Check transform_sentence on eleven tokens with a URL in parentheses.

     The sentence id is 'p339_s003' and use_sentence_length is enabled.
     """
     # One (text, spacy_is_punct, spacy_like_url) triple per token, in order.
     token_specs = [
         (b'Tempelhofparikram', False, False),
         (b'-', True, False),
         (b'Interreligi\xc3\xb6ser', False, False),
         (b'Pilgerpfad', False, False),
         (b'auf', False, False),
         (b'dem', False, False),
         (b'Tempelhofer', False, False),
         (b'Feld', False, False),
         (b'(', True, False),
         (b'http://lebensplan.com/Interreligioeser-Pilgerpfad.pdf', False,
          True),
         (b')', True, False),
     ]
     tokens = [
         Token(position, raw, spacy_is_punct=punct, spacy_like_url=url)
         for position, (raw, punct, url) in enumerate(token_specs, start=1)
     ]
     sentence_text = 'Tempelhofparikram - Interreligi\u00f6ser Pilgerpfad auf dem Tempelhofer Feld (http://lebensplan.com/Interreligioeser-Pilgerpfad.pdf)'
     sentence = THFSentenceExport('p339_s003', None, sentence_text, tokens,
                                  None, 1)
     actual = structural_features_spacy.transform_sentence(sentence, True)
     token_count = len(tokens)
     expected = [
         3, 0.0 / token_count, 0.0 / token_count, 11, 1, 3, 0.0, 0.0, 0.0,
         1.0
     ]
     self.assertEqual(actual, expected)
예제 #15
0
    def test_count_different_ner_labels_example1(self):
        """Check count_different_ner_labels on a sentence with one PERSON token.

        Only 'Baluschek' carries a NER type (PERSON, B); every other token
        has an empty type. The meaning of the three positions in the
        expected vector is not visible here -- presumably one slot per NER
        label; confirm against ner_feature.
        """
        # (text, universal POS tag, NER type, NER IOB tag) per token, in order.
        token_specs = [
            (b'Wenn', 'SCONJ', '', 'O'),
            (b'ich', 'PRON', '', 'O'),
            (b'durch', 'ADP', '', 'O'),
            (b'den', 'DET', '', 'O'),
            (b'Hans', 'PROPN', '', 'O'),
            (b'-', 'PUNCT', '', 'O'),
            (b'Baluschek', 'PROPN', 'PERSON', 'B'),
            (b'-', 'PUNCT', '', 'O'),
            (b'Park', 'NOUN', '', 'O'),
            (b'radle', 'VERB', '', 'O'),
            (b',', 'PUNCT', '', 'O'),
            (b'riecht', 'VERB', '', 'O'),
            (b'es', 'PRON', '', 'O'),
            (b'immer', 'ADV', '', 'O'),
            (b'stark', 'ADJ', '', 'O'),
            (b'vom', 'ADP', '', 'O'),
            (b'angrenzenden', 'ADJ', '', 'O'),
            (b'S\xc3\xbcdgel\xc3\xa4nde', 'NOUN', '', 'O'),
            (b'nach', 'ADP', '', 'O'),
            (b'Farbd\xc3\xbcnsten', 'NOUN', '', 'O'),
            (b'und', 'CONJ', '', 'O'),
            (b'das', 'PRON', '', 'O'),
            (b'passt', 'VERB', '', 'O'),
            (b'nicht', 'PART', '', 'O'),
            (b'wirklich', 'ADJ', '', 'O'),
            (b'zum', 'ADP', '', 'O'),
            (b'Naturschutz', 'NOUN', '', 'O'),
            (b'.', 'PUNCT', '', 'O'),
        ]
        tokens = [
            Token(position,
                  raw,
                  spacy_pos_universal_google=pos,
                  spacy_ner_type=ner_type,
                  spacy_ner_iob=iob)
            for position, (raw, pos, ner_type, iob) in enumerate(token_specs,
                                                                 start=1)
        ]

        sentence = THFSentenceExport(
            None, None,
            "Wenn ich durch den Hans-Baluschek-Park radle, riecht es immer stark vom angrenzenden Südgelände nach Farbdünsten und das passt nicht wirklich zum Naturschutz.",
            tokens, None, 1)
        actual = ner_feature.count_different_ner_labels(sentence.tokens)
        expected = np.array([1, 0, 0], dtype=np.float64)
        arrays_match = np.array_equal(actual, expected)
        self.assertEqual(arrays_match, True)
예제 #16
0
 def test_count_different_ner_labels_example4(self):
     """Check count_different_ner_labels on a sentence with one PERSON span.

     'Hermann Hesse' is tagged PERSON (B, I); every other token has an
     empty NER type. The meaning of the three positions in the expected
     vector is not visible here -- presumably one slot per NER label;
     confirm against ner_feature.
     """
     # (text, universal POS tag, NER type, NER IOB tag) per token, in order.
     token_specs = [
         (b'Oder', 'CONJ', '', 'O'),
         (b',', 'PUNCT', '', 'O'),
         (b'um', 'SCONJ', '', 'O'),
         (b'mit', 'ADP', '', 'O'),
         (b'Hermann', 'PROPN', 'PERSON', 'B'),
         (b'Hesse', 'PROPN', 'PERSON', 'I'),
         (b'zu', 'PART', '', 'O'),
         (b'sprechen', 'VERB', '', 'O'),
         (b':', 'PUNCT', '', 'O'),
         (b'Jedem', 'DET', '', 'O'),
         (b'Ende', 'NOUN', '', 'O'),
         (b'wohnt', 'VERB', '', 'O'),
         (b'ein', 'DET', '', 'O'),
         (b'neuer', 'ADJ', '', 'O'),
         (b'Anfang', 'NOUN', '', 'O'),
         (b'inne', 'PART', '', 'O'),
         (b'.', 'PUNCT', '', 'O'),
     ]
     tokens = [
         Token(position,
               raw,
               spacy_pos_universal_google=pos,
               spacy_ner_type=ner_type,
               spacy_ner_iob=iob)
         for position, (raw, pos, ner_type, iob) in enumerate(token_specs,
                                                              start=1)
     ]
     sentence = THFSentenceExport(
         None, None,
         "Oder, um mit Hermann Hesse zu sprechen: Jedem Ende wohnt ein neuer Anfang inne.",
         tokens, None, 1)
     actual = ner_feature.count_different_ner_labels(sentence.tokens)
     expected = np.array([1, 0, 0], dtype=np.float64)
     arrays_match = np.array_equal(actual, expected)
     self.assertEqual(arrays_match, True)
예제 #17
0
 def test_count_different_ner_labels_example3(self):
     """Check count_different_ner_labels on a sentence with one LOC span.

     'Tempelhofer Feld' is tagged LOC (B, I); every other token has an
     empty NER type. The meaning of the three positions in the expected
     vector is not visible here -- presumably one slot per NER label;
     confirm against ner_feature.
     """
     # (text, universal POS tag, NER type, NER IOB tag) per token, in order.
     token_specs = [
         (b'Auf', 'ADP', '', 'O'),
         (b'dem', 'DET', '', 'O'),
         (b'Tempelhofer', 'ADJ', 'LOC', 'B'),
         (b'Feld', 'NOUN', 'LOC', 'I'),
         (b'stehen', 'VERB', '', 'O'),
         (b'22', 'NUM', '', 'O'),
         (b'kleinere', 'ADJ', '', 'O'),
         (b'Geb\xc3\xa4ude', 'NOUN', '', 'O'),
         (b',', 'PUNCT', '', 'O'),
         (b'gr\xc3\xb6\xc3\x9ftenteils', 'ADV', '', 'O'),
         (b'ungenutzt', 'ADJ', '', 'O'),
         (b'.', 'PUNCT', '', 'O'),
     ]
     tokens = [
         Token(position,
               raw,
               spacy_pos_universal_google=pos,
               spacy_ner_type=ner_type,
               spacy_ner_iob=iob)
         for position, (raw, pos, ner_type, iob) in enumerate(token_specs,
                                                              start=1)
     ]
     sentence = THFSentenceExport(
         None, None,
         "Auf dem Tempelhofer Feld stehen 22 kleinere Gebäude, größtenteils ungenutzt.",
         tokens, None, 1)
     actual = ner_feature.count_different_ner_labels(sentence.tokens)
     expected = np.array([0, 1, 0], dtype=np.float64)
     arrays_match = np.array_equal(actual, expected)
     self.assertEqual(arrays_match, True)
예제 #18
0
 def test_count_different_ner_labels_example2(self):
     tokens = []
     tokens.append(
         Token(1,
               b'Bei',
               spacy_pos_universal_google='ADP',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(2,
               b'der',
               spacy_pos_universal_google='DET',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(3,
               b'Stauraumplanung',
               spacy_pos_universal_google='NOUN',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(4,
               b'wird',
               spacy_pos_universal_google='AUX',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(5,
               b'es',
               spacy_pos_universal_google='PRON',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(6,
               b'aus',
               spacy_pos_universal_google='ADP',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(7,
               b'allen',
               spacy_pos_universal_google='DET',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(8,
               b'Gullis',
               spacy_pos_universal_google='NOUN',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(9,
               b'demn\xc3\xa4chst',
               spacy_pos_universal_google='ADV',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(10,
               b'stinken',
               spacy_pos_universal_google='VERB',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(11,
               b'und',
               spacy_pos_universal_google='CONJ',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(12,
               b'Abwaaserlagerung',
               spacy_pos_universal_google='NOUN',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(13,
               b'in',
               spacy_pos_universal_google='ADP',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(14,
               b'Schiffen',
               spacy_pos_universal_google='NOUN',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(15,
               b'auf',
               spacy_pos_universal_google='ADP',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(16,
               b'der',
               spacy_pos_universal_google='DET',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(17,
               b'Spree',
               spacy_pos_universal_google='PROPN',
               spacy_ner_type='LOC',
               spacy_ner_iob='B'))
     tokens.append(
         Token(18,
               b'd\xc3\xbcrfte',
               spacy_pos_universal_google='VERB',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(19,
               b'auch',
               spacy_pos_universal_google='ADV',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(20,
               b'nicht',
               spacy_pos_universal_google='PART',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(21,
               b'gesund',
               spacy_pos_universal_google='ADJ',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(22,
               b'sein',
               spacy_pos_universal_google='AUX',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     tokens.append(
         Token(23,
               b'.',
               spacy_pos_universal_google='PUNCT',
               spacy_ner_type='',
               spacy_ner_iob='O'))
     thf_sentence = THFSentenceExport(
         None, None,
         "Bei der Stauraumplanung wird es aus allen Gullis demnächst stinken und Abwaaserlagerung in Schiffen auf der Spree dürfte auch nicht gesund sein.",
         tokens, None, 1)
     feature_value = ner_feature.count_different_ner_labels(
         thf_sentence.tokens)
     expected_value = np.array([0, 1, 0], dtype=np.float64)
     self.assertEqual(np.array_equal(feature_value, expected_value), True)
 def test_example2(self):
     """Check transform_sentence on a 20-token sentence with a bracketed URL.

     The fixture has 20 tokens: 7 flagged as punctuation and 1 flagged as
     URL-like. The expected vector carries those same numbers (20, 1, 7)
     at positions 3-5, so those slots presumably hold the token, URL and
     punctuation counts -- confirm against structural_features_spacy.
     """
     # One row per token, in sentence order:
     # (surface bytes, spacy_is_punct, spacy_like_url)
     token_specs = [
         (b'Diesen', False, False),
         (b'Vorschlag', False, False),
         (b'gibt', False, False),
         (b'es', False, False),
         (b'schon', False, False),
         (b':', True, False),
         (b'S\xc3\xbcdlicher', False, False),
         (b'Zugang', False, False),
         (b'zur', False, False),
         (b'Oberlandstra\xc3\x9fe', False, False),
         (b'(', True, False),
         (b'Hatun', False, False),
         (b'-', True, False),
         (b'S\xc3\xbcr\xc3\xbcc\xc3\xbc', False, False),
         (b'-', True, False),
         (b'Br\xc3\xbccke', False, False),
         (b')', True, False),
         (b'(', True, False),
         (b'https://tempelhofer-feld.berlin.de/i/tempelhofer-feld/proposal/104-S%C3%BCdlicher_Zugang_zur_Oberlandstra%C3%9Fe_Hatu',
          False, True),
         (b')', True, False),
     ]
     # Token ids are 1-based positions within the sentence.
     tokens = [
         Token(position,
               surface,
               spacy_is_punct=is_punct,
               spacy_like_url=is_url)
         for position, (surface, is_punct, is_url) in enumerate(
             token_specs, start=1)
     ]

     sentence_id = 'p339_s003'
     raw_text = 'Diesen Vorschlag gibt es schon: S\u00fcdlicher Zugang zur Oberlandstra\u00dfe (Hatun-S\u00fcr\u00fcc\u00fc-Br\u00fccke) (https://tempelhofer-feld.berlin.de/i/tempelhofer-feld/proposal/104-S%C3%BCdlicher_Zugang_zur_Oberlandstra%C3%9Fe_Hatu)'
     sentence = THFSentenceExport(sentence_id, None, raw_text, tokens, None,
                                  1)

     with_length = True
     actual = structural_features_spacy.transform_sentence(
         sentence, with_length)

     # Both ratio slots are zero counts divided by the sentence length.
     zero_ratio = 0.0 / len(tokens)
     expected = [3, zero_ratio, zero_ratio, 20, 1, 7, 0.0, 0.0, 0.0, 1.0]
     self.assertEqual(actual, expected)
 def test_example3(self):
     """Check transform_sentence on a 14-token sentence ending in a URL.

     The fixture has 14 tokens: 4 flagged as punctuation and 1 flagged as
     URL-like. The expected vector carries those same numbers (14, 1, 4)
     at positions 3-5, so those slots presumably hold the token, URL and
     punctuation counts -- confirm against structural_features_spacy.
     """
     # One row per token, in sentence order:
     # (surface bytes, spacy_is_punct, spacy_like_url)
     token_specs = [
         (b'BM', False, False),
         (b'Tester', False, False),
         (b'#', True, False),
         (b'1', False, False),
         (b':', True, False),
         (b'Kite', False, False),
         (b'-', True, False),
         (b'Skaten', False, False),
         (b'auf', False, False),
         (b'dem', False, False),
         (b'Tempelhofer', False, False),
         (b'Feld', False, False),
         (b':', True, False),
         (b'http://youtu.be/Jf68D61QN4A', False, True),
     ]
     # Token ids are 1-based positions within the sentence.
     tokens = [
         Token(position,
               surface,
               spacy_is_punct=is_punct,
               spacy_like_url=is_url)
         for position, (surface, is_punct, is_url) in enumerate(
             token_specs, start=1)
     ]

     sentence_id = 'p339_s003'
     raw_text = 'BM Tester #1: Kite-Skaten auf dem Tempelhofer Feld: http://youtu.be/Jf68D61QN4A'
     sentence = THFSentenceExport(sentence_id, None, raw_text, tokens, None,
                                  1)

     with_length = True
     actual = structural_features_spacy.transform_sentence(
         sentence, with_length)

     # Both ratio slots are zero counts divided by the sentence length.
     zero_ratio = 0.0 / len(tokens)
     expected = [3, zero_ratio, zero_ratio, 14, 1, 4, 0.0, 0.0, 0.0, 1.0]
     self.assertEqual(actual, expected)