def test_example1(self):
    tokens = []
    tokens.append(Token(1, b'Kiten', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(2, b'Master', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(3, b':', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(4, b'http://youtu.be/jVVD0OZk-6g', spacy_is_punct=False, spacy_like_url=True))
    sentence_uniqueID = 'p339_s003'
    text = 'Kiten Master: http://youtu.be/jVVD0OZk-6g'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, text, tokens, None, 1)
    use_sentence_length = True
    feature_value = structural_features_spacy.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [3, 0.0 / len(tokens), 0.0 / len(tokens), 4, 1, 1, 0.0, 0.0, 0.0, 1.0]
    self.assertEqual(feature_value, expected_value)
def test_extract_average_polarity_example1(self):
    tokens = []
    tokens.append(Token(1, None, None, None, None, None, None, 0.5))
    tokens.append(Token(2, None, None, None, None, None, None, None))
    tokens.append(Token(3, None, None, None, None, None, None, 1.5))
    thf_sentence = THFSentenceExport(None, None, None, tokens, None, 1)
    feature_value = polarity_sentiws_feature.extract_average_polarity(thf_sentence)
    expected_value = [1.0]
    self.assertEqual(feature_value, expected_value)
def test_count_polarity_bearing_tokens_example1(self):
    tokens = []
    tokens.append(Token(1, None, None, None, None, None, None, 0.5))
    tokens.append(Token(2, None, None, None, None, None, None, None))
    tokens.append(Token(3, None, None, None, None, None, None, 1.5))
    thf_sentence = THFSentenceExport(None, None, None, tokens, None, 1)
    feature_value = sentiws_polarity_bearing_tokens_feature.count_polarity_bearing_tokens(thf_sentence)
    expected_value = [2]
    self.assertEqual(feature_value, expected_value)
def test_example6(self):
    tokens = []
    tokens.append(Token(1, b'kleine', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(2, b'Elektrodrohnen', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(3, b'just', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(4, b'for', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(5, b'fun', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(6, b',', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(7, b'warum', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(8, b'nicht', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(9, b'.', spacy_is_punct=True, spacy_like_url=False))
    sentence_uniqueID = 'p339_s003'
    text = 'kleine Elektrodrohnen just for fun, warum nicht.'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, text, tokens, None, 1)
    use_sentence_length = True
    feature_value = structural_features_spacy.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [3, 1.0 / len(tokens), 1.0 / len(tokens), 9, 0, 2, 1.0, 0.0, 0.0, 0.0]
    self.assertEqual(feature_value, expected_value)
def test_exclamation_mark_end(self):
    tokens = []
    tokens.append(Token(1, 'Test', None, None, None, None, None, None))
    tokens.append(Token(2, '!', None, None, None, None, None, None))
    sentence_uniqueID = 'p339_s007'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, 'Test !', tokens, None, 1)
    use_sentence_length = False
    feature_value = structural_features.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [7, 0.0 / len(tokens), 0.0 / len(tokens), 0.0, 0.0, 1.0, 0.0, 0.0]
    self.assertEqual(feature_value, expected_value)
def test_example4(self):
    tokens = []
    tokens.append(Token(1, b'Hier', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(2, b'eine', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(3, b'Konzept', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(4, b'-', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(5, b'Grafik', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(6, b':', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(7, b' ', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(8, b'http://i.imgur.com/JGlqExO.jpg', spacy_is_punct=False, spacy_like_url=True))
    sentence_uniqueID = 'p339_s003'
    text = 'Hier eine Konzept-Grafik: http://i.imgur.com/JGlqExO.jpg'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, text, tokens, None, 1)
    use_sentence_length = True
    feature_value = structural_features_spacy.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [3, 0.0 / len(tokens), 0.0 / len(tokens), 8, 1, 2, 0.0, 0.0, 0.0, 1.0]
    self.assertEqual(feature_value, expected_value)
def load_v3(file_path='data/THF/sentence/subtaskA_train.json', group_claims=True):
    logger.debug(u'Parsing JSON File: {}'.format(file_path))
    sentences = []
    with open(file_path, encoding='utf-8') as data_file:
        data = json.load(data_file)
        for sentence in data:
            sentence_tokens = sentence["NLP"]["tokens"]
            tokens = []
            for token in sentence_tokens:
                token.pop("embedding")
                token_model = Token(**token)
                tokens.append(token_model)
            dependencies = []
            dependency_tokens = sentence["NLP"]["dependencies"]
            for dependency in dependency_tokens:
                dependency_model = Dependency(**dependency)
                dependencies.append(dependency_model)
            label = sentence["Label"]
            if group_claims:
                if label == 'ClaimContra' or label == 'ClaimPro':
                    label = 'Claim'
            sentence_model = THFSentenceExport(sentence["UniqueID"], label, sentence["Text"], tokens,
                                               dependencies, textdepth=sentence["TextDepth"])
            sentences.append(sentence_model)
    logger.info('Parsed {} sentences'.format(len(sentences)))
    return sentences
def process_sentence(self, sentence):
    result = self.nlp(sentence)
    tokens = []
    dependencies = []
    for token in result:
        iwnlp_lemma = self.lemmatizer.lemmatize(token.text, pos_universal_google=token.pos_)
        sentiws = self.sentiws.determine(token.text, pos_universal_google=token.pos_)
        token_model = Token(token.i + 1,
                            text=token.text,
                            spacy_pos_stts=token.tag_,
                            spacy_pos_universal_google=token.pos_,
                            iwnlp_lemma=iwnlp_lemma,
                            spacy_ner_type=token.ent_type_,
                            spacy_ner_iob=token.ent_iob_,
                            spacy_is_punct=token.is_punct,
                            spacy_is_space=token.is_space,
                            spacy_like_num=token.like_num,
                            spacy_like_url=token.like_url,
                            spacy_shape=token.shape_,
                            polarity_sentiws=sentiws)
        tokens.append(token_model)
        dependency_model = Dependency(token.i + 1, token.dep_, token.head.i + 1)
        dependencies.append(dependency_model)
        # print(token_model.token_index_in_sentence, token_model.text.encode('utf-8'),
        #       format_iwnlp_lemma(token_model.iwnlp_lemma), token_model.spacy_pos_stts,
        #       token_model.spacy_pos_universal_google, token_model.spacy_ner_type, token_model.spacy_ner_iob)
    return {'tokens': tokens, 'dependencies': dependencies}
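# Illustrative usage sketch (not part of the original module): process_sentence above
# implies a wrapper object that holds a German spaCy pipeline in self.nlp plus IWNLP
# and SentiWS wrappers in self.lemmatizer and self.sentiws. The class name
# SpacyProcessor below is hypothetical, and the attribute names read from the Token
# models are assumed to mirror the keyword arguments passed to Token().
processor = SpacyProcessor()  # hypothetical wrapper class owning process_sentence
result = processor.process_sentence('Das Tempelhofer Feld soll frei bleiben.')
for token_model in result['tokens']:
    # assumed attributes: text, spacy_pos_universal_google, polarity_sentiws
    print(token_model.text, token_model.spacy_pos_universal_google, token_model.polarity_sentiws)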
def test_link(self):
    tokens = []
    tokens.append(Token(1, 'Test', None, None, None, None, None, None))
    tokens.append(Token(2, 'http://umap.openstreetmap.fr', None, None, None, None, None, None))
    sentence_uniqueID = 'p339_s021'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, 'Test http://umap.openstreetmap.fr', tokens, None, 1)
    use_sentence_length = False
    feature_value = structural_features.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [21, 0.0 / len(tokens), 0.0 / len(tokens), 1.0, 0.0, 0.0, 0.0, 1.0]
    self.assertEqual(feature_value, expected_value)
def test_count_different_ner_labels_example5(self):
    tokens = []
    tokens.append(Token(1, b'Vorbild', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(2, b'New', spacy_pos_universal_google='PROPN', spacy_ner_type='LOC', spacy_ner_iob='B'))
    tokens.append(Token(3, b'York', spacy_pos_universal_google='PROPN', spacy_ner_type='LOC', spacy_ner_iob='I'))
    tokens.append(Token(4, b':', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(5, b'http://www.houndsandpeople.com/de/magazin/kultur/new-york-city-dogs-teil-und-seele-der-weltmetropole/',
                        spacy_pos_universal_google='AUX', spacy_ner_type='', spacy_ner_iob='O'))
    thf_sentence = THFSentenceExport(
        None, None,
        "Vorbild New York: http://www.houndsandpeople.com/de/magazin/kultur/new-york-city-dogs-teil-und-seele-der-weltmetropole/",
        tokens, None, 1)
    feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
    expected_value = np.array([0, 1, 0], dtype=np.float64)
    self.assertTrue(np.array_equal(feature_value, expected_value))
def load(file_path='data/THF/sentence/subtaskA_train.json', group_claims=True):
    """
    Loads the THF corpus from a JSON file.
    :param file_path: relative path to the JSON file
    :param group_claims: if True, merge the 'ClaimContra' and 'ClaimPro' labels into 'Claim'
    :return: list of THFSentenceExport models
    """
    logger.debug(u'Parsing JSON File: {}'.format(file_path))
    sentences = []
    with open(file_path, encoding='utf-8') as data_file:
        data = json.load(data_file)
        for sentence in data:
            sentence_tokens = sentence["NLP"]["Sentences"][0]["Tokens"]
            tokens = []
            dependencies = []
            for token in sentence_tokens:
                token_model = Token(token["TokenIndexInSentence"],
                                    token["Text"],
                                    pos_tag=token["POSTag"],
                                    mate_tools_pos_tag=token["MateToolsPPOS"],
                                    mate_tools_lemma=token["MateToolsPLemma"],
                                    tree_tagger_lemma=parse_tree_tagger_lemma(token.get("TreeTaggerLemma", None)),
                                    iwnlp_lemma=parse_IWNLP_lemma(token.get("IWNLPLemma", None)),
                                    polarity=parse_polarity(token.get("Polarity", None)))
                tokens.append(token_model)
            dependency_tokens = sentence["NLP"]["Sentences"][0]["Dependencies"]
            for dependency in dependency_tokens:
                dependency_model = Dependency(dependency["TokenID"],
                                              dependency["DependencyRelation"],
                                              dependency["DependencyHeadTokenID"])
                dependencies.append(dependency_model)
            label = sentence["Label"]
            if group_claims:
                if label == 'ClaimContra' or label == 'ClaimPro':
                    label = 'Claim'
            sentence_model = THFSentenceExport(sentence["UniqueID"], label, sentence["Text"], tokens, dependencies)
            sentences.append(sentence_model)
    logger.info('Parsed {} sentences'.format(len(sentences)))
    return sentences
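# Illustrative usage sketch (not part of the original module): load the corpus with
# claim grouping enabled and count the resulting labels. The `label` attribute name on
# THFSentenceExport is an assumption here; the constructor receives the label as its
# second positional argument.
from collections import Counter

corpus = load('data/THF/sentence/subtaskA_train.json', group_claims=True)
print(Counter(sentence.label for sentence in corpus))  # assumed attribute: label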
def test_example2_without_sentence_length(self):
    tokens = []
    tokens.append(Token(1, 'Das', None, None, None, None, None, None))
    tokens.append(Token(2, 'ist', None, None, None, None, None, None))
    tokens.append(Token(3, '.', None, None, None, None, None, None))
    tokens.append(Token(4, ',', None, None, None, None, None, None))
    tokens.append(Token(5, 'Test', None, None, None, None, None, None))
    tokens.append(Token(6, '!', None, None, None, None, None, None))
    sentence_uniqueID = 'p339_s021'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, 'Das ist . , Test!', tokens, None, 1)
    use_sentence_length = False
    feature_value = structural_features.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [21, 1.0 / len(tokens), 1.0 / len(tokens), 0.0, 0.0, 1.0, 0.0, 0.0]
    self.assertEqual(feature_value, expected_value)
def test_example1(self):
    tokens = []
    tokens.append(Token(1, 'Das', None, None, None, None, None, None))
    tokens.append(Token(2, 'ist', None, None, None, None, None, None))
    tokens.append(Token(3, 'ein', None, None, None, None, None, None))
    tokens.append(Token(4, ',', None, None, None, None, None, None))
    tokens.append(Token(5, 'Test', None, None, None, None, None, None))
    tokens.append(Token(6, '!', None, None, None, None, None, None))
    sentence_uniqueID = 'c0331_s003'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, 'Das ist ein , Test!', tokens, None, 1)
    use_sentence_length = True
    feature_value = structural_features.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [3, 1.0 / len(tokens), 0.0 / len(tokens), 1.0 * len(tokens), 0.0, 0.0, 1.0, 0.0, 0.0]
    self.assertEqual(feature_value, expected_value)
def test_example5(self):
    tokens = []
    tokens.append(Token(1, b'Tempelhofparikram', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(2, b'-', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(3, b'Interreligi\xc3\xb6ser', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(4, b'Pilgerpfad', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(5, b'auf', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(6, b'dem', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(7, b'Tempelhofer', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(8, b'Feld', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(9, b'(', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(10, b'http://lebensplan.com/Interreligioeser-Pilgerpfad.pdf', spacy_is_punct=False, spacy_like_url=True))
    tokens.append(Token(11, b')', spacy_is_punct=True, spacy_like_url=False))
    sentence_uniqueID = 'p339_s003'
    text = 'Tempelhofparikram - Interreligi\u00f6ser Pilgerpfad auf dem Tempelhofer Feld (http://lebensplan.com/Interreligioeser-Pilgerpfad.pdf)'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, text, tokens, None, 1)
    use_sentence_length = True
    feature_value = structural_features_spacy.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [3, 0.0 / len(tokens), 0.0 / len(tokens), 11, 1, 3, 0.0, 0.0, 0.0, 1.0]
    self.assertEqual(feature_value, expected_value)
def test_count_different_ner_labels_example1(self):
    tokens = []
    tokens.append(Token(1, b'Wenn', spacy_pos_universal_google='SCONJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(2, b'ich', spacy_pos_universal_google='PRON', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(3, b'durch', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(4, b'den', spacy_pos_universal_google='DET', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(5, b'Hans', spacy_pos_universal_google='PROPN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(6, b'-', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(7, b'Baluschek', spacy_pos_universal_google='PROPN', spacy_ner_type='PERSON', spacy_ner_iob='B'))
    tokens.append(Token(8, b'-', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(9, b'Park', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(10, b'radle', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(11, b',', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(12, b'riecht', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(13, b'es', spacy_pos_universal_google='PRON', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(14, b'immer', spacy_pos_universal_google='ADV', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(15, b'stark', spacy_pos_universal_google='ADJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(16, b'vom', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(17, b'angrenzenden', spacy_pos_universal_google='ADJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(18, b'S\xc3\xbcdgel\xc3\xa4nde', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(19, b'nach', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(20, b'Farbd\xc3\xbcnsten', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(21, b'und', spacy_pos_universal_google='CONJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(22, b'das', spacy_pos_universal_google='PRON', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(23, b'passt', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(24, b'nicht', spacy_pos_universal_google='PART', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(25, b'wirklich', spacy_pos_universal_google='ADJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(26, b'zum', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(27, b'Naturschutz', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(28, b'.', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    thf_sentence = THFSentenceExport(
        None, None,
        "Wenn ich durch den Hans-Baluschek-Park radle, riecht es immer stark vom angrenzenden Südgelände nach Farbdünsten und das passt nicht wirklich zum Naturschutz.",
        tokens, None, 1)
    feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
    expected_value = np.array([1, 0, 0], dtype=np.float64)
    self.assertTrue(np.array_equal(feature_value, expected_value))
def test_count_different_ner_labels_example4(self):
    tokens = []
    tokens.append(Token(1, b'Oder', spacy_pos_universal_google='CONJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(2, b',', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(3, b'um', spacy_pos_universal_google='SCONJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(4, b'mit', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(5, b'Hermann', spacy_pos_universal_google='PROPN', spacy_ner_type='PERSON', spacy_ner_iob='B'))
    tokens.append(Token(6, b'Hesse', spacy_pos_universal_google='PROPN', spacy_ner_type='PERSON', spacy_ner_iob='I'))
    tokens.append(Token(7, b'zu', spacy_pos_universal_google='PART', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(8, b'sprechen', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(9, b':', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(10, b'Jedem', spacy_pos_universal_google='DET', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(11, b'Ende', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(12, b'wohnt', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(13, b'ein', spacy_pos_universal_google='DET', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(14, b'neuer', spacy_pos_universal_google='ADJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(15, b'Anfang', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(16, b'inne', spacy_pos_universal_google='PART', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(17, b'.', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    thf_sentence = THFSentenceExport(
        None, None,
        "Oder, um mit Hermann Hesse zu sprechen: Jedem Ende wohnt ein neuer Anfang inne.",
        tokens, None, 1)
    feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
    expected_value = np.array([1, 0, 0], dtype=np.float64)
    self.assertTrue(np.array_equal(feature_value, expected_value))
def test_count_different_ner_labels_example3(self):
    tokens = []
    tokens.append(Token(1, b'Auf', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(2, b'dem', spacy_pos_universal_google='DET', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(3, b'Tempelhofer', spacy_pos_universal_google='ADJ', spacy_ner_type='LOC', spacy_ner_iob='B'))
    tokens.append(Token(4, b'Feld', spacy_pos_universal_google='NOUN', spacy_ner_type='LOC', spacy_ner_iob='I'))
    tokens.append(Token(5, b'stehen', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(6, b'22', spacy_pos_universal_google='NUM', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(7, b'kleinere', spacy_pos_universal_google='ADJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(8, b'Geb\xc3\xa4ude', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(9, b',', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(10, b'gr\xc3\xb6\xc3\x9ftenteils', spacy_pos_universal_google='ADV', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(11, b'ungenutzt', spacy_pos_universal_google='ADJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(12, b'.', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    thf_sentence = THFSentenceExport(
        None, None,
        "Auf dem Tempelhofer Feld stehen 22 kleinere Gebäude, größtenteils ungenutzt.",
        tokens, None, 1)
    feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
    expected_value = np.array([0, 1, 0], dtype=np.float64)
    self.assertTrue(np.array_equal(feature_value, expected_value))
def test_count_different_ner_labels_example2(self):
    tokens = []
    tokens.append(Token(1, b'Bei', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(2, b'der', spacy_pos_universal_google='DET', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(3, b'Stauraumplanung', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(4, b'wird', spacy_pos_universal_google='AUX', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(5, b'es', spacy_pos_universal_google='PRON', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(6, b'aus', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(7, b'allen', spacy_pos_universal_google='DET', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(8, b'Gullis', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(9, b'demn\xc3\xa4chst', spacy_pos_universal_google='ADV', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(10, b'stinken', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(11, b'und', spacy_pos_universal_google='CONJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(12, b'Abwaaserlagerung', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(13, b'in', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(14, b'Schiffen', spacy_pos_universal_google='NOUN', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(15, b'auf', spacy_pos_universal_google='ADP', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(16, b'der', spacy_pos_universal_google='DET', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(17, b'Spree', spacy_pos_universal_google='PROPN', spacy_ner_type='LOC', spacy_ner_iob='B'))
    tokens.append(Token(18, b'd\xc3\xbcrfte', spacy_pos_universal_google='VERB', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(19, b'auch', spacy_pos_universal_google='ADV', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(20, b'nicht', spacy_pos_universal_google='PART', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(21, b'gesund', spacy_pos_universal_google='ADJ', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(22, b'sein', spacy_pos_universal_google='AUX', spacy_ner_type='', spacy_ner_iob='O'))
    tokens.append(Token(23, b'.', spacy_pos_universal_google='PUNCT', spacy_ner_type='', spacy_ner_iob='O'))
    thf_sentence = THFSentenceExport(
        None, None,
        "Bei der Stauraumplanung wird es aus allen Gullis demnächst stinken und Abwaaserlagerung in Schiffen auf der Spree dürfte auch nicht gesund sein.",
        tokens, None, 1)
    feature_value = ner_feature.count_different_ner_labels(thf_sentence.tokens)
    expected_value = np.array([0, 1, 0], dtype=np.float64)
    self.assertTrue(np.array_equal(feature_value, expected_value))
def test_example2(self):
    tokens = []
    tokens.append(Token(1, b'Diesen', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(2, b'Vorschlag', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(3, b'gibt', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(4, b'es', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(5, b'schon', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(6, b':', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(7, b'S\xc3\xbcdlicher', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(8, b'Zugang', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(9, b'zur', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(10, b'Oberlandstra\xc3\x9fe', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(11, b'(', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(12, b'Hatun', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(13, b'-', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(14, b'S\xc3\xbcr\xc3\xbcc\xc3\xbc', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(15, b'-', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(16, b'Br\xc3\xbccke', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(17, b')', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(18, b'(', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(19, b'https://tempelhofer-feld.berlin.de/i/tempelhofer-feld/proposal/104-S%C3%BCdlicher_Zugang_zur_Oberlandstra%C3%9Fe_Hatu',
                        spacy_is_punct=False, spacy_like_url=True))
    tokens.append(Token(20, b')', spacy_is_punct=True, spacy_like_url=False))
    sentence_uniqueID = 'p339_s003'
    text = 'Diesen Vorschlag gibt es schon: S\u00fcdlicher Zugang zur Oberlandstra\u00dfe (Hatun-S\u00fcr\u00fcc\u00fc-Br\u00fccke) (https://tempelhofer-feld.berlin.de/i/tempelhofer-feld/proposal/104-S%C3%BCdlicher_Zugang_zur_Oberlandstra%C3%9Fe_Hatu)'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, text, tokens, None, 1)
    use_sentence_length = True
    feature_value = structural_features_spacy.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [3, 0.0 / len(tokens), 0.0 / len(tokens), 20, 1, 7, 0.0, 0.0, 0.0, 1.0]
    self.assertEqual(feature_value, expected_value)
def test_example3(self):
    tokens = []
    tokens.append(Token(1, b'BM', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(2, b'Tester', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(3, b'#', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(4, b'1', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(5, b':', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(6, b'Kite', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(7, b'-', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(8, b'Skaten', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(9, b'auf', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(10, b'dem', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(11, b'Tempelhofer', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(12, b'Feld', spacy_is_punct=False, spacy_like_url=False))
    tokens.append(Token(13, b':', spacy_is_punct=True, spacy_like_url=False))
    tokens.append(Token(14, b'http://youtu.be/Jf68D61QN4A', spacy_is_punct=False, spacy_like_url=True))
    sentence_uniqueID = 'p339_s003'
    text = 'BM Tester #1: Kite-Skaten auf dem Tempelhofer Feld: http://youtu.be/Jf68D61QN4A'
    thf_sentence = THFSentenceExport(sentence_uniqueID, None, text, tokens, None, 1)
    use_sentence_length = True
    feature_value = structural_features_spacy.transform_sentence(thf_sentence, use_sentence_length)
    expected_value = [3, 0.0 / len(tokens), 0.0 / len(tokens), 14, 1, 4, 0.0, 0.0, 0.0, 1.0]
    self.assertEqual(feature_value, expected_value)