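# Tests for the CoreNLP wrapper: parse() runs Stanford CoreNLP over a text
# and stanford_to_saf() converts the raw output lines into a SAF dict.
# NOTE: the import paths and the _check_corenlp helper below are assumptions
# reconstructed from the names used in these tests; adjust them to match the
# actual module layout.
import os

from nose.tools import assert_equal, assert_in
from nose.plugins.skip import SkipTest

from corenlp import parse, stanford_to_saf


def _check_corenlp():
    # Hypothetical sketch of the helper referenced by every test: skip the
    # test when no local CoreNLP installation is available. The environment
    # variable name is an assumption.
    if not os.environ.get("CORENLP_HOME"):
        raise SkipTest("CoreNLP not found; set CORENLP_HOME to run this test")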
def test_lemmatize_unicode():
    _check_corenlp()
    lines = parse(u"H\xf6v\xedk hit C\xe9sar",
                  annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    saf = stanford_to_saf(lines)
    assert_equal({t['lemma'] for t in saf['tokens']},
                 {'Cesar', 'hit', 'Hovik'})
def test_ner():
    _check_corenlp()
    annotators = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner']
    saf = stanford_to_saf(parse("John lives in Amsterdam",
                                annotators=annotators))
    lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
    entities = {lemmata[e['tokens'][0]]: e['type']
                for e in saf['entities']}
    assert_equal(entities, {'John': 'PERSON', 'Amsterdam': 'LOCATION'})
def test_lemmatize():
    _check_corenlp()
    lines = parse("He jumped. \n\n Cool!",
                  annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    saf = stanford_to_saf(lines)
    assert_equal(set(saf.keys()), {'tokens', 'header'})
    assert_equal({t['lemma'] for t in saf['tokens']},
                 {'he', 'jump', 'cool', '!', '.'})
    assert_equal({t['sentence'] for t in saf['tokens']}, {1, 2})
def test_raw():
    _check_corenlp()
    lines = parse("It. Works\n", annotators=['tokenize', 'ssplit'])
    expected = [
        'Sentence #1 (2 tokens):',
        'It.',
        '[Text=It CharacterOffsetBegin=0 CharacterOffsetEnd=2]'
        ' [Text=. CharacterOffsetBegin=2 CharacterOffsetEnd=3]',
        'Sentence #2 (1 tokens):',
        'Works',
        '[Text=Works CharacterOffsetBegin=4 CharacterOffsetEnd=9]',
    ]
    assert_equal(lines, expected)
def test_parse():
    _check_corenlp()
    saf = stanford_to_saf(parse("John loves himself"))
    lemmata = {t['id']: t['lemma'] for t in saf['tokens']}
    assert_equal(saf['trees'], [{
        "tree": "(ROOT (S (NP (NNP John)) (VP (VBZ loves) "
                "(NP (PRP himself)))))",
        "sentence": 1,
    }])
    deps = {(lemmata[d['child']], lemmata[d['parent']], d['relation'])
            for d in saf['dependencies']}
    assert_equal(deps, {('John', 'love', 'nsubj'),
                        ('himself', 'love', 'dobj')})
    corefs = {tuple(sorted([lemmata[c[0][0]], lemmata[c[1][0]]]))
              for c in saf['coreferences']}
    assert_equal(corefs, {tuple(sorted(['John', 'himself']))})
def test_multiple_sentences():
    _check_corenlp()
    p = parse("John lives in Amsterdam. He works in London")
    saf = stanford_to_saf(p)
    tokens = {t['id']: t for t in saf['tokens']}
    # are token ids unique?
    assert_equal(len(tokens), len(saf['tokens']))
    # is location in second sentence correct?
    entities = {tokens[e['tokens'][0]]['lemma']: e['type']
                for e in saf['entities']}
    assert_in(('London', 'LOCATION'), entities.items())
    # is dependency in second sentence correct?
    rels = [(tokens[rel['child']]['lemma'], rel['relation'],
             tokens[rel['parent']]['lemma'])
            for rel in saf['dependencies']]
    assert_in(("he", "nsubj", "work"), rels)
    assert_in(("John", "nsubj", "live"), rels)
    # is coref parsed correctly?
    coref = {(tokens[x[0][0]]['lemma'], tokens[x[1][0]]['lemma'])
             for x in saf['coreferences']}
    assert_equal(coref, {("John", "he")})