Example #1
0
def test_lemmatize_unicode():
    """Accented input ('Hövík', 'César') lemmatises to plain-ASCII lemmata."""
    _check_corenlp()
    annotators = ['tokenize', 'ssplit', 'pos', 'lemma']
    saf = stanford_to_saf(parse(u"H\xf6v\xedk hit C\xe9sar",
                                annotators=annotators))
    lemmata = {token['lemma'] for token in saf['tokens']}
    assert_equal(lemmata, {'Cesar', 'hit', 'Hovik'})
Example #2
0
def test_lemmatize_unicode():
    """Armenian 'Hovik' and accented 'César' lemmatise to plain ASCII."""
    _check_corenlp()
    text = u"\u0540\u0578\u057e\u056b\u056f hit C\xe9sar"
    saf = stanford_to_saf(parse(text,
                                annotators=['tokenize', 'ssplit',
                                            'pos', 'lemma']))
    assert_equal({tok['lemma'] for tok in saf['tokens']},
                 {'Hovik', 'hit', 'Cesar'})
Example #3
0
def test_ner():
    """Named-entity tags: 'John' is PERSON, 'Amsterdam' is LOCATION."""
    _check_corenlp()
    lines = parse("John lives in Amsterdam",
                  annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'])
    saf = stanford_to_saf(lines)
    lemma_by_id = {token['id']: token['lemma'] for token in saf['tokens']}
    found = {}
    for entity in saf['entities']:
        # Key each entity by the lemma of its first token.
        found[lemma_by_id[entity['tokens'][0]]] = entity['type']
    assert_equal(found, {'John': 'PERSON', 'Amsterdam': 'LOCATION'})
Example #4
0
def test_ner():
    """NER marks John as PERSON and Amsterdam as LOCATION."""
    _check_corenlp()
    saf = stanford_to_saf(
        parse("John lives in Amsterdam",
              annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner']))
    id_to_lemma = {tok['id']: tok['lemma'] for tok in saf['tokens']}
    entity_types = {id_to_lemma[ent['tokens'][0]]: ent['type']
                    for ent in saf['entities']}
    assert_equal(entity_types, {'John': 'PERSON', 'Amsterdam': 'LOCATION'})
Example #5
0
def test_parse_xml():
    """A stored CoreNLP XML fixture converts to the expected SAF structure."""
    # Fix: the original did open(...).read() without closing, leaking the
    # file handle until GC; a context manager closes it deterministically.
    path = os.path.join(os.path.dirname(__file__), "test_corenlp.xml")
    with open(path) as f:
        xml = f.read()
    saf = stanford_to_saf(xml)
    assert_equal({t['lemma'] for t in saf['tokens']},
                 {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in saf['tokens'] if t['lemma'] == 'London'][0]
    assert_equal(london['pos'], 'NNP')
    assert_in({"type": "LOCATION", "tokens": [london['id']]}, saf['entities'])
Example #6
0
def test_parse_xml():
    """CoreNLP XML fixture parses to the expected lemmata, POS and entities."""
    fixture_path = join(dirname(__file__), "test_corenlp.xml")
    with open(fixture_path) as fixture:
        xml = fixture.read()
    saf = stanford_to_saf(xml)
    lemmata = {token['lemma'] for token in saf['tokens']}
    assert_equal(lemmata, set("John attack I in London hit he back .".split()))
    london = [token for token in saf['tokens']
              if token['lemma'] == 'London'][0]
    assert_equal(london['pos'], 'NNP')
    assert_in({"type": "LOCATION", "tokens": [london['id']]}, saf['entities'])
Example #7
0
def test_lemmatize():
    """Two sentences split on the blank line; lemmata and sentence ids match."""
    _check_corenlp()
    annotators = ['tokenize', 'ssplit', 'pos', 'lemma']
    saf = stanford_to_saf(parse("He jumped. \n\n Cool!",
                                annotators=annotators))
    # Without parse/ner annotators only tokens and header are produced.
    assert_equal(set(saf.keys()), {'tokens', 'header'})

    lemmata = {token['lemma'] for token in saf['tokens']}
    assert_equal(lemmata, {'he', 'jump', 'cool', '!', '.'})
    sentence_ids = {token['sentence'] for token in saf['tokens']}
    assert_equal(sentence_ids, {1, 2})
Example #8
0
def test_lemmatize():
    """Lemmatise a two-sentence snippet; check tokens and sentence split."""
    _check_corenlp()
    parsed = parse("He jumped. \n\n Cool!",
                   annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    saf = stanford_to_saf(parsed)
    # Only 'tokens' and 'header' keys appear for these annotators.
    assert_equal(set(saf.keys()), {'tokens', 'header'})

    assert_equal({tok['lemma'] for tok in saf['tokens']},
                 {'.', '!', 'cool', 'jump', 'he'})
    assert_equal({tok['sentence'] for tok in saf['tokens']}, {2, 1})
Example #9
0
def test_parse_xml():
    """Pre-saved CoreNLP XML converts to SAF without running the parser."""
    # Fix: open(...).read() left the file handle open until GC; use a
    # context manager so it is closed deterministically.
    path = os.path.join(os.path.dirname(__file__), "test_corenlp.xml")
    with open(path) as f:
        xml = f.read()
    saf = stanford_to_saf(xml)
    assert_equal(
        {t['lemma']
         for t in saf['tokens']},
        {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in saf['tokens'] if t['lemma'] == 'London'][0]
    assert_equal(london['pos'], 'NNP')
    assert_in({"type": "LOCATION", "tokens": [london['id']]}, saf['entities'])
Example #10
0
def test_parse():
    """Full parse of 'John loves himself': tree, dependencies, coreference."""
    _check_corenlp()
    saf = stanford_to_saf(parse("John loves himself"))
    lemmata = {token['id']: token['lemma'] for token in saf['tokens']}
    # Constituency tree of the single sentence.
    expected_tree = ("(ROOT (S (NP (NNP John)) (VP (VBZ loves) "
                     "(NP (PRP himself)))))")
    assert_equal(saf['trees'], [{"tree": expected_tree, "sentence": 1}])
    # Dependencies, expressed on lemmata so token ids don't matter.
    deps = set()
    for dep in saf['dependencies']:
        deps.add((lemmata[dep['child']], lemmata[dep['parent']],
                  dep['relation']))
    assert_equal(deps, {('John', 'love', 'nsubj'),
                        ('himself', 'love', 'dobj')})
    # Coreference between 'John' and 'himself', order-insensitive.
    corefs = {tuple(sorted([lemmata[chain[0][0]], lemmata[chain[1][0]]]))
              for chain in saf['coreferences']}
    assert_equal(corefs, {tuple(sorted(['John', 'himself']))})
Example #11
0
def test_parse():
    """Sentence 'John loves himself' yields tree, deps and a coref chain."""
    _check_corenlp()
    parsed = parse("John loves himself")
    saf = stanford_to_saf(parsed)
    lemma_of = {t['id']: t['lemma'] for t in saf['tokens']}
    tree = ("(ROOT (S (NP (NNP John)) (VP (VBZ loves) "
            "(NP (PRP himself)))))")
    assert_equal(saf['trees'], [{"tree": tree, "sentence": 1}])
    observed = {(lemma_of[d['child']], lemma_of[d['parent']], d['relation'])
                for d in saf['dependencies']}
    expected = {('John', 'love', 'nsubj'), ('himself', 'love', 'dobj')}
    assert_equal(observed, expected)
    # Each coref chain links two mentions; compare order-insensitively.
    pairs = set()
    for chain in saf['coreferences']:
        pairs.add(tuple(sorted([lemma_of[chain[0][0]],
                                lemma_of[chain[1][0]]])))
    assert_equal(pairs, {tuple(sorted(['John', 'himself']))})
Example #12
0
def test_multiple_sentences():
    """Two-sentence input: unique ids, NER, deps and coref across sentences."""
    _check_corenlp()
    saf = stanford_to_saf(parse("John lives in Amsterdam. He works in London"))
    token_by_id = {token['id']: token for token in saf['tokens']}
    # Token ids must be unique across both sentences.
    assert_equal(len(token_by_id), len(saf['tokens']))
    # Entity in the second sentence is located correctly.
    entities = {}
    for entity in saf['entities']:
        entities[token_by_id[entity['tokens'][0]]['lemma']] = entity['type']
    assert_in(('London', 'LOCATION'), entities.items())
    # Dependencies from both sentences are present.
    rels = [(token_by_id[rel['child']]['lemma'], rel['relation'],
             token_by_id[rel['parent']]['lemma'])
            for rel in saf['dependencies']]
    assert_in(("he", "nsubj", "work"), rels)
    assert_in(("John", "nsubj", "live"), rels)
    # Cross-sentence coreference: John <- he.
    coref = {(token_by_id[chain[0][0]]['lemma'],
              token_by_id[chain[1][0]]['lemma'])
             for chain in saf['coreferences']}
    assert_equal(coref, {("John", "he")})
Example #13
0
def test_multiple_sentences():
    """Multi-sentence parse keeps ids unique and links NER/deps/coref right."""
    _check_corenlp()
    parsed = parse("John lives in Amsterdam. He works in London")
    saf = stanford_to_saf(parsed)
    tokens = {}
    for tok in saf['tokens']:
        tokens[tok['id']] = tok
    # Token ids are globally unique (no collisions between sentences).
    assert_equal(len(tokens), len(saf['tokens']))
    # Second-sentence entity resolves to the right lemma.
    entities = {tokens[ent['tokens'][0]]['lemma']: ent['type']
                for ent in saf['entities']}
    assert_in(('London', 'LOCATION'), entities.items())
    # Dependency triples from both sentences.
    rels = []
    for rel in saf['dependencies']:
        rels.append((tokens[rel['child']]['lemma'], rel['relation'],
                     tokens[rel['parent']]['lemma']))
    assert_in(("he", "nsubj", "work"), rels)
    assert_in(("John", "nsubj", "live"), rels)
    # Coreference chain spans the sentence boundary.
    coref = {(tokens[x[0][0]]['lemma'], tokens[x[1][0]]['lemma'])
             for x in saf['coreferences']}
    assert_equal(coref, {("John", "he")})