Example #1
def test_read_wacky():
    line = "was\tbe\tVBD\t18\t11\tPRD"
    sentence = conll.read([line, '\n'])[0]
    assert len(sentence) == 1
    token = sentence[0]
    assert token.id == 18
    assert token.form == "was"
    assert token.lemma == "be"
    assert token.pos == "VBD"
    assert token.head == 11
    assert token.deprel == "PRD"
Example #2
def test_read_turboparser():
    line = "11\tvaccines\tvaccine\tNNS\tNNS\t_\t10\tPMOD"
    sentence = conll.read([line, '\n'])[0]
    assert len(sentence) == 1
    token = sentence[0]
    assert token.id == 11
    assert token.form == "vaccines"
    assert token.lemma == "vaccine"
    assert token.cpos == "NNS"
    assert token.pos == "NNS"
    assert token.head == 10
    assert token.deprel == "PMOD"
Example #3
def test_read_french():
    """Test that conll.read understands French Bonsai output"""
    line = ("6\tchauffé\tchauffer\tV\tVPP\tg=m|m=part|n=s|t=past\t"
            "1100011\t5\tdep_coord\t_\t_")
    sentence = conll.read([line, '\n'])[0]
    assert len(sentence) == 1
    token = sentence[0]
    assert token.id == 6
    assert token.lemma == "chauffer"
    assert token.cpos == "V"
    assert token.pos == "VPP"
    assert token.feat[0].startswith("g=m")  # morpho features
    assert token.feat[1].startswith("110")  # cluster path
    assert token.head == 5
    assert token.deprel == "dep_coord"
Example #4
def process(fname, chunk, fmt='turboparser'):
    """Process a memory-mapped chunk of a large file.

    This is the entry point for the comparison detection logic.

    Parameters
    ----------
    fname : str
        The path of the file to open.

    chunk : tuple (int, int)
        Beginning offset and number of bytes to read.  The function
        essentially performs ``f.seek(chunk[0])`` followed by
        ``f.read(chunk[1])``.

    fmt : str, 'turboparser' or 'wacky'
        CoNLL dependency format to use.

    Returns
    -------
    chunk_matches : list
        List of tuples (sentence, matches) where the second element is a list
        of (pattern_no, dict) containing the slots matched by the pattern.

    """

    global filemap, fileobj
    chunk_matches = []
    if filemap is None or fileobj.name != fname:
        fileobj = open(fname)
        filemap = mmap.mmap(fileobj.fileno(),
                            os.path.getsize(fname),
                            access=mmap.ACCESS_READ)

    filemap.seek(chunk[0])
    lines = filemap.read(chunk[1]).splitlines()
    sents = get_sents_wacky(lines) if fmt == 'wacky' else get_sents(lines)
    for sent in sents:
        try:
            for s, root in read(sent + ["\n"], return_tree=True):
                matches = [(pat_no, m) for pat_no, pat in enumerate(patterns)
                           for m in match(root, pat)]
                if matches:
                    matches = deduplicate(matches)
                    chunk_matches.append((str(s), matches))
        except ValueError:
            pass  # sentence without root
    return chunk_matches
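Since process reads a raw byte range and then splits sentences itself, the caller has to supply chunks whose boundaries fall on blank lines (the sentence separators), or sentences will be cut in half at chunk edges. A minimal sketch of one way to build such chunks and fan them out to a worker pool; chunk_file is a hypothetical helper written under that assumption, not part of the package:

import os
import multiprocessing


def chunk_file(fname, n_chunks):
    # Hypothetical helper: split `fname` into (offset, length) pairs whose
    # boundaries are aligned to blank lines, so no sentence is cut in half.
    size = os.path.getsize(fname)
    approx = max(1, size // n_chunks)
    chunks, start = [], 0
    with open(fname, 'rb') as f:
        while start < size:
            f.seek(min(start + approx, size))
            f.readline()  # finish the line we landed in
            while True:   # advance past the next blank line (sentence break)
                line = f.readline()
                if not line or not line.strip():
                    break
            end = f.tell()
            chunks.append((start, end - start))
            start = end
    return chunks


# pool = multiprocessing.Pool()
# results = pool.starmap(process,
#                        [(fname, c) for c in chunk_file(fname, 8)])
# Each worker keeps its own cached mmap in the module-level globals
# that `process` uses.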
"""Code for comparison matching evaluation from my JSSP 2013 paper"""

import sys

PATH = '/Users/vene/bnc/similes/{}/'.format(sys.argv[1])

from glob import glob
from codecs import open
from yaml import dump, load
from pyglarf import GlarfTree
from compattern.pyglarf_args import get_args, find_comparison_nodes
from compattern.dependency import match
from compattern.dependency.seed_patterns import patterns

from compattern.dependency.conll import read
dep_parse = read(open('../data/bnc_{}_lemma.conll'.format(sys.argv[1])),
                 return_tree=True)
examples = []
for f in glob(PATH + '*.yaml'):
    f = open(f, encoding='utf-8')
    examples.extend(load(f))

sents, ctxs, gfs, gts = list(zip(*examples))
f = None
only_glarf = open("bnc_similes/{}/only_glarf.txt".format(sys.argv[1]),
                  "w",
                  encoding="utf-8")
only_dep = open("bnc_similes/{}/only_dep.txt".format(sys.argv[1]),
                "w",
                encoding="utf-8")
matches = 0
dep_matches = 0
Example #6
"""This script shows a simple way to use this package to extract
comparisons from a parsed English corpus.
For example, you can run it against the 'data/hanks_tp_lemma.conll' file
provided.

By default this prints the dependency root of each comparison slot (topic,
vehicle, etc) but the entire subtrees are extracted and available.
"""
from __future__ import print_function
import fileinput

from compattern.dependency import match
from compattern.dependency.seed_patterns import patterns


def _lemma_or_form(tok):
    return tok.form.lower() if tok.lemma == '_' else tok.lemma.lower()


if __name__ == '__main__':
    from compattern.dependency.conll import read

    sents = read(fileinput.input(), return_tree=True)
    for sent, root in sents:
        print(sent)
        for pat in patterns:
            for m in match(root, pat):
                print("\n".join("{}: {}".format(key, val.form)
                                for key, val in m.items()))
                print()
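Assuming the script above is saved as, say, extract_comparisons.py (a hypothetical name), the usage suggested in its docstring amounts to:

    python extract_comparisons.py data/hanks_tp_lemma.conll

fileinput.input() reads the files named on the command line, or standard input when none are given, so the script also works at the end of a pipe.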
Example #7
def test_as():
    sent, root = read(example_as, return_tree=True)[0]
    matches = match(root, seed_patterns.as_1)
    assert_greater(len(matches), 0)
Example #8
def test_than():
    sent, root = read(example_rbr, return_tree=True)[0]
    matches = match(root, seed_patterns.than_2)
    assert_greater(len(matches), 0)
Example #9
def test_like_t2():
    sent, root = read(example_like_t2, return_tree=True)[0]
    matches = match(root, seed_patterns.like_t2)
    assert_greater(len(matches), 0)
Example #10
def test_like():
    sent, root = read(example_like, return_tree=True)[0]
    matches = match(root, seed_patterns.like)
    assert_greater(len(matches), 0)
    assert_in('T', list(matches[0].keys()))
Example #11
def test_aussi_lemma():
    sent, root = read(ex_aussi, return_tree=True)[0]
    matches = match(root, aussi)
    assert_greater(len(matches), 0)
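Taken together, these pattern tests also pin down the shape of match's return value: a list of dicts mapping slot names (such as 'T', presumably the topic slot) to the matched tokens, which is how Example #6 can print key/val.form pairs. A minimal sketch reusing the example_like fixture from the test module (the fixture itself is not shown here):

from compattern.dependency import match, seed_patterns
from compattern.dependency.conll import read

sent, root = read(example_like, return_tree=True)[0]  # test fixture, not shown
for m in match(root, seed_patterns.like):
    # each match is a dict of slot name -> token
    print({key: val.form for key, val in m.items()})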