def test_read_wacky():
    """Test that conll.read understands WaCKy-style 6-column output."""
    line = "was\tbe\tVBD\t18\t11\tPRD"
    sentence = conll.read([line, '\n'])[0]
    assert len(sentence) == 1
    token = sentence[0]
    assert token.id == 18
    assert token.form == "was"
    assert token.lemma == "be"
    assert token.pos == "VBD"
    assert token.head == 11
    assert token.deprel == "PRD"
def test_read_turboparser():
    """Test that conll.read understands TurboParser 8-column output."""
    line = "11\tvaccines\tvaccine\tNNS\tNNS\t_\t10\tPMOD"
    sentence = conll.read([line, '\n'])[0]
    assert len(sentence) == 1
    token = sentence[0]
    assert token.id == 11
    assert token.form == "vaccines"
    assert token.lemma == "vaccine"
    assert token.cpos == "NNS"
    assert token.pos == "NNS"
    assert token.head == 10
    assert token.deprel == "PMOD"
def test_read_french():
    """Test that conll.read understands French Bonsai output"""
    line = ("6\tchauffé\tchauffer\tV\tVPP\tg=m|m=part|n=s|t=past\t"
            "1100011\t5\tdep_coord\t_\t_")
    sentence = conll.read([line, '\n'])[0]
    assert len(sentence) == 1
    token = sentence[0]
    assert token.id == 6
    assert token.lemma == "chauffer"
    assert token.cpos == "V"
    assert token.pos == "VPP"
    assert token.feat[0].startswith("g=m")  # morpho features
    assert token.feat[1].startswith("110")  # cluster path
    assert token.head == 5
    assert token.deprel == "dep_coord"
def process(fname, chunk, fmt='turboparser'):
    """Process a memmapped chunk of a large file.

    The comparison detection logic is called from here.

    Parameters
    ----------
    fname : string
        The path to the file to be opened.

    chunk : tuple (int, int)
        Offset into the file and number of bytes to read; the code
        essentially processes `f.seek(chunk[0]); f.read(chunk[1])`.

    fmt : ('turboparser'|'wacky')
        CONLL dependency format to use.

    Returns
    -------
    chunk_matches : list
        List of tuples (sentence, matches) where the second element is a
        list of (pattern_no, dict) containing the slots matched by the
        pattern.
    """
    global filemap, fileobj
    chunk_matches = []
    if filemap is None or fileobj.name != fname:
        fileobj = open(fname)
        filemap = mmap.mmap(fileobj.fileno(), os.path.getsize(fname),
                            access=mmap.ACCESS_READ)
    filemap.seek(chunk[0])
    lines = filemap.read(chunk[1]).splitlines()
    sents = get_sents_wacky(lines) if fmt == 'wacky' else get_sents(lines)
    for sent in sents:
        try:
            for s, root in read(sent + ["\n"], return_tree=True):
                matches = [(pat_no, m)
                           for pat_no, pat in enumerate(patterns)
                           for m in match(root, pat)]
                if matches:
                    matches = deduplicate(matches)
                    chunk_matches.append((str(s), matches))
        except ValueError:
            pass  # sentence without root
    return chunk_matches
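A minimal driver sketch, not from the original module: it assumes the CONLL file separates sentences with blank lines and introduces a hypothetical `even_chunks` helper that splits the file into (offset, length) pairs aligned on those blank lines, so `process` can be fanned out over a multiprocessing pool.

# Hypothetical usage sketch (not part of the original module): split the file
# into chunks that end on blank lines, then run `process` over them in parallel.
import os
from multiprocessing import Pool


def even_chunks(fname, n_chunks=4):
    """Return (offset, length) pairs covering `fname`, split on blank lines."""
    size = os.path.getsize(fname)
    step = size // n_chunks
    offsets = [0]
    with open(fname, 'rb') as f:
        for i in range(1, n_chunks):
            f.seek(i * step)
            f.readline()                     # skip the partial line
            line = f.readline()
            while line and line.strip():     # advance to the next blank line
                line = f.readline()
            offsets.append(f.tell())
    offsets.append(size)
    return [(start, end - start)
            for start, end in zip(offsets, offsets[1:]) if end > start]


if __name__ == '__main__':
    fname = 'data/hanks_tp_lemma.conll'
    chunks = even_chunks(fname)
    with Pool() as pool:
        results = pool.starmap(process, [(fname, c) for c in chunks])
    all_matches = [m for chunk_result in results for m in chunk_result]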
"""Code for comparison matching evaluation from my JSSP 2013 paper""" import sys PATH = '/Users/vene/bnc/similes/{}/'.format(sys.argv[1]) from glob import glob from codecs import open from yaml import dump, load from pyglarf import GlarfTree from compattern.pyglarf_args import get_args, find_comparison_nodes from compattern.dependency import match from compattern.dependency.seed_patterns import patterns from compattern.dependency.conll import read dep_parse = read(open('../data/bnc_{}_lemma.conll'.format(sys.argv[1])), return_tree=True) examples = [] for f in glob(PATH + '*.yaml'): f = open(f, encoding='utf-8') examples.extend(load(f)) sents, ctxs, gfs, gts = list(zip(*examples)) f = None only_glarf = open("bnc_similes/{}/only_glarf.txt".format(sys.argv[1]), "w", encoding="utf-8") only_dep = open("bnc_similes/{}/only_dep.txt".format(sys.argv[1]), "w", encoding="utf-8") matches = 0 dep_matches = 0
This script shows the simple way of using this package to extract comparisons
from a parsed English corpus.  For example, you can run it against the
'data/hanks_tp_lemma.conll' file provided.

By default this prints the dependency root of each comparison slot (topic,
vehicle, etc.) but the entire subtrees are extracted and available.
"""
from __future__ import print_function

import fileinput

from compattern.dependency import match
from compattern.dependency.seed_patterns import patterns


def _lemma_or_form(tok):
    return tok.form.lower() if tok.lemma == '_' else tok.lemma.lower()


if __name__ == '__main__':
    from compattern.dependency.conll import read

    sents = read(fileinput.input(), return_tree=True)

    for sent, root in sents:
        print(sent)
        for pat in patterns:
            for m in match(root, pat):
                print("\n".join("{}: {}".format(key, val.form)
                                for key, val in m.items()))
        print()
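A sketch of programmatic (non-printing) use, not part of the original script: the same loop can be wrapped in a function that returns the matches, normalising each slot with the `_lemma_or_form` helper defined above. The name `collect_matches` is hypothetical.

# Hypothetical helper (not in the original script): gather matches
# programmatically instead of printing them, one slot dict per pattern hit.
def collect_matches(conll_lines):
    """Return (sentence, slots) pairs for every pattern match in the input."""
    from compattern.dependency.conll import read

    results = []
    for sent, root in read(conll_lines, return_tree=True):
        for pat in patterns:
            for m in match(root, pat):
                slots = {key: _lemma_or_form(tok) for key, tok in m.items()}
                results.append((str(sent), slots))
    return results


# e.g.:
# with open('data/hanks_tp_lemma.conll') as f:
#     for sentence, slots in collect_matches(f):
#         print(sentence, slots)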
def test_as():
    sent, root = read(example_as, return_tree=True)[0]
    matches = match(root, seed_patterns.as_1)
    assert_greater(len(matches), 0)
def test_than():
    sent, root = read(example_rbr, return_tree=True)[0]
    matches = match(root, seed_patterns.than_2)
    assert_greater(len(matches), 0)
def test_like_t2():
    sent, root = read(example_like_t2, return_tree=True)[0]
    matches = match(root, seed_patterns.like_t2)
    assert_greater(len(matches), 0)
def test_like():
    sent, root = read(example_like, return_tree=True)[0]
    matches = match(root, seed_patterns.like)
    assert_greater(len(matches), 0)
    assert_in('T', list(matches[0].keys()))
def test_aussi_lemma():
    sent, root = read(ex_aussi, return_tree=True)[0]
    matches = match(root, aussi)
    assert_greater(len(matches), 0)