Example #1
0
def convconll(goldconll, parsesconll):
	"""Copy columns from parsesconll to goldconll file; overwrite in-place.

	Files should contain only a single document. For every token line,
	fields 5, 6 and 11 (as indexed in the rows returned by ``readconll``)
	are taken from the parses file; the first line of the gold file is kept
	as header and ``#end document`` is written as footer.

	:param goldconll: filename of the CoNLL 2012 file that is rewritten.
	:param parsesconll: filename of the CoNLL 2012 file to copy columns from.
	:raises ValueError: if the files differ in number of sentences or in
		the length of a sentence, or if the first token of a sentence has
		fewer than 13 fields in either file.
	"""
	goldconlldata = next(iter(readconll(goldconll).values()))
	# Read the header with a context manager so the handle is closed before
	# the same file is reopened for writing below.
	with open(goldconll) as inp:
		header = inp.readline().rstrip()
	parsesconlldata = next(iter(readconll(parsesconll).values()))
	# Validate everything up front: once the gold file is opened for writing
	# its previous contents are gone.
	if len(goldconlldata) != len(parsesconlldata):
		raise ValueError('mismatch in number of sentences')
	for gchunk, pchunk in zip(goldconlldata, parsesconlldata):
		if len(gchunk) != len(pchunk):
			raise ValueError('Sentence length mismatch')
		if len(pchunk[0]) < 13:
			raise ValueError('Not enough fields for parses CoNLL 2012 file')
		if len(gchunk[0]) < 13:
			raise ValueError('Not enough fields for gold CoNLL 2012 file')
	with open(goldconll, 'w') as out:
		print(header, file=out)
		for gchunk, pchunk in zip(goldconlldata, parsesconlldata):
			for gline, pline in zip(gchunk, pchunk):
				gline[5] = pline[5]
				gline[6] = pline[6]
				gline[11] = pline[11]
				# The first element of each row is not written back.
				print('\t'.join(gline[1:]), file=out)
			print('', file=out)
		print('#end document', file=out)
Example #2
0
def compare(cmd, goldfile, respfile, hidecorrectlinks=False, out=sys.stdout):
	"""Compare mentions and links across CoNLL 2012 files.

	For every document in the gold file, the corresponding document in the
	response file is looked up under the same key and the two are compared.

	:param cmd: ``'mentions'`` to compare mention spans with
		``comparementions``, or ``'links'`` to compare coreference links
		with ``comparecoref``.
	:param goldfile: filename of the gold CoNLL 2012 file.
	:param respfile: filename of the system output CoNLL 2012 file.
	:param hidecorrectlinks: passed on to ``comparecoref``.
	:param out: file object that the report is written to.
	:raises ValueError: if ``cmd`` is not recognized (only detected once
		a document is being compared).
	:raises KeyError: presumably, if a gold document is missing from the
		response file (assuming ``readconll`` returns a plain dict).
	"""
	print('comparing gold file:', goldfile, file=out)
	print('against system output:', respfile, file=out)
	golddocs = readconll(goldfile)
	respdocs = readconll(respfile)
	for docname in golddocs:
		# Write the document header to the same stream as the rest of the
		# report, so that redirected output stays complete.
		print('\ndocument:', *docname, file=out)
		gold = golddocs[docname]
		resp = respdocs[docname]
		goldspansforcluster = conllclusterdict(gold)
		respspansforcluster = conllclusterdict(resp)
		# Flatten the per-cluster span collections into one set per file.
		goldspans = {span for spans in goldspansforcluster.values()
				for span in spans}
		respspans = {span for spans in respspansforcluster.values()
				for span in spans}
		if cmd == 'mentions':
			comparementions(gold, resp, goldspans, respspans, out=out)
		elif cmd == 'links':
			comparecoref(resp, goldspans, respspans, goldspansforcluster,
					respspansforcluster, hidecorrectlinks, out)
		else:
			raise ValueError('unknown cmd: %s' % cmd)
Example #3
0
def parseclindata(pattern, outdir):
    """Parse the CLIN dataset.

    Creates ``outdir`` and makes it the working directory while the files
    are parsed. The original working directory is always restored, also
    when reading or parsing a file fails.

    :param pattern: glob pattern matching the CoNLL files to parse; it is
        made absolute before ``outdir`` becomes the working directory. The
        part of each basename before the first underscore is used as the
        document name, and only the first document of each file is used.
    :param outdir: directory to create; ``os.mkdir`` raises if it already
        exists.
    :raises ValueError: if the basename of a file contains no underscore.
    """
    tokenidx = 3  # token index handed to parse() for this dataset
    origdir = os.getcwd()
    filenames = glob(os.path.abspath(pattern))
    os.mkdir(outdir)
    os.chdir(outdir)
    try:
        for n, conllfile in enumerate(filenames, 1):
            data = next(iter(readconll(conllfile).values()))
            fname = os.path.basename(conllfile)
            docname = fname[:fname.index('_')]
            print('Parsing %d/%d: %s' % (n, len(filenames), docname))
            parse(data, docname, tokenidx)
    finally:
        # Do not leave the process inside outdir when something goes wrong.
        os.chdir(origdir)
Example #4
0
def parsesemeval(path, outdir):
    """Parse the SemEval dataset.

    Creates ``outdir`` and makes it the working directory while the
    documents are parsed. The original working directory is always
    restored, also when reading or parsing fails.

    :param path: filename of a CoNLL file with one or more documents; the
        documents are processed in the order of their ``#begin document``
        lines, and part 0 of each document is parsed.
    :param outdir: directory to create; ``os.mkdir`` raises if it already
        exists.
    """
    tokenidx = 2  # token index handed to parse() for this dataset
    path = os.path.abspath(path)
    origdir = os.getcwd()
    os.mkdir(outdir)
    os.chdir(outdir)
    try:
        with open(path) as inp:
            text = inp.read()
        # \w already matches the underscore.
        docnames = re.findall(r'#begin document (\w+)', text)
        docs = readconll(path)
        for n, docname in enumerate(docnames, 1):
            data = docs[docname, 0]
            print('Parsing %d/%d: %s' % (n, len(docnames), docname))
            parse(data, docname, tokenidx)
    finally:
        # Do not leave the process inside outdir when something goes wrong.
        os.chdir(origdir)
Example #5
0
def convalpino(conllfile, parsesdir):
	"""Add parse bits to a single file, overwrite in-place.

	File should contain only a single document. The sentences of the file
	are paired, in order, with the trees read from ``parsesdir``. For each
	token the POS tag from the tree is stored in field 5; parse bits and
	named entities are added by ``splitparse`` and ``addner``.

	Requires disco-dop; if it cannot be imported, a hint is printed and the
	file is left untouched.

	:param conllfile: filename of the CoNLL 2012 file that is rewritten.
	:param parsesdir: directory with Alpino ``*.xml`` parses.
	:raises ValueError: if the number of sentences and parses differ, if a
		sentence and its parse differ in length, or if the first token of
		a sentence has fewer than 12 fields.
	"""
	from itertools import zip_longest
	from lxml import etree
	try:
		from discodop.tree import writebrackettree
		from discodop.treebank import AlpinoCorpusReader
		from discodop.treetransforms import raisediscnodes
	except ImportError:
		print('Install https://github.com/andreasvc/disco-dop')
		return
	conlldata = next(iter(readconll(conllfile).values()))
	# Read the header with a context manager so the handle is closed before
	# the same file is reopened for writing below.
	with open(conllfile) as inp:
		header = inp.readline().rstrip()
	treebank = AlpinoCorpusReader(parsesdir + '/*.xml',
			morphology='replace',
			headrules='../disco-dop/alpino.headrules')
	# Validate everything up front: once the file is opened for writing its
	# previous contents are gone. zip_longest exposes a difference in the
	# number of sentences and parses, which plain zip would silently
	# truncate, dropping sentences from the rewritten file.
	for chunk, pair in zip_longest(conlldata, treebank.itertrees()):
		if chunk is None or pair is None:
			raise ValueError('mismatch in number of sentences')
		_key, item = pair
		if len(chunk) != len(item.sent):
			raise ValueError('length mismatch')
		if len(chunk[0]) < 12:
			raise ValueError('Not enough fields for gold CoNLL 2012 file')
	with open(conllfile, 'w') as out:
		print(header, file=out)
		for chunk, (_key, item) in zip(conlldata, treebank.itertrees()):
			raisediscnodes(item.tree)
			for n, (_, postag) in enumerate(item.tree.pos()):
				if len(chunk[n]) < 13:  # kludge
					# Insert a '-' placeholder before the last field.
					chunk[n] = chunk[n][:-1] + ['-', chunk[n][-1]]
				# NB: parens as square brackets: N[eigen,...]
				chunk[n][5] = postag
			splitparse(writebrackettree(item.tree, item.sent), chunk)
			xmltree = etree.fromstring(item.block)
			addner(xmltree, chunk)
			for line in chunk:
				# The first element of each row is not written back.
				print('\t'.join(line[1:]), file=out)
			print('', file=out)
		print('#end document', file=out)
Example #6
0
def getstats(args, parsesdir=None):
    """Print stats for a list of CoNLL 2012 files.

    Counts sentences, tokens, mentions, entities and links over all
    documents in the given files and prints the totals and their ratios.
    A ratio whose denominator is zero (e.g., when no files are given) is
    printed as ``nan``.

    If a file cannot be read or its clusters cannot be extracted, the
    filename and the error are printed and no statistics are reported.

    :param args: iterable of CoNLL 2012 filenames.
    :param parsesdir: optional directory containing, for each document,
        ``<parsesdir>/<docname>/*.xml`` with one parse per sentence; when
        given, percentages of pronouns, nominals and names are reported.
    :raises ValueError: if the number of sentences of a document differs
        from its number of ``.xml`` parses.
    """
    import os
    from glob import glob
    from lxml import etree
    import coref

    def ratio(numerator, denominator):
        """Return numerator / denominator, or NaN if that is undefined."""
        return numerator / denominator if denominator else float('nan')

    sents = tokens = nummentions = numentities = numlinks = 0
    pronouns = nominals = names = 0
    ngdata, gadata = coref.readngdata()
    for fname in args:
        try:
            docs = coref.readconll(fname)
        except Exception as err:
            print('file:', fname)
            print(err)
            return
        for (docname, part), data in docs.items():
            try:
                goldspansforcluster = coref.conllclusterdict(data)
            except Exception as err:
                print('file:', fname)
                print(err)
                return
            if parsesdir is not None:
                # given docname, read <parsesdir>/<docname>/*.xml
                path = os.path.join(parsesdir, docname, '*.xml')
                filenames = sorted(glob(path), key=coref.parsesentid)
                if len(data) != len(filenames):
                    raise ValueError(
                        'filename: %s; document %s %s; '
                        'sentences in CoNLL (%d) '
                        'and number of .xml parses (%d) not equal' %
                        (fname, docname, part, len(data), len(filenames)))
                trees = [(coref.parsesentid(filename), etree.parse(filename))
                         for filename in filenames]
                mentions = coref.extractmentionsfromconll(
                    data, trees, ngdata, gadata)
                pronouns += sum(m.type == 'pronoun' for m in mentions)
                nominals += sum(m.type == 'noun' for m in mentions)
                names += sum(m.type == 'name' for m in mentions)
            sents += len(data)
            tokens += sum(len(sent) for sent in data)
            # A span is counted once per document, regardless of how many
            # clusters list it.
            nummentions += len({
                span
                for spans in goldspansforcluster.values() for span in spans
            })
            numentities += len(goldspansforcluster)
            # Number of unordered pairs within each cluster: n * (n - 1) / 2;
            # the product is always even, so integer division is exact.
            numlinks += sum(
                len(cluster) * (len(cluster) - 1) // 2
                for cluster in goldspansforcluster.values())
    print('sents:', sents)
    print('tokens:', tokens)
    print('mentions:', nummentions)
    print('entities:', numentities)
    print('links:', numlinks)
    print('tok/sent:', ratio(tokens, sents))
    print('mentions / tokens:', ratio(nummentions, tokens))
    print('entities / tokens:', ratio(numentities, tokens))
    print('links / tokens:', ratio(numlinks, tokens))
    print('mentions / entities:', ratio(nummentions, numentities))
    print('links / entities:', ratio(numlinks, numentities))
    if parsesdir is None:
        print('specify --parses to get % pronouns, nominals, names')
    else:
        print('% pronouns', ratio(100 * pronouns, nummentions))
        print('% nominals', ratio(100 * nominals, nummentions))
        print('% names', ratio(100 * names, nummentions))
Example #7
0
"""Load CoNLL 2012 files and report any errors or warnings."""
import sys
from coref import readconll, conllclusterdict, setverbose

# Send the verbose output of the coref module to standard output.
setverbose(True, sys.stdout)
# Every command line argument is a file to check; a failure in one file is
# reported and checking continues with the next file.
for path in sys.argv[1:]:
    try:
        documents = readconll(path)
        for key, sentences in documents.items():
            print('\n', path, *key)
            # Called only for the errors and warnings it reports; the
            # result is discarded.
            conllclusterdict(sentences)
    except Exception as exc:
        print(exc)
        print('NB: Not checking for further errors in this file.')