def convconll(goldconll, parsesconll):
    """Copy columns from parsesconll to goldconll file; overwrite in-place.

    Copies columns 5, 6, and 11 of each token line from the parses file
    into the gold file. Files should contain only a single document.

    :param goldconll: path of gold CoNLL 2012 file; rewritten in-place.
    :param parsesconll: path of CoNLL 2012 file with columns to copy.
    :raises ValueError: if the two documents differ in number of
        sentences or sentence lengths, or either has too few columns."""
    goldconlldata = next(iter(readconll(goldconll).values()))
    # Preserve the original '#begin document ...' header line;
    # use a context manager so the file handle is not leaked.
    with open(goldconll) as inp:
        header = inp.readline().rstrip()
    parsesconlldata = next(iter(readconll(parsesconll).values()))
    if len(goldconlldata) != len(parsesconlldata):
        raise ValueError('mismatch in number of sentences')
    # Validate everything up front, before overwriting the gold file.
    for gchunk, pchunk in zip(goldconlldata, parsesconlldata):
        if len(gchunk) != len(pchunk):
            raise ValueError('Sentence length mismatch')
        # FIX: the two messages were swapped — gchunk comes from the
        # gold file and pchunk from the parses file.
        if len(gchunk[0]) < 13:
            raise ValueError('Not enough fields for gold CoNLL 2012 file')
        if len(pchunk[0]) < 13:
            raise ValueError('Not enough fields for parses CoNLL 2012 file')
    with open(goldconll, 'w') as out:
        print(header, file=out)
        for gchunk, pchunk in zip(goldconlldata, parsesconlldata):
            for gline, pline in zip(gchunk, pchunk):
                gline[5] = pline[5]
                gline[6] = pline[6]
                gline[11] = pline[11]
                # Column 0 is presumably an index added by readconll and
                # is dropped on output — confirm against readconll.
                print('\t'.join(gline[1:]), file=out)
            print('', file=out)
        print('#end document', file=out)
def compare(cmd, goldfile, respfile, hidecorrectlinks=False, out=sys.stdout):
    """Compare mentions and links across CoNLL 2012 files.

    :param cmd: 'mentions' or 'links', selecting the comparison to run.
    :param goldfile: path of gold CoNLL 2012 file.
    :param respfile: path of system-response CoNLL 2012 file.
    :param hidecorrectlinks: passed through to comparecoref.
    :param out: writable file object receiving the report.
    :raises ValueError: on an unknown cmd."""
    print('comparing gold file:', goldfile, file=out)
    print('against system output:', respfile, file=out)
    golddocs = readconll(goldfile)
    respdocs = readconll(respfile)
    for docname in golddocs:
        # FIX: this line went to stdout while the rest of the report is
        # written to `out`; route it to `out` for a consistent report.
        print('\ndocument:', *docname, file=out)
        gold = golddocs[docname]
        resp = respdocs[docname]
        goldspansforcluster = conllclusterdict(gold)
        respspansforcluster = conllclusterdict(resp)
        goldspans = {span for spans in goldspansforcluster.values()
                for span in spans}
        respspans = {span for spans in respspansforcluster.values()
                for span in spans}
        if cmd == 'mentions':
            comparementions(gold, resp, goldspans, respspans, out=out)
        elif cmd == 'links':
            comparecoref(resp, goldspans, respspans, goldspansforcluster,
                    respspansforcluster, hidecorrectlinks, out)
        else:
            raise ValueError('unknown cmd: %s' % cmd)
def parseclindata(pattern, outdir):
    """Parse the CLIN dataset.

    :param pattern: glob pattern matching the CoNLL files to parse.
    :param outdir: output directory; created here, so it must not exist."""
    origdir = os.getcwd()
    filenames = glob(os.path.abspath(pattern))
    os.mkdir(outdir)
    os.chdir(outdir)
    try:
        for n, conllfile in enumerate(filenames, 1):
            data = next(iter(readconll(conllfile).values()))
            fname = os.path.basename(conllfile)
            # Document name is the filename up to the first underscore.
            docname = fname[:fname.index('_')]
            tokenidx = 3  # column holding the token in this dataset
            print('Parsing %d/%d: %s' % (n, len(filenames), docname))
            parse(data, docname, tokenidx)
    finally:
        # FIX: restore the working directory even if parsing fails;
        # previously an exception left the process inside outdir.
        os.chdir(origdir)
def parsesemeval(path, outdir):
    """Parse the SemEval dataset.

    :param path: path of the SemEval CoNLL file with all documents.
    :param outdir: output directory; created here, so it must not exist."""
    path = os.path.abspath(path)
    origdir = os.getcwd()
    os.mkdir(outdir)
    os.chdir(outdir)
    try:
        # Collect document names in file order from the headers.
        with open(path) as inp:
            data = inp.read()
        docnames = re.findall(r'#begin document ([\w_]+)', data)
        docs = readconll(path)
        for n, docname in enumerate(docnames, 1):
            # readconll keys are (docname, part); SemEval has part 0 only.
            data = docs[docname, 0]
            tokenidx = 2  # column holding the token in this dataset
            print('Parsing %d/%d: %s' % (n, len(docnames), docname))
            parse(data, docname, tokenidx)
    finally:
        # FIX: restore the working directory even if parsing fails;
        # previously an exception left the process inside outdir.
        os.chdir(origdir)
def convalpino(conllfile, parsesdir):
    """Add parse bits to a single file, overwrite in-place.

    File should contain only a single document.

    :param conllfile: path of CoNLL 2012 file; rewritten in-place.
    :param parsesdir: directory with Alpino .xml parses, one per sentence.
    :raises ValueError: if sentence lengths mismatch or the file has
        too few columns."""
    from lxml import etree
    try:
        from discodop.tree import writebrackettree
        from discodop.treebank import AlpinoCorpusReader
        from discodop.treetransforms import raisediscnodes
    except ImportError:
        print('Install https://github.com/andreasvc/disco-dop')
        return
    conlldata = next(iter(readconll(conllfile).values()))
    # FIX: read only the header line instead of the whole file, and use a
    # context manager so the file handle is not leaked.
    with open(conllfile) as inp:
        header = inp.readline().rstrip()
    treebank = AlpinoCorpusReader(parsesdir + '/*.xml',
            morphology='replace',
            headrules='../disco-dop/alpino.headrules')
    # First pass: validate before overwriting the file.
    # NOTE(review): itertrees() is iterated twice; assumes the reader
    # supports re-iteration — confirm against discodop.
    for chunk, (_key, item) in zip(conlldata, treebank.itertrees()):
        if len(chunk) != len(item.sent):
            raise ValueError('length mismatch')
        if len(chunk[0]) < 12:
            raise ValueError('Not enough fields for gold CoNLL 2012 file')
    with open(conllfile, 'w') as out:
        print(header, file=out)
        for chunk, (_key, item) in zip(conlldata, treebank.itertrees()):
            raisediscnodes(item.tree)
            for n, (_, postag) in enumerate(item.tree.pos()):
                if len(chunk[n]) < 13:  # kludge
                    chunk[n] = chunk[n][:-1] + ['-', chunk[n][-1]]
                # NB: parens as square brackets: N[eigen,...]
                chunk[n][5] = postag
            splitparse(writebrackettree(item.tree, item.sent), chunk)
            xmltree = etree.fromstring(item.block)
            addner(xmltree, chunk)
            for line in chunk:
                print('\t'.join(line[1:]), file=out)
            print('', file=out)
        print('#end document', file=out)
def getstats(args, parsesdir=None):
    """Print stats for a list of CoNLL 2012 files.

    :param args: sequence of CoNLL 2012 filenames.
    :param parsesdir: optional directory containing <docname>/*.xml
        parses; required for the pronoun/nominal/name breakdown.
    :raises ValueError: if the number of parses does not match the
        number of sentences for a document."""
    import os
    from glob import glob
    from lxml import etree
    import coref
    sents = tokens = nummentions = numentities = numlinks = 0
    pronouns = nominals = names = 0
    ngdata, gadata = coref.readngdata()
    for fname in args:
        try:
            docs = coref.readconll(fname)
        except Exception as err:
            # Best-effort: report the unreadable file and stop.
            print('file:', fname)
            print(err)
            return
        for (docname, part), data in docs.items():
            try:
                goldspansforcluster = coref.conllclusterdict(data)
            except Exception as err:
                print('file:', fname)
                print(err)
                return
            if parsesdir is not None:
                # given docname, read <parsesdir>/<docname>/*.xml
                path = os.path.join(parsesdir, docname, '*.xml')
                filenames = sorted(glob(path), key=coref.parsesentid)
                if len(data) != len(filenames):
                    raise ValueError(
                            'filename: %s; document %s %s; '
                            'sentences in CoNLL (%d) '
                            'and number of .xml parses (%d) not equal'
                            % (fname, docname, part,
                                len(data), len(filenames)))
                trees = [(coref.parsesentid(filename), etree.parse(filename))
                        for filename in filenames]
                mentions = coref.extractmentionsfromconll(
                        data, trees, ngdata, gadata)
                pronouns += sum(m.type == 'pronoun' for m in mentions)
                nominals += sum(m.type == 'noun' for m in mentions)
                names += sum(m.type == 'name' for m in mentions)
            sents += len(data)
            tokens += sum(len(sent) for sent in data)
            nummentions += len({span
                    for spans in goldspansforcluster.values()
                    for span in spans})
            numentities += len(goldspansforcluster)
            # Number of coreference links in a cluster of size k: k*(k-1)/2
            numlinks += sum(len(cluster) * (len(cluster) - 1) // 2
                    for cluster in goldspansforcluster.values())
    print('sents:', sents)
    print('tokens:', tokens)
    print('mentions:', nummentions)
    print('entities:', numentities)
    print('links:', numlinks)
    # FIX: guard the ratios so empty input does not raise
    # ZeroDivisionError after the absolute counts were already printed.
    if sents and tokens and numentities:
        print('tok/sent:', tokens / sents)
        print('mentions / tokens:', nummentions / tokens)
        print('entities / tokens:', numentities / tokens)
        print('links / tokens:', numlinks / tokens)
        print('mentions / entities:', nummentions / numentities)
        print('links / entities:', numlinks / numentities)
    if parsesdir is None:
        print('specify --parses to get % pronouns, nominals, names')
    elif nummentions:
        print('% pronouns', 100 * pronouns / nummentions)
        print('% nominals', 100 * nominals / nummentions)
        print('% names', 100 * names / nummentions)
"""Load CoNLL 2012 files and report any errors or warnings."""
import sys
from coref import readconll, conllclusterdict, setverbose

# Enable verbose diagnostics on stdout for the checks below.
setverbose(True, sys.stdout)
for fname in sys.argv[1:]:
    try:
        docs = readconll(fname)
        for key, doc in docs.items():
            print('\n', fname, *key)
            # Building the cluster dict runs the consistency checks.
            conllclusterdict(doc)
    except Exception as err:
        print(err)
        print('NB: Not checking for further errors in this file.')