def postprocess(self, treestr, stage=-1):
    """Turn a parser's tree string into a post-processed tree.

    :param treestr: string representation of a tree whose leaves are
        parsed as integers.
    :param stage: index into ``self.stages``; only that stage's ``split``
        attribute is consulted here.
    :returns: a tuple ``(tree, False)``; the second element is always
        ``False`` in this method.
    """
    tree = Tree.parse(treestr, parse_leaf=int)
    stageconfig = self.stages[stage]
    if stageconfig.split:
        # Undo the ':'-marked binarization first, then merge the
        # components of split discontinuous nodes.
        collapsed = unbinarize(tree, childchar=':', expandunary=False)
        mergediscnodes(collapsed)
    tailmarker = self.binarization.tailmarker
    saveheads(tree, tailmarker)
    unbinarize(tree, expandunary=False)
    removefanoutmarkers(tree)
    if self.relationalrealizational:
        adjunctionlabel = self.relationalrealizational['adjunctionlabel']
        tree = rrbacktransform(tree, adjunctionlabel)
    if self.transformations:
        reversetransform(tree, self.transformations)
    return tree, False
def postprocess(self, treestr, stage=-1, derivs=None):
    """Turn a parser's tree string into a post-processed tree.

    :param treestr: string representation of a tree whose leaves are
        parsed as integers.
    :param stage: index into ``self.stages``; only that stage's ``split``
        attribute is consulted here.
    :param derivs: optional mapping that is queried with ``treestr``;
        when it is empty or ``None`` no fragments are returned.
    :returns: a tuple ``(tree, fragments, False)``; ``fragments`` is
        ``derivs.get(treestr)`` or ``None``, and the last element is
        always ``False`` in this method.
    """
    tree = Tree.parse(treestr, parse_leaf=int)
    stageconfig = self.stages[stage]
    if stageconfig.split:
        # Undo the ':'-marked binarization first, then merge the
        # components of split discontinuous nodes.
        collapsed = unbinarize(tree, childchar=':')
        mergediscnodes(collapsed)
    saveheads(tree, self.tailmarker)
    unbinarize(tree)
    removefanoutmarkers(tree)
    if self.relationalrealizational:
        adjunctionlabel = self.relationalrealizational['adjunctionlabel']
        tree = rrbacktransform(tree, adjunctionlabel)
    if self.transformations:
        reversetransform(tree, self.transformations)
    fragments = None
    if derivs:
        fragments = derivs.get(treestr)
    return tree, fragments, False
def accept(): """Store parse & redirect to next sentence.""" # should include n referring to which n-best tree is to be accepted, # or tree in discbracket format if tree was manually edited. sentno = int(request.args.get('sentno')) # 1-indexed lineno = QUEUE[sentno - 1][0] sent = SENTENCES[lineno] username = session['username'] actions = session['actions'] actions[TIME] = int(round(time() - actions[TIME])) if 'dec' in request.args: actions[DECTREE] += int(request.args.get('dec', 0)) if 'tree' in request.args: n = 0 tree, senttok = discbrackettree(request.args.get('tree')) reversetransform(tree, senttok, ('APPEND-FUNC', 'addCase')) else: n = int(request.args.get('n', 0)) require = request.args.get('require', '') block = request.args.get('block', '') require, block = parseconstraints(require, block) resp = WORKERS[username].submit(worker.getparses, sent, require, block).result() senttok, parsetrees, _messages, _elapsed = resp tree = parsetrees[n - 1][1] for node in tree.subtrees(): node.label = LABELRE.match(node.label).group(1) actions[NBEST] = n session.modified = True block = writetree(tree, senttok, str(lineno + 1), 'export', comment='%s %r' % (username, actions)) app.logger.info(block) addentry(lineno, block, actions) WORKERS[username].submit(worker.augment, [tree], [senttok]) flash('Your annotation for sentence %d was stored %r' % (sentno, actions)) return (redirect(url_for('annotate', sentno=sentno + 1)) if sentno < len(SENTENCES) else 'THANK YOU. THAT WAS THE LAST SENTENCE.')
def test_transforms():
    """Test reversibility of Tiger transformations.

    Reads 'alpinosample.export' twice: one copy is kept as gold, the
    other is transformed with ``('S-RC', 'VP-GF', 'NP')``. For the first
    100 trees the transformation is reversed on a copy and the bracketings
    of the result are compared with those of the gold tree. Mismatches
    are printed with their precision and recall; a summary is printed at
    the end. Nothing is asserted or returned.
    """
    from discodop.treebanktransforms import transform, reversetransform, \
        bracketings
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [transform(tree, sent, transformations)
             for tree, sent in zip(nn.trees().values(),
                                   nn.sents().values())]
    print('\ntransformed')
    correct = exact = e = 0
    # Sentence number to print unconditionally while debugging; -1 = none.
    z = -1  # 825
    for a, b, c, d in islice(
            zip(n.trees().values(), n.sents().values(), trees, count()),
            100):
        transformc = reversetransform(c.copy(True), b, transformations)
        c1 = bracketings(canonicalize(a))
        c2 = bracketings(canonicalize(transformc))
        if c1 != c2 or e == z:
            gold, found = set(c1), set(c2)
            overlap = len(gold & found)
            # Precision is relative to what the round trip produced and
            # recall to the gold tree. An empty denominator counts as a
            # perfect score only when the other side is empty as well.
            precision = (overlap / len(found) if found
                         else (0.0 if gold else 1.0))
            recall = (overlap / len(gold) if gold
                      else (0.0 if found else 1.0))
            if precision != 1.0 or recall != 1.0 or d == z:
                # Decode the escaped bytes again so that they can be
                # joined with str; joining bytes with str raises a
                # TypeError on Python 3.
                print(d, ' '.join(
                    ':'.join((str(idx),
                              word.encode('unicode-escape').decode('ascii')))
                    for idx, word in enumerate(b)))
                print('no match', precision, recall)
                print(len(c1), len(c2),
                      'gold-transformed', found - gold,
                      'transformed-gold', gold - found)
                print(a)
                print(transformc)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        e += 1
    # Avoid dividing by zero when the corpus yielded no trees.
    print('matches', correct, '/', e, 100 * correct / e if e else 0.0, '%')
    print('exact', exact)
def test_transforms():
    """Test reversibility of Tiger transformations.

    Reads 'alpinosample.export' twice: one copy is kept as gold, the
    other is transformed with ``('S-RC', 'VP-GF', 'NP')``. For the first
    100 trees the transformation is reversed on a copy and the bracketings
    of the result are compared with those of the gold tree. Mismatches
    are printed with their precision and recall; a summary is printed at
    the end. Nothing is asserted or returned.
    """
    from discodop.treebanktransforms import transform, reversetransform, \
        bracketings
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [transform(tree, sent, transformations)
             for tree, sent in zip(nn.trees().values(),
                                   nn.sents().values())]
    print('\ntransformed')
    correct = exact = d = 0
    # Sentence number to print unconditionally while debugging; -1 = none.
    z = -1  # 825
    for a, b, c in islice(
            zip(n.trees().values(), trees, n.sents().values()), 100):
        transformb = reversetransform(b.copy(True), transformations)
        b1 = bracketings(canonicalize(a))
        b2 = bracketings(canonicalize(transformb))
        if b1 != b2 or d == z:
            gold, found = set(b1), set(b2)
            overlap = len(gold & found)
            # Precision is relative to what the round trip produced and
            # recall to the gold tree. An empty denominator counts as a
            # perfect score only when the other side is empty as well.
            precision = (overlap / len(found) if found
                         else (0.0 if gold else 1.0))
            recall = (overlap / len(gold) if gold
                      else (0.0 if found else 1.0))
            if precision != 1.0 or recall != 1.0 or d == z:
                # Decode the escaped bytes again so that they can be
                # joined with str; joining bytes with str raises a
                # TypeError on Python 3.
                print(d, ' '.join(
                    ':'.join((str(idx),
                              word.encode('unicode-escape').decode('ascii')))
                    for idx, word in enumerate(c)))
                print('no match', precision, recall)
                print(len(b1), len(b2),
                      'gold-transformed', found - gold,
                      'transformed-gold', gold - found)
                print(a)
                print(transformb)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        d += 1
    # Avoid dividing by zero when the corpus yielded no trees.
    print('matches', correct, '/', d, 100 * correct / d if d else 0.0, '%')
    print('exact', exact)
def test_transform(self):
    """Check that each set of node transformations can be reversed.

    For every transformation set a fresh copy of 'alpinosample.export'
    is transformed; for the first 100 trees, reversing the transformation
    on a copy must give the same bracketings as the untouched tree.
    """
    from discodop.treebanktransforms import transform, reversetransform, \
        bracketings
    from discodop.treebank import NegraCorpusReader
    goldcorpus = NegraCorpusReader('alpinosample.export')
    transformationsets = (
        ('FUNC-NODE', ),
        ('MORPH-NODE', ),
        ('LEMMA-NODE', ),
        ('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE'),
    )
    for transformations in transformationsets:
        # A separate reader per set, so that every set starts from
        # trees that have not been transformed before.
        workcorpus = NegraCorpusReader('alpinosample.export')
        transformed = []
        for tree, sent in zip(workcorpus.trees().values(),
                              workcorpus.sents().values()):
            transformed.append(transform(tree, sent, transformations))
        pairs = zip(goldcorpus.trees().values(), transformed)
        for goldtree, changedtree in islice(pairs, 100):
            before = bracketings(canonicalize(goldtree))
            restored = reversetransform(
                changedtree.copy(True), transformations)
            after = bracketings(canonicalize(restored))
            assert before == after, (
                'mismatch with %r\nbefore: %r\nafter: %r'
                % (transformations, before, after))
def test_transform(self):
    """Round-trip test for the FUNC/MORPH/LEMMA node transformations.

    Each transformation set is applied to a freshly read copy of
    'alpinosample.export'. Reversing it on a copy of each of the first
    100 transformed trees has to reproduce the bracketings of the
    corresponding unmodified tree.
    """
    from discodop.treebanktransforms import transform, reversetransform, \
        bracketings
    from discodop.treebank import NegraCorpusReader
    reference = NegraCorpusReader('alpinosample.export')
    single = [('FUNC-NODE', ), ('MORPH-NODE', ), ('LEMMA-NODE', )]
    combined = [('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE')]
    for transformations in single + combined:
        # Read the corpus again so that this set of transformations is
        # applied to trees that are still in their original state.
        scratch = NegraCorpusReader('alpinosample.export')
        scratchtrees = scratch.trees().values()
        scratchsents = scratch.sents().values()
        trees = [transform(tree, sent, transformations)
                 for tree, sent in zip(scratchtrees, scratchsents)]
        for original, modified in islice(
                zip(reference.trees().values(), trees), 100):
            before = bracketings(canonicalize(original))
            undone = reversetransform(modified.copy(True), transformations)
            after = bracketings(canonicalize(undone))
            message = 'mismatch with %r\nbefore: %r\nafter: %r' % (
                transformations, before, after)
            assert before == after, message
def main():
    """Command line interface for applying tree(bank) transforms.

    Reads ``sys.argv``: the first positional argument is the action, the
    optional second and third are the input and output file names ('-' or
    absent selects /dev/stdin and /dev/stdout). The trees of the input
    corpus are optionally sliced, filtered on length and renumbered, then
    passed through the transformation belonging to the action and written
    in the requested output format. A count of processed trees is printed
    to standard error.

    Exits with status 2 on invalid options, an unknown action, or an
    unknown output format.

    :raises ValueError: when 'mst' or 'conll' output is requested without
        ``--headrules``.
    """
    import io
    from getopt import gnu_getopt, GetoptError
    from discodop import treebanktransforms
    # Actions mapped to None either need no transformation or get one
    # built from the command line options further down.
    actions = {'none': None, 'introducepreterminals': introducepreterminals,
               'splitdisc': None, 'mergedisc': mergediscnodes,
               'transform': None, 'unbinarize': unbinarize,
               'binarize': None, 'optimalbinarize': None}
    flags = ('markorigin markheads leftunary rightunary tailmarker '
             'renumber reverse'.split())
    options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
               'punct= headrules= functions= morphology= lemmas= factor= '
               'markorigin= maxlen= fmt= enc= transforms=').split()
    try:
        opts, args = gnu_getopt(sys.argv[1:], 'h:v:', flags + options)
        if not 1 <= len(args) <= 3:
            raise GetoptError(
                'error: expected 1, 2, or 3 positional arguments')
    except GetoptError as err:
        print('error: %r\n%s' % (err, USAGE), file=sys.stderr)
        sys.exit(2)
    opts, action = dict(opts), args[0]
    if action not in actions:
        print('unrecognized action: %r\navailable actions: %s' % (
            action, ', '.join(actions)), file=sys.stderr)
        sys.exit(2)
    # --fmt and --enc are shorthands that set both the input and the
    # output variant of the option.
    if '--fmt' in opts:
        opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
    if '--enc' in opts:
        opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
    if opts.get('--outputfmt', WRITERS[0]) not in WRITERS:
        print('unrecognized output format: %r\navailable formats: %s' % (
            opts.get('--outputfmt'), ' '.join(WRITERS)), file=sys.stderr)
        sys.exit(2)
    infilename = (args[1] if len(args) >= 2 and args[1] != '-'
                  else '/dev/stdin')
    outfilename = (args[2] if len(args) == 3 and args[2] != '-'
                   else '/dev/stdout')

    # open corpus
    corpus = READERS[opts.get('--inputfmt', 'export')](
        infilename,
        encoding=opts.get('--inputenc', 'utf-8'),
        headrules=opts.get('--headrules'),
        markheads='--markheads' in opts,
        ensureroot=opts.get('--ensureroot'),
        punct=opts.get('--punct'),
        functions=opts.get('--functions'),
        morphology=opts.get('--morphology'),
        lemmas=opts.get('--lemmas'))
    # --slice has the form 'start:end'; an empty side becomes None.
    start, end = opts.get('--slice', ':').split(':')
    start, end = (int(start) if start else None), (int(end) if end else None)
    trees = corpus.itertrees(start, end)
    if '--maxlen' in opts:
        maxlen = int(opts['--maxlen'])
        trees = ((key, (tree, sent)) for key, (tree, sent) in trees
                 if len(sent) <= maxlen)
    if '--renumber' in opts:
        trees = (('%8d' % n, treesent)
                 for n, (_, treesent) in enumerate(trees, 1))

    # select transformation
    transform = actions[action]
    if action in ('binarize', 'optimalbinarize'):
        h = int(opts.get('-h', 999))
        v = int(opts.get('-v', 1))
        if action == 'binarize':
            factor = opts.get('--factor', 'right')
            transform = lambda t, _: binarize(
                t, factor, h, v,
                leftmostunary='--leftunary' in opts,
                rightmostunary='--rightunary' in opts,
                tailmarker='$' if '--tailmarker' in opts else '')
        elif action == 'optimalbinarize':
            headdriven = '--headrules' in opts
            transform = lambda t, _: optimalbinarize(
                t, '|', headdriven, h, v)
    elif action == 'splitdisc':
        transform = lambda t, _: splitdiscnodes(t, '--markorigin' in opts)
    elif action == 'unbinarize':
        transform = lambda t, _: unbinarize(Tree.convert(t))
    elif action == 'transform':
        tfs = opts['--transforms'].split(',')
        transform = lambda t, s: (
            treebanktransforms.reversetransform(t, tfs)
            if '--reverse' in opts
            else treebanktransforms.transform(t, s, tfs))
    if transform is not None:
        # NB: transform cannot affect (no. of) terminals
        trees = ((key, (transform(tree, sent), sent))
                 for key, (tree, sent) in trees)

    # read, transform, & write trees
    headrules = None
    if opts.get('--outputfmt') in ('mst', 'conll'):
        if not opts.get('--headrules'):
            raise ValueError('need head rules for dependency conversion')
        headrules = treebanktransforms.readheadrules(opts.get('--headrules'))
    cnt = 0
    if opts.get('--outputfmt') == 'dact':
        import alpinocorpus
        outfile = alpinocorpus.CorpusWriter(outfilename)
        # copy blocks verbatim when nothing but slicing/renumbering is
        # requested and the input is already in an Alpino format
        if (action == 'none'
                and opts.get('--inputfmt') in ('alpino', 'dact')
                and set(opts) <= {'--slice', '--inputfmt', '--outputfmt',
                                  '--renumber'}):
            for n, (key, block) in islice(
                    enumerate(corpus.blocks().items(), 1), start, end):
                outfile.write(
                    '%8d' % n if '--renumber' in opts else key, block)
                cnt += 1
        else:
            for key, (tree, sent) in trees:
                outfile.write(str(key), writetree(tree, sent, key, 'alpino'))
                cnt += 1
    else:
        # The option is stored under '--outputenc' (getopt keeps the
        # leading dashes); looking up 'outputenc' always gave the default.
        encoding = opts.get('--outputenc', 'utf-8')
        # The context manager makes sure the output is flushed and closed,
        # also when writing a tree raises an exception.
        with io.open(outfilename, 'w', encoding=encoding) as outfile:
            # copy trees verbatim when only taking slice or converting
            # encoding
            if (action == 'none'
                    and opts.get('--inputfmt') == opts.get('--outputfmt')
                    and set(opts) <= {'--slice', '--inputenc',
                                      '--outputenc', '--inputfmt',
                                      '--outputfmt'}):
                for block in islice(corpus.blocks().values(), start, end):
                    outfile.write(block)
                    cnt += 1
            else:
                for key, (tree, sent) in trees:
                    outfile.write(writetree(
                        tree, sent, key,
                        opts.get('--outputfmt', 'export'), headrules))
                    cnt += 1
    print('%sed %d trees with action %r' % (
        'convert' if action == 'none' else 'transform', cnt, action),
        file=sys.stderr)