def getsubtree(node, parent, morphology, lemmas): """ Parse a subtree of an Alpino tree. """ # FIXME: proper representation for arbitrary features source = [''] * len(FIELDS) source[WORD] = node.get('word') or ("#%s" % node.get('id')) source[LEMMA] = node.get('lemma') or node.get('root') source[MORPH] = node.get('postag') or node.get('frame') source[FUNC] = node.get('rel') if 'cat' in node.keys(): source[TAG] = node.get('cat') if node.get('index'): coindexed[node.get('index')] = source label = node.get('cat') result = ParentedTree(label.upper(), []) for child in node: subtree = getsubtree(child, result, morphology, lemmas) if subtree and ( 'word' in child.keys() or 'cat' in child.keys()): subtree.source[PARENT] = node.get('id') result.append(subtree) if not len(result): return None elif 'word' in node.keys(): source[TAG] = node.get('pt') or node.get('pos') if node.get('index'): coindexed[node.get('index')] = source result = ParentedTree('', list( range(int(node.get('begin')), int(node.get('end'))))) handlemorphology(morphology, lemmas, result, source) elif 'index' in node.keys(): coindexation[node.get('index')].extend( (node.get('rel'), parent)) return None result.source = source return result
def convert_tree(htree, vroot="VROOT"):
    """Convert a constituent tree into a ParentedTree plus token list.

    :param htree: the tree to convert.
    :type htree: ConstituentTree
    :param vroot: label for the artificial root node.
    :type vroot: str
    :returns: the converted tree and the sentence (list of token forms).
    :rtype: Tuple[ParentedTree, List]
    """
    nodes = {}
    for idx in htree.nodes():
        token = htree.node_token(idx)
        if token.type() == "CONSTITUENT-CATEGORY":
            nodes[idx] = ParentedTree(token.category(), [])
            # source fields: (word, lemma, tag, morph, edge, parent)
            nodes[idx].source = (token.category(), '--', '--', '--',
                    token.edge(), '--')
        elif token.type() == "CONSTITUENT-TERMINAL":
            # terminal leaves carry the token's position in the full yield
            nodes[idx] = ParentedTree(token.pos(),
                    [htree.full_yield().index(idx)])
            nodes[idx].source = (token.form(), '--', token.pos(),
                    token.morph_feats(), token.edge(), '--')
    # Always wrap the roots in an artificial top node. The original code
    # guarded this with `if True or len(htree.root) > 1`, which made the
    # single-root else-branch unreachable dead code; it is removed here
    # without changing behavior.
    tree = ParentedTree(vroot, [nodes[r] for r in htree.root])
    for idx in htree.nodes():
        for c_idx in htree.children(idx):
            nodes[idx].append(nodes[c_idx])
        # disconnected nodes are attached directly under the root
        if htree.disconnected(idx):
            tree.append(nodes[idx])
    # handlefunctions(action='between', tree=tree)
    sent = [token.form() for token in htree.full_token_yield()]
    return tree, sent
def brackettree(treestr, sent, brackets, strtermre):
    """Parse a single tree presented in (disc)bracket format.

    in the 'bracket' case ``sent`` is ignored.

    :param treestr: the tree as a bracketed string.
    :param sent: space-separated tokens; only consulted when terminals
        are integer indices (discbracket format).
    :param brackets: the pair of bracket characters, e.g. ``'()'``.
    :param strtermre: compiled regex that matches a non-index terminal;
        its presence selects the plain-bracket branch.
    :returns: tuple ``(tree, sent, rest)`` where ``rest`` is any trailing
        text after the sentence tokens.
    """
    if strtermre.search(treestr):  # terminals are not all indices
        rest = sent.strip()
        sent, cnt = [], count()

        def substleaf(x):
            """Collect word and return index."""
            sent.append(x)
            return next(cnt)

        # NOTE(review): FRONTIERNTRE (defined elsewhere) apparently
        # matches empty terminals, which are replaced by a -FRONTIER-
        # placeholder before parsing -- confirm its pattern.
        tree = ParentedTree.parse(FRONTIERNTRE.sub(' -FRONTIER-)', treestr),
                parse_leaf=substleaf, brackets=brackets)
    else:  # disc. trees with integer indices as terminals
        tree = ParentedTree.parse(treestr, parse_leaf=int,
                brackets=brackets)
        if sent.strip():
            maxleaf = max(tree.leaves())
            # split off exactly maxleaf+1 tokens; the final element may
            # still contain trailing text separated by tab/newline
            sent, rest = sent.strip('\n\r\t').split(' ', maxleaf), ''
            sep = [sent[-1].index(b) for b in '\t\n\r' if b in sent[-1]]
            if sep:
                # cut the last token at the earliest separator and keep
                # what follows as the remainder
                sent[-1], rest = sent[-1][:min(sep)], sent[-1][min(sep) + 1:]
        else:
            # no sentence given: use the leaf indices themselves as tokens
            sent, rest = map(str, range(max(tree.leaves()) + 1)), ''
    sent = [unquote(a) for a in sent]
    return tree, sent, rest
def _parsetree(self, block):
    """:returns: a transformed parse tree and sentence."""
    tree, sent = self._parse(block)
    if not sent:
        # NOTE(review): this path returns only the tree while every other
        # path returns (tree, sent) -- inconsistent with the docstring;
        # confirm callers handle the bare-tree case before changing it.
        return tree
    if self.removeempty:
        removeemptynodes(tree, sent)
    if self.ensureroot and tree.label != self.ensureroot:
        # wrap in the required root label
        tree = ParentedTree(self.ensureroot, [tree])
    if not isinstance(self, BracketCorpusReader):
        # roughly order constituents by order in sentence
        for a in reversed(list(tree.subtrees(lambda x: len(x) > 1))):
            a.sort(key=Tree.leaves)
    if self.punct == 'remove':
        punctremove(tree, sent)
    elif self.punct == 'move' or self.punct == 'moveall':
        punctraise(tree, sent, self.punct == 'moveall')
        balancedpunctraise(tree, sent)
        # restore order
        for a in reversed(list(tree.subtrees(lambda x: len(x) > 1))):
            a.sort(key=Tree.leaves)
    elif self.punct == 'root':
        punctroot(tree, sent)
    if self.headrules:
        # mark/reorder heads for every constituent whose first child is
        # itself a tree (i.e. non-preterminal nodes)
        for node in tree.subtrees(lambda n: n and isinstance(n[0], Tree)):
            sethead(headfinder(node, self.headrules))
            headorder(node, self.headfinal, self.reverse)
            if self.markheads:
                headmark(node)
    return tree, sent
def getchildren(parent):
    """Traverse tree in export format and create Tree object."""
    subtrees = []
    for idx, fields in children[parent]:
        # idx is the index in the block to record word indices
        match = EXPORTNONTERMINAL.match(fields[WORD])
        if match is None:  # POS + terminal
            node = ParentedTree('', [idx])
            handlemorphology(morphology, lemmas, node, fields)
        else:
            # non-terminal: recurse into its children by node number
            node = ParentedTree(fields[TAG], getchildren(match.group(1)))
        node.source = tuple(fields)
        subtrees.append(node)
    return subtrees
def _parse(self, block):
    """Parse a block; terminals are renumbered 0..n-1 left to right."""
    counter = count()

    def renumber(_leaf):
        # replace each terminal token with the next running index
        return next(counter)

    tree = ParentedTree.parse(block, parse_leaf=renumber)
    # ensure a canonical root label
    if tree.label not in ('TOP', 'ROOT'):
        tree = ParentedTree('TOP', [tree])
    words = self._word(block, orig=True)
    return tree, words
def _parse(self, block):
    """Parse the tree before the first tab; leaves are token indices.

    :raises ValueError: if any leaf index falls outside the sentence.
    """
    tree = ParentedTree.parse(block.split("\t", 1)[0], parse_leaf=int)
    sent = self._word(block, orig=True)
    for leaf in tree.leaves():
        if leaf < 0 or leaf >= len(sent):
            raise ValueError('All leaves must be in the interval 0..n with '
                    'n=len(sent)\ntokens: %d indices: %r\nsent: %s' % (
                    len(sent), tree.leaves(), sent))
    return tree, sent
def _parse(self, block):
    """Parse a bracketed block, substituting running indices for leaves."""
    indices = count()

    def numberleaf(_match):
        # rewrite each matched terminal as ' <index>)'
        return ' %d)' % next(indices)

    renumbered = LEAVESRE.sub(numberleaf, block)
    tree = ParentedTree.parse(renumbered, parse_leaf=int)
    # TODO: parse Penn TB functions and traces, put into .source attribute
    if self.functions == 'remove':
        handlefunctions(self.functions, tree)
    sent = self._word(block, orig=True)
    return tree, sent
def test_balancedpunctraise(self):
    """Raising punctuation into the tree must remove all discontinuity
    (fan-out 1 for every node)."""
    # discontinuous tree: punctuation tokens attached at the root
    tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
            ' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
            ' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
            ' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
            ' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
            ' ($. 25))', parse_leaf=int)
    sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
            ". . . ' , die Hayko Siemens musikalisch leitet , bietet "
            "wieder ungewoehnliche Kombinationen .".split())
    punctraise(tree, sent)
    balancedpunctraise(tree, sent)
    # after raising, every subtree should be continuous
    assert max(map(fanout, addbitsets(tree).subtrees())) == 1
    # sanity check: the same structure without punctuation is already
    # continuous
    nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
            '(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
            '(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
            '(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))',
            parse_leaf=int)
    assert max(map(fanout, addbitsets(nopunct).subtrees())) == 1
def _parse(self, block):
    """Parse the tree portion of *block* (text before the first tab)."""
    treestr = block.split("\t", 1)[0]
    tree = ParentedTree.parse(treestr, parse_leaf=int)
    words = self._word(block, orig=True)
    return tree, words
# Script: parse every sentence of the training split with a supertag
# grammar and evaluate the predicted trees against the gold trees.
config.read(argv[1])
data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")

from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar

# NOTE(review): `load` is presumably pickle.load (imported elsewhere);
# the file handle is never closed -- consider a `with` block.
grammar = load(open(f"{config['Corpus']['filename']}.grammar", "rb"))
i = 0
evaluator = Evaluator(readparam("proper.prm"))
for sentence in data:
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    # one (gold supertag, weight 0.0) candidate per token
    tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        # no parse found: fall back to a flat NOPARSE tree over POS tags.
        # (the generator expression's `i` is local to it and does not
        # clobber the outer sentence counter)
        leaves = (f"({p} {i})" for p, i in zip(poss, range(len(words))))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    # strip binarization and fan-out markers before evaluation
    gold = ParentedTree.convert(
            unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
            unbinarize(removefanoutmarkers(Tree.convert(parse))))
    evaluator.add(i, gold.copy(deep=True), list(words),
            parse.copy(deep=True), list(words))
    i += 1
print(evaluator.summary())
def evaluate(self,
        sentences: SupertagParseDataset,
        mini_batch_size: int = 32,
        num_workers: int = 1,
        embedding_storage_mode: str = "none",
        out_path=None,
        only_disc: str = "both",
        accuracy: str = "both",
        pos_accuracy: bool = True,
        return_loss: bool = True) -> Tuple[Result, float]:
    """Predicts supertags, pos tags and parse trees, and reports the
    predictions scores for a set of sentences.

    :param sentences: a ``DataSet`` of sentences. For each sentence a
        gold parse tree is expected as value of the `tree` label, as
        provided by ``SupertagParseDataset``.
    :param only_disc: If set, overrides the setting `DISC_ONLY` in the
        evaluation parameter file ``self.evalparam``, i.e. only evaluates
        discontinuous constituents if True. Pass "both" to report both
        results.
    :param accuracy: either 'none', 'best', 'kbest' or 'both'.
        Determines if the accuracy is computed from the best, or k-best
        predicted tags.
    :param pos_accuracy: if set, reports acc. of predicted pos tags.
    :param return_loss: if set, nll loss wrt. gold tags is reported,
        otherwise the second component in the returned tuple is 0.
    :returns: tuple with evaluation ``Result``, where the main score is
        the f1-score (for all constituents, if only_disc == "both").
    """
    from flair.datasets import DataLoader
    from discodop.tree import ParentedTree, Tree
    from discodop.treetransforms import unbinarize, removefanoutmarkers
    from discodop.eval import Evaluator, readparam
    from timeit import default_timer
    from collections import Counter
    if self.__evalparam__ is None:
        raise Exception(
                "Need to specify evaluator parameter file before evaluating")
    if only_disc == "both":
        # evaluate twice: once over all constituents, once restricted to
        # discontinuous ones
        evaluators = {
                "F1-all": Evaluator({**self.evalparam, "DISC_ONLY": False}),
                "F1-disc": Evaluator({**self.evalparam, "DISC_ONLY": True})}
    else:
        # "param" keeps the setting from the parameter file; otherwise
        # the string "true"/"false" decides
        mode = self.evalparam["DISC_ONLY"] if only_disc == "param" else (
                only_disc == "true")
        strmode = "F1-disc" if mode else "F1-all"
        evaluators = {
                strmode: Evaluator({**self.evalparam, "DISC_ONLY": mode})}
    data_loader = DataLoader(sentences,
            batch_size=mini_batch_size, num_workers=num_workers)
    # predict supertags and parse trees
    eval_loss = 0
    start_time = default_timer()
    for batch in data_loader:
        loss = self.predict(batch,
                embedding_storage_mode=embedding_storage_mode,
                supertag_storage_mode=accuracy,
                postag_storage_mode=pos_accuracy,
                label_name='predicted',
                return_loss=return_loss)
        eval_loss += loss if return_loss else 0
    end_time = default_timer()
    i = 0
    batches = 0
    noparses = 0
    acc_ctr = Counter()
    for batch in data_loader:
        for sentence in batch:
            for token in sentence:
                # k-best accuracy: gold tag anywhere in the predicted
                # distribution
                if accuracy in ("kbest", "both") and \
                        token.get_tag("supertag").value in \
                        (l.value for l in
                        token.get_tags_proba_dist('predicted-supertag')):
                    acc_ctr["kbest"] += 1
                # 1-best accuracy: gold tag equals the single prediction
                if accuracy in ("best", "both") and \
                        token.get_tag("supertag").value == \
                        token.get_tag('predicted-supertag').value:
                    acc_ctr["best"] += 1
                if pos_accuracy and token.get_tag(
                        "pos").value == token.get_tag(
                        "predicted-pos").value:
                    acc_ctr["pos"] += 1
            acc_ctr["all"] += len(sentence)
            sent = [token.text for token in sentence]
            # strip binarization/fan-out markers from gold and prediction
            gold = Tree(sentence.get_labels("tree")[0].value)
            gold = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(gold)))
            parse = Tree(sentence.get_labels("predicted")[0].value)
            parse = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(parse)))
            if parse.label == "NOPARSE":
                noparses += 1
            for evaluator in evaluators.values():
                evaluator.add(i, gold.copy(deep=True), list(sent),
                        parse.copy(deep=True), list(sent))
            i += 1
        batches += 1
    scores = {
            strmode: float_or_zero(evaluator.acc.scores()['lf'])
            for strmode, evaluator in evaluators.items()}
    if accuracy in ("both", "kbest"):
        scores["accuracy-kbest"] = acc_ctr["kbest"] / acc_ctr["all"]
    if accuracy in ("both", "best"):
        scores["accuracy-best"] = acc_ctr["best"] / acc_ctr["all"]
    if pos_accuracy:
        scores["accuracy-pos"] = acc_ctr["pos"] / acc_ctr["all"]
    scores["coverage"] = 1 - (noparses / i)
    scores["time"] = end_time - start_time
    # main score is F1 over all constituents when available
    return (Result(
            scores['F1-all'] if 'F1-all' in scores else scores['F1-disc'],
            "\t".join(f"{mode}" for mode in scores),
            "\t".join(f"{s}" for s in scores.values()),
            '\n\n'.join(evaluator.summary()
                    for evaluator in evaluators.values())),
            eval_loss / batches)
def doparsing(**kwds):
    """Parse a set of sentences using worker processes.

    :param kwds: overrides for the default parameters (``usetags``,
        ``numproc``, ``testset``, ``parser``, ``evalparam``, ...).
    :returns: a list with one result object per parser stage, holding
        parse trees, probabilities, timings and an evaluator.
    """
    params = parser.DictObj(usetags=True, numproc=None, tailmarker='',
            category=None, deletelabel=(), deleteword=(),
            corpusfmt='export')
    params.update(kwds)
    # one result container per parser stage
    results = [parser.DictObj(name=stage.name)
            for stage in params.parser.stages]
    for result in results:
        result.update(parsetrees=dict.fromkeys(params.testset),
                probs=dict.fromkeys(params.testset, float('nan')),
                frags=dict.fromkeys(params.testset, 0),
                elapsedtime=dict.fromkeys(params.testset),
                evaluator=evalmod.Evaluator(params.evalparam),
                noparse=0)
    if params.numproc == 1:
        # single process: run workers lazily in this process
        initworker(params)
        dowork = (worker(a) for a in params.testset.items())
    else:
        pool = multiprocessing.Pool(processes=params.numproc,
                initializer=initworker, initargs=(params,))
        dowork = pool.imap_unordered(worker, params.testset.items())
    logging.info('going to parse %d sentences.', len(params.testset))
    # main parse loop over each sentence in test corpus
    for nsent, data in enumerate(dowork, 1):
        sentid, sentresults = data
        sent, goldtree, goldsent, _ = params.testset[sentid]
        goldsent = [w for w, _t in goldsent]
        logging.debug('%d/%d (%s). [len=%d] %s\n', nsent,
                len(params.testset), sentid, len(sent), ' '.join(goldsent))
        for n, result in enumerate(sentresults):
            # each sentence id must be filled in exactly once per stage
            assert (results[n].parsetrees[sentid] is None
                    and results[n].elapsedtime[sentid] is None)
            results[n].parsetrees[sentid] = result.parsetree
            if isinstance(result.prob, tuple):
                # mixed tuple: first float is the log probability,
                # first int (abs) is the number of fragments
                results[n].probs[sentid] = [log(a) for a in result.prob
                        if isinstance(a, float)][0]
                results[n].frags[sentid] = [abs(a) for a in result.prob
                        if isinstance(a, int)][0]
            elif isinstance(result.prob, float):
                try:
                    results[n].probs[sentid] = log(result.prob)
                except ValueError:
                    # NOTE(review): log(0) fallback; the sentinel 300.0
                    # is positive while log-probs are <= 0 -- confirm
                    # intended.
                    results[n].probs[sentid] = 300.0
            if result.fragments is not None:
                results[n].frags[sentid] = len(result.fragments)
            results[n].elapsedtime[sentid] = result.elapsedtime
            if result.noparse:
                results[n].noparse += 1
            sentmetrics = results[n].evaluator.add(sentid,
                    goldtree.copy(True), goldsent,
                    ParentedTree.convert(result.parsetree), goldsent)
            msg = result.msg
            if sentmetrics.scores()['LF'] == '100.00':
                msg += '\texact match'
            else:
                msg += '\tLP %(LP)s LR %(LR)s LF %(LF)s' % (
                        sentmetrics.scores())
                try:
                    msg += '\n\t' + sentmetrics.bracketings()
                except Exception as err:
                    msg += 'PROBLEM bracketings:\n%s\n%s' % (
                            result.parsetree, err)
            msg += '\n'
            if n + 1 == len(sentresults):
                # draw the tree once, for the last stage only
                try:
                    msg += sentmetrics.visualize()
                except Exception as err:
                    msg += 'PROBLEM drawing tree:\n%s\n%s' % (
                            sentmetrics.ctree, err)
            logging.debug(msg)
        # running per-stage summary after each sentence
        msg = ''
        for n, result in enumerate(sentresults):
            metrics = results[n].evaluator.acc.scores()
            msg += ('%(name)s cov %(cov)5.2f; tag %(tag)s; ex %(ex)s; '
                    'lp %(lp)s; lr %(lr)s; lf %(lf)s\n' % dict(
                    name=result.name.ljust(7),
                    cov=100 * (1 - results[n].noparse / nsent),
                    **metrics))
        logging.debug(msg)
    if params.numproc != 1:
        pool.terminate()
        pool.join()
        del dowork, pool
    writeresults(results, params)
    return results