Beispiel #1
0
	def getsubtree(node, parent, morphology, lemmas):
		""" Build the tree for a single node of an Alpino tree, recursively.

		:param node: element whose attributes are read through ``get`` and
			``keys`` and whose children are visited by iterating over it.
		:param parent: tree of the enclosing constituent; it is only stored
			(together with the node's ``rel``) when ``node`` is a bare
			index reference without ``cat`` or ``word``.
		:param morphology: passed on unchanged to ``handlemorphology``.
		:param lemmas: passed on unchanged to ``handlemorphology``.
		:returns: a ParentedTree carrying its fields in ``.source``, or None
			for a bare index reference and for a constituent that ends up
			without any children. """
		# FIXME: proper representation for arbitrary features
		attributes = node.keys()
		fields = [''] * len(FIELDS)
		fields[WORD] = node.get('word') or ("#%s" % node.get('id'))
		fields[LEMMA] = node.get('lemma') or node.get('root')
		fields[MORPH] = node.get('postag') or node.get('frame')
		fields[FUNC] = node.get('rel')
		if 'cat' in attributes:
			# phrasal node: the category doubles as tag and (uppercased) label
			category = node.get('cat')
			fields[TAG] = category
			index = node.get('index')
			if index:
				coindexed[index] = fields
			result = ParentedTree(category.upper(), [])
			for child in node:
				subtree = getsubtree(child, result, morphology, lemmas)
				if not subtree:
					continue
				childattrs = child.keys()
				if 'word' in childattrs or 'cat' in childattrs:
					subtree.source[PARENT] = node.get('id')
					result.append(subtree)
			if len(result) == 0:
				return None
		elif 'word' in attributes:
			# terminal node: children are the token positions begin..end-1
			fields[TAG] = node.get('pt') or node.get('pos')
			index = node.get('index')
			if index:
				coindexed[index] = fields
			begin, end = int(node.get('begin')), int(node.get('end'))
			result = ParentedTree('', list(range(begin, end)))
			handlemorphology(morphology, lemmas, result, fields)
		elif 'index' in attributes:
			# bare reference: remember relation and parent under its index
			coindexation[node.get('index')].extend(
					(node.get('rel'), parent))
			return None
		result.source = fields
		return result
Beispiel #2
0
def convert_tree(htree, vroot="VROOT"):
    """
    Convert a constituent tree into a ``ParentedTree`` plus its sentence.

    Every node of ``htree`` becomes a ``ParentedTree`` with a six-field
    ``source`` tuple. Category nodes start without children; terminal nodes
    hold their position in ``htree.full_yield()`` as single leaf. All roots
    are always placed under a fresh node labelled ``vroot`` (also when there
    is a single root), and nodes reported as disconnected are attached to
    that node as well.

    :param htree: the tree to convert
    :type htree: ConstituentTree
    :param vroot: label of the artificial root node
    :type vroot: str
    :return: the converted tree and the token forms of the full yield
    :rtype: Tuple[ParentedTree, List]
    """
    nodes = {}
    for idx in htree.nodes():
        token = htree.node_token(idx)
        if token.type() == "CONSTITUENT-CATEGORY":
            nodes[idx] = ParentedTree(token.category(), [])
            nodes[idx].source = (token.category(), '--', '--', '--', token.edge(), '--')
        elif token.type() == "CONSTITUENT-TERMINAL":
            nodes[idx] = ParentedTree(token.pos(), [htree.full_yield().index(idx)])
            nodes[idx].source = (token.form(), '--', token.pos(), token.morph_feats(), token.edge(), '--')

    # The original condition was `True or len(htree.root) > 1`, so the
    # single-root shortcut could never run; the virtual root is unconditional.
    tree = ParentedTree(vroot, [nodes[r] for r in htree.root])

    for idx in htree.nodes():
        for c_idx in htree.children(idx):
            nodes[idx].append(nodes[c_idx])
        if htree.disconnected(idx):
            tree.append(nodes[idx])

    # handlefunctions(action='between', tree=tree)

    sent = [token.form() for token in htree.full_token_yield()]

    return tree, sent
Beispiel #3
0
def brackettree(treestr, sent, brackets, strtermre):
	"""Read one tree given in bracket or discbracket notation.

	:param treestr: the bracketed tree as a string.
	:param sent: for trees with integer leaves, the space-separated tokens
		(optionally followed by a tab/newline and further text); when
		``strtermre`` matches, only its stripped value is returned as rest.
	:param brackets: forwarded to ``ParentedTree.parse``.
	:param strtermre: compiled pattern; a match in ``treestr`` means the
		terminals are words instead of indices.
	:returns: tuple ``(tree, sent, rest)`` where ``sent`` is a list of
		unquoted tokens."""
	if not strtermre.search(treestr):
		# disc. trees with integer indices as terminals
		tree = ParentedTree.parse(treestr, parse_leaf=int,
				brackets=brackets)
		rest = ''
		if sent.strip():
			maxleaf = max(tree.leaves())
			tokens = sent.strip('\n\r\t').split(' ', maxleaf)
			last = tokens[-1]
			# anything after the first tab/newline in the last token
			# does not belong to the sentence
			cuts = [last.index(char) for char in '\t\n\r' if char in last]
			if cuts:
				cut = min(cuts)
				tokens[-1], rest = last[:cut], last[cut + 1:]
		else:
			# no sentence given: use the indices themselves as tokens
			tokens = map(str, range(max(tree.leaves()) + 1))
	else:
		# terminals are not all indices
		rest = sent.strip()
		tokens, counter = [], count()

		def substleaf(word):
			"""Store the word and hand back its position."""
			tokens.append(word)
			return next(counter)

		prepared = FRONTIERNTRE.sub(' -FRONTIER-)', treestr)
		tree = ParentedTree.parse(prepared,
				parse_leaf=substleaf, brackets=brackets)
	return tree, [unquote(token) for token in tokens], rest
Beispiel #4
0
	def _parsetree(self, block):
		"""Parse a block and apply the configured tree transformations.

		Reads ``removeempty``, ``ensureroot``, ``punct``, ``headrules``,
		``headfinal``, ``reverse`` and ``markheads`` from ``self``.

		:param block: raw representation of one tree, passed to
			``self._parse``.
		:returns: a tuple ``(tree, sent)`` with the transformed parse tree
			and its sentence; when the sentence is empty the parse result
			is returned without any transformation."""
		tree, sent = self._parse(block)
		if not sent:
			# Return the same (tree, sent) pair as every other path; the
			# bare tree returned before contradicted the documented contract.
			return tree, sent
		if self.removeempty:
			removeemptynodes(tree, sent)
		if self.ensureroot and tree.label != self.ensureroot:
			tree = ParentedTree(self.ensureroot, [tree])
		if not isinstance(self, BracketCorpusReader):
			# roughly order constituents by order in sentence
			for a in reversed(list(tree.subtrees(lambda x: len(x) > 1))):
				a.sort(key=Tree.leaves)
		if self.punct == 'remove':
			punctremove(tree, sent)
		elif self.punct == 'move' or self.punct == 'moveall':
			punctraise(tree, sent, self.punct == 'moveall')
			balancedpunctraise(tree, sent)
			# restore order
			for a in reversed(list(tree.subtrees(lambda x: len(x) > 1))):
				a.sort(key=Tree.leaves)
		elif self.punct == 'root':
			punctroot(tree, sent)
		if self.headrules:
			# only nodes whose first child is itself a tree
			for node in tree.subtrees(lambda n: n and isinstance(n[0], Tree)):
				sethead(headfinder(node, self.headrules))
				headorder(node, self.headfinal, self.reverse)
				if self.markheads:
					headmark(node)
		return tree, sent
Beispiel #5
0
	def getchildren(parent):
		""" Recursively collect the subtrees below ``parent``.

		:param parent: key into ``children``, whose entries are pairs of a
			position in the block and the fields of that line.
		:returns: list of ParentedTree objects, each with its fields stored
			as a tuple in ``.source``. """
		subtrees = []
		for position, fields in children[parent]:
			# position is the index in the block to record word indices
			nonterminal = EXPORTNONTERMINAL.match(fields[WORD])
			if nonterminal is None:  # POS + terminal
				subtree = ParentedTree('', [position])
				handlemorphology(morphology, lemmas, subtree, fields)
			else:
				below = getchildren(nonterminal.group(1))
				subtree = ParentedTree(fields[TAG], below)
			subtree.source = tuple(fields)
			subtrees.append(subtree)
		return subtrees
Beispiel #6
0
	def _parse(self, block):
		"""Parse a bracketed tree, numbering its leaves from left to right.

		A tree whose root label is neither 'TOP' nor 'ROOT' is wrapped in a
		new 'TOP' node.

		:returns: tuple of the tree and ``self._word(block, orig=True)``."""
		leafids = count()

		def nextindex(_leaf):
			"""Ignore the leaf text and return the next running index."""
			return next(leafids)

		tree = ParentedTree.parse(block, parse_leaf=nextindex)
		rootlabels = ('TOP', 'ROOT')
		if tree.label not in rootlabels:
			tree = ParentedTree('TOP', [tree])
		return tree, self._word(block, orig=True)
Beispiel #7
0
	def _parse(self, block):
		"""Parse the tree before the first tab and validate its leaves.

		:returns: tuple of the tree and ``self._word(block, orig=True)``.
		:raises ValueError: if a leaf is not a valid index into the
			sentence."""
		treestr = block.partition("\t")[0]
		tree = ParentedTree.parse(treestr, parse_leaf=int)
		sent = self._word(block, orig=True)
		outofrange = any(n < 0 or n >= len(sent) for n in tree.leaves())
		if outofrange:
			details = (len(sent), tree.leaves(), sent)
			raise ValueError('All leaves must be in the interval 0..n with '
					'n=len(sent)\ntokens: %d indices: %r\nsent: %s' % details)
		return tree, sent
Beispiel #8
0
	def _parse(self, block):
		"""Parse a block after replacing each LEAVESRE match by an index.

		Matches are numbered 0, 1, 2, ... in order of occurrence, so that the
		leaves can be parsed as integers.

		:returns: tuple of the tree and ``self._word(block, orig=True)``."""
		leafids = count()

		def numbered(_match):
			"""Produce the replacement text with the next index."""
			return ' %d)' % next(leafids)

		indexed = LEAVESRE.sub(numbered, block)
		tree = ParentedTree.parse(indexed, parse_leaf=int)
		# TODO: parse Penn TB functions and traces, put into .source attribute
		if self.functions == 'remove':
			handlefunctions(self.functions, tree)
		return tree, self._word(block, orig=True)
Beispiel #9
0
	def test_balancedpunctraise(self):
		"""Check the maximum fan-out after raising punctuation.

		Applies ``punctraise`` followed by ``balancedpunctraise`` to a tree
		whose punctuation hangs under ROOT and asserts that the largest
		fan-out is 1; the same is asserted for a punctuation-free tree."""
		sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
				". . . ' , die Hayko Siemens musikalisch leitet , bietet "
				"wieder ungewoehnliche Kombinationen .".split())
		tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
				' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
				' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
				' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
				' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
				' ($. 25))', parse_leaf=int)
		punctraise(tree, sent)
		balancedpunctraise(tree, sent)
		raisedfanouts = [fanout(node) for node in addbitsets(tree).subtrees()]
		assert max(raisedfanouts) == 1

		# reference tree without any punctuation
		nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
				'(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
				'(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
				'(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
		plainfanouts = [fanout(node) for node in addbitsets(nopunct).subtrees()]
		assert max(plainfanouts) == 1
Beispiel #10
0
	def test_balancedpunctraise(self):
		"""Raising punctuation must leave the largest fan-out at 1.

		The first tree has its punctuation attached to ROOT and is run
		through ``punctraise`` and ``balancedpunctraise``; the second tree
		contains no punctuation and is checked as is."""
		withpunct = ParentedTree.parse(
				'(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
				' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
				' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
				' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
				' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
				' ($. 25))', parse_leaf=int)
		words = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
				". . . ' , die Hayko Siemens musikalisch leitet , bietet "
				"wieder ungewoehnliche Kombinationen .".split())
		punctraise(withpunct, words)
		balancedpunctraise(withpunct, words)
		widest = max(fanout(node) for node in addbitsets(withpunct).subtrees())
		assert widest == 1

		nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
				'(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
				'(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
				'(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
		widest = max(fanout(node) for node in addbitsets(nopunct).subtrees())
		assert widest == 1
Beispiel #11
0
	def _parse(self, block):
		"""Parse the tree found before the first tab of ``block``.

		Leaves are converted with ``int``.

		:returns: tuple of the tree and ``self._word(block, orig=True)``."""
		treestr = block.partition("\t")[0]
		tree = ParentedTree.parse(treestr, parse_leaf=int)
		return tree, self._word(block, orig=True)
Beispiel #12
0
# Read the configuration file named by the first command line argument.
config.read(argv[1])

# Training portion of the corpus; its sentences are parsed and scored below.
data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")

from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar

# Close the grammar file as soon as it has been read instead of leaking
# the handle for the rest of the run.
# NOTE(review): `load` is defined outside this view; if it is pickle.load,
# the grammar file must come from a trusted source.
with open(f"{config['Corpus']['filename']}.grammar", "rb") as grammarfile:
    grammar = load(grammarfile)
evaluator = Evaluator(readparam("proper.prm"))
for sentid, sentence in enumerate(data):
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    # one gold supertag per token, each given weight 0.0
    tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        # no derivation found: fall back to a flat tree over the pos tags
        leaves = (f"({pos} {idx})" for idx, pos in enumerate(poss))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    gold = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(parse))))
    evaluator.add(sentid, gold.copy(deep=True), list(words),
                  parse.copy(deep=True), list(words))
print(evaluator.summary())
Beispiel #13
0
    def evaluate(self,
                 sentences: SupertagParseDataset,
                 mini_batch_size: int = 32,
                 num_workers: int = 1,
                 embedding_storage_mode: str = "none",
                 out_path=None,
                 only_disc: str = "both",
                 accuracy: str = "both",
                 pos_accuracy: bool = True,
                 return_loss: bool = True) -> Tuple[Result, float]:
        """ Predicts supertags, pos tags and parse trees, and reports the
            prediction scores for a set of sentences.
            :param sentences: a ``DataSet`` of sentences. For each sentence
                a gold parse tree is expected as value of the `tree` label, as
                provided by ``SupertagParseDataset``.
            :param mini_batch_size: batch size handed to the ``DataLoader``.
            :param num_workers: number of workers handed to the
                ``DataLoader``.
            :param embedding_storage_mode: passed on to ``self.predict``.
            :param out_path: not used in this method.
            :param only_disc: "both" scores all constituents and
                discontinuous constituents separately; "param" takes
                `DISC_ONLY` from ``self.evalparam``; "true" evaluates only
                discontinuous constituents; any other value evaluates all
                constituents.
            :param accuracy: either 'none', 'best', 'kbest' or 'both'.
                Determines if the accuracy is computed from the best, or k-best
                predicted tags.
            :param pos_accuracy: if set, reports acc. of predicted pos tags.
            :param return_loss: if set, nll loss wrt. gold tags is reported,
                otherwise the second component in the returned tuple is 0.
            :returns: tuple with evaluation ``Result``, where the main score
                is the f1-score (for all constituents, if only_disc == "both"),
                and the accumulated loss divided by the number of batches.
            :raises Exception: if no evaluator parameters were specified.
        """
        from flair.datasets import DataLoader
        from discodop.tree import ParentedTree, Tree
        from discodop.treetransforms import unbinarize, removefanoutmarkers
        from discodop.eval import Evaluator, readparam
        from timeit import default_timer
        from collections import Counter

        if self.__evalparam__ is None:
            raise Exception(
                "Need to specify evaluator parameter file before evaluating")
        # one evaluator per reported F1 variant, keyed by the score name
        if only_disc == "both":
            evaluators = {
                "F1-all": Evaluator({
                    **self.evalparam, "DISC_ONLY": False
                }),
                "F1-disc": Evaluator({
                    **self.evalparam, "DISC_ONLY": True
                })
            }
        else:
            mode = self.evalparam["DISC_ONLY"] if only_disc == "param" else (
                only_disc == "true")
            strmode = "F1-disc" if mode else "F1-all"
            evaluators = {
                strmode: Evaluator({
                    **self.evalparam, "DISC_ONLY": mode
                })
            }

        data_loader = DataLoader(sentences,
                                 batch_size=mini_batch_size,
                                 num_workers=num_workers)

        # predict supertags and parse trees
        eval_loss = 0
        start_time = default_timer()
        for batch in data_loader:
            loss = self.predict(batch,
                                embedding_storage_mode=embedding_storage_mode,
                                supertag_storage_mode=accuracy,
                                postag_storage_mode=pos_accuracy,
                                label_name='predicted',
                                return_loss=return_loss)
            eval_loss += loss if return_loss else 0
        end_time = default_timer()

        # second pass over the data: compare the stored predictions with the
        # gold annotation (only the prediction pass above is timed)
        i = 0
        batches = 0
        noparses = 0
        acc_ctr = Counter()
        for batch in data_loader:
            for sentence in batch:
                for token in sentence:
                    # kbest: gold supertag occurs among the predicted tags
                    if accuracy in ("kbest", "both") and token.get_tag("supertag").value in \
                            (l.value for l in token.get_tags_proba_dist('predicted-supertag')):
                        acc_ctr["kbest"] += 1
                    # best: gold supertag equals the single predicted tag
                    if accuracy in ("best", "both") and token.get_tag("supertag").value == \
                            token.get_tag('predicted-supertag').value:
                        acc_ctr["best"] += 1
                    if pos_accuracy and token.get_tag(
                            "pos").value == token.get_tag(
                                "predicted-pos").value:
                        acc_ctr["pos"] += 1
                acc_ctr["all"] += len(sentence)
                sent = [token.text for token in sentence]
                gold = Tree(sentence.get_labels("tree")[0].value)
                gold = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(gold)))
                parse = Tree(sentence.get_labels("predicted")[0].value)
                parse = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(parse)))
                # a predicted tree rooted in NOPARSE counts against coverage
                if parse.label == "NOPARSE":
                    noparses += 1
                # every evaluator receives its own copies of trees and tokens
                for evaluator in evaluators.values():
                    evaluator.add(i, gold.copy(deep=True), list(sent),
                                  parse.copy(deep=True), list(sent))
                i += 1
            batches += 1
        scores = {
            strmode: float_or_zero(evaluator.acc.scores()['lf'])
            for strmode, evaluator in evaluators.items()
        }
        # NOTE(review): the divisions below raise ZeroDivisionError when
        # `sentences` is empty (no tokens, no sentences, no batches).
        if accuracy in ("both", "kbest"):
            scores["accuracy-kbest"] = acc_ctr["kbest"] / acc_ctr["all"]
        if accuracy in ("both", "best"):
            scores["accuracy-best"] = acc_ctr["best"] / acc_ctr["all"]
        if pos_accuracy:
            scores["accuracy-pos"] = acc_ctr["pos"] / acc_ctr["all"]
        scores["coverage"] = 1 - (noparses / i)
        scores["time"] = end_time - start_time
        # Result arguments: main score, tab-separated score names,
        # tab-separated score values, and the evaluator summaries
        return (Result(
            scores['F1-all'] if 'F1-all' in scores else scores['F1-disc'],
            "\t".join(f"{mode}" for mode in scores),
            "\t".join(f"{s}" for s in scores.values()),
            '\n\n'.join(evaluator.summary()
                        for evaluator in evaluators.values())),
                eval_loss / batches)
Beispiel #14
0
def doparsing(**kwds):
	"""Parse a set of sentences using worker processes.

	The keyword arguments override the defaults set below; ``parser``,
	``testset`` and ``evalparam`` are read from the parameters without a
	default and therefore have to be supplied. With ``numproc=1`` the
	sentences are parsed in the current process, otherwise in a
	``multiprocessing.Pool``. Per-sentence and running scores are written
	to the debug log and ``writeresults`` is called at the end.

	:returns: a list with one result object per parser stage, holding
		parse trees, probabilities, fragment counts, elapsed times, an
		evaluator and the number of sentences without a parse."""
	params = parser.DictObj(usetags=True, numproc=None, tailmarker='',
		category=None, deletelabel=(), deleteword=(), corpusfmt='export')
	params.update(kwds)
	results = [parser.DictObj(name=stage.name)
			for stage in params.parser.stages]
	# per-stage bookkeeping, keyed by sentence id
	for result in results:
		result.update(parsetrees=dict.fromkeys(params.testset),
				probs=dict.fromkeys(params.testset, float('nan')),
				frags=dict.fromkeys(params.testset, 0),
				elapsedtime=dict.fromkeys(params.testset),
				evaluator=evalmod.Evaluator(params.evalparam), noparse=0)
	if params.numproc == 1:
		# single process: initialise here and parse lazily
		initworker(params)
		dowork = (worker(a) for a in params.testset.items())
	else:
		pool = multiprocessing.Pool(processes=params.numproc,
				initializer=initworker, initargs=(params,))
		# results arrive in arbitrary order; each carries its sentence id
		dowork = pool.imap_unordered(worker, params.testset.items())
	logging.info('going to parse %d sentences.', len(params.testset))
	# main parse loop over each sentence in test corpus
	for nsent, data in enumerate(dowork, 1):
		sentid, sentresults = data
		sent, goldtree, goldsent, _ = params.testset[sentid]
		# keep only the first element of each pair in goldsent
		goldsent = [w for w, _t in goldsent]
		logging.debug('%d/%d (%s). [len=%d] %s\n',
				nsent, len(params.testset), sentid, len(sent),
				' '.join(goldsent))
		for n, result in enumerate(sentresults):
			# every sentence may be recorded only once per stage
			assert (results[n].parsetrees[sentid] is None
					and results[n].elapsedtime[sentid] is None)
			results[n].parsetrees[sentid] = result.parsetree
			if isinstance(result.prob, tuple):
				# tuple: first float gives the probability, first int the
				# number of fragments
				results[n].probs[sentid] = [log(a) for a in result.prob
						if isinstance(a, float)][0]
				results[n].frags[sentid] = [abs(a) for a in result.prob
						if isinstance(a, int)][0]
			elif isinstance(result.prob, float):
				try:
					results[n].probs[sentid] = log(result.prob)
				except ValueError:
					# log() rejected the value; store 300.0 as placeholder
					results[n].probs[sentid] = 300.0
			if result.fragments is not None:
				results[n].frags[sentid] = len(result.fragments)
			results[n].elapsedtime[sentid] = result.elapsedtime
			if result.noparse:
				results[n].noparse += 1

			# score this parse against a copy of the gold tree
			sentmetrics = results[n].evaluator.add(sentid,
					goldtree.copy(True), goldsent,
					ParentedTree.convert(result.parsetree), goldsent)
			msg = result.msg
			if sentmetrics.scores()['LF'] == '100.00':
				msg += '\texact match'
			else:
				msg += '\tLP %(LP)s LR %(LR)s LF %(LF)s' % sentmetrics.scores()
				try:
					msg += '\n\t' + sentmetrics.bracketings()
				except Exception as err:
					msg += 'PROBLEM bracketings:\n%s\n%s' % (
							result.parsetree, err)
			msg += '\n'
			# the tree is only drawn for the last stage
			if n + 1 == len(sentresults):
				try:
					msg += sentmetrics.visualize()
				except Exception as err:
					msg += 'PROBLEM drawing tree:\n%s\n%s' % (
							sentmetrics.ctree, err)
			logging.debug(msg)
		# running totals per stage over the sentences seen so far
		msg = ''
		for n, result in enumerate(sentresults):
			metrics = results[n].evaluator.acc.scores()
			msg += ('%(name)s cov %(cov)5.2f; tag %(tag)s; ex %(ex)s; '
					'lp %(lp)s; lr %(lr)s; lf %(lf)s\n' % dict(
					name=result.name.ljust(7),
					cov=100 * (1 - results[n].noparse / nsent),
					**metrics))
		logging.debug(msg)
	if params.numproc != 1:
		# shut down the worker processes
		pool.terminate()
		pool.join()
		del dowork, pool

	writeresults(results, params)
	return results