def test_eval():
    """Simple sanity check; should give 100% score on all metrics."""
    from discodop.treebank import READERS
    from discodop.eval import Evaluator, readparam
    gold = READERS['export']('alpinosample.export')
    parses = READERS['export']('alpinosample.export')
    goldtrees, goldsents, candsents = gold.trees(), gold.sents(), parses.sents()
    evaluator = Evaluator(readparam(None))
    for n, ctree in parses.trees().items():
        evaluator.add(n, goldtrees[n], goldsents[n], ctree, candsents[n])
    evaluator.breakdowns()
    print(evaluator.summary())
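# Note: `readparam(None)` above uses discodop's default evaluation
# parameters; the code below instead loads "proper.prm", a parameter file
# shipped with disco-dop (specifying, e.g., which labels and punctuation
# to ignore during evaluation).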
# Evaluates a pickled supertag grammar on gold supertags.
# Usage: python <this script> <config.ini>
from sys import argv
from pickle import load  # assumption: the .grammar file is a pickle
from configparser import ConfigParser

from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar  # needed to unpickle the grammar

from supertagging.data import SupertagParseDataset

config = ConfigParser()
config.read(argv[1])

data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")
grammar = load(open(f"{config['Corpus']['filename']}.grammar", "rb"))

i = 0
evaluator = Evaluator(readparam("proper.prm"))
for sentence in data:
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    # one candidate per token: the gold supertag with weight 0.0
    tags = tuple(((t.get_tag("supertag").value, 0.0),) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        # no parse found: fall back to a flat NOPARSE tree over the pos tags
        leaves = (f"({p} {i})" for p, i in zip(poss, range(len(words))))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    gold = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(parse))))
    # assumed continuation (the excerpt ends above): score each pair
    evaluator.add(i, gold, list(words), parse, list(words))
    i += 1
print(evaluator.summary())
def evaluate(self,
        sentences: SupertagParseDataset,
        mini_batch_size: int = 32,
        num_workers: int = 1,
        embedding_storage_mode: str = "none",
        out_path=None,
        only_disc: str = "both",
        accuracy: str = "both",
        pos_accuracy: bool = True,
        return_loss: bool = True) -> Tuple[Result, float]:
    """ Predicts supertags, pos tags and parse trees, and reports the
        scores of these predictions for a set of sentences.
        :param sentences: a ``DataSet`` of sentences. For each sentence, a
            gold parse tree is expected as the value of its `tree` label,
            as provided by ``SupertagParseDataset``.
        :param only_disc: if "true" or "false", overrides the setting
            `DISC_ONLY` in the evaluation parameter file ``self.evalparam``,
            i.e. whether only discontinuous constituents are evaluated;
            "param" keeps the file's setting; "both" reports both results.
        :param accuracy: either 'none', 'best', 'kbest' or 'both'.
            Determines whether the accuracy is computed from the best
            and/or k-best predicted tags.
        :param pos_accuracy: if set, reports the accuracy of the predicted
            pos tags.
        :param return_loss: if set, the nll loss wrt. the gold tags is
            reported, otherwise the second component of the returned tuple
            is 0.
        :returns: tuple of the evaluation ``Result`` (whose main score is
            the f1-score for all constituents, if only_disc == "both")
            and the loss.
    """
    from flair.datasets import DataLoader
    from discodop.tree import ParentedTree, Tree
    from discodop.treetransforms import unbinarize, removefanoutmarkers
    from discodop.eval import Evaluator, readparam
    from timeit import default_timer
    from collections import Counter

    if self.__evalparam__ is None:
        raise Exception(
            "Need to specify evaluator parameter file before evaluating")
    if only_disc == "both":
        evaluators = {
            "F1-all": Evaluator({**self.evalparam, "DISC_ONLY": False}),
            "F1-disc": Evaluator({**self.evalparam, "DISC_ONLY": True})}
    else:
        mode = self.evalparam["DISC_ONLY"] if only_disc == "param" else (
            only_disc == "true")
        strmode = "F1-disc" if mode else "F1-all"
        evaluators = {
            strmode: Evaluator({**self.evalparam, "DISC_ONLY": mode})}

    data_loader = DataLoader(sentences,
        batch_size=mini_batch_size, num_workers=num_workers)

    # predict supertags and parse trees
    eval_loss = 0
    start_time = default_timer()
    for batch in data_loader:
        loss = self.predict(batch,
            embedding_storage_mode=embedding_storage_mode,
            supertag_storage_mode=accuracy,
            postag_storage_mode=pos_accuracy,
            label_name='predicted',
            return_loss=return_loss)
        eval_loss += loss if return_loss else 0
    end_time = default_timer()

    # compare predicted tags and trees to the gold annotations
    i = 0
    batches = 0
    noparses = 0
    acc_ctr = Counter()
    for batch in data_loader:
        for sentence in batch:
            for token in sentence:
                if accuracy in ("kbest", "both") and token.get_tag("supertag").value in \
                        (l.value for l in token.get_tags_proba_dist('predicted-supertag')):
                    acc_ctr["kbest"] += 1
                if accuracy in ("best", "both") and token.get_tag("supertag").value == \
                        token.get_tag('predicted-supertag').value:
                    acc_ctr["best"] += 1
                if pos_accuracy and token.get_tag("pos").value == \
                        token.get_tag("predicted-pos").value:
                    acc_ctr["pos"] += 1
            acc_ctr["all"] += len(sentence)
            sent = [token.text for token in sentence]
            gold = Tree(sentence.get_labels("tree")[0].value)
            gold = ParentedTree.convert(
                unbinarize(removefanoutmarkers(gold)))
            parse = Tree(sentence.get_labels("predicted")[0].value)
            parse = ParentedTree.convert(
                unbinarize(removefanoutmarkers(parse)))
            if parse.label == "NOPARSE":
                noparses += 1
            for evaluator in evaluators.values():
                evaluator.add(i, gold.copy(deep=True), list(sent),
                    parse.copy(deep=True), list(sent))
            i += 1
        batches += 1
    scores = {
        strmode: float_or_zero(evaluator.acc.scores()['lf'])
        for strmode, evaluator in evaluators.items()}
    if accuracy in ("both", "kbest"):
        scores["accuracy-kbest"] = acc_ctr["kbest"] / acc_ctr["all"]
    if accuracy in ("both", "best"):
        scores["accuracy-best"] = acc_ctr["best"] / acc_ctr["all"]
    if pos_accuracy:
        scores["accuracy-pos"] = acc_ctr["pos"] / acc_ctr["all"]
    scores["coverage"] = 1 - (noparses / i)
    scores["time"] = end_time - start_time
    return (
        Result(
            scores['F1-all'] if 'F1-all' in scores else scores['F1-disc'],
            "\t".join(f"{mode}" for mode in scores),
            "\t".join(f"{s}" for s in scores.values()),
            '\n\n'.join(evaluator.summary() for evaluator in evaluators.values())),
        eval_loss / batches)
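# A minimal usage sketch for `evaluate` (hypothetical, not part of the
# module): `Supertagger` stands in for the class defining the method above,
# the file names are placeholders, and a flair-style `load` classmethod
# plus direct assignment to `__evalparam__` are assumed.
def example_evaluation():
    from discodop.eval import readparam
    model = Supertagger.load("best-model.pt")  # hypothetical trained model
    model.__evalparam__ = readparam("proper.prm")
    dev = SupertagParseDataset("corpus.dev")
    result, loss = model.evaluate(dev, mini_batch_size=32,
        only_disc="both", accuracy="both", pos_accuracy=True)
    # flair's `Result` carries a main score (here: F1-all) and log strings
    print(result.main_score)
    print(result.detailed_results)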