def build_param(path='../util/proper.prm'):
    """Read an EVALB-style parameter (``.prm``) file.

    :param path: location of the parameter file; any falsy value
        (``None``, empty string) makes ``readparam`` fall back to its
        built-in defaults.
    :returns: the parameter dictionary produced by ``readparam``.
    """
    # NOTE(review): a block of commented-out command-line overrides
    # (--cutofflen, --disconly, --verbose, --debug, --ted, --la,
    # --headrules) was removed here; recover from VCS history if those
    # options need to be reinstated.
    return readparam(path if path else None)
def test_eval():
    """Simple sanity check; should give 100% score on all metrics."""
    from discodop.treebank import READERS
    from discodop.eval import Evaluator, readparam
    # Evaluate the sample corpus against itself; since candidate and
    # reference are identical, every metric must come out at 100%.
    export_reader = READERS['export']
    reference = export_reader('alpinosample.export')
    candidate = export_reader('alpinosample.export')
    ref_trees = reference.trees()
    ref_sents = reference.sents()
    cand_sents = candidate.sents()
    scorer = Evaluator(readparam(None))  # None => default parameters
    for key, cand_tree in candidate.trees().items():
        scorer.add(key, ref_trees[key], ref_sents[key],
                cand_tree, cand_sents[key])
    scorer.breakdowns()
    print(scorer.summary())
# Evaluation script: parse the training split of a supertag corpus with a
# previously stored grammar, then compare each parse to the gold tree
# (both are unbinarized and stripped of fan-out markers before comparison).
from configparser import ConfigParser
from supertagging.data import SupertagParseDataset

config = ConfigParser()
config.read(argv[1])  # path to the experiment configuration (.ini) file
data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")

from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar

# NOTE(review): `argv` and `load` are assumed to come from `sys` and
# `pickle` imports outside this excerpt — confirm against the full file.
grammar = load(open(f"{config['Corpus']['filename']}.grammar", "rb"))
i = 0
evaluator = Evaluator(readparam("proper.prm"))
for sentence in data:
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    # one supertag candidate per token, each with weight 0.0
    tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        # no parse found: fall back to a flat NOPARSE tree over the POS tags
        leaves = (f"({p} {i})" for p, i in zip(poss, range(len(words))))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    # normalize both trees for evaluation: drop fan-out markers, unbinarize
    gold = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(parse))))
def set_eval_param(self, config: EvalParameters):
    """Configure evaluation settings from *config*.

    Sets the grammar's fallback probability, loads the EVALB-style
    parameter file named by ``config.evalfilename`` via ``readparam``,
    and stores the number of supertags (k) to consider.
    """
    self.__grammar__.fallback_prob = config.fallbackprob
    self.__evalparam__ = readparam(config.evalfilename)
    self.__ktags__ = config.ktags
def startexp(
        stages=(DEFAULTSTAGE, ),  # see above
        corpusfmt='export',  # choices: export, discbracket, bracket
        corpusdir='.',
        # filenames may include globbing characters '*' and '?'.
        traincorpus='alpinosample.export', trainencoding='utf-8',
        testcorpus='alpinosample.export', testencoding='utf-8',
        testmaxwords=40, trainmaxwords=40,
        trainnumsents=2, testnumsents=1,  # number of sentences to parse
        skiptrain=True,  # test set starts after training set
        # (useful when they are in the same file)
        skip=0,  # number of sentences to skip from test corpus
        punct=None,  # choices: None, 'move', 'remove', 'root'
        functions=None,  # choices None, 'add', 'remove', 'replace'
        morphology=None,  # choices: None, 'add', 'replace', 'between'
        transformations=None,  # apply treebank transformations
        # postagging: pass None to use tags from treebank.
        postagging=None,
        relationalrealizational=None,  # do not apply RR-transform
        headrules=None,  # rules for finding heads of constituents
        bintype='binarize',  # choices: binarize, optimal, optimalhead
        factor='right',
        revmarkov=True,
        v=1,
        h=2,
        pospa=False,  # when v > 1, add parent annotation to POS tags?
        markhead=False,  # prepend head to siblings
        leftmostunary=True,  # start binarization with unary node
        rightmostunary=True,  # end binarization with unary node
        tailmarker='',  # with headrules, head is last node and can be marked
        fanout_marks_before_bin=False,
        evalparam='proper.prm',  # EVALB-style parameter file
        quiet=False, reallyquiet=False,  # quiet=no per sentence results
        numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
        resultdir='results',
        rerun=False):
    """Execute an experiment.

    Reads a training and a test corpus, trains grammars for the given
    stages (unless ``rerun``), parses the test set and reports
    EVALB-style scores plus coverage per stage.

    :returns: the unique ROOT label shared by all trees.
    """
    assert bintype in ('optimal', 'optimalhead', 'binarize')
    if postagging is not None:
        assert set(postagging).issubset({'method', 'model',
                'unknownthreshold', 'openclassthreshold', 'simplelexsmooth'})
        if postagging['method'] == 'unknownword':
            assert postagging['model'] in ('4', '6', 'base')
            assert postagging['unknownthreshold'] >= 1
            assert postagging['openclassthreshold'] >= 0
        else:
            assert postagging['method'] in ('treetagger', 'stanford')
    if rerun:
        assert os.path.exists(resultdir), (
                'Directory %r does not exist.'
                '--rerun requires a directory '
                'with the grammar(s) of a previous experiment.' % resultdir)
    else:
        assert not os.path.exists(resultdir), (
                'Directory %r exists.\n'
                'Use --rerun to parse with existing grammar '
                'and overwrite previous results.' % resultdir)
        os.mkdir(resultdir)
    # Log everything, and send it to stderr, in a format with just the message.
    formatstr = '%(message)s'
    if reallyquiet:
        logging.basicConfig(level=logging.WARNING, format=formatstr)
    elif quiet:
        logging.basicConfig(level=logging.INFO, format=formatstr)
    else:
        logging.basicConfig(level=logging.DEBUG, format=formatstr)
    # also log to a file
    fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
    fileobj.setLevel(logging.DEBUG)
    fileobj.setFormatter(logging.Formatter(formatstr))
    logging.getLogger('').addHandler(fileobj)

    corpusreader = getreader(corpusfmt)
    if not rerun:
        corpus = corpusreader(corpusdir, traincorpus, encoding=trainencoding,
                headrules=headrules, headfinal=True, headreverse=False,
                punct=punct, functions=functions, morphology=morphology)
        logging.info('%d sentences in training corpus %s/%s',
                len(corpus.parsed_sents()), corpusdir, traincorpus)
        # a float trainnumsents is interpreted as a fraction of the corpus
        if isinstance(trainnumsents, float):
            trainnumsents = int(trainnumsents * len(corpus.sents()))
        trees = list(corpus.parsed_sents().values())[:trainnumsents]
        sents = list(corpus.sents().values())[:trainnumsents]
        if transformations:
            trees = [transform(tree, sent, transformations)
                    for tree, sent in zip(trees, sents)]
        if relationalrealizational:
            trees = [rrtransform(tree, **relationalrealizational)[0]
                    for tree in trees]
        train_tagged_sents = [[(word, tag) for word, (_, tag)
                in zip(sent, sorted(tree.pos()))]
                    for tree, sent in zip(trees, sents)]
        blocks = list(corpus.blocks().values())[:trainnumsents]
        assert trees, 'training corpus should be non-empty'
        logging.info('%d training sentences before length restriction',
                len(trees))
        trees, sents, blocks = zip(*[sent
                for sent in zip(trees, sents, blocks)
                if len(sent[1]) <= trainmaxwords])
        logging.info('%d training sentences after length restriction <= %d',
                len(trees), trainmaxwords)

    testset = corpusreader(corpusdir, testcorpus, encoding=testencoding,
            punct=punct, morphology=morphology, functions=functions)
    gold_sents = testset.tagged_sents()
    test_parsed_sents = testset.parsed_sents()
    if skiptrain:
        # test sentences follow the training set in the same file
        skip += trainnumsents
    logging.info('%d sentences in test corpus %s/%s',
            len(testset.parsed_sents()), corpusdir, testcorpus)
    logging.info('%d test sentences before length restriction',
            len(list(gold_sents)[skip:skip + testnumsents]))
    lexmodel = None
    test_tagged_sents = gold_sents
    if postagging and postagging['method'] in ('treetagger', 'stanford'):
        if postagging['method'] == 'treetagger':
            # these two tags are never given by tree-tagger,
            # so collect words whose tag needs to be overriden
            overridetags = ('PTKANT', 'PIDAT')
        elif postagging['method'] == 'stanford':
            overridetags = ('PTKANT', )
        # map each word to the set of tags it received in training
        taglex = defaultdict(set)
        for sent in train_tagged_sents:
            for word, tag in sent:
                taglex[word].add(tag)
        # only override a word's tag when it is unambiguous in training
        overridetagdict = {tag:
                {word for word, tags in taglex.items() if tags == {tag}}
                for tag in overridetags}
        tagmap = {'$(': '$[', 'PAV': 'PROAV'}
        # FIX: the original had a stray trailing comma after this
        # OrderedDict(...) call, turning the statement into an invalid
        # tuple assignment chained with the next one.
        sents_to_tag = OrderedDict((a, b) for a, b
                in islice(gold_sents.items(), skip, skip + testnumsents)
                if len(b) <= testmaxwords)
        test_tagged_sents = externaltagging(postagging['method'],
                postagging['model'], sents_to_tag, overridetagdict, tagmap)
        # give these tags to parser
        usetags = True
    elif postagging and postagging['method'] == 'unknownword' and not rerun:
        postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
        # get smoothed probalities for lexical productions
        lexresults, msg = getunknownwordmodel(
                train_tagged_sents, postagging['unknownwordfun'],
                postagging['unknownthreshold'],
                postagging['openclassthreshold'])
        logging.info(msg)
        simplelexsmooth = postagging['simplelexsmooth']
        if simplelexsmooth:
            lexmodel = lexresults[2:8]
        else:
            lexmodel, msg = getlexmodel(*lexresults)
            logging.info(msg)
        # NB: knownwords are all words in training set, lexicon is the subset
        # of words that are above the frequency threshold.
        # for training purposes we work with the subset, at test time we
        # exploit the full set of known words from the training set.
        sigs, knownwords, lexicon = lexresults[:3]
        postagging['sigs'], postagging['lexicon'] = sigs, knownwords
        # replace rare train words with signatures
        sents = replaceraretrainwords(train_tagged_sents,
                postagging['unknownwordfun'], lexicon)
        # make sure gold POS tags are not given to parser
        usetags = False
    elif postagging and postagging['method'] == 'unknownword' and rerun:
        usetags = False
    else:
        simplelexsmooth = False
        # give gold POS tags to parser
        usetags = True

    # 0: test sentences as they should be handed to the parser,
    # 1: gold trees for evaluation purposes
    # 2: gold sentence because test sentences may be mangled by unknown word
    #    model
    # 3: blocks from treebank file to reproduce the relevant part of the
    #    original treebank verbatim.
    testset = OrderedDict((a, (test_tagged_sents[a], test_parsed_sents[a],
            gold_sents[a], block)) for a, block
            in islice(testset.blocks().items(), skip, skip + testnumsents)
            if len(test_tagged_sents[a]) <= testmaxwords)
    assert test_tagged_sents, 'test corpus should be non-empty'
    logging.info('%d test sentences after length restriction <= %d',
            len(testset), testmaxwords)

    if rerun:
        trees = []
        sents = []
    toplabels = {tree.label for tree in trees} | {
            test_parsed_sents[n].label for n in testset}
    assert len(toplabels) == 1, 'expected unique ROOT label: %r' % toplabels
    top = toplabels.pop()

    if rerun:
        readgrammars(resultdir, stages, postagging, top)
    else:
        logging.info('read training & test corpus')
        getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
                revmarkov, leftmostunary, rightmostunary, pospa, markhead,
                fanout_marks_before_bin, testmaxwords, resultdir, numproc,
                lexmodel, simplelexsmooth, top, relationalrealizational)
    evalparam = evalmod.readparam(evalparam)
    evalparam['DEBUG'] = -1
    evalparam['CUTOFF_LEN'] = 40
    deletelabel = evalparam.get('DELETE_LABEL', ())
    deleteword = evalparam.get('DELETE_WORD', ())

    # NOTE(review): time.clock() was removed in Python 3.8; if this module
    # targets Python 3, replace with time.perf_counter().
    begin = time.clock()
    parser = Parser(stages, transformations=transformations,
            tailmarker=tailmarker, postagging=postagging if postagging
            and postagging['method'] == 'unknownword' else None,
            relationalrealizational=relationalrealizational)
    results = doparsing(parser=parser, testset=testset, resultdir=resultdir,
            usetags=usetags, numproc=numproc, deletelabel=deletelabel,
            deleteword=deleteword, corpusfmt=corpusfmt,
            morphology=morphology)
    if numproc == 1:
        logging.info('time elapsed during parsing: %gs',
                time.clock() - begin)
    for result in results[0]:
        nsent = len(result.parsetrees)
        header = (' ' + result.name.upper() + ' ').center(35, '=')
        evalsummary = evalmod.doeval(OrderedDict((a, b.copy(True))
                for a, b in test_parsed_sents.items()), gold_sents,
                result.parsetrees,
                test_tagged_sents if usetags else gold_sents, evalparam)
        # widen the column when any sentence exceeds the evaluation cutoff,
        # since the summary then shows an extra <= CUTOFF_LEN column
        coverage = 'coverage: %s = %6.2f' % (
                ('%d / %d' % (nsent - result.noparse, nsent)).rjust(
                    25 if any(len(a) > evalparam['CUTOFF_LEN']
                        for a in gold_sents.values()) else 14),
                100.0 * (nsent - result.noparse) / nsent)
        logging.info('\n'.join(('', header, evalsummary, coverage)))
    return top
def startexp(
        stages=(parser.DictObj(parser.DEFAULTSTAGE), ),  # see parser module
        corpusfmt='export',  # choices: export, (disc)bracket, alpino, tiger
        traincorpus=parser.DictObj(DEFAULTS['traincorpus']),
        testcorpus=parser.DictObj(DEFAULTS['testcorpus']),
        binarization=parser.DictObj(DEFAULTS['binarization']),
        removeempty=False,  # whether to remove empty terminals
        ensureroot=None,  # ensure every tree has a root node with this label
        punct=None,  # choices: None, 'move', 'remove', 'root'
        functions=None,  # choices None, 'add', 'remove', 'replace'
        morphology=None,  # choices: None, 'add', 'replace', 'between'
        transformations=None,  # apply treebank transformations
        postagging=None,  # postagging: pass None to use tags from treebank.
        relationalrealizational=None,  # do not apply RR-transform
        evalparam='proper.prm',  # EVALB-style parameter file
        verbosity=2,
        numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
        resultdir='results',
        rerun=False):
    """Execute an experiment.

    Loads training and test corpora, trains grammars for the given stages
    (unless ``rerun``), parses the test set and logs per-stage evaluation
    summaries and coverage; returns the unique ROOT label.
    """
    if rerun:
        if not os.path.exists(resultdir):
            raise ValueError('Directory %r does not exist.\n--rerun requires a'
                    ' directory with the grammar(s) of a previous experiment.'
                    % resultdir)
    else:
        if os.path.exists(resultdir):
            raise ValueError('Directory %r exists.\n'
                    'Use --rerun to parse with existing grammar '
                    'and overwrite previous results.' % resultdir)
        os.mkdir(resultdir)
    # Log everything, and send it to stderr, in a format with just the message.
    formatstr = '%(message)s'
    if verbosity == 0:
        logging.basicConfig(level=logging.WARNING, format=formatstr)
    elif verbosity == 1:
        logging.basicConfig(level=logging.INFO, format=formatstr)
    elif verbosity == 2:
        logging.basicConfig(level=logging.DEBUG, format=formatstr)
    elif 3 <= verbosity <= 4:
        # level 5 is below DEBUG: maximally verbose
        logging.basicConfig(level=5, format=formatstr)
    else:
        raise ValueError('verbosity should be >= 0 and <= 4. ')
    # also log to a file
    fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
    fileobj.setLevel(logging.DEBUG)
    fileobj.setFormatter(logging.Formatter(formatstr))
    logging.getLogger('').addHandler(fileobj)

    if not rerun:
        trees, sents, train_tagged_sents = loadtraincorpus(
                corpusfmt, traincorpus, binarization, punct, functions,
                morphology, removeempty, ensureroot, transformations,
                relationalrealizational)
    elif isinstance(traincorpus.numsents, float):
        raise ValueError('need to specify number of training set sentences, '
                'not fraction, in rerun mode.')

    testsettb = treebank.READERS[corpusfmt](
            testcorpus.path, encoding=testcorpus.encoding,
            removeempty=removeempty, morphology=morphology,
            functions=functions, ensureroot=ensureroot)
    # a float numsents is interpreted as a fraction of the test corpus
    if isinstance(testcorpus.numsents, float):
        testcorpus.numsents = int(testcorpus.numsents
                * len(testsettb.blocks()))
    if testcorpus.skiptrain:
        # test sentences follow the training set in the same file
        testcorpus.skip += (  # pylint: disable=maybe-no-member
                traincorpus.numsents)  # pylint: disable=maybe-no-member

    # select test sentences within the length limit, keeping the raw
    # treebank blocks so the relevant part can be reproduced verbatim
    test_blocks = OrderedDict()
    test_trees = OrderedDict()
    test_tagged_sents = OrderedDict()
    for n, a in islice(testsettb._read_blocks(),
            testcorpus.skip,  # pylint: disable=maybe-no-member
            testcorpus.skip + testcorpus.numsents):
        tree, sent = testsettb._parsetree(a)
        if 1 <= len(sent) <= testcorpus.maxwords:
            test_blocks[n] = testsettb._strblock(n, a)
            test_trees[n] = tree
            test_tagged_sents[n] = [(word, tag) for word, (_, tag)
                    in zip(sent, sorted(tree.pos()))]
    logging.info('%d test sentences after length restriction <= %d',
            len(test_trees), testcorpus.maxwords)

    lexmodel = None
    simplelexsmooth = False
    test_tagged_sents_mangled = test_tagged_sents
    if postagging and postagging.method in ('treetagger', 'stanford', 'frog'):
        if postagging.method == 'treetagger':
            # these two tags are never given by tree-tagger,
            # so collect words whose tag needs to be overriden
            overridetags = ('PTKANT', 'PIDAT')
        elif postagging.method == 'stanford':
            overridetags = ('PTKANT', )
        elif postagging.method == 'frog':
            overridetags = ()
        # map each word to the set of tags it received in training
        taglex = defaultdict(set)
        for sent in train_tagged_sents:
            for word, tag in sent:
                taglex[word].add(tag)
        # only override a word's tag when it is unambiguous in training
        overridetagdict = {tag:
                {word for word, tags in taglex.items() if tags == {tag}}
                for tag in overridetags}
        tagmap = {'$(': '$[', 'PAV': 'PROAV'}
        test_tagged_sents_mangled = lexicon.externaltagging(
                postagging.method, postagging.model, test_tagged_sents,
                overridetagdict, tagmap)
        if postagging.retag and not rerun:
            logging.info('re-tagging training corpus')
            sents_to_tag = OrderedDict(enumerate(train_tagged_sents))
            train_tagged_sents = lexicon.externaltagging(postagging.method,
                    postagging.model, sents_to_tag, overridetagdict,
                    tagmap).values()
            # propagate the new tags into the POS nodes of the gold trees
            for tree, tagged in zip(trees, train_tagged_sents):
                for node in tree.subtrees(
                        lambda n: len(n) == 1 and isinstance(n[0], int)):
                    node.label = tagged[node[0]][1]
        usetags = True  # give these tags to parser
    elif postagging and postagging.method == 'unknownword':
        if not rerun:
            sents, lexmodel = getposmodel(postagging, train_tagged_sents)
            simplelexsmooth = postagging.simplelexsmooth
        usetags = False  # make sure gold POS tags are not given to parser
    else:
        usetags = True  # give gold POS tags to parser

    # 0: test sentences as they should be handed to the parser,
    # 1: gold trees for evaluation purposes
    # 2: gold sents because test sentences may be mangled by unknown word model
    # 3: blocks from treebank file to reproduce the relevant part of the
    #    original treebank verbatim.
    testset = OrderedDict((n, (
            test_tagged_sents_mangled[n], test_trees[n],
            test_tagged_sents[n], block))
            for n, block in test_blocks.items())
    if not test_tagged_sents:
        raise ValueError('test corpus (selection) should be non-empty.')

    if rerun:
        trees, sents = [], []
    # all trees (train and test) must share a single ROOT label
    roots = {t.label for t in trees} | {test_trees[n].label for n in testset}
    if len(roots) != 1:
        raise ValueError('expected unique ROOT label: %r' % roots)
    top = roots.pop()

    if rerun:
        parser.readgrammars(resultdir, stages, postagging, top)
    else:
        logging.info('read training & test corpus')
        getgrammars(dobinarization(trees, sents, binarization,
                relationalrealizational), sents, stages,
                testcorpus.maxwords, resultdir, numproc, lexmodel,
                simplelexsmooth, top)
    evalparam = evalmod.readparam(evalparam)
    evalparam['DEBUG'] = -1
    evalparam['CUTOFF_LEN'] = 40
    deletelabel = evalparam.get('DELETE_LABEL', ())
    deleteword = evalparam.get('DELETE_WORD', ())

    # NOTE(review): time.clock() was removed in Python 3.8; confirm the
    # targeted Python version or switch to time.perf_counter().
    begin = time.clock()
    theparser = parser.Parser(stages, transformations=transformations,
            binarization=binarization, postagging=postagging if postagging
            and postagging.method == 'unknownword' else None,
            relationalrealizational=relationalrealizational,
            verbosity=verbosity)
    results = doparsing(parser=theparser, testset=testset,
            resultdir=resultdir, usetags=usetags, numproc=numproc,
            deletelabel=deletelabel, deleteword=deleteword,
            corpusfmt=corpusfmt, morphology=morphology,
            evalparam=evalparam)
    if numproc == 1:
        logging.info('time elapsed during parsing: %gs',
                time.clock() - begin)
    for result in results:
        nsent = len(result.parsetrees)
        # when any sentence exceeds the cutoff, the summary shows an extra
        # <= CUTOFF_LEN column, so widen the header/coverage fields
        overcutoff = any(len(a) > evalparam['CUTOFF_LEN']
                for a in test_tagged_sents.values())
        header = (' ' + result.name.upper() + ' ').center(
                44 if overcutoff else 35, '=')
        evalsummary = result.evaluator.summary()
        coverage = 'coverage: %s = %6.2f' % (
                ('%d / %d' % (nsent - result.noparse, nsent)).rjust(
                    25 if overcutoff else 14),
                100.0 * (nsent - result.noparse) / nsent)
        logging.info('\n'.join(('', header, evalsummary, coverage)))
    return top