def parse(self):
    """Parse ``self.input`` and keep the filtered chart in ``self.chart``.

    ``self.counter`` is incremented on every call.

    With ``self.cfg_approx`` set, the input is first parsed with
    ``self.disco_cfg_grammar`` using the configured beam.  If that coarse
    parse succeeds, its filtered chart is converted into a whitelist by
    ``prunechart`` and the input is parsed again with
    ``self.disco_grammar``, restricted to that whitelist.  Without
    ``self.cfg_approx`` the input is parsed directly with
    ``self.disco_grammar`` and the configured beam.

    When the coarse parse fails, ``self.chart`` is ``None`` afterwards.
    """
    self.counter += 1
    if self.cfg_approx:
        # Reset first: the fine pass below only runs when the coarse pass
        # succeeds, and without the reset a chart left over from an earlier
        # call would be filtered again and reported as this call's result.
        self.chart = None
        coarse_chart, _ = pcfg.parse(
            self.input, self.disco_cfg_grammar,
            beam_beta=self.beam_beta, beam_delta=self.beam_delta)
        if coarse_chart:
            coarse_chart.filter()
            whitelist, _ = prunechart(
                coarse_chart, self.disco_grammar, k=self.pruning_k,
                splitprune=True, markorigin=True, finecfg=False)
            self.chart, _ = parse(
                self.input, self.disco_grammar, estimates=self.estimates,
                whitelist=whitelist, splitprune=True, markorigin=True,
                exhaustive=True)
    else:
        self.chart, _ = parse(
            self.input, self.disco_grammar, estimates=self.estimates,
            beam_beta=self.beam_beta, beam_delta=self.beam_delta,
            exhaustive=True)
    if self.chart:
        self.chart.filter()
def test_cfg_approximation_conversion(self):
    """Run a two-pass (coarse, then whitelisted fine) parse and print each step.

    The grammar from ``self.build_nm_grammar()`` is converted twice: with
    ``transform_grammar_cfg_approx`` for the coarse pass and with
    ``transform_grammar`` for the fine pass.  The input consists of
    n "a", m "b", n "c" and m "d" tokens with n=2 and m=3.  The coarse
    chart is pruned into a whitelist that constrains the fine parse.
    Results are printed only; nothing is asserted.
    """
    grammar = self.build_nm_grammar()

    disco_grammar_rules = list(transform_grammar_cfg_approx(grammar))
    print(disco_grammar_rules)
    disco_grammar = Grammar(disco_grammar_rules, start=grammar.start())
    print(disco_grammar)

    n = 2
    m = 3
    inp = ["a"] * n + ["b"] * m + ["c"] * n + ["d"] * m

    # Coarse pass.
    chart, msg = parse(inp, disco_grammar, beam_beta=exp(-4))
    chart.filter()
    print(chart)
    print(msg)

    fine_grammar_rules = list(transform_grammar(grammar))
    fine = Grammar(fine_grammar_rules, start=grammar.start())
    # Raw string: '\*' is not a valid escape in a normal string literal and
    # triggers a warning on current Python versions; the pattern itself is
    # unchanged.
    fine.getmapping(disco_grammar, re.compile(r'\*[0-9]+$'), None, True, True)

    whitelist, msg = prunechart(
        chart, fine, k=10000, splitprune=True, markorigin=True,
        finecfg=False)
    print(msg)
    print(whitelist)

    # Fine pass, restricted to the whitelist derived from the coarse chart.
    chart2, msg = parse(
        inp, fine, whitelist=whitelist, splitprune=True, markorigin=True)
    print(msg)
    print(chart2)
def test_issue51():
    """Call ``filter()`` on the chart returned for the input ``['b']``.

    The grammar's only lexical rule lists the word ``'a'``, so the parsed
    token is not the one named in the grammar.  The test has no
    assertions; it passes when no exception is raised.
    """
    from discodop.containers import Grammar
    from discodop.plcfrs import parse

    unary_rule = ((('S', 'A'), ((0, ), )), 1.0)
    lexical_rule = ((('A', 'Epsilon'), ('a', )), 1.0)
    grammar = Grammar([unary_rule, lexical_rule], start='S')

    chart, _ = parse(['b'], grammar)
    chart.filter()
def test_grammar(debug=False):
    """Extract several grammars from a sample treebank and parse with one.

    Builds a treebank grammar, a DOP reduction and a double-DOP grammar
    from ``alpinosample.export``, asserts that ``testgrammar()`` succeeds
    for the double-DOP grammar, and parses every sentence of the corpus
    with it.

    :param debug: when true, print the grammars and per-sentence details.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    # Only the first ten trees are used for grammar extraction.
    trees = []
    for original in list(corpus.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))

    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    reduction = Grammar(
        dopreduction(trees[:2], sents[:2])[0], start=trees[0].label)
    if debug:
        print(reduction)
    reduction.testgrammar()

    ddop_rules, _backtransform, _, _ = doubledop(
        trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(ddop_rules, start=trees[0].label)
    grammar.getmapping(
        None,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    result, msg = grammar.testgrammar()
    assert result, 'RFE should sum to 1.\n%s' % msg

    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (a.encode('unicode-escape').decode() for a in sent)
            print('sentence:', ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            getderivations(chart, 100)
            _parses, _note = marginalize('mpp', chart)
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # Finally, build a grammar from a single unary-chain tree.
    tree = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    words = [str(a) for a in range(10)]
    Grammar(treebankgrammar([tree], [words]))
def parse(compiledgrammar, testsent, testtags=None):
    """Parse a sentence with a grammar and print the outcome.

    The numbered input tokens (the tags when given, otherwise the words)
    are printed first.  On success, each derivation returned by
    ``kbest.lazykbest(chart, 10)`` is unbinarized and printed together
    with ``exp(-prob)``.

    :param compiledgrammar: the grammar passed to ``plcfrs.parse``.
    :param testsent: the tokens of the sentence.
    :param testtags: optional tags, forwarded as ``tags``.
    :returns: True if a chart was obtained, False otherwise.
    """
    chart, _ = plcfrs.parse(
        testsent, compiledgrammar, tags=testtags, exhaustive=True)
    shown = testtags if testtags else testsent
    numbered = ' '.join("%d:%s" % a for a in enumerate(shown))
    print("input:", numbered, end=' ')

    if not chart:
        print("no parse!\n")
        return False

    print()
    for derivation, weight in kbest.lazykbest(chart, 10)[0]:
        tree = Tree(derivation)
        treetransforms.unbinarize(tree)
        print(exp(-weight), tree)
    print()
    return True
def test_grammar(debug=False):
    """Extract grammars from a sample treebank and check double-DOP parses.

    Builds a treebank grammar, a DOP reduction and a double-DOP grammar
    from ``alpinosample.export``.  Every sentence is parsed with the
    double-DOP grammar; the k-best derivations are grouped by the tree
    string they recover to, and the test asserts that no group contains
    the same (derivation, weight) pair twice.

    :param debug: when true, print the grammars and per-sentence details.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    # Only the first ten trees are used for grammar extraction.
    trees = []
    for original in list(corpus.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))

    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    reduction = Grammar(
        dopreduction(trees[:2], sents[:2])[0], start=trees[0].label)
    if debug:
        print(reduction)
    reduction.testgrammar()

    ddop_rules, backtransform, _, _ = doubledop(
        trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(ddop_rules, start=trees[0].label)
    grammar.getmapping(
        grammar,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."

    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (a.encode('unicode-escape').decode() for a in sent)
            print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')

        if chart:
            # mpp: summed exp(-weight) per recovered tree string;
            # parsetrees: the (derivation, weight) pairs behind each one.
            mpp = {}
            parsetrees = {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (deriv, weight) in zip(ranked, derivations):
                recovered = Tree(
                    recoverfragments(edge.key, chart, backtransform))
                treestr = str(removefanoutmarkers(unbinarize(recovered)))
                mpp[treestr] = mpp.get(treestr, 0.0) + exp(-weight)
                if treestr not in parsetrees:
                    parsetrees[treestr] = []
                parsetrees[treestr].append((deriv, weight))
            if debug:
                print(len(mpp), 'parsetrees',
                      sum(map(len, parsetrees.values())), 'derivations')

            for treestr, treeprob in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(treeprob, treestr, '\nmatch:', treestr == str(tree))
                members = parsetrees[treestr]
                hasduplicates = len(set(members)) != len(members)
                if hasduplicates:
                    # Show the chart before the assertion below fails.
                    print('chart:\n', chart)
                assert not hasduplicates
                if debug:
                    for deriv, weight in sorted(members, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-weight), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # Finally, build a grammar from a single unary-chain tree.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    words = [str(a) for a in range(10)]
    Grammar(treebankgrammar([tree], [words]))
def test_individual_parsing_stages(self):
    """Parse a three-token input and check edge weight projections.

    The grammar from ``self.build_grammar()`` is converted with
    ``transform_grammar`` and used to parse ``["a", "a", "a"]`` with
    'SXlrgaps' estimates.  The filtered chart is dumped item by item,
    converted to a hypergraph by a ``PyDerivationManager`` and serialized
    to a temporary file whose path is printed.  The test then asserts the
    vectors returned by ``py_edge_weight_projection`` with
    ``variational=True`` and ``variational=False``, and prints a Viterbi
    derivation and five log-mode projections.

    The temporary file is left in place after the test.
    """
    import os

    grammar = self.build_grammar()
    for r in transform_grammar(grammar):
        pprint(r)

    rule_list = list(transform_grammar(grammar))
    pprint(rule_list)
    disco_grammar = Grammar(rule_list, start=grammar.start())
    print(disco_grammar)

    inp = ["a"] * 3
    estimates = 'SXlrgaps', getestimates(disco_grammar, 40, grammar.start())
    print(type(estimates))
    chart, msg = parse(inp, disco_grammar, estimates=estimates)
    print(chart)
    print(msg)
    chart.filter()
    print("filtered chart")
    print(disco_grammar.nonterminals)
    print(type(disco_grammar.nonterminals))
    print(chart)

    root = chart.root()
    print("root", root, type(root))
    print(chart.indices(root))
    print(chart.itemstr(root))
    print(chart.stats())
    print("root label", chart.label(root))
    print(root, chart.itemid1(chart.label(root), chart.indices(root)))

    def item_str(item):
        """Format a chart item as ``<nonterminal>[<item number>]``."""
        label = disco_grammar.nonterminalstr(chart.label(item))
        return label + "[" + str(item) + "]"

    # Dump every item with its edges.  The original guarded the inner loop
    # with ``if True or ...``, which was always true, so the guard is gone.
    for i in range(1, chart.numitems() + 1):
        print(i, chart.label(i), chart.indices(i), chart.numedges(i))
        for edge_num in range(chart.numedges(i)):
            edge = chart.getEdgeForItem(i, edge_num)
            if isinstance(edge, tuple):
                # Child slots holding 0 are skipped.
                children = [item_str(j) for j in [edge[1], edge[2]] if j != 0]
                print("\t", item_str(i), "->", ' '.join(children))
            else:
                # Non-tuple edges are used as an index into the input.
                print("\t", item_str(i), "->", inp[edge])

    print(chart.getEdgeForItem(root, 0))

    manager = PyDerivationManager(grammar)
    manager.convert_chart_to_hypergraph(chart, disco_grammar, debug=True)

    # tempfile.mktemp() is deprecated: it only returns a name, so another
    # process can create that file first.  mkstemp() creates the file
    # atomically; the descriptor is closed right away because only the
    # path is handed to serialize().
    fd, path = tempfile.mkstemp()
    os.close(fd)
    print(path)
    manager.serialize(bytes(path, encoding="utf-8"))

    gi = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    sm = PyStorageManager()
    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    vec = py_edge_weight_projection(
        la, manager, variational=True, debug=True, log_mode=False)
    print(vec)
    self.assertEqual(
        [1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 1.0],
        vec)

    vec = py_edge_weight_projection(
        la, manager, variational=False, debug=True, log_mode=False)
    print(vec)
    self.assertEqual(
        [1.0, 1.0, 1.0, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 1.0],
        vec)

    der = manager.viterbi_derivation(0, vec, grammar)
    print(der)

    for _ in range(5):
        vec2 = py_edge_weight_projection(
            la, manager, debug=True, log_mode=True)
        print(vec2)
def parse(self, sent, tags=None):
    """Parse a sentence with every stage in turn and postprocess the results.

    This is a generator: for each stage in ``self.stages`` it yields one
    ``DictObj`` with the fields ``name``, ``parsetree``, ``prob``,
    ``parsetrees``, ``fragments``, ``noparse``, ``elapsedtime`` and
    ``msg``.

    :param sent: a sequence of tokens.
    :param tags: if given, will be given to the parser instead of trying
        all possible tags.
    :raises ValueError: for an unknown stage mode, for a non-binarized
        grammar in a non-bitpar mode, or when sampling is requested in
        'pcfg-bitpar-nbest' mode.
    """
    # time.clock() was removed in Python 3.8.  process_time() is its
    # CPU-time replacement; fall back to clock() on interpreters that
    # predate process_time().
    clock = getattr(time, 'process_time', None) or time.clock

    if self.postagging:
        if self.transformations and 'FOLD-NUMBERS' in self.transformations:
            sent = ['000' if NUMBERRE.match(a) else a for a in sent]
        sent = replaceraretestwords(
            sent, self.postagging.unknownwordfun,
            self.postagging.lexicon, self.postagging.sigs)
    sent = list(sent)
    if tags is not None:
        tags = list(tags)
    # These persist across stages: each stage may prune with the chart
    # (or the inside/outside scores) of the previous one.
    chart = start = inside = outside = lastsuccessfulparse = None
    for n, stage in enumerate(self.stages):
        begin = clock()
        noparse = False
        parsetrees = fragments = None
        msg = '%s:\t' % stage.name.upper()

        # Select the probability model of the grammar for this stage.
        model = u'default'
        if stage.dop:
            if (stage.estimator == 'ewe'
                    or stage.objective.startswith('sl-dop')):
                model = u'ewe'
            elif stage.estimator == 'bon':
                model = u'bon'
            if stage.objective == 'shortest':
                model = u'shortest'
        x = stage.grammar.currentmodel
        stage.grammar.switch(model, logprob=stage.mode != 'pcfg-posterior')
        # (Re-)export the bitpar grammar files when they do not exist yet
        # or when the model was just switched.
        if stage.mode.startswith('pcfg-bitpar') and (
                not hasattr(stage, 'rulesfile')
                or x != stage.grammar.currentmodel):
            exportbitpargrammar(stage)
        if not stage.binarized and not stage.mode.startswith('pcfg-bitpar'):
            raise ValueError('non-binarized grammar requires use of bitpar')

        # A pruned stage only runs when the previous stage produced a chart.
        if not stage.prune or chart:
            if n != 0 and stage.prune and stage.mode != 'dop-rerank':
                beginprune = clock()
                if self.stages[n - 1].mode == 'pcfg-posterior':
                    whitelist, msg1 = whitelistfromposteriors(
                        inside, outside, start,
                        self.stages[n - 1].grammar, stage.grammar,
                        stage.k, stage.splitprune,
                        self.stages[n - 1].markorigin,
                        stage.mode.startswith('pcfg'))
                else:
                    whitelist, msg1 = prunechart(
                        chart, stage.grammar, stage.k,
                        stage.splitprune,
                        self.stages[n - 1].markorigin,
                        stage.mode.startswith('pcfg'),
                        self.stages[n - 1].mode == 'pcfg-bitpar-nbest')
                msg += '%s; %gs\n\t' % (msg1, clock() - beginprune)
            else:
                whitelist = None

            if stage.mode == 'pcfg':
                chart, msg1 = pcfg.parse(
                    sent, stage.grammar, tags=tags,
                    whitelist=whitelist if stage.prune else None)
            elif stage.mode == 'pcfg-posterior':
                inside, outside, start, msg1 = pcfg.doinsideoutside(
                    sent, stage.grammar, tags=tags)
                chart = start
            elif stage.mode.startswith('pcfg-bitpar'):
                if stage.mode == 'pcfg-bitpar-forest':
                    numderivs = 0
                elif (n == len(self.stages) - 1
                        or not self.stages[n + 1].prune):
                    numderivs = stage.m
                else:  # request 1000 nbest parses for CTF pruning
                    numderivs = 1000
                chart, cputime, msg1 = pcfg.parse_bitpar(
                    stage.grammar,
                    stage.rulesfile.name, stage.lexiconfile.name,
                    sent, numderivs, stage.grammar.start,
                    stage.grammar.toid[stage.grammar.start], tags=tags)
                # Moving the start point back adds the time reported by
                # parse_bitpar to this stage's elapsed time.
                begin -= cputime
            elif stage.mode == 'plcfrs':
                chart, msg1 = plcfrs.parse(
                    sent, stage.grammar, tags=tags,
                    exhaustive=stage.dop or (
                        n + 1 != len(self.stages)
                        and self.stages[n + 1].prune),
                    whitelist=whitelist,
                    splitprune=stage.splitprune
                    and self.stages[n - 1].split,
                    markorigin=self.stages[n - 1].markorigin,
                    estimates=(stage.estimates, stage.outside)
                    if stage.estimates in ('SX', 'SXlrgaps')
                    else None)
            elif stage.mode == 'dop-rerank':
                if chart:
                    parsetrees = doprerank(
                        chart, sent, stage.k,
                        self.stages[n - 1].grammar, stage.grammar)
                    msg1 = 're-ranked %d parse trees. ' % len(parsetrees)
            else:
                raise ValueError('unknown mode specified.')
            msg += '%s\n\t' % msg1
            if (n != 0 and not chart and not noparse
                    and stage.split == self.stages[n - 1].split):
                logging.error('ERROR: expected successful parse. '
                              'sent: %s\nstage: %s.', ' '.join(sent),
                              stage.name)
                # raise ValueError('ERROR: expected successful parse. '
                #         'sent %s, %s.' % (nsent, stage.name))

        # Disambiguation: extract derivations and turn them into parse trees.
        if chart and stage.mode not in ('pcfg-posterior', 'dop-rerank'
                ) and not (self.relationalrealizational and stage.split):
            begindisamb = clock()
            if stage.mode == 'pcfg-bitpar-nbest':
                if not stage.kbest or stage.sample:
                    raise ValueError('sampling not possible with bitpar '
                                     'in nbest mode.')
                derivations = chart.rankededges[chart.root()]
                entries = [None] * len(derivations)
            else:
                derivations, entries = getderivations(
                    chart, stage.m,
                    kbest=stage.kbest, sample=stage.sample,
                    derivstrings=stage.dop != 'doubledop'
                    or self.verbosity >= 3
                    or stage.objective == 'mcc')
            if self.verbosity >= 3:
                print('sent: %s\nstage: %s' % (' '.join(sent), stage.name))
                print('%d-best derivations:\n%s' % (
                    min(stage.m, 100),
                    '\n'.join('%d. %s %s' % (
                        n + 1,
                        ('subtrees=%d' % abs(int(prob / log(0.5))))
                        if stage.objective == 'shortest'
                        else ('p=%g' % exp(-prob)),
                        deriv)
                        for n, (deriv, prob)
                        in enumerate(derivations[:100]))))
                print('sum of probabitilies: %g\n' % sum(
                    exp(-prob) for _, prob in derivations[:100]))
            if stage.objective == 'shortest':
                stage.grammar.switch(
                    u'ewe' if stage.estimator == 'ewe' else u'default',
                    True)
            parsetrees, msg1 = marginalize(
                stage.objective if stage.dop else 'mpd',
                derivations, entries, chart,
                sent=sent, tags=tags,
                backtransform=stage.backtransform,
                k=stage.m, sldop_n=stage.sldop_n,
                mcc_labda=stage.mcc_labda, mcc_labels=stage.mcc_labels,
                bitpar=stage.mode == 'pcfg-bitpar-nbest')
            msg += 'disambiguation: %s, %gs\n\t' % (
                msg1, clock() - begindisamb)
            if self.verbosity >= 3:
                besttrees = nlargest(100, parsetrees, key=itemgetter(1))
                print('100-best parse trees:\n%s' % '\n'.join(
                    '%d. %s %s' % (n + 1, probstr(prob), treestr)
                    for n, (treestr, prob, _) in enumerate(besttrees)))
                print('sum of probabitilies: %g\n' % sum(
                    (prob[1] if isinstance(prob, tuple) else prob)
                    for _, prob, _ in besttrees))
            if self.verbosity >= 4:
                print('Chart:\n%s' % chart)

        # Pick the best tree and postprocess it; fall back to noparse()
        # when there is no tree or postprocessing/validation fails.
        if parsetrees:
            try:
                resultstr, prob, fragments = max(
                    parsetrees, key=itemgetter(1))
                parsetree, noparse = self.postprocess(resultstr, n)
                if not all(a for a in parsetree.subtrees()):
                    raise ValueError('empty nodes in tree: %s' % parsetree)
                if not len(parsetree.leaves()) == len(sent):
                    raise ValueError('leaves missing. original tree: %s\n'
                                     'postprocessed: %r' % (
                                         resultstr, parsetree))
            except Exception:  # pylint: disable=W0703
                logging.error("something's amiss. %s", ''.join(
                    traceback.format_exception(*sys.exc_info())))
                parsetree, prob, noparse = self.noparse(
                    stage, sent, tags, lastsuccessfulparse)
            else:
                lastsuccessfulparse = parsetree
            msg += probstr(prob) + ' '
        else:
            fragments = None
            parsetree, prob, noparse = self.noparse(
                stage, sent, tags, lastsuccessfulparse)
        elapsedtime = clock() - begin
        msg += '%.2fs cpu time elapsed\n' % (elapsedtime)
        yield DictObj(name=stage.name, parsetree=parsetree, prob=prob,
                      parsetrees=parsetrees, fragments=fragments,
                      noparse=noparse, elapsedtime=elapsedtime, msg=msg)
def test():
    """Extract grammars from a sample treebank and print double-DOP parses.

    Builds and prints a treebank grammar, a DOP reduction and a double-DOP
    grammar from ``alpinosample.export``.  Every sentence is parsed with
    the double-DOP grammar; the k-best derivations are grouped by the tree
    string they recover to, and the function asserts that no group
    contains the same (derivation, weight) pair twice.  Passing
    ``--debug`` on the command line also prints the individual
    derivations.
    """
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import (
        binarize, unbinarize, addfanoutmarkers, removefanoutmarkers)
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from discodop.fragments import getfragments

    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    filename = "alpinosample.export"
    corpus = NegraCorpusReader('.', filename, punct='move')
    sents = list(corpus.sents().values())
    # Only the first ten trees are used for grammar extraction.
    trees = []
    for original in list(corpus.parsed_sents().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))

    print('plcfrs')
    print(Grammar(treebankgrammar(trees, sents), start=trees[0].label))

    print('dop reduction')
    reduction = Grammar(
        dopreduction(trees[:2], sents[:2])[0], start=trees[0].label)
    print(reduction)
    reduction.testgrammar()

    fragments = getfragments(trees, sents, 1)
    debug = '--debug' in sys.argv
    ddop_rules, backtransform, _ = doubledop(trees, fragments, debug=debug)
    print('\ndouble dop grammar')
    grammar = Grammar(ddop_rules, start=trees[0].label)
    grammar.getmapping(
        grammar,
        striplabelre=None,
        neverblockre=re.compile(b'^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    print(grammar)
    assert grammar.testgrammar(), "DOP1 should sum to 1."

    for tree, sent in zip(corpus.parsed_sents().values(), sents):
        escaped = (a.encode('unicode-escape').decode() for a in sent)
        print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        print('\n', msg, end='')
        print("\ngold ", tree)
        print("double dop", end='')

        if not chart:
            print("no parse")
            print(chart)
        else:
            # mpp: summed exp(-weight) per recovered tree string;
            # parsetrees: the (derivation, weight) pairs behind each one.
            mpp = {}
            parsetrees = {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (deriv, weight) in zip(ranked, derivations):
                recovered = Tree(recoverfragments(
                    edge.getkey(), chart, grammar, backtransform))
                treestr = str(removefanoutmarkers(unbinarize(recovered)))
                mpp[treestr] = mpp.get(treestr, 0.0) + exp(-weight)
                if treestr not in parsetrees:
                    parsetrees[treestr] = []
                parsetrees[treestr].append((deriv, weight))
            print(len(mpp), 'parsetrees', end='')
            print(sum(map(len, parsetrees.values())), 'derivations')

            for treestr, treeprob in sorted(mpp.items(), key=itemgetter(1)):
                print(treeprob, '\n', treestr, end='')
                print("match:", treestr == str(tree))
                members = parsetrees[treestr]
                assert len(set(members)) == len(members)
                if debug:
                    for deriv, weight in sorted(members, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-weight), deriv))
        print()

    # Finally, build a grammar from a single unary-chain tree.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    words = [str(a) for a in range(10)]
    Grammar(treebankgrammar([tree], [words]))
def test_grammar(debug=False):
    """Extract grammars from a sample treebank and check double-DOP parses.

    Builds a treebank grammar, a DOP reduction and a double-DOP grammar
    from ``alpinosample.export``.  Every sentence is parsed with the
    double-DOP grammar; the k-best derivations are grouped by the tree
    string they recover to, and the test asserts that no group contains
    the same (derivation, weight) pair twice.

    :param debug: when true, print the grammars and per-sentence details;
        the flag is also forwarded to ``doubledop``.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    # Only the first ten trees are used for grammar extraction.
    trees = []
    for original in list(corpus.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))

    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    reduction = Grammar(
        dopreduction(trees[:2], sents[:2])[0], start=trees[0].label)
    if debug:
        print(reduction)
    reduction.testgrammar()

    ddop_rules, backtransform, _, _ = doubledop(
        trees, sents, debug=debug, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(ddop_rules, start=trees[0].label)
    grammar.getmapping(
        grammar,
        striplabelre=None,
        neverblockre=re.compile(b'^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."

    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (a.encode('unicode-escape').decode() for a in sent)
            print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')

        if chart:
            # mpp: summed exp(-weight) per recovered tree string;
            # parsetrees: the (derivation, weight) pairs behind each one.
            mpp = {}
            parsetrees = {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (deriv, weight) in zip(ranked, derivations):
                recovered = Tree(
                    recoverfragments(edge.key, chart, backtransform))
                treestr = str(removefanoutmarkers(unbinarize(recovered)))
                mpp[treestr] = mpp.get(treestr, 0.0) + exp(-weight)
                if treestr not in parsetrees:
                    parsetrees[treestr] = []
                parsetrees[treestr].append((deriv, weight))
            if debug:
                print(len(mpp), 'parsetrees',
                      sum(map(len, parsetrees.values())), 'derivations')

            for treestr, treeprob in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(treeprob, treestr, '\nmatch:', treestr == str(tree))
                members = parsetrees[treestr]
                hasduplicates = len(set(members)) != len(members)
                if hasduplicates:
                    # Show the chart before the assertion below fails.
                    print('chart:\n', chart)
                assert not hasduplicates
                if debug:
                    for deriv, weight in sorted(members, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-weight), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # Finally, build a grammar from a single unary-chain tree.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    words = [str(a) for a in range(10)]
    Grammar(treebankgrammar([tree], [words]))
def parse(self, sent, tags=None):
    """Parse a sentence with every stage in turn and postprocess the results.

    This is a generator: for each stage in ``self.stages`` it yields one
    ``DictObj`` with the fields ``name``, ``parsetree``, ``prob``,
    ``parsetrees``, ``fragments``, ``noparse``, ``elapsedtime`` and
    ``msg``.

    :param sent: the tokens of the sentence; converted to a list.
    :param tags: if given, will be given to the parser instead of trying
        all possible tags.
    :raises ValueError: when a stage has an unknown mode.
    """
    # time.clock() was removed in Python 3.8.  process_time() is its
    # CPU-time replacement; fall back to clock() on interpreters that
    # predate process_time().
    clock = getattr(time, 'process_time', None) or time.clock

    if self.postagging:
        sent = replaceraretestwords(
            sent, self.postagging['unknownwordfun'],
            self.postagging['lexicon'], self.postagging['sigs'])
    sent = list(sent)
    if tags is not None:
        tags = list(tags)
    # These persist across stages: each stage may prune with the chart
    # (or the inside/outside scores) of the previous one.
    chart = start = inside = outside = lastsuccessfulparse = None
    for n, stage in enumerate(self.stages):
        begin = clock()
        noparse = False
        parsetrees = fragments = None
        msg = '%s:\t' % stage.name.upper()

        # Select the probability model of the grammar for this stage.
        model = u'default'
        if stage.dop:
            if (stage.estimator == 'ewe'
                    or stage.objective.startswith('sl-dop')):
                model = u'ewe'
            elif stage.estimator == 'bon':
                model = u'bon'
            if stage.objective == 'shortest':
                model = u'shortest'
        x = stage.grammar.currentmodel
        stage.grammar.switch(model, logprob=stage.mode != 'pcfg-posterior')
        # (Re-)export the bitpar grammar files when they do not exist yet
        # or when the model was just switched.
        if stage.mode == 'pcfg-bitpar' and (
                not hasattr(stage, 'rulesfile')
                or x != stage.grammar.currentmodel):
            exportbitpargrammar(stage)

        # A pruned stage only runs when the previous stage produced a chart.
        if not stage.prune or chart:
            if n != 0 and stage.prune and stage.mode != 'dop-rerank':
                beginprune = clock()
                if self.stages[n - 1].mode == 'pcfg-posterior':
                    whitelist, msg1 = whitelistfromposteriors(
                        inside, outside, start,
                        self.stages[n - 1].grammar, stage.grammar,
                        stage.k, stage.splitprune,
                        self.stages[n - 1].markorigin,
                        stage.mode.startswith('pcfg'))
                else:
                    whitelist, msg1 = prunechart(
                        chart, stage.grammar, stage.k,
                        stage.splitprune,
                        self.stages[n - 1].markorigin,
                        stage.mode.startswith('pcfg'),
                        self.stages[n - 1].mode == 'pcfg-bitpar')
                msg += '%s; %gs\n\t' % (msg1, clock() - beginprune)
            else:
                whitelist = None

            if stage.mode == 'pcfg':
                chart, msg1 = pcfg.parse(
                    sent, stage.grammar, tags=tags,
                    whitelist=whitelist if stage.prune else None)
            elif stage.mode == 'pcfg-posterior':
                inside, outside, start, msg1 = pcfg.doinsideoutside(
                    sent, stage.grammar, tags=tags)
                chart = bool(start)
            elif stage.mode == 'pcfg-bitpar':
                chart, msg1 = pcfg.parse_bitpar(
                    stage.grammar,
                    stage.rulesfile.name, stage.lexiconfile.name,
                    sent, 1000,  # orig: stage.m; fixed for ctf
                    stage.grammar.start,
                    stage.grammar.toid[stage.grammar.start], tags=tags)
                msg1 += '%d derivations' % (
                    len(chart.rankededges[chart.root()]))
            elif stage.mode == 'plcfrs':
                chart, msg1 = plcfrs.parse(
                    sent, stage.grammar, tags=tags,
                    exhaustive=stage.dop or (
                        n + 1 != len(self.stages)
                        and self.stages[n + 1].prune),
                    whitelist=whitelist,
                    splitprune=stage.splitprune
                    and self.stages[n - 1].split,
                    markorigin=self.stages[n - 1].markorigin,
                    estimates=(stage.useestimates, stage.outside)
                    if stage.useestimates in ('SX', 'SXlrgaps')
                    else None)
            elif stage.mode == 'dop-rerank':
                if chart:
                    parsetrees = doprerank(
                        chart, sent, stage.k,
                        self.stages[n - 1].grammar, stage.grammar)
                    msg1 = 're-ranked %d parse trees. ' % len(parsetrees)
            else:
                raise ValueError('unknown mode specified.')
            msg += '%s\n\t' % msg1
            if (n != 0 and not chart and not noparse
                    and stage.split == self.stages[n - 1].split):
                logging.error('ERROR: expected successful parse. '
                              'sent: %s\nstage: %s.', ' '.join(sent),
                              stage.name)
                #raise ValueError('ERROR: expected successful parse. '
                #        'sent %s, %s.' % (nsent, stage.name))

        # Disambiguation: turn the chart into parse trees with probabilities.
        if chart and stage.mode not in ('pcfg-posterior', 'dop-rerank'
                ) and not (self.relationalrealizational and stage.split):
            begindisamb = clock()
            if stage.objective == 'shortest':
                stage.grammar.switch(
                    'ewe' if stage.estimator == 'ewe' else 'default',
                    True)
            parsetrees, derivs, msg1 = marginalize(
                stage.objective if stage.dop else 'mpd',
                chart, stage.grammar, stage.m,
                sample=stage.sample, kbest=stage.kbest,
                sent=sent, tags=tags,
                sldop_n=stage.sldop_n,
                backtransform=stage.backtransform,
                bitpar=stage.mode == 'pcfg-bitpar')
            msg += 'disambiguation: %s, %gs\n\t' % (
                msg1, clock() - begindisamb)

        # Pick the best tree and postprocess it; fall back to noparse()
        # when there is no tree or postprocessing raises ValueError.
        if parsetrees:
            resultstr, prob = max(parsetrees.items(), key=itemgetter(1))
            try:
                parsetree, fragments, noparse = self.postprocess(
                    resultstr, n, derivs)
            except ValueError as err:
                logging.error("something's amiss: %r", err)
                parsetree, prob, fragments, noparse = self.noparse(
                    stage, sent, tags, lastsuccessfulparse)
            else:
                lastsuccessfulparse = parsetree
            msg += probstr(prob) + ' '
        else:
            parsetree, prob, fragments, noparse = self.noparse(
                stage, sent, tags, lastsuccessfulparse)
        elapsedtime = clock() - begin
        msg += '%.2fs cpu time elapsed\n' % (elapsedtime)
        yield DictObj(name=stage.name, parsetree=parsetree, prob=prob,
                      parsetrees=parsetrees, fragments=fragments,
                      noparse=noparse, elapsedtime=elapsedtime, msg=msg)