Ejemplo n.º 1
0
 def parse(self):
     """Parse ``self.input`` and leave the filtered chart in ``self.chart``.

     Every call increments ``self.counter``.

     Without ``self.cfg_approx`` the input is parsed directly with
     ``self.disco_grammar`` under the configured beam settings. With
     ``self.cfg_approx`` the input is first parsed by ``pcfg.parse`` using
     ``self.disco_cfg_grammar``; only if that yields a chart is it filtered,
     pruned into a whitelist and used to restrict the parse with
     ``self.disco_grammar``. If the coarse parse yields no chart,
     ``self.chart`` is not reassigned by this method.
     """
     self.counter += 1
     if not self.cfg_approx:
         # Single-pass parse with the fine grammar.
         self.chart, _ = parse(
             self.input,
             self.disco_grammar,
             estimates=self.estimates,
             beam_beta=self.beam_beta,
             beam_delta=self.beam_delta,
             exhaustive=True,
         )
     else:
         # Coarse pass: the beam thresholds apply to this stage only.
         coarse_chart, _ = pcfg.parse(
             self.input,
             self.disco_cfg_grammar,
             beam_beta=self.beam_beta,
             beam_delta=self.beam_delta,
         )
         if coarse_chart:
             coarse_chart.filter()
             allowed_items, _ = prunechart(
                 coarse_chart,
                 self.disco_grammar,
                 k=self.pruning_k,
                 splitprune=True,
                 markorigin=True,
                 finecfg=False,
             )
             # Fine pass, restricted to the items the coarse chart licensed.
             self.chart, _ = parse(
                 self.input,
                 self.disco_grammar,
                 estimates=self.estimates,
                 whitelist=allowed_items,
                 splitprune=True,
                 markorigin=True,
                 exhaustive=True,
             )
     if self.chart:
         self.chart.filter()
Ejemplo n.º 2
0
    def test_cfg_approximation_conversion(self):
        """Walk through coarse-to-fine parsing with a CFG approximation.

        The grammar from ``self.build_nm_grammar()`` is converted twice: once
        with ``transform_grammar_cfg_approx`` (coarse) and once with
        ``transform_grammar`` (fine). The coarse grammar parses the input, the
        filtered coarse chart is pruned into a whitelist for the fine grammar,
        and the fine grammar then parses the same input restricted by that
        whitelist. Intermediate results are printed; nothing is asserted.
        """
        grammar = self.build_nm_grammar()
        disco_grammar_rules = list(transform_grammar_cfg_approx(grammar))
        print(disco_grammar_rules)
        disco_grammar = Grammar(disco_grammar_rules, start=grammar.start())
        print(disco_grammar)
        n = 2
        m = 3
        # Input of the shape a^n b^m c^n d^m.
        inp = ["a"] * n + ["b"] * m + ["c"] * n + ["d"] * m

        chart, msg = parse(inp, disco_grammar, beam_beta=exp(-4))
        chart.filter()
        print(chart)
        print(msg)

        fine_grammar_rules = list(transform_grammar(grammar))

        fine = Grammar(fine_grammar_rules, start=grammar.start())
        # Raw string: '\*' in a plain literal is an invalid escape sequence
        # (a warning today, an error in future Python versions). The compiled
        # pattern is unchanged.
        fine.getmapping(disco_grammar, re.compile(r'\*[0-9]+$'), None, True, True)

        whitelist, msg = prunechart(chart, fine, k=10000, splitprune=True, markorigin=True, finecfg=False)
        print(msg)
        print(whitelist)

        chart2, msg = parse(inp, fine, whitelist=whitelist, splitprune=True, markorigin=True)
        print(msg)
        print(chart2)
Ejemplo n.º 3
0
def test_issue51():
    """Parse a token the grammar's lexicon lacks, then filter the chart.

    The grammar only has a lexical rule for ``'a'``; the input is ``['b']``.
    The test passes as long as parsing and ``chart.filter()`` do not raise.
    """
    from discodop.containers import Grammar
    from discodop.plcfrs import parse
    rules = [
        ((('S', 'A'), ((0, ), )), 1.0),
        ((('A', 'Epsilon'), ('a', )), 1.0),
    ]
    grammar = Grammar(rules, start='S')
    chart, _ = parse(['b'], grammar)
    chart.filter()
Ejemplo n.º 4
0
def test_grammar(debug=False):
    """Extract grammars from a sample treebank and parse its sentences.

    Reads ``alpinosample.export``, binarizes the first ten trees, builds a
    DOP-reduction grammar from the first two of them and a double-DOP grammar
    from all ten, asserts that ``testgrammar()`` succeeds on the double-DOP
    grammar, and then parses every sentence of the corpus with it. Finally a
    treebank grammar is built from a single hand-written tree.

    :param debug: when true, print grammars, sentences and parser messages.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize
    reader = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(reader.sents().values())
    trees = []
    for original in list(reader.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')

    # DOP reduction over the first two trees only.
    reduction_rules = dopreduction(trees[:2], sents[:2])[0]
    reduction = Grammar(reduction_rules, start=trees[0].label)
    if debug:
        print(reduction)
    _ = reduction.testgrammar()

    # Double-DOP over all selected trees; only the rules are used here.
    ddop_rules, _, _, _ = doubledop(trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    ddop = Grammar(ddop_rules, start=trees[0].label)
    ddop.getmapping(
        None,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False,
    )
    if debug:
        print(ddop)
    ok, report = ddop.testgrammar()
    assert ok, 'RFE should sum to 1.\n%s' % report

    for gold, sent in zip(reader.trees().values(), sents):
        if debug:
            escaped = ' '.join(
                word.encode('unicode-escape').decode() for word in sent)
            print('sentence:', escaped)
        chart, parsemsg = plcfrs.parse(sent, ddop, exhaustive=True)
        if debug:
            print('\n', parsemsg, '\ngold ', gold, '\n', 'double dop', end='')
        if chart:
            getderivations(chart, 100)
            _, _ = marginalize('mpp', chart)
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # A treebank grammar from one deeply nested unary tree must be buildable.
    unary = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([unary], [[str(num) for num in range(10)]]))
Ejemplo n.º 5
0
def parse(compiledgrammar, testsent, testtags=None):
	"""Parse a sentence with a grammar and print the best derivations.

	The numbered input is echoed first (the tags when given, else the
	words). On success, up to 10 derivations from ``kbest.lazykbest`` are
	unbinarized and printed together with ``exp(-prob)``.

	:returns: True if a chart was produced, False otherwise."""
	chart, _ = plcfrs.parse(testsent,
		compiledgrammar, tags=testtags, exhaustive=True)
	shown = testtags if testtags else testsent
	numbered = ' '.join("%d:%s" % pair for pair in enumerate(shown))
	print("input:", numbered, end=' ')
	if not chart:
		print("no parse!\n")
		return False
	print()
	for derivation, logprob in kbest.lazykbest(chart, 10)[0]:
		tree = Tree(derivation)
		treetransforms.unbinarize(tree)
		print(exp(-logprob), tree)
	print()
	return True
Ejemplo n.º 6
0
def test_grammar(debug=False):
    """Extract grammars from a sample treebank and inspect the parses.

    Reads ``alpinosample.export``, binarizes the first ten trees, builds a
    DOP-reduction grammar and a double-DOP grammar, asserts that
    ``testgrammar()`` succeeds on the latter, and parses every sentence with
    it. For each parsed sentence up to 1000 derivations are mapped back to
    parse trees, their ``exp(-p)`` values are summed per tree, and the test
    fails if one tree lists the same (derivation, probability) pair twice.

    :param debug: when true, print grammars, sentences, trees and
        derivations.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp
    reader = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(reader.sents().values())
    trees = []
    for original in list(reader.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')

    # DOP reduction over the first two trees only.
    reduction_rules = dopreduction(trees[:2], sents[:2])[0]
    reduction = Grammar(reduction_rules, start=trees[0].label)
    if debug:
        print(reduction)
    _ = reduction.testgrammar()

    ddop_rules, backtransform, _, _ = doubledop(
        trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    ddop = Grammar(ddop_rules, start=trees[0].label)
    ddop.getmapping(
        ddop,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False,
    )
    if debug:
        print(ddop)
    assert ddop.testgrammar()[0], "RFE should sum to 1."

    for gold, sent in zip(reader.trees().values(), sents):
        if debug:
            escaped = ' '.join(
                word.encode('unicode-escape').decode() for word in sent)
            print("sentence:", escaped)
        chart, parsemsg = plcfrs.parse(sent, ddop, exhaustive=True)
        if debug:
            print('\n', parsemsg, '\ngold ', gold, '\n', 'double dop', end='')
        if chart:
            # treemass: parse tree string -> summed exp(-p) of its derivations
            # bytree: parse tree string -> list of (derivation, p) pairs
            treemass = {}
            bytree = {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (derivstr, p) in zip(ranked, derivations):
                recovered = Tree(
                    recoverfragments(edge.key, chart, backtransform))
                treestr = str(removefanoutmarkers(unbinarize(recovered)))
                treemass[treestr] = treemass.get(treestr, 0.0) + exp(-p)
                bytree.setdefault(treestr, []).append((derivstr, p))
            if debug:
                print(len(treemass), 'parsetrees',
                      sum(map(len, bytree.values())), 'derivations')
            for treestr, mass in sorted(treemass.items(), key=itemgetter(1)):
                if debug:
                    print(mass, treestr, '\nmatch:', treestr == str(gold))
                pairs = bytree[treestr]
                if len(set(pairs)) != len(pairs):
                    # Show the chart before failing on the duplicate.
                    print('chart:\n', chart)
                    assert len(set(pairs)) == len(pairs)
                if debug:
                    for derivstr, p in sorted(pairs, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), derivstr))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # A treebank grammar from one deeply nested unary tree must be buildable.
    unary = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([unary], [[str(num) for num in range(10)]]))
Ejemplo n.º 7
0
    def test_individual_parsing_stages(self):
        """Step through parsing, chart inspection and edge-weight projection.

        The grammar from ``self.build_grammar()`` is transformed and used to
        parse ``["a"] * 3``. The filtered chart is dumped item by item, then
        converted into a hypergraph by a ``PyDerivationManager``, which is
        serialized to a temporary file. Edge-weight projections of an initial
        latent annotation are compared against fixed expected vectors for
        ``variational=True`` and ``variational=False``. The serialized file
        is left on disk; its path is printed.
        """
        import os

        grammar = self.build_grammar()

        for r in transform_grammar(grammar):
            pprint(r)

        rule_list = list(transform_grammar(grammar))
        pprint(rule_list)
        disco_grammar = Grammar(rule_list, start=grammar.start())
        print(disco_grammar)

        inp = ["a"] * 3
        estimates = 'SXlrgaps', getestimates(disco_grammar, 40, grammar.start())
        print(type(estimates))
        chart, msg = parse(inp, disco_grammar, estimates=estimates)
        print(chart)
        print(msg)
        chart.filter()
        print("filtered chart")
        print(disco_grammar.nonterminals)
        print(type(disco_grammar.nonterminals))

        print(chart)

        root = chart.root()
        print("root", root, type(root))
        print(chart.indices(root))
        print(chart.itemstr(root))
        print(chart.stats())
        print("root label", chart.label(root))
        print(root, chart.itemid1(chart.label(root), chart.indices(root)))

        def item_str(item):
            """Render a chart item as ``<nonterminal>[<item id>]``."""
            return (disco_grammar.nonterminalstr(chart.label(item))
                    + "[" + str(item) + "]")

        # Items are addressed here by the ids 1..numitems().
        for i in range(1, chart.numitems() + 1):
            print(i, chart.label(i), chart.indices(i), chart.numedges(i))
            for edge_num in range(chart.numedges(i)):
                edge = chart.getEdgeForItem(i, edge_num)
                if isinstance(edge, tuple):
                    # Tuple edge: positions 1 and 2 hold the child items,
                    # with 0 standing for "no child".
                    children = [item_str(j) for j in [edge[1], edge[2]] if j != 0]
                    print("\t", item_str(i), "->", ' '.join(children))
                else:
                    # Otherwise the edge is used as an index into the input.
                    print("\t", item_str(i), "->", inp[edge])
        print(chart.getEdgeForItem(root, 0))

        manager = PyDerivationManager(grammar)
        manager.convert_chart_to_hypergraph(chart, disco_grammar, debug=True)

        # mkstemp() creates the file atomically; the deprecated mktemp() only
        # returns a name, which another process could claim first.
        handle, path = tempfile.mkstemp()
        os.close(handle)
        print(path)
        manager.serialize(bytes(path, encoding="utf-8"))

        gi = PyGrammarInfo(grammar, manager.get_nonterminal_map())
        sm = PyStorageManager()
        la = build_PyLatentAnnotation_initial(grammar, gi, sm)

        vec = py_edge_weight_projection(la, manager, variational=True, debug=True, log_mode=False)
        print(vec)
        self.assertEqual([1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 1.0], vec)

        vec = py_edge_weight_projection(la, manager, variational=False, debug=True, log_mode=False)
        print(vec)
        self.assertEqual([1.0, 1.0, 1.0, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 1.0], vec)

        der = manager.viterbi_derivation(0, vec, grammar)
        print(der)

        # Repeat the log-mode projection a few times and print each result.
        for _ in range(5):
            vec2 = py_edge_weight_projection(la, manager, debug=True, log_mode=True)
            print(vec2)
Ejemplo n.º 8
0
	def parse(self, sent, tags=None):
		"""Parse a sentence and perform postprocessing.

		Runs the stages in ``self.stages`` in order and yields one ``DictObj``
		per stage with the fields ``name``, ``parsetree``, ``prob``,
		``parsetrees``, ``fragments``, ``noparse``, ``elapsedtime`` and
		``msg``. A stage with ``prune`` set is only parsed when the previous
		stage left a chart; stages without a usable result fall back to
		``self.noparse``.

		:param sent: a sequence of tokens.
		:param tags: if given, will be given to the parser instead of trying
			all possible tags.
		:raises ValueError: for a non-binarized grammar in a stage that does
			not use bitpar, for an unknown stage mode, or when
			'pcfg-bitpar-nbest' is combined with sampling / without kbest."""
		# time.clock() was removed in Python 3.8. process_time() measures CPU
		# time, which is what the 'cpu time elapsed' message reports; clock()
		# is only looked up on interpreters that lack process_time().
		clock = getattr(time, 'process_time', None) or time.clock
		if self.postagging:
			if self.transformations and 'FOLD-NUMBERS' in self.transformations:
				sent = ['000' if NUMBERRE.match(a) else a for a in sent]
			sent = replaceraretestwords(sent,
					self.postagging.unknownwordfun,
					self.postagging.lexicon, self.postagging.sigs)
		sent = list(sent)
		if tags is not None:
			tags = list(tags)
		chart = start = inside = outside = lastsuccessfulparse = None
		for n, stage in enumerate(self.stages):
			begin = clock()
			noparse = False
			parsetrees = fragments = None
			msg = '%s:\t' % stage.name.upper()
			# Reset per stage: a 'dop-rerank' stage without a chart sets no
			# message of its own and would otherwise reuse the previous
			# stage's message (or hit an unbound name).
			msg1 = ''
			# Select the probability model this stage's grammar should use.
			model = u'default'
			if stage.dop:
				if (stage.estimator == 'ewe'
						or stage.objective.startswith('sl-dop')):
					model = u'ewe'
				elif stage.estimator == 'bon':
					model = u'bon'
				if stage.objective == 'shortest':
					model = u'shortest'
			x = stage.grammar.currentmodel
			stage.grammar.switch(model, logprob=stage.mode != 'pcfg-posterior')
			# Re-export the bitpar grammar files when they do not exist yet or
			# when the model switch above changed the current model.
			if stage.mode.startswith('pcfg-bitpar') and (
					not hasattr(stage, 'rulesfile')
					or x != stage.grammar.currentmodel):
				exportbitpargrammar(stage)
			if not stage.binarized and not stage.mode.startswith('pcfg-bitpar'):
				raise ValueError('non-binarized grammar requires use of bitpar')
			if not stage.prune or chart:
				if n != 0 and stage.prune and stage.mode != 'dop-rerank':
					# Derive a whitelist from the previous stage's result.
					beginprune = clock()
					if self.stages[n - 1].mode == 'pcfg-posterior':
						whitelist, msg1 = whitelistfromposteriors(
								inside, outside, start,
								self.stages[n - 1].grammar, stage.grammar,
								stage.k, stage.splitprune,
								self.stages[n - 1].markorigin,
								stage.mode.startswith('pcfg'))
					else:
						whitelist, msg1 = prunechart(
								chart, stage.grammar, stage.k,
								stage.splitprune,
								self.stages[n - 1].markorigin,
								stage.mode.startswith('pcfg'),
								self.stages[n - 1].mode == 'pcfg-bitpar-nbest')
					msg += '%s; %gs\n\t' % (msg1, clock() - beginprune)
				else:
					whitelist = None
				if stage.mode == 'pcfg':
					chart, msg1 = pcfg.parse(
							sent, stage.grammar, tags=tags,
							whitelist=whitelist if stage.prune else None)
				elif stage.mode == 'pcfg-posterior':
					inside, outside, start, msg1 = pcfg.doinsideoutside(
							sent, stage.grammar, tags=tags)
					chart = start
				elif stage.mode.startswith('pcfg-bitpar'):
					if stage.mode == 'pcfg-bitpar-forest':
						numderivs = 0
					elif (n == len(self.stages) - 1
							or not self.stages[n + 1].prune):
						numderivs = stage.m
					else:  # request 1000 nbest parses for CTF pruning
						numderivs = 1000
					chart, cputime, msg1 = pcfg.parse_bitpar(stage.grammar,
							stage.rulesfile.name, stage.lexiconfile.name,
							sent, numderivs,
							stage.grammar.start,
							stage.grammar.toid[stage.grammar.start], tags=tags)
					# Move the start time back by the reported cputime so it
					# is included in this stage's elapsed time.
					begin -= cputime
				elif stage.mode == 'plcfrs':
					chart, msg1 = plcfrs.parse(
							sent, stage.grammar, tags=tags,
							exhaustive=stage.dop or (
								n + 1 != len(self.stages)
								and self.stages[n + 1].prune),
							whitelist=whitelist,
							splitprune=stage.splitprune
								and self.stages[n - 1].split,
							markorigin=self.stages[n - 1].markorigin,
							estimates=(stage.estimates, stage.outside)
								if stage.estimates in ('SX', 'SXlrgaps')
								else None)
				elif stage.mode == 'dop-rerank':
					if chart:
						parsetrees = doprerank(chart, sent, stage.k,
								self.stages[n - 1].grammar, stage.grammar)
						msg1 = 're-ranked %d parse trees. ' % len(parsetrees)
				else:
					raise ValueError('unknown mode specified.')
				msg += '%s\n\t' % msg1
				if (n != 0 and not chart and not noparse
						and stage.split == self.stages[n - 1].split):
					logging.error('ERROR: expected successful parse. '
							'sent: %s\nstage: %s.', ' '.join(sent), stage.name)
			if chart and stage.mode not in ('pcfg-posterior', 'dop-rerank'
					) and not (self.relationalrealizational and stage.split):
				begindisamb = clock()
				if stage.mode == 'pcfg-bitpar-nbest':
					if not stage.kbest or stage.sample:
						raise ValueError('sampling not possible with bitpar '
								'in nbest mode.')
					derivations = chart.rankededges[chart.root()]
					entries = [None] * len(derivations)
				else:
					derivations, entries = getderivations(chart, stage.m,
							kbest=stage.kbest, sample=stage.sample,
							derivstrings=stage.dop != 'doubledop'
									or self.verbosity >= 3
									or stage.objective == 'mcc')
				if self.verbosity >= 3:
					print('sent: %s\nstage: %s' % (' '.join(sent), stage.name))
					print('%d-best derivations:\n%s' % (
						min(stage.m, 100),
						'\n'.join('%d. %s %s' % (rank + 1,
							('subtrees=%d' % abs(int(prob / log(0.5))))
							if stage.objective == 'shortest'
							else ('p=%g' % exp(-prob)), deriv)
						for rank, (deriv, prob)
						in enumerate(derivations[:100]))))
					print('sum of probabitilies: %g\n' %
							sum(exp(-prob) for _, prob in derivations[:100]))
				if stage.objective == 'shortest':
					stage.grammar.switch(u'ewe' if stage.estimator == 'ewe'
							else u'default', True)
				parsetrees, msg1 = marginalize(
						stage.objective if stage.dop else 'mpd',
						derivations, entries, chart,
						sent=sent, tags=tags,
						backtransform=stage.backtransform,
						k=stage.m, sldop_n=stage.sldop_n,
						mcc_labda=stage.mcc_labda, mcc_labels=stage.mcc_labels,
						bitpar=stage.mode == 'pcfg-bitpar-nbest')
				msg += 'disambiguation: %s, %gs\n\t' % (
						msg1, clock() - begindisamb)
				if self.verbosity >= 3:
					besttrees = nlargest(100, parsetrees, key=itemgetter(1))
					print('100-best parse trees:\n%s' % '\n'.join(
							'%d. %s %s' % (rank + 1, probstr(prob), treestr)
							for rank, (treestr, prob, _)
							in enumerate(besttrees)))
					print('sum of probabitilies: %g\n' %
							sum((prob[1] if isinstance(prob, tuple) else prob)
								for _, prob, _ in besttrees))
			if self.verbosity >= 4:
				print('Chart:\n%s' % chart)
			if parsetrees:
				try:
					resultstr, prob, fragments = max(
							parsetrees, key=itemgetter(1))
					parsetree, noparse = self.postprocess(resultstr, n)
					if not all(a for a in parsetree.subtrees()):
						raise ValueError('empty nodes in tree: %s' % parsetree)
					if len(parsetree.leaves()) != len(sent):
						raise ValueError('leaves missing. original tree: %s\n'
							'postprocessed: %r' % (resultstr, parsetree))
				except Exception:  # pylint: disable=W0703
					# Deliberate best-effort: log the failure and fall back to
					# the no-parse strategy instead of aborting the sentence.
					logging.error("something's amiss. %s", ''.join(
								traceback.format_exception(*sys.exc_info())))
					parsetree, prob, noparse = self.noparse(
							stage, sent, tags, lastsuccessfulparse)
				else:
					lastsuccessfulparse = parsetree
				msg += probstr(prob) + ' '
			else:
				fragments = None
				parsetree, prob, noparse = self.noparse(
						stage, sent, tags, lastsuccessfulparse)
			elapsedtime = clock() - begin
			msg += '%.2fs cpu time elapsed\n' % (elapsedtime)
			yield DictObj(name=stage.name, parsetree=parsetree, prob=prob,
					parsetrees=parsetrees, fragments=fragments,
					noparse=noparse, elapsedtime=elapsedtime, msg=msg)
Ejemplo n.º 9
0
def test():
	"""Extract several grammars from a sample treebank and parse with them.

	Builds a treebank grammar, a DOP-reduction grammar and a double-DOP
	grammar from the first ten binarized trees of ``alpinosample.export``,
	then parses every sentence with the double-DOP grammar, maps up to 1000
	derivations back to parse trees and prints them. Passing ``--debug`` on
	the command line additionally prints the individual derivations."""
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import (binarize, unbinarize,
			addfanoutmarkers, removefanoutmarkers)
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from discodop.fragments import getfragments
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	filename = "alpinosample.export"
	reader = NegraCorpusReader('.', filename, punct='move')
	sents = list(reader.sents().values())
	trees = []
	for original in list(reader.parsed_sents().values())[:10]:
		binarized = binarize(original.copy(True), horzmarkov=1)
		trees.append(addfanoutmarkers(binarized))

	print('plcfrs')
	lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
	print(lcfrs)

	print('dop reduction')
	reduction_rules = dopreduction(trees[:2], sents[:2])[0]
	reduction = Grammar(reduction_rules, start=trees[0].label)
	print(reduction)
	reduction.testgrammar()

	fragments = getfragments(trees, sents, 1)
	debug = '--debug' in sys.argv
	ddop_rules, backtransform, _ = doubledop(trees, fragments, debug=debug)
	print('\ndouble dop grammar')
	ddop = Grammar(ddop_rules, start=trees[0].label)
	ddop.getmapping(ddop, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	print(ddop)
	assert ddop.testgrammar(), "DOP1 should sum to 1."
	for gold, sent in zip(reader.parsed_sents().values(), sents):
		escaped = ' '.join(
				word.encode('unicode-escape').decode() for word in sent)
		print("sentence:", escaped)
		chart, parsemsg = plcfrs.parse(sent, ddop, exhaustive=True)
		print('\n', parsemsg, end='')
		print("\ngold ", gold)
		print("double dop", end='')
		if not chart:
			print("no parse")
			print(chart)
		else:
			# treemass: parse tree string -> summed exp(-p) of its derivations
			# bytree: parse tree string -> list of (derivation, p) pairs
			treemass = {}
			bytree = {}
			derivations, _ = lazykbest(chart, 1000, b'}<')
			ranked = chart.rankededges[chart.root()]
			for edge, (derivstr, p) in zip(ranked, derivations):
				recovered = Tree(recoverfragments(edge.getkey(), chart,
					ddop, backtransform))
				treestr = str(removefanoutmarkers(unbinarize(recovered)))
				treemass[treestr] = treemass.get(treestr, 0.0) + exp(-p)
				bytree.setdefault(treestr, []).append((derivstr, p))
			print(len(treemass), 'parsetrees', end='')
			print(sum(map(len, bytree.values())), 'derivations')
			for treestr, mass in sorted(treemass.items(), key=itemgetter(1)):
				print(mass, '\n', treestr, end='')
				print("match:", treestr == str(gold))
				pairs = bytree[treestr]
				# The same (derivation, p) pair must not be listed twice.
				assert len(set(pairs)) == len(pairs)
				if debug:
					for derivstr, p in sorted(pairs, key=itemgetter(1)):
						print(' <= %6g %s' % (exp(-p), derivstr))
		print()
	# A treebank grammar from one deeply nested unary tree must be buildable.
	unary = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([unary], [[str(num) for num in range(10)]]))
Ejemplo n.º 10
0
def test_grammar(debug=False):
	"""Extract grammars from a sample treebank and inspect the parses.

	Reads ``alpinosample.export``, binarizes the first ten trees, builds a
	DOP-reduction grammar and a double-DOP grammar, asserts that
	``testgrammar()`` succeeds on the latter, and parses every sentence with
	it. For each parsed sentence up to 1000 derivations are mapped back to
	parse trees, their ``exp(-p)`` values are summed per tree, and the test
	fails if one tree lists the same (derivation, probability) pair twice.

	:param debug: when true, print grammars, sentences, trees and
		derivations; also passed on to ``doubledop``."""
	from discodop.grammar import treebankgrammar, dopreduction, doubledop
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from math import exp
	reader = NegraCorpusReader('alpinosample.export', punct='move')
	sents = list(reader.sents().values())
	trees = []
	for original in list(reader.trees().values())[:10]:
		binarized = binarize(original.copy(True), horzmarkov=1)
		trees.append(addfanoutmarkers(binarized))
	if debug:
		print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
		print('dop reduction')

	# DOP reduction over the first two trees only.
	reduction_rules = dopreduction(trees[:2], sents[:2])[0]
	reduction = Grammar(reduction_rules, start=trees[0].label)
	if debug:
		print(reduction)
	_ = reduction.testgrammar()

	ddop_rules, backtransform, _, _ = doubledop(trees, sents,
			debug=debug, numproc=1)
	if debug:
		print('\ndouble dop grammar')
	ddop = Grammar(ddop_rules, start=trees[0].label)
	ddop.getmapping(ddop, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	if debug:
		print(ddop)
	assert ddop.testgrammar()[0], "RFE should sum to 1."

	for gold, sent in zip(reader.trees().values(), sents):
		if debug:
			escaped = ' '.join(
					word.encode('unicode-escape').decode() for word in sent)
			print("sentence:", escaped)
		chart, parsemsg = plcfrs.parse(sent, ddop, exhaustive=True)
		if debug:
			print('\n', parsemsg, '\ngold ', gold, '\n', 'double dop', end='')
		if chart:
			# treemass: parse tree string -> summed exp(-p) of its derivations
			# bytree: parse tree string -> list of (derivation, p) pairs
			treemass = {}
			bytree = {}
			derivations, _ = lazykbest(chart, 1000, b'}<')
			ranked = chart.rankededges[chart.root()]
			for edge, (derivstr, p) in zip(ranked, derivations):
				recovered = Tree(
						recoverfragments(edge.key, chart, backtransform))
				treestr = str(removefanoutmarkers(unbinarize(recovered)))
				treemass[treestr] = treemass.get(treestr, 0.0) + exp(-p)
				bytree.setdefault(treestr, []).append((derivstr, p))
			if debug:
				print(len(treemass), 'parsetrees',
						sum(map(len, bytree.values())), 'derivations')
			for treestr, mass in sorted(treemass.items(), key=itemgetter(1)):
				if debug:
					print(mass, treestr, '\nmatch:', treestr == str(gold))
				pairs = bytree[treestr]
				if len(set(pairs)) != len(pairs):
					# Show the chart before failing on the duplicate.
					print('chart:\n', chart)
					assert len(set(pairs)) == len(pairs)
				if debug:
					for derivstr, p in sorted(pairs, key=itemgetter(1)):
						print(' <= %6g %s' % (exp(-p), derivstr))
		elif debug:
			print('no parse\n', chart)
		if debug:
			print()

	# A treebank grammar from one deeply nested unary tree must be buildable.
	unary = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([unary], [[str(num) for num in range(10)]]))
Ejemplo n.º 11
0
	def parse(self, sent, tags=None):
		""" Parse a sentence with every stage and yield one result per stage.

		Runs the stages in ``self.stages`` in order and yields one ``DictObj``
		per stage with the fields ``name``, ``parsetree``, ``prob``,
		``parsetrees``, ``fragments``, ``noparse``, ``elapsedtime`` and
		``msg``. A stage with ``prune`` set is only parsed when the previous
		stage left a chart; stages without a usable result fall back to
		``self.noparse``.

		:param sent: a sequence of tokens.
		:param tags: if given, will be given to the parser instead of trying
			all possible tags.
		:raises ValueError: when a stage has an unknown mode. """
		# time.clock() was removed in Python 3.8. process_time() measures CPU
		# time, which is what the 'cpu time elapsed' message reports; clock()
		# is only looked up on interpreters that lack process_time().
		clock = getattr(time, 'process_time', None) or time.clock
		if self.postagging:
			sent = replaceraretestwords(sent,
					self.postagging['unknownwordfun'],
					self.postagging['lexicon'], self.postagging['sigs'])
		sent = list(sent)
		if tags is not None:
			tags = list(tags)
		chart = start = inside = outside = lastsuccessfulparse = None
		for n, stage in enumerate(self.stages):
			begin = clock()
			noparse = False
			parsetrees = fragments = None
			msg = '%s:\t' % stage.name.upper()
			# Reset per stage: a 'dop-rerank' stage without a chart sets no
			# message of its own and would otherwise reuse the previous
			# stage's message (or hit an unbound name).
			msg1 = ''
			# Select the probability model this stage's grammar should use.
			model = u'default'
			if stage.dop:
				if (stage.estimator == 'ewe'
						or stage.objective.startswith('sl-dop')):
					model = u'ewe'
				elif stage.estimator == 'bon':
					model = u'bon'
				if stage.objective == 'shortest':
					model = u'shortest'
			x = stage.grammar.currentmodel
			stage.grammar.switch(model, logprob=stage.mode != 'pcfg-posterior')
			# Re-export the bitpar grammar files when they do not exist yet or
			# when the model switch above changed the current model.
			if stage.mode == 'pcfg-bitpar' and (
					not hasattr(stage, 'rulesfile')
					or x != stage.grammar.currentmodel):
				exportbitpargrammar(stage)
			if not stage.prune or chart:
				if n != 0 and stage.prune and stage.mode != 'dop-rerank':
					# Derive a whitelist from the previous stage's result.
					beginprune = clock()
					if self.stages[n - 1].mode == 'pcfg-posterior':
						whitelist, msg1 = whitelistfromposteriors(
								inside, outside, start,
								self.stages[n - 1].grammar, stage.grammar,
								stage.k, stage.splitprune,
								self.stages[n - 1].markorigin,
								stage.mode.startswith('pcfg'))
					else:
						whitelist, msg1 = prunechart(
								chart, stage.grammar, stage.k,
								stage.splitprune,
								self.stages[n - 1].markorigin,
								stage.mode.startswith('pcfg'),
								self.stages[n - 1].mode == 'pcfg-bitpar')
					msg += '%s; %gs\n\t' % (msg1, clock() - beginprune)
				else:
					whitelist = None
				if stage.mode == 'pcfg':
					chart, msg1 = pcfg.parse(
							sent, stage.grammar, tags=tags,
							whitelist=whitelist if stage.prune else None)
				elif stage.mode == 'pcfg-posterior':
					inside, outside, start, msg1 = pcfg.doinsideoutside(
							sent, stage.grammar, tags=tags)
					# Only the truth value is kept as the "chart" here.
					chart = bool(start)
				elif stage.mode == 'pcfg-bitpar':
					chart, msg1 = pcfg.parse_bitpar(stage.grammar,
							stage.rulesfile.name, stage.lexiconfile.name,
							sent, 1000,  # orig: stage.m; fixed for ctf
							stage.grammar.start,
							stage.grammar.toid[stage.grammar.start], tags=tags)
					msg1 += '%d derivations' % (
							len(chart.rankededges[chart.root()]))
				elif stage.mode == 'plcfrs':
					chart, msg1 = plcfrs.parse(
							sent, stage.grammar, tags=tags,
							exhaustive=stage.dop or (n + 1 != len(self.stages)
								and self.stages[n + 1].prune),
							whitelist=whitelist,
							splitprune=stage.splitprune
								and self.stages[n - 1].split,
							markorigin=self.stages[n - 1].markorigin,
							estimates=(stage.useestimates, stage.outside)
								if stage.useestimates in ('SX', 'SXlrgaps')
								else None)
				elif stage.mode == 'dop-rerank':
					if chart:
						parsetrees = doprerank(chart, sent, stage.k,
								self.stages[n - 1].grammar, stage.grammar)
						msg1 = 're-ranked %d parse trees. ' % len(parsetrees)
				else:
					raise ValueError('unknown mode specified.')
				msg += '%s\n\t' % msg1
				if (n != 0 and not chart and not noparse
						and stage.split == self.stages[n - 1].split):
					logging.error('ERROR: expected successful parse. '
							'sent: %s\nstage: %s.', ' '.join(sent), stage.name)
			if chart and stage.mode not in ('pcfg-posterior', 'dop-rerank'
					) and not (self.relationalrealizational and stage.split):
				begindisamb = clock()
				if stage.objective == 'shortest':
					stage.grammar.switch('ewe' if stage.estimator == 'ewe'
							else 'default', True)
				parsetrees, derivs, msg1 = marginalize(stage.objective
						if stage.dop else 'mpd',
						chart, stage.grammar, stage.m,
						sample=stage.sample, kbest=stage.kbest,
						sent=sent, tags=tags,
						sldop_n=stage.sldop_n,
						backtransform=stage.backtransform,
						bitpar=stage.mode == 'pcfg-bitpar')
				msg += 'disambiguation: %s, %gs\n\t' % (
						msg1, clock() - begindisamb)
			if parsetrees:
				# The highest-valued entry of parsetrees is the result.
				resultstr, prob = max(parsetrees.items(), key=itemgetter(1))
				try:
					parsetree, fragments, noparse = self.postprocess(
							resultstr, n, derivs)
				except ValueError as err:
					# Best-effort: log and fall back to the no-parse strategy.
					logging.error("something's amiss: %r", err)
					parsetree, prob, fragments, noparse = self.noparse(
							stage, sent, tags, lastsuccessfulparse)
				else:
					lastsuccessfulparse = parsetree
				msg += probstr(prob) + ' '
			else:
				parsetree, prob, fragments, noparse = self.noparse(
						stage, sent, tags, lastsuccessfulparse)
			elapsedtime = clock() - begin
			msg += '%.2fs cpu time elapsed\n' % (elapsedtime)
			yield DictObj(name=stage.name, parsetree=parsetree, prob=prob,
					parsetrees=parsetrees, fragments=fragments,
					noparse=noparse, elapsedtime=elapsedtime, msg=msg)