def iterrules(istream):
    """
    Iterates through an input stream yielding synchronous rules.

    :param istream: an iterable of lines, one rule per line (comments start with '#')
    :return: a generator of SCFGProduction objects
    """
    for line in istream:
        if line.startswith('#'):
            continue
        line = line.strip()
        if not line:
            continue
        fields = line.split(' ||| ')
        if len(fields) < 4:
            raise ValueError('I expected at least 4 fields, got %d: %s' % (len(fields), fields))
        if not is_nonterminal(fields[0]):
            raise ValueError('Expected a nonterminal LHS, got something else: <%s>' % fields[0])
        lhs = Nonterminal(fields[0][1:-1])  # ignore brackets
        f_rhs = tuple(Nonterminal(x[1:-1]) if is_nonterminal(x) else Terminal(x)
                      for x in fields[1].split())
        e_rhs = tuple(Nonterminal(x[1:-1]) if is_nonterminal(x) else Terminal(x)
                      for x in fields[2].split())
        features = defaultdict(None, iterpairs(fields[3]))
        yield SCFGProduction.create(lhs, f_rhs, e_rhs, features)
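# Usage sketch for iterrules (not part of the original module): the line
# format '[LHS] ||| source ||| target ||| features' and the feature syntax
# accepted by iterpairs are assumptions inferred from the parsing code above.
def example_iterrules():
    lines = [
        '# comment lines and blank lines are skipped',
        '[X] ||| le [X,1] ||| the [X,1] ||| PhraseProb=-0.5',
    ]
    for rule in iterrules(lines):
        print(rule)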
def get_srule(self):
    """Returns the goal rule based on the current state of the factory."""
    if self._n == 0:
        rhs = [Nonterminal(self._start_str)]
    else:
        rhs = [Nonterminal(self._make_goal_str(self._n - 1))]
    return SCFGProduction(Nonterminal(self._make_goal_str()), rhs, rhs, [1], {'GoalRule': 1.0})
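# Usage sketch (assumption: get_srule is a method of the GoalRuleMaker
# factory used elsewhere in this module): with n=0 the goal rule rewrites
# the goal symbol to the start symbol.
def example_get_srule():
    maker = GoalRuleMaker(goal_str='GOAL', start_str='S', n=0)
    print(maker.get_srule())  # expected: a unary GoalRule rewriting the goal symbol to [S]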
def make_pass_grammar(seg, grammars, semiring, unk_lhs):
    """
    Make an input fsa for an input segment as well as its pass-through grammar.

    :param seg: a Segment object.
    :param grammars: a sequence of SCFGs.
    :param semiring: must provide `one`.
    :param unk_lhs: the nonterminal label used for pass-through rules.
    :return: the input WDFSA, the pass-through grammar
    """
    fsa = WDFSA()
    pass_grammar = SCFG()
    unk = Nonterminal(unk_lhs)
    tokens = seg.src.split()
    for i, token in enumerate(tokens):
        word = Terminal(token)
        if any(g.in_ivocab(word) for g in grammars):
            pass_grammar.add(SCFGProduction.create(unk, [word], [word],
                                                   {'PassThrough': 1.0}))
        else:
            pass_grammar.add(SCFGProduction.create(unk, [word], [word],
                                                   {'PassThrough': 1.0, 'Unknown': 1.0}))
        fsa.add_arc(i, i + 1, word, semiring.one)
    fsa.make_initial(0)
    fsa.make_final(len(tokens))
    return fsa, pass_grammar
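# Usage sketch: the returned FSA is a linear chain with one state per token
# boundary, and the pass-through grammar has one unary rule per token (flagged
# 'Unknown' when no grammar knows the word). The Segment below is a
# hypothetical stand-in with just the `src` attribute this function reads.
def example_make_pass_grammar(grammars, semiring):
    from types import SimpleNamespace
    seg = SimpleNamespace(src='le chien dort')  # hypothetical Segment
    fsa, pass_grammar = make_pass_grammar(seg, grammars, semiring, unk_lhs='X')
    print(fsa)           # states 0..3, one arc per token
    print(pass_grammar)  # three unary pass-through rules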
def pass0(seg,
          extra_grammar_paths=[],
          glue_grammar_paths=[],
          pass_through=True,
          default_symbol='X',
          goal_str='GOAL',
          start_str='S',
          max_span=-1,
          n_goal=0,
          saving={},
          redo=True,
          log=dummyfunc) -> 'Hypergraph':
    """
    Pass0 consists in parsing with the source side of the grammar.
    For now, pass0 does not do any scoring (not even local), but it could (TODO).

    Steps:
        1. Make a hypergraph view of the grammar
        2. Make an input DFA
        3. Parse the input DFA

    :return: source forest
    """
    if is_step_complete('forest', saving, redo):
        return unpickle_it(saving['forest'])
    # here we need to decode for sure
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    # make a hypergraph view of all available grammars
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=extra_grammar_paths,
                                      glue_grammar_paths=glue_grammar_paths,
                                      pass_through=pass_through,
                                      default_symbol=default_symbol)
    # parse source lattice
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=goal_str, start_str=start_str, n=n_goal)
    dfa = make_input_dfa(seg)
    forest = parse_dfa(grammar,
                       grammar.fetch(Nonterminal(start_str)),
                       dfa,
                       goal_maker.get_iview(),
                       bottomup=True,
                       constraint=HieroConstraints(grammar, dfa, max_span))
    if 'forest' in saving:
        pickle_it(saving['forest'], forest)
    return forest
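# Usage sketch for pass0 (the grammar paths are hypothetical; all other
# arguments keep the defaults defined above):
def example_pass0(seg):
    forest = pass0(seg,
                   extra_grammar_paths=['grammars/extra.gz'],  # hypothetical path
                   glue_grammar_paths=['grammars/glue.gz'],    # hypothetical path
                   log=logging.info)
    return forest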
def exact(uid, input, grammars, glue_grammars, options, outdir):
    """Parse the input exactly and decode as requested (Viterbi, k-best, ancestral sampling)."""
    semiring = SumTimes
    if options.intersection == 'earley':
        parser = Earley(grammars, input.fsa, glue_grammars=glue_grammars, semiring=semiring)
    elif options.intersection == 'nederhof':
        parser = Nederhof(grammars, input.fsa, glue_grammars=glue_grammars, semiring=semiring)
    else:
        raise NotImplementedError("I don't know this intersection algorithm: %s" % options.intersection)

    # make a forest
    logging.info('Parsing...')
    forest = parser.do(root=Nonterminal(options.start), goal=Nonterminal(options.goal))
    if not forest:
        logging.info('[%s] NO PARSE FOUND', uid)
        return

    tsorter = LazyTopSortTable(forest)

    # report info if necessary
    report_forest(uid, forest, tsorter, outdir, options)

    # decoding strategies
    if options.viterbi:
        tsort = tsorter.do()
        logging.info('Viterbi...')
        d = viterbi_derivation(forest, tsort, generations=options.generations)
        logging.info('Viterbi derivation: %s %s', derivation_weight(d), DerivationYield.derivation(d))
        save_viterbi('{0}/viterbi/{1}.gz'.format(outdir, uid),
                     d,
                     omega_d=derivation_weight,
                     get_projection=DerivationYield.tree)

    if options.kbest > 0:
        root = make_span(Nonterminal(options.goal))  # this is the root after intersection
        logging.info('K-best...')
        kbestparser = KBest(forest,
                            root,
                            options.kbest,
                            MaxTimes,
                            traversal=ItemDerivationYield.string,
                            uniqueness=False).do()
        logging.info('Done!')
        derivations = list(kbestparser.iterderivations())
        save_kbest('{0}/kbest/{1}.gz'.format(outdir, uid),
                   derivations,
                   omega_d=derivation_weight,
                   get_projection=DerivationYield.tree)

    if options.samples > 0:
        logging.info('Sampling...')
        sampler = AncestralSampler(forest, tsorter.do(), generations=options.generations)
        samples = list(sampler.sample(options.samples))
        # group samples by derivation and yield
        derivations = group_by_identity(samples)
        trees = group_by_projection(samples, get_projection=DerivationYield.tree)
        # save the empirical distribution over derivations
        save_mc_derivations('{0}/ancestral/derivations/{1}.gz'.format(outdir, uid),
                            derivations,
                            inside=sampler.Z,
                            omega_d=derivation_weight)
        # save the empirical distribution over strings
        save_mc_yields('{0}/ancestral/trees/{1}.gz'.format(outdir, uid), trees)

    logging.info('[%s] Finished!', uid)
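# Usage sketch for exact (hypothetical option values; SimpleNamespace stands
# in for the parsed command-line options this function expects):
def example_exact(input, grammars, glue_grammars, outdir):
    from types import SimpleNamespace
    options = SimpleNamespace(intersection='nederhof', start='S', goal='GOAL',
                              generations=10, viterbi=True, kbest=0, samples=0)
    exact('seg0', input, grammars, glue_grammars, options, outdir)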
def setUp(self):
    self.X = Nonterminal('X')
    self.X01 = Span(Nonterminal('X'), 0, 1)
    self.X01b = Span(Nonterminal('X'), 0, 1)
def setUp(self):
    self.X = Nonterminal('X')
    self.X2 = Nonterminal('X')
def setUp(self):
    self.tX1 = Terminal('X')
    self.tX2 = Terminal('X')
    self.nX1 = Nonterminal('X')
    self.nX2 = Nonterminal('X')
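# Sketch of the kind of assertions these fixtures support (an assumption:
# symbols compare by value within a class, and a Terminal is never equal to
# a Nonterminal over the same underlying string):
def test_terminal_vs_nonterminal(self):
    self.assertEqual(self.tX1, self.tX2)
    self.assertEqual(self.nX1, self.nX2)
    self.assertNotEqual(self.tX1, self.nX1)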
def make_dead_srule(lhs='X', dead='<dead-end>', fname='DeadRule'):
    """Make a synchronous rule that rewrites `lhs` to the dead-end terminal on both streams."""
    return SCFGProduction(Nonterminal(lhs),
                          (Terminal(dead),),
                          (Terminal(dead),),
                          [],
                          {fname: 1.0})
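# Usage sketch: a dead-end rule rewrites a nonterminal to a reserved terminal
# on both streams, presumably so that scoring can identify and penalise
# derivations that use it (the feature name makes it identifiable).
def example_dead_rule():
    rule = make_dead_srule(lhs='X')
    print(rule)  # a synchronous rule X -> <dead-end> / <dead-end> with DeadRule=1.0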
def get_next_srule(self):
    """Returns what would be the next goal rule without updating the state of the factory."""
    rhs = [Nonterminal(self._make_goal_str(self._n))]
    return SCFGProduction(Nonterminal(self._make_goal_str(self._n + 1)), rhs, rhs, [1], {'GoalRule': 1.0})
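# Sketch contrasting get_next_srule with get_srule (assumption: update(), as
# called by biparse below, advances the factory's counter):
def example_peek_goal_rule():
    maker = GoalRuleMaker(goal_str='GOAL', start_str='S', n=0)
    print(maker.get_next_srule())  # peek: does not advance the factory
    maker.update()                 # advance the counter
    print(maker.get_srule())       # expected to match the peeked rule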
def exact_parsing(seg: 'the input segment (e.g. a Sentence)',
                  grammars: 'list of CFGs',
                  glue_grammars: 'list of glue CFGs',
                  options: 'command line options',
                  outdir: 'where to save results'):
    """Parse the input exactly."""
    logging.debug('Building input hypergraph')
    hg = cfg_to_hg(grammars, glue_grammars, omega=PCFG(fname='LogProb'))
    root = hg.fetch(Nonterminal(options.start))
    # make input DFA
    dfa = make_dfa(seg.signatures)
    # get a parser implementation
    if options.intersection == 'earley':
        parser = EarleyParser(hg, dfa, semiring.inside)
    else:
        parser = NederhofParser(hg, dfa, semiring.inside)
    # parse
    logging.debug('Parsing')
    goal_rule = NewCFGProduction(Nonterminal(options.goal),
                                 [Nonterminal(options.goal)],
                                 {'LogProb': semiring.inside.one})
    forest = parser.do(root, goal_rule)
    if not forest:
        logging.info('[%s] NO PARSE FOUND', seg.id)
        return

    tsorter = LazyTopSortTable(forest, acyclic=False)

    # report some information about the forest
    if options.forest:
        with smart_wopen('{0}/forest/{1}.gz'.format(outdir, seg.id)) as fo:
            print(forest, file=fo)
    if options.count:
        tsort = tsorter.do()
        counter = DerivationCounter(forest, tsort)
        logging.info('Paths: %d', counter.n_derivations())
        with smart_wopen('{0}/count/{1}.gz'.format(outdir, seg.id)) as fo:
            print('nodes=%d edges=%d paths=%d' % (forest.n_nodes(),
                                                  forest.n_edges(),
                                                  counter.n_derivations()),
                  file=fo)

    # decoding strategies
    omega_d = lambda d: semiring.inside.times.reduce(d.weights())

    if options.viterbi:
        tsort = tsorter.do()
        logging.info('Viterbi...')
        raw_viterbi = viterbi_derivation(forest, tsort)
        viterbi = make_derivation(forest, raw_viterbi)
        score = omega_d(viterbi)
        logging.info('Viterbi derivation: %s', score)
        logging.info('Saving...')
        save_viterbi('{0}/viterbi/{1}.gz'.format(outdir, seg.id),
                     SimpleNamespace(derivation=viterbi, count=1, value=score),
                     get_projection=DerivationYield.derivation)

    if options.samples > 0:
        tsort = tsorter.do()
        logging.info('Sampling...')
        sampler = AncestralSampler(forest, tsort)
        raw_samples = sampler.sample(options.samples)
        logging.info('Saving...')
        derivations = group_raw(forest, raw_samples)
        save_mc_derivations('{0}/ancestral/derivations/{1}.gz'.format(outdir, seg.id),
                            derivations,
                            inside=sampler.Z)

    logging.info('[%s] Finished!', seg.id)
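# Sketch of what omega_d above computes, assuming the 'inside' semiring uses
# addition as 'times' over log-probabilities (consistent with PCFG(fname='LogProb')):
def example_derivation_score():
    import math
    log_weights = [math.log(0.5), math.log(0.2)]  # hypothetical rule log-probs
    print(sum(log_weights))  # the derivation's log-probability, about -2.303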
def sliced_parsing(seg: 'the input segment (e.g. a Sentence)',
                   grammars: 'a list of CFGs',
                   glue_grammars: 'a list of glue CFGs',
                   options: 'command line options',
                   outdir: 'where to save results'):
    """Parse the input using sliced forests."""
    # Input hypergraph
    logging.debug('Building input hypergraph')
    hg = cfg_to_hg(grammars, glue_grammars, omega=PCFG(fname='LogProb'))
    root = hg.fetch(Nonterminal(options.start))
    dfa = make_dfa(seg.signatures)
    goal_rule = NewCFGProduction(Nonterminal(options.goal),
                                 [Nonterminal(options.goal)],
                                 {'LogProb': semiring.inside.one})

    # Slice variables
    dist = get_distribution(options.free_dist)
    if options.free_dist == 'beta':
        prior = VectorOfPriors(get_prior(options.prior_a[0], options.prior_a[1]),
                               get_prior(options.prior_b[0], options.prior_b[1]))
    elif options.free_dist == 'exponential':
        prior = get_prior(options.prior_scale[0], options.prior_scale[1])
    u = SpanSliceVariables({}, dist, prior)
    logging.debug('%r', u)

    # make initial conditions
    # TODO: consider initialisation heuristics such as
    #  attempt_initialisation(fsa, grammars, glue_grammars, options)
    logging.info('Looking for initial set of conditions...')
    uninformed_conditions(hg, dfa, u, root, goal_rule, options.batch, options.intersection)
    logging.info('Done')
    #u.reset(conditions)

    # Sampling
    sizes = [0, 0, 0]  # number of nodes, edges and derivations (for logging purposes)
    if options.count:
        report_size = lambda: ' nodes={:5d} edges={:5d} |D|={:5d} '.format(*sizes)
    else:
        report_size = lambda: ' nodes={:5d} edges={:5d}'.format(sizes[0], sizes[1])
    if options.progress:
        bar = progressbar(range(options.burn + (options.samples * options.lag)),
                          prefix='Sampling', dynsuffix=report_size)
    else:
        bar = range(options.burn + (options.samples * options.lag))

    # sample
    markov_chain = []
    for _ in bar:
        # get a parser implementation
        if options.intersection == 'earley':
            parser = EarleyParser(hg, dfa, semiring.inside, u)
        else:
            parser = NederhofParser(hg, dfa, semiring.inside, u)
        # compute a slice (a randomly pruned forest)
        forest = parser.do(root, goal_rule)
        if not forest:
            raise ValueError('A slice can never be empty.')
        # sample from the slice
        tsort = RobustTopSortTable(forest)
        residual = reweight(forest, u, semiring.inside)
        sampler = AncestralSampler(forest, tsort, TableLookupFunction(residual))
        raw_derivations = sampler.sample(options.batch)
        # update the slice variables and the state of the Markov chain
        u.reset(make_batch_conditions(forest, raw_derivations))
        # TODO: compute values!
        # TODO: make a derivation class
        #  it is a hypergraph with its own edges, weights and rules
        #  so that we can ask its value directly
        #  it will basically replace the following
        #  >>> tuple(forest.rule(e) for e in d)
        #  then fix viterbi, MC, MCMC (all save_* methods) and kbest
        #markov_chain.append([tuple(forest.rule(e) for e in d) for d in raw_derivations])
        # this representation is forest agnostic
        markov_chain.append([make_derivation(forest, d) for d in raw_derivations])
        # update logging information
        sizes[0], sizes[1] = forest.n_nodes(), forest.n_edges()
        if options.count:  # reporting counts
            sizes[2] = sampler.n_derivations()

    # apply MCMC filters to hopefully reduce auto-correlation
    batches = apply_filters(markov_chain, burn=options.burn, lag=options.lag)
    samples = apply_batch_filters(batches, resample=options.resample)
    # group by derivation
    derivations = group_by_identity(samples)
    # group by trees (free of nonterminal annotation)
    #trees = group_by_projection(samples, DerivationYield.tree)

    # save everything
    #omega_d = lambda d: semiring.inside.times.reduce([r.weight for r in d])
    save_mcmc_derivation('{0}/slice/derivations/{1}.gz'.format(outdir, seg.id), derivations)
    #save_mcmc_yields('{0}/slice/trees/{1}.gz'.format(outdir, seg.id), trees)
    if options.save_chain:
        save_markov_chain('{0}/slice/chain/{1}.gz'.format(outdir, seg.id),
                          markov_chain,
                          derivation2str=lambda d: DerivationYield.derivation(d.rules()),
                          flat=False)
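# Sketch of the burn-in/lag filtering applied above (a hypothetical stand-in,
# not the module's apply_filters): drop the first `burn` states of the chain,
# then keep every `lag`-th batch.
def example_burn_and_lag(markov_chain, burn, lag):
    return markov_chain[burn::lag]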
def biparse(seg: SegmentMetaData,
            options: SimpleNamespace,
            joint_model: ModelView,
            conditional_model: ModelView,
            workingdir=None,
            redo=True,
            log=dummyfunc) -> SimpleNamespace:
    """
    Biparse a segment using a local model.

    1. we parse the source with a joint model
    2. we bi-parse source and target with a conditional model

    This separation allows us to factorise these models differently wrt local/nonlocal components.
    For example, an LM may be seen as a local (read tractable) component of a conditional model,
    and as a nonlocal (read intractable) component of a joint model.

    An implementation detail: bi-parsing is implemented as a cascade of intersections
    (with projections in between).

    :param seg: a segment
    :param options: parsing options
    :param joint_model: a factorised view of the joint model, here we use only the local components
    :param conditional_model: a factorised view of the conditional model, here we use only the local components
    :param workingdir: where to save files
    :param redo: whether or not previously saved computation should be discarded
    :param log: a logging function
    :return: result.{joint,conditional}.{forest,components} for the respective local model
    """
    if workingdir:
        saving = preprocessed_training_files('{0}/{1}'.format(workingdir, seg.id))
    else:
        saving = {}
    result = SimpleNamespace()
    result.joint = SimpleNamespace()
    result.conditional = SimpleNamespace()

    if conditional_model is None:
        steps = ['joint.forest', 'joint.components']
        if all(is_step_complete(step, saving, redo) for step in steps):
            log('[%d] Reusing joint and conditional distributions from files', seg.id)
            result.joint.forest = unpickle_it(saving['joint.forest'])
            result.joint.components = unpickle_it(saving['joint.components'])
            result.conditional.forest = None
            result.conditional.components = []
            return result

    steps = ['joint.forest', 'joint.components', 'conditional.forest', 'conditional.components']
    if all(is_step_complete(step, saving, redo) for step in steps):
        log('[%d] Reusing joint and conditional distributions from files', seg.id)
        result.joint.forest = unpickle_it(saving['joint.forest'])
        result.joint.components = unpickle_it(saving['joint.components'])
        result.conditional.forest = unpickle_it(saving['conditional.forest'])
        result.conditional.components = unpickle_it(saving['conditional.components'])
        return result

    # 1. Make a grammar

    # here we need to decode for sure
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    # make a hypergraph view of all available grammars
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=options.extra_grammars,
                                      glue_grammar_paths=options.glue_grammars,
                                      pass_through=options.pass_through,
                                      default_symbol=options.default_symbol)
    #print('GRAMMAR')
    #print(grammar)

    # 2. Joint distribution - Step 1: parse source lattice
    n_goal = 0
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=n_goal)
    src_dfa = make_input_dfa(seg)
    src_forest = parse_dfa(grammar,
                           grammar.fetch(Nonterminal(options.start)),
                           src_dfa,
                           goal_maker.get_iview(),
                           bottomup=True,
                           constraint=HieroConstraints(grammar, src_dfa, options.max_span))
    #print('SOURCE')
    #print(src_forest)
    if not src_forest:
        raise ValueError('I cannot parse the input lattice: i) make sure your grammar has glue rules; ii) make sure it handles OOVs')

    # 3. Target projection of the forest
    log('[%d] Project target rules', seg.id)
    tgt_forest = make_target_forest(src_forest)
    #print('TARGET')
    #print(tgt_forest)

    # 4. Joint distribution - Step 2: scoring
    log('[%d] Joint model: (exact) local scoring', seg.id)
    result.joint = exact_rescoring(joint_model.local_model(), tgt_forest, goal_maker, log)

    # save joint distribution
    if 'joint.forest' in saving:
        pickle_it(saving['joint.forest'], result.joint.forest)
    if 'joint.components' in saving:
        pickle_it(saving['joint.components'], result.joint.components)

    if conditional_model is None:
        result.conditional.forest = None
        result.conditional.components = []
        return result

    # 5. Conditional distribution - Step 1: parse the reference lattice
    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker.update()
    ref_forest = parse_dfa(result.joint.forest,
                           0,
                           ref_dfa,
                           goal_maker.get_oview(),
                           bottomup=False)

    if not ref_forest:  # the reference cannot be parsed
        log('[%d] References cannot be parsed', seg.id)
        result.conditional.forest = ref_forest
        result.conditional.components = []
    else:
        # 6. Conditional distribution - Step 2: scoring
        log('[%d] Conditional model: exact (local) scoring', seg.id)
        result.conditional = exact_rescoring(conditional_model.local_model(), ref_forest, goal_maker, log)

    # save conditional distribution
    if 'conditional.forest' in saving:
        pickle_it(saving['conditional.forest'], result.conditional.forest)
    if 'conditional.components' in saving:
        pickle_it(saving['conditional.components'], result.conditional.components)

    return result
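# Usage sketch for biparse (hypothetical option values; the ModelView
# arguments are assumed to be built elsewhere in the pipeline):
def example_biparse(seg, joint_model, conditional_model):
    from types import SimpleNamespace
    options = SimpleNamespace(extra_grammars=[], glue_grammars=[],
                              pass_through=True, default_symbol='X',
                              goal='GOAL', start='S', max_span=-1)
    result = biparse(seg, options, joint_model, conditional_model,
                     workingdir=None, redo=True)
    return result.joint.forest, result.conditional.forest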