def exact_rescoring(model: ModelContainer,
                    forest: Hypergraph,
                    goal_maker: GoalRuleMaker,
                    log=dummyfunc) -> SimpleNamespace:
    """
    Score every edge of a forest exactly under the given model.

    If the model has a stateful part, the forest is rescored with `rescore_forest`
    (after asking `goal_maker` for a fresh goal rule) and the rescored forest is returned.
    Otherwise the input forest is returned as is, together with one FComponents object
    per pair of (lookup, stateless) components.

    :param model: an instance of ModelContainer (its lookup/stateless/stateful parts are used)
    :param forest: a Hypergraph
    :param goal_maker: an object to deliver (output view of) goal rules
    :param log: a logging function
    :return: a SimpleNamespace with attributes `forest` and `components`
    """
    if model.stateful:
        # stateful features are present: the forest itself has to be rescored
        log('Forest rescoring')
        goal_maker.update()
        lookup_scorer = TableLookupScorer(model.lookup)
        stateless_scorer = StatelessScorer(model.stateless)
        stateful_scorer = StatefulScorer(model.stateful)
        rescored_forest, rescored_components = rescore_forest(forest,
                                                              0,
                                                              lookup_scorer,
                                                              stateless_scorer,
                                                              stateful_scorer,
                                                              goal_rule=goal_maker.get_oview(),
                                                              keep_components=True)
        return SimpleNamespace(forest=rescored_forest, components=rescored_components)

    # only lookup and stateless features: the forest is kept and components are computed directly
    log('Lookup scoring')
    lookup_part = get_lookup_components(forest, model.lookup.extractors())
    log('Stateless scoring')
    stateless_part = get_stateless_components(forest, model.stateless.extractors())
    merged = [FComponents([from_lookup, from_stateless])
              for from_lookup, from_stateless in zip(lookup_part, stateless_part)]
    return SimpleNamespace(forest=forest, components=merged)
def training_biparse(seg, args, workingdir, model, log=dummyfunc) -> 'bool':
    """
    Bi-parse a training segment, reusing files in `workingdir` when possible.

    Steps:
        I. Pass0 and pass1
            - parse the source (pass0)
            - local scoring (pass1), saved as `<id>.hyp.*` files
        II. Pass2
            - make a reference DFA
            - parse the reference DFA
            - rescore the reference forest with lookup, stateless and stateful scorers
            - save the rescored forest and its components as `<id>.ref.*` files

    When all pass1 files already exist and `args.redo` is false, nothing is recomputed:
    the segment is reported as bi-parsable iff all reference files exist as well.

    :param seg: a segment (its `id` names the files)
    :param args: command line options
    :param workingdir: directory where files are stored
    :param model: the model whose lookup/stateless/stateful parts are used for scoring
    :param log: a logging function
    :return: whether or not the input is bi-parsable
    """
    prefix = '{0}/{1}'.format(workingdir, seg.id)
    hyp_forest_path = prefix + '.hyp.forest'
    hyp_lookup_path = prefix + '.hyp.ffs.rule'
    hyp_stateless_path = prefix + '.hyp.ffs.stateless'
    ref_components_path = prefix + '.ref.ffs.all'
    ref_forest_path = prefix + '.ref.forest'

    # check for redundant work
    pass1_outputs = [hyp_forest_path, hyp_lookup_path, hyp_stateless_path]
    if all(os.path.exists(path) for path in pass1_outputs) and not args.redo:
        reference_outputs = [ref_components_path, ref_forest_path]
        reusable = all(os.path.exists(path) for path in reference_outputs)
        if reusable:
            log('[%d] Reusing forests for segment', seg.id)
        # missing reference files are taken to mean the segment was not parsable
        return reusable

    # pass0: parsing
    src_forest = pass0(seg,
                       extra_grammar_paths=args.extra_grammar,
                       glue_grammar_paths=args.glue_grammar,
                       pass_through=args.pass_through,
                       default_symbol=args.default_symbol,
                       goal_str=args.goal,
                       start_str=args.start,
                       n_goal=0,
                       saving={},
                       redo=args.redo,
                       log=log)

    # pass1: local scoring (only the forest is needed afterwards)
    hyp_saving = {
        'forest': hyp_forest_path,
        'lookup': hyp_lookup_path,
        'stateless': hyp_stateless_path,
    }
    tgt_forest, _lookup_comps, _stateless_comps = pass1(seg,
                                                        src_forest,
                                                        model,
                                                        saving=hyp_saving,
                                                        redo=args.redo,
                                                        log=log)

    # parse reference lattice
    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker = GoalRuleMaker(goal_str=args.goal, start_str=args.start, n=1)
    ref_forest = parse_dfa(tgt_forest, 0, ref_dfa, goal_maker.get_oview(), bottomup=False)
    if not ref_forest:
        return False  # not parsable

    # pass2: rescore reference forest
    ref_saving = {
        'forest': ref_forest_path,
        'components': ref_components_path,
    }
    goal_maker.update()
    lookup_scorer = TableLookupScorer(model.lookup)
    stateless_scorer = StatelessScorer(model.stateless)
    stateful_scorer = StatefulScorer(model.stateful)
    pass2(seg,
          ref_forest,
          lookup_scorer,
          stateless_scorer,
          stateful_scorer,
          goal_maker.get_oview(),
          saving=ref_saving,
          redo=args.redo,
          log=log)
    return True  # parsable
def biparse(seg: SegmentMetaData,
            options: SimpleNamespace,
            joint_model: ModelView,
            conditional_model: ModelView,
            workingdir=None,
            redo=True,
            log=dummyfunc) -> SimpleNamespace:
    """
    Biparse a segment using a local model.

    The work is split in two stages:
        1. the source is parsed and scored with the local part of the joint model
        2. the result is intersected with the reference and scored with the local part
           of the conditional model

    Keeping the two models apart lets each one be factorised differently with respect to
    local/nonlocal components (e.g. an LM may be local for a conditional model and
    nonlocal for a joint model).

    Implementation-wise, bi-parsing is a cascade of intersections with projections in between.

    If `conditional_model` is None, only the joint stage is performed and the conditional
    result is a None forest with an empty list of components.

    :param seg: a segment
    :param options: parsing options
    :param joint_model: a factorised view of the joint model, here we use only the local components
    :param conditional_model: a factorised view of the conditional, here we use only the local
        components (or None to skip the conditional stage)
    :param workingdir: where to save files (nothing is saved if not given)
    :param redo: whether or not previously saved computation should be discarded
    :param log: a logging function
    :return: result.{joint,conditional}.{forest,components} for the respective local model
    :raises ValueError: if the input lattice cannot be parsed
    """
    saving = preprocessed_training_files('{0}/{1}'.format(workingdir, seg.id)) if workingdir else {}
    result = SimpleNamespace(joint=SimpleNamespace(), conditional=SimpleNamespace())

    # Shortcut A: only the joint distribution is required and it is available from files
    if conditional_model is None:
        joint_steps = ['joint.forest', 'joint.components']
        if all(is_step_complete(name, saving, redo) for name in joint_steps):
            log('[%d] Reusing joint and conditional distributions from files', seg.id)
            result.joint.forest = unpickle_it(saving['joint.forest'])
            result.joint.components = unpickle_it(saving['joint.components'])
            result.conditional.forest = None
            result.conditional.components = []
            return result

    # Shortcut B: both distributions are available from files
    all_steps = ['joint.forest', 'joint.components', 'conditional.forest', 'conditional.components']
    if all(is_step_complete(name, saving, redo) for name in all_steps):
        log('[%d] Reusing joint and conditional distributions from files', seg.id)
        result.joint.forest = unpickle_it(saving['joint.forest'])
        result.joint.components = unpickle_it(saving['joint.components'])
        result.conditional.forest = unpickle_it(saving['conditional.forest'])
        result.conditional.components = unpickle_it(saving['conditional.components'])
        return result

    # From here on we do need to decode.

    # 1. Make a grammar: a hypergraph view of all available grammars
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=options.extra_grammars,
                                      glue_grammar_paths=options.glue_grammars,
                                      pass_through=options.pass_through,
                                      default_symbol=options.default_symbol)

    # 2. Joint distribution - Step 1: parse source lattice
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=0)
    src_dfa = make_input_dfa(seg)
    start_node = grammar.fetch(Nonterminal(options.start))
    input_goal = goal_maker.get_iview()
    span_constraint = HieroConstraints(grammar, src_dfa, options.max_span)
    src_forest = parse_dfa(grammar,
                           start_node,
                           src_dfa,
                           input_goal,
                           bottomup=True,
                           constraint=span_constraint)
    if not src_forest:
        raise ValueError('I cannot parse the input lattice: '
                         'i) make sure your grammar has glue rules; '
                         'ii) make sure it handles OOVs')

    # 3. Target projection of the forest
    log('[%d] Project target rules', seg.id)
    tgt_forest = make_target_forest(src_forest)

    # 4. Joint distribution - Step 2: scoring
    log('[%d] Joint model: (exact) local scoring', seg.id)
    result.joint = exact_rescoring(joint_model.local_model(), tgt_forest, goal_maker, log)

    # save joint distribution (only the parts for which a file is configured)
    for attr in ('forest', 'components'):
        key = 'joint.' + attr
        if key in saving:
            pickle_it(saving[key], getattr(result.joint, attr))

    if conditional_model is None:
        # no conditional stage requested
        result.conditional.forest = None
        result.conditional.components = []
        return result

    # 5. Conditional distribution - Step 1: parse the reference lattice
    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker.update()
    ref_forest = parse_dfa(result.joint.forest, 0, ref_dfa, goal_maker.get_oview(), bottomup=False)

    if ref_forest:
        # 6. Conditional distribution - Step 2: scoring
        log('[%d] Conditional model: exact (local) scoring', seg.id)
        result.conditional = exact_rescoring(conditional_model.local_model(), ref_forest, goal_maker, log)
    else:
        # reference cannot be parsed: keep the (empty) forest and no components
        log('[%d] References cannot be parsed', seg.id)
        result.conditional.forest = ref_forest
        result.conditional.components = []

    # save conditional distribution (only the parts for which a file is configured)
    for attr in ('forest', 'components'):
        key = 'conditional.' + attr
        if key in saving:
            pickle_it(saving[key], getattr(result.conditional, attr))

    return result