Esempio n. 1
0
def pass2(seg, forest,
          lookup_scorer, stateless_scorer, stateful_scorer,
          goal_rule, omega=None,
          saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Pass2: exactly rescore a forest, reusing pickled results when possible.

    :param seg: segment being processed (only `seg.id` is read, for logging)
    :param forest: forest handed to rescore_forest
    :param lookup_scorer: lookup scorer forwarded to rescore_forest
    :param stateless_scorer: stateless scorer forwarded to rescore_forest
    :param stateful_scorer: stateful scorer forwarded to rescore_forest
    :param goal_rule: goal rule forwarded to rescore_forest
    :param omega: forwarded to rescore_forest as `omega`
    :param saving: maps the step names 'forest' and 'components' to the
        locations given to pickle_it/unpickle_it; a missing key disables saving
        of that artefact
    :param redo: forwarded to is_step_complete
    :param log: logging function
    :return: rescored forest (a Hypergraph), and components (one FComponents object per edge)
    """
    steps = ('forest', 'components')

    # both artefacts must be available, otherwise everything is recomputed
    if all(is_step_complete(step, saving, redo) for step in steps):
        return tuple(unpickle_it(saving[step]) for step in steps)

    log('[%d] Forest rescoring', seg.id)
    rescored_forest, components = rescore_forest(
        forest,
        0,
        lookup_scorer,
        stateless_scorer,
        stateful_scorer,
        goal_rule=goal_rule,
        omega=omega,
        keep_components=True)

    # persist whichever artefacts the caller asked for
    for step, payload in zip(steps, (rescored_forest, components)):
        if step in saving:
            pickle_it(saving[step], payload)

    return rescored_forest, components
Esempio n. 2
0
def pass0_to_pass1(seg, options, lookup, stateless, saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Run pass0 and pass1: obtain a target forest and locally score it.

    Steps
        1. Parse the source (pass0) and project the target side of the forest
        2. Lookup scoring
        3. Stateless scoring

    The forest ('pass1.forest') and the components ('pass1.components') are
    handled independently: each one is reloaded with unpickle_it when
    is_step_complete holds for it, and otherwise computed and pickled if its
    key is present in `saving`.

    :param seg: segment being processed (`seg.id` is used in log messages)
    :param options: options forwarded to pass0
    :param lookup: object whose `extractors()` are given to get_lookup_components
    :param stateless: object whose `extractors()` are given to get_stateless_components
    :param saving: maps step names to the locations given to pickle_it/unpickle_it
    :param redo: forwarded to is_step_complete and to pass0
    :param log: logging function
    :return: a pair (target forest, components), where components is a list with
        one FComponents per (lookup, stateless) pair of per-edge components
    """

    if is_step_complete('pass1.forest', saving, redo):  # try to reuse previous results
        tgt_forest = unpickle_it(saving['pass1.forest'])
    else:  # execute pass0
        # pass0 is run without saving anything (empty `saving`).
        # NOTE(review): pass0 also receives `dummyfunc` instead of `log`, so its
        # messages are dropped -- confirm whether that silence is intentional.
        src_forest = pass0(seg, options, n_goal=0, saving={}, redo=redo, log=dummyfunc)
        # target projection
        log('[%d] Project target rules', seg.id)
        tgt_forest = make_target_forest(src_forest)
        if 'pass1.forest' in saving:
            pickle_it(saving['pass1.forest'], tgt_forest)

    if is_step_complete('pass1.components', saving, redo):
        components = unpickle_it(saving['pass1.components'])
    else:
        log('[%d] Lookup scoring', seg.id)
        lookup_comps = get_lookup_components(tgt_forest, lookup.extractors())
        log('[%d] Stateless scoring', seg.id)
        stateless_comps = get_stateless_components(tgt_forest, stateless.extractors())
        # pair up the two kinds of local components position by position
        components = [FComponents([comps1, comps2]) for comps1, comps2 in zip(lookup_comps, stateless_comps)]
        if 'pass1.components' in saving:
            pickle_it(saving['pass1.components'], components)

    return tgt_forest, components
Esempio n. 3
0
def importance_sample(seg, options, proxy, target, saving={}, redo=True, log=dummyfunc):
    """
    Sample derivations from a proxy model and score each of them under the target model.

    :param seg: segment being processed (`seg.id` is used in log messages)
    :param options: options forwarded to pass0_to_pass2; `options.samples` is
        the argument given to the sampler
    :param proxy: model used to build and weight the sampled forest
        (its `lookup`, `stateless`, `stateful`, `score` and `constant` are used)
    :param target: model used to score each sampled derivation
        (its `lookup`, `stateless` and `stateful` are used)
    :param saving: maps step names to the locations given to pickle_it/unpickle_it;
        the result is stored under 'is.samples' when that key is present
    :param redo: forwarded to is_step_complete and pass0_to_pass2
    :param log: logging function
    :return: a list of ISYield objects, one per group returned by group_by_projection
    """

    if is_step_complete('is.samples', saving, redo):
        return unpickle_it(saving['is.samples'])

    proxy_scorers = (TableLookupScorer(proxy.lookup),
                     StatelessScorer(proxy.stateless),
                     StatefulScorer(proxy.stateful))
    q_forest, q_components = pass0_to_pass2(seg, options, *proxy_scorers,
                                            saving=saving, redo=redo, log=log)

    # Make unnormalised q(d)
    edge_weights = np.array([proxy.score(comps) for comps in q_components], dtype=ptypes.weight)
    q_func = TableLookupFunction(edge_weights)

    log('[%d] Q-forest: nodes=%d edges=%d', seg.id, q_forest.n_nodes(), q_forest.n_edges())
    tsort = AcyclicTopSortTable(q_forest)

    sampler = AncestralSampler(q_forest, tsort, omega=q_func)
    d_groups = group_by_identity(sampler.sample(options.samples))

    def project(group):
        # groups of derivations are projected onto the string they yield
        return yield_string(q_forest, group.key)

    y_groups = group_by_projection(d_groups, project)

    def describe(d_group):
        # Build the ISDerivation of one group of identical derivations.
        edges = d_group.key
        # reduce q weights through inside.times
        q_score = derivation_weight(q_forest, edges, semiring.inside, omega=q_func)
        # reduce q components through inside.times
        q_comps = proxy.constant(semiring.inside.one)
        for e in edges:
            q_comps = q_comps.hadamard(q_components[e], semiring.inside.times)
        # compute p components and p score
        p_comps, p_score = score_derivation(q_forest, edges, semiring.inside,
                                            TableLookupScorer(target.lookup),
                                            StatelessScorer(target.stateless),
                                            StatefulScorer(target.stateful))
        # TODO: save {y => {edges: (q_comps, p_comps, count)}}
        return ISDerivation(edges, q_comps, p_comps, d_group.count)

    is_yields = [ISYield(y_group.key,
                         [describe(d_group) for d_group in y_group.values],
                         y_group.count)
                 for y_group in y_groups]

    if 'is.samples' in saving:
        pickle_it(saving['is.samples'], is_yields)

    return is_yields
Esempio n. 4
0
def pass0(seg, extra_grammar_paths=[], glue_grammar_paths=[], pass_through=True,
          default_symbol='X', goal_str='GOAL', start_str='S', max_span=-1, n_goal=0,
          saving={}, redo=True, log=dummyfunc) -> 'Hypergraph':
    """
    Pass0: parse the input with the source side of the grammar.
    No scoring (not even local) happens here for now, though it could (TODO).

    Steps
        1. Make a hypergraph view of the grammar
        2. Make an input DFA
        3. Parse the input DFA

    :param seg: segment to parse (`seg.id` is used in log messages)
    :param extra_grammar_paths: forwarded to make_grammar_hypergraph
    :param glue_grammar_paths: forwarded to make_grammar_hypergraph
    :param pass_through: forwarded to make_grammar_hypergraph
    :param default_symbol: forwarded to make_grammar_hypergraph
    :param goal_str: goal string given to GoalRuleMaker
    :param start_str: start string; used for GoalRuleMaker and as the
        Nonterminal fetched from the grammar
    :param max_span: forwarded to HieroConstraints
    :param n_goal: forwarded to GoalRuleMaker as `n`
    :param saving: maps step names to the locations given to
        pickle_it/unpickle_it; only the key 'forest' is used
    :param redo: forwarded to is_step_complete
    :param log: logging function
    :return: source forest
    """
    if is_step_complete('forest', saving, redo):
        return unpickle_it(saving['forest'])

    # nothing to reuse: build the grammar view and parse
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    grammar_options = dict(extra_grammar_paths=extra_grammar_paths,
                           glue_grammar_paths=glue_grammar_paths,
                           pass_through=pass_through,
                           default_symbol=default_symbol)
    grammar = make_grammar_hypergraph(seg, **grammar_options)

    # parse source lattice
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=goal_str, start_str=start_str, n=n_goal)
    dfa = make_input_dfa(seg)
    root = grammar.fetch(Nonterminal(start_str))
    goal_view = goal_maker.get_iview()
    constraint = HieroConstraints(grammar, dfa, max_span)
    forest = parse_dfa(grammar, root, dfa, goal_view,
                       bottomup=True,
                       constraint=constraint)

    if 'forest' in saving:
        pickle_it(saving['forest'], forest)
    return forest
Esempio n. 5
0
def pass1(seg, src_forest, model,
          saving={}, redo=True,
          log=dummyfunc) -> 'tuple':
    """
    Pass1 consists in obtaining a target forest and locally scoring it.

    Steps
        1. Project target side of the forest
        2. Lookup scoring
        3. Stateless scoring

    Each step ('forest', 'lookup', 'stateless') is handled independently: its
    result is reloaded with unpickle_it when is_step_complete holds for it, and
    otherwise computed and pickled if the step's key is present in `saving`.

    :param seg: segment being processed (`seg.id` is used in log messages)
    :param src_forest: source forest to be projected with make_target_forest
    :param model: model whose `lookup.extractors()` and `stateless.extractors()`
        are used for local scoring
    :param saving: maps step names to the locations given to pickle_it/unpickle_it
    :param redo: forwarded to is_step_complete
    :param log: logging function
    :return: a triple (target forest, lookup components, stateless components)
    """

    if is_step_complete('forest', saving, redo):
        tgt_forest = unpickle_it(saving['forest'])
    else:
        # target projection
        log('[%d] Project target rules', seg.id)
        tgt_forest = make_target_forest(src_forest)
        if 'forest' in saving:
            pickle_it(saving['forest'], tgt_forest)

    # local scoring
    if is_step_complete('lookup', saving, redo):
        lookup_comps = unpickle_it(saving['lookup'])
    else:
        log('[%d] Lookup scoring', seg.id)
        lookup_comps = get_lookup_components(tgt_forest, model.lookup.extractors())
        if 'lookup' in saving:
            pickle_it(saving['lookup'], lookup_comps)

    if is_step_complete('stateless', saving, redo):
        stateless_comps = unpickle_it(saving['stateless'])
    else:
        log('[%d] Stateless scoring', seg.id)
        stateless_comps = get_stateless_components(tgt_forest, model.stateless.extractors())
        if 'stateless' in saving:
            pickle_it(saving['stateless'], stateless_comps)

    return tgt_forest, lookup_comps, stateless_comps
Esempio n. 6
0
def biparse(seg: SegmentMetaData, options: SimpleNamespace,
            joint_model: ModelView, conditional_model: ModelView,
            workingdir=None, redo=True, log=dummyfunc) -> SimpleNamespace:
    """
    Biparse a segment using a local model.
    1. the source is parsed with a joint model
    2. source and target are bi-parsed with a conditional model
    Keeping the two apart lets each model be factorised differently with
    respect to local/nonlocal components. For example, an LM may be seen as a
    local (read tractable) component of a conditional model, and as a nonlocal
    (read intractable) component of a joint model.
    An implementation detail: bi-parsing is implemented as a cascade of
    intersections (with projections in between).

    :param seg: a segment
    :param options: parsing options
    :param joint_model: a factorised view of the joint model, here we use only the local components
    :param conditional_model: a factorised view of the conditional, here we use
        only the local components; when None, only the joint distribution is
        computed and result.conditional is (forest=None, components=[])
    :param workingdir: where to save files (nothing is saved when it is falsy)
    :param redo: whether or not previously saved computation should be discarded
    :param log: a logging function
    :return: result.{joint,conditional}.{forest,components} for the respective local model
    :raises ValueError: if the source parse produces a falsy forest
    """

    saving = preprocessed_training_files('{0}/{1}'.format(workingdir, seg.id)) if workingdir else {}

    result = SimpleNamespace()
    result.joint = SimpleNamespace()
    result.conditional = SimpleNamespace()

    joint_steps = ['joint.forest', 'joint.components']
    all_steps = joint_steps + ['conditional.forest', 'conditional.components']

    # Without a conditional model only the joint files are needed for reuse.
    if conditional_model is None and all(is_step_complete(step, saving, redo) for step in joint_steps):
        log('[%d] Reusing joint and conditional distributions from files', seg.id)
        result.joint.forest = unpickle_it(saving['joint.forest'])
        result.joint.components = unpickle_it(saving['joint.components'])
        result.conditional.forest = None
        result.conditional.components = []
        return result

    if all(is_step_complete(step, saving, redo) for step in all_steps):
        log('[%d] Reusing joint and conditional distributions from files', seg.id)
        result.joint.forest = unpickle_it(saving['joint.forest'])
        result.joint.components = unpickle_it(saving['joint.components'])
        result.conditional.forest = unpickle_it(saving['conditional.forest'])
        result.conditional.components = unpickle_it(saving['conditional.components'])
        return result

    # 1. Make a grammar: nothing can be reused, so decoding is required
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=options.extra_grammars,
                                      glue_grammar_paths=options.glue_grammars,
                                      pass_through=options.pass_through,
                                      default_symbol=options.default_symbol)

    # 2. Joint distribution - Step 1: parse source lattice
    n_goal = 0
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=n_goal)
    src_dfa = make_input_dfa(seg)
    src_root = grammar.fetch(Nonterminal(options.start))
    src_goal = goal_maker.get_iview()
    src_constraint = HieroConstraints(grammar, src_dfa, options.max_span)
    src_forest = parse_dfa(grammar, src_root, src_dfa, src_goal,
                           bottomup=True,
                           constraint=src_constraint)

    if not src_forest:
        raise ValueError('I cannot parse the input lattice: i) make sure your grammar has glue rules; ii) make sure it handles OOVs')

    # 3. Target projection of the forest
    log('[%d] Project target rules', seg.id)
    tgt_forest = make_target_forest(src_forest)

    # 4. Joint distribution - Step 2: scoring
    log('[%d] Joint model: (exact) local scoring', seg.id)
    result.joint = exact_rescoring(joint_model.local_model(), tgt_forest, goal_maker, log)

    # save joint distribution
    for key, attr in (('joint.forest', 'forest'), ('joint.components', 'components')):
        if key in saving:
            pickle_it(saving[key], getattr(result.joint, attr))

    if conditional_model is None:
        result.conditional.forest = None
        result.conditional.components = []
        return result

    # 5. Conditional distribution - Step 1: parse the reference lattice
    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker.update()
    ref_forest = parse_dfa(result.joint.forest,
                           0,
                           ref_dfa,
                           goal_maker.get_oview(),
                           bottomup=False)

    if ref_forest:
        # 6. Conditional distribution - Step 2: scoring
        log('[%d] Conditional model: exact (local) scoring', seg.id)
        result.conditional = exact_rescoring(conditional_model.local_model(), ref_forest, goal_maker, log)
    else:
        # reference cannot be parsed: keep the (falsy) forest and no components
        log('[%d] References cannot be parsed', seg.id)
        result.conditional.forest = ref_forest
        result.conditional.components = []

    # save conditional distribution
    for key, attr in (('conditional.forest', 'forest'), ('conditional.components', 'components')):
        if key in saving:
            pickle_it(saving[key], getattr(result.conditional, attr))

    return result
Esempio n. 7
0
def pass0_to_pass2(seg, options, lookup, stateless, stateful, saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Run passes 0 to 2: build a forest and exactly rescore it.

    When `stateful` is falsy, pass2 is skipped: the outcome of pass0_to_pass1 is
    returned as is, and the pass2 files are either symlinked to the pass1 files
    or pickled directly.

    :param seg: segment being processed (`seg.id` is used in log messages)
    :param options: options forwarded to pass0_to_pass1; `options.goal` and
        `options.start` are given to GoalRuleMaker
    :param lookup: lookup model
    :param stateless: stateless model
    :param stateful: stateful model; a falsy value means passes 0 to 1 only
    :param saving: maps step names to the locations given to
        pickle_it/unpickle_it/symlink
    :param redo: forwarded to is_step_complete and pass0_to_pass1
    :param log: logging function
    :return: rescored forest (a Hypergraph), and components (one FComponents object per edge)
    """

    # We try to reuse previous results
    if all(is_step_complete(step, saving, redo) for step in ('pass2.forest', 'pass2.components')):
        return unpickle_it(saving['pass2.forest']), unpickle_it(saving['pass2.components'])

    # We check whether we need pass2
    if not stateful:  # execute passes 0 to 1 only
        forest, components = pass0_to_pass1(seg, options, lookup, stateless, saving,
                                            redo=redo, log=log)

        # TODO: complete components with empty stateful model
        # save (or link) forest, then components
        artefacts = (('pass1.forest', 'pass2.forest', forest),
                     ('pass1.components', 'pass2.components', components))
        for pass1_key, pass2_key, payload in artefacts:
            if pass2_key not in saving:
                continue
            if pass1_key in saving:
                # pass1 output is being saved under its own key, so link to it
                symlink(saving[pass1_key], saving[pass2_key])
            else:
                pickle_it(saving[pass2_key], payload)
        return forest, components

    # From here we are sure we have stateful scorers
    # then we first execute passes 0 to 1 (and discard dummy components)
    forest, _ = pass0_to_pass1(seg, options,
                               TableLookupScorer(DummyModel()),
                               StatelessScorer(DummyModel()),
                               saving,
                               redo=redo, log=log)

    # then we fully re-score the forest (keeping all components)
    log('[%d] Forest rescoring', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=1)
    lookup_scorer = TableLookupScorer(lookup)
    stateless_scorer = StatelessScorer(stateless)
    stateful_scorer = StatefulScorer(stateful)
    forest, components = rescore_forest(forest, 0,
                                        lookup_scorer, stateless_scorer, stateful_scorer,
                                        goal_rule=goal_maker.get_oview(),
                                        keep_components=True)

    # save the forest, then the components
    for key, payload in (('pass2.forest', forest), ('pass2.components', components)):
        if key in saving:
            pickle_it(saving[key], payload)

    return forest, components
Esempio n. 8
0
def slice_sample(seg, args, staticdir, supportdir, workspace, model):
    """
    Draw samples for one segment with SlicedRescoring and store the results on disk.

    Steps
        1. Load the pickled target forest and local components from `staticdir`
        2. Compute l(d) from the lookup and stateless components
        3. Sample from f(d) = n(d) * l(d)
        4. Complete feature vectors of the sampled derivations
        5. Optionally save derivations, yields and the Markov chain

    Nothing is done when both output files already exist and `args.redo` is false.

    :param seg: segment being processed (`seg.id` names the files, `seg.refs`
        is given to get_empirical_support)
    :param args: options namespace (redo, goal, start, samples, batch, within,
        initial, prior, burn, lag, temperature0, save_d, save_y, save_chain)
    :param staticdir: directory holding the pickled forest and local components
    :param supportdir: directory receiving the empirical support
    :param workspace: directory receiving feature vectors and optional logs
    :param model: model providing lookup, stateless, stateful and dummy parts
    :return: None
    """
    support_path = '{0}/{1}.D.ffs.all'.format(supportdir, seg.id)
    hyp_path = '{0}/{1}.hyp.ffs.all'.format(workspace, seg.id)

    if all(os.path.exists(path) for path in (support_path, hyp_path)) and not args.redo:
        logging.info('Reusing samples for segment %d', seg.id)
        return

    # 1. Load pickled objects
    logging.debug('[%d] Loading target forest', seg.id)
    forest = unpickle_it('{0}/{1}.hyp.forest'.format(staticdir, seg.id))
    # TODO: store top sort table
    logging.debug('[%d] Loading local components', seg.id)
    lookupffs = unpickle_it('{0}/{1}.hyp.ffs.rule'.format(staticdir, seg.id))
    statelessffs = unpickle_it('{0}/{1}.hyp.ffs.stateless'.format(staticdir, seg.id))

    # 2. Compute l(d)
    # lookup and stateless components are guaranteed to have been computed over
    # the same forest (same nodes/edges structure), which is what makes the
    # position-wise pairing below valid
    logging.debug('[%d] Computing l(d)', seg.id)
    local_scores = [
        semiring.inside.times(model.lookup.score(ff1), model.stateless.score(ff2))
        for ff1, ff2 in zip(lookupffs, statelessffs)
    ]
    lfunc = TableLookupFunction(np.array(local_scores, dtype=ptypes.weight))

    # 3. Sample from f(d) = n(d) * l(d)
    logging.debug('[%d] Sampling from f(d) = n(d) * l(d)', seg.id)
    tsort = AcyclicTopSortTable(forest)
    goal_maker = GoalRuleMaker(args.goal, args.start, n=2)

    sampler = SlicedRescoring(forest, lfunc, tsort,
                              TableLookupScorer(model.dummy),
                              StatelessScorer(model.dummy),
                              StatefulScorer(model.stateful),
                              semiring.inside,
                              goal_maker.get_oview(),
                              OutputView(make_dead_srule()))

    # here samples are represented as sequences of edge ids
    sampling_options = dict(n_samples=args.samples[0],
                            batch_size=args.batch,
                            within=args.within,
                            initial=args.initial,
                            prior=args.prior,
                            burn=args.burn,
                            lag=args.lag,
                            temperature0=args.temperature0)
    d0, markov_chain = sampler.sample(**sampling_options)

    # save empirical support
    support = get_empirical_support(model, frozenset(seg.refs), forest,
                                    lookupffs, statelessffs, markov_chain)
    pickle_it(support_path, support)

    # apply usual MCMC filters to the Markov chain
    samples = apply_filters(markov_chain, burn=args.burn, lag=args.lag)
    n_samples = len(samples)

    # 4. Complete feature vectors and compute expectation
    hypcomps = []
    hypexp = model.constant(semiring.prob.zero)
    d_groups = group_by_identity(samples)
    for d_group in d_groups:
        derivation = d_group.key
        # reconstruct the local components edge by edge
        lookup_comps = model.lookup.constant(semiring.inside.one)
        stateless_comps = model.stateless.constant(semiring.inside.one)
        for e in derivation.edges:
            lookup_comps = lookup_comps.hadamard(lookupffs[e], semiring.inside.times)
            stateless_comps = stateless_comps.hadamard(statelessffs[e], semiring.inside.times)
        # complete components (lookup, stateless, stateful)
        # note that derivation.components itself is overwritten here!
        derivation.components = FComponents([lookup_comps, stateless_comps, derivation.components])
        # incorporate sample frequency
        weighted = derivation.components.power(float(d_group.count) / n_samples, semiring.inside)
        hypcomps.append(weighted)
        hypexp = hypexp.hadamard(weighted, semiring.prob.plus)

    # save feature vectors
    pickle_it(hyp_path, hypcomps)

    # 5. Log stuff
    def score_of(d):
        return d.score

    def components_of(d):
        return d.components

    def as_bracketed(d):
        return bracketed_string(forest, d.edges)

    def as_yield(d):
        return yield_string(forest, d.edges)

    if args.save_d:
        save_mcmc_derivations('{0}/{1}.hyp.d.gz'.format(workspace, seg.id),
                              d_groups,
                              valuefunc=score_of,
                              compfunc=components_of,
                              derivation2str=as_bracketed)

    if args.save_y:
        projections = group_by_projection(samples, as_yield)
        save_mcmc_yields('{0}/{1}.hyp.y.gz'.format(workspace, seg.id), projections)

    if args.save_chain:
        # the initial state is put back at the front of the chain before saving
        markov_chain.appendleft(d0)
        # TODO: complete feature vectors of all derivations (then pass compfunc)
        save_markov_chain('{0}/{1}.hyp.chain.gz'.format(workspace, seg.id),
                          markov_chain,
                          flat=True,
                          valuefunc=score_of,
                          derivation2str=as_bracketed)