Example #1
def importance_sample(seg, options, proxy, target, saving={}, redo=True, log=dummyfunc):
    """

    :param seg:
    :param options:
    :param proxy:
    :param target:
    :param log:
    :return:
    """

    if is_step_complete('is.samples', saving, redo):
        return unpickle_it(saving['is.samples'])

    q_forest, q_components = pass0_to_pass2(seg, options,
                                            TableLookupScorer(proxy.lookup),
                                            StatelessScorer(proxy.stateless),
                                            StatefulScorer(proxy.stateful),
                                            saving=saving,
                                            redo=redo,
                                            log=log)

    # Make unnormalised q(d)
    q_func = TableLookupFunction(np.array([proxy.score(comps) for comps in q_components], dtype=ptypes.weight))

    log('[%d] Q-forest: nodes=%d edges=%d', seg.id, q_forest.n_nodes(), q_forest.n_edges())
    tsort = AcyclicTopSortTable(q_forest)

    sampler = AncestralSampler(q_forest, tsort, omega=q_func)
    samples = sampler.sample(options.samples)

    d_groups = group_by_identity(samples)
    y_groups = group_by_projection(d_groups, lambda group: yield_string(q_forest, group.key))

    is_yields = []
    for y_group in y_groups:
        y = y_group.key
        is_derivations = []
        for d_group in y_group.values:
            edges = d_group.key
            # reduce q weights through inside.times
            q_score = derivation_weight(q_forest, edges, semiring.inside, omega=q_func)
            # reduce q components through inside.times
            q_comps = proxy.constant(semiring.inside.one)
            for e in edges:
                q_comps = q_comps.hadamard(q_components[e], semiring.inside.times)
            # compute p components and p score
            p_comps, p_score = score_derivation(q_forest, edges, semiring.inside,
                                                TableLookupScorer(target.lookup),
                                                StatelessScorer(target.stateless),
                                                StatefulScorer(target.stateful))
            # TODO: save {y => {edges: (q_comps, p_comps, count)}}
            is_derivations.append(ISDerivation(edges, q_comps, p_comps, d_group.count))
        is_yields.append(ISYield(y, is_derivations, y_group.count))
    if 'is.samples' in saving:
        pickle_it(saving['is.samples'], is_yields)

    return is_yields
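The function above implements self-normalised importance sampling: derivations are drawn from the tractable proxy q, and the stored components are later used to reweight each sample by p(d)/q(d). A minimal, library-free sketch of that estimator (toy distributions; all names below are illustrative, not part of grasp):

import numpy as np

# Unnormalised proposal q and target p over 4 discrete outcomes.
q_weights = np.array([4.0, 3.0, 2.0, 1.0])
p_weights = np.array([1.0, 2.0, 3.0, 4.0])

rng = np.random.default_rng(42)
samples = rng.choice(len(q_weights), size=10000, p=q_weights / q_weights.sum())

# Self-normalised importance weights w(d) = p(d) / q(d).
w = p_weights[samples] / q_weights[samples]

# Estimate the target distribution from the weighted samples.
posterior = np.zeros(len(p_weights))
np.add.at(posterior, samples, w)
posterior /= posterior.sum()

print(posterior)                      # approaches p_weights / p_weights.sum()
print(p_weights / p_weights.sum())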
Example #2
def test_ancestral(self):
    sampler = AncestralSampler(self.forest, self.tsort)
    size = 1000
    counts = Counter(sampler.sample(size))
    ranking = counts.most_common()
    top, n = ranking[0]
    # the partition function is deterministic, but compare floats approximately
    self.assertAlmostEqual(sampler.Z, -4.358310174252031)
    self.assertAlmostEqual(n / size, sampler.prob(top), places=1,
                           msg='Random effects apply - double check.')
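The test is a standard Monte Carlo sanity check: the relative frequency of the most common sample should approach its exact probability. The same check, stripped of the grasp machinery (toy distribution; values are illustrative):

import random
from collections import Counter

random.seed(0)
probs = {'a': 0.5, 'b': 0.3, 'c': 0.2}   # exact distribution
size = 10000
counts = Counter(random.choices(list(probs), weights=list(probs.values()), k=size))

top, n = counts.most_common(1)[0]
assert abs(n / size - probs[top]) < 0.05  # loose tolerance: random effects apply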
Example #3
def uninformed_conditions(hg, dfa, slicevars, root, goal_rule, batch,
                          algorithm):
    """
    Search for an initial set of conditions without any heuristics.

    :param grammars:
    :param glue_grammars:
    :param fsa:
    :param slicevars:
    :param root:
    :param goal:
    :param batch:
    :param generations:
    :param semiring:
    :return:
    """

    while True:

        if algorithm == 'earley':
            parser = EarleyParser(hg, dfa, semiring.inside, slicevars)
        else:
            parser = NederhofParser(hg, dfa, semiring.inside, slicevars)

        # compute a slice (a randomly pruned forest)
        logging.debug('Computing slice...')
        forest = parser.do(root, goal_rule)
        if not forest:
            logging.debug('NO PARSE FOUND')
            # reset the slice variables (keeping conditions unchanged, if any)
            slicevars.reset()
            continue

        tsort = RobustTopSortTable(forest)
        values = reweight(forest, slicevars, semiring.inside)
        sampler = AncestralSampler(forest, tsort, TableLookupFunction(values))
        raw_derivations = sampler.sample(batch)
        conditions = make_batch_conditions(forest, raw_derivations)
        slicevars.reset(conditions)
        return conditions
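The loop keeps redrawing slice variables until the randomly pruned forest admits at least one parse; only then is the chain conditioned on a first batch of samples. The control flow reduces to a retry pattern like this (stand-in callables, not the grasp API):

def find_initial_conditions(draw_slice, sample_batch, reset):
    """Redraw slices until one admits a parse, then condition on a batch."""
    while True:
        forest = draw_slice()         # randomly pruned forest, may be empty
        if not forest:
            reset()                   # fresh slice variables, keep conditions
            continue
        conditions = sample_batch(forest)
        reset(conditions)             # condition the slice variables
        return conditions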
Example #4
def decode(seg, args, proxy, target):
    """Decode by importance sampling: draw derivations from the proxy q,
    reweight them towards the target p, and apply a decision rule."""
    # pass0
    src_forest = pipeline.pass0(seg,
                                extra_grammar_paths=args.extra_grammar,
                                glue_grammar_paths=args.glue_grammar,
                                pass_through=args.pass_through,
                                default_symbol=args.default_symbol,
                                goal_str=args.goal,
                                start_str=args.start,
                                max_span=args.max_span,
                                n_goal=0,
                                log=logging.info)

    if not proxy.stateful:
        tgt_forest, lookup_comps, stateless_comps = pipeline.pass1(
            seg, src_forest, proxy, saving={}, redo=True, log=logging.info)
        q_components = [
            FComponents([comp1, comp2])
            for comp1, comp2 in zip(lookup_comps, stateless_comps)
        ]
    else:
        tgt_forest = pipeline.make_target_forest(src_forest)
        goal_maker = GoalRuleMaker(goal_str=args.goal,
                                   start_str=args.start,
                                   n=1)
        tgt_forest, q_components = pipeline.pass2(
            seg,
            tgt_forest,
            TableLookupScorer(proxy.lookup),
            StatelessScorer(proxy.stateless),
            StatefulScorer(proxy.stateful),
            goal_rule=goal_maker.get_oview(),
            omega=None,
            saving={},
            redo=True,
            log=logging.info)
    # TODO: save tgt_forest and q_components
    # Make unnormalised q(d)
    q_func = TableLookupFunction(
        np.array([proxy.score(comps) for comps in q_components],
                 dtype=ptypes.weight))

    logging.info('[%d] Forest: nodes=%d edges=%d', seg.id,
                 tgt_forest.n_nodes(), tgt_forest.n_edges())
    tsort = AcyclicTopSortTable(tgt_forest)

    sampler = AncestralSampler(tgt_forest, tsort, omega=q_func)
    samples = sampler.sample(args.samples)
    n_samples = len(samples)

    d_groups = group_by_identity(samples)
    y_groups = group_by_projection(
        d_groups, lambda group: yield_string(tgt_forest, group.key))

    is_yields = []
    for y_group in y_groups:
        y = y_group.key
        is_derivations = []
        for d_group in y_group.values:
            edges = d_group.key
            # reduce q weights through inside.times
            q_score = derivation_weight(tgt_forest,
                                        edges,
                                        semiring.inside,
                                        omega=q_func)
            # reduce q components through inside.times
            q_comps = proxy.constant(semiring.inside.one)
            for e in edges:
                q_comps = q_comps.hadamard(q_components[e],
                                           semiring.inside.times)
            # compute p components and p score
            p_comps, p_score = score_derivation(
                tgt_forest, edges, semiring.inside,
                TableLookupScorer(target.lookup),
                StatelessScorer(target.stateless),
                StatefulScorer(target.stateful))
            # TODO: save {y => {edges: (q_comps, p_comps, count)}}
            is_derivations.append(
                ISDerivation(edges, q_comps, p_comps, d_group.count))
        is_yields.append(ISYield(y, is_derivations, y_group.count))
    # TODO: pickle is_yields
    return decide(is_yields, n_samples, proxy, target)
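Both this function and Example #1 hinge on two grouping passes: group_by_identity collapses identical derivations, and group_by_projection then buckets derivation groups by their surface string. With plain dictionaries the same bookkeeping looks roughly like this (toy data; yield_of stands in for yield_string):

from collections import Counter, defaultdict

samples = [('e1', 'e2'), ('e1', 'e3'), ('e1', 'e2'), ('e4',)]
yield_of = {('e1', 'e2'): 'the dog', ('e1', 'e3'): 'the dog', ('e4',): 'a dog'}

# group by identity: derivation -> count
d_groups = Counter(samples)

# group by projection: yield -> {derivation: count}
y_groups = defaultdict(dict)
for d, count in d_groups.items():
    y_groups[yield_of[d]][d] = count

for y, ds in y_groups.items():
    print(y, sum(ds.values()), ds)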
Example #5
def exact_parsing(seg: 'the input segment (e.g. a Sentence)',
                  grammars: 'list of CFGs', glue_grammars: 'list of glue CFGs',
                  options: 'command line options',
                  outdir: 'where to save results'):
    """Parse the input exactly."""

    logging.debug('Building input hypergraph')
    hg = cfg_to_hg(grammars, glue_grammars, omega=PCFG(fname='LogProb'))
    root = hg.fetch(Nonterminal(options.start))

    # make input DFA
    dfa = make_dfa(seg.signatures)

    # get a parser implementation
    if options.intersection == 'earley':
        parser = EarleyParser(hg, dfa, semiring.inside)
    else:
        parser = NederhofParser(hg, dfa, semiring.inside)

    # parse
    logging.debug('Parsing')
    goal_rule = NewCFGProduction(Nonterminal(options.goal),
                                 [Nonterminal(options.goal)],
                                 {'LogProb': semiring.inside.one})
    forest = parser.do(root, goal_rule)

    if not forest:
        logging.info('[%s] NO PARSE FOUND', seg.id)
        return

    tsorter = LazyTopSortTable(forest, acyclic=False)

    # report some information about the forest
    if options.forest:
        with smart_wopen('{0}/forest/{1}.gz'.format(outdir, seg.id)) as fo:
            print(forest, file=fo)
    if options.count:
        tsort = tsorter.do()
        counter = DerivationCounter(forest, tsort)
        logging.info('Paths: %d', counter.n_derivations())
        with smart_wopen('{0}/count/{1}.gz'.format(outdir, seg.id)) as fo:
            print('nodes=%d edges=%d paths=%d' %
                  (forest.n_nodes(), forest.n_edges(), counter.n_derivations()),
                  file=fo)

    # decoding strategies
    omega_d = lambda d: semiring.inside.times.reduce(d.weights())

    if options.viterbi:
        tsort = tsorter.do()
        logging.info('Viterbi...')

        raw_viterbi = viterbi_derivation(forest, tsort)
        viterbi = make_derivation(forest, raw_viterbi)

        score = omega_d(viterbi)
        logging.info('Viterbi derivation: %s', score)
        logging.info('Saving...')
        save_viterbi('{0}/viterbi/{1}.gz'.format(outdir, seg.id),
                     SimpleNamespace(derivation=viterbi, count=1, value=score),
                     get_projection=DerivationYield.derivation)

    if options.samples > 0:
        tsort = tsorter.do()
        logging.info('Sampling...')
        sampler = AncestralSampler(forest, tsort)
        raw_samples = sampler.sample(options.samples)

        logging.info('Saving...')
        derivations = group_raw(forest, raw_samples)
        save_mc_derivations('{0}/ancestral/derivations/{1}.gz'.format(outdir, seg.id),
                            derivations,
                            inside=sampler.Z)

    logging.info('[%s] Finished!', seg.id)
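AncestralSampler draws derivations top-down: at each node an incoming edge is chosen with probability proportional to its weight times the inside values of its children, so complete derivations come out with probability proportional to their total weight. A self-contained toy version over a tiny acyclic forest (all names hypothetical; real grasp forests are hypergraphs with edge ids):

import random
from functools import lru_cache
from math import prod

# node -> list of (weight, children) edges; symbols absent from the
# dict are terminals with inside value 1.
forest = {
    'S': [(0.6, ('A',)), (0.8, ('A', 'B'))],
    'A': [(0.5, ('x',)), (0.5, ('y',))],
    'B': [(1.0, ('x',))],
}

@lru_cache(maxsize=None)
def inside(node):
    """Total weight of all derivations rooted at node."""
    if node not in forest:
        return 1.0
    return sum(w * prod(inside(c) for c in children)
               for w, children in forest[node])

def ancestral_sample(node):
    """Draw one derivation top-down, edge by edge."""
    if node not in forest:
        return node
    weights = [w * prod(inside(c) for c in children)
               for w, children in forest[node]]
    w, children = random.choices(forest[node], weights=weights)[0]
    return (node, tuple(ancestral_sample(c) for c in children))

random.seed(1)
print(inside('S'))            # the partition function Z (= 1.4 here)
print(ancestral_sample('S'))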
Example #6
def sliced_parsing(seg: 'the input segment (e.g. a Sentence)',
                   grammars: 'a list of CFGs',
                   glue_grammars: 'a list of glue CFGs',
                   options: 'command line options',
                   outdir: 'where to save results'):
    """Parse the input using sliced forests."""

    # Input Hypergraph
    logging.debug('Building input hypergraph')
    hg = cfg_to_hg(grammars, glue_grammars, omega=PCFG(fname='LogProb'))
    root = hg.fetch(Nonterminal(options.start))
    dfa = make_dfa(seg.signatures)
    goal_rule = NewCFGProduction(Nonterminal(options.goal),
                                 [Nonterminal(options.goal)],
                                 {'LogProb': semiring.inside.one})

    # Slice variables
    dist = get_distribution(options.free_dist)
    if options.free_dist == 'beta':
        prior = VectorOfPriors(
            get_prior(options.prior_a[0], options.prior_a[1]),
            get_prior(options.prior_b[0], options.prior_b[1]))
    elif options.free_dist == 'exponential':
        prior = get_prior(options.prior_scale[0], options.prior_scale[1])
    else:
        raise ValueError('Unknown distribution: %s' % options.free_dist)

    u = SpanSliceVariables({}, dist, prior)
    logging.debug('%r', u)
    # make initial conditions
    # TODO: consider initialisation heuristics such as attempt_initialisation(fsa, grammars, glue_grammars, options)
    logging.info('Looking for initial set of conditions...')
    uninformed_conditions(hg, dfa, u, root, goal_rule, options.batch,
                          options.intersection)
    logging.info('Done')
    #u.reset(conditions)

    # Sampling
    # number of nodes, edges and derivations (for logging purposes)
    sizes = [0, 0, 0]
    if options.count:
        report_size = lambda: ' nodes={:5d} edges={:5d} |D|={:5d} '.format(
            *sizes)
    else:
        report_size = lambda: ' nodes={:5d} edges={:5d}'.format(
            sizes[0], sizes[1])
    if options.progress:
        bar = progressbar(range(options.burn +
                                (options.samples * options.lag)),
                          prefix='Sampling',
                          dynsuffix=report_size)
    else:
        bar = range(options.burn + (options.samples * options.lag))

    # sample
    markov_chain = []
    for _ in bar:

        # get a parser implementation
        if options.intersection == 'earley':
            parser = EarleyParser(hg, dfa, semiring.inside, u)
        else:
            parser = NederhofParser(hg, dfa, semiring.inside, u)

        # compute a slice (a randomly pruned forest)
        forest = parser.do(root, goal_rule)
        if not forest:
            raise ValueError('A slice can never be empty.')

        # sample from the slice
        tsort = RobustTopSortTable(forest)
        residual = reweight(forest, u, semiring.inside)
        sampler = AncestralSampler(forest, tsort,
                                   TableLookupFunction(residual))
        raw_derivations = sampler.sample(options.batch)
        # update the slice variables and the state of the Markov chain
        u.reset(make_batch_conditions(forest, raw_derivations))

        # TODO: compute values!
        # TODO: make a derivation class
        # it is a hypergraph with its own edges, weights and rules
        # so that we can ask its value directly
        # it will basically replace the following
        # >>> tuple(forest.rule(e) for e in d)
        # then fix viterbi, MC, MCMC (all save_* methods
        # and kbest
        #markov_chain.append([tuple(forest.rule(e) for e in d) for d in raw_derivations])

        # this representation is forest agnostic
        markov_chain.append(
            [make_derivation(forest, d) for d in raw_derivations])

        # update logging information
        sizes[0], sizes[1] = forest.n_nodes(), forest.n_edges()
        if options.count:  # reporting counts
            sizes[2] = sampler.n_derivations()

    # apply MCMC filters (burn-in, lag) to hopefully reduce auto-correlation
    batches = apply_filters(markov_chain, burn=options.burn, lag=options.lag)
    samples = apply_batch_filters(batches, resample=options.resample)

    # group by derivation
    derivations = group_by_identity(samples)
    # group by trees (free of nonterminal annotation)
    #trees = group_by_projection(samples, DerivationYield.tree)
    # save everything
    #omega_d = lambda d: semiring.inside.times.reduce([r.weight for r in d])
    save_mcmc_derivation('{0}/slice/derivations/{1}.gz'.format(outdir, seg.id),
                         derivations)
    #save_mcmc_yields('{0}/slice/trees/{1}.gz'.format(outdir, seg.id), trees)
    if options.save_chain:
        save_markov_chain(
            '{0}/slice/chain/{1}.gz'.format(outdir, seg.id),
            markov_chain,
            derivation2str=lambda d: DerivationYield.derivation(d.rules()),
            flat=False)
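apply_filters implements the usual MCMC post-processing: discard the first burn states (burn-in), then keep every lag-th state to hopefully reduce auto-correlation. Over a flat chain the filter is just a slice (simplified sketch; the grasp version also handles batches):

def apply_filters(markov_chain, burn=0, lag=1):
    """Drop the burn-in prefix, then keep every lag-th state."""
    return markov_chain[burn::lag]

chain = list(range(20))
print(apply_filters(chain, burn=4, lag=3))   # [4, 7, 10, 13, 16, 19]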
Example #7
def decode(seg, args, model, outdir):
    """
    """

    # pass0
    src_forest = pipeline.pass0(seg,
                                extra_grammar_paths=args.extra_grammar,
                                glue_grammar_paths=args.glue_grammar,
                                pass_through=args.pass_through,
                                default_symbol=args.default_symbol,
                                goal_str=args.goal,
                                start_str=args.start,
                                max_span=args.max_span,
                                n_goal=0,
                                log=logging.info)
    tgt_forest = pipeline.make_target_forest(src_forest,
                                             TableLookupScorer(model.lookup))
    tsort = AcyclicTopSortTable(tgt_forest)

    if args.viterbi:
        viterbi(seg.id, tgt_forest, tsort, outdir, "pass0")

    # pass1
    if model.stateless:
        tgt_forest = stateless_rescoring(tgt_forest,
                                         StatelessScorer(model.stateless),
                                         semiring.inside)
        if args.viterbi:
            viterbi(seg.id, tgt_forest, tsort, outdir, "pass1")

    samples = []

    if args.framework == 'exact' or not model.stateful:  # exact scoring or no stateful scoring
        # we have access to Viterbi, k-best, sampling
        if model.stateful:
            goal_maker = GoalRuleMaker(goal_str=args.goal,
                                       start_str=args.start,
                                       n=1)

            rescorer = EarleyRescorer(tgt_forest,
                                      TableLookupScorer(model.dummy),
                                      StatelessScorer(model.dummy),
                                      StatefulScorer(model.stateful),
                                      semiring.inside)

            tgt_forest = rescorer.do(tsort.root(), goal_maker.get_oview())
            tsort = AcyclicTopSortTable(tgt_forest)

        # Do everything: viterbi, map, consensus, etc...
        if args.viterbi:
            viterbi(seg.id, tgt_forest, tsort, outdir, "pass2")

        if args.kbest > 0:
            # TODO: call kbest code
            pass
        if args.samples > 0:
            sampler = AncestralSampler(tgt_forest, tsort)
            samples = sampler.sample(args.samples)
            derivations = group_by_identity(samples)
            save_mc_derivations(
                '{0}/exact/derivations/{1}.gz'.format(outdir, seg.id),
                derivations,
                sampler.Z,
                valuefunc=lambda d: derivation_weight(tgt_forest, d, semiring.inside),
                derivation2str=lambda d: bracketed_string(tgt_forest, d))
            projections = group_by_projection(
                samples, lambda d: yield_string(tgt_forest, d))
            save_mc_yields('{0}/exact/yields/{1}.gz'.format(outdir, seg.id),
                           projections)

            # TODO: fix this hack
            # it's here just so I can reuse pipeline.consensus
            # the fix involves moving SampleReturn to a more general module
            # and making AncestralSampler use it
            from grasp.alg.rescoring import SampleReturn
            samples = [SampleReturn(s, 0.0, FComponents([])) for s in samples]

    else:  # for sliced scoring, we only have access to sampling

        logging.info('Sliced rescoring...')
        from grasp.alg.rescoring import SlicedRescoring
        goal_maker = GoalRuleMaker(goal_str=args.goal,
                                   start_str=args.start,
                                   n=1)

        rescorer = SlicedRescoring(tgt_forest,
                                   HypergraphLookupFunction(tgt_forest), tsort,
                                   TableLookupScorer(model.dummy),
                                   StatelessScorer(model.dummy),
                                   StatefulScorer(model.stateful),
                                   semiring.inside, goal_maker.get_oview(),
                                   make_dead_oview(args.default_symbol))

        if args.gamma_shape > 0:
            gamma_shape = args.gamma_shape
        else:
            gamma_shape = len(model)  # number of local components
        gamma_scale_type = args.gamma_scale[0]
        gamma_scale_parameter = float(args.gamma_scale[1])

        # here samples are represented as sequences of edge ids
        d0, markov_chain = rescorer.sample(
            n_samples=args.samples,
            batch_size=args.batch,
            within=args.within,
            initial=args.initial,
            gamma_shape=gamma_shape,
            gamma_scale_type=gamma_scale_type,
            gamma_scale_parameter=gamma_scale_parameter,
            burn=args.burn,
            lag=args.lag,
            temperature0=args.temperature0)

        # apply usual MCMC heuristics (e.g. burn-in, lag)
        samples = apply_filters(markov_chain, burn=args.burn, lag=args.lag)

        # group by derivation (now a sample is represented by a Derivation object)
        derivations = group_by_identity(samples)
        save_mcmc_derivations(
            '{0}/slice/derivations/{1}.gz'.format(outdir, seg.id),
            derivations,
            valuefunc=lambda d: d.score,
            derivation2str=lambda d: bracketed_string(tgt_forest, d.edges))
        projections = group_by_projection(
            samples, lambda d: yield_string(tgt_forest, d.edges))
        save_mcmc_yields('{0}/slice/yields/{1}.gz'.format(outdir, seg.id),
                         projections)

        if args.save_chain:
            markov_chain.appendleft(d0)
            save_markov_chain(
                '{0}/slice/chain/{1}.gz'.format(outdir, seg.id),
                markov_chain,
                flat=True,
                valuefunc=lambda d: d.score,
                derivation2str=lambda d: bracketed_string(tgt_forest, d.edges))

    if samples:
        # decision rule
        decisions = pipeline.consensus(seg, tgt_forest, samples)
        return decisions[0]
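pipeline.consensus applies a decision rule over the sampled translations; a common choice in this setting is minimum Bayes risk (MBR) decoding, which picks the yield maximising expected similarity to the other sampled yields. A toy MBR rule with unigram overlap as a stand-in similarity (not necessarily what grasp uses):

from collections import Counter

def mbr_decode(yields, similarity):
    """Pick the candidate maximising expected similarity under the samples."""
    counts = Counter(yields)
    total = sum(counts.values())

    def expected_gain(candidate):
        return sum(n * similarity(candidate, other)
                   for other, n in counts.items()) / total

    return max(counts, key=expected_gain)

def unigram_overlap(a, b):
    wa, wb = Counter(a.split()), Counter(b.split())
    return sum((wa & wb).values())

samples = ['the dog barks', 'the dog barks', 'a dog barks', 'the cat sleeps']
print(mbr_decode(samples, unigram_overlap))   # 'the dog barks'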