Example #1
def iterrules(istream):
    """
    Iterates through an input stream yielding synchronous rules.
    :param istream:
    :param linear_model:
    :return:
    """
    for line in istream:
        if line.startswith('#'):
            continue
        line = line.strip()
        if not line:
            continue
        fields = line.split(' ||| ')
        if len(fields) < 4:
            raise ValueError('I expected at least 4 fields, got %d: %s' %
                             (len(fields), fields))
        if not is_nonterminal(fields[0]):
            raise ValueError(
                'Expected a nonterminal LHS, got something else: <%s>' %
                fields[0])
        lhs = Nonterminal(fields[0][1:-1])  # ignore brackets
        f_rhs = tuple(
            Nonterminal(x[1:-1]) if is_nonterminal(x) else Terminal(x)
            for x in fields[1].split())
        e_rhs = tuple(
            Nonterminal(x[1:-1]) if is_nonterminal(x) else Terminal(x)
            for x in fields[2].split())
        features = defaultdict(None, iterpairs(fields[3]))
        yield SCFGProduction.create(lhs, f_rhs, e_rhs, features)
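For orientation, a minimal sketch of driving iterrules. The bracketed-nonterminal convention ([X]) and the name=value feature syntax consumed by iterpairs are assumptions here, not confirmed by this listing:

import io

# Hypothetical grammar fragment: 4 fields separated by ' ||| '
# (LHS, source RHS, target RHS, features); '#' lines and blanks are skipped.
grammar_text = io.StringIO(
    '# a toy rule\n'
    '[X] ||| la maison ||| the house ||| LogProb=-0.5\n'
)
for rule in iterrules(grammar_text):
    print(rule)  # one SCFGProduction per rule line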
Example #2
def get_srule(self):
    """Returns the goal rule based on the state of the factory."""
    if self._n == 0:
        rhs = [Nonterminal(self._start_str)]
    else:
        rhs = [Nonterminal(self._make_goal_str(self._n - 1))]
    return SCFGProduction(Nonterminal(self._make_goal_str()), rhs, rhs,
                          [1], {'GoalRule': 1.0})
Example #3
def make_pass_grammar(seg, grammars, semiring, unk_lhs):
    """
    Make an input fsa for an input segment as well as its pass-through grammar.
    :param seg: a Segment object.
    :param grammars: a sequence of SCFGs.
    :param semiring: must provide `one`.
    :return: the input WDFSA, the pass-through grammar
    """
    fsa = WDFSA()
    pass_grammar = SCFG()
    unk = Nonterminal(unk_lhs)
    tokens = seg.src.split()
    for i, token in enumerate(tokens):
        word = Terminal(token)
        if any(g.in_ivocab(word) for g in grammars):
            pass_grammar.add(
                SCFGProduction.create(unk, [word], [word],
                                      {'PassThrough': 1.0}))
        else:
            pass_grammar.add(
                SCFGProduction.create(unk, [word], [word], {
                    'PassThrough': 1.0,
                    'Unknown': 1.0
                }))
        fsa.add_arc(i, i + 1, word, semiring.one)
    fsa.make_initial(0)
    fsa.make_final(len(tokens))
    return fsa, pass_grammar
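The FSA built by make_pass_grammar is a plain chain over the source tokens: one state per token boundary and one arc per token, each weighted semiring.one. A dependency-free illustration of that topology:

# Chain topology for a 3-token segment (illustration only):
#   0 --tok0--> 1 --tok1--> 2 --tok2--> 3, with 0 initial and 3 final.
tokens = 'le chat noir'.split()
arcs = [(i, i + 1, tok) for i, tok in enumerate(tokens)]
print(arcs)  # [(0, 1, 'le'), (1, 2, 'chat'), (2, 3, 'noir')]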
Example #4
def pass0(seg, extra_grammar_paths=[], glue_grammar_paths=[], pass_through=True,
          default_symbol='X', goal_str='GOAL', start_str='S', max_span=-1, n_goal=0,
          saving={}, redo=True, log=dummyfunc) -> 'Hypergraph':
    """
    Pass0 consists of parsing with the source side of the grammar.
    For now, pass0 does not do any scoring (not even local), but it could (TODO).

    Steps
        1. Make a hypergraph view of the grammar
        2. Make an input DFA
        3. Parse the input DFA

    :return: source forest
    """
    if is_step_complete('forest', saving, redo):
        return unpickle_it(saving['forest'])

    # here we need to decode for sure
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    # make a hypergraph view of all available grammars
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=extra_grammar_paths,
                                      glue_grammar_paths=glue_grammar_paths,
                                      pass_through=pass_through,
                                      default_symbol=default_symbol)

    # parse source lattice
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=goal_str, start_str=start_str, n=n_goal)
    dfa = make_input_dfa(seg)
    forest = parse_dfa(grammar,
                       grammar.fetch(Nonterminal(start_str)),
                       dfa,
                       goal_maker.get_iview(),
                       bottomup=True,
                       constraint=HieroConstraints(grammar, dfa, max_span))
    if 'forest' in saving:
        pickle_it(saving['forest'], forest)
    return forest
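pass0 treats saving as a mapping from step names to file paths. A plausible reconstruction of is_step_complete consistent with this usage; the real implementation is not shown in this listing:

import os

def is_step_complete_sketch(step, saving, redo):
    """Sketch: a step can be reused when a path was provided for it,
    the file already exists, and the caller did not ask to redo it."""
    return step in saving and os.path.exists(saving[step]) and not redo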
Example #5
def exact(uid, input, grammars, glue_grammars, options, outdir):
    semiring = SumTimes
    if options.intersection == 'earley':
        parser = Earley(grammars,
                        input.fsa,
                        glue_grammars=glue_grammars,
                        semiring=semiring)
    elif options.intersection == 'nederhof':
        parser = Nederhof(grammars,
                          input.fsa,
                          glue_grammars=glue_grammars,
                          semiring=semiring)
    else:
        raise NotImplementedError(
            "I don't know this intersection algorithm: %s" %
            options.intersection)

    # make a forest
    logging.info('Parsing...')
    forest = parser.do(root=Nonterminal(options.start),
                       goal=Nonterminal(options.goal))
    if not forest:
        logging.info('[%s] NO PARSE FOUND', uid)
        return

    tsorter = LazyTopSortTable(forest)

    # report info if necessary
    report_forest(uid, forest, tsorter, outdir, options)

    # decoding strategies

    if options.viterbi:
        tsort = tsorter.do()
        logging.info('Viterbi...')
        d = viterbi_derivation(forest, tsort, generations=options.generations)
        logging.info('Viterbi derivation: %s %s', derivation_weight(d),
                     DerivationYield.derivation(d))
        save_viterbi('{0}/viterbi/{1}.gz'.format(outdir, uid),
                     d,
                     omega_d=derivation_weight,
                     get_projection=DerivationYield.tree)

    if options.kbest > 0:
        # this is the root after intersection
        root = make_span(Nonterminal(options.goal))
        logging.info('K-best...')
        kbestparser = KBest(forest,
                            root,
                            options.kbest,
                            MaxTimes,
                            traversal=ItemDerivationYield.string,
                            uniqueness=False).do()
        logging.info('Done!')
        derivations = list(kbestparser.iterderivations())
        save_kbest('{0}/kbest/{1}.gz'.format(outdir, uid),
                   derivations,
                   omega_d=derivation_weight,
                   get_projection=DerivationYield.tree)

    if options.samples > 0:
        logging.info('Sampling...')
        sampler = AncestralSampler(forest,
                                   tsorter.do(),
                                   generations=options.generations)
        samples = list(sampler.sample(options.samples))
        # group samples by derivation and yield
        derivations = group_by_identity(samples)
        trees = group_by_projection(samples,
                                    get_projection=DerivationYield.tree)
        # save the empirical distribution over derivations
        save_mc_derivations('{0}/ancestral/derivations/{1}.gz'.format(
            outdir, uid),
                            derivations,
                            inside=sampler.Z,
                            omega_d=derivation_weight)
        # save the empirical distribution over strings
        save_mc_yields('{0}/ancestral/trees/{1}.gz'.format(outdir, uid), trees)

    logging.info('[%s] Finished!', uid)
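exact is configured through options; the sketch below lists the attributes the function reads. Only the names are taken from the code above, the values are placeholders, not recommended settings:

from types import SimpleNamespace

options = SimpleNamespace(
    intersection='nederhof',  # or 'earley'
    start='S',                # root symbol before intersection
    goal='GOAL',              # goal symbol after intersection
    viterbi=True,             # save the 1-best derivation
    kbest=0,                  # size of the k-best list (0 disables)
    samples=0,                # number of ancestral samples (0 disables)
    generations=10,           # passed to viterbi_derivation / AncestralSampler
)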
Example #6
def setUp(self):
    self.X = Nonterminal('X')
    self.X01 = Span(Nonterminal('X'), 0, 1)
    self.X01b = Span(Nonterminal('X'), 0, 1)
Example #7
def setUp(self):
    self.X = Nonterminal('X')
    self.X2 = Nonterminal('X')
Example #8
def setUp(self):
    self.tX1 = Terminal('X')
    self.tX2 = Terminal('X')
    self.nX1 = Nonterminal('X')
    self.nX2 = Nonterminal('X')
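Examples #6 through #8 pair identically-constructed symbols, which suggests they back tests of value-based equality and hashing. A hypothetical test method over the Example #8 fixture; the actual assertions are not part of this listing:

def test_equality_sketch(self):
    # Hypothetical assertions, assuming value-based equality/hashing:
    self.assertEqual(self.tX1, self.tX2)              # equal Terminals
    self.assertEqual(self.nX1, self.nX2)              # equal Nonterminals
    self.assertNotEqual(self.tX1, self.nX1)           # Terminal vs Nonterminal
    self.assertEqual(hash(self.nX1), hash(self.nX2))  # usable as dict keys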
Example #9
def make_dead_srule(lhs='X', dead='<dead-end>', fname='DeadRule'):
    return SCFGProduction(Nonterminal(lhs), (Terminal(dead),),
                          (Terminal(dead),), [], {fname: 1.0})
Example #10
def get_next_srule(self):
    """Returns what would be the next goal rule without updating the state of the factory."""
    rhs = [Nonterminal(self._make_goal_str(self._n))]
    return SCFGProduction(Nonterminal(self._make_goal_str(self._n + 1)),
                          rhs, rhs, [1], {'GoalRule': 1.0})
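Examples #2 and #10 read the factory's counter _n and helper _make_goal_str, which this listing does not show. A minimal reconstruction of that state convention; the label scheme (GOAL0, GOAL1, ...) is an assumption:

class GoalRuleMakerSketch:
    """Hypothetical reconstruction of the factory state used above."""

    def __init__(self, goal_str='GOAL', start_str='S', n=0):
        self._goal_str = goal_str
        self._start_str = start_str
        self._n = n

    def _make_goal_str(self, n=None):
        # assumed label scheme: GOAL0, GOAL1, ... (the real format may differ)
        return '{0}{1}'.format(self._goal_str, self._n if n is None else n)

    def update(self):
        # advance the counter so each new goal rule wraps the previous goal
        self._n += 1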
Example #11
def exact_parsing(seg: 'the input segment (e.g. a Sentence)',
                  grammars: 'list of CFGs', glue_grammars: 'list of glue CFGs',
                  options: 'command line options',
                  outdir: 'where to save results'):
    """Parse the input exactly."""

    logging.debug('Building input hypergraph')
    hg = cfg_to_hg(grammars, glue_grammars, omega=PCFG(fname='LogProb'))
    root = hg.fetch(Nonterminal(options.start))

    # make input DFA
    dfa = make_dfa(seg.signatures)

    # get a parser implementation
    if options.intersection == 'earley':
        parser = EarleyParser(hg, dfa, semiring.inside)
    else:
        parser = NederhofParser(hg, dfa, semiring.inside)

    # parse
    logging.debug('Parsing')
    goal_rule = NewCFGProduction(Nonterminal(options.goal),
                                 [Nonterminal(options.goal)],
                                 {'LogProb': semiring.inside.one})
    forest = parser.do(root, goal_rule)

    if not forest:
        logging.info('[%s] NO PARSE FOUND', seg.id)
        return

    tsorter = LazyTopSortTable(forest, acyclic=False)

    # report some information about the forest
    if options.forest:
        with smart_wopen('{0}/forest/{1}.gz'.format(outdir, seg.id)) as fo:
            print(forest, file=fo)
    if options.count:
        tsort = tsorter.do()
        counter = DerivationCounter(forest, tsort)
        logging.info('Paths: %d', counter.n_derivations())
        with smart_wopen('{0}/count/{1}.gz'.format(outdir, seg.id)) as fo:
            print(
                'nodes=%d edges=%d paths=%d' %
                (forest.n_nodes(), forest.n_edges(), counter.n_derivations()),
                file=fo)

    # decoding strategies
    omega_d = lambda d: semiring.inside.times.reduce(d.weights())

    if options.viterbi:
        tsort = tsorter.do()
        logging.info('Viterbi...')

        raw_viterbi = viterbi_derivation(forest, tsort)
        viterbi = make_derivation(forest, raw_viterbi)

        score = omega_d(viterbi)
        logging.info('Viterbi derivation: %s', score)
        logging.info('Saving...')
        save_viterbi('{0}/viterbi/{1}.gz'.format(outdir, seg.id),
                     SimpleNamespace(derivation=viterbi, count=1, value=score),
                     get_projection=DerivationYield.derivation)

    if options.samples > 0:
        tsort = tsorter.do()
        logging.info('Sampling...')
        sampler = AncestralSampler(forest, tsort)
        raw_samples = sampler.sample(options.samples)

        logging.info('Saving...')
        derivations = group_raw(forest, raw_samples)
        save_mc_derivations('{0}/ancestral/derivations/{1}.gz'.format(
            outdir, seg.id),
                            derivations,
                            inside=sampler.Z)

    logging.info('[%s] Finished!', seg.id)
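The scoring above reduces a derivation's weights with semiring.inside.times.reduce. With 'LogProb' weights, inside multiplication is addition in log space; a self-contained sketch of that reduction (names here are placeholders, not this library's API):

from functools import reduce

def log_times_reduce(log_weights):
    """Sketch: a product of probabilities is a sum of log-probabilities."""
    return reduce(lambda acc, w: acc + w, log_weights, 0.0)

print(log_times_reduce([-0.5, -1.2, -0.3]))  # -2.0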
Example #12
def sliced_parsing(seg: 'the input segment (e.g. a Sentence)',
                   grammars: 'a list of CFGs',
                   glue_grammars: 'a list of glue CFGs',
                   options: 'command line options',
                   outdir: 'where to save results'):
    """Parse the input using sliced forests."""

    # Input Hypergraph
    logging.debug('Building input hypergraph')
    hg = cfg_to_hg(grammars, glue_grammars, omega=PCFG(fname='LogProb'))
    root = hg.fetch(Nonterminal(options.start))
    dfa = make_dfa(seg.signatures)
    goal_rule = NewCFGProduction(Nonterminal(options.goal),
                                 [Nonterminal(options.goal)],
                                 {'LogProb': semiring.inside.one})

    # Slice variables
    dist = get_distribution(options.free_dist)
    if options.free_dist == 'beta':
        prior = VectorOfPriors(
            get_prior(options.prior_a[0], options.prior_a[1]),
            get_prior(options.prior_b[0], options.prior_b[1]))
    elif options.free_dist == 'exponential':
        prior = get_prior(options.prior_scale[0], options.prior_scale[1])

    u = SpanSliceVariables({}, dist, prior)
    logging.debug('%r', u)
    # make initial conditions
    # TODO: consider initialisation heuristics such as attempt_initialisation(fsa, grammars, glue_grammars, options)
    logging.info('Looking for initial set of conditions...')
    uninformed_conditions(hg, dfa, u, root, goal_rule, options.batch,
                          options.intersection)
    logging.info('Done')
    #u.reset(conditions)

    # Sampling
    # number of nodes, edges and derivations (for logging purposes)
    sizes = [0, 0, 0]
    if options.count:
        report_size = lambda: ' nodes={:5d} edges={:5d} |D|={:5d} '.format(*sizes)
    else:
        report_size = lambda: ' nodes={:5d} edges={:5d}'.format(sizes[0], sizes[1])
    if options.progress:
        bar = progressbar(range(options.burn +
                                (options.samples * options.lag)),
                          prefix='Sampling',
                          dynsuffix=report_size)
    else:
        bar = range(options.burn + (options.samples * options.lag))

    # sample
    markov_chain = []
    for _ in bar:

        # get a parser implementation
        if options.intersection == 'earley':
            parser = EarleyParser(hg, dfa, semiring.inside, u)
        else:
            parser = NederhofParser(hg, dfa, semiring.inside, u)

        # compute a slice (a randomly pruned forest)
        forest = parser.do(root, goal_rule)
        if not forest:
            raise ValueError('A slice can never be empty.')

        # sample from the slice
        tsort = RobustTopSortTable(forest)
        residual = reweight(forest, u, semiring.inside)
        sampler = AncestralSampler(forest, tsort,
                                   TableLookupFunction(residual))
        raw_derivations = sampler.sample(options.batch)
        # update the slice variables and the state of the Markov chain
        u.reset(make_batch_conditions(forest, raw_derivations))

        # TODO: compute values!
        # TODO: make a derivation class:
        # a hypergraph with its own edges, weights and rules,
        # so that we can ask for its value directly.
        # It would basically replace the following
        # >>> tuple(forest.rule(e) for e in d)
        # and then fix viterbi, MC, MCMC (all save_* methods) and kbest.
        #markov_chain.append([tuple(forest.rule(e) for e in d) for d in raw_derivations])

        # this representation is forest agnostic
        markov_chain.append(
            [make_derivation(forest, d) for d in raw_derivations])

        # update logging information
        sizes[0], sizes[1] = forest.n_nodes(), forest.n_edges()
        if options.count:  # reporting counts
            sizes[2] = sampler.n_derivations()

    # apply MCMC filters to (hopefully) reduce auto-correlation
    batches = apply_filters(markov_chain, burn=options.burn, lag=options.lag)
    samples = apply_batch_filters(batches, resample=options.resample)

    # group by derivation
    derivations = group_by_identity(samples)
    # group by trees (free of nonterminal annotation)
    #trees = group_by_projection(samples, DerivationYield.tree)
    # save everything
    #omega_d = lambda d: semiring.inside.times.reduce([r.weight for r in d])
    save_mcmc_derivation('{0}/slice/derivations/{1}.gz'.format(outdir, seg.id),
                         derivations)
    #save_mcmc_yields('{0}/slice/trees/{1}.gz'.format(outdir, seg.id), trees)
    if options.save_chain:
        save_markov_chain(
            '{0}/slice/chain/{1}.gz'.format(outdir, seg.id),
            markov_chain,
            derivation2str=lambda d: DerivationYield.derivation(d.rules()),
            flat=False)
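apply_filters and apply_batch_filters thin the Markov chain to reduce auto-correlation; below is a minimal sketch of the standard burn-in/lag convention the parameter names point at. The library's actual filters may differ, e.g. in how batches are flattened or resampled:

import random

def apply_filters_sketch(chain, burn=0, lag=1):
    """Drop the first `burn` states, then keep every `lag`-th state."""
    return chain[burn::lag]

def apply_batch_filters_sketch(batches, resample=0):
    """Flatten the surviving batches; optionally resample with replacement."""
    samples = [d for batch in batches for d in batch]
    if resample:
        samples = [random.choice(samples) for _ in range(resample)]
    return samples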
Example #13
def biparse(seg: SegmentMetaData, options: SimpleNamespace,
            joint_model: ModelView, conditional_model: ModelView,
            workingdir=None, redo=True, log=dummyfunc) -> SimpleNamespace:
    """
    Biparse a segment using a local model.
    1. we parse the source with a joint model
    2. we bi-parse source and target with a conditional model
    This separation allows us to factorise these models differently wrt local/nonlocal components.
    For example, an LM may be seen as a local (read tractable) component of a conditional model,
    and as a nonlocal (read intractable) component of a joint model.
    An implementation detail: bi-parsing is implemented as a cascade of intersections (with projections in between).

    :param seg: a segment
    :param options: parsing options
    :param joint_model: a factorised view of the joint model, here we use only the local components
    :param conditional_model: a factorised view of the conditional, here we use only the local components
    :param workingdir: where to save files
    :param redo: whether or not previously saved computation should be discarded
    :param log: a logging function
    :return: result.{joint,conditional}.{forest,components} for the respective local model
    """

    if workingdir:
        saving = preprocessed_training_files('{0}/{1}'.format(workingdir, seg.id))
    else:
        saving = {}

    result = SimpleNamespace()
    result.joint = SimpleNamespace()
    result.conditional = SimpleNamespace()

    if conditional_model is None:
        steps = ['joint.forest', 'joint.components']
        if all(is_step_complete(step, saving, redo) for step in steps):
            log('[%d] Reusing joint and conditional distributions from files', seg.id)
            result.joint.forest = unpickle_it(saving['joint.forest'])
            result.joint.components = unpickle_it(saving['joint.components'])
            result.conditional.forest = None
            result.conditional.components = []
            return result

    steps = ['joint.forest', 'joint.components', 'conditional.forest', 'conditional.components']
    if all(is_step_complete(step, saving, redo) for step in steps):
        log('[%d] Reusing joint and conditional distributions from files', seg.id)
        result.joint.forest = unpickle_it(saving['joint.forest'])
        result.joint.components = unpickle_it(saving['joint.components'])
        result.conditional.forest = unpickle_it(saving['conditional.forest'])
        result.conditional.components = unpickle_it(saving['conditional.components'])
        return result

    # 1. Make a grammar

    # here we need to decode for sure
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    # make a hypergraph view of all available grammars
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=options.extra_grammars,
                                      glue_grammar_paths=options.glue_grammars,
                                      pass_through=options.pass_through,
                                      default_symbol=options.default_symbol)
    #print('GRAMMAR')
    #print(grammar)

    # 2. Joint distribution - Step 1: parse source lattice
    n_goal = 0
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=n_goal)
    src_dfa = make_input_dfa(seg)
    src_forest = parse_dfa(grammar,
                           grammar.fetch(Nonterminal(options.start)),
                           src_dfa,
                           goal_maker.get_iview(),
                           bottomup=True,
                           constraint=HieroConstraints(grammar, src_dfa, options.max_span))
    #print('SOURCE')
    #print(src_forest)

    if not src_forest:
        raise ValueError('I cannot parse the input lattice: i) make sure your grammar has glue rules; ii) make sure it handles OOVs')

    # 3. Target projection of the forest
    log('[%d] Project target rules', seg.id)
    tgt_forest = make_target_forest(src_forest)
    #print('TARGET')
    #print(tgt_forest)

    # 4. Joint distribution - Step 2: scoring

    log('[%d] Joint model: (exact) local scoring', seg.id)
    result.joint = exact_rescoring(joint_model.local_model(), tgt_forest, goal_maker, log)

    # save joint distribution
    if 'joint.forest' in saving:
        pickle_it(saving['joint.forest'], result.joint.forest)
    if 'joint.components' in saving:
        pickle_it(saving['joint.components'], result.joint.components)

    if conditional_model is None:
        result.conditional.forest = None
        result.conditional.components = []
        return result

    # 5. Conditional distribution - Step 1: parse the reference lattice

    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker.update()
    ref_forest = parse_dfa(result.joint.forest,
                           0,
                           ref_dfa,
                           goal_maker.get_oview(),
                           bottomup=False)

    if not ref_forest:  # reference cannot be parsed
        log('[%d] References cannot be parsed', seg.id)
        result.conditional.forest = ref_forest
        result.conditional.components = []
    else:
        # 6. Conditional distribution - Step 2: scoring
        log('[%d] Conditional model: exact (local) scoring', seg.id)
        result.conditional = exact_rescoring(conditional_model.local_model(), ref_forest, goal_maker, log)

    # save conditional distribution
    if 'conditional.forest' in saving:
        pickle_it(saving['conditional.forest'], result.conditional.forest)
    if 'conditional.components' in saving:
        pickle_it(saving['conditional.components'], result.conditional.components)

    return result
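Like exact above, biparse reads a fixed set of attributes from options; a sketch of a minimal namespace that satisfies it. The attribute names are taken from the body of biparse, the values are placeholders only:

from types import SimpleNamespace

options = SimpleNamespace(
    extra_grammars=[],      # paths to extra grammar files
    glue_grammars=[],       # paths to glue grammar files
    pass_through=True,      # add pass-through rules for unknown words
    default_symbol='X',     # default LHS for pass-through rules
    goal='GOAL',            # goal label used by GoalRuleMaker
    start='S',              # start label
    max_span=-1,            # Hiero span constraint (-1: unconstrained)
)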