Ejemplo n.º 1
0
def make_grammar(fsa):
    """Build a permutation grammar over the FSA's vocabulary.

    Produces [S] -> [X], the binary recursion [X] -> [X] [X], and one
    unary rule [X] -> w for every symbol w in the automaton.
    All rules carry log-probability 0.0.
    """
    grammar = WCFG()
    grammar.add(Rule('[S]', ['[X]'], 0.0))
    grammar.add(Rule('[X]', ['[X]', '[X]'], 0.0))
    for symbol in fsa.itersymbols():
        grammar.add(Rule('[X]', [symbol], 0.0))
    return grammar
Ejemplo n.º 2
0
    def get_cfg(self, goal, root):
        """
        Constructs the CFG by visiting complete items in a top-down fashion.
        This is effectively a reachability test and it serves the purpose of filtering nonterminal symbols
        that could never be reached from the root.
        Note that bottom-up intersection typically does enumerate a lot of useless (unreachable) items.
        This is the recursive procedure described in the paper (Nederhof and Satta, 2008).

        NOTE(review): the visible body defines `make_rules` but never invokes it
        nor returns G — this snippet appears truncated; the original presumably
        seeds the recursion from (root, initial, final) and returns G.
        """

        G = WCFG()
        processed = set()
        fsa = self._wfsa
        itergenerating = self._agenda.itergenerating
        itercomplete = self._agenda.itercomplete

        def make_rules(lhs, start, end):
            # Memoization guard: key order must match the add() below and the
            # membership test further down. The original checked
            # (start, lhs, end) here, which could never match the
            # (lhs, start, end) tuples actually stored — fixed.
            if (lhs, start, end) in processed:
                return
            processed.add((lhs, start, end))
            for item in itercomplete(lhs, start, end):
                G.add(self.get_intersected_rule(item))
                # FSA states traversed by this item: inner states plus the dot.
                fsa_states = item.inner + (item.dot, )
                for i, sym in enumerate(item.rule.rhs):
                    if not is_nonterminal(sym):
                        continue
                    # Nederhof does not perform this test, but in python it turned out crucial
                    if (sym, fsa_states[i], fsa_states[i + 1]) not in processed:
                        make_rules(sym, fsa_states[i], fsa_states[i + 1])
Ejemplo n.º 3
0
def main(args):
    wcfg = WCFG(read_grammar_rules(args.grammar))
    #print 'GRAMMAR'
    #print wcfg

    for input_str in args.input:
        wfsa = make_linear_fsa(input_str)
        #print 'FSA'
        #print wfsa
        parser = Earley(wcfg, wfsa)
        forest = parser.do('[S]', '[GOAL]')
        if not forest:
            print 'NO PARSE FOUND'
            continue
        new_rules = []
        for rule in forest:
            if len(rule.rhs) > 1 and all(map(is_nonterminal, rule.rhs)):
                new_rules.append(
                    Rule(rule.lhs, reversed(rule.rhs), rule.log_prob))
        [forest.add(rule) for rule in new_rules]
        print '# FOREST'
        print forest
        print

        if args.show_permutations:
            print '# PERMUTATIONS'
            counts = count_derivations(forest, '[GOAL]')
            total = 0
            for p, n in sorted(counts['p'].iteritems(), key=lambda (k, v): k):
                print 'permutation=(%s) derivations=%d' % (' '.join(
                    str(i) for i in p), n)
                total += n
            print 'permutations=%d derivations=%d' % (len(
                counts['p'].keys()), total)
            print
def make_grammar(fsa):
    """Return a WCFG with a start rule, a binary recursion rule, and a
    lexical [X] rule for each symbol the FSA emits (all with weight 0.0)."""
    g = WCFG()
    fixed = (Rule('[S]', ['[X]'], 0.0),
             Rule('[X]', ['[X]', '[X]'], 0.0))
    for r in fixed:
        g.add(r)
    for w in fsa.itersymbols():
        g.add(Rule('[X]', [w], 0.0))
    return g
def initialise(wcfg, wfsa, root, goal, intersection):
    """
    Calculate a first derivation based on a simpler (thus smaller/faster) version of the grammar
    Thereby determining the initial conditions.
    Only applicable with the 'milos' grammar format, i.e. non-terminals have the form: '[P1234*2_1]'
    """
    smaller = WCFG([])

    logging.debug('Creating a smaller grammar for initial conditions...')
    for line in wcfg:
        if 0 < permutation_length(line.lhs) <= 2:
            smaller.add(line)
        elif line.lhs == root or line.lhs == '[UNK]':
            smaller.add(line)

    if intersection == 'nederhof':
        init_parser = Nederhof(smaller, wfsa)
    elif intersection == 'earley':
        init_parser = Earley(smaller, wfsa)
    else:
        raise NotImplementedError('I do not know this algorithm: %s' % intersection)

    logging.debug('Init Parsing...')
    init_forest = init_parser.do(root, goal)

    if not init_forest:
        print 'NO PARSE FOUND'
        return {}
    else:
        logging.debug('Forest: rules=%d', len(init_forest))

        logging.debug('Init Topsorting...')
        # sort the forest
        sorted_nodes = top_sort(init_forest)

        # calculate the inside weight of the sorted forest
        logging.debug('Init Inside...')
        init_inside_prob = inside(init_forest, sorted_nodes)

        logging.debug('Init Sampling...')
        gen_sampling = GeneralisedSampling(init_forest, init_inside_prob)
        init_d = gen_sampling.sample(goal)

    return get_conditions(init_d)
Ejemplo n.º 6
0
def initialise(wcfg, wfsa, root, goal, intersection):
    """
    Calculate a first derivation based on a simpler (thus smaller/faster) version of the grammar
    Thereby determining the initial conditions.
    Only applicable with the 'milos' grammar format, i.e. non-terminals have the form: '[P1234*2_1]'
    """
    smaller = WCFG([])

    logging.debug('Creating a smaller grammar for initial conditions...')
    for line in wcfg:
        if 0 < permutation_length(line.lhs) <= 2:
            smaller.add(line)
        elif line.lhs == root or line.lhs == '[UNK]':
            smaller.add(line)

    if intersection == 'nederhof':
        init_parser = Nederhof(smaller, wfsa)
    elif intersection == 'earley':
        init_parser = Earley(smaller, wfsa)
    else:
        raise NotImplementedError('I do not know this algorithm: %s' %
                                  intersection)

    logging.debug('Init Parsing...')
    init_forest = init_parser.do(root, goal)

    if not init_forest:
        print 'NO PARSE FOUND'
        return {}
    else:
        logging.debug('Forest: rules=%d', len(init_forest))

        logging.debug('Init Topsorting...')
        # sort the forest
        sorted_nodes = top_sort(init_forest)

        # calculate the inside weight of the sorted forest
        logging.debug('Init Inside...')
        init_inside_prob = inside(init_forest, sorted_nodes)

        logging.debug('Init Sampling...')
        gen_sampling = GeneralisedSampling(init_forest, init_inside_prob)
        init_d = gen_sampling.sample(goal)

    return get_conditions(init_d)
Ejemplo n.º 7
0
def read_grammar(rules_file, lexicon_file, transform=math.log):
    """Load a WCFG from separate rules and lexicon files, applying
    *transform* (default: natural log) to the weights of both streams."""
    rule_stream = iterrules(rules_file, transform)
    lexicon_stream = iterlexicon(lexicon_file, transform)
    return WCFG(chain(rule_stream, lexicon_stream))