Example 1
    def __init__(self, grammar_sets, debug=False):
        # allow copy construction
        if isinstance(grammar_sets, Grammar):
            grammar_sets = grammar_sets.grammar_sets

        assert isinstance(grammar_sets, GrammarTables), repr(type(grammar_sets).__name__)
        grammar_sets._verify()
        terminals, non_terminals, productions, starts = self._grammar_sets = grammar_sets

        if debug:
            print "productions:", tuple(enumerate(productions))
            print "starts:", tuple(starts)

        # bijective maps with non-negative ids
        self.terminal_by_id, self.id_by_terminal = frozenbijection(terminals)
        self.non_terminal_by_id, self.id_by_non_terminal = frozenbijection(non_terminals)

        if debug:
            print "terminal_by_id:", tuple(enumerate(self.terminal_by_id))
            print "terminal by integer:", tuple(
                (self.symbol_to_integer(terminal), terminal) for terminal in self.terminal_by_id
            )
            print "non_terminal_by_id:", tuple(enumerate(self.non_terminal_by_id))

        # create the index-based/integer-based left-hand sides and integer-based right-hand sides
        # note: we put completion_id at the end of each rhs as a cursor-indexable sentinel
        for lhs, rhs in productions:
            assert self.id_by_non_terminal[lhs] == self.symbol_to_integer(lhs)
        prod1 = sorteduniquetuple(
            (
                self.id_by_non_terminal[lhs],
                tuple(self.symbol_to_integer(symbol) for symbol in rhs) + (self.completion_id,),
            )
            for lhs, rhs in productions
        )
        if debug:
            print "prod1:", prod1

        # introduce two further levels of indirection: sharing of rhs sequences, sharing of rule sets

        self.rhs_by_id, id_by_rhs = frozenbijection(rhs for lhs, rhs in prod1)
        if debug:
            print "rhs_by_id:", tuple(enumerate(self.rhs_by_id))
            print "rhs uniqueness:", len(self.rhs_by_id), "of", len(prod1), ":", len(self.rhs_by_id) / len(prod1)

        # each production as a pair: this uses an index for each rhs sequence
        prod2 = sorteduniquetuple((lhs, id_by_rhs[rhs]) for lhs, rhs in prod1)
        if debug:
            print "prod2:", prod2

        proddict = defaultdict(set)
        for lhs, rhs in prod2:
            proddict[lhs].add(rhs)
        # here, for each production, a tuple of the set of rhs indices is used for rhs
        prod3 = sorteduniquetuple((lhs, sorteduniquetuple(proddict[lhs])) for lhs in xrange(len(proddict)))

        self.ruleset_by_id, id_by_ruleset = frozenbijection(rhs for lhs, rhs in prod3)
        if debug:
            print "ruleset_by_id:", tuple(enumerate(self.ruleset_by_id))
            print "ruleset uniqueness:", len(self.ruleset_by_id), "of", len(prod3), ":", len(self.ruleset_by_id) / len(
                prod3
            )

        # note: lhs are now a (useless) enumeration
        self.ruleset_id_by_lhs_id = tuple(id_by_ruleset[rhs] for lhs, rhs in prod3)
        if debug:
            print "self.ruleset_id_by_lhs_id:", tuple(enumerate(self.ruleset_id_by_lhs_id))

        self.starts = sorteduniquetuple(self.id_by_non_terminal[start] for start in starts)
        if debug:
            print "starts:", self.starts

        self._verify()
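
Note: the constructor above leans on two small container helpers that are not shown on this page. The following is a minimal sketch of what frozenbijection and sorteduniquetuple appear to do, inferred only from the call sites above (the library's real implementations may differ):

def frozenbijection(items):
    # Sketch inferred from usage: deduplicate and order the items, returning
    # (item_by_id, id_by_item) where item_by_id is a tuple indexed by a dense
    # non-negative id and id_by_item maps each item back to that id.
    item_by_id = tuple(sorted(set(items)))
    id_by_item = dict((item, index) for index, item in enumerate(item_by_id))
    return item_by_id, id_by_item

def sorteduniquetuple(items):
    # Sketch inferred from usage: the unique items as a sorted tuple.
    return tuple(sorted(set(items)))

With helpers of this shape, every symbol, rhs sequence, and rule set gets a dense non-negative id, which is what the prod1/prod2/prod3 passes above rely on.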
Example 2
def WfstCompose(wfst1, wfst2, epsilon=None):
    """
    >>> # epsilon = 'Ε'
    >>> epsilon = '-'
    >>> wfst1 = FrozenGraph(GraphTables(((0, 1), (0, 0, 0, 0, 0), (1, 1, 1, 1, 1), (('a', 'A'), ('b', 'B'), ('c', 'C'), ('d', 'D'), ('e', 'E')))))
    >>> wfst2 = FrozenGraph(GraphTables(((0, 1), (0, 0, 0, 0, 0), (1, 1, 1, 1, 1), (('A', 0), ('B', 1), ('C', 2), ('D', 3), ('E', 4, ))))) #0.5)))))
    >>> wfst3 = wfst1 + wfst2 + WfstCompose(wfst1, wfst2)
    workset: set([(0, 0)])
    a A 0
    b B 1
    c C 2
    d D 3
    e E 4
    >>> #wfst3.dot_display(arc_label_callback=_wfst_arc_label_callback, globals=('rankdir=LR;',)) and None

    >>> wfst10 = FrozenGraph(GraphTables(((0, 1, 2, 3, 4), (0, 1, 1, 2, 3), (1, 2, 3, 1, 4), (('a', 'b'), ('p','a'), ('l','a'), ('p','n'), ('e', epsilon)))))
    >>> wfst11 = FrozenGraph(GraphTables(((0, 1), (0, 0, 0, 1, ), (1, 1, 1, 0, ), (('a', 'A'), ('b', 'B'), ('n', 'N'), (epsilon, epsilon), ))))
    >>> wfst12 = wfst10 + wfst11 + WfstCompose(wfst10, wfst11, epsilon)
    workset: set([(0, 0)])
    a b B
    p a A
    l a A
    p n N
    >>> #wfst12.dot_display(arc_label_callback=_wfst_arc_label_callback, globals=('rankdir=LR;',)) and None
    """

    nodelabels1, arcstartnodes1, arcendnodes1, arclabels1 = wfst1.graphseqs
    nodelabels2, arcstartnodes2, arcendnodes2, arclabels2 = wfst2.graphseqs

    nodeadjout1 = wfst1.nodeadjout
    nodeadjout2 = wfst2.nodeadjout

    starts1 = frozenset((0,))
    starts2 = frozenset((0,))
    ends1 = frozenset((4,))
    ends2 = frozenset((1,))

    # XXX need to get per-node epsilon closures first

    workset = set((s1, s2) for s1 in starts1 for s2 in starts2)
    print 'workset:', workset
    # some may become isolated
    startnodes = frozenset(workset)
    nodes = set(startnodes)
    arcs = set()
    while workset:
        u1, u2 = u = workset.pop()
        outs1 = nodeadjout1[u1]
        outs2 = nodeadjout2[u2]
        for v1, e1 in outs1:
            in1, out1 = arclabels1[e1]
            for v2, e2 in outs2:
                in2, out2 = arclabels2[e2]
                if in2 is out2 is epsilon:
                    v = u1, v2
                    if v not in nodes:
                        nodes.add(v)
                        assert v not in workset
                        workset.add(v)
                elif out1 is epsilon:
                    v = v1, v2
                    if v not in nodes:
                        nodes.add(v)
                        assert v not in workset
                        workset.add(v)
                    arc = u, v, (in1, out1)
                    #assert arc not in arcs
                    arcs.add(arc)
                elif out1 == in2:
                    print in1, out1, out2
                    v = v1, v2
                    if v not in nodes:
                        nodes.add(v)
                        assert v not in workset
                        workset.add(v)
                    arc = u, v, (in1, out2)
                    assert arc not in arcs
                    arcs.add(arc)
    node_by_id, id_by_node = frozenbijection(nodes)
    arcstart, arcend, arclabel = tuplenoflist(3)
    for u, v, label in arcs:
        arcstart.append(id_by_node[u])
        arcend.append(id_by_node[v])
        arclabel.append(label)
    return GraphBase(GraphTables((node_by_id, tuple(arcstart), tuple(arcend), tuple(arclabel))))
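
Note: tuplenoflist is also undefined here; from the unpacking arcstart, arcend, arclabel = tuplenoflist(3) it appears to return a tuple of independent empty lists. A hedged sketch, not the library's actual code:

def tuplenoflist(n):
    # Sketch inferred from usage in WfstCompose: n fresh, independent lists.
    return tuple([] for _ in xrange(n))

The final frozenbijection(nodes) call then renumbers the composed (state1, state2) node pairs into dense integer ids so the result can be handed to GraphTables in the same parallel-sequence form as the inputs.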