def __init__(self, grammar_sets, debug=False):
    """
    Build the integer-indexed tables of the grammar.

    grammar_sets is either a GrammarTables instance or an existing Grammar;
    in the latter case that grammar's own tables are used (copy
    construction).  When debug is true, each intermediate table is printed
    to stdout as soon as it has been built.
    """

    def show(*items):
        # emit one debug line: the items, space-separated
        print(" ".join(str(item) for item in items))

    def encode(lhs, rhs):
        # integer form of one production; completion_id is appended to the
        # rhs as a sentinel that a cursor can index
        lhs_id = self.id_by_non_terminal[lhs]
        rhs_ids = tuple(self.symbol_to_integer(symbol) for symbol in rhs)
        return lhs_id, rhs_ids + (self.completion_id,)

    # copy construction: unwrap the tables held by another Grammar
    if isinstance(grammar_sets, Grammar):
        grammar_sets = grammar_sets.grammar_sets
    assert isinstance(grammar_sets, GrammarTables), repr(type(grammar_sets).__name__)
    grammar_sets._verify()

    terminals, non_terminals, productions, starts = grammar_sets
    self._grammar_sets = grammar_sets
    if debug:
        show("productions:", tuple(enumerate(productions)))
        show("starts:", tuple(starts))

    # two-way symbol <-> id maps; both must exist before symbol_to_integer
    # is called below
    self.terminal_by_id, self.id_by_terminal = frozenbijection(terminals)
    self.non_terminal_by_id, self.id_by_non_terminal = frozenbijection(non_terminals)
    if debug:
        show("terminal_by_id:", tuple(enumerate(self.terminal_by_id)))
        show(
            "terminal by integer:",
            tuple((self.symbol_to_integer(terminal), terminal) for terminal in self.terminal_by_id),
        )
        show("non_terminal_by_id:", tuple(enumerate(self.non_terminal_by_id)))

    # sanity check: the two ways of numbering a left-hand side agree
    for head, _ in productions:
        assert self.id_by_non_terminal[head] == self.symbol_to_integer(head)

    # level 1: productions as (lhs id, tuple of rhs integers + sentinel)
    prod1 = sorteduniquetuple(encode(lhs, rhs) for lhs, rhs in productions)
    if debug:
        show("prod1:", prod1)

    # level 2: identical rhs sequences are shared through an id
    self.rhs_by_id, id_by_rhs = frozenbijection(rhs_ids for _, rhs_ids in prod1)
    if debug:
        show("rhs_by_id:", tuple(enumerate(self.rhs_by_id)))
        show(
            "rhs uniqueness:",
            len(self.rhs_by_id),
            "of",
            len(prod1),
            ":",
            len(self.rhs_by_id) / len(prod1),
        )

    # each production becomes a pair (lhs id, rhs id)
    prod2 = sorteduniquetuple((lhs_id, id_by_rhs[rhs_ids]) for lhs_id, rhs_ids in prod1)
    if debug:
        show("prod2:", prod2)

    # gather the rhs ids belonging to each lhs id
    rhs_ids_by_lhs = defaultdict(set)
    for lhs_id, rhs_id in prod2:
        rhs_ids_by_lhs[lhs_id].add(rhs_id)

    # level 3: each lhs id is paired with the sorted tuple of its rhs ids
    # (its rule set), and identical rule sets are shared through an id
    prod3 = sorteduniquetuple(
        (lhs_id, sorteduniquetuple(rhs_ids_by_lhs[lhs_id]))
        for lhs_id in xrange(len(rhs_ids_by_lhs))
    )
    self.ruleset_by_id, id_by_ruleset = frozenbijection(ruleset for _, ruleset in prod3)
    if debug:
        show("ruleset_by_id:", tuple(enumerate(self.ruleset_by_id)))
        show(
            "ruleset uniqueness:",
            len(self.ruleset_by_id),
            "of",
            len(prod3),
            ":",
            len(self.ruleset_by_id) / len(prod3),
        )

    # note: the lhs ids in prod3 are by now a plain enumeration, so a
    # tuple indexed by lhs id is sufficient
    self.ruleset_id_by_lhs_id = tuple(id_by_ruleset[ruleset] for _, ruleset in prod3)
    if debug:
        show("self.ruleset_id_by_lhs_id:", tuple(enumerate(self.ruleset_id_by_lhs_id)))

    self.starts = sorteduniquetuple(self.id_by_non_terminal[start] for start in starts)
    if debug:
        show("starts:", self.starts)

    self._verify()
def WfstCompose(wfst1, wfst2, epsilon=None):
    """
    Compose two transducer graphs whose arc labels are (input, output) pairs.

    Nodes of the result are pairs (node of wfst1, node of wfst2); the search
    starts from node 0 of each operand.  An arc of wfst1 whose output equals
    the input of an arc of wfst2 yields a composed arc labelled
    (input of the first, output of the second).  epsilon is the label value
    treated as the empty symbol.  Returns a GraphBase built from the reached
    node pairs and the composed arcs.

    Note: the function prints a trace on stdout (the initial workset and one
    line per matched pair of arcs); the examples below rely on it.

    >>> # epsilon = 'Ε'
    >>> epsilon = '-'
    >>> wfst1 = FrozenGraph(GraphTables(((0, 1), (0, 0, 0, 0, 0), (1, 1, 1, 1, 1), (('a', 'A'), ('b', 'B'), ('c', 'C'), ('d', 'D'), ('e', 'E')))))
    >>> wfst2 = FrozenGraph(GraphTables(((0, 1), (0, 0, 0, 0, 0), (1, 1, 1, 1, 1), (('A', 0), ('B', 1), ('C', 2), ('D', 3), ('E', 4, ))))) #0.5)))))
    >>> wfst3 = wfst1 + wfst2 + WfstCompose(wfst1, wfst2)
    workset: set([(0, 0)])
    a A 0
    b B 1
    c C 2
    d D 3
    e E 4
    >>> #wfst3.dot_display(arc_label_callback=_wfst_arc_label_callback, globals=('rankdir=LR;',)) and None

    >>> wfst10 = FrozenGraph(GraphTables(((0, 1, 2, 3, 4), (0, 1, 1, 2, 3), (1, 2, 3, 1, 4), (('a', 'b'), ('p','a'), ('l','a'), ('p','n'), ('e', epsilon)))))
    >>> wfst11 = FrozenGraph(GraphTables(((0, 1), (0, 0, 0, 1, ), (1, 1, 1, 0, ), (('a', 'A'), ('b', 'B'), ('n', 'N'), (epsilon, epsilon), ))))
    >>> wfst12 = wfst10 + wfst11 + WfstCompose(wfst10, wfst11, epsilon)
    workset: set([(0, 0)])
    a b B
    p a A
    l a A
    p n N
    >>> #wfst12.dot_display(arc_label_callback=_wfst_arc_label_callback, globals=('rankdir=LR;',)) and None
    """
    # only the arc labels and the out-adjacency of each operand are used
    _, _, _, arclabels1 = wfst1.graphseqs
    _, _, _, arclabels2 = wfst2.graphseqs
    nodeadjout1 = wfst1.nodeadjout
    nodeadjout2 = wfst2.nodeadjout

    def is_epsilon(label):
        # identity covers the default epsilon=None; equality additionally
        # recognises an equal-valued label that is a distinct object (e.g. a
        # string that was not interned), which a bare 'is' test would miss
        return label is epsilon or label == epsilon

    # the start node of each operand is taken to be node 0
    starts1 = frozenset((0,))
    starts2 = frozenset((0,))

    # XXX need to get per-node epsilon closures first

    workset = set((s1, s2) for s1 in starts1 for s2 in starts2)
    print('workset: %s' % (workset,))

    # every pair ever put on the workset; some may become isolated
    nodes = set(workset)
    arcs = set()

    def reach(pair):
        # schedule a node pair for expansion the first time it is seen
        if pair not in nodes:
            nodes.add(pair)
            assert pair not in workset
            workset.add(pair)

    while workset:
        u = workset.pop()
        u1, u2 = u
        outs1 = nodeadjout1[u1]
        outs2 = nodeadjout2[u2]
        for v1, e1 in outs1:
            in1, out1 = arclabels1[e1]
            for v2, e2 in outs2:
                in2, out2 = arclabels2[e2]
                if is_epsilon(in2) and is_epsilon(out2):
                    # pure epsilon arc in wfst2: advance wfst2 only; no arc
                    # is recorded for this move
                    reach((u1, v2))
                elif is_epsilon(out1):
                    # wfst1 arc with epsilon output: keep its label as is
                    v = v1, v2
                    reach(v)
                    arcs.add((u, v, (in1, out1)))
                elif out1 == in2:
                    print('%s %s %s' % (in1, out1, out2))
                    v = v1, v2
                    reach(v)
                    # the same composed arc can legitimately arise through
                    # different intermediate symbols; the set absorbs the
                    # duplicate, so no uniqueness assertion is made here
                    arcs.add((u, v, (in1, out2)))

    node_by_id, id_by_node = frozenbijection(nodes)
    # snapshot the arcs once so the three parallel sequences line up
    arc_items = tuple(arcs)
    arcstart = tuple(id_by_node[tail] for tail, _, _ in arc_items)
    arcend = tuple(id_by_node[head] for _, head, _ in arc_items)
    arclabel = tuple(label for _, _, label in arc_items)
    return GraphBase(GraphTables((node_by_id, arcstart, arcend, arclabel)))
def __init__(self, grammar_sets, debug=False):
    """
    Build the integer-indexed tables of the grammar.

    grammar_sets is either a GrammarTables instance or an existing Grammar;
    in the latter case that grammar's own tables are used (copy
    construction).  When debug is true, each intermediate table is printed
    to stdout as soon as it has been built.
    """

    def show(*items):
        # emit one debug line: the items, space-separated
        print(' '.join(str(item) for item in items))

    def encode(lhs, rhs):
        # integer form of one production; completion_id is appended to the
        # rhs as a sentinel that a cursor can index
        lhs_id = self.id_by_non_terminal[lhs]
        rhs_ids = tuple(self.symbol_to_integer(symbol) for symbol in rhs)
        return lhs_id, rhs_ids + (self.completion_id, )

    # copy construction: unwrap the tables held by another Grammar
    if isinstance(grammar_sets, Grammar):
        grammar_sets = grammar_sets.grammar_sets
    assert isinstance(grammar_sets, GrammarTables), repr(type(grammar_sets).__name__)
    grammar_sets._verify()

    terminals, non_terminals, productions, starts = grammar_sets
    self._grammar_sets = grammar_sets
    if debug:
        show('productions:', tuple(enumerate(productions)))
        show('starts:', tuple(starts))

    # two-way symbol <-> id maps; both must exist before symbol_to_integer
    # is called below
    self.terminal_by_id, self.id_by_terminal = frozenbijection(terminals)
    self.non_terminal_by_id, self.id_by_non_terminal = frozenbijection(non_terminals)
    if debug:
        show('terminal_by_id:', tuple(enumerate(self.terminal_by_id)))
        show(
            'terminal by integer:',
            tuple((self.symbol_to_integer(terminal), terminal) for terminal in self.terminal_by_id),
        )
        show('non_terminal_by_id:', tuple(enumerate(self.non_terminal_by_id)))

    # sanity check: the two ways of numbering a left-hand side agree
    for head, _ in productions:
        assert self.id_by_non_terminal[head] == self.symbol_to_integer(head)

    # level 1: productions as (lhs id, tuple of rhs integers + sentinel)
    prod1 = sorteduniquetuple(encode(lhs, rhs) for lhs, rhs in productions)
    if debug:
        show('prod1:', prod1)

    # level 2: identical rhs sequences are shared through an id
    self.rhs_by_id, id_by_rhs = frozenbijection(rhs_ids for _, rhs_ids in prod1)
    if debug:
        show('rhs_by_id:', tuple(enumerate(self.rhs_by_id)))
        show(
            'rhs uniqueness:',
            len(self.rhs_by_id),
            'of',
            len(prod1),
            ':',
            len(self.rhs_by_id) / len(prod1),
        )

    # each production becomes a pair (lhs id, rhs id)
    prod2 = sorteduniquetuple((lhs_id, id_by_rhs[rhs_ids]) for lhs_id, rhs_ids in prod1)
    if debug:
        show('prod2:', prod2)

    # gather the rhs ids belonging to each lhs id
    rhs_ids_by_lhs = defaultdict(set)
    for lhs_id, rhs_id in prod2:
        rhs_ids_by_lhs[lhs_id].add(rhs_id)

    # level 3: each lhs id is paired with the sorted tuple of its rhs ids
    # (its rule set), and identical rule sets are shared through an id
    prod3 = sorteduniquetuple(
        (lhs_id, sorteduniquetuple(rhs_ids_by_lhs[lhs_id]))
        for lhs_id in xrange(len(rhs_ids_by_lhs))
    )
    self.ruleset_by_id, id_by_ruleset = frozenbijection(ruleset for _, ruleset in prod3)
    if debug:
        show('ruleset_by_id:', tuple(enumerate(self.ruleset_by_id)))
        show(
            'ruleset uniqueness:',
            len(self.ruleset_by_id),
            'of',
            len(prod3),
            ':',
            len(self.ruleset_by_id) / len(prod3),
        )

    # note: the lhs ids in prod3 are by now a plain enumeration, so a
    # tuple indexed by lhs id is sufficient
    self.ruleset_id_by_lhs_id = tuple(id_by_ruleset[ruleset] for _, ruleset in prod3)
    if debug:
        show('self.ruleset_id_by_lhs_id:', tuple(enumerate(self.ruleset_id_by_lhs_id)))

    self.starts = sorteduniquetuple(self.id_by_non_terminal[start] for start in starts)
    if debug:
        show('starts:', self.starts)

    self._verify()