def _pair_from_tree(self, tree): if (tree.node != 'Pair'): raise RuntimeException('expected Pair, got ' + str(tree)) if len(tree) == 1: return KimmoPair(tree[0], tree[0]) else: return KimmoPair(tree[0], tree[2])
def from_dfa_dict(name, states, subsets):
    """Build a ``KimmoFSARule`` from a dictionary description of an automaton.

    ``states`` maps a state name to a ``{label: target}`` dictionary.  The
    last whitespace-separated word of the state name is used as the state;
    the state is registered as final unless the first word starts with
    ``'rej'``.  A state called ``'start'``/``'Start'`` always becomes the
    start state; ``'begin'``, ``'Begin'``, ``'1'`` or ``1`` becomes the start
    state only while ``fsa.start()`` is still ``0``.

    A label equal to ``'others'`` (compared case-insensitively at insertion
    time) is expanded to the ``'@'`` pair plus every pair known to the table
    that this state does not list itself.
    """
    fsa = FSA()

    # Collect every pair named anywhere in the table, plus the '@' pair.
    all_pairs = set([KimmoPair.make('@')])
    for transitions in states.values():
        for label in transitions:
            if label != 'others':
                all_pairs.add(KimmoPair.make(label))

    for statename, transitions in states.items():
        words = statename.split()
        source = words[-1]

        if not words[0].startswith('rej'):
            fsa.add_final(source)
        if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]:
            fsa.set_start(source)
        if source in ['start', 'Start']:
            fsa.set_start(source)

        # Pairs this state spells out explicitly.
        explicit = set()
        for label in transitions:
            if label != 'others':
                explicit.add(KimmoPair.make(label))

        for label, target in transitions.items():
            if label.lower() != 'others':
                fsa.insert_safe(source, KimmoPair.make(label), target)
                continue
            # 'others': the '@' pair first, then every pair not listed here.
            fsa.insert_safe(source, KimmoPair.make('@'), target)
            for pair in all_pairs.difference(explicit):
                fsa.insert_safe(source, pair, target)

    return KimmoFSARule(name, fsa, subsets)
def parse_table(name, table, subsets):
    """Parse a textual state table into a ``KimmoFSARule``.

    Layout of ``table`` (split on newlines):

    * line 0 is not read;
    * lines 1 and 2 hold whitespace-separated symbols that are zipped
      column-wise into ``KimmoPair(line1_symbol, line2_symbol)``;
    * every following non-blank line is ``<state>`` immediately followed by
      ``.`` or ``:`` and then one target state per pair column.  A ``:``
      marks the state as final.

    The state of the first row becomes the start state (it is set while
    ``fsa.start()`` is still ``0``).

    Raises:
        ValueError: if the table has fewer than four lines, the two pair
            lines differ in length, a row cannot be parsed, or a row has
            the wrong number of target states.
    """
    lines = table.split('\n')
    if len(lines) < 4:
        raise ValueError(
            "Rule %s has too few lines to be an FSA table." % name)

    pairs1 = lines[1].strip().split()
    pairs2 = lines[2].strip().split()
    if len(pairs1) != len(pairs2):
        raise ValueError(
            "Rule %s has pair definitions that don't line up." % name)
    pairs = [KimmoPair(p1, p2) for p1, p2 in zip(pairs1, pairs2)]

    finals = []
    fsa = FSA()
    for line in lines[3:]:
        line = line.strip()
        if not line:
            continue
        match = re.match(r'(\w+)(\.|:)\s*(.*)', line)
        if match is None:
            raise ValueError(
                "Can't parse this line of the state table for rule %s:\n%s"
                % (name, line))
        state, char, morestates = match.groups()

        if fsa.start() == 0:
            fsa.set_start(state)
        if char == ':':
            finals.append(state)
        fsa.add_state(state)

        morestates = morestates.split()
        if len(morestates) != len(pairs):
            raise ValueError(
                "Rule %s has a row of the wrong length:\n%s\ngot %d items, should be %d"
                % (name, line, len(morestates), len(pairs)))
        for pair, nextstate in zip(pairs, morestates):
            fsa.insert_safe(state, pair, nextstate)

    fsa.set_final(finals)
    return KimmoFSARule(name, fsa, subsets)
def _from_yaml_dict(cls, map): lexicon = map.get('lexicon') if lexicon: lexicon = KimmoMorphology.load(lexicon) subsets = map['subsets'] for key, value in subsets.items(): if isinstance(value, basestring): subsets[key] = value.split() defaults = map['defaults'] if isinstance(defaults, basestring): defaults = defaults.split() defaults = [KimmoPair.make(text) for text in defaults] ruledic = map['rules'] rules = [] for (name, rule) in ruledic.items(): if isinstance(rule, dict): rules.append(KimmoFSARule.from_dfa_dict(name, rule, subsets)) elif isinstance(rule, basestring): if rule.strip().startswith('FSA'): rules.append(KimmoFSARule.parse_table(name, rule, subsets)) else: rules.append(KimmoArrowRule(name, rule, subsets)) else: raise ValueError, "Can't recognize the data structure in '%s' as a rule: %s" % ( name, rule) return cls(subsets, defaults, rules, lexicon)
def _from_yaml_dict(cls, map): lexicon = map.get('lexicon') if lexicon: lexicon = KimmoMorphology.load(lexicon) subsets = {} if 'subsets' in map: map['subsets'] for key, value in subsets.items(): if isinstance(value, basestring): subsets[key] = value.split() defaults = map['defaults'] if isinstance(defaults, basestring): defaults = defaults.split() defaults = [KimmoPair.make(text) for text in defaults] rules = [] return cls(subsets, defaults, rules, lexicon)
# complete_fsa(self, fsa, fail_state=None)
#
# Returns a deep copy of `fsa` in which every state has a transition entry
# for every pair in `self._pairs`; the argument itself is not modified.
#
# * When `fail_state` is None, a state named 'Fail' is added to the copy
#   together with a 'Fail' --'@'--> 'Fail' self-loop, and the value returned
#   by `fsa.add_state('Fail')` is used as the fail state.
# * For a pair missing from a state's transition table, the pairs returned by
#   `sort_subsets(self._pairs, self._subsets)` are scanned for one that is
#   already in the table and for which `sp.includes(pair, self._subsets)`
#   holds; its targets are copied to the missing pair.
# * `[fail_state]` is assigned as the target list of a pair, and an empty
#   target list (`[]`) is likewise replaced by `[fail_state]`.
# * The table is edited directly through `fsa._transitions`, so
#   `fsa._build_reverse_transitions()` is called before returning.
#
# NOTE(review): the original indentation of this block has been lost.  As the
# text stands there is no `else:` after the inner `for sp in sorted_pairs`
# loop, so depending on its nesting the `trans[pair] = [fail_state]` that
# follows `break` may overwrite the targets just copied from `sp` -- confirm
# against the upstream source before relying on the subset fallback.
def complete_fsa(self, fsa, fail_state=None): fsa = deepcopy(fsa) if fail_state is None: fail_state = fsa.add_state('Fail') fsa.insert('Fail', KimmoPair.make('@'), 'Fail') sorted_pairs = sort_subsets(self._pairs, self._subsets) for state in fsa.states(): trans = fsa._transitions[state] for pair in self._pairs: if pair not in trans: for sp in sorted_pairs: if sp in trans and sp.includes(pair, self._subsets): trans[pair] = trans[sp] break trans[pair] = [fail_state] if trans[pair] == []: trans[pair] = [fail_state] fsa._build_reverse_transitions() return fsa
def _from_yaml_dict(cls, map): lexicon = map.get('lexicon') if lexicon: lexicon = KimmoMorphology.load(lexicon) subsets = map['subsets'] for key, value in subsets.items(): if isinstance(value, basestring): subsets[key] = value.split() defaults = map['defaults'] if isinstance(defaults, basestring): defaults = defaults.split() defaults = [KimmoPair.make(text) for text in defaults] ruledic = map['rules'] rules = [] for (name, rule) in ruledic.items(): if isinstance(rule, dict): rules.append(KimmoFSARule.from_dfa_dict(name, rule, subsets)) elif isinstance(rule, basestring): if rule.strip().startswith('FSA'): rules.append(KimmoFSARule.parse_table(name, rule, subsets)) else: rules.append(KimmoArrowRule(name, rule, subsets)) else: raise ValueError, "Can't recognize the data structure in '%s' as a rule: %s" % (name, rule) return cls(subsets, defaults, rules, lexicon)
def _pairify(state): newstate = {} for label, targets in state.items(): newstate[KimmoPair.make(label)] = targets return newstate
# _generate(self, pairs, state_list, morphology_state=None, word='',
#           lexical=None, surface=None, features='', log=None,
#           origsurface=None)
#
# Recursive generator.  It yields `(pairs, features)` tuples and re-yields
# whatever its recursive calls produce.
#
# What the code visibly does:
# * With a truthy `morphology_state`, it walks
#   `self._morphology.next_states(morphology_state, word)`, merges each
#   returned feature into `features` with `combine_features`, and recurses
#   with the new morphology state and an empty `word`.  Otherwise it computes
#   `lexical_chars` from `morph.valid_lexical(...)` plus `self._null`.
# * When `lexical` or `surface` is the empty string, and the morphology state
#   is None or 'end' (case-insensitive), every entry of `state_list` must be
#   in the finals of the matching rule's FSA before `(pairs, features)` is
#   yielded.
# * Candidate pairs come from `self._pair_alphabet` united with
#   `KimmoPair.make(x)` for each `x` in `origsurface`, filtered so that the
#   pair's input/output text is a prefix of `lexical`/`surface` (a None side
#   is unconstrained).
# * The pair whose input and output are both `self._null` is skipped with a
#   printed warning.
# * Each rule is advanced with `self._advance_rule`; a next state that is
#   None, or whose str() is '0' or 'reject', rejects the pair.
#
# NOTE(review): the original indentation of this block has been lost and the
# inline `#` comments are fused with code, so the nesting of the two
# `return  # only first result needed` statements, of `morphed = True`, and of
# the `log.*` calls cannot be confirmed from this text -- check upstream.
# NOTE(review): `log` and `origsurface` both default to None, yet some
# `log.clearFeatures()` / `log.addFeature(feat)` calls and the iteration over
# `origsurface` appear unguarded -- confirm callers always supply them.
# NOTE(review): `print "..."` is Python 2 statement syntax.
def _generate(self, pairs, state_list, morphology_state=None, word='', lexical=None, surface=None, features='', log=None, origsurface=None): feat = None if morphology_state: morph = self._morphology morphed = False for state, feat in morph.next_states(morphology_state, word): if feat is not None: #log.addFeature(feat) newfeat = combine_features(features, feat) else: newfeat = features #log.clearFeatures() for result in self._generate(pairs, state_list, state, '', lexical, surface, newfeat, log, origsurface): #log.clearFeatures() log.addFeature(feat) yield result return # only first result needed morphed = True #log.clearFeatures() if morphed: #log.clearFeatures() return lexical_chars = list(morph.valid_lexical(morphology_state, word, self._pair_alphabet.union(set([KimmoPair.make(x) for x in origsurface])))) + list(self._null) else: #log.clearFeatures() lexical_chars = None if lexical == '' or surface == '': if morphology_state is None or morphology_state.lower() == 'end': # check that all rules are in accepting states for r in range(len(self._rules)): rule = self._rules[r] state = state_list[r] if state not in rule.fsa().finals(): log.clearFeatures() return if log: log.succeed(pairs) #if feat is not None: # log.addFeature(feat) #log.clearFeatures() yield pairs, features #log.clearFeatures() return #print len(lexical_chars) npa = self._pair_alphabet.union(set([KimmoPair.make(x) for x in origsurface])) next_pairs = [p for p in npa if (lexical is None or startswith(lexical, self._pairtext(p.input()))) and (surface is None or startswith(surface, self._pairtext(p.output())))] for pair in next_pairs: if pair.input() == self._null and pair.output() == self._null: print "Warning: The pair 0:0 would be an infinite loop. Ignoring it." 
log.clearFeatures() continue if lexical_chars is not None and pair.input() not in lexical_chars: #log.clearLastFeature() continue new_states = state_list[:] for r in range(len(self._rules)): rule = self._rules[r] state = state_list[r] next_state = self._advance_rule(rule, state, pair) new_states[r] = next_state newword = word + self._pairtext(pair.input()) if log: log.step(pairs, pair, self._rules, state_list, new_states, morphology_state, newword) #if feat: # log.addFeature(feat) fail = False for new_state in new_states: if new_state is None or str(new_state) == '0'\ or str(new_state) == 'reject': fail = True break if fail: continue newlex, newsurf = lexical, surface if lexical: newlex = lexical[len(self._pairtext(pair.input())):] if surface: newsurf = surface[len(self._pairtext(pair.output())):] for result in self._generate(pairs+[pair], new_states, morphology_state, newword, newlex, newsurf, features, log, origsurface): yield result return # only first result needed
# _generate(self, pairs, state_list, morphology_state=None, word='',
#           lexical=None, surface=None, features='', log=None,
#           origsurface=None)
#
# Recursive generator.  It yields `(pairs, features)` tuples and re-yields
# whatever its recursive calls produce.
#
# What the code visibly does:
# * With a truthy `morphology_state`, it walks
#   `self._morphology.next_states(morphology_state, word)`, merges each
#   returned feature into `features` with `combine_features`, and recurses
#   with the new morphology state and an empty `word`.  Otherwise it computes
#   `lexical_chars` from `morph.valid_lexical(...)` plus `self._null`.
# * When `lexical` or `surface` is the empty string, and the morphology state
#   is None or 'end' (case-insensitive), every entry of `state_list` must be
#   in the finals of the matching rule's FSA before `(pairs, features)` is
#   yielded.
# * Candidate pairs come from `self._pair_alphabet` united with
#   `KimmoPair.make(x)` for each `x` in `origsurface`, filtered so that the
#   pair's input/output text is a prefix of `lexical`/`surface` (a None side
#   is unconstrained).
# * The pair whose input and output are both `self._null` is skipped with a
#   printed warning.
# * Each rule is advanced with `self._advance_rule`; a next state that is
#   None, or whose str() is '0' or 'reject', rejects the pair.
#
# NOTE(review): the original indentation of this block has been lost and the
# inline `#` comments are fused with code, so the nesting of the two
# `return  # only first result needed` statements, of `morphed = True`, and of
# the `log.*` calls cannot be confirmed from this text -- check upstream.
# NOTE(review): `log` and `origsurface` both default to None, yet some
# `log.clearFeatures()` / `log.addFeature(feat)` calls and the iteration over
# `origsurface` appear unguarded -- confirm callers always supply them.
# NOTE(review): `print "..."` is Python 2 statement syntax.
def _generate(self, pairs, state_list, morphology_state=None, word='', lexical=None, surface=None, features='', log=None, origsurface=None): feat = None if morphology_state: morph = self._morphology morphed = False for state, feat in morph.next_states(morphology_state, word): if feat is not None: #log.addFeature(feat) newfeat = combine_features(features, feat) else: newfeat = features #log.clearFeatures() for result in self._generate(pairs, state_list, state, '', lexical, surface, newfeat, log, origsurface): #log.clearFeatures() log.addFeature(feat) yield result return # only first result needed morphed = True #log.clearFeatures() if morphed: #log.clearFeatures() return lexical_chars = list( morph.valid_lexical( morphology_state, word, self._pair_alphabet.union( set([KimmoPair.make(x) for x in origsurface])))) + list(self._null) else: #log.clearFeatures() lexical_chars = None if lexical == '' or surface == '': if morphology_state is None or morphology_state.lower() == 'end': # check that all rules are in accepting states for r in range(len(self._rules)): rule = self._rules[r] state = state_list[r] if state not in rule.fsa().finals(): log.clearFeatures() return if log: log.succeed(pairs) #if feat is not None: # log.addFeature(feat) #log.clearFeatures() yield pairs, features #log.clearFeatures() return #print len(lexical_chars) npa = self._pair_alphabet.union( set([KimmoPair.make(x) for x in origsurface])) next_pairs = [ p for p in npa if (lexical is None or startswith(lexical, self._pairtext(p.input( )))) and (surface is None or startswith(surface, self._pairtext(p.output()))) ] for pair in next_pairs: if pair.input() == self._null and pair.output() == self._null: print "Warning: The pair 0:0 would be an infinite loop. Ignoring it." 
log.clearFeatures() continue if lexical_chars is not None and pair.input() not in lexical_chars: #log.clearLastFeature() continue new_states = state_list[:] for r in range(len(self._rules)): rule = self._rules[r] state = state_list[r] next_state = self._advance_rule(rule, state, pair) new_states[r] = next_state newword = word + self._pairtext(pair.input()) if log: log.step(pairs, pair, self._rules, state_list, new_states, morphology_state, newword) #if feat: # log.addFeature(feat) fail = False for new_state in new_states: if new_state is None or str(new_state) == '0'\ or str(new_state) == 'reject': fail = True break if fail: continue newlex, newsurf = lexical, surface if lexical: newlex = lexical[len(self._pairtext(pair.input())):] if surface: newsurf = surface[len(self._pairtext(pair.output())):] for result in self._generate(pairs + [pair], new_states, morphology_state, newword, newlex, newsurf, features, log, origsurface): yield result return # only first result needed