def parse_table(name, table, subsets):
    """Build a KimmoFSARule from a textual state table.

    ``table`` is split on newlines and read as follows:

    * line 0 is never read;
    * lines 1 and 2 hold whitespace-separated columns; column *i* of
      line 1 and column *i* of line 2 together form the *i*-th
      ``KimmoPair``;
    * every following non-blank line is a state row of the form
      ``<state><'.' or ':'> <next> <next> ...`` with exactly one next
      state per pair.  A state followed by ':' is recorded as final.

    The first state row encountered while ``fsa.start()`` still equals 0
    is installed as the start state.

    Parameters:
        name: rule name; used in error messages and passed to
            ``KimmoFSARule``.
        table: the table text described above.
        subsets: passed through to ``KimmoFSARule`` unchanged.

    Returns:
        ``KimmoFSARule(name, fsa, subsets)``.

    Raises:
        ValueError: if the table has fewer than four lines, the two pair
            lines have different column counts, a state row cannot be
            parsed, or a state row has the wrong number of next states.
    """
    lines = table.split('\n')
    if len(lines) < 4:
        raise ValueError(
            "Rule %s has too few lines to be an FSA table." % name)

    pairs1 = lines[1].split()
    pairs2 = lines[2].split()
    if len(pairs1) != len(pairs2):
        raise ValueError(
            "Rule %s has pair definitions that don't line up." % name)
    pairs = [KimmoPair(p1, p2) for p1, p2 in zip(pairs1, pairs2)]

    # Compiled once instead of being re-looked-up for every row.
    row_pattern = re.compile(r'(\w+)(\.|:)\s*(.*)')

    finals = []
    fsa = FSA()
    for line in lines[3:]:
        line = line.strip()
        if not line:
            continue
        match = row_pattern.match(line)
        if match is None:
            raise ValueError(
                "Can't parse this line of the state table for rule %s:\n%s"
                % (name, line))
        state, marker, targets = match.groups()

        # Only the first row gets to set the start state.
        if fsa.start() == 0:
            fsa.set_start(state)
        if marker == ':':
            finals.append(state)
        fsa.add_state(state)

        targets = targets.split()
        if len(targets) != len(pairs):
            raise ValueError(
                "Rule %s has a row of the wrong length:\n%s\n"
                "got %d items, should be %d"
                % (name, line, len(targets), len(pairs)))
        for pair, nextstate in zip(pairs, targets):
            fsa.insert_safe(state, pair, nextstate)

    fsa.set_final(finals)
    return KimmoFSARule(name, fsa, subsets)
def from_dfa_dict(name, states, subsets):
    """Build a KimmoFSARule from a dictionary description of a DFA.

    ``states`` maps each state name to a ``{label: target}`` dict of
    transitions.

    State names: only the last whitespace-separated word of a state name
    is used as the FSA state.  The state is made final unless the first
    word of its name starts with ``rej``.  A state called ``start`` or
    ``Start`` is always installed as the start state; ``begin``,
    ``Begin`` or ``1`` is installed only while ``fsa.start()`` still
    equals 0.

    Labels: every label is turned into a pair with ``KimmoPair.make``,
    except ``others`` (matched case-insensitively), which is a catch-all.
    A catch-all transition is inserted for the ``'@'`` pair and for every
    pair that appears anywhere in ``states`` but has no explicit
    transition from the current state.

    Parameters:
        name: rule name, passed to ``KimmoFSARule``.
        states: the ``{state name: {label: target}}`` mapping.
        subsets: passed through to ``KimmoFSARule`` unchanged.

    Returns:
        ``KimmoFSARule(name, fsa, subsets)``.
    """
    def is_catch_all(label):
        # One definition of the catch-all label, shared by every pass, so
        # that e.g. 'Others' is never also treated as a literal pair.
        return label.lower() == 'others'

    fsa = FSA()

    # The alphabet: '@' plus every explicit label used by any state.
    pairs = set([KimmoPair.make('@')])
    for trans in states.values():
        for label in trans:
            if not is_catch_all(label):
                pairs.add(KimmoPair.make(label))

    for statename, trans in states.items():
        parts = statename.split()
        source = parts[-1]
        if not parts[0].startswith('rej'):
            fsa.add_final(source)
        if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]:
            fsa.set_start(source)
        if source in ['start', 'Start']:
            fsa.set_start(source)

        # Pairs with an explicit transition out of this state; the
        # catch-all must not override them.
        used_pairs = set(
            KimmoPair.make(label) for label in trans
            if not is_catch_all(label))

        for label, target in trans.items():
            if is_catch_all(label):
                fsa.insert_safe(source, KimmoPair.make('@'), target)
                for pair in pairs.difference(used_pairs):
                    fsa.insert_safe(source, pair, target)
            else:
                fsa.insert_safe(source, KimmoPair.make(label), target)
    return KimmoFSARule(name, fsa, subsets)
def from_text(text):
    """Build a KimmoMorphology from a textual lexicon description.

    The FSA is created with start state ``'Begin'`` and final state list
    ``['End']``.  ``text`` is processed one line at a time, after
    stripping surrounding whitespace:

    * blank lines and lines starting with ``;`` are skipped;
    * a line ending in ``:`` (e.g. ``Noun:``) selects the current state
      for the lexicon entries that follow;
    * a line whose first word ends in ``:`` (e.g. ``Begin: Noun Verb``)
      adds a transition labelled ``None`` from that state to each state
      listed after it;
    * a line of three or more words is ``word next_state features``,
      where ``features`` is the remainder of the line;
    * a line of exactly two words is ``word next_state`` with ``''`` as
      the features; the word ``''`` stands for the empty string;
    * any other line is reported with ``print`` and ignored.

    Parameters:
        text: the lexicon description.

    Returns:
        ``KimmoMorphology(fsa)``.
    """
    fsa = FSA([], {}, 'Begin', ['End'])
    state = 'Begin'
    for line in text.split('\n'):
        line = line.strip()
        if not line or line.startswith(';'):
            continue
        if line.endswith(':'):
            state = line[:-1]
            continue

        fields = line.split()
        if fields[0].endswith(':'):
            source = fields[0][:-1]
            for target in fields[1:]:
                fsa.insert_safe(source, None, target)
        elif len(fields) > 2:
            # Lexicon entry; the features keep their internal spacing.
            word, target, features = line.split(None, 2)

            # Only unquote a word whose opening and closing quotes match.
            # A word that merely starts with a quote is kept literally
            # rather than being handed to eval() and raising SyntaxError.
            if len(word) >= 2 and word[0] in '"\'' and word[-1] == word[0]:
                # SECURITY: eval() executes arbitrary expressions; only
                # load lexicon text from trusted sources.
                word = eval(word)

            # features cannot be empty here: the line has >= 3 fields.
            if features == 'None':
                features = None
            elif features[0] in '\'"{':
                features = YAMLwrapper(features)
            fsa.insert_safe(state, (word, features), target)
        elif len(fields) == 2:
            word, target = fields
            if word == "''":
                word = ''
            fsa.insert_safe(state, (word, ''), target)
        else:
            # Parenthesised so the line is valid on Python 2 and 3.
            print("Ignoring line in morphology: %r" % line)
    return KimmoMorphology(fsa)
def from_text(text):
    """Build a KimmoMorphology from a textual lexicon description.

    The FSA is created with start state ``'Begin'`` and final state list
    ``['End']``.  Each line of ``text`` is stripped and then handled by
    the first rule that applies:

    1. Blank lines and lines starting with ``;`` are skipped.
    2. A line ending in ``:`` (e.g. ``Noun:``) makes that name the
       current state for subsequent lexicon entries.
    3. A line whose first word ends in ``:`` (e.g. ``Begin: Noun Verb``)
       adds a ``None``-labelled transition from that state to every
       state named after it.
    4. A line with three or more words is a lexicon entry
       ``word next_state features``; ``features`` is the rest of the
       line.  The literal text ``None`` becomes ``None``; features
       starting with a quote or ``{`` are wrapped in ``YAMLwrapper``.
    5. A line with exactly two words is ``word next_state`` with ``''``
       as features; the word ``''`` denotes the empty string.
    6. Anything else is reported with ``print`` and ignored.

    Parameters:
        text: the lexicon description.

    Returns:
        ``KimmoMorphology(fsa)``.
    """
    fsa = FSA([], {}, 'Begin', ['End'])
    state = 'Begin'
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line or line.startswith(';'):
            continue
        if line.endswith(':'):
            state = line[:-1]
            continue

        tokens = line.split()
        head = tokens[0]
        if head.endswith(':'):
            origin = head[:-1]
            for destination in tokens[1:]:
                fsa.insert_safe(origin, None, destination)
            continue

        if len(tokens) > 2:
            # Lexicon entry; split again so features keep their spacing.
            word, destination, features = line.split(None, 2)

            # Unquote only when the word opens and closes with the same
            # quote character.  A lone leading double quote is left as
            # literal text instead of crashing inside eval().
            has_matching_quotes = (
                len(word) >= 2
                and word[0] in ('"', "'")
                and word[-1] == word[0]
            )
            if has_matching_quotes:
                # SECURITY: eval() runs arbitrary expressions; the
                # lexicon text must come from a trusted source.
                word = eval(word)

            # features is non-empty here because the line has >= 3 words.
            if features == 'None':
                features = None
            elif features[0] in '\'"{':
                features = YAMLwrapper(features)
            fsa.insert_safe(state, (word, features), destination)
        elif len(tokens) == 2:
            word, destination = tokens
            if word == "''":
                word = ''
            fsa.insert_safe(state, (word, ''), destination)
        else:
            # Call form prints the same text on Python 2 and Python 3.
            print("Ignoring line in morphology: %r" % line)
    return KimmoMorphology(fsa)