def parse_table(name, table, subsets):
    """Parse a textual state table into a ``KimmoFSARule``.

    ``table`` is split on newlines and read as follows:

    * line 0 is not examined by this function;
    * lines 1 and 2 are whitespace-separated tokens that are zipped into
      ``KimmoPair(p1, p2)`` column labels, so both lines must contain the
      same number of tokens;
    * every later non-blank line is a state row: a state name, then ``.``
      or ``:`` (``:`` records the state as final), then one target state
      per column pair.

    ``name`` is used in error messages and, together with ``subsets``, is
    passed through to ``KimmoFSARule``.

    Raises:
        ValueError: if the table has fewer than four lines, the two pair
            lines differ in length, a state row cannot be parsed, or a
            state row does not have exactly one target per pair.
    """
    lines = table.split('\n')
    if len(lines) < 4:
        # Call-style raise: valid on both Python 2 and Python 3.
        raise ValueError(
            "Rule %s has too few lines to be an FSA table." % name)

    first_halves = lines[1].strip().split()
    second_halves = lines[2].strip().split()
    if len(first_halves) != len(second_halves):
        raise ValueError(
            "Rule %s has pair definitions that don't line up." % name)
    pairs = [KimmoPair(p1, p2)
             for p1, p2 in zip(first_halves, second_halves)]

    finals = []
    fsa = FSA()
    # Compiled once because the same pattern is applied to every row.
    row_pattern = re.compile(r'(\w+)(\.|:)\s*(.*)')
    for line in lines[3:]:
        line = line.strip()
        if not line:
            continue
        match = row_pattern.match(line)
        if match is None:
            raise ValueError(
                "Can't parse this line of the state table for rule %s:\n%s"
                % (name, line))
        state, marker, rest = match.groups()
        # NOTE(review): start() == 0 is presumably the FSA's "no start
        # state chosen yet" default, making the first row the start state;
        # confirm against the FSA class.
        if fsa.start() == 0:
            fsa.set_start(state)
        if marker == ':':
            finals.append(state)
        fsa.add_state(state)
        targets = rest.split()
        if len(targets) != len(pairs):
            raise ValueError(
                "Rule %s has a row of the wrong length:\n%s\ngot %d items, should be %d"
                % (name, line, len(targets), len(pairs)))
        for pair, target in zip(pairs, targets):
            fsa.insert_safe(state, pair, target)
    fsa.set_final(finals)
    return KimmoFSARule(name, fsa, subsets)
Beispiel #2
0
 def parse_table(name, table, subsets):
     """Build a ``KimmoFSARule`` from a textual FSA state table.

     Layout of ``table`` (split on newlines):

     * the first line is skipped without being read;
     * the second and third lines are split on whitespace and zipped
       token-by-token into ``KimmoPair`` column labels, so they must have
       equal token counts;
     * each remaining non-blank line is ``<state><.|:> <target> ...`` with
       one target per column pair; a ``:`` after the state name marks the
       state as final.

     ``name`` appears in error messages; ``name`` and ``subsets`` are
     handed unchanged to ``KimmoFSARule``.

     Raises:
         ValueError: for a table shorter than four lines, pair lines of
             different lengths, an unparseable state row, or a state row
             whose number of targets differs from the number of pairs.
     """
     lines = table.split('\n')
     if len(lines) < 4:
         # raise Class(msg) is accepted by Python 2 and Python 3 alike.
         raise ValueError(
             "Rule %s has too few lines to be an FSA table." % name)

     top_row = lines[1].strip().split()
     bottom_row = lines[2].strip().split()
     if len(top_row) != len(bottom_row):
         raise ValueError(
             "Rule %s has pair definitions that don't line up." % name)
     pairs = [KimmoPair(p1, p2) for p1, p2 in zip(top_row, bottom_row)]

     finals = []
     fsa = FSA()
     # The row pattern is loop-invariant, so compile it a single time.
     row_pattern = re.compile(r'(\w+)(\.|:)\s*(.*)')
     for line in lines[3:]:
         line = line.strip()
         if not line:
             continue
         match = row_pattern.match(line)
         if match is None:
             raise ValueError(
                 "Can't parse this line of the state table for rule %s:\n%s"
                 % (name, line))
         state, marker, rest = match.groups()
         # NOTE(review): presumably start() returns 0 until a start state
         # has been set, so the first row wins; verify in FSA.
         if fsa.start() == 0:
             fsa.set_start(state)
         if marker == ':':
             finals.append(state)
         fsa.add_state(state)
         targets = rest.split()
         if len(targets) != len(pairs):
             raise ValueError(
                 "Rule %s has a row of the wrong length:\n%s\ngot %d items, should be %d"
                 % (name, line, len(targets), len(pairs)))
         for pair, target in zip(pairs, targets):
             fsa.insert_safe(state, pair, target)
     fsa.set_final(finals)
     return KimmoFSARule(name, fsa, subsets)
    def from_dfa_dict(name, states, subsets):
        """Build a ``KimmoFSARule`` from a dictionary description of a DFA.

        ``states`` maps a state description string to a dict of
        ``label -> target state``.  For each description:

        * the last whitespace-separated word is the state name;
        * the state is added as final unless the first word starts with
          ``rej``;
        * a state named ``begin``/``Begin``/``1`` becomes the start state
          if ``fsa.start()`` is still 0, and a state named
          ``start``/``Start`` always becomes the start state.

        The label ``others`` (matched case-insensitively) is a wildcard:
        its target is inserted for the ``@`` pair and for every pair that
        is mentioned somewhere in ``states`` but not listed by this state.

        ``name`` and ``subsets`` are passed through to ``KimmoFSARule``.
        """
        fsa = FSA()

        # Collect every explicitly-labelled pair in the whole table.  The
        # wildcard test is case-insensitive so that it agrees with the
        # dispatch below; previously 'Others' was registered as a real
        # pair here yet treated as the wildcard further down.
        pairs = set([KimmoPair.make('@')])
        for trans in states.values():
            for label in trans:
                if label.lower() != 'others':
                    pairs.add(KimmoPair.make(label))

        for statename, trans in states.items():
            parts = statename.split()
            source = parts[-1]
            if not parts[0].startswith('rej'):
                fsa.add_final(source)

            # NOTE(review): start() == 0 presumably means "no start state
            # chosen yet"; confirm against the FSA class.
            if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]:
                fsa.set_start(source)
            if source in ['start', 'Start']:
                fsa.set_start(source)

            # Pairs this state handles explicitly; the wildcard must not
            # override them.
            used_pairs = set(KimmoPair.make(label) for label in trans
                             if label.lower() != 'others')
            for label, target in trans.items():
                if label.lower() == 'others':
                    fsa.insert_safe(source, KimmoPair.make('@'), target)
                    for pair in pairs.difference(used_pairs):
                        fsa.insert_safe(source, pair, target)
                else:
                    fsa.insert_safe(source, KimmoPair.make(label), target)
        return KimmoFSARule(name, fsa, subsets)
Beispiel #4
0
 def from_dfa_dict(name, states, subsets):
     """Construct a ``KimmoFSARule`` from a dict-of-dicts DFA description.

     Each key of ``states`` is a description string whose last
     whitespace-separated word is the state name; each value maps a
     transition label to a target state.  A state is registered as final
     unless the first word of its description starts with ``rej``.  A
     state named ``start``/``Start`` is always made the start state; one
     named ``begin``/``Begin``/``1`` is made the start state only while
     ``fsa.start()`` is still 0.

     A label equal to ``others`` (ignoring case) acts as a wildcard: its
     target is inserted under the ``@`` pair and under every pair that
     occurs as a label anywhere in ``states`` but not in this state.

     ``name`` and ``subsets`` are forwarded to ``KimmoFSARule``.
     """
     fsa = FSA()

     # Gather all explicit pairs across the table.  Wildcard detection is
     # case-insensitive in every loop so the three checks agree; before,
     # a label like 'Others' was added as a pair here but handled as the
     # wildcard in the dispatch loop.
     pairs = set([KimmoPair.make('@')])
     for trans in states.values():
         for label in trans:
             if label.lower() != 'others':
                 pairs.add(KimmoPair.make(label))

     for statename, trans in states.items():
         parts = statename.split()
         source = parts[-1]
         if not parts[0].startswith('rej'):
             fsa.add_final(source)

         # NOTE(review): start() == 0 looks like the "unset" default of
         # FSA; verify against the FSA implementation.
         if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]:
             fsa.set_start(source)
         if source in ['start', 'Start']:
             fsa.set_start(source)

         # Explicit pairs of this state are excluded from wildcard fill-in.
         used_pairs = set(KimmoPair.make(label) for label in trans
                          if label.lower() != 'others')
         for label, target in trans.items():
             if label.lower() == 'others':
                 fsa.insert_safe(source, KimmoPair.make('@'), target)
                 for pair in pairs.difference(used_pairs):
                     fsa.insert_safe(source, pair, target)
             else:
                 fsa.insert_safe(source, KimmoPair.make(label), target)
     return KimmoFSARule(name, fsa, subsets)
Beispiel #5
0
 def from_text(text):
     """Build a ``KimmoMorphology`` from a textual lexicon description.

     ``text`` is processed line by line after stripping whitespace.  Blank
     lines and lines starting with ``;`` are skipped.  Other lines are
     interpreted as follows:

     * a line ending in ``:`` makes the text before the colon the current
       state for subsequent entries (the initial state is ``Begin``);
     * ``NAME: S1 S2 ...`` inserts a ``None``-labelled transition from
       ``NAME`` to each listed state;
     * ``word next features...`` (three or more tokens) inserts a
       transition from the current state to ``next`` labelled
       ``(word, features)``; a quoted ``word`` is evaluated as a Python
       literal, the features text ``None`` becomes ``None``, and features
       starting with a quote or ``{`` are wrapped in ``YAMLwrapper``;
     * ``word next`` inserts a transition labelled ``(word, '')``, with
       the word ``''`` standing for the empty string;
     * any other line is reported on standard output and ignored.
     """
     fsa = FSA([], {}, 'Begin', ['End'])
     state = 'Begin'
     for line in text.split('\n'):
         line = line.strip()
         if not line or line.startswith(';'):
             continue
         if line.endswith(':'):
             state = line[:-1]
             continue

         tokens = line.split()
         if tokens[0].endswith(':'):
             source = tokens[0][:-1]
             for target in tokens[1:]:
                 fsa.insert_safe(source, None, target)
         elif len(tokens) > 2:
             # Lexicon entry; everything after the second token is the
             # feature description, embedded whitespace included.
             word, target, features = line.split(None, 2)
             # Precedence written out: any word opening with a double
             # quote, or a word fully wrapped in single quotes.
             if word.startswith('"') or (
                     word.startswith("'") and word.endswith("'")):
                 # SECURITY: eval() executes arbitrary expressions taken
                 # from the lexicon text.  Only load trusted lexicons, or
                 # consider ast.literal_eval for plain string literals.
                 word = eval(word)
             if features:
                 if features == 'None':
                     features = None
                 elif features[0] in '\'"{':
                     features = YAMLwrapper(features)
             fsa.insert_safe(state, (word, features), target)
         elif len(tokens) == 2:
             word, target = tokens
             if word == "''":
                 word = ''
             fsa.insert_safe(state, (word, ''), target)
         else:
             # Parenthesised single argument: prints identically under the
             # Python 2 print statement and the Python 3 print function.
             print("Ignoring line in morphology: %r" % line)
     return KimmoMorphology(fsa)
 def from_text(text):
     """Parse a textual lexicon into a ``KimmoMorphology``.

     Every line of ``text`` is stripped; empty lines and ``;`` comment
     lines are ignored.  The remaining line shapes are:

     * ``STATE:`` (line ends in a colon) selects the current state for
       the entries that follow; parsing starts in state ``Begin``;
     * ``NAME: S1 S2 ...`` adds a ``None``-labelled transition from
       ``NAME`` to each of the listed states;
     * ``word next features...`` adds a transition from the current state
       to ``next`` labelled ``(word, features)``.  A quoted ``word`` is
       evaluated as a Python literal; features equal to ``None`` become
       ``None``; features beginning with a quote or ``{`` are wrapped in
       ``YAMLwrapper``;
     * ``word next`` adds a transition labelled ``(word, '')``, where the
       word ``''`` denotes the empty string;
     * a lone token is reported on standard output and skipped.
     """
     fsa = FSA([], {}, 'Begin', ['End'])
     state = 'Begin'
     for line in text.split('\n'):
         line = line.strip()
         if not line or line.startswith(';'):
             continue
         if line.endswith(':'):
             state = line[:-1]
             continue

         # Split once instead of re-splitting in every branch test.
         tokens = line.split()
         if tokens[0].endswith(':'):
             source = tokens[0][:-1]
             for target in tokens[1:]:
                 fsa.insert_safe(source, None, target)
         elif len(tokens) > 2:
             # Lexicon entry: the third field keeps its internal spacing.
             word, target, features = line.split(None, 2)
             # 'and' binds tighter than 'or'; parentheses make that
             # existing behaviour explicit.
             if word.startswith('"') or (
                     word.startswith("'") and word.endswith("'")):
                 # SECURITY: eval() runs whatever expression the lexicon
                 # contains.  Use only with trusted input, or consider
                 # ast.literal_eval for plain string literals.
                 word = eval(word)
             if features:
                 if features == 'None':
                     features = None
                 elif features[0] in '\'"{':
                     features = YAMLwrapper(features)
             fsa.insert_safe(state, (word, features), target)
         elif len(tokens) == 2:
             word, target = tokens
             if word == "''":
                 word = ''
             fsa.insert_safe(state, (word, ''), target)
         else:
             # Works as both a Python 2 print statement and a Python 3
             # print() call, producing the same output.
             print("Ignoring line in morphology: %r" % line)
     return KimmoMorphology(fsa)