def parse_rules(cls, s):
    """
    Parse a L{CFG} line involving C{Categories}. A line has this form:

    C{lhs -> rhs | rhs | ...}

    where C{lhs} is a Category, and each C{rhs} is a sequence of
    Categories.

    @returns: a list of C{Productions}, one for each C{rhs}.
    """
    _PARSE_RE = cls._PARSE_RE
    position = 0
    try:
        lhs, position = cls._parse(s, position)
    except ValueError as e:
        # e.args == (expected-token description, failure position); build
        # a caret diagram pointing at the spot where parsing failed.
        estr = ('Error parsing field structure\n\n\t' + s + '\n\t' + ' ' *
                e.args[1] + '^ ' + 'Expected %s\n' % e.args[0])
        raise ValueError(estr)
    # Freeze the LHS so its hash/repr are memoized for fast comparisons.
    lhs.freeze()
    match = _PARSE_RE['arrow'].match(s, position)
    if match is None:
        raise ValueError('arrow', position)
    else:
        position = match.end()
    rules = []
    while position < len(s):
        rhs = []
        # Accumulate RHS items until the next '|' disjunct marker (or the
        # end of the line).
        while position < len(s) and _PARSE_RE['disjunct'].match(
                s, position) is None:
            try:
                val, position = cls._parseval(s, position, {})
            except ValueError as e:
                # Same caret-diagram diagnostic as for the LHS above.
                estr = ('Error parsing field structure\n\n\t' + s + '\n\t'
                        + ' ' * e.args[1] + '^ ' +
                        'Expected %s\n' % e.args[0])
                raise ValueError(estr)
            if isinstance(val, Category):
                val.freeze()
            rhs.append(val)
            position = _PARSE_RE['whitespace'].match(s, position).end()
        rules.append(cfg.Production(lhs, rhs))
        if position < len(s):
            # Skip past the '|' separator before parsing the next RHS.
            match = _PARSE_RE['disjunct'].match(s, position)
            position = match.end()
    # Special case: if there's nothing after the arrow, it is one rule with
    # an empty RHS, instead of no rules.
    if len(rules) == 0:
        rules = [cfg.Production(lhs, ())]
    return rules
def productions(self):
    """
    Generate the productions that correspond to the non-terminal nodes
    of the tree.  For each subtree of the form (P: C1 C2 ... Cn) this
    produces a production of the form P -> C1 C2 ... Cn.

    @rtype: list of C{cfg.Production}s
    @raise TypeError: if the node label is not a string.
    """
    if not isinstance(self.node, str):
        # Fixed: was the Python 2 statement form `raise TypeError, '...'`,
        # which is a syntax error under Python 3.
        raise TypeError('Productions can only be generated from trees '
                        'having node labels that are strings')

    # Production for this node, followed by those of every subtree.
    prods = [cfg.Production(cfg.Nonterminal(self.node), _child_names(self))]
    for child in self:
        if isinstance(child, Tree):
            # Leaves contribute no productions; only recurse into subtrees.
            prods += child.productions()
    return prods
def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.
    """
    from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    VP_slash_NP = VP / NP

    # Fixed: converted Python 2 print statements and backtick-repr
    # (`x`) to Python 3 print()/repr() calls.
    print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
    print(' S.symbol() =>', repr(S.symbol()))
    print()

    print(cfg.Production(S, [NP]))

    # Create some Grammar Productions (one rule per line).
    grammar = cfg.parse_grammar("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    N -> 'dog'
    N -> 'cat'
    V -> 'chased'
    V -> 'sat'
    P -> 'on'
    P -> 'in'
    """)

    print('A Grammar:', repr(grammar))
    print(' grammar.start() =>', repr(grammar.start()))
    print(' grammar.productions() =>', end=' ')
    # Use string.replace(...) is to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
    print()
def demo():
    """
    A demonstration of the recursive descent parser.
    """
    from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg

    # Nonterminal symbols for the toy grammar.
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # The grammar: syntactic productions first, lexical productions after.
    productions = (
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['telescope']),
    )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sentence = list(tokenize.whitespace('I saw a man in the park'))

    # Run the parser with tracing on and print every parse it finds.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for tree in parser.get_parse_list(sentence):
        print(tree)
def demo3():
    """Display a selectable list of productions in a Tk window."""
    from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg

    # Build the nonterminals and a small demo grammar.
    (S, VP, NP, PP, P, N, Name, V, Det) = \
        nonterminals('S, VP, NP, PP, P, N, Name, V, Det')
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),
        cfg.Production(PP, []),
        cfg.Production(PP, ['up', 'over', NP]),
        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
    )

    # Pressing 'q' closes the window.
    root = Tk()
    root.bind('q', lambda event, root=root: root.destroy())

    plist = ProductionList(root, productions)
    plist.pack(expand=1, fill='both')
    plist.add_callback('select', plist.markonly)
    plist.add_callback('move', plist.markonly)
    plist.focus()
    plist.mark(productions[2])
    plist.mark(productions[8])
def demo2():
    """Launch the CFG demo window on a small example grammar."""
    from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg

    # One Nonterminal per space-separated symbol name.
    symbol_names = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [
        cfg.Nonterminal(name) for name in symbol_names.split()]

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),
        cfg.Production(PP, []),
        cfg.Production(PP, ['up', 'over', NP]),
        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
    )
    grammar = cfg.Grammar(S, productions)

    text = 'I saw a man in the park'.split()
    window = CFGDemo(grammar, text)
    window.mainloop()
def demo():
    """
    A demonstration of the feature-based Earley chart parser, using
    number-agreement features on determiners and nouns.
    """
    # Fixed: converted Python 2 print statements to Python 3 print()
    # calls, and dropped the unused `sys` import.
    import time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    # [-pl]/[+pl] mark singular/plural agreement.
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V, )),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))
    ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John', )),
        cfg.Production(NP, ('I', )),
        cfg.Production(Det, ('the', )),
        cfg.Production(Det, ('my', )),
        cfg.Production(Det, ('a', )),
        cfg.Production(NSg, ('dog', )),
        cfg.Production(NSg, ('cookie', )),
        cfg.Production(V, ('ate', )),
        cfg.Production(V, ('saw', )),
        cfg.Production(P, ('with', )),
        cfg.Production(P, ('under', )),
    ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)

    # The Earley lexicon maps uppercased word forms to the list of
    # candidate categories for that word.
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print("Sentence:\n", sent)
    from nodebox_linguistics_extended.parser.nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()

    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print("Time: %s" % (time.time() - t))
    for tree in trees:
        print(tree)
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of
    "domains", "hierarchies" and "tables".

    For the sake of NLP I've parsed the string using the nltk_lite
    context free grammar library.

    A query is a "sentence" and can either be a domain, hierarchy or a
    table.
    A domain is simply a word.
    A hierarchy is expressed as "domain/domain"
    A table is exressed as "table(sentence, sentence, sentence)"

    Internally the query is represented as a nltk_lite.parse.tree

    Process:
    1. string is tokenized
    2. develop a context free grammar
    3. parse
    4. convert to a tree representation
    """
    self.nltktree = None

    # Store the query string
    self.string = p_string

    # 1. Tokenize ------------------------------------------------------
    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = tokenize.regexp(self.string, re_all)

    # 2. Develop a context free grammar --------------------------------
    # S = sentence, T = table, H = hierarchy, D = domain
    O, T, H, D = cfg.nonterminals('O, T, H, D')

    # Specify the grammar
    productions = (
        # A sentence can be either a table, hierarchy or domain
        cfg.Production(O, [D]),
        cfg.Production(O, [H]),
        cfg.Production(O, [T]),
        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),
        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        cfg.Production(H, [D, '/', D]),
        # domain, forward slash, another operator
        cfg.Production(H, [D, '/', O]))

    # Add domains to the cfg productions.
    # A domain is a token that is entirely word chars.
    re_domain = compile(r'^\w+$')
    for tok in data_tokens:
        if re_domain.match(tok):
            # The trailing comma makes `prod` a 1-tuple so it can be
            # concatenated onto the productions tuple below.
            prod = cfg.Production(D, [tok]),
            productions = productions + prod

    # Make a grammar out of our productions
    grammar = cfg.Grammar(O, productions)
    rd_parser = parse.RecursiveDescent(grammar)

    # Tokens need to be redefined: tokenize.regexp returns a one-shot
    # iterator, which was exhausted while building the domain rules above.
    tokens = tokenize.regexp(self.string, re_all)
    toklist = list(tokens)

    # 3. Parse using the context free grammar --------------------------
    # Store only the first parse, as the grammar should be completely
    # nonambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        # Fixed: was the Python 2 statement `print "Could not parse query."`.
        print("Could not parse query.")
        return

    # 4. Refine and convert to a Tree representation -------------------
    # Set the nltk_lite.parse.tree tree for this query to the global
    # sentence; strip the punctuation/markers so bracket_parse accepts it.
    string = str(self.parseList)
    string2 = string.replace(":", "").replace("')'", "").replace(
        "table(", "").replace("','", "").replace("'", "").replace("/", "")
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk_lite.parse.tree tree
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and text.
    """
    from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg

    # One Nonterminal per space-separated symbol name.
    symbol_names = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [
        cfg.Nonterminal(name) for name in symbol_names.split()]

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),
        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
    )
    grammar = cfg.Grammar(S, productions)

    # Tokenize the sentence and launch the interactive demo window.
    tokens = list(tokenize.whitespace(
        'my dog saw a man in the park with a statue'))
    ShiftReduceDemo(grammar, tokens).mainloop()
class Category(FeatureStructure, cfg.Nonterminal): """ A C{Category} is a specialized feature structure, intended for use in parsing. It can act as a C{Nonterminal}. A C{Category} differs from a C{FeatureStructure} in these ways: - Categories may not be re-entrant. - Categories use value-based equality, while FeatureStructures use identity-based equality. - Strings in Categories are compared case-insensitively. - Categories have one feature marked as the 'head', which prints differently than other features if it has a value. For example, in the C{repr()} representation of a Category, the head goes to the left, on the outside of the brackets. Subclasses of C{Category} may change the feature name that is designated as the head, which is _head by default. - Subclasses of C{Category} may contain a list of I{required features}, which are names of features whose value is None if unspecified. A Category lacking a feature that is required in it will not unify with any Category that has that feature. If a required feature's value is C{None}, it is considered to be not present. (Mixing different subclasses of C{Category} is probably a bad idea.) - C{True} and C{False} are allowed as values. A feature named C{foo} with a value of C{True} is simply expressed as C{+foo}. Similarly, if it is C{False}, it is expressed as C{-foo}. """ headname = '_head' requiredFeatures = [] def __init__(self, **features): FeatureStructure.__init__(self, **features) self._required = self.__class__.requiredFeatures for name in self._required: if not self._features.has_key(name): self._features[name] = None items = self._features.items() items.sort() self._hash = None self._frozen = False self._memorepr = None def required_features(self): "@return: A list of the names of all required features." return self._required def __cmp__(self, other): return cmp(repr(self), repr(other)) def __div__(self, other): """ @return: A new Category based on this one, with its C{/} feature set to C{other}. 
""" temp = self.deepcopy() dict = temp._features dict['/'] = other return self.__class__(**dict) def __eq__(self, other): """ @return: True if C{self} and C{other} assign the same value to to every feature. In particular, return true if C{self[M{p}]==other[M{p}]} for every feature path M{p} such that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e., not a nested Category). @rtype: C{bool} """ # Get the result of equal_values, and make it a real boolean while # we're at it. if not other.__class__ == self.__class__: return False if hash(self) != hash(other): return False return (self.equal_values(other) == True) def __ne__(self, other): return not (self == other) def __hash__(self): if self._hash is not None: return self._hash items = self._features.items() items.sort() return hash(tuple(items)) def freeze(self): """ Freezing a Category memoizes its hash value, to make comparisons on it faster. After freezing, the Category and all its values are immutable. @return: self """ for val in self._features.values(): if isinstance(val, Category) and not val.frozen(): val.freeze() self._hash = hash(self) self._memorepr = self._repr({}, {}) self._frozen = True return self def frozen(self): """ Returns whether this Category is frozen (immutable). @rtype: C{bool} """ return self._frozen def __setitem__(self, name, value): if self._frozen: raise "Cannot modify a frozen Category" self._features[name] = value def symbol(self): """ @return: The node value corresponding to this C{Category}. @rtype: C{Category} """ return self def head(self): """ @return: The head of this category (the value shown outside the brackets in its string representation). If there is no head, returns None. @rtype: C{str} or C{None} """ return self._features.get(self.__class__.headname) def deepcopy(self, memo=None): """ @return: A deep copy of C{self}. """ newcopy = self.__class__() features = newcopy._features # Fill out the features. 
for (fname, fval) in self._features.items(): if isinstance(fval, FeatureStructure): features[fname] = fval.deepcopy() else: features[fname] = fval return newcopy def reentrances(self): return [] def feature_names(self): """ @return: a list of all features that have values. """ return filter(lambda x: not (x in self._required and self[x] is None), self._features.keys()) def get_feature(self, *args): try: return self.__getitem__(*args) except IndexError: return StarValue() def has_feature(self, name): return (name in self.feature_names()) ################################################################# ## Variables ################################################################# def remove_unbound_vars(self): selfcopy = self.deepcopy() selfcopy._remove_unbound_vars() return selfcopy def _remove_unbound_vars(self): for (fname, fval) in self._features.items(): if isinstance(fval, FeatureVariable): del self._features[fname] elif isinstance(fval, Category): fval._remove_unbound_vars() ################################################################# ## Unification ################################################################# def _destructively_unify(self, other, bindings, trace=False, depth=0): FeatureStructure._destructively_unify(self, other, bindings, \ trace=trace, ci_str_cmp=True, depth=depth) ################################################################# ## String Representations ################################################################# def __repr__(self): """ @return: A string representation of this feature structure. """ if self._memorepr is not None: return self._memorepr else: return self._repr({}, {}) def _repr(self, reentrances, reentrance_ids): segments = [] items = self.feature_names() items.sort() # sorting note: keys are unique strings, so we'll # never fall through to comparing values. 
for fname in items: if fname == self.__class__.headname: continue fval = self[fname] if isinstance(fval, bool): if fval: segments.append('+%s' % fname) else: segments.append('-%s' % fname) elif not isinstance(fval, Category): segments.append('%s=%r' % (fname, fval)) else: fval_repr = fval._repr(reentrances, reentrance_ids) segments.append('%s=%s' % (fname, fval_repr)) head = self._features.get(self.__class__.headname) if head is None: head = '' if head and not len(segments): return head return '%s[%s]' % (head, ', '.join(segments)) def _str(self, reentrances, reentrance_ids): # This code is very similar to FeatureStructure._str but # we print the head feature very differently, so it's hard to # combine the two methods. # Special case: if len(self.feature_names()) == 0: return ['[]'] if self.feature_names() == [self.__class__.headname]: return ['%s[]' % self[self.__class__.headname]] # What's the longest feature name? Use this to align names. maxfnamelen = max([len(k) for k in self.feature_names()]) lines = [] items = self.feature_names() items.sort() # sorting note: keys are unique strings, so we'll # never fall through to comparing values. if self.__class__.headname in items: items.remove(self.__class__.headname) # items.insert(0, self.__class__.headname) for fname in items: fval = self[fname] if not isinstance(fval, FeatureStructure): # It's not a nested feature structure -- just print it. lines.append('%s = %r' % (fname.ljust(maxfnamelen), fval)) else: # It's a new feature structure. Separate it from # other values by a blank line. if lines and lines[-1] != '': lines.append('') # Recursively print the feature's value (fval). fval_lines = fval._str(reentrances, reentrance_ids) # Indent each line to make room for fname. fval_lines = [(' '*(maxfnamelen+3))+l for l in fval_lines] # Pick which line we'll display fname on. 
nameline = (len(fval_lines)-1)/2 fval_lines[nameline] = ( fname.ljust(maxfnamelen)+' ='+ fval_lines[nameline][maxfnamelen+2:]) # Add the feature structure to the output. lines += fval_lines # Separate FeatureStructures by a blank line. lines.append('') # Get rid of any excess blank lines. if lines[-1] == '': lines = lines[:-1] # Add brackets around everything. headline = (len(lines) - 1)/2 if self.has_feature(self.__class__.headname): head = self[self.__class__.headname] else: head = '' maxlen = max([len(line) for line in lines]) for l in range(len(lines)): line = lines[l] if l == headline: lines[l] = ('%s[ %s%s ]' % (head, line, ' '*(maxlen-len(line)))) else: lines[l] = ('%s[ %s%s ]' % (' '*len(head), line, ' '*(maxlen-len(line)))) return lines ################################################################# ## Parsing ################################################################# # Regular expressions for parsing. # Extend the expressions already present in FeatureStructure._PARSE_RE _PARSE_RE = {'categorystart': re.compile(r'\s*([^\s\(\)"\'\-=,\[\]]*)\s*\['), 'bool': re.compile(r'\s*([-\+])'), 'arrow': re.compile(r'\s*->\s*'), #'application': re.compile(r'(app)\((\?[a-z][a-z]*)\s*,\s*(\?[a-z][a-z]*)\)'), 'disjunct': re.compile(r'\s*\|\s*'), 'whitespace': re.compile(r'\s*')} for (k, v) in FeatureStructure._PARSE_RE.iteritems(): assert k not in _PARSE_RE _PARSE_RE[k] = v # [classmethod] def _parse(cls, s, position=0, reentrances=None): """ Helper function that parses a Category. @param s: The string to parse. @param position: The position in the string to start parsing. @param reentrances: A dictionary from reentrance ids to values. @return: A tuple (val, pos) of the feature structure created by parsing and the position where the parsed feature structure ends. """ # A set of useful regular expressions (precompiled) _PARSE_RE = cls._PARSE_RE # Find the head, if there is one. 
match = _PARSE_RE['name'].match(s, position) if match is not None: head = match.group(1) position = match.end() else: head = None # Check that the name is followed by an open bracket. if position >= len(s) or s[position] != '[': return cls(**{cls.headname: head}), position position += 1 # If it's immediately followed by a close bracket, then just # return an empty feature structure. match = _PARSE_RE['bracket'].match(s, position) if match is not None: if head is None: return cls(), match.end() else: return cls(**{cls.headname: head}), match.end() # Build a list of the features defined by the structure. # Each feature has one of the three following forms: # name = value # +name # -name features = {} if head is not None: features[cls.headname] = head while position < len(s): # Use these variables to hold info about the feature: name = target = val = None # Is this a shorthand boolean value? match = _PARSE_RE['bool'].match(s, position) if match is not None: if match.group(1) == '+': val = True else: val = False position = match.end() # Find the next feature's name. match = _PARSE_RE['name'].match(s, position) if match is None: raise ValueError('feature name', position) name = match.group(1) position = match.end() # If it's not a shorthand boolean, it must be an assignment. if val is None: match = _PARSE_RE['assign'].match(s, position) if match is None: raise ValueError('equals sign', position) position = match.end() val, position = cls._parseval(s, position, reentrances) features[name] = val # Check for a close bracket match = _PARSE_RE['bracket'].match(s, position) if match is not None: return cls(**features), match.end() # Otherwise, there should be a comma match = _PARSE_RE['comma'].match(s, position) if match is None: raise ValueError('comma', position) position = match.end() # We never saw a close bracket. raise ValueError('close bracket', position) # [classmethod] def _parseval(cls, s, position, reentrances): """ Helper function that parses a feature value. 
Currently supports: None, bools, integers, variables, strings, nested feature structures. @param s: The string to parse. @param position: The position in the string to start parsing. @param reentrances: A dictionary from reentrance ids to values. @return: A tuple (val, pos) of the value created by parsing and the position where the parsed value ends. """ # A set of useful regular expressions (precompiled) _PARSE_RE = cls._PARSE_RE # End of string (error) if position == len(s): raise ValueError('value', position) # Semantic value of the form <app(?x, ?y) >'; return an ApplicationExpression match = _PARSE_RE['application'].match(s, position) if match is not None: fun = ParserSubstitute(match.group(2)).next() arg = ParserSubstitute(match.group(3)).next() return ApplicationExpressionSubst(fun, arg), match.end() # other semantic value enclosed by '< >'; return value given by the lambda expr parser match = _PARSE_RE['semantics'].match(s, position) if match is not None: return ParserSubstitute(match.group(1)).next(), match.end() # String value if s[position] in "'\"": start = position quotemark = s[position:position+1] position += 1 while 1: match = _PARSE_RE['stringmarker'].search(s, position) if not match: raise ValueError('close quote', position) position = match.end() if match.group() == '\\': position += 1 elif match.group() == quotemark: return eval(s[start:position]), position # Nested category if _PARSE_RE['categorystart'].match(s, position) is not None: return cls._parse(s, position, reentrances) # Variable match = _PARSE_RE['var'].match(s, position) if match is not None: return FeatureVariable.parse(match.group()), match.end() # None match = _PARSE_RE['none'].match(s, position) if match is not None: return None, match.end() # Integer value match = _PARSE_RE['int'].match(s, position) if match is not None: return int(match.group()), match.end() # Alphanumeric symbol (must be checked after integer) match = _PARSE_RE['symbol'].match(s, position) if match is not 
None: return cls(**{cls.headname: match.group()}), match.end() # We don't know how to parse this value. raise ValueError('value', position) # [classmethod] # Used by GrammarFile def parse_rules(cls, s): """ Parse a L{CFG} line involving C{Categories}. A line has this form: C{lhs -> rhs | rhs | ...} where C{lhs} is a Category, and each C{rhs} is a sequence of Categories. @returns: a list of C{Productions}, one for each C{rhs}. """ _PARSE_RE = cls._PARSE_RE position = 0 try: lhs, position = cls._parse(s, position) except ValueError, e: estr = ('Error parsing field structure\n\n\t' + s + '\n\t' + ' '*e.args[1] + '^ ' + 'Expected %s\n' % e.args[0]) raise ValueError, estr lhs.freeze() match = _PARSE_RE['arrow'].match(s, position) if match is None: raise ValueError('arrow', position) else: position = match.end() rules = [] while position < len(s): rhs = [] while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None: try: val, position = cls._parseval(s, position, {}) except ValueError, e: estr = ('Error parsing field structure\n\n\t' + s + '\n\t' + ' '*e.args[1] + '^ ' + 'Expected %s\n' % e.args[0]) raise ValueError, estr if isinstance(val, Category): val.freeze() rhs.append(val) position = _PARSE_RE['whitespace'].match(s, position).end() rules.append(cfg.Production(lhs, rhs)) if position < len(s): match = _PARSE_RE['disjunct'].match(s, position) position = match.end()
estr = ('Error parsing field structure\n\n\t' + s + '\n\t' + ' '*e.args[1] + '^ ' + 'Expected %s\n' % e.args[0]) raise ValueError, estr if isinstance(val, Category): val.freeze() rhs.append(val) position = _PARSE_RE['whitespace'].match(s, position).end() rules.append(cfg.Production(lhs, rhs)) if position < len(s): match = _PARSE_RE['disjunct'].match(s, position) position = match.end() # Special case: if there's nothing after the arrow, it is one rule with # an empty RHS, instead of no rules. if len(rules) == 0: rules = [cfg.Production(lhs, ())] return rules _parseval=classmethod(_parseval) _parse=classmethod(_parse) parse_rules=classmethod(parse_rules) class GrammarCategory(Category): """ A class of C{Category} for use in parsing. The name of the head feature in a C{GrammarCategory} is C{pos} (for "part of speech"). There is one required feature, C{/}, which is intended to indicate a type of phrase that is missing from the grammatical structure.