Beispiel #1
0
    def parse_rules(cls, s):
        """
        Parse a L{CFG} line involving C{Categories}. A line has this form:

        C{lhs -> rhs | rhs | ...}

        where C{lhs} is a Category, and each C{rhs} is a sequence of
        Categories.

        @param s: the line to parse.
        @returns: a list of C{Productions}, one for each C{rhs}.
        @raise ValueError: if C{s} cannot be parsed.
        """
        def _error_message(e):
            # Render a caret-annotated parse error from the (expected,
            # position) args carried by ValueErrors raised by the
            # low-level parsers.  Factored out: the same message was
            # previously built in two places.
            return ('Error parsing field structure\n\n\t' + s + '\n\t' +
                    ' ' * e.args[1] + '^ ' + 'Expected %s\n' % e.args[0])

        _PARSE_RE = cls._PARSE_RE
        position = 0
        try:
            # Parse the left-hand-side Category.
            lhs, position = cls._parse(s, position)
        except ValueError as e:
            raise ValueError(_error_message(e))
        lhs.freeze()

        # The LHS must be followed by an arrow.
        match = _PARSE_RE['arrow'].match(s, position)
        if match is None: raise ValueError('arrow', position)
        else: position = match.end()

        # Parse each '|'-separated right-hand side.
        rules = []
        while position < len(s):
            rhs = []
            while position < len(s) and _PARSE_RE['disjunct'].match(
                    s, position) is None:
                try:
                    val, position = cls._parseval(s, position, {})
                except ValueError as e:
                    raise ValueError(_error_message(e))
                if isinstance(val, Category): val.freeze()
                rhs.append(val)
                position = _PARSE_RE['whitespace'].match(s, position).end()
            rules.append(cfg.Production(lhs, rhs))

            if position < len(s):
                match = _PARSE_RE['disjunct'].match(s, position)
                position = match.end()

        # Special case: if there's nothing after the arrow, it is one rule
        # with an empty RHS, instead of no rules.
        if len(rules) == 0: rules = [cfg.Production(lhs, ())]
        return rules
Beispiel #2
0
    def productions(self):
        """
        Generate the productions that correspond to the non-terminal nodes
        of the tree.  For each subtree of the form (P: C1 C2 ... Cn) this
        produces a production of the form P -> C1 C2 ... Cn.

        @rtype: list of C{cfg.Production}s
        @raise TypeError: if this tree's node label is not a string.
        """
        if not isinstance(self.node, str):
            # Exception-call syntax: the old ``raise X, msg`` form is a
            # syntax error on Python 3.  (Message text unchanged.)
            raise TypeError(
                'Productions can only be generated from trees having '
                'node labels that are strings')

        # The production for this node, followed (depth-first) by the
        # productions of every subtree.
        prods = [
            cfg.Production(cfg.Nonterminal(self.node), _child_names(self))
        ]
        for child in self:
            if isinstance(child, Tree):
                prods += child.productions()
        return prods
Beispiel #3
0
def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.
    """

    from en.parser.nltk_lite.parse import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    # Slash category, shown in the listing below.
    VP_slash_NP = VP / NP

    # print() and repr() replace the Python 2 print statement and the
    # backtick-repr syntax, for consistency with the other demos.
    print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
    print('    S.symbol() =>', repr(S.symbol()))
    print()

    print(cfg.Production(S, [NP]))

    # Create some Grammar Productions
    grammar = cfg.parse_grammar("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    N -> 'dog'
    N -> 'cat'
    V -> 'chased'
    V -> 'sat'
    P -> 'on'
    P -> 'in'
    """)

    print('A Grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Use string.replace(...) to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
    print()
Beispiel #4
0
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and
    text.
    """

    from en.parser.nltk_lite.parse import cfg

    # Build one Nonterminal per symbol name.
    (S, VP, NP, PP, P, N, Name, V, Det) = [
        cfg.Nonterminal(sym) for sym in 'S VP NP PP P N Name V Det'.split()
    ]

    # Syntactic productions.
    syntax = [
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),
    ]

    # Lexical productions.
    lexicon = [
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
    ]

    grammar = cfg.Grammar(S, tuple(syntax + lexicon))

    # Tokenize the sentence.
    sent = list(tokenize.whitespace('my dog saw a man in the park with a statue'))

    ShiftReduceDemo(grammar, sent).mainloop()
Beispiel #5
0
def demo():
    import sys, time

    # Parse every grammatical category used by the demo, in one pass.
    (S, VP, NP, PP, V, N, P, Name, Det,
     DetSg, DetPl, NSg, NPl) = [
        GrammarCategory.parse(spec)
        for spec in ('S', 'VP', 'NP', 'PP', 'V', 'N', 'P', 'Name', 'Det',
                     'Det[-pl]', 'Det[+pl]', 'N[-pl]', 'N[+pl]')
    ]

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V, )),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))
    ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John', )),
        cfg.Production(NP, ('I', )),
        cfg.Production(Det, ('the', )),
        cfg.Production(Det, ('my', )),
        cfg.Production(Det, ('a', )),
        cfg.Production(NSg, ('dog', )),
        cfg.Production(NSg, ('cookie', )),
        cfg.Production(V, ('ate', )),
        cfg.Production(V, ('saw', )),
        cfg.Production(P, ('with', )),
        cfg.Production(P, ('under', )),
    ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)

    # Index the lexicon by upper-cased word form -> list of categories.
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print("Sentence:\n", sent)
    from en.parser.nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))

    # Parse with tracing, and report the elapsed wall-clock time.
    start = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print("Time: %s" % (time.time() - start))
    for tree in trees:
        print(tree)
Beispiel #6
0
def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from en.parser.nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),   cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),  cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),  cfg.Production(P, ['in']),
        cfg.Production(P, ['with']), cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),  cfg.Production(N, ['telescope'])
        )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Parse with tracing enabled and print every parse found.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        # print() function, not the Python 2 print statement.
        print(p)
Beispiel #7
0
Datei: cfg.py Projekt: mgolden/en
def demo3():
    """Show a ProductionList widget over a small demo grammar."""
    from en.parser.nltk_lite.parse import cfg

    (S, VP, NP, PP, P, N, Name, V, Det) = \
        nonterminals('S, VP, NP, PP, P, N, Name, V, Det')

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),
        cfg.Production(PP, []),
        cfg.Production(PP, ['up', 'over', NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
    )

    root = Tk()

    # Pressing 'q' closes the window.
    def quit_handler(event, window=root):
        window.destroy()

    root.bind('q', quit_handler)

    plist = ProductionList(root, productions)
    plist.pack(expand=1, fill='both')
    plist.add_callback('select', plist.markonly)
    plist.add_callback('move', plist.markonly)
    plist.focus()
    plist.mark(productions[2])
    plist.mark(productions[8])
Beispiel #8
0
Datei: cfg.py Projekt: mgolden/en
def demo2():
    """Run the CFGDemo window over a small demo grammar and sentence."""
    from en.parser.nltk_lite.parse import cfg

    # One Nonterminal per symbol name.
    (S, VP, NP, PP, P, N, Name, V, Det) = [
        cfg.Nonterminal(sym) for sym in 'S VP NP PP P N Name V Det'.split()
    ]

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),
        cfg.Production(PP, []),
        cfg.Production(PP, ['up', 'over', NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
    )
    grammar = cfg.Grammar(S, productions)

    text = 'I saw a man in the park'.split()
    window = CFGDemo(grammar, text)
    window.mainloop()
Beispiel #9
0
class Category(FeatureStructure, cfg.Nonterminal):
    """
    A C{Category} is a specialized feature structure, intended for use in
    parsing.  It can act as a C{Nonterminal}.

    A C{Category} differs from a C{FeatureStructure} in these ways:

        - Categories may not be re-entrant.

        - Categories use value-based equality, while FeatureStructures use
          identity-based equality.

        - Strings in Categories are compared case-insensitively.

        - Categories have one feature marked as the 'head', which prints
          differently than other features if it has a value. For example,
          in the C{repr()} representation of a Category, the head goes to the
          left, on the outside of the brackets. Subclasses of C{Category}
          may change the feature name that is designated as the head, which is
          _head by default.

        - Subclasses of C{Category} may contain a list of I{required features},
          which are names of features whose value is None if unspecified. A
          Category lacking a feature that is required in it will not unify with
          any Category that has that feature. If a required feature's value is
          C{None}, it is considered to be not present. (Mixing different
          subclasses of C{Category} is probably a bad idea.)

        - C{True} and C{False} are allowed as values. A feature named C{foo}
          with a value of C{True} is simply expressed as C{+foo}. Similarly, if
          it is C{False}, it is expressed as C{-foo}.
    """

    # Name of the feature treated as the head (printed outside the
    # brackets); subclasses may override.
    headname = '_head'
    # Feature names that default to None when unspecified; subclasses
    # may override.
    requiredFeatures = []
    
    def __init__(self, **features):
        """
        Construct a new Category with the given features.  Any required
        feature that is not supplied is stored with the value C{None}
        (meaning "not present").
        """
        FeatureStructure.__init__(self, **features)
        self._required = self.__class__.requiredFeatures
        for name in self._required:
            # 'name in dict' works on Python 2 and 3; has_key() is
            # Python 2-only.
            if name not in self._features:
                self._features[name] = None
        # (Removed dead code: a sorted copy of the feature items was
        # built here and never used.)
        # Hash and repr are memoized lazily by freeze().
        self._hash = None
        self._frozen = False
        self._memorepr = None

    def required_features(self):
        """
        @return: A list of the names of all required features.
        """
        return self._required

    def __cmp__(self, other):
        # Python 2 ordering hook: order Categories by their repr()
        # strings.  (__cmp__ and cmp() no longer exist on Python 3.)
        return cmp(repr(self), repr(other))
    
    def __div__(self, other):
        """
        @return: A new Category based on this one, with its C{/} feature
        set to C{other}.
        """
        # Work on a deep copy so this Category is not mutated.  The local
        # is named 'features' to avoid shadowing the builtin ``dict``.
        temp = self.deepcopy()
        features = temp._features
        features['/'] = other
        return self.__class__(**features)

    def __eq__(self, other):
        """
        @return: True if C{self} and C{other} assign the same value to
        every feature.  In particular, return true if
        C{self[M{p}]==other[M{p}]} for every feature path M{p} such
        that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e.,
        not a nested Category).
        @rtype: C{bool}
        """
        # Cheap rejections first: exact class match, then hash equality.
        if self.__class__ != other.__class__:
            return False
        if hash(self) != hash(other):
            return False
        # equal_values may return a non-bool; coerce to a real boolean.
        return self.equal_values(other) == True

    def __ne__(self, other):
        """@return: True iff C{self} and C{other} are not equal."""
        return not (self == other)

    def __hash__(self):
        """
        @return: A hash of this Category, computed from its sorted
        (feature, value) pairs.  Memoized by L{freeze}.
        """
        if self._hash is not None:
            return self._hash
        # sorted() works on both Python 2 and 3; calling .sort() on the
        # result of dict.items() fails on Python 3 (items() is a view).
        return hash(tuple(sorted(self._features.items())))
    
    def freeze(self):
        """
        Freezing a Category memoizes its hash value, to make comparisons
        on it faster.  After freezing, the Category and all its values are
        immutable.

        @return: self
        """
        # Freeze nested Categories first, depth-first.
        for value in self._features.values():
            if isinstance(value, Category) and not value.frozen():
                value.freeze()
        self._hash = hash(self)
        self._memorepr = self._repr({}, {})
        self._frozen = True
        return self

    def frozen(self):
        """
        @return: whether this Category is frozen (immutable).
        @rtype: C{bool}
        """
        return self._frozen
    
    def __setitem__(self, name, value):
        """
        Set feature C{name} to C{value}.

        @raise TypeError: if this Category has been frozen.
        """
        # Raising a bare string ("raise 'msg'") is invalid in modern
        # Python; raise a proper exception instead.
        if self._frozen:
            raise TypeError("Cannot modify a frozen Category")
        self._features[name] = value
    
    def symbol(self):
        """
        @return: The node value corresponding to this C{Category}.
        @rtype: C{Category}
        """
        # A Category acts as its own nonterminal symbol.
        return self

    def head(self):
        """
        @return: The head of this category (the value shown outside the
        brackets in its string representation), or None if there is no
        head.
        @rtype: C{str} or C{None}
        """
        return self._features.get(type(self).headname)
    
    def deepcopy(self, memo=None):
        """
        @return: A deep copy of C{self}.  (C{memo} is accepted for
        copy-protocol compatibility and ignored.)
        """
        duplicate = self.__class__()
        target = duplicate._features

        # Copy each feature, recursing into nested feature structures.
        for name, value in self._features.items():
            if isinstance(value, FeatureStructure):
                target[name] = value.deepcopy()
            else:
                target[name] = value

        return duplicate
    
    def reentrances(self):
        """Categories are never re-entrant, so this is always empty."""
        return []

    def feature_names(self):
        """
        @return: a list of all features that have values.  A required
        feature whose value is C{None} counts as "not present" and is
        omitted.
        """
        # A list comprehension (rather than filter()) returns a real list
        # on both Python 2 and 3; callers sort the result in place.
        return [name for name in self._features.keys()
                if not (name in self._required and self[name] is None)]
    
    def get_feature(self, *args):
        # Look up a feature, returning a StarValue wildcard when the
        # lookup fails.  NOTE(review): this catches IndexError; if the
        # inherited __getitem__ raises KeyError for missing features
        # instead, the fallback never triggers -- confirm against
        # FeatureStructure.__getitem__.
        try:
            return self.__getitem__(*args)
        except IndexError:
            return StarValue()
    
    def has_feature(self, name):
        """@return: True if C{name} is among this Category's feature names."""
        return name in self.feature_names()

    #################################################################
    ## Variables
    #################################################################
    
    def remove_unbound_vars(self):
        """
        @return: a copy of this Category with all unbound variables
        removed (the original is left untouched).
        """
        duplicate = self.deepcopy()
        duplicate._remove_unbound_vars()
        return duplicate

    def _remove_unbound_vars(self):
        """Destructively delete FeatureVariable values, recursing into
        nested Categories."""
        # Iterate over a snapshot (list(...)): the dict is mutated during
        # iteration, and dict.items() is a live view on Python 3.
        for fname, fval in list(self._features.items()):
            if isinstance(fval, FeatureVariable):
                del self._features[fname]
            elif isinstance(fval, Category):
                fval._remove_unbound_vars()

    #################################################################
    ## Unification
    #################################################################
 
    def _destructively_unify(self, other, bindings, trace=False, depth=0):
        # Delegate to the base implementation, forcing case-insensitive
        # string comparison (Category strings compare case-insensitively).
        FeatureStructure._destructively_unify(
            self, other, bindings, trace=trace, ci_str_cmp=True, depth=depth)
  
    #################################################################
    ## String Representations
    #################################################################


    def __repr__(self):
        """
        @return: A string representation of this feature structure.
        """
        # Use the repr memoized by freeze() when available.
        if self._memorepr is None:
            return self._repr({}, {})
        return self._memorepr
    
    def _repr(self, reentrances, reentrance_ids):
        """Build the repr string: the head (if any) outside the brackets,
        the remaining features sorted by name inside them."""
        segments = []

        # Sorting note: keys are unique strings, so we'll never fall
        # through to comparing values.
        for fname in sorted(self.feature_names()):
            if fname == self.__class__.headname:
                continue
            fval = self[fname]
            if isinstance(fval, bool):
                # Booleans print as the +feature / -feature shorthand.
                segments.append(('+%s' if fval else '-%s') % fname)
            elif isinstance(fval, Category):
                nested = fval._repr(reentrances, reentrance_ids)
                segments.append('%s=%s' % (fname, nested))
            else:
                segments.append('%s=%r' % (fname, fval))

        head = self._features.get(self.__class__.headname)
        if head is None:
            head = ''
        if head and not segments:
            return head
        return '%s[%s]' % (head, ', '.join(segments))

    def _str(self, reentrances, reentrance_ids):
        """
        Lay out this Category as a list of text lines, with the head
        printed to the left of the bracketed feature block.

        @param reentrances: passed through to nested values (Categories
            themselves are never re-entrant).
        @param reentrance_ids: likewise passed through.
        @return: a list of strings, one per output line.
        """
        # This code is very similar to FeatureStructure._str but
        # we print the head feature very differently, so it's hard to
        # combine the two methods.

        # Special case:
        if len(self.feature_names()) == 0:
            return ['[]']
        if self.feature_names() == [self.__class__.headname]:
            return ['%s[]' % self[self.__class__.headname]]

        # What's the longest feature name?  Use this to align names.
        maxfnamelen = max([len(k) for k in self.feature_names()])

        lines = []
        items = self.feature_names()
        items.sort() # sorting note: keys are unique strings, so we'll
                     # never fall through to comparing values.
        if self.__class__.headname in items:
            items.remove(self.__class__.headname)
            # items.insert(0, self.__class__.headname)
        for fname in items:
            fval = self[fname]

            if not isinstance(fval, FeatureStructure):
                # It's not a nested feature structure -- just print it.
                lines.append('%s = %r' % (fname.ljust(maxfnamelen), fval))

            else:
                # It's a new feature structure.  Separate it from
                # other values by a blank line.
                if lines and lines[-1] != '': lines.append('')

                # Recursively print the feature's value (fval).
                fval_lines = fval._str(reentrances, reentrance_ids)

                # Indent each line to make room for fname.
                fval_lines = [(' '*(maxfnamelen+3))+l for l in fval_lines]

                # Pick which line we'll display fname on.  Floor division
                # ('//') keeps this an int index on Python 3; on Python 2
                # ints, '/' and '//' are identical.
                nameline = (len(fval_lines)-1)//2

                fval_lines[nameline] = (
                        fname.ljust(maxfnamelen)+' ='+
                        fval_lines[nameline][maxfnamelen+2:])

                # Add the feature structure to the output.
                lines += fval_lines

                # Separate FeatureStructures by a blank line.
                lines.append('')

        # Get rid of any excess blank lines.
        if lines[-1] == '': lines = lines[:-1]

        # Add brackets around everything.  Floor division again keeps the
        # index an int on Python 3.
        headline = (len(lines) - 1)//2
        if self.has_feature(self.__class__.headname):
            head = self[self.__class__.headname]
        else:
            head = ''
        maxlen = max([len(line) for line in lines])
        for l in range(len(lines)):
            line = lines[l]
            if l == headline:
                lines[l] = ('%s[ %s%s ]' % (head, line, ' '*(maxlen-len(line))))
            else:
                lines[l] = ('%s[ %s%s ]' % (' '*len(head), line, ' '*(maxlen-len(line))))

        return lines

    #################################################################
    ## Parsing
    #################################################################

    # Regular expressions for parsing.
    # Extend the expressions already present in FeatureStructure._PARSE_RE
    _PARSE_RE = {'categorystart': re.compile(r'\s*([^\s\(\)"\'\-=,\[\]]*)\s*\['),
                 'bool': re.compile(r'\s*([-\+])'),
                 'arrow': re.compile(r'\s*->\s*'),
                #'application': re.compile(r'(app)\((\?[a-z][a-z]*)\s*,\s*(\?[a-z][a-z]*)\)'),
                 'disjunct': re.compile(r'\s*\|\s*'),
                 'whitespace': re.compile(r'\s*')}
    # Merge in the inherited expressions.  Use .items() rather than the
    # Python 2-only .iteritems() so this also runs on Python 3.
    for (k, v) in FeatureStructure._PARSE_RE.items():
        assert k not in _PARSE_RE
        _PARSE_RE[k] = v
    
    # [classmethod]
    def _parse(cls, s, position=0, reentrances=None):
        """
        Helper function that parses a Category.
        @param s: The string to parse.
        @param position: The position in the string to start parsing.
        @param reentrances: A dictionary from reentrance ids to values.
        @return: A tuple (val, pos) of the feature structure created
            by parsing and the position where the parsed feature
            structure ends.
        @raise ValueError: with args (expected-token, position) on a
            parse error; callers format these into messages.
        """
        # A set of useful regular expressions (precompiled)
        _PARSE_RE = cls._PARSE_RE

        # Find the head, if there is one.
        match = _PARSE_RE['name'].match(s, position)
        if match is not None:
            head = match.group(1)
            position = match.end()
        else: head = None

        # Check that the name is followed by an open bracket.  If it is
        # not, the whole category is just a bare head (e.g. "NP").
        if position >= len(s) or s[position] != '[':
            return cls(**{cls.headname: head}), position
        position += 1

        # If it's immediately followed by a close bracket, then just
        # return an empty feature structure.
        match = _PARSE_RE['bracket'].match(s, position)
        if match is not None:
            if head is None: return cls(), match.end()
            else: return cls(**{cls.headname: head}), match.end()

        # Build a list of the features defined by the structure.
        # Each feature has one of the three following forms:
        #     name = value
        #     +name
        #     -name
        features = {}
        if head is not None: features[cls.headname] = head
        while position < len(s):
            # Use these variables to hold info about the feature:
            name = target = val = None

            # Is this a shorthand boolean value?
            match = _PARSE_RE['bool'].match(s, position)
            if match is not None:
                if match.group(1) == '+': val = True
                else: val = False
                position = match.end()

            # Find the next feature's name.
            match = _PARSE_RE['name'].match(s, position)
            if match is None: raise ValueError('feature name', position)
            name = match.group(1)
            position = match.end()

            # If it's not a shorthand boolean, it must be an assignment.
            if val is None:
                match = _PARSE_RE['assign'].match(s, position)
                if match is None: raise ValueError('equals sign', position)
                position = match.end()
                val, position = cls._parseval(s, position, reentrances)
            features[name] = val

            # Check for a close bracket
            match = _PARSE_RE['bracket'].match(s, position)
            if match is not None:
                return cls(**features), match.end()

            # Otherwise, there should be a comma
            match = _PARSE_RE['comma'].match(s, position)
            if match is None: raise ValueError('comma', position)
            position = match.end()

        # We never saw a close bracket.
        raise ValueError('close bracket', position)

    # [classmethod]
    def _parseval(cls, s, position, reentrances):
        """
        Helper function that parses a feature value.  Currently
        supports: None, bools, integers, variables, strings, nested feature
        structures.
        @param s: The string to parse.
        @param position: The position in the string to start parsing.
        @param reentrances: A dictionary from reentrance ids to values.
        @return: A tuple (val, pos) of the value created by parsing
            and the position where the parsed value ends.
        @raise ValueError: with args (expected-token, position) on a
            parse error.
        """
        # A set of useful regular expressions (precompiled)
        _PARSE_RE = cls._PARSE_RE

        # End of string (error)
        if position == len(s): raise ValueError('value', position)

        # Semantic value of the form <app(?x, ?y) >'; return an ApplicationExpression
        # NOTE(review): .next() is Python 2-only iterator syntax
        # (next(it) on Python 3).
        match = _PARSE_RE['application'].match(s, position)
        if match is not None:
            fun = ParserSubstitute(match.group(2)).next()
            arg = ParserSubstitute(match.group(3)).next()
            return ApplicationExpressionSubst(fun, arg), match.end()

        # other semantic value enclosed by '< >'; return value given by the lambda expr parser
        match = _PARSE_RE['semantics'].match(s, position)
        if match is not None:
            return ParserSubstitute(match.group(1)).next(), match.end()

        # String value
        if s[position] in "'\"":
            start = position
            quotemark = s[position:position+1]
            position += 1
            while 1:
                match = _PARSE_RE['stringmarker'].search(s, position)
                if not match: raise ValueError('close quote', position)
                position = match.end()
                if match.group() == '\\': position += 1
                elif match.group() == quotemark:
                    # HACK: eval() interprets the quoted/escaped string
                    # literal.  Do not feed untrusted input to this parser
                    # without replacing this (e.g. ast.literal_eval).
                    return eval(s[start:position]), position

        # Nested category
        if _PARSE_RE['categorystart'].match(s, position) is not None:
            return cls._parse(s, position, reentrances)

        # Variable
        match = _PARSE_RE['var'].match(s, position)
        if match is not None:
            return FeatureVariable.parse(match.group()), match.end()

        # None
        match = _PARSE_RE['none'].match(s, position)
        if match is not None:
            return None, match.end()

        # Integer value
        match = _PARSE_RE['int'].match(s, position)
        if match is not None:
            return int(match.group()), match.end()

        # Alphanumeric symbol (must be checked after integer)
        match = _PARSE_RE['symbol'].match(s, position)
        if match is not None:
            return cls(**{cls.headname: match.group()}), match.end()

        # We don't know how to parse this value.
        raise ValueError('value', position)
    
    # [classmethod]
    # Used by GrammarFile
    def parse_rules(cls, s):
        """
        Parse a L{CFG} line involving C{Categories}. A line has this form:

        C{lhs -> rhs | rhs | ...}

        where C{lhs} is a Category, and each C{rhs} is a sequence of
        Categories.

        @returns: a list of C{Productions}, one for each C{rhs}.
        @raise ValueError: if C{s} cannot be parsed.
        """
        def _error_message(e):
            # Caret-annotated message built from the (expected, position)
            # args of a low-level parse error; factored out of the two
            # except clauses below.
            return ('Error parsing field structure\n\n\t' +
                    s + '\n\t' + ' ' * e.args[1] + '^ ' +
                    'Expected %s\n' % e.args[0])

        _PARSE_RE = cls._PARSE_RE
        position = 0
        try:
            lhs, position = cls._parse(s, position)
        except ValueError as e:
            # 'except ... as' and exception-call syntax work on both
            # Python 2.6+ and Python 3 (the old comma forms do not).
            raise ValueError(_error_message(e))
        lhs.freeze()

        match = _PARSE_RE['arrow'].match(s, position)
        if match is None: raise ValueError('arrow', position)
        else: position = match.end()
        rules = []
        while position < len(s):
            rhs = []
            while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None:
                try:
                    val, position = cls._parseval(s, position, {})
                except ValueError as e:
                    raise ValueError(_error_message(e))
                if isinstance(val, Category): val.freeze()
                rhs.append(val)
                position = _PARSE_RE['whitespace'].match(s, position).end()
            rules.append(cfg.Production(lhs, rhs))

            if position < len(s):
                match = _PARSE_RE['disjunct'].match(s, position)
                position = match.end()

        # Special case: if there's nothing after the arrow, it is one rule with
        # an empty RHS, instead of no rules.
        if len(rules) == 0: rules = [cfg.Production(lhs, ())]
        return rules

    # Register the parsing helpers as classmethods.  This pre-decorator
    # registration style (Python 2.3-era) keeps the 'def' bodies above
    # free of decorators.
    _parseval=classmethod(_parseval)
    _parse=classmethod(_parse)
    parse_rules=classmethod(parse_rules)


class GrammarCategory(Category):
    """
    A class of C{Category} for use in parsing.

    The name of the head feature in a C{GrammarCategory} is C{pos} (for "part
    of speech"). There is one required feature, C{/}, which is intended to
    indicate a type of phrase that is missing from the grammatical structure.
Beispiel #11
0
    def parse(self, p_string):
        """
        Parses a string and stores the resulting hierarchy of "domains"
        "hierarchies" and "tables".

        For the sake of NLP I've parsed the string using the nltk_lite
        context free grammar library.

        A query is a "sentence" and can either be a domain, hierarchy or a table.
        A domain is simply a word.
        A hierarchy is expressed as "domain/domain"
        A table is expressed as "table(sentence, sentence, sentence)"

        Internally the query is represented as a nltk_lite.parse.tree

        Process:
          1. string is tokenized
          2. develop a context free grammar
          3. parse
          4. convert to a tree representation

        @param p_string: the query string to parse.
        """
        self.nltktree = None

        # Store the query string
        self.string = p_string

        # 1. Tokenize ------------------------------------------------------
        # Tokenize the query string, allowing only strings, parentheses,
        # forward slashes and commas.
        re_all = r'table[(]|\,|[)]|[/]|\w+'
        data_tokens = tokenize.regexp(self.string, re_all)

        # 2. Develop a context free grammar --------------------------------
        # O = sentence, T = table, H = hierarchy, D = domain
        O, T, H, D = cfg.nonterminals('O, T, H, D')

        # Specify the grammar
        productions = (
            # A sentence can be either a table, hierarchy or domain
            cfg.Production(O, [D]),
            cfg.Production(O, [H]),
            cfg.Production(O, [T]),

            # A table must be the following sequence:
            # "table(", sentence, comma, sentence, comma, sentence, ")"
            cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),

            # A hierarchy must be the following sequence:
            # domain, forward slash, domain
            cfg.Production(H, [D, '/', D]),
            # domain, forward slash, another operator
            cfg.Production(H, [D, '/', O]))

        # Add domains to the cfg productions.
        # A domain is a token that is entirely word chars.
        # NOTE(review): bare 'compile' is assumed to be re.compile brought
        # in by a module-level import -- confirm against the file's imports.
        re_domain = compile(r'^\w+$')
        # Try every token and add if it matches the above regular expression
        for tok in data_tokens:
            if re_domain.match(tok):
                # The trailing comma deliberately builds a 1-tuple so it
                # can be concatenated onto the productions tuple.
                prod = cfg.Production(D, [tok]),
                productions = productions + prod

        # Make a grammar out of our productions
        grammar = cfg.Grammar(O, productions)
        rd_parser = parse.RecursiveDescent(grammar)

        # Tokens need to be redefined.
        # It disappears after first use, and I don't know why.
        tokens = tokenize.regexp(self.string, re_all)
        toklist = list(tokens)

        # 3. Parse using the context free grammar --------------------------
        # Store the parsing.
        # Only the first one, as the grammar should be completely nonambiguous.
        try:
            self.parseList = rd_parser.get_parse_list(toklist)[0]
        except IndexError:
            # print() function for Python 3 compatibility (this was a
            # Python 2 print statement).
            print("Could not parse query.")
            return

        # 4. Refine and convert to a Tree representation -------------------
        # Set the nltk_lite.parse.tree tree for this query to the global sentence
        string = str(self.parseList)
        string2 = string.replace(":", "").replace("')'", "").replace(
            "table(", "").replace("','", "").replace("'", "").replace("/", "")
        self.nltktree = parse.tree.bracket_parse(string2)

        # Store the resulting nltk_lite.parse.tree tree
        self.parseTree = QuerySentence(self.nltktree)
        self.xml = self.parseTree.toXML()