Example #1
 def token(self, name, value, source):
     #print "tok:", name, "->", source.debug()
     #print "Token", name, value
     if name == "SYMDEF":
         return value
     elif name == "STRING":
         tok = self.tokens.get(value)
         if not tok:
             tok = Token(value)
             self.tokens[value] = tok
         return tok
     elif name == "SYMBOL":
         sym = self.terminals.get(value)
         if not sym:
             sym = Token(value)
             self.terminals[value] = sym
         return sym
     elif name in (
             '*',
             '+',
             '(',
             '[',
             ']',
             ')',
             '|',
     ):
         return name
     return BuilderToken(name, value)
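This example dispatches on the token name reported by the lexer and caches Token objects per value, so repeated strings and symbols map to the same instance. A minimal, self-contained sketch of that get-or-create caching pattern (the Token stand-in and the cache class below are assumptions for illustration, not the library's actual classes):

class Token:
    def __init__(self, name, value=None):
        self.name = name
        self.value = value

class TokenCache:
    """Sketch: return the same Token instance for the same value."""
    def __init__(self):
        self.tokens = {}

    def get_or_create(self, value):
        tok = self.tokens.get(value)
        if tok is None:
            tok = Token(value)
            self.tokens[value] = tok
        return tok

cache = TokenCache()
assert cache.get_or_create("':'") is cache.get_or_create("':'")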
Example #2
def grammar_grammar():
    """Builds the grammar for the grammar file
    """
    # star: '*' | '+'
    star = Alternative("star", Token('*'), Token('+'))
    star_opt = KleeneStar("star_opt", 0, 1, rule=star)

    # rule: SYMBOL ':' alternative
    symbol = Sequence("symbol", Token('SYMBOL'), star_opt)
    symboldef = Token("SYMDEF")
    alternative = Sequence("alternative")
    rule = Sequence("rule", symboldef, alternative)

    # grammar: rule+
    grammar = KleeneStar("grammar", _min=1, rule=rule)

    # alternative: sequence ( '|' sequence )*
    sequence = KleeneStar("sequence", 1)
    seq_cont_list = Sequence("seq_cont_list", Token('|'), sequence)
    sequence_cont = KleeneStar("sequence_cont", 0, rule=seq_cont_list)

    alternative.args = [sequence, sequence_cont]

    # option: '[' alternative ']'
    option = Sequence("option", Token('['), alternative, Token(']'))

    # group: '(' alternative ')'
    group = Sequence("group", Token('('), alternative, Token(')'), star_opt)

    # sequence: (SYMBOL | STRING | option | group )+
    string = Token('STRING')
    alt = Alternative("sequence_alt", symbol, string, option, group)
    sequence.args = [alt]

    return grammar
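grammar_grammar builds a recursive grammar: alternative refers to sequence, which refers back to alternative through option and group. The cycle is broken by creating the Sequence with no children first and assigning alternative.args afterwards. A minimal sketch of that forward-reference pattern with an assumed stand-in Rule class (not the library's combinators):

class Rule:
    """Sketch of a grammar node that can be created before its children exist."""
    def __init__(self, name, *args):
        self.name = name
        self.args = list(args)

alternative = Rule("alternative")               # forward declaration, children unknown yet
option = Rule("option", "[", alternative, "]")
sequence = Rule("sequence", option)
alternative.args = [sequence]                   # close the cycle after the fact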
Example #3
 def visit_STRING(self, node):
     value = node.value
     tok = self.tokens.get(value)
     if not tok:
         if pylexer.py_punct.match(value):
             tok = Token(value)
         elif pylexer.py_name.match(value):
             tok = Token('NAME', value)
         else:
             raise SyntaxError("Unknown STRING value ('%s')" % value)
         self.tokens[value] = tok
     return tok
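The visitor distinguishes punctuation from keyword-like names by matching the string against two regular expressions, pylexer.py_punct and pylexer.py_name. A rough, self-contained sketch of that classification; the regexes below are assumptions for illustration, not the actual pylexer definitions:

import re

py_punct = re.compile(r"[^\w\s]+$")     # assumed: pure punctuation, e.g. ':' or '**'
py_name = re.compile(r"[A-Za-z_]\w*$")  # assumed: identifier/keyword, e.g. 'if'

def classify(value):
    if py_punct.match(value):
        return "punctuation token"
    elif py_name.match(value):
        return "keyword, handled as a NAME token"
    raise SyntaxError("Unknown STRING value ('%s')" % value)

print(classify(":"))    # punctuation token
print(classify("if"))   # keyword, handled as a NAME token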
Example #4
 def handle_TOK_STRING( self, name, value ):
     if value in self.parser.tok_values:
         # punctuation
         tokencode = self.parser.tok_values[value]
         tok = Token(self.parser, tokencode, None)
     else:
         if not is_py_name(value):
             raise RuntimeError("Unknown STRING value ('%s')" % value)
         # assume a keyword
         tok = Token(self.parser, self.parser.tokens['NAME'], value)
         if value not in self.keywords:
             self.keywords.append(value)
     self.rule_stack.append(tok)
Example #5
    def next(self):
        """returns the next token"""
        # We only support 1-lookahead which
        # means backtracking more than one token
        # will re-tokenize the stream (but this is the
        # grammar lexer so we don't care really!)
        _p = self.parser
        if self._peeked is not None:
            peeked = self._peeked
            self._peeked = None
            return peeked

        pos = self.pos
        inp = self.input
        end = len(self.input)
        pos = self.skip_empty_lines(inp, pos, end)
        if pos == end:
            return Token(_p, _p.EOF, None)

        # at this point nextchar is not a white space nor \n
        nextchr = inp[pos]
        if nextchr == "'":
            npos = self.match_string(inp, pos, end)
            # could get a string terminated by EOF here
            if npos == end and inp[end - 1] != "'":
                self.RaiseError("Unterminated string")
            self.pos = npos
            _endpos = npos - 1
            assert _endpos >= 0
            return Token(_p, _p.TOK_STRING, inp[pos + 1:_endpos])
        else:
            npos = match_symbol(inp, pos, end)
            if npos != pos:
                self.pos = npos
                if npos != end and inp[npos] == ":":
                    self.pos += 1
                    return Token(_p, _p.TOK_SYMDEF, inp[pos:npos])
                else:
                    return Token(_p, _p.TOK_SYMBOL, inp[pos:npos])

        # we still have pos!=end here
        chr = inp[pos]
        if chr in "[]()*+|":
            self.pos = pos + 1
            return Token(_p, _p.tok_values[chr], chr)
        self.RaiseError("Unknown token")
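As the comment in next() notes, this lexer supports exactly one token of lookahead: a companion peek() presumably stores the upcoming token in self._peeked, and next() drains that one-slot buffer before scanning the input again. A minimal, self-contained sketch of the same buffering scheme, independent of the parser classes used above:

class TinyLexer:
    """Sketch of 1-token lookahead: peek() fills a one-slot buffer, next() drains it."""
    def __init__(self, tokens):
        self._iter = iter(tokens)
        self._peeked = None

    def peek(self):
        if self._peeked is None:
            self._peeked = next(self._iter, None)
        return self._peeked

    def next(self):
        if self._peeked is not None:
            peeked, self._peeked = self._peeked, None
            return peeked
        return next(self._iter, None)

lex = TinyLexer(["a", "+", "b"])
assert lex.peek() == "a" and lex.next() == "a" and lex.next() == "+"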
Example #6
    def visit_symbol(self, node):
        star_opt = node.nodes[1]
        sym = node.nodes[0].value
        terminal = self.terminals.get(sym)
        if not terminal:
            terminal = Token(sym)
            self.terminals[sym] = terminal

        return self.repeat(star_opt, terminal)
Example #7
    n_dev = int(n * 0.1)
    n_test = int(n * 0.1)
    train_data = data[:n_train]
    dev_data = data[n_train:n_train + n_dev]
    test_data = data[n_train + n_dev:]

    # train_data = train_data[:100]

    print('done')
    print(
        f'-> {n} sentences loaded ({n_train} train, {n_dev} dev, {n_test} test)'
    )

    # create PCFG
    print('Computing PCFG from training corpus...', end='', flush=True)
    grammar = PCFG(train_data, Token('SENT'))
    print('done')

    # transform PCFG so that it is in Chomsky normal form
    print('Converting PCFG to Chomsky normal form...', end='', flush=True)
    grammar.chomsky_normal_form()
    print('done')

    # compute grammar stats
    print('Retrieving grammar symbols and productions...', end='', flush=True)
    grammar.compute_symbols()
    grammar.compute_productions()
    print('done')
    print(
        f'-> {grammar.n_all_symbols} non-terminal symbols ({grammar.n_corpus_symbols} originals, '
        f'{grammar.n_artificial_symbols} artificials), {grammar.n_terminals} terminal words'
    )
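The fragment above splits a parsed corpus into train/dev/test portions before building the PCFG; n_train is defined earlier in the truncated function, presumably as the remaining share after the two 10% slices. A self-contained sketch of the same split, with the 80/10/10 proportions as an assumption:

def split_corpus(data):
    """Sketch: assumed 80/10/10 split mirroring the snippet above."""
    n = len(data)
    n_dev = int(n * 0.1)
    n_test = int(n * 0.1)
    n_train = n - n_dev - n_test
    return (data[:n_train],
            data[n_train:n_train + n_dev],
            data[n_train + n_dev:])

train, dev, test = split_corpus(list(range(100)))
assert len(train) == 80 and len(dev) == 10 and len(test) == 10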
Example #8
def test1():
    # Test grammar 1

    print("------ Test 1 ---------")

    # Abbreviations
    NT = Nonterminal
    TERM = Terminal

    # Hard-coded test case - grammar not read from file

    E = NT('E')
    T = NT('T')
    P = NT('P')
    plus = TERM('+')
    mult = TERM('*')
    ident = TERM('ident')

    g = {
        E: [Production(rhs=[E, plus, T]),
            Production(rhs=[T])],
        T: [Production(rhs=[T, mult, P]),
            Production(rhs=[P])],
        P: [Production(rhs=[ident])],
    }

    p = Parser(g, E)
    s = [
        Token('ident', 'a'),
        Token('*', '*'),
        Token('ident', 'b'),
        Token('+', '+'),
        Token('ident', 'c'),
        Token('*', '*'),
        Token('ident', 'd'),
        Token('+', '+'),
        Token('ident', 'e'),
        Token('+', '+'),
        Token('ident', 'f')
    ]

    forest = p.go(s)

    print("Parse combinations: {0}".format(Parser.num_combinations(forest)))

    ParseForestPrinter.print_forest(forest)
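For reference, the hard-coded grammar dict above encodes the textbook left-recursive expression grammar, and the token list s spells out one concrete input; written out as comments, since the original grammar file format is not shown here:

# E -> E '+' T | T
# T -> T '*' P | P
# P -> ident
#
# input token stream: a * b + c * d + e + f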
Example #9
 def make_token(w):
     # capitalised words become name tokens ('nafn' = name);
     # everything else becomes an ordinary word token ('orð' = word)
     if w[0].isupper():
         return NameToken('nafn', w)
     return Token('orð', w)
Example #10
 def get_token(self, codename ):
     """Returns a new or existing Token"""
     if codename in self.tokens:
         return self.tokens[codename]
     token = self.tokens[codename] = Token(self.parser, codename, None)
     return token
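The lookup in get_token is the same get-or-create pattern seen in Examples #1 and #6. It could also be written with dict.setdefault, at the cost of constructing a Token even on a cache hit, which the explicit membership test above avoids. A short sketch with an assumed Token stand-in:

class Token:
    def __init__(self, parser, codename, value):
        self.parser, self.codename, self.value = parser, codename, value

tokens = {}

def get_token(parser, codename):
    # setdefault variant: note the Token is built even when the cache already has one
    return tokens.setdefault(codename, Token(parser, codename, None))

assert get_token(None, "NAME") is get_token(None, "NAME")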