def token(self, name, value, source):
    """Maps a lexer token to a grammar object, caching STRING and
    SYMBOL tokens so each distinct value maps to a single Token."""
    # print "tok:", name, "->", source.debug()
    # print "Token", name, value
    if name == "SYMDEF":
        return value
    elif name == "STRING":
        tok = self.tokens.get(value)
        if not tok:
            tok = Token(value)
            self.tokens[value] = tok
        return tok
    elif name == "SYMBOL":
        sym = self.terminals.get(value)
        if not sym:
            sym = Token(value)
            self.terminals[value] = sym
        return sym
    elif name in ('*', '+', '(', '[', ']', ')', '|'):
        return name
    return BuilderToken(name, value)
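# Hand-written summary of the dispatch above (derived from the code itself,
# not from any external spec):
#   "SYMDEF"                     -> the raw value (name of the rule being defined)
#   "STRING"                     -> a Token(value) cached in self.tokens
#   "SYMBOL"                     -> a Token(value) cached in self.terminals
#   '*' '+' '(' '[' ']' ')' '|' -> the operator character itself
#   anything else                -> BuilderToken(name, value)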
def grammar_grammar():
    """Builds the grammar for the grammar file."""
    # star: '*' | '+'
    star = Alternative("star", Token('*'), Token('+'))
    star_opt = KleeneStar("star_opt", 0, 1, rule=star)

    # rule: SYMBOL ':' alternative
    symbol = Sequence("symbol", Token('SYMBOL'), star_opt)
    symboldef = Token("SYMDEF")
    alternative = Sequence("alternative")
    rule = Sequence("rule", symboldef, alternative)

    # grammar: rule+
    grammar = KleeneStar("grammar", _min=1, rule=rule)

    # alternative: sequence ( '|' sequence )*
    sequence = KleeneStar("sequence", 1)
    seq_cont_list = Sequence("seq_cont_list", Token('|'), sequence)
    sequence_cont = KleeneStar("sequence_cont", 0, rule=seq_cont_list)
    alternative.args = [sequence, sequence_cont]

    # option: '[' alternative ']'
    option = Sequence("option", Token('['), alternative, Token(']'))

    # group: '(' alternative ')'
    group = Sequence("group", Token('('), alternative, Token(')'), star_opt)

    # sequence: (SYMBOL | STRING | option | group)+
    string = Token('STRING')
    alt = Alternative("sequence_alt", symbol, string, option, group)
    sequence.args = [alt]

    return grammar
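# Hedged usage sketch for grammar_grammar(). GrammarSource and the match()
# entry point are assumptions about the surrounding package, named here only
# for illustration: the returned root rule is run over the text of a grammar
# file to bootstrap the real grammar.
def parse_grammar_text(text, builder):
    root = grammar_grammar()
    source = GrammarSource(text)    # hypothetical grammar-file tokenizer
    root.match(source, builder)     # hypothetical recursive-match driver
    return builder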
def visit_STRING(self, node):
    value = node.value
    tok = self.tokens.get(value)
    if not tok:
        if pylexer.py_punct.match(value):
            tok = Token(value)
        elif pylexer.py_name.match(value):
            tok = Token('NAME', value)
        else:
            raise SyntaxError("Unknown STRING value ('%s')" % value)
        self.tokens[value] = tok
    return tok
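# Illustrative behavior of visit_STRING, assuming pylexer.py_punct matches
# Python punctuation and pylexer.py_name matches identifiers (both regexes
# live outside this excerpt):
#   value == '+'      -> Token('+')              (punctuation terminal)
#   value == 'while'  -> Token('NAME', 'while')  (keyword treated as a NAME)
#   value == '1abc'   -> SyntaxError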
def handle_TOK_STRING(self, name, value):
    if value in self.parser.tok_values:
        # punctuation
        tokencode = self.parser.tok_values[value]
        tok = Token(self.parser, tokencode, None)
    else:
        if not is_py_name(value):
            raise RuntimeError("Unknown STRING value ('%s')" % value)
        # assume a keyword
        tok = Token(self.parser, self.parser.tokens['NAME'], value)
        if value not in self.keywords:
            self.keywords.append(value)
    self.rule_stack.append(tok)
def next(self):
    """Returns the next token."""
    # We only support 1-lookahead, which means backtracking more than
    # one token will re-tokenize the stream (but this is the grammar
    # lexer, so we don't really care!)
    _p = self.parser
    if self._peeked is not None:
        peeked = self._peeked
        self._peeked = None
        return peeked
    pos = self.pos
    inp = self.input
    end = len(self.input)
    pos = self.skip_empty_lines(inp, pos, end)
    if pos == end:
        return Token(_p, _p.EOF, None)
    # at this point nextchr is neither whitespace nor a newline
    nextchr = inp[pos]
    if nextchr == "'":
        npos = self.match_string(inp, pos, end)
        # could get a string terminated by EOF here
        if npos == end and inp[end - 1] != "'":
            self.RaiseError("Unterminated string")
        self.pos = npos
        _endpos = npos - 1
        assert _endpos >= 0
        return Token(_p, _p.TOK_STRING, inp[pos + 1:_endpos])
    else:
        npos = match_symbol(inp, pos, end)
        if npos != pos:
            self.pos = npos
            if npos != end and inp[npos] == ":":
                self.pos += 1
                return Token(_p, _p.TOK_SYMDEF, inp[pos:npos])
            else:
                return Token(_p, _p.TOK_SYMBOL, inp[pos:npos])
    # we still have pos != end here
    chr = inp[pos]
    if chr in "[]()*+|":
        self.pos = pos + 1
        return Token(_p, _p.tok_values[chr], chr)
    self.RaiseError("Unknown token")
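# Companion sketch (an assumption, not part of the original source): the
# _peeked slot consumed at the top of next() implies a peek() helper of this
# shape, which is what provides the 1-token lookahead the comment mentions.
def peek(self):
    """Returns the next token without consuming it."""
    if self._peeked is None:
        self._peeked = self.next()
    return self._peeked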
def visit_symbol(self, node):
    star_opt = node.nodes[1]
    sym = node.nodes[0].value
    terminal = self.terminals.get(sym)
    if not terminal:
        terminal = Token(sym)
        self.terminals[sym] = terminal
    return self.repeat(star_opt, terminal)
# n, data and n_train are assumed to be set earlier in the script (this
# excerpt starts mid-way through the corpus loading step).
n_dev = int(n * 0.1)
n_test = int(n * 0.1)
train_data = data[:n_train]
dev_data = data[n_train:n_train + n_dev]
test_data = data[n_train + n_dev:]
# train_data = train_data[:100]
print('done')
print(f'-> {n} sentences loaded ({n_train} train, {n_dev} dev, {n_test} test)')

# create PCFG
print('Computing PCFG from training corpus...', end='', flush=True)
grammar = PCFG(train_data, Token('SENT'))
print('done')

# transform PCFG so that it is in Chomsky normal form
print('Converting PCFG to Chomsky normal form...', end='', flush=True)
grammar.chomsky_normal_form()
print('done')

# compute grammar stats
print('Retrieving grammar symbols and productions...', end='', flush=True)
grammar.compute_symbols()
grammar.compute_productions()
print('done')
print(
    f'-> {grammar.n_all_symbols} non-terminal symbols ({grammar.n_corpus_symbols} original, '
    f'{grammar.n_artificial_symbols} artificial), {grammar.n_terminals} terminal words'
)
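# Background on the CNF step (a general CKY/PCFG fact, stated as an assumption
# about this particular PCFG class rather than verified against it): CKY-style
# parsing needs every production to be A -> B C or A -> 'w', so
# chomsky_normal_form() binarizes longer rules by introducing artificial
# symbols, the quantity reported as n_artificial_symbols above. Schematically:
#   SENT -> NP VP PUNCT    becomes    SENT -> NP X1
#                                     X1   -> VP PUNCT    (X1 is artificial)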
def test1():
    # Test grammar 1
    print("------ Test 1 ---------")
    # Abbreviations
    NT = Nonterminal
    TERM = Terminal
    # Hard-coded test case - grammar not read from file
    E = NT('E')
    T = NT('T')
    P = NT('P')
    plus = TERM('+')
    mult = TERM('*')
    ident = TERM('ident')
    g = {
        E: [Production(rhs=[E, plus, T]), Production(rhs=[T])],
        T: [Production(rhs=[T, mult, P]), Production(rhs=[P])],
        P: [Production(rhs=[ident])],
    }
    p = Parser(g, E)
    # Token stream for: a*b + c*d + e + f
    s = [
        Token('ident', 'a'), Token('*', '*'), Token('ident', 'b'),
        Token('+', '+'), Token('ident', 'c'), Token('*', '*'),
        Token('ident', 'd'), Token('+', '+'), Token('ident', 'e'),
        Token('+', '+'), Token('ident', 'f'),
    ]
    forest = p.go(s)
    print("Parse combinations: {0}".format(Parser.num_combinations(forest)))
    ParseForestPrinter.print_forest(forest)
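# Hand derivation (not output from running the code): E -> E '+' T | T,
# T -> T '*' P | P, P -> ident is the classic unambiguous expression grammar,
# so "a*b + c*d + e + f" has exactly one parse and num_combinations(forest)
# should report 1.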
def make_token(w):
    if w[0].isupper():
        return NameToken('nafn', w)
    return Token('orð', w)
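# Minimal usage sketch. 'nafn' and 'orð' are Icelandic for "name" and "word";
# the sample sentence below is an assumption, chosen only to exercise the
# capitalization rule:
#   [make_token(w) for w in "María sá hund".split()]
#   -> a name token ('nafn', 'María'), then word tokens ('orð', 'sá') and
#      ('orð', 'hund')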
def get_token(self, codename):
    """Returns a new or existing Token."""
    if codename in self.tokens:
        return self.tokens[codename]
    token = self.tokens[codename] = Token(self.parser, codename, None)
    return token
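# Stylistic alternative (slightly different behavior, so not a drop-in
# rewrite): dict.setdefault collapses the lookup-or-create into one line, but
# it constructs the Token even when the cache already holds one:
#   def get_token(self, codename):
#       return self.tokens.setdefault(codename, Token(self.parser, codename, None))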