def tokenize_lll(text):
    """Tokenize LLL source, dropping ';' comments before delegating to parser.tokenize."""
    tokens = []
    for line in text.split('\n'):
        i = line.find(';')
        if i >= 0:
            tokens += parser.tokenize(line[:i])
        else:
            tokens += parser.tokenize(line)
    return tokens
def test_tokenize(self):
    string = "(+ 1 2)"
    answer = ["(", "+", "1", "2", ")"]
    self.assertEqual(tokenize(string), answer)
    string = "  ( + 1 2 ) "
    self.assertEqual(tokenize(string), answer)
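# A possible tokenize() consistent with the test above: padding each
# parenthesis with spaces makes a plain split() sufficient. This is a
# hedged sketch of the expected contract, not necessarily the project's
# actual implementation.
def tokenize(s):
    return s.replace('(', ' ( ').replace(')', ' ) ').split()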
def test_evaluate_missing_arguments():
    expr = parse(tokenize("(* 5)"))
    with raises(MissingArgument) as excinfo:
        evaluate(expr)
    want = "Not enough arguments for operator: '*'."
    assert want == str(excinfo.value)
def run(program):
    tokens = parser.tokenize(program)
    ast = parser.parse(tokens)
    final_val = None
    for expr in ast:
        final_val = scheme_eval(expr)
    return final_val
def test_evaluate_unknown_operator():
    expr = parse(tokenize("@"))
    with raises(UnknownOperator) as excinfo:
        evaluate(expr)
    want = "Unknown operator: '@'."
    assert want == str(excinfo.value)
def repl(input_fn=input):
    """Read-Eval-Print-Loop"""
    print(f'To exit, type {QUIT_COMMAND}', file=sys.stderr)
    while True:
        # ___________________________________________ Read
        try:
            line = input_fn('> ')
        except EOFError:
            break
        if line == QUIT_COMMAND:
            break
        if not line:
            continue
        # ___________________________________________ Eval
        current_exp = parse_exp(tokenize(line))
        try:
            value = evaluate(current_exp)
        except errors.UndefinedVariable as exc:
            print('***', exc)
            continue
        # ___________________________________________ Print
        print(value)
def ngrams(string, n=3, continuous=False):
    """ Returns a list of n-grams (tuples of n successive words) from the given string.
        Alternatively, you can supply a Text or Sentence object.
        With continuous=False, n-grams will not run over sentence markers (i.e., .!?).
    """
    def strip_period(s, punctuation=set(".:;,!?()[]'\"")):
        return [w for w in s
                  if (isinstance(w, Word) and w.string or w) not in punctuation]
    if n <= 0:
        return []
    if isinstance(string, basestring):
        sentences = [strip_period(s.split(" ")) for s in tokenize(string)]
    if isinstance(string, Sentence):
        sentences = [strip_period(string)]
    if isinstance(string, Text):
        sentences = [strip_period(s) for s in string]
    if continuous:
        sentences = [sum(sentences, [])]
    g = []
    for s in sentences:
        g.extend([tuple(s[i:i + n]) for i in range(len(s) - n + 1)])
    return g
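# Usage sketch for the continuous flag. The exact output depends on how
# tokenize() splits sentences, so the values below are an assumption:
#
#   ngrams("I eat. You drink.", n=2)
#     -> [('I', 'eat'), ('You', 'drink')]        # no bigram across '.'
#   ngrams("I eat. You drink.", n=2, continuous=True)
#     -> [('I', 'eat'), ('eat', 'You'), ('You', 'drink')]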
def demo():
    from parser import tokenize, parse_exp
    source = '(* 6 (+ 3 4))'
    tokens = tokenize(source)
    expr = parse_exp(tokens)
    result = evaluate(expr)
    print(result)
def repl(input_fn=input):
    """Read-Eval-Print-Loop"""
    print(f'To exit, type {QUIT_COMMAND}', file=sys.stderr)
    while True:
        # ___________________________________________ Read
        try:
            source = multiline_input('> ', '... ',
                                     quit_cmd=QUIT_COMMAND,
                                     input_fn=input_fn)
        except (EOFError, QuitRequest):
            break
        except errors.UnexpectedCloseParen as exc:
            print('***', exc)
            continue
        if not source:
            continue
        # ___________________________________________ Eval
        current_exp = parse_exp(tokenize(source))
        if isinstance(current_exp, list) and current_exp[0] == 'define':
            result = define_function(current_exp[1:])
        else:
            try:
                result = evaluate({}, current_exp)
            except errors.EvaluatorException as exc:
                print('***', exc)
                continue
        # ___________________________________________ Print
        print(result)
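# A hedged sketch of the multiline_input() helper the repl above relies
# on: keep reading until parentheses balance. QuitRequest, the prompt
# arguments, and the naive paren counting (which ignores strings and
# comments) are assumptions inferred from how repl() calls it.
def multiline_input(prompt1, prompt2, quit_cmd, input_fn=input):
    lines = []
    depth = 0
    prompt = prompt1
    while True:
        line = input_fn(prompt)
        if line.strip() == quit_cmd:
            raise QuitRequest()
        depth += line.count('(') - line.count(')')
        if depth < 0:
            raise errors.UnexpectedCloseParen()
        lines.append(line)
        if depth == 0:
            return ' '.join(lines).strip()
        prompt = prompt2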
def repl(input_fn=input):
    """Read-Eval-Print-Loop"""
    print(f'To exit, type {QUIT_COMMAND}', file=sys.stderr)
    while True:
        # ___________________________________________ Read
        try:
            line = input_fn('> ')
        except EOFError:
            break
        if line == QUIT_COMMAND:
            break
        if not line:
            continue
        # ___________________________________________ Eval
        current_exp = parse_exp(tokenize(line))
        if isinstance(current_exp, list) and current_exp[0] == 'define':
            result = define_function(current_exp[1:])
        else:
            try:
                result = evaluate({}, current_exp)
            except (errors.UndefinedVariable, errors.UndefinedFunction) as exc:
                print('***', exc)
                continue
        # ___________________________________________ Print
        print(result)
def test_recog(ret, scope):
    global counter, ambiguities
    recog = CppMeta.recognize(ret.text, scope)
    name = type(ret).__name__
    Cpp.incr(name, recog)
    if name not in recog:
        if name != 'CppStatement' or not recog:
            print "======================================================="
            print "Experimental recognition discrepancy:"
            print "  ", type(ret).__name__, ret.text
            print "  ", type(ret).recognize
            print "  ", tokenize(ret.text)
            print "  ", recog
            print "--------------------------------------"
        elif recog:
            # experimental recognition found something; nothing to report
            pass
    if len(recog) > 1:
        ambiguities.add(tuple(recog.iterkeys()))
        print "======================================================="
        print "Ambiguity detected !"
        print "Statement <", ret.text, "> is either one of :",
        print ', '.join(recog.iterkeys())
        print recog
        print "--------------------------------------"
    return ret
def test_evaluate_too_many_arguments():
    expr = parse(tokenize("(/ 6 7 8)"))
    with raises(TooManyArguments) as excinfo:
        evaluate(expr)
    want = "Too many arguments for operator: '/'."
    assert want == str(excinfo.value)
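# The three arity/operator tests above pin down exact messages. A minimal
# sketch of exception classes that would produce them; the real
# evaluator's class hierarchy is an assumption.
class EvaluatorException(Exception):
    pass


class UnknownOperator(EvaluatorException):
    def __init__(self, op):
        super().__init__("Unknown operator: %r." % op)


class MissingArgument(EvaluatorException):
    def __init__(self, op):
        super().__init__("Not enough arguments for operator: %r." % op)


class TooManyArguments(EvaluatorException):
    def __init__(self, op):
        super().__init__("Too many arguments for operator: %r." % op)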
def test_tokenize(self):
    text = '''
        ()
        (+ 123  ;; some comment!
           ( * xyz 34))
    '''
    expected = '( ) ( + 123 ( * xyz 34 ) )'.split()
    self.assertEqual(expected, tokenize(text))
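# One way to satisfy the comment-handling test above: drop everything
# after ';' on each line, then reuse the pad-and-split trick. A sketch of
# the contract, not the suite's actual tokenizer.
def tokenize(text):
    tokens = []
    for line in text.splitlines():
        code = line.split(';', 1)[0]  # strip ';;' comments
        tokens.extend(code.replace('(', ' ( ').replace(')', ' ) ').split())
    return tokens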
def print_token_list(data):
    for token in parser.tokenize(data):
        lineno = str(token.lineno).ljust(4)
        if token.type.isalpha():  # empiric rule
            print "line {} - {}: '{}'".format(lineno, token.type, token.value)
        else:
            print "line {} - {}".format(lineno, token.type)
def test_read_tokens(self):
    tokens = tokenize(self.test_string)
    result = read_from_tokens(tokens)
    self.assertEqual(result, ['+', 1, 2])
    self.assertEqual(parse("(+ 1 (- 2 3))"), ['+', 1, ['-', 2, 3]])
    # test a single atom
    self.assertEqual(parse("3"), 3)
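# A sketch of read_from_tokens consistent with the assertions above,
# consuming the token list destructively in the classic lis.py style;
# atom() is a hypothetical helper, not taken from the original source.
def read_from_tokens(tokens):
    if not tokens:
        raise SyntaxError('unexpected EOF while reading')
    token = tokens.pop(0)
    if token == '(':
        exp = []
        while tokens[0] != ')':
            exp.append(read_from_tokens(tokens))
        tokens.pop(0)  # discard the closing ')'
        return exp
    elif token == ')':
        raise SyntaxError('unexpected )')
    else:
        return atom(token)


def atom(token):
    # numbers become int/float; anything else stays a symbol string
    try:
        return int(token)
    except ValueError:
        try:
            return float(token)
        except ValueError:
            return token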
def test_tokenizer(self):
    bad_tests = [
        '2 3 4 f',
        '1 2 + 4/',
        '1 23 4 8 q',
    ]
    for bad in bad_tests:
        with self.assertRaises(LexicalError):
            for z in tokenize(bad):
                pass
def testAccepts(self):
    for pattern, testCases in self.getCases().items():
        sigma = set(pattern) - set('()|*')
        tokens = tokenize(pattern + '#')
        rpn = infixToPostfix(tokens)
        st, positions = syntaxTreeFromRPN(rpn)
        followpos = st.getFollowpos()
        dfa = st.toDFA(followpos, positions, sigma)
        for t, accepts in testCases.items():
            self.assertEqual(dfa.accepts(t), accepts)
def main():
    completer = AutoCompleter(['unordered_map', 'unordered_set', 'list', 'print'])
    readline.set_completer(completer.complete)
    readline.read_init_file('linereader.rc')
    while True:
        line = input('["Q" to quit]: ')
        if line.strip() == 'Q':
            break
        else:
            completer.learn(parser.tokenize(line))
def cli(argv=None):
    if not argv:
        argv = sys.argv
    arguments = docopt(__doc__, argv=argv[1:], help=True, version='0.1.3')
    entries = []
    errors = []
    colors = Histogram()
    ipth = arguments.get('INFILE')
    opth = arguments.get('--output')
    verbose = bool(arguments.get('--verbose'))
    with open(ipth, 'rb') as fh:
        idx = 0
        while True:
            linen = fh.readline()
            if not linen:
                break
            line = linen.strip()
            tokens = rolodexer.tokenize(line)
            try:
                terms = rolodexer.classify(tokens)
            except rolodexer.RolodexerError:
                errors.append(idx)
            else:
                entries.append(terms)
                if 'color' in terms:
                    colors.inc(terms.get('color'))
            idx += 1
    output_dict = {
        u"entries": entries,
        u"errors": errors,
    }
    if verbose:
        print("Entries parsed: %s" % len(entries), file=sys.stderr)
        print("Errors encountered: %s" % len(errors), file=sys.stderr)
        print_colors(colors)
    if opth == 'stdout':
        output_json = json.dumps(output_dict, **JSON_ARGS)
        print(output_json, file=sys.stdout)
    elif not exists(opth) and isdir(dirname(opth)):
        if verbose:
            print("rolodexer: saving output to %s" % opth, file=sys.stderr)
        with open(opth, 'wb') as fp:
            json.dump(output_dict, fp, **JSON_ARGS)
def parse(self):
    ok, start, end, groups = match(tokenize(self.text), 'assignment')
    if groups is None:
        return
    l, e, r = groups
    self.lvalue = l[1]  # tokens
    self.rvalue = r[1]  # tokens
    self.effect = e[0]  # group name; will be 'set' or 'update'
    if not ok:
        print "FAILURE", self.text, groups
def testMinimizeAndAccepts(self):
    for pattern, testCases in self.getCases().items():
        sigma = set(pattern) - set('()|*')
        tokens = tokenize(pattern + '#')
        rpn = infixToPostfix(tokens)
        st, positions = syntaxTreeFromRPN(rpn)
        followpos = st.getFollowpos()
        dfa = st.toDFA(followpos, positions, sigma)
        eqCls = dfa.findEquivalenceClasses(sigma)
        dfa.mergeEquivalentStates(eqCls)
        for t, accepts in testCases.items():
            self.assertEqual(dfa.accepts(t), accepts)
def testConstruct(self):
    patterns = [
        '(0|1(01*0)*1)*',
        '(01*1)*1',
        '(a|b)*abb',
        '(a|b)*',
        '(a*|b*)*',
        '((000)|(001)|(010)|(011)|(100)|(101)|(110)|(111))*',
    ]
    for p in patterns:
        sigma = set(p) - set('()|*')
        tokens = tokenize(p + '#')
        rpn = infixToPostfix(tokens)
        st, positions = syntaxTreeFromRPN(rpn)
        followpos = st.getFollowpos()
        dfa = st.toDFA(followpos, positions, sigma)
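# The DFA tests above all share one pipeline; a small driver makes the
# stages explicit. compile_pattern is a hypothetical helper, and '#' is
# the end-marker the followpos (Aho-Sethi-Ullman) construction expects.
def compile_pattern(pattern):
    sigma = set(pattern) - set('()|*')
    tokens = tokenize(pattern + '#')
    rpn = infixToPostfix(tokens)
    st, positions = syntaxTreeFromRPN(rpn)
    return st.toDFA(st.getFollowpos(), positions, sigma)

# e.g. compile_pattern('(a|b)*abb').accepts('aabb') should be True,
# since 'aabb' ends in 'abb'.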
def test_tokenize(self):
    cases = [
        ('1', ['1']),
        ('(+ 1 2)', ['(', '+', '1', '2', ')']),
        ('(+)', ['(', '+', ')']),
        ('123', ['123']),
        ('12.3', ['12.3']),
        ('-12.3', ['-12.3']),
        ('(+ 12.3 2)', ['(', '+', '12.3', '2', ')']),
        ('  ( a \n b)  ', ['(', 'a', 'b', ')']),
        ('(+ 1 (- 3 2))', ['(', '+', '1', '(', '-', '3', '2', ')', ')']),
    ]
    for e, r in cases:
        self.assertListEqual(parser.tokenize(e), r)
def testTokenize(self):
    testCases = [
        '(0|1(01*0)*1)*',
        '(01*1)*1',
        '(a|b)*abb',
        '(a|b)*',
        '(a*|b*)*',
    ]
    expected = [
        ['(', '0', '|', '1', 'CONCAT', '(', '0', 'CONCAT', '1', '*',
         'CONCAT', '0', ')', '*', 'CONCAT', '1', ')', '*'],
        ['(', '0', 'CONCAT', '1', '*', 'CONCAT', '1', ')', '*',
         'CONCAT', '1'],
        ['(', 'a', '|', 'b', ')', '*', 'CONCAT', 'a', 'CONCAT', 'b',
         'CONCAT', 'b'],
        ['(', 'a', '|', 'b', ')', '*'],
        ['(', 'a', '*', '|', 'b', '*', ')', '*'],
    ]
    for t, e in zip(testCases, expected):
        self.assertEqual(tokenize(t), e)
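# A sketch of a tokenize() matching the expectations above: tokens are
# single characters, with 'CONCAT' inserted wherever concatenation is
# implicit (between a symbol/')'/'*' and a following symbol/'('). This
# reconstructs the contract from the expected lists, not the real code.
def tokenize(pattern):
    tokens = []
    prev = None
    for ch in pattern:
        if prev is not None and prev not in '(|' and ch not in ')|*':
            tokens.append('CONCAT')
        tokens.append(ch)
        prev = ch
    return tokens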
def ngrams(string, n=3):
    """ Returns a list of n-grams (tuples of n successive words) from the given string.
        Alternatively, you can supply a Text or Sentence object.
        n-grams will not run over sentence markers (i.e., .!?).
    """
    def strip_period(s, punctuation=set(".:;,!?()[]'\"")):
        return [w for w in s
                  if (isinstance(w, Word) and w.string or w) not in punctuation]
    if n <= 0:
        return []
    if isinstance(string, basestring):
        sentences = [strip_period(s.split(" ")) for s in tokenize(string)]
    if isinstance(string, Sentence):
        sentences = [strip_period(string)]
    if isinstance(string, Text):
        sentences = [strip_period(s) for s in string]
    g = []
    for s in sentences:
        g.extend([tuple(s[i:i + n]) for i in range(len(s) - n + 1)])
    return g
def scmcompile(source, filename):
    """Compile Scheme source into a code object.

    The returned code object can be executed by a call to eval().
    """
    # Parse the source code into a parse tree.
    tokens = tokenize(source)
    parse_tree = []
    while tokens:
        parse_tree.append(parse(tokens))
    # Do some initialization.
    # Note: peak.util.assembler is currently required by this file plus
    # codegen.py. The dependence on the outside module should be
    # abstracted away; codegen.py could probably be merged into this file.
    c = peak.util.assembler.Code()
    c.co_name = '<module>'
    c.co_firstlineno = 1
    c.co_filename = filename
    c.co_flags = 64  # Not sure why?
    # Hack for now to load the runtime library.
    c.LOAD_CONST(-1)
    c.LOAD_CONST(None)
    c.IMPORT_NAME('scmimport.runtime')
    c.STORE_NAME('scmimport')
    # Generate code from the parse tree.
    for node in parse_tree:
        codegen.gen_code(c, node)
    # Hack for now: need to load and return None.
    c.LOAD_CONST(None)
    c.RETURN_VALUE()
    # Emit the code object.
    return c.code()
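# A hedged usage sketch: run the compiled module-level code object in a
# fresh namespace. compile_and_run is a hypothetical wrapper; it assumes
# scmimport.runtime is importable, since the emitted code imports it.
def compile_and_run(source, filename='<scheme>'):
    code = scmcompile(source, filename)
    namespace = {}
    eval(code, namespace)  # module-level code objects run under eval()
    return namespace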
def run(source_file, env=None):
    """Read and execute opened source file"""
    source = source_file.read()
    if env is None:
        env = {}
    tokens = tokenize(source)
    while tokens:
        try:
            current_exp = parse_exp(tokens)
        except errors.UnexpectedCloseParen as exc:
            print('***', exc, file=sys.stderr)
            break
        if isinstance(current_exp, list) and current_exp[0] == 'define':
            define_function(current_exp[1:])
        else:
            try:
                evaluate(env, current_exp)
            except errors.EvaluatorException as exc:
                print('***', exc, file=sys.stderr)
                continue
def ic_eval(snippet):
    '''Eval function of REPL.

    Return code snippet execution output.
    '''
    global o_start
    tokens = parser.tokenize(''.join(snippet))
    t = s_type(tokens)
    if not t:
        return ''
    dump(SRC, t, snippet)
    output = interpret(SRC)
    if output is None:
        return None
    completer.learn(tokens)
    if t == '#INC':
        for line in snippet:
            headers.add(line.strip())
    else:
        main_body.extend(snippet)
    i = o_start
    o_start = len(output)
    return output[i:]
def complete(self, text, state):
    if state == 0:
        line = '$' + readline.get_line_buffer()
        words = parser.tokenize(line)
        if words[-1].isidentifier():
            self.pretext = text[:-len(words[-1])]
            self.prev = words[-2]
            text = words[-1]
        else:
            self.pretext = text
            self.prev = words[-1]
            text = ''
        # '~' sorts after identifier characters, so the padded string
        # bounds the range of completions sharing the typed prefix.
        utext = text.ljust(200, '~')
        self.l = bisect.bisect_left(self.tags[self.prev], text)
        self.r = bisect.bisect_right(self.tags[self.prev], utext)
    try:
        return self.pretext + self.tags[self.prev][self.l:self.r][state]
    except IndexError:
        return None
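# The completer above narrows candidates with two bisections on a sorted
# list; padding the prefix with '~' approximates "prefix + infinity".
# A standalone sketch of that trick, with hypothetical names:
import bisect

def prefix_range(sorted_words, prefix, pad='~', width=200):
    lo = bisect.bisect_left(sorted_words, prefix)
    hi = bisect.bisect_right(sorted_words, prefix.ljust(width, pad))
    return sorted_words[lo:hi]

# prefix_range(['list', 'print', 'unordered_map', 'unordered_set'], 'un')
#   -> ['unordered_map', 'unordered_set']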
def recognize(cls, text, scope):
    tokens = tokenize(text)
    ok, start, end, groups = match(tokens, 'c_label|(scope colon)')
    if ok:
        tokens = tokens[end:]
    ret = cls.__recog(tokens)
    clsbyname = lambda n: getattr(sys.modules[__name__], n)
    classes = map(clsbyname, ret.iterkeys())
    print
    print "   classes", classes
    if ret and len(ret) > 1:
        # Try and disambiguate: a derived class has priority
        # over its base class.
        def test_class(c):
            others = tuple(cl for cl in classes if cl is not c)
            return issubclass(c, others)
        subclasses = filter(test_class, classes)
        if subclasses:
            classes = subclasses
        if len(classes) != 1:
            print "ambiguity:", text
            print "   classes", classes
        validate = lambda c: cls.validate(scope, c, text, ret[c.__name__])
        statements = filter(lambda x: x is not None, imap(validate, classes))
        print "   statements", statements
        if len(statements) == 1:
            return statements[0]
        else:
            raise AmbiguousStatement(text, scope, statements)
    return CppStatement(text, scope, [])
def tokenize(string):
    return parser.tokenize(string, [
        integer,
        op_equality,
        op_plusminus,
        op_muldiv,
        op_assignment,
        op_link,
        op_if,
        open_paren,
        close_paren,
        comma,
        variable,
    ])
def parse(scope, lines, start, level):
    dump(lines, start)
    if lines[start] == '{':
        ret = CppStatement('<DATA>', scope, [])
        start -= 1
    else:
        ret = CppMeta.recognize(lines[start], scope)
    if ret is None:
        raise InvalidStatement("Couldn't parse < %s >" % lines[start])
    for abs_expr in ret.absorb:
        start += 1
        dump(lines, start)
        ok, mstart, mend, groups = match(tokenize(lines[start]), abs_expr)
        if not ok:
            raise InvalidStatement(lines[start])
        ret.text += lines[start]
        for g in groups:
            ret.process_payload(g)
    if (start + 1) < len(lines) and lines[start + 1] == '{':
        if ret.absorb_sub:
            end = start + 1
            while lines[end] != '}':
                end += 1
            text = tokenize('\n'.join(lines[start + 2:end]))
            ok, mstart, mend, groups = match(text, ret.absorb_sub)
            for g in groups:
                ret.process_payload(g)
            start = end + 1
        else:
            ret.pre_sub()
            start += 2
            while start < len(lines) and lines[start] != '}':
                statement, start = Cpp.parse(ret, lines, start, level + 1)
                ret.sub.append(statement)
    for abspo in ret.absorb_post:
        start += 1
        dump(lines, start)
        ok, mstart, mend, groups = match(tokenize(lines[start]), abspo)
        if not ok:
            raise InvalidStatement('\n' + lines[start] +
                                   '\nwhile expecting ' + abspo +
                                   '\nafter ' + type(ret).__name__ +
                                   '\n' + ret.text)
        ret.text += lines[start]
        for g in groups:
            ret.process_payload(g)
    ret.post_sub()
    ret.commit()
    return ret, start + 1
""" from calculus import Variable, Abstraction, Application, β_reduction from parser import tokenize, parse def evaluate(term): if isinstance(term, Variable): return term elif isinstance(term, Abstraction): return Abstraction(term.variable, evaluate(term.term)) elif isinstance(term, Application): if isinstance(term.term1, Abstraction): return β_reduction(term) else: return Application(evaluate(term.term1), evaluate(term.term2)) if __name__ == '__main__': from sys import stdin exp = parse(tokenize(stdin.read())) while True: exp1 = evaluate(exp) if str(exp) == str(exp1): break exp = exp1 print(exp)
def test_x_read(self):
    self.assertEqual(parser(tokenize('x')), 'x')
def test_parse_expressions(source, want):
    tokens = tokenize(source)
    assert want == parse(tokens)
def run(self):
    # NOTE: `lines` is read before it is assigned below; as written this
    # raises UnboundLocalError, so `lines` presumably came from an
    # enclosing scope in the original source.
    p = parser.tokenize(lines)
    lines = [self.pop()]
    st = Stack(p[0], p[1], p[2])

def src(self):
    self.push(" ".join(str(self.code)))
def test_parser(self):
    self.assertEqual(parser(tokenize(self.line)),
                     [['lambda', ['x'], 'x'], '"Lisp"'])
#!/usr/bin/env python

if __name__ == '__main__':
    from gram import test_grammar

    for i, rule in enumerate(test_grammar.rules):
        print('rule #{}: {} -> {}'.format(i, rule.lhs, ' '.join(rule.rhs)))
    print('')

    from parser import PrecedenceTable, PrecedenceParser
    from parser import make_core_grammar, tokenize

    pt = PrecedenceTable.from_grammar(test_grammar)
    pt.dump()
    parser = PrecedenceParser(pt, make_core_grammar(test_grammar))

    from sys import stdin
    parse_rules, rpns = parser.parse(list(tokenize(stdin.read())))
    print('parse rules: {}'.format(parse_rules))
    print('RPNs: \n{}'.format('\n'.join(' '.join(rpn) for rpn in rpns)))
def ngrams_from_text(text, feature_vector, prefix, n=1, style='float'):
    sentences = tokenize(text, break_into_sentences=True)
    words_flat = flatten(sentences)
    get_ngrams(feature_vector=feature_vector, n=n, prefix=prefix,
               words=words_flat, style=style)
def test_tokenize(source, want):
    tokens = tokenize(source)
    assert want == list(tokens)
# attempting to open its signature file
try:
    signature = open(file_path + ".sig").readlines()
    parser.read_signature(signature)
except IOError:
    print "warning: no signature file found"
except parser.SignatureError, e:
    print "warning (signature error): %s" % (e.msg)
except IndexError:
    raise SystemExit(usage)

tokens = parser.tokenize(prenex_file)

# attempt to parse main file
try:
    syntax_tree = parser.parse(tokens)
except parser.SyntaxError, e:
    error = "In line: %d, syntax error: Got '%s', expected '%s'.\n" \
            % (e.lineNumber, e.got, e.expected)
    if e.msg is not None:
        error += "\t%s" % (e.msg)
    raise SystemExit(error)
except parser.DefinitionError, e:
    raise SystemExit("In line: %d, definition error: %s.\n"
                     % (e.lineNumber, e.msg))
def test_tokenize(self):
    self.assertEqual(tokenize(self.line),
                     ['(', '(', 'lambda', '(', 'x', ')', 'x', ')', '"Lisp"', ')'])
def test_x(self):
    self.assertEqual(tokenize('x'), ['x'])
def test_parse_unexpected_end_of_source():
    tokens = tokenize('(')
    with raises(errors.UnexpectedEndOfSource) as excinfo:
        parse_exp(tokens)
    assert "Unexpected end of source code." == str(excinfo.value)
def itemize(self):
    return match(tokenize(self.text), self.recognize)
def postprocess(statement, context):
    # TESTING !
    print >> sys.stderr, "EXPERIMENTAL RECOGNITION", \
        CppMeta.recognize(statement.text)
    # END TESTING !
    c_type = len(context) and type(context[-1]) or None
    if type(statement) is ElseStatement and c_type is IfStatement:
        ret = context.pop()
        ret.elses.append(statement)
        return ret
    if type(statement) is WhileStatement and c_type is DoWhileStatement:
        ret = context.pop()
        ret.whilecond.append(statement)
        return ret
    if type(statement) in (ClassDeclStatement, StructDeclStatement):
        def strip_scope(scope, text):
            if text.startswith(scope):
                return text[len(scope) + 1:].strip()
            return text

        def strip3(text):
            return strip_scope('public',
                               strip_scope('private',
                                           strip_scope('protected', text)))

        def strip_helper(st):
            st.text = strip3(st.text)
            return st  # map() below needs the statement back

        statement.sub = map(strip_helper, statement.sub)
        return statement
    m = Cpp.assignment_re.match(statement.text)
    if m:
        # Detect chained assignments and split them
        parts = filter(bool, re.split(Cpp.assignment_op, statement.text))
        if len(parts) > 2:
            t = statement.text  # chained assignment !
            expr = parts[-1]
            exprpos = len(t) - len(expr)
            expr = expr[:-1]  # strip final ;
            exprend = exprpos + len(expr)
            for i in xrange(len(parts) - 2, -1, -1):
                lvaluepos = t.rfind(parts[i], 0, exprpos)
                tmp_assign = t[lvaluepos:exprend].strip() + ';'
                context.append(AssignmentStatement(tmp_assign))
                exprpos = lvaluepos
                exprend = lvaluepos + len(parts[i])
            # "much more better" ((C) Jack Sparrow) -- keeps the code simple
            return context.pop()
        else:
            return AssignmentStatement(statement.text)
    if Cpp.local_var_decl_re.match(statement.text):
        ok, start, end, grps = match(tokenize(statement.text), 'var_decl')
        ret = VarDeclStatement(statement.text)
        ret.sub = statement.sub
        return ret
    return statement
def test_parse_atoms(source, ast):
    got = parse(tokenize(source))
    assert type(ast) == type(got)
    assert ast == got
def test_parse_exp_atoms(source, ast):
    got = parse_exp(tokenize(source))
    assert ast == got
def test_parse_application(source, ast):
    got = parse(tokenize(source))
    assert ast == got
def test_parse_unexpected_close_parenthesis():
    tokens = tokenize(')')
    with raises(errors.UnexpectedCloseParen) as excinfo:
        parse_exp(tokens)
    assert "Unexpected close parenthesis." == str(excinfo.value)
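# The two error tests above (unexpected end of source, unexpected close
# paren) pin down parse_exp's failure modes. A minimal sketch consistent
# with them, assuming the error classes carry their own messages; the
# real parser.py implementation may differ.
def parse_exp(tokens):
    try:
        token = tokens.pop(0)
    except IndexError:
        raise errors.UnexpectedEndOfSource()
    if token == '(':
        exp = []
        while tokens and tokens[0] != ')':
            exp.append(parse_exp(tokens))
        if not tokens:
            raise errors.UnexpectedEndOfSource()
        tokens.pop(0)  # discard ')'
        return exp
    if token == ')':
        raise errors.UnexpectedCloseParen()
    try:
        return int(token)
    except ValueError:
        try:
            return float(token)
        except ValueError:
            return token  # a symbol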