def tokenize(s, filename, spos= 0, sline= 0, slinePos= 0):
    """Tokenize source string ``s`` into a flat list of ``Token`` objects.

    Tokens are built as ``Token(value, type, line, pos)`` where ``pos`` is a
    column computed relative to the offset of the last newline (``linePos``).
    String literals containing ``{...}`` are treated as interpolations: the
    inner span is re-tokenized by a recursive call and spliced between
    ``concat`` operator tokens.

    :param s: source text to tokenize.
    :param filename: error-reporting handle; only ``filename[1]`` is used,
        so this is presumably a (something, display-name) pair — TODO confirm.
    :param spos: starting column offset (non-zero for recursive calls).
    :param sline: starting line number (non-zero for recursive calls).
    :param slinePos: absolute offset of the last newline before ``s``.
    :return: list of ``Token`` objects; when called at top level
        (``spos == 0 and sline == 0``) a trailing newline + zero-indent
        token pair is appended.
    """
    # Reserved words of the target language.  NOTE(review): 'string' is
    # listed twice — harmless for membership tests, but likely unintended.
    keywords = [ 'import', 'def', 'then', 'do', 'if', 'elif', 'else', 'while', 'int', 'float', 'none', 'bool', 'string', 'break', 'continue', 'true', 'false', 'let', 'ext', 'type', 'string', 'var', "not", "or", "and", ]
    # Token names that are re-labelled as generic "symbol" tokens below.
    special = ["bang", "arrow", "doublecolon", "line", "underscore", "assign", "assignPlus", "assignSub", "assignMul", "assignDiv", 'colon', 'dot', 'openC', 'openB', 'closeC', 'closeB', 'comma', 'closeS', 'openS', 'doubleDot', 'semi']
    # Master token table: (group name, regex).  Order matters — the combined
    # pattern is an alternation and the FIRST matching alternative wins.
    # NOTE(review): 'identifier' ([A-Za-z0-9_]+) precedes 'underscore' and
    # 'tab' follows 'skip', so those two groups appear unreachable — confirm.
    token_specification = [
        ("comment", r"/\*([\s\S]*?)\*/"),   # block comment, non-greedy
        ("indent", r'\n[ ]*'),              # newline plus leading spaces
        ('commentLine', r'//.*'),
        ('newline', r'\n'),
        ('openB', '{'),
        ('closeB', '}'),
        ('openC', '\('),
        ('closeC', '\)'),
        ('f32', r'[\d_]+(\.[\d_]+|f)'),     # float literal: 1.5, 1_000.0, 2f
        ('i32', r'[\d_]+'),                 # integer literal, '_' separators
        ('arrow', r'->'),
        ('equal', r'=='),
        ('doublecolon', r'::'),
        ("colon", r":"),
        ("semi", r";"),
        ('ne', r'!='),
        ('assign', r'='),
        ('openS', r'\['),
        ('closeS', r'\]'),
        ('assignPlus', r'\+='),
        ('assignSub', r'\-='),
        ('assignMul', r'\*='),
        ('assignDiv', r'\/='),
        ('operator', r'[+*\/\-%><^]|(\|>)'),
        ('line', r'\|'),
        ('identifier', r'[A-Za-z0-9_]+'),
        ('underscore', '_'),
        ('skip', r'[ \t]'),                 # insignificant whitespace
        ("str", r'"(?:\\.|({.*})|[^"\\])*"'),  # string, may contain {...}
        ('doubleDot', '\.\.'),
        ('dot', '\.'),
        ('tab', '\t'),
        ('comma', ','),
        ('bang', '!'),
    ]
    # Build one big named-group alternation and bind its match method.
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line = 1
    pos = spos
    mo = get_token(s)
    lastIndent = 0          # indentation level of the previous logical line
    lastTyp = None          # NOTE(review): assigned each iteration, never read
    linePos = slinePos      # absolute offset of the last newline seen
    line = sline            # overrides the line = 1 above with caller's value
    array = []              # output token list
    while mo is not None:
        typ = mo.lastgroup
        # Look ahead one token; also becomes the next loop iteration's match.
        # NOTE(review): 'next' shadows the builtin of the same name.
        next = get_token(s, mo.end())
        if typ == "indent" or typ == "newline":
            val = mo.group(typ)
            array.append(Token("\n", "symbol", line, pos))
            line += 1
            linePos = mo.end()
            if next == None:
                # End of input: close with a zero-indent token.
                array.append(Token(0,"indent", line, pos))
                break
            if next.lastgroup == "indent":
                # Blank line ahead — keep the previous indentation level.
                array.append(Token(lastIndent, "indent", line, pos ))
            else:
                # len(val)-1 drops the leading '\n' from the match,
                # leaving the count of indentation spaces.
                array.append(Token(len(val)-1, "indent", line, pos))
                lastIndent = len(val)-1
        elif typ == "comment":
            val = mo.group(typ)
            c = mo.end()
            r = val.rfind("\n")
            # Re-anchor the column origin at the comment's last newline and
            # advance the line counter by the number of newlines it spans.
            linePos = c + r
            line += len(val) - len(val.replace("\n", ""))
            array.append(Token(val, "comment", line, pos ))
        elif typ in ["str"]:
            val = mo.group(typ)
            if typ == "str":  # always true here; kept from original
                def notBack(iter):
                    # True when the char at index `iter` is NOT escaped by an
                    # odd-length run of preceding backslashes.
                    if iter == 0:
                        return True
                    if val[iter-1] != "\\":
                        return True
                    return not notBack(iter-1)
                start = 0
                inBrace = False   # currently inside a {...} interpolation
                tokens = []
                val = val[1:-1]   # strip the surrounding double quotes
                bcount = 0        # running brace-nesting depth
                shouldBe = 0      # depth at which the current interpolation closes
                v = list(val)     # NOTE(review): unused
                for iter in range(len(val)):
                    i = val[iter]
                    if notBack(iter) and i == "{" and not inBrace:
                        # Emit the literal prefix before the interpolation.
                        tokens.append(Token('"'+val[start: iter]+'"', "str", line, pos))
                        inBrace = True
                        start = iter+1
                        shouldBe = bcount
                    if i == "{":
                        bcount += 1
                    elif notBack(iter) and i == "}":
                        bcount -= 1
                        if bcount == shouldBe and inBrace:
                            # Splice: prefix ++ ( <re-tokenized expr> ) ++ ...
                            tokens.append(Token("concat", "operator", line, pos+start))
                            tokens.append(Token("(", "symbol", line, pos+start))
                            tokens += tokenize(val[start: iter], filename, pos+start, line, linePos)
                            tokens.append(Token(")", "symbol", line, pos+iter))
                            tokens.append(Token("concat", "operator", line, pos+iter))
                            start = iter + 1
                            inBrace = False
                # Trailing literal after the last interpolation (or the
                # whole string when no interpolation was found).
                tokens.append(Token('"'+val[start:]+'"', "str", line, pos))
                array += tokens
        elif typ != 'skip':
            val = mo.group(typ)
            if typ == 'identifier' and val in keywords:
                if val in ["true", "false"]:
                    typ = "bool"
                elif val in ["_"]:
                    typ = "symbol"
                else:
                    typ = "keyword"
            elif typ == "f32":
                # Normalize the 'f' suffix form (e.g. "2f") to "2.0".
                val = val[:-1]+".0" if val[-1] == "f" else val
            # Strip '_' digit separators from numeric literals.
            if typ == "i32" or typ == "f32":
                val = val.replace("_", "")
            elif typ in special:
                typ = "symbol"
            elif typ == "equal" or typ == "mut" or typ == "ne":
                typ = "operator"
            if val != " ":
                array.append(Token(val, typ, line, pos))
            elif val == "\t":
                # NOTE(review): '\t' matches 'skip' earlier in the table, so
                # this branch looks unreachable — confirm before relying on it.
                Error.compileError(filename[1], line, "tabs are not allowed")
        lastTyp = typ
        # Column of the NEXT token = end of this match minus last newline offset.
        pos = mo.end() - linePos
        mo = next
    if spos == 0 and sline == 0:
        # Top-level call only: terminate the stream with newline + indent 0.
        array.append(Token("\n", "symbol", line-1, pos))
        array.append(Token(0, "indent", line, pos))
    return array
def tokenize(s, filename, spos=0, sline=0, slinePos=0):
    """Tokenize source string ``s`` into a list of ``Token`` objects.

    Variant that reads ``token_specification``, ``keywords`` and ``special``
    from enclosing (module) scope — they are not defined in this function, so
    the referenced table presumably also defines the ``setAtom``, ``single``
    and ``spaceDoubleDot`` groups handled below — TODO confirm.

    Tokens are ``Token(value, type, line, pos)``.  ``{...}`` spans inside
    string literals are re-tokenized recursively and wrapped in parentheses
    joined by ``concat`` operator tokens.

    :param s: source text to tokenize.
    :param filename: error-reporting handle; only ``filename[1]`` is used.
    :param spos: starting column offset (non-zero for recursive calls).
    :param sline: starting line number (non-zero for recursive calls).
    :param slinePos: absolute offset of the last newline before ``s``.
    :return: list of ``Token``; at top level (``spos == 0 and sline == 0``)
        a trailing newline + zero-indent token pair is appended.
    """
    # Combined named-group alternation; first matching alternative wins.
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line = 1
    pos = spos
    mo = get_token(s)
    lastIndent = 0          # indentation level of the previous logical line
    lastTyp = None          # NOTE(review): assigned each iteration, never read
    linePos = slinePos      # absolute offset of the last newline seen
    line = sline            # overrides the line = 1 above with caller's value
    extension = False       # set when a keyword is emitted; never read here
    array = []              # output token list
    while mo is not None:
        typ = mo.lastgroup
        # One-token lookahead; also the next iteration's match.
        # NOTE(review): 'next' shadows the builtin of the same name.
        next = get_token(s, mo.end())
        if typ == "indent" or typ == "newline":
            val = mo.group(typ)
            array.append(Token("\n", "symbol", line, pos))
            line += 1
            linePos = mo.end()
            if next == None:
                # End of input: close with a zero-indent token.
                array.append(Token(0, "indent", line, pos))
                break
            if next.lastgroup == "indent":
                # Blank line ahead — keep the previous indentation level.
                array.append(Token(lastIndent, "indent", line, pos))
            else:
                # len(val)-1 drops the leading '\n', leaving the space count.
                array.append(Token(len(val) - 1, "indent", line, pos))
                lastIndent = len(val) - 1
        elif typ == "comment":
            val = mo.group(typ)
            c = mo.end()
            r = val.rfind("\n")
            # Re-anchor column origin at the comment's last newline; bump the
            # line counter by the number of newlines the comment spans.
            linePos = c + r
            line += len(val) - len(val.replace("\n", ""))
            array.append(Token(val, "comment", line, pos))
        elif typ == "setAtom":
            # Dedicated spelling for the 'set' operator.
            array.append(Token("set", "operator", line, pos))
        elif typ in ["str"]:
            template = False   # did this string contain any {...} interpolation?
            val = mo.group(typ)
            def notBack(iter):
                # True when the char at index `iter` is NOT escaped by an odd
                # run of preceding backslashes.
                if iter == 0:
                    return True
                if val[iter - 1] != "\\":
                    return True
                return not notBack(iter - 1)
            start = 0
            inBrace = False   # currently inside a {...} interpolation
            tokens = []
            val = val[1:-1]   # strip the surrounding double quotes
            bcount = 0        # running brace-nesting depth
            shouldBe = 0      # depth at which the current interpolation closes
            v = list(val)     # NOTE(review): unused
            for iter in range(len(val)):
                i = val[iter]
                if notBack(iter) and i == "{" and not inBrace:
                    # Emit the literal prefix before the interpolation.
                    tokens.append(
                        Token('"' + val[start:iter] + '"', "str", line, pos))
                    inBrace = True
                    start = iter + 1
                    shouldBe = bcount
                    template = True
                if i == "{":
                    bcount += 1
                elif notBack(iter) and i == "}":
                    bcount -= 1
                    if bcount == shouldBe and inBrace:
                        # Splice: prefix ++ ( <re-tokenized expr> ) ++ ...
                        tokens.append(
                            Token("concat", "operator", line, pos + iter))
                        tokens.append(Token("(", "symbol", line, pos + iter))
                        # Recursive call with default offsets; positions are
                        # then shifted manually.  t[:-2] drops the trailing
                        # newline + indent pair the top-level call appends.
                        t = tokenize(val[start:iter], filename)
                        # NOTE(review): this loop rebinds `i`, the character
                        # variable of the enclosing loop — confirm intended.
                        for i in t[:-2]:
                            i.line += line
                            i.column += pos + start
                            tokens.append(i)
                        tokens.append(Token(")", "symbol", line, pos + iter))
                        tokens.append(
                            Token("concat", "operator", line, pos + iter))
                        start = iter + 1
                        inBrace = False
                elif i == "\n":
                    # Multi-line string literal: keep the line count accurate.
                    line += 1
            # Trailing literal after the last interpolation (or whole string).
            tokens.append(Token('"' + val[start:] + '"', "str", line, pos))
            if template:
                # Wrap the whole concat chain in parentheses.
                tokens.insert(0, Token("(", "symbol", line, pos))
                tokens.append(Token(")", "symbol", line, pos + iter))
            array += tokens
        elif typ == "single":
            # Single-quoted (?) literal, emitted directly as a string token.
            val = mo.group(typ)
            array.append(Token(val, "str", line, pos))
        elif typ == "spaceDoubleDot":
            # '..' written with embedded spaces: strip them and shift the
            # column by the number of characters removed.
            _val = mo.group(typ)
            val = _val.replace(" ", "")
            pos = pos + (len(_val) - len(val))
            array.append(Token(val, "symbol", line, pos))
        elif typ != 'skip' and not typ in ["comment", "commentLine"]:
            val = mo.group(typ)
            if typ == "identifier":
                def my_replace(match):
                    # '-x' -> 'X': kebab-case folded to camelCase.
                    match = match.group()
                    return match[1].upper()
                val = re.sub(r'\-[A-Za-z]', my_replace, val)
            if typ == 'identifier' and val in keywords:
                if val in ["true", "false"]:
                    typ = "bool"
                elif val in ["_"]:
                    typ = "symbol"
                else:
                    typ = "keyword"
                    extension = True
            elif typ == "f32":
                # Normalize the 'f' suffix form (e.g. "2f") to "2.0".
                val = val[:-1] + ".0" if val[-1] == "f" else val
            elif typ == "hex":
                # Hex literals are downstream-equivalent to integers.
                typ = "i32"
            # Strip '_' digit separators from numeric literals.
            if typ == "i32" or typ == "f32":
                val = val.replace("_", "")
            elif typ in special:
                typ = "symbol"
            elif typ == "equal" or typ == "mut" or typ == "ne":
                typ = "operator"
            if val != " ":
                array.append(Token(val, typ, line, pos))
            elif val == "\t":
                # NOTE(review): whether a lone '\t' can reach here depends on
                # the module-level token table (a 'skip' rule may consume it
                # first) — confirm.
                Error.compileError(filename[1], line, "tabs are not allowed")
        lastTyp = typ
        #mo.start() - line_start
        # Column bookkeeping uses the START of the current match here
        # (other variants of this function use mo.end()).
        pos = mo.start() - linePos
        mo = next
    if spos == 0 and sline == 0:
        # Top-level call only: terminate the stream with newline + indent 0.
        array.append(Token("\n", "symbol", line - 1, pos))
        array.append(Token(0, "indent", line, pos))
    return array
def tokenize(s, filename, spos=0, sline=0, slinePos=0):
    """Tokenize source string ``s`` into a list of ``Token`` objects.

    Variant with local keyword/token tables extended over the first version:
    hex literals, ``$``, ``=>``, extra operators (``>>``, ``<-``, ``\\``),
    kebab-case identifiers, and the ``lens``/``match``/``with``/``decoder``
    keywords.  Tokens are ``Token(value, type, line, pos)``; ``{...}`` spans
    inside string literals are re-tokenized by a recursive call and spliced
    between ``concat`` operator tokens.

    :param s: source text to tokenize.
    :param filename: error-reporting handle; only ``filename[1]`` is used,
        so this is presumably a (something, display-name) pair — TODO confirm.
    :param spos: starting column offset (non-zero for recursive calls).
    :param sline: starting line number (non-zero for recursive calls).
    :param slinePos: absolute offset of the last newline before ``s``.
    :return: list of ``Token``; at top level (``spos == 0 and sline == 0``)
        a trailing newline + zero-indent token pair is appended.
    """
    # Reserved words of the target language.  NOTE(review): 'string' is
    # listed twice — harmless for membership tests, but likely unintended.
    keywords = [
        'import', 'def', 'then', 'do', 'if', 'elif', 'else', 'while', 'int',
        'float', 'none', 'bool', 'string', 'break', 'continue', 'true',
        'false', 'let', 'ext', 'type', 'string', 'var', "not", "or", "and",
        "lens", "match", "with", "decoder",
    ]
    # Token names that are re-labelled as generic "symbol" tokens below.
    special = [
        "dollar", "bang", "arrow", "doublecolon", "line", "underscore",
        "assign", "assignPlus", "assignSub", "assignMul", "assignDiv",
        'colon', 'dot', 'openC', 'openB', 'closeC', 'closeB', 'comma',
        'closeS', 'openS', 'doubleDot', 'semi'
    ]
    # Master token table: (group name, regex).  Order matters — the combined
    # pattern is an alternation and the FIRST matching alternative wins.
    # NOTE(review): 'underscore' follows 'identifier' and 'tab' follows
    # 'skip', and 'set' ('=>') follows 'assign' ('='), so those groups appear
    # unreachable as written — confirm.
    token_specification = [
        ("comment", r"/\*([\s\S]*?)\*/"),   # block comment, non-greedy
        ("indent", r'\n[ ]*'),              # newline plus leading spaces
        ('commentLine', r'//.*'),
        ('newline', r'\n'),
        ('openB', '{'),
        ('closeB', '}'),
        ('openC', '\('),
        ('closeC', '\)'),
        ('hex', r'0[xX][0-9a-fA-F]+'),      # hex literal, folded to i32 below
        ('f32', r'\d*[\d_]*\d+(\.\d*[\d_]*(\d+)|f)'),  # float: 1.5, 1_000.0, 2f
        ('i32', r'\d*[\d_]*(\d+)'),         # integer, '_' separators
        ('arrow', r'->'),
        ('equal', r'=='),
        ('doublecolon', r'::'),
        ("colon", r":"),
        ("semi", r";"),
        ('ne', r'!='),
        ('assign', r'='),
        ('openS', r'\['),
        ('closeS', r'\]'),
        ('assignPlus', r'\+='),
        ('assignSub', r'\-='),
        ('assignMul', r'\*='),
        ('assignDiv', r'\/='),
        ('operator', r'(\|>|>>|<-)|[+*\/\-%><^\\]'),
        ('line', r'\|'),
        ('identifier', r'[^\d\W](\w|(-[^\d\W]))*'
         ),  #[A-Za-z0-9_$]*([A-Za-z0-9_$]*-[A-Za-z_$]+)*
        ('underscore', '_'),
        ('skip', r'[ \t]'),                 # insignificant whitespace
        ("str", r'"(?:\\.|({.*})|[^"\\])*"'),  # string, may contain {...}
        ('doubleDot', '\.\.'),
        ('dot', '\.'),
        ('tab', '\t'),
        ('comma', ','),
        ('bang', '!'),
        ('dollar', '\$'),
        ('set', '=>'),
    ]
    # Build one big named-group alternation and bind its match method.
    tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
    get_token = re.compile(tok_regex).match
    line = 1
    pos = spos
    mo = get_token(s)
    lastIndent = 0          # indentation level of the previous logical line
    lastTyp = None          # NOTE(review): assigned each iteration, never read
    linePos = slinePos      # absolute offset of the last newline seen
    line = sline            # overrides the line = 1 above with caller's value
    array = []              # output token list
    while mo is not None:
        typ = mo.lastgroup
        # One-token lookahead; also the next iteration's match.
        # NOTE(review): 'next' shadows the builtin of the same name.
        next = get_token(s, mo.end())
        if typ == "indent" or typ == "newline":
            val = mo.group(typ)
            array.append(Token("\n", "symbol", line, pos))
            line += 1
            linePos = mo.end()
            if next == None:
                # End of input: close with a zero-indent token.
                array.append(Token(0, "indent", line, pos))
                break
            if next.lastgroup == "indent":
                # Blank line ahead — keep the previous indentation level.
                array.append(Token(lastIndent, "indent", line, pos))
            else:
                # len(val)-1 drops the leading '\n', leaving the space count.
                array.append(Token(len(val) - 1, "indent", line, pos))
                lastIndent = len(val) - 1
        elif typ == "comment":
            val = mo.group(typ)
            c = mo.end()
            r = val.rfind("\n")
            # Re-anchor column origin at the comment's last newline; bump the
            # line counter by the number of newlines the comment spans.
            linePos = c + r
            line += len(val) - len(val.replace("\n", ""))
            array.append(Token(val, "comment", line, pos))
        elif typ in ["str"]:
            val = mo.group(typ)
            def notBack(iter):
                # True when the char at index `iter` is NOT escaped by an odd
                # run of preceding backslashes.
                if iter == 0:
                    return True
                if val[iter - 1] != "\\":
                    return True
                return not notBack(iter - 1)
            start = 0
            inBrace = False   # currently inside a {...} interpolation
            tokens = []
            val = val[1:-1]   # strip the surrounding double quotes
            bcount = 0        # running brace-nesting depth
            shouldBe = 0      # depth at which the current interpolation closes
            v = list(val)     # NOTE(review): unused
            for iter in range(len(val)):
                i = val[iter]
                if notBack(iter) and i == "{" and not inBrace:
                    # Emit the literal prefix before the interpolation.
                    tokens.append(
                        Token('"' + val[start:iter] + '"', "str", line, pos))
                    inBrace = True
                    start = iter + 1
                    shouldBe = bcount
                if i == "{":
                    bcount += 1
                elif notBack(iter) and i == "}":
                    bcount -= 1
                    if bcount == shouldBe and inBrace:
                        # Splice: prefix ++ ( <re-tokenized expr> ) ++ ...
                        # The recursive call carries the current offsets so
                        # inner tokens report correct line/column directly.
                        tokens.append(
                            Token("concat", "operator", line, pos + start))
                        tokens.append(Token("(", "symbol", line, pos + start))
                        tokens += tokenize(val[start:iter], filename,
                                           pos + start, line, linePos)
                        tokens.append(Token(")", "symbol", line, pos + iter))
                        tokens.append(
                            Token("concat", "operator", line, pos + iter))
                        start = iter + 1
                        inBrace = False
                elif i == "\n":
                    # Multi-line string literal: keep the line count accurate.
                    line += 1
            # Trailing literal after the last interpolation (or whole string).
            tokens.append(Token('"' + val[start:] + '"', "str", line, pos))
            array += tokens
        elif typ != 'skip' and not typ in ["comment", "commentLine"]:
            val = mo.group(typ)
            if typ == "identifier":
                def my_replace(match):
                    # '-x' -> 'X': kebab-case folded to camelCase.
                    match = match.group()
                    return match[1].upper()
                val = re.sub(r'\-[A-Za-z]', my_replace, val)
            if typ == 'identifier' and val in keywords:
                if val in ["true", "false"]:
                    typ = "bool"
                elif val in ["_"]:
                    typ = "symbol"
                else:
                    typ = "keyword"
            elif typ == "f32":
                # Normalize the 'f' suffix form (e.g. "2f") to "2.0".
                val = val[:-1] + ".0" if val[-1] == "f" else val
            elif typ == "hex":
                # Hex literals are downstream-equivalent to integers.
                typ = "i32"
            # Strip '_' digit separators from numeric literals.
            if typ == "i32" or typ == "f32":
                val = val.replace("_", "")
            elif typ in special:
                typ = "symbol"
            elif typ == "equal" or typ == "mut" or typ == "ne":
                typ = "operator"
            if val != " ":
                array.append(Token(val,
                                   typ, line, pos))
            elif val == "\t":
                # NOTE(review): '\t' matches 'skip' earlier in the table, so
                # this branch looks unreachable — confirm before relying on it.
                Error.compileError(filename[1], line, "tabs are not allowed")
        lastTyp = typ
        # Column of the NEXT token = end of this match minus last newline offset.
        pos = mo.end() - linePos
        mo = next
    if spos == 0 and sline == 0:
        # Top-level call only: terminate the stream with newline + indent 0.
        array.append(Token("\n", "symbol", line - 1, pos))
        array.append(Token(0, "indent", line, pos))
    return array