def tokenize_number(self, line_index, column_index, line):
    """Scan a numeric literal that starts at column_index.

    Consumes consecutive digits plus at most one '.'; produces a NUMBER
    token, or MF_NUMBER when the literal has a dot that is never
    followed by a decimal digit (e.g. "12.").
    """
    lexeme = line[column_index]
    end = column_index
    has_dot = False
    has_decimals = False
    while end + 1 < len(line):
        nxt = line[end + 1]
        if nxt not in lexicon.DIGITS and not (nxt == '.' and not has_dot):
            break
        lexeme += nxt
        if has_dot:
            # Any character accepted after the dot is a digit, so the
            # decimal part is non-empty.
            has_decimals = True
        elif nxt == '.':
            has_dot = True
        end += 1
    # A number with a "dot" must also have decimal digits.
    code = Code.MF_NUMBER if has_dot and not has_decimals else Code.NUMBER
    return Token(lexeme, code, line_index, line_index, column_index, end)
def tokenize_string(self, line_index, column_index, line):
    """Scan a string literal beginning at the opening quote.

    Returns a STRING token when a non-escaped closing quote is found and
    every scanned character is valid; otherwise MF_STRING (unterminated
    string or invalid symbol inside it).  Newlines are never copied into
    the lexeme.
    """
    lexeme = line[column_index]
    pos = column_index
    closed = False
    bad_char = False
    length = len(line)
    while pos + 1 < length and not closed:
        ch = line[pos + 1]
        if ch in lexicon.STRING_DELIMITER:
            # A quote preceded by a backslash (chr 92) is escaped and
            # does not terminate the string.
            if line[pos] != chr(92):
                closed = True
        elif not letter_digit_symbol.match(ch):
            bad_char = True
        if ch != '\n':
            lexeme += ch
        pos += 1
    if closed and not bad_char:
        return Token(lexeme, Code.STRING, line_index, line_index,
                     column_index, pos)
    return Token(lexeme, Code.MF_STRING, line_index, line_index,
                 column_index, pos)
def tokenize_logical_op(self, line_index, column_index, line):
    """Scan a logical operator starting at column_index.

    The one-char prefix is extended to a two-char operator when the pair
    appears in lexicon.LOGICAL_OPERATORS_EXTENDED.  A lone prefix that
    is not in lexicon.COMMOM_RELATIONAL_LOGICAL is malformed and yields
    MF_OPERATOR; otherwise OP_LOGICAL is returned.
    """
    logical_op = line[column_index]
    end_column_index = column_index
    if end_column_index + 1 < len(line) and logical_op + line[
            end_column_index + 1] in lexicon.LOGICAL_OPERATORS_EXTENDED:
        logical_op += line[end_column_index + 1]
        end_column_index += 1
    # Idiomatic membership test (`not in` instead of `not x in y`).
    elif logical_op not in lexicon.COMMOM_RELATIONAL_LOGICAL:
        return Token(logical_op, Code.MF_OPERATOR, line_index, line_index,
                     column_index, end_column_index)
    return Token(logical_op, Code.OP_LOGICAL, line_index, line_index,
                 column_index, end_column_index)
def tokenize_id_or_keyword(self, line_index, column_index, line):
    """Scan an identifier or keyword starting at column_index.

    Consumes letters, digits and underscores; the lexeme becomes a
    KEYWORD token when it is listed in lexicon.KEYWORDS, otherwise an
    IDENTIFIER token.
    """
    end = column_index
    lexeme = line[column_index]
    while end + 1 < len(line) and letter_digit_underscore.match(line[end + 1]):
        end += 1
        lexeme += line[end]
    code = Code.KEYWORD if lexeme in lexicon.KEYWORDS else Code.IDENTIFIER
    return Token(lexeme, code, line_index, line_index, column_index, end)
def execute(self):
    """Run the lexer over self.input_lines and return the token list.

    Walks the input character by character, dispatching on the current
    character to the specialised tokenize_* helpers.  Whitespace and
    comments are discarded; every other token is appended to
    self.lexical_tokens, which is also returned.
    """
    line_index = 0
    column_index = 0
    line_index_changed = False
    while line_index < len(self.input_lines):
        line = self.input_lines[line_index]
        line_index_changed = False
        while column_index < len(line):
            char = line[column_index]
            # Default token: a lone unrecognised character is flagged as
            # an invalid symbol unless a branch below replaces it.
            token = Token(char, Code.INVALID_SYMBOL, line_index, line_index,
                          column_index, column_index)
            if char in lexicon.DELIMITERS:
                token = self.tokenize_delimiter(line_index, column_index,
                                                line[column_index])
            elif char in lexicon.DIGITS:
                token = self.tokenize_number(line_index, column_index, line)
            elif char in lexicon.ARITHMETIC_OPERATORS_BEGINNING:
                token = self.tokenize_arithmetic_op(
                    line_index, column_index, line)
            elif char in lexicon.RELATIONAL_OPERATORS_BEGINNING:
                token = self.tokenize_relational_op(
                    line_index, column_index, line)
            elif char in lexicon.LOGICAL_OPERATORS_BEGGINING:
                token = self.tokenize_logical_op(line_index, column_index,
                                                 line)
            elif char in lexicon.STRING_DELIMITER:
                token = self.tokenize_string(line_index, column_index, line)
            elif char in lexicon.COMMOM_RELATIONAL_LOGICAL:
                # Characters shared between relational and logical
                # operators need a dedicated disambiguating helper.
                token = self.tokenize_relational_or_logical_op(
                    line_index, column_index, line)
            elif char in lexicon.COMMOM_ARITHMETIC_COMMENT:
                # '/' may start either a division operator or a comment.
                token = self.tokenize_arithmetic_or_comment(
                    line_index, column_index, line)
            elif letter.match(line[column_index]):
                token = self.tokenize_id_or_keyword(
                    line_index, column_index, line)
            # Add the token to the tokens list only if its lexeme is not a
            # blank space or a comment.
            if not token.lexeme.isspace() and token.code != Code.COMMENT:
                self.lexical_tokens.append(token)
            # Resume scanning right after the token just produced.
            column_index = token.column_end_index + 1
            # If the token spanned multiple lines (e.g. a block comment),
            # jump to its final line and break out of the column loop;
            # column_index already points past the token on that line.
            if line_index != token.line_end_index:
                line_index = token.line_end_index
                line_index_changed = True
                break
        # Advance to the next line only if the column loop did not already
        # reposition line_index; otherwise keep the inherited column.
        if not line_index_changed:
            line_index += 1
            column_index = 0
    return self.lexical_tokens
def tokenize_relational_op(self, line_index, column_index, line):
    """Scan a relational operator, greedily taking the two-char form
    when the pair appears in lexicon.RELATIONAL_OPERATORS_EXTENDED."""
    op = line[column_index]
    end = column_index
    if end + 1 < len(line):
        extended = op + line[end + 1]
        if extended in lexicon.RELATIONAL_OPERATORS_EXTENDED:
            op = extended
            end += 1
    return Token(op, Code.OP_RELATIONAL, line_index, line_index,
                 column_index, end)
def tokenize_arithmetic_op(self, line_index, column_index, line):
    """Scan an arithmetic operator, greedily taking the two-char form
    when the pair appears in lexicon.ARITHMETIC_OPERATORS_EXTENDED."""
    op = line[column_index]
    end = column_index
    if end + 1 < len(line):
        extended = op + line[end + 1]
        if extended in lexicon.ARITHMETIC_OPERATORS_EXTENDED:
            op = extended
            end += 1
    return Token(op, Code.OP_ARITHMETIC, line_index, line_index,
                 column_index, end)
def ignore_comment(self, line_index, column_index, init):
    """Consume a comment that opens at (line_index, column_index).

    init is the two-character opener, '//' or '/*'.  A line comment
    swallows the rest of the current line.  A block comment may span
    several lines of self.input_lines; if '*/' is never found an
    MF_COMMENT (malformed) token is produced instead of COMMENT.
    """
    comment = init
    end_line_index = line_index
    # Start scanning after the two opener characters already in `init`.
    end_column_index = column_index + 2
    if init == '//':
        # Line comment: the lexeme is everything up to end of line.
        comment = self.input_lines[end_line_index][column_index:]
        return Token(comment, Code.COMMENT, line_index, end_line_index,
                     column_index, len(self.input_lines[line_index]) - 1)
    end_reached = False
    while end_line_index < len(self.input_lines) and not end_reached:
        line = self.input_lines[end_line_index]
        while end_column_index < len(line) and not end_reached:
            char = line[end_column_index]
            comment += char
            # Look one character ahead for the '*/' closer; both chars
            # must be on the same line.
            if char == '*' and end_column_index + 1 < len(line):
                next_char = line[end_column_index + 1]
                if next_char == '/':
                    comment += next_char
                    end_reached = True
            end_column_index += 1
        if not end_reached:
            # Closer not on this line: continue on the next one.
            end_line_index += 1
            end_column_index = 0
    if end_reached:
        return Token(comment, Code.COMMENT, line_index, end_line_index,
                     column_index, end_column_index)
    # Input exhausted before '*/': malformed comment.
    return Token(comment, Code.MF_COMMENT, line_index, end_line_index,
                 column_index, end_column_index)
def tokenize_delimiter(self, line_index, column_index, delimiter):
    """Wrap a single delimiter character in a DELIMITER token."""
    span = (line_index, line_index, column_index, column_index)
    return Token(delimiter, Code.DELIMITER, *span)
def open_function(self, identifier, init_index_array, array):
    """Resolve a function call whose name token sits at init_index_array.

    Collects the balanced-parentheses call expression from `array`,
    infers each argument's type via get_type_expression, builds a
    signature string like "name(int,real)" and looks it up in
    self.symbols.  Errors are reported through self.add_error.

    Returns [index past the call, replacement lexeme for the call's
    return value (e.g. '0', '0.0', '" "', 'true') or the signature
    string, error flag].
    """
    end_index_array = init_index_array + 1
    open_amount = 0
    first_open = True
    func_array = [array[init_index_array]]
    func_len = len(array)
    error = False
    # NOTE(review): child_error is never set to True anywhere below, so
    # the `elif not child_error` branch always runs on lookup failure.
    child_error = False
    func_str = array[init_index_array].lexeme
    function_name = array[init_index_array].lexeme
    # Gather the whole call (name + balanced parentheses) into func_array.
    while end_index_array < func_len:
        token = array[end_index_array]
        if not first_open and open_amount == 0:
            # All opened parentheses closed: the call expression is over.
            break
        elif token.lexeme == '(':
            if first_open or open_amount > 0:
                open_amount += 1
            elif open_amount == 0:
                break
        elif token.lexeme == ')':
            if open_amount == 0:
                break
            else:
                open_amount -= 1
        func_array.append(token)
        end_index_array += 1
        first_open = False
    # func_array[0] is the name, [1] is '(' — parameters start at index 2,
    # and the first separator can only appear from index 3 on.
    params_index = 3
    open_amount = 0
    func_str += '('
    cur_param = [func_array[2]]
    func_array_w_types = [func_array[0], func_array[1]]
    no_params = False
    if params_index >= len(func_array):
        no_params = True
        func_str += ')'
    # Split the arguments on top-level ',' / ')' and type each one.
    while params_index < len(func_array):
        token = func_array[params_index]
        if token.lexeme == '(':
            open_amount += 1
            cur_param.append(token)
        elif token.lexeme == ')' and open_amount > 0:
            open_amount -= 1
            cur_param.append(token)
        elif open_amount == 0 and (token.lexeme == ','
                                   or token.lexeme == ')'):
            # End of one argument: infer its type and append it to the
            # signature string, then emit a placeholder token of that type.
            param_type = self.get_type_expression(identifier, cur_param)
            func_str += param_type
            func_str += token.lexeme
            tk = ''
            if param_type == 'int':
                tk = Token('0', Code.NUMBER,
                           identifier.line_begin_index,
                           identifier.line_end_index,
                           identifier.column_begin_index,
                           identifier.column_end_index)
            elif param_type == 'real':
                tk = Token('0.0', Code.NUMBER,
                           identifier.line_begin_index,
                           identifier.line_end_index,
                           identifier.column_begin_index,
                           identifier.column_end_index)
            elif param_type == 'string':
                tk = Token('" "', Code.KEYWORD,
                           identifier.line_begin_index,
                           identifier.line_end_index,
                           identifier.column_begin_index,
                           identifier.column_end_index)
            elif param_type == 'boolean':
                tk = Token('true', Code.KEYWORD,
                           identifier.line_begin_index,
                           identifier.line_end_index,
                           identifier.column_begin_index,
                           identifier.column_end_index)
            else:
                # Unknown/custom type: keep a generic identifier token.
                tk = Token('object', Code.IDENTIFIER,
                           identifier.line_begin_index,
                           identifier.line_end_index,
                           identifier.column_begin_index,
                           identifier.column_end_index)
            func_array_w_types.append(tk)
            func_array_w_types.append(token)
            cur_param = []
        else:
            cur_param.append(token)
        params_index += 1
    # Verify if the function exists in the symbol table.
    func_declared = False
    found_key = ''
    return_type = func_str
    if no_params:
        if func_str in self.symbols:
            func_declared = True
            found_key = func_str
    else:
        # Symbol keys look like "name(type name, ...)" — rebuild a
        # "name(type,type)" form from each key and compare to func_str.
        for key in self.symbols:
            try:
                proc_key = key[:(key.index('('))]
                if function_name in proc_key:
                    params = key[(key.index('(') + 1):key.index(')')]
                    splitted_params = params.split(',')
                    params_str = ''
                    for i, param in enumerate(splitted_params):
                        if len(splitted_params) > 0:
                            params_str += param.split()[0]
                            if i + 1 < len(splitted_params):
                                params_str += ','
                    if function_name + '(' + params_str + ')' == func_str:
                        func_declared = True
                        found_key = key
                        break
            except:
                # Keys without parentheses (plain variables) raise
                # ValueError on index(); skip them.
                func_declared = False
    if func_declared:
        if '@return' in self.symbols[found_key]:
            # Map the declared return type to a literal of that type.
            return_type = self.symbols[found_key]['@return']
            if return_type == 'int':
                return_type = '0'
            elif return_type == 'real':
                return_type = '0.0'
            elif return_type == 'string':
                return_type = '" "'
            elif return_type == 'boolean':
                return_type = 'true'
        else:
            self.add_error(array[init_index_array],
                           'A procedure does not returns any type:`' +
                           func_str + '`.')
    elif not child_error:
        self.add_error(array[init_index_array],
                       'This function does not exists: `' + func_str + '`.')
        error = True
    if not error:
        error = child_error
    return [end_index_array, return_type, error]
def get_type_expression(self, identifier, expr_array):
    """Infer the type of the expression held in expr_array.

    identifier is the token the expression is associated with (used for
    error positions); expr_array is a list of tokens.  Returns one of
    'int', 'real', 'boolean', 'string', a custom type name, or
    'invalid'.  Returns None when identifier has no lexeme attribute.
    Mixing int/real/string operands in one expression is reported via
    self.add_error.
    """
    if not hasattr(identifier, 'lexeme'):
        return
    expr = ''
    custom_type = ''
    # Flatten the token list into a plain string for quick inspection.
    for x in expr_array:
        if hasattr(x, 'lexeme'):
            expr += x.lexeme
    x = re.findall(LETTER, expr)
    if len(x) == 0:
        # Purely non-alphabetic expression: either a boolean comparison
        # or a numeric constant expression.
        if any(l_and_r in expr for l_and_r in LOGICAL_AND_RELATIONAL) > 0:
            return 'boolean'
        else:
            try:
                # NOTE(review): eval on the expression text — assumed to
                # be lexer-produced numeric tokens only at this point.
                if isinstance(eval(expr), int):
                    return 'int'
                else:
                    return 'real'
            except:
                return 'invalid'
            """ if self.symbols[scope][identifier.lexeme]['type'] == 'int': if isinstance(eval(expr), int): return 'int' else: self.add_error(identifier, 'You cannot assign `' + str(eval(expr)) +'` to `int`') elif self.symbols[scope][identifier.lexeme]['type'] == 'real': return 'real' else: self.add_error(identifier, 'You cannot assign `' + str(eval(expr)) +'` to ' + '`' + self.symbols[scope][identifier.lexeme]['type'] + '`') """
    else:
        # Expression contains identifiers/keywords: walk the tokens and
        # flag every type encountered.
        is_int = 0
        is_real = 0
        is_boolean = 0
        is_string = 0
        last_scope = ''
        index = 0
        array_len = len(expr_array)
        if any(l_and_r in expr for l_and_r in LOGICAL_AND_RELATIONAL) > 0:
            is_boolean = 1
        while index < array_len:
            token = expr_array[index]
            if token.lexeme == 'global':
                last_scope = 'global'
            elif token.lexeme == 'local':
                last_scope = 'local'
            elif token.code == Code.IDENTIFIER:
                proc_declared = False
                # Identifier followed by '(' may be a declared function.
                if index + 1 < array_len and expr_array[index +
                                                        1].lexeme == '(':
                    for key in self.symbols:
                        try:
                            proc_key = key[:(key.index('(') + 1)]
                            if token.lexeme + '(' == proc_key:
                                proc_declared = True
                                break
                        except:
                            # Keys without '(' raise ValueError; skip.
                            proc_declared = False
                if proc_declared:
                    # Replace the whole call with a literal token of the
                    # function's return type and rescan from there.
                    initial_index = index
                    resp = self.open_function(identifier, index, expr_array)
                    last_index = resp[0]
                    part1 = expr_array[0:initial_index]
                    part2 = expr_array[last_index:array_len]
                    new_expr_array = []
                    for part in part1:
                        new_expr_array.append(part)
                    tk_str = resp[1]
                    tk = ''
                    if tk_str == '0':
                        tk = Token(tk_str, Code.NUMBER,
                                   identifier.line_begin_index,
                                   identifier.line_end_index,
                                   identifier.column_begin_index,
                                   identifier.column_end_index)
                    elif tk_str == '0.0':
                        tk = Token(tk_str, Code.NUMBER,
                                   identifier.line_begin_index,
                                   identifier.line_end_index,
                                   identifier.column_begin_index,
                                   identifier.column_end_index)
                    elif tk_str == '" "':
                        tk = Token(tk_str, Code.KEYWORD,
                                   identifier.line_begin_index,
                                   identifier.line_end_index,
                                   identifier.column_begin_index,
                                   identifier.column_end_index)
                    elif tk_str == 'true':
                        tk = Token(tk_str, Code.KEYWORD,
                                   identifier.line_begin_index,
                                   identifier.line_end_index,
                                   identifier.column_begin_index,
                                   identifier.column_end_index)
                    else:
                        tk = Token(tk_str, Code.IDENTIFIER,
                                   identifier.line_begin_index,
                                   identifier.line_end_index,
                                   identifier.column_begin_index,
                                   identifier.column_end_index)
                    new_expr_array.append(tk)
                    for part in part2:
                        new_expr_array.append(part)
                    expr_array = new_expr_array
                    array_len = len(new_expr_array)
                    # Resume just before the substituted token.
                    index = len(part1) - 1
                else:
                    # Plain variable / array / member access: rebuild the
                    # full access path (e.g. "global.a.b[i]") and look up
                    # its declared type.
                    new_index = index
                    access_list = last_scope
                    if last_scope == 'global' or last_scope == 'local':
                        access_list += '.'
                    array_found = 0
                    while new_index < array_len:
                        if array_found == 0 and expr_array[
                                new_index].code == Code.IDENTIFIER:
                            access_list += expr_array[new_index].lexeme
                        elif array_found == 0 and expr_array[
                                new_index].lexeme == '.':
                            access_list += '.'
                        elif expr_array[new_index].lexeme == '[':
                            # Only the outermost '[' is kept; nested
                            # subscripts inside it are skipped over.
                            if array_found == 0:
                                access_list += expr_array[new_index].lexeme
                            array_found += 1
                        elif expr_array[new_index].lexeme == ']':
                            array_found -= 1
                            if array_found == 0:
                                access_list += expr_array[new_index].lexeme
                        elif array_found == 0:
                            # Any other top-level token ends the access.
                            break
                        new_index += 1
                    found_type = self.get_access_type(access_list, token,
                                                      True)
                    if found_type == 'int':
                        is_int = 1
                    elif found_type == 'real':
                        is_real = 1
                    elif found_type == 'boolean':
                        is_boolean = 1
                    elif found_type == 'string':
                        is_string = 1
                    else:
                        custom_type = found_type
                    """ else: if not ('(' in token.lexeme and ')' in token.lexeme): self.add_error(identifier, 'Identifier not declared: `' + token.lexeme + '`') """
                    index = new_index
                last_scope = ''
            elif token.code == Code.NUMBER:
                try:
                    if isinstance(eval(token.lexeme), int):
                        is_int = 1
                    else:
                        is_real = 1
                except:
                    pass
            elif token.lexeme == 'true' or token.lexeme == 'false':
                is_boolean = 1
            elif token.code == Code.STRING:
                is_string = 1
            index += 1
        # Rebuild expr: function calls may have been substituted above.
        expr = ''
        for x in expr_array:
            expr += x.lexeme
        if is_int + is_real + is_string > 1:
            self.add_error(identifier,
                           'There are more than one types in a single expression `'
                           + expr + '`. Conversions are not allowed here.')
        elif is_boolean == 1:
            return 'boolean'
        elif is_int == 1:
            return 'int'
        elif is_real == 1:
            return 'real'
        elif is_string == 1:
            return 'string'
        if custom_type != '':
            return custom_type
        return 'invalid'