def unwrap_code_lines(tokens):
    # join continued physical lines into single logical lines
    unwrapped_tokens = []
    include = True

    prev_tokens = [
        Token('', 'newline', False),
        Token('', 'newline', False),
        Token('', 'newline', False)
    ]

    for token in tokens:
        if token.group == 'line continuation':
            include = False
            prev_tokens.append(token)
            prev_tokens = prev_tokens[1:]

        if token.group == 'whitespace' and \
           prev_tokens[-1].group == 'newline' and \
           prev_tokens[-2].group == 'line continuation':
            # leading whitespace of a continued line collapses to one space,
            # unless the previous line already ended with whitespace
            if prev_tokens[-3].group != 'whitespace':
                unwrapped_tokens.append(Token(' ', 'whitespace', False))
        elif include:
            unwrapped_tokens.append(token)
            prev_tokens.append(token)
            prev_tokens = prev_tokens[1:]

        if token.group == 'newline':
            if not include:
                prev_tokens.append(token)
                prev_tokens = prev_tokens[1:]
            include = True

    return unwrapped_tokens
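# Usage sketch for unwrap_code_lines (hypothetical token stream, not from
# the original project; assumes the project's Token(text, group, is_operand)
# constructor). The continuation token, the newline after it, and the next
# line's leading indentation are dropped, so two physical lines collapse
# into one logical line of tokens.
wrapped = [
    Token('X', 'variable', True),
    Token(' ', 'whitespace', False),
    Token('=', 'operator', False),
    Token(' ', 'whitespace', False),
    Token('1', 'number', True),
    Token(' ', 'whitespace', False),
    Token('&', 'line continuation', False),
    Token('\n', 'newline', False),
    Token('    ', 'whitespace', False),
    Token('+', 'operator', False),
    Token(' ', 'whitespace', False),
    Token('2', 'number', True),
    Token('\n', 'newline', False),
]
flat = unwrap_code_lines(wrapped)
# ''.join(t.text for t in flat) == 'X = 1 + 2\n'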
def tokenize_line(self, line, tokenizer, wide):
    # break apart the line based on fixed format
    tokens = []

    # The fixed-format PL/1 line format is:
    # 1: space or C or *
    # 2-72: program text
    # 73-: identification, traditionally sequence number (ignored)
    line_indicator = line[0:1]

    if wide:
        line_text = line[1:]
        line_identification = ''
    else:
        line_text = line[1:72]
        line_identification = line[72:]

    # tokenize the line indicator
    if line_indicator in ['C', '*']:
        # comment lines are kept whole
        tokens.append(Token(line, 'comment', False))
    else:
        if len(line_indicator) > 0 and line_indicator != ' ':
            tokens.append(Token(line_indicator, 'invalid', False))
        else:
            tokens.append(Token(' ', 'whitespace', False))

        # tokenize the code
        tokens += tokenizer.tokenize(line_text)

        # tokenize the line identification
        if len(line_identification) > 0:
            tokens.append(
                Token(line_identification, 'line identification', False))

    tokens.append(Token('\n', 'newline', False))

    return tokens
def extract_keywords_from_identifiers(tokens, keywords, operators):
    new_tokens = []
    words = keywords + operators

    for token in tokens:
        if token.group == 'variable':
            new_texts = BasicExaminer.extract_keywords(token.text, words)

            for new_text in new_texts:
                if new_text in keywords:
                    new_token = Token(new_text, 'keyword', False)
                elif new_text in operators:
                    new_token = Token(new_text, 'operator', False)
                else:
                    if new_text.isdigit():
                        new_token = Token(new_text, 'number', True)
                    else:
                        new_token = Token(new_text, 'variable', True)

                new_tokens.append(new_token)
        else:
            new_tokens.append(token)

    return new_tokens
def calc_line_format_confidence(self):
    drop_types = ['whitespace', 'comment', 'line continuation']
    tokens = Examiner.drop_tokens(self.tokens, drop_types)

    line_bracket_count = 0
    num_bracket_count = 0
    prev2_token = Token('\n', 'newline', False)
    prev_token = Token('\n', 'newline', False)

    for token in tokens:
        if token.group == 'group' and token.text == '{':
            num_bracket_count += 1

            if prev_token.group == 'newline' and \
               (prev2_token.group != 'group' or prev2_token.text != '{'):
                line_bracket_count += 1
                self.errors.append({
                    'TYPE': 'LINE FORMAT',
                    'TOKEN': token.text
                })

        prev2_token = prev_token
        prev_token = token

    line_format_confidence = 1.0

    if num_bracket_count > 0:
        line_format_confidence = 1.0 - (line_bracket_count / num_bracket_count)

    self.confidences['line format'] = line_format_confidence
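# Illustrative arithmetic for calc_line_format_confidence (hypothetical
# numbers, not taken from the project): if a file contains 10 '{' group
# tokens and 4 of them open a fresh line (previous significant token is a
# newline and the token before that is not another '{'), the confidence is
# 1.0 - 4 / 10 = 0.6. A file whose opening braces all trail the previous
# statement scores 1.0.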
def tokenize_alt_line(self, line, line_indicator):
    token = None

    if line_indicator in ['*', '/', 'D', 'd']:
        # the entire line is a comment (including DEBUG lines)
        token = Token(line[6:], 'comment', False)

    if line_indicator == '$':
        token = Token(line[6:], 'preprocessor', False)

    return token
def get_tokens(self):
    if self.text is None:
        return None

    if len(self.text) != 2:
        return None

    return [
        Token(self.text[0], 'identifier', True),
        Token(self.text[1], 'group', False)
    ]
def get_tokens(self):
    if self.text is None:
        return None

    token1 = Token('', 'comment', False)
    token2 = Token('', 'comment', False)

    if self.text.startswith('Remark'):
        token1 = Token('Remark', 'keyword', False)
        token2 = Token(self.text[6:], 'comment', False)

    return [token1, token2]
def get_tokens(self):
    if self.text is None:
        return None

    if self.case_sensitive:
        if self.text.startswith(self.lead):
            return [Token(self.text, self.group, False)]
    else:
        if self.text.lower().startswith(self.lead):
            return [Token(self.text, self.group, False)]

    return None
def get_tokens(self):
    if self.token1 is None:
        return None

    if self.token2 is None:
        token1 = Token(self.token1, 'keyword', False)
        tokens = [token1]
    else:
        token1 = Token(self.token1, 'keyword', False)
        token2 = Token(self.token2, 'comment', False)
        tokens = [token1, token2]

    return tokens
def tokenize_line_indicator(self, line_indicator):
    token = None

    if line_indicator == ' ':
        token = Token(' ', 'whitespace', False)
    elif line_indicator == '-':
        token = Token(line_indicator, 'continuation line', False)
    elif line_indicator != '':
        token = Token(line_indicator, 'invalid', False)

    return token
def tokenize_line_number(self, line_number):
    token = None

    if len(line_number) > 0:
        if line_number.isspace():
            token = Token(line_number, 'whitespace', False)
        elif line_number.isdigit():
            token = Token(line_number, 'line number', False)
        else:
            token = Token(line_number, 'line identification', False)

    return token
def get_tokens(self):
    if self.text is None:
        return None

    # split the text into 'TEXT', content, and 'ENDTEXT' tokens
    len_start = len(self.prefix)
    len_end = len(self.suffix)

    starter_token = Token(self.text[:len_start], 'keyword', False)
    ender_token = Token(self.text[-len_end:], 'keyword', False)
    content = Token(self.text[len_start:-len_end], 'string', True)

    return [starter_token, content, ender_token]
def get_tokens(self):
    if self.text is None:
        return None

    tokens = None

    if self.text[-1] == '.':
        # a terminating dot is not part of the PIC
        token1 = Token(self.text[:-1], 'picture', True)
        token2 = Token('.', 'statement terminator', False)
        tokens = [token1, token2]
    else:
        tokens = [Token(self.text, 'picture', True)]

    return tokens
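# Behavior sketch for the PIC builder above (hypothetical .text values;
# assumes the builder has accumulated the raw picture clause in self.text):
#   '9(5)V99.'  ->  [Token('9(5)V99', 'picture', True),
#                    Token('.', 'statement terminator', False)]
#   'X(10)'     ->  [Token('X(10)', 'picture', True)]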
def convert_identifiers_to_labels(self):
    # an identifier that opens a line and is followed by a comma is a label
    prev_2_token = Token('\n', 'newline', False)
    prev_token = Token('\n', 'newline', False)

    for token in self.tokens:
        if token.group == 'group' and token.text == ',' and \
           prev_token.group == 'identifier' and \
           prev_2_token.group == 'newline':
            prev_token.group = 'label'
            prev_token.is_operand = False

        if token.group not in ['whitespace']:
            prev_2_token = prev_token
            prev_token = token
def calc_operator_3_confidence(self, tokens, num_operators, group_ends, allow_pairs):
    errors = 0
    prev_token = Token('\n', 'newline', False)

    lower_unary_operators = []
    for op in self.unary_operators:
        lower_unary_operators.append(op.lower())

    for token in tokens:
        prev_token_operand = prev_token.is_operand or \
            prev_token.text in group_ends or \
            prev_token.text.lower() == 'end'

        if token.group == 'operator' and \
           not prev_token_operand and \
           prev_token.text not in self.adjective_operators and \
           prev_token.text not in self.postfix_operators and \
           token.text not in self.keyword_postfix and \
           token.text.lower() not in lower_unary_operators and \
           [prev_token.text, token.text] not in allow_pairs:
            errors += 1
            self.errors.append({
                'TYPE': 'OPERATOR3',
                'FIRST': prev_token.text,
                'SECOND': token.text
            })

        prev_token = token

    operator_confidence_3 = 1.0 - errors / num_operators
    self.confidences['operator_3'] = operator_confidence_3
def calc_operator_2_confidence(self, tokens, num_operators, allow_pairs):
    errors = 0
    prev_token = Token('\n', 'newline', False)

    lower_unary_operators = []
    for op in self.unary_operators:
        lower_unary_operators.append(op.lower())

    for token in tokens:
        if token.group == 'operator' and \
           prev_token.group == 'operator' and \
           prev_token.text not in self.adjective_operators and \
           prev_token.text not in self.postfix_operators and \
           token.text.lower() not in lower_unary_operators and \
           [prev_token.text, token.text] not in allow_pairs:
            errors += 1
            self.errors.append({
                'TYPE': 'OPERATOR2',
                'FIRST': prev_token.text,
                'SECOND': token.text
            })

        prev_token = token

    operator_confidence_2 = 1.0 - errors / num_operators
    self.confidences['operator_2'] = operator_confidence_2
def calc_token_2_confidence(self, allowed_tokens=[]):
    num_repeated_tokens = 0
    prev_token = Token('\n', 'newline', False)
    allowed_groups = [
        'invalid', 'whitespace', 'newline', 'comment', 'line description', 'group'
    ]

    for token in self.tokens:
        if token.group not in allowed_groups and token.text not in allowed_tokens:
            if token.group == prev_token.group and token.text == prev_token.text:
                num_repeated_tokens += 1
                self.errors.append({
                    'TYPE': 'TOKEN',
                    'REPEATED': token.text,
                    'FIRST': '',
                    'SECOND': ''
                })

            prev_token = token

    repeat_confidence = 1.0

    if len(self.tokens) > 0:
        repeat_unconfidence = num_repeated_tokens / len(self.tokens)
        repeat_confidence = 1.0 - repeat_unconfidence

    self.confidences['repeated token'] = repeat_confidence
def combine_identifier_colon(tokens, separator_groups, separator_texts, ignore_groups):
    # merge an identifier and a following ':' into a single label token when
    # the identifier is the first printable token of its statement
    new_list = []
    new_token = None
    first_printable_token = True

    for token in tokens:
        if token.text == ':' and \
           new_token is not None and new_token.group == 'identifier' and \
           first_printable_token:
            new_token = Token(new_token.text + token.text, 'label', False)
        else:
            if new_token is not None:
                new_list.append(new_token)

                if new_token.group in separator_groups or \
                   new_token.text in separator_texts:
                    first_printable_token = True
                else:
                    if new_token.group not in ignore_groups:
                        first_printable_token = False

            new_token = token

    if new_token is not None:
        new_list.append(new_token)

    return new_list
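# Usage sketch for combine_identifier_colon (hypothetical token stream and
# group lists, not from the original project). An identifier that opens a
# statement and is immediately followed by ':' is merged into one 'label'
# token; identifiers later in the statement are left untouched.
stream = [
    Token('\n', 'newline', False),
    Token('start', 'identifier', True),
    Token(':', 'group', False),
    Token(' ', 'whitespace', False),
    Token('a', 'identifier', True),
    Token('=', 'operator', False),
    Token('1', 'number', True),
    Token('\n', 'newline', False),
]
combined = combine_identifier_colon(
    stream,
    separator_groups=['newline'],          # groups that begin a new statement
    separator_texts=[';'],
    ignore_groups=['whitespace', 'comment'])
# 'start' and ':' come back as a single Token('start:', 'label', False);
# 'a' keeps its 'identifier' group because it is not first on the line.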
def calc_operator_4_confidence(self, tokens, num_operators, group_starts, allow_pairs):
    errors = 0
    prev_token = Token('\n', 'newline', False)

    lower_unary_operators = []
    for op in self.unary_operators:
        lower_unary_operators.append(op.lower())

    for token in tokens:
        prev_token_postfix_operator = prev_token.text.lower() in (
            op.lower() for op in self.postfix_operators)

        if prev_token.group == 'operator' and \
           not prev_token_postfix_operator and \
           not token.is_operand and \
           token.text.lower() not in lower_unary_operators and \
           token.text not in group_starts and \
           [prev_token.text, token.text] not in allow_pairs:
            errors += 1
            self.errors.append({
                'TYPE': 'OPERATOR4',
                'FIRST': prev_token.text,
                'SECOND': token.text
            })

        prev_token = token

    operator_confidence_4 = 1.0 - errors / num_operators
    self.confidences['operator_4'] = operator_confidence_4
def calc_operand_n_confidence(self, tokens, operand_types, max_count):
    confidence_name_u = 'OPERAND_' + str(max_count)
    confidence_name_l = 'operand_' + str(max_count)

    n_operand_count = 0
    consec_count = 0
    prev_token = Token('\n', 'newline', False)

    for token in tokens:
        if token.group in operand_types:
            consec_count += 1

            if consec_count > max_count:
                n_operand_count += 1
                self.errors.append({
                    'TYPE': confidence_name_u,
                    'FIRST': prev_token.text,
                    'SECOND': token.text
                })
        else:
            consec_count = 0

        prev_token = token

    operand_confidence = 1.0

    if len(tokens) > 0:
        operand_confidence = 1.0 - (n_operand_count / len(tokens))

    self.confidences[confidence_name_l] = operand_confidence
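# Illustrative arithmetic for calc_operand_n_confidence (hypothetical
# numbers): with operand_types = ['number', 'string', 'identifier'] and
# max_count = 2, the stream  identifier identifier identifier operator
# contains one run longer than two operands, so with 4 tokens the
# confidence is 1.0 - 1 / 4 = 0.75. A stream with no long operand runs
# scores 1.0.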
def combine_identifier_colon(tokens, separator_groups, separator_texts, ignore_groups):
    # variant that does not build labels inside declaration sections
    # (between a 'procedure'/'function' header and its 'begin')
    new_list = []
    new_token = None
    first_printable_token = True
    in_declaration = True

    for token in tokens:
        if token.text == ':' and \
           new_token is not None and new_token.group == 'identifier' and \
           first_printable_token and \
           not in_declaration:
            new_token = Token(new_token.text + token.text, 'label', False)
        else:
            if new_token is not None:
                new_list.append(new_token)

                if new_token.group in separator_groups or \
                   new_token.text in separator_texts:
                    first_printable_token = True
                else:
                    if new_token.group not in ignore_groups:
                        first_printable_token = False

            new_token = token

        if token.text.lower() in ['procedure', 'function']:
            in_declaration = True

        if token.text.lower() == 'begin':
            in_declaration = False

    if new_token is not None:
        new_list.append(new_token)

    return new_list
def tokenize_asm_code(text, tab_size, opcode_tokenizer, opcode_extras,
                      args_tokenizer, label_leads, label_mids, label_ends,
                      comment_leads, line_comment_leads, use_line_id):
    lines = text.split('\n')

    tokens = []
    indents = []

    previous_was_continued = False

    for line in lines:
        newline = '\n'

        if len(line) > 0 and line[-1] == '\r':
            newline = '\r\n'

        line = line.rstrip('\r')
        line = line.rstrip()
        line = Tokenizer.tabs_to_spaces(line, tab_size)

        # get tokens and indents
        line_tokens, line_indents, was_continued = Tokenizer.tokenize_asm_line(
            line, previous_was_continued, opcode_tokenizer, opcode_extras,
            args_tokenizer, label_leads, label_mids, label_ends,
            comment_leads, line_comment_leads, use_line_id)

        tokens += line_tokens
        indents.append(line_indents)

        tokens.append(Token(newline, 'newline', False))

        previous_was_continued = was_continued

    # return tokens and indents
    return tokens, indents
def calc_value_value_different_confidence(self, tokens):
    # remove tokens we don't care about
    # (note: this works from self.tokens; the tokens parameter is rebound here)
    drop_types = [
        'whitespace', 'comment', 'line description', 'line continuation'
    ]
    tokens = Examiner.drop_tokens(self.tokens, drop_types)

    value_types = ['number', 'string', 'symbol']

    two_value_count = 0
    prev_token = Token('\n', 'newline', False)

    for token in tokens:
        if token.group in value_types and \
           prev_token.group in value_types and \
           token.group != prev_token.group:
            two_value_count += 1
            self.errors.append({
                'TYPE': 'VALUE VALUE DIFFERENT',
                'FIRST': prev_token.text,
                'SECOND': token.text
            })

        prev_token = token

    value_value_confidence = 1.0

    if len(tokens) > 0:
        value_value_confidence = 1.0 - (two_value_count / len(tokens))

    self.confidences['value value'] = value_value_confidence
def convert_asm_identifiers_to_labels(self):
    prev_token = Token('\n', 'newline', False)

    for token in self.tokens:
        if token.group == 'identifier' and prev_token.group == 'newline':
            token.group = 'label'

        prev_token = token
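# Behavior sketch for the pass above (hypothetical stream): in an assembly
# listing, a name in column 1 is a label, so an 'identifier' token that
# directly follows a newline token is re-grouped:
#   newline  identifier('LOOP')  whitespace  ...
#       ->   'LOOP' becomes group 'label'
# Identifiers appearing later on the line (operands) keep their group.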
def get_tokens(self):
    if self.text is None:
        return None

    if self.text == '':
        return None

    return [Token(self.text, 'comment', False)]
def convert_keywords_to_identifiers(tokens):
    # 'type' stays a keyword only when the next token is a class;
    # otherwise re-group it as an identifier
    prev_token = Token('\n', 'newline', False)

    for token in tokens:
        if prev_token.text == 'type' and token.group != 'class':
            prev_token.group = 'identifier'

        prev_token = token
def get_tokens(self):
    if self.text is None:
        return None

    if self.text.startswith(self.prefix):
        return [Token(self.text, 'preprocessor', False)]

    return None
def tokenize_line(self, line, tokenizer, wide):
    # break apart the line based on fixed format
    tokens = []

    # The COBOL line format is:
    # 1-6: line number or blank (ignored)
    # 7: space or one of *, /, D, d, $, -
    # 8-72: program text
    # 73-: identification, traditionally sequence number (ignored)
    line_indicator = line[6:7]

    if wide:
        line_text = line[7:]
        line_identification = ''
    else:
        line_text = line[7:72]
        line_identification = line[72:]

    if line.startswith(('//', '/*')):
        # embedded JCL control lines are kept as single tokens
        tokens.append(Token(line, 'jcl', False))
    else:
        line_number = line[:6]
        token = self.tokenize_line_number(line_number)
        if token is not None:
            tokens.append(token)

        # tokenize the line indicator
        if line_indicator in ['*', '/', 'D', 'd', '$']:
            token = self.tokenize_alt_line(line, line_indicator)
            if token is not None:
                tokens.append(token)
        else:
            token = self.tokenize_line_indicator(line_indicator)
            if token is not None:
                tokens.append(token)

            # tokenize the code
            tokens += tokenizer.tokenize(line_text)

            # tokenize the line identification
            if len(line_identification) > 0:
                tokens.append(
                    Token(line_identification, 'line identification', False))

    tokens.append(Token('\n', 'newline', False))

    return tokens
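# Column-slicing sketch for the COBOL reader above (hypothetical line; it
# demonstrates only the fixed-format split, not the tokenizer itself).
sample = '000100 MOVE A TO B' + ' ' * 54 + 'PAYROLL1'
# sample[:6]   == '000100'               columns 1-6, line number
# sample[6:7]  == ' '                    column 7, indicator
# sample[7:72] == 'MOVE A TO B' (padded) columns 8-72, program text
# sample[72:]  == 'PAYROLL1'             columns 73+, identification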
def get_tokens(self):
    if self.text is None:
        return None

    tokens = None

    m = re.match(self.regex, self.text)

    if m is not None:
        g = m.groups()

        if len(g) != 4:
            return None

        token1 = Token(g[0], 'keyword', False)
        token2 = Token(g[1], 'whitespace', False)
        token3 = Token(g[2], 'keyword', False)
        token4 = Token(g[3], 'comment', False)

        tokens = [token1, token2, token3, token4]

    return tokens
def convert_operators_to_identifiers(self):
    # a '*' following '::' (ignoring whitespace and comments) is a name,
    # not the multiplication operator
    prev_token = Token('\n', 'newline', False)

    for token in self.tokens:
        if token.text == '*' and prev_token.text == '::':
            token.group = 'identifier'

        if token.group not in ['whitespace', 'comment', 'newline']:
            prev_token = token