def calc_confidences(self, operand_types, group_starts, group_mids, group_ends, indents):
    """Run the common sequence of confidence calculations for this examiner.

    The joined source tokens are computed once and shared by every
    token-based check.

    Args:
        operand_types: passed to ``calc_operand_n_confidence`` with count 4.
        group_starts: passed to ``calc_operator_4_confidence``.
        group_mids: passed to ``calc_group_confidence``.
        group_ends: passed to ``calc_operator_3_confidence``.
        indents: passed to ``calc_indent_confidence``; when ``None`` the
            indent check is skipped entirely.
    """
    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    # The operator checks only run when at least one operator was counted.
    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        # One list object is shared by all three pair-based checks.
        permitted_pairs = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, group_ends, permitted_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, operator_count, group_starts, permitted_pairs)

    self.calc_group_confidence(joined_tokens, group_mids)

    self.calc_operand_n_confidence(joined_tokens, ['number'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    # self.calc_keyword_confidence()

    if indents is None:
        return

    self.calc_indent_confidence(indents)
def __init__(self, code):
    """Tokenize ``code`` and compute confidence measures for it.

    The keyword, type and operator tables below look like the Go language
    (``chan``, ``defer``, ``fallthrough``, ``:=``, ``<-``, ``&^``).

    Args:
        code: the source text to examine; it is tokenized and also passed
            to ``calc_line_length_confidence``.
    """
    super().__init__()

    # Only referenced by the 4-operand check that is disabled further down.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()
    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", '`', "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%', '&', '!', '^', '<<', '>>', '&^',
        '=', '+=', '-=', '*=', '<<=', '>>=', '&^=',
        '&&', '||', '<-', '++', '--',
        '==', '!=', '<=', '>=', ':=', '...', '.', ':', '<', '>'
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '<-', ':']
    self.postfix_operators = ['++', '--', ':']

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    # group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # 'return' and 'select' must be separate entries: without the comma
    # between them Python joins the adjacent literals into the single
    # string 'returnselect' and neither keyword is recognised.
    keywords = [
        'break', 'case', 'chan', 'const', 'continue', 'default',
        'defer', 'else', 'fallthrough', 'for', 'func', 'go', 'goto',
        'if', 'import', 'interface', 'map', 'package', 'range',
        'return', 'select', 'struct', 'switch', 'type', 'var'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'uint8', 'uint16', 'uint32', 'uint64',
        'int8', 'int16', 'int32', 'int64',
        'float32', 'float64', 'complex64', 'complex128',
        'byte', 'rune', 'string', 'uint', 'int', 'uintptr'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['nil', 'true', 'false']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # The order of this list is kept exactly as it was.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        class_type_tb,
        string_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # tokens = Examiner.combine_identifier_colon(tokens, ['newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Perl source text and compute confidence measures for it.

    Builds the token builders (including the Perl-specific identifier,
    string and regex builders), tokenizes ``code`` up to an ``__END__``
    marker, post-processes the token list and then runs the confidence
    checks.

    Args:
        code: the source text to examine; it is tokenized and also passed
            to ``calc_line_length_confidence``.
    """
    super().__init__()

    # Token groups handed to the 4-operand confidence check.
    operand_types = ['number', 'identifier', 'string', 'regex', 'value']

    blanks = WhitespaceTokenBuilder()
    line_ends = NewlineTokenBuilder()

    # Numeric literals.
    integers = IntegerTokenBuilder('_')
    integer_exponents = IntegerExponentTokenBuilder('_')
    hex_integers = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    reals = RealTokenBuilder(False, False, "'")
    real_exponents = RealExponentTokenBuilder(False, False, 'E', "'")

    # Identifiers, plain and Perl-flavoured.
    plain_identifiers = IdentifierTokenBuilder('_', '_')
    perl_identifiers = PerlIdentifierTokenBuilder()

    special_names = [
        '$_', '@_', '$$', '$"', '$(', '$)', '$>', '$<', '$;',
        '$]', '$[', '$&', '$`', "$'", '$+', '@+', '%+', '@-', '%-',
        '$,', '$.', '$/', '$\\', '$|', '$%', '$-', '$:', '$=',
        '$^', '$~', '$!', '$?', '$@', '$#', '$*'
    ]
    special_identifiers = CaseInsensitiveListTokenBuilder(
        special_names, 'identifier', True)
    dollar_caret_identifiers = PerlDollarCaretIdentifierTokenBuilder()
    sigil_braces = PerlSigilBraceTokenBuilder()

    # Strings and regular expressions.
    strings = EscapedStringTokenBuilder(['"', "'", "’"], 0)
    q_strings = PerlQStringTokenBuilder()

    regexes = RegexTokenBuilder()
    m_regexes = MRegexTokenBuilder()
    s_regexes = SRegexTokenBuilder()
    y_regexes = YRegexTokenBuilder()
    tr_regexes = TrRegexTokenBuilder()

    prototypes = PerlPrototypeTokenBuilder()

    comments = LeadToEndOfLineTokenBuilder('#', False, 'comment')
    continuations = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    preprocessor = CaseSensitiveListTokenBuilder(
        ['#line'], 'preprocessor', False)
    terminators = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operator_texts = [
        '+', '-', '*', '**', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
        '**=', '+=', '*=', '&=', '&.=', '<<=', '&&=', '-=', '/=', '|=', '|.=',
        '>>=', '||=', '.=', '%=', '^=', '^.=', '//=', 'x=',
        'ne', 'gt', 'ge', 'le', 'lt', 'eq',
        '!', '&', '|', '~', '<<', '>>',
        '^', '.', '..', '...',
        '++', '--', '->', '=>', '&&', '||',
        '?', '<->', '<=>',
        'and', 'cmp', 'or', 'xor'
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']
    self.postfix_operators = ['++', '--']

    # Grouping tokens and the subsets used by the operator/group checks.
    group_texts = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
    group_openers = ['(', '[', ',', '{']
    group_separators = [',', ':', '::']
    group_closers = [')', ']', '}']

    groups = CaseInsensitiveListTokenBuilder(group_texts, 'group', False)
    operators = CaseSensitiveListTokenBuilder(
        operator_texts, 'operator', False)

    keyword_texts = [
        'bless', 'break', 'continue', 'die', 'do', 'else', 'elsif', 'eval',
        'exit', 'exp', 'for', 'foreach', 'if', 'last', 'lock', 'my', 'next',
        'no', 'our', 'package', 'redo', 'return', 'say', 'sub', 'taint',
        'undef', 'unless', 'until', 'use', 'wantarray', 'while'
    ]
    keywords = CaseSensitiveListTokenBuilder(keyword_texts, 'keyword', True)

    values = CaseSensitiveListTokenBuilder(['NULL'], 'value', True)

    invalids = InvalidTokenBuilder()

    # Builder order is preserved exactly from the original list.
    builders = [
        line_ends,
        blanks,
        continuations,
        terminators,
        integers,
        integer_exponents,
        hex_integers,
        reals,
        real_exponents,
        keywords,
        values,
        groups,
        operators,
        prototypes,
        plain_identifiers,
        perl_identifiers,
        special_identifiers,
        dollar_caret_identifiers,
        sigil_braces,
        strings,
        q_strings,
        regexes,
        m_regexes,
        s_regexes,
        y_regexes,
        tr_regexes,
        preprocessor,
        comments,
        self.unknown_operator_tb,
        invalids
    ]

    raw_tokens = Tokenizer(builders).tokenize(code, ['__END__'])
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    merged = Examiner.combine_adjacent_identical_tokens(merged, 'invalid')
    self.tokens = Examiner.combine_identifier_colon(
        merged,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment', 'line description'])
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    # The operator checks only run when at least one operator was counted.
    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        permitted_pairs = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, group_closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, operator_count, group_openers, permitted_pairs)

    self.calc_group_confidence(joined_tokens, group_separators)

    self.calc_operand_n_confidence(joined_tokens, ['number'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute confidence measures for it.

    The keyword, type and operator tables below look like TypeScript
    (``declare``, ``never``, ``===``, ``?.``, ``=>``).

    Args:
        code: the source text to examine; it is tokenized and also passed
            to ``calc_line_length_confidence``.
    """
    super().__init__()

    # Token groups handed to the 4-operand confidence check.
    operand_types = ['number', 'identifier', 'string', 'type', 'value']

    blanks = WhitespaceTokenBuilder()
    line_ends = NewlineTokenBuilder()

    # Numeric literals, including prefixed hex/octal/binary constants.
    integers = IntegerTokenBuilder(None)
    integer_exponents = IntegerExponentTokenBuilder(None)
    reals = RealTokenBuilder(False, False, None)
    real_exponents = RealExponentTokenBuilder(False, False, 'E', None)
    hex_constants = PrefixedIntegerTokenBuilder(
        '0H', False, '0123456789ABCDEFabcdef')
    octal_constants = PrefixedIntegerTokenBuilder('0O', False, '01234567')
    binary_constants = PrefixedIntegerTokenBuilder('0B', False, '01')

    identifiers = IdentifierTokenBuilder('_$', '_$')

    strings = EscapedStringTokenBuilder(['"', "'", "’"], 0)

    line_comments = SlashSlashCommentTokenBuilder()
    block_comments = SlashStarCommentTokenBuilder()

    terminators = CaseInsensitiveListTokenBuilder(
        [';'], 'statement terminator', False)

    operator_texts = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '=>',
        '^',
        '.',
        ':',
        '++', '--', '&&', '||',
        '?', '$', '?.',
        'new', 'delete'
    ]
    operators = CaseSensitiveListTokenBuilder(
        operator_texts, 'operator', False)

    self.unary_operators = [
        '+', '-',
        '!', '~',
        '++', '--', ':',
        '$',
        'new', 'delete'
    ]
    self.postfix_operators = ['++', '--', ':']

    # Grouping tokens and the subsets used by the operator/group checks.
    group_texts = ['(', ')', ',', '[', ']', '{', '}']
    group_openers = ['(', '[', ',', '{']
    group_separators = [',']
    group_closers = [')', ']', '}']
    groups = CaseSensitiveListTokenBuilder(group_texts, 'group', False)

    regexes = RegexTokenBuilder()

    keyword_texts = [
        'break', 'case', 'catch', 'class', 'const', 'continue', 'debugger',
        'default', 'do', 'else', 'enum', 'export', 'extends', 'finally',
        'for', 'function', 'if', 'import', 'in', 'instanceof', 'return',
        'switch', 'throw', 'try', 'typeof', 'while', 'with',
        'as', 'implements', 'interface', 'let', 'package', 'private',
        'protected', 'public', 'static', 'yield',
        'constructor', 'declare', 'get', 'module', 'require', 'set', 'type',
        'from', 'of'
    ]
    keywords = CaseSensitiveListTokenBuilder(keyword_texts, 'keyword', False)

    type_texts = [
        'any', 'boolean', 'byte', 'char', 'number', 'string', 'symbol',
        'void', 'never', 'object'
    ]
    types = CaseSensitiveListTokenBuilder(type_texts, 'type', True)

    value_texts = ['this', 'super', 'null', 'true', 'false', 'undefined']
    values = CaseSensitiveListTokenBuilder(value_texts, 'value', True)

    invalids = InvalidTokenBuilder()

    # Builder order is preserved exactly from the original list.
    builders = [
        line_ends,
        blanks,
        terminators,
        integers,
        integer_exponents,
        reals,
        real_exponents,
        hex_constants,
        octal_constants,
        binary_constants,
        keywords,
        types,
        values,
        operators,
        groups,
        regexes,
        identifiers,
        strings,
        line_comments,
        block_comments,
        self.unknown_operator_tb,
        invalids
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        merged, 'invalid')
    self.convert_keywords_to_identifiers(['.'])

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    # The operator checks only run when at least one operator was counted.
    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        permitted_pairs = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, group_closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, operator_count, group_openers, permitted_pairs)

    self.calc_group_confidence(joined_tokens, group_separators)

    self.calc_operand_n_confidence(
        joined_tokens, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute confidence measures for it.

    The keyword, type and operator tables below look like Java
    (``strictfp``, ``synchronized``, ``>>>=``, ``StringBuilder``).

    Args:
        code: the source text to examine; it is tokenized and also passed
            to ``calc_line_length_confidence``.
    """
    super().__init__()

    # Token groups handed to the 4-operand confidence check.
    operand_types = [
        'number', 'identifier', 'string', 'class', 'type', 'value'
    ]

    blanks = WhitespaceTokenBuilder()
    line_ends = NewlineTokenBuilder()

    # Numeric literals.
    integers = IntegerTokenBuilder('_')
    integer_exponents = IntegerExponentTokenBuilder('_')
    reals = RealTokenBuilder(False, False, '_')
    real_exponents = RealExponentTokenBuilder(False, False, 'E', '_')

    identifiers = IdentifierTokenBuilder('_', '_')
    decorators = PrefixedIdentifierTokenBuilder('@', 'decorator', False)

    strings = EscapedStringTokenBuilder(['"', "'", "’"], 0)

    class_types = ClassTypeTokenBuilder()

    line_comments = SlashSlashCommentTokenBuilder()
    block_comments = SlashStarCommentTokenBuilder()

    terminators = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operator_texts = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '>>>', '>>>=',
        '^',
        '.', '::',
        '++', '--', '&&', '||',
        '?', '->',
        'new'
    ]
    operators = CaseSensitiveListTokenBuilder(
        operator_texts, 'operator', False)

    self.unary_operators = ['+', '-', '!', '~', '++', '--', 'new']
    self.postfix_operators = ['++', '--']

    # Grouping tokens and the subsets used by the operator/group checks.
    group_texts = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_openers = ['(', '[', ',', '{']
    group_closers = [')', ']', '}']
    group_separators = [',', ':']
    groups = CaseInsensitiveListTokenBuilder(group_texts, 'group', False)

    keyword_texts = [
        'abstract', 'assert', 'break', 'case', 'catch', 'class', 'const',
        'continue', 'default', 'do', 'else', 'enum', 'extends', 'final',
        'finally', 'for', 'goto', 'if', 'implements', 'import', 'instanceof',
        'interface', 'native', 'package', 'private', 'protected', 'public',
        'return', 'static', 'strictfp', 'super', 'switch', 'synchronized',
        'throw', 'throws', 'transient', 'try', 'volatile', 'while'
    ]
    keywords = CaseSensitiveListTokenBuilder(keyword_texts, 'keyword', False)

    type_texts = [
        'boolean', 'byte', 'char', 'double', 'float', 'int', 'long', 'short',
        'string', 'void',
        'Integer', 'String', 'StringBuilder', 'File', 'Exception',
        'IOException'
    ]
    types = CaseSensitiveListTokenBuilder(type_texts, 'type', True)

    value_texts = ['false', 'null', 'this', 'true']
    values = CaseSensitiveListTokenBuilder(value_texts, 'value', True)

    invalids = InvalidTokenBuilder()

    # Builder order is preserved exactly from the original list.
    builders = [
        line_ends,
        blanks,
        terminators,
        integers,
        integer_exponents,
        reals,
        real_exponents,
        keywords,
        types,
        values,
        operators,
        groups,
        identifiers,
        class_types,
        decorators,
        strings,
        line_comments,
        block_comments,
        self.unknown_operator_tb,
        invalids
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    merged = Examiner.combine_adjacent_identical_tokens(merged, 'invalid')
    self.tokens = Examiner.combine_identifier_colon(
        merged,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.convert_identifiers_to_labels()
    self.convert_keywords_to_identifiers(['::', '.'])
    self.convert_operators_to_identifiers(['::', '.'])

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    # The operator checks only run when at least one operator was counted.
    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        permitted_pairs = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, group_closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, operator_count, group_openers, permitted_pairs)

    self.calc_group_confidence(joined_tokens, group_separators)

    self.calc_operand_n_confidence(
        joined_tokens, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute confidence measures for it.

    The keyword, type and operator tables below look like Modula-2
    (``DEFINITION``, ``IMPLEMENTATION``, ``CARDINAL``, ``(* ... *)``
    comments).

    Args:
        code: the source text to examine. It is first passed through
            ``TrimCtrlZText``; the result is tokenized and also passed to
            ``calc_line_length_confidence``.
    """
    super().__init__()

    # Ctrl-Z (SUB, 0x1A) written as an escape so the character cannot be
    # silently dropped by editors or text tooling.
    ctrlz_char = '\x1a'
    code = self.TrimCtrlZText(code, ctrlz_char)

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()
    stmt_separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(True, True, None)
    real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
    hex_constant_tb = SuffixedIntegerTokenBuilder(
        'H', True, '0123456789ABCDEFabcdef')
    octal_constant_tb = SuffixedIntegerTokenBuilder('C', True, '01234567')
    binary_constant_tb = SuffixedIntegerTokenBuilder('B', True, '01')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ["'", '"']
    string_tb = StringTokenBuilder(quotes, 0)
    operand_types.append('string')

    paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

    known_operators = [
        ':=', '=', '>', '>=', '<', '<=', '#', '<>',
        '+', '-', '*', '/', 'DIV', 'MOD',
        'AND', 'OR', 'NOT', '^', '.', '..', 'IN', '&'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', 'NOT', '@', '^', '.']
    self.postfix_operators = ['^']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '|']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ':', '|']
    group_ends = [')', ']', '}']

    groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

    # 'RETURN' and 'UNTIL' are reserved words that were missing from this
    # table; 'UNTIL' in particular is the closer that the REPEAT/UNTIL
    # paired-blocker check below looks for.
    keywords = [
        'BEGIN', 'BY', 'CASE', 'CONST', 'DEFINITION', 'DO', 'ELSE', 'ELSIF',
        'END', 'EXCEPT', 'EXIT', 'EXPORT', 'FINALLY', 'FOR', 'FROM', 'IF',
        'IMPLEMENTATION', 'IMPORT', 'LOOP', 'MODULE', 'OF', 'PROCEDURE',
        'QUALIFIED', 'REPEAT', 'RETURN', 'THEN', 'TO', 'TYPE', 'UNTIL',
        'VAR', 'WITH', 'WHILE'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'ARRAY', 'BOOLEAN', 'CARDINAL', 'CHAR', 'INTEGER', 'POINTER',
        'REAL', 'RECORD', 'SET'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['FALSE', 'NIL', 'TRUE']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # The order of this list is kept exactly as it was.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        stmt_separator_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        hex_constant_tb,
        octal_constant_tb,
        binary_constant_tb,
        keyword_tb,
        types_tb,
        values_tb,
        known_operator_tb,
        groupers_tb,
        identifier_tb,
        string_tb,
        paren_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = tokens

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'identifier', 'variable']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(
        ['BEGIN', 'RECORD', 'CASE', 'DO', 'IF', 'WHILE'], ['END'])
    self.calc_paired_blockers_confidence(['REPEAT'], ['UNTIL'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, block_comment_limit):
    """Tokenize Rust source text and compute confidence measures for it.

    Args:
        code: the source text to examine; it is tokenized and also passed
            to ``calc_line_length_confidence``.
        block_comment_limit: passed as the third argument of the
            ``NestedCommentTokenBuilder`` that handles ``/* ... */``.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()
    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)

    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder('_')
    real_tb = RealTokenBuilder(False, True, '_')
    real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', '_')
    octal_integer_tb = PrefixedIntegerTokenBuilder('0o', True, '01234567_')
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', True, '0123456789ABCDEFabcdef_')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', True, '01_')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # Same identifier builder, but with a leading single quote.
    lifetime_tb = IdentifierTokenBuilder("'", extras)

    attribute_tb = RustAttributeTokenBuilder()

    quotes = ['"']
    string_tb = EscapedStringTokenBuilder(quotes, 10)
    bstring_tb = PrefixedStringTokenBuilder('b', True, quotes)
    rstring_tb = RustRawStringTokenBuilder()
    operand_types.append('string')

    char_tb = SingleCharStringTokenBuilder()

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = NestedCommentTokenBuilder(
        '/*', '*/', block_comment_limit)

    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # '|=' replaces the former '|-' entry, which is not a Rust operator;
    # it sat among the other compound assignments and left '|=' missing.
    known_operators = [
        '+', '-', '*', '/', '%', '^', '!', '&', '|',
        '&&', '||', '<<', '>>',
        '+=', '-=', '*=', '/=', '%=', '^=', '&=', '|=', '<<=', '>>=',
        '=', '==', '!=', '>', '<', '>=', '<=',
        '@', '.', '..', '...', '->', '#', '$', '?',
        'in', '&mut'
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '&mut']
    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::', '=>']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ':', '::', '=>']
    group_ends = [')', ']', '}', ')|']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # 'crate' and 'else' must be separate entries: without the comma
    # between them Python joins the adjacent literals into the single
    # string 'crateelse' and neither keyword is recognised.
    keywords = [
        'as', 'break', 'const', 'continue', 'crate', 'else', 'enum',
        'extern', 'fn', 'for', 'if', 'impl', 'let', 'loop', 'match', 'mod',
        'move', 'mut', 'pub', 'ref', 'return', 'static', 'struct', 'trait',
        'type', 'unsafe', 'use', 'where', 'while'
    ]

    keywords_2018 = ['dyn', 'union', 'static']

    keywords_future = [
        'abstract', 'become', 'box', 'do', 'final', 'macro', 'override',
        'priv', 'typeof', 'unsized', 'virtual', 'yield', 'async', 'await',
        'try'
    ]

    keywords += keywords_2018
    keywords += keywords_future

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'Self', 'u8', 'i8', 'u16', 'i16', 'u32', 'i32', 'u64', 'i64',
        'u128', 'i128', 'usize', 'isize', 'f32', 'f64'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['self', 'true', 'false', 'super', '_']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # The order of this list is kept exactly as it was.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        octal_integer_tb,
        hex_integer_tb,
        binary_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        char_tb,
        lifetime_tb,
        class_type_tb,
        attribute_tb,
        string_tb,
        bstring_tb,
        rstring_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = self.combine_numbers_and_adjacent_types(tokens)
    self.convert_operators_to_identifiers()
    self.convert_bars_to_groups()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Visual Basic source text and compute confidence measures.

    Builds the token builders (including the Visual Basic variable and
    remark builders), tokenizes ``code``, post-processes the token list
    and then runs the confidence checks.

    Args:
        code: the source text to examine; it is tokenized and also passed
            to ``calc_line_length_confidence``.
    """
    super().__init__()

    # Token groups handed to the 4-operand confidence check.
    operand_types = [
        'number', 'variable', 'identifier', 'string',
        'function', 'type', 'value'
    ]

    blanks = WhitespaceTokenBuilder()
    line_ends = NewlineTokenBuilder()
    continuations = SingleCharacterTokenBuilder(
        ['_'], 'line continuation', False)

    # Numeric literals.
    integers = IntegerTokenBuilder(None)
    integer_exponents = IntegerExponentTokenBuilder(None)
    reals = RealTokenBuilder(False, False, None)
    real_exponents = RealExponentTokenBuilder(False, False, 'E', None)

    variables = VisualBasicVariableTokenBuilder('$%#!')

    identifiers = SuffixedIdentifierTokenBuilder('_', '_', '$%#!')

    strings = EscapedStringTokenBuilder(['"'], 0)

    # Comments: remarks plus two lead-to-end-of-line quote styles.
    remarks = RemarkTokenBuilder()
    quote_comments = LeadToEndOfLineTokenBuilder("'", True, 'comment')
    curly_quote_comments = LeadToEndOfLineTokenBuilder("’", True, 'comment')

    operator_texts = [
        '+', '-', '*', '/', '\\', 'Mod', '^', '&',
        '>', '>=', '<', '<=', '<>', '=',
        'And', 'Or', 'Eqv', 'Is', 'Imp', 'Like', 'Not', 'Xor',
        '.'
    ]

    self.unary_operators = ['+', '-', 'Not']

    # Grouping tokens and the subsets used by the operator/group checks.
    group_texts = ['(', ')', ',', '[', ']']
    group_openers = ['(', '[', ',']
    group_separators = [',']
    group_closers = [')', ']']

    groups = CaseInsensitiveListTokenBuilder(group_texts, 'group', False)
    operators = CaseSensitiveListTokenBuilder(
        operator_texts, 'operator', False)

    keyword_texts = [
        'Access', 'Alias', 'Any', 'AppActivate', 'Append', 'AppendChunk',
        'Arrange', 'As',
        'Beep', 'BeginTrans', 'ByVal',
        'Call', 'Case', 'Circle', 'Clear', 'Close', 'Cls', 'CommitTrans',
        'Compare', 'Const', 'Controls', 'CreateDynaset',
        'Data', 'DateSerial', 'DateValue', 'Declare', 'DefCur', 'DefDbl',
        'DefInt', 'DefLng', 'DefSng', 'DefStr', 'DefVar', 'Delete', 'Dim',
        'Do', 'DoEvents', 'Drag',
        'Edit', 'Else', 'ElseIf', 'End', 'EndDoc', 'EndIf', 'Erase',
        'ExecuteSQL', 'Exit', 'Explicit',
        'FieldSize', 'FileAttr', 'FileCopy', 'FileDateTime', 'Fix', 'For',
        'Form', 'Format', 'Format$', 'Forms', 'Function',
        'Get', 'GetAttr', 'GetChunk', 'GetData', 'GetFormat', 'GetText',
        'Global', 'GoSub', 'GoTo',
        'Hide',
        'If', 'Input', 'Input$', 'InputBox', 'InputBox$',
        'Kill',
        'Let', 'Lib', 'Line', 'LinkExecute', 'LinkPoke', 'LinkRequest',
        'LinkSend', 'Load', 'LoadPicture', 'Loc', 'Local', 'Lock', 'LOF',
        'Loop', 'LSet',
        'MkDir', 'Move', 'MoveFirst', 'MoveLast', 'MoveNext',
        'MovePrevious', 'MoveRelative', 'MsgBox',
        'Name', 'New', 'NewPage', 'Next', 'NextBlock',
        'On', 'Open', 'OpenDataBase', 'Option', 'Output',
        'Point', 'Preserve', 'Print', 'PrintForm', 'Private', 'PSet', 'Put',
        'QBColor',
        'Random', 'Randomize', 'Read', 'ReDim', 'Refresh',
        'RegisterDataBase', 'Rem', 'RemoveItem', 'Reset', 'Restore',
        'Resume', 'Return', 'RmDir', 'Rollback', 'RSet',
        'SavePicture', 'Scale', 'Seek', 'Select', 'SendKeys', 'Set',
        'SetAttr', 'SetData', 'SetFocus', 'SetText', 'Shared', 'Shell',
        'Show', 'Static', 'Step', 'Stop', 'Sub', 'System',
        'Text', 'TextHeight', 'TextWidth', 'Then', 'Timer', 'TimeSerial',
        'TimeValue', 'To', 'Type', 'TypeOf',
        'Unload', 'Unlock', 'Until', 'Update', 'Using',
        'VarType',
        'Weekday', 'Wend', 'While', 'Width', 'Write',
        'ZOrder'
    ]
    keywords = CaseSensitiveListTokenBuilder(keyword_texts, 'keyword', False)

    function_texts = [
        'Abs', 'AddItem', 'AddNew', 'Asc', 'Atn',
        'CCur', 'CDbl', 'ChDir', 'ChDrive', 'Chr', 'Chr$', 'CInt', 'CLng',
        'Command', 'Command$', 'Cos', 'CSng', 'CStr', 'CurDir$', 'CVar',
        'CVDate',
        'Date', 'Date$', 'Day', 'Dir', 'Dir$',
        'Environ$', 'EOF', 'Error', 'Error$', 'Exp',
        'FileLen', 'FreeFile',
        'Hex', 'Hex$', 'Hour',
        'InStr', 'Int', 'InStrRev', 'IsDate', 'IsEmpty', 'IsNull',
        'IsNumeric',
        'Join',
        'LBound', 'LCase', 'LCase$', 'Left', 'Left$', 'Len', 'Log', 'LTrim',
        'LTrim$',
        'Mid', 'Mid$', 'Minute', 'Mod', 'Month',
        'Now',
        'Oct', 'Oct$',
        'RGB', 'Right', 'Right$', 'Rnd', 'RTrim', 'RTrim$',
        'Second', 'Sgn', 'Sin', 'Space', 'Space$', 'Spc', 'Split', 'Sqr',
        'Str', 'Str$', 'StrComp', 'String$',
        'Tab', 'Tan', 'Time', 'Time$', 'Trim', 'Trim$',
        'UBound', 'UCase', 'UCase$',
        'Val',
        'Year'
    ]
    functions = CaseSensitiveListTokenBuilder(
        function_texts, 'function', True)

    type_texts = [
        'Binary', 'Control', 'Currency', 'Double', 'Dynaset', 'Integer',
        'Long', 'Single', 'String', 'Variant'
    ]
    types = CaseSensitiveListTokenBuilder(type_texts, 'type', True)

    value_texts = [
        'False', 'True', 'App', 'Base', 'Clipboard', 'Debug', 'Erl', 'Err',
        'Printer', 'Me', 'Nothing', 'Null'
    ]
    values = CaseSensitiveListTokenBuilder(value_texts, 'value', True)

    invalids = InvalidTokenBuilder()

    # Builder order is preserved exactly from the original list.
    builders = [
        line_ends,
        blanks,
        continuations,
        integers,
        integer_exponents,
        reals,
        real_exponents,
        keywords,
        groups,
        operators,
        types,
        values,
        functions,
        variables,
        identifiers,
        strings,
        remarks,
        quote_comments,
        curly_quote_comments,
        self.unknown_operator_tb,
        invalids
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    merged = Examiner.combine_adjacent_identical_tokens(merged, 'invalid')
    self.tokens = Examiner.combine_identifier_colon(
        merged, ['newline'], [], ['whitespace', 'comment'])
    self.convert_identifiers_to_labels()
    self.convert_keywords_to_identifiers(['.'])

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    # The operator checks only run when at least one operator was counted.
    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        permitted_pairs = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, group_closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, operator_count, group_openers, permitted_pairs)

    self.calc_group_confidence(joined_tokens, group_separators)

    self.calc_operand_n_confidence(
        joined_tokens, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, block_comment_limit):
    """Tokenize ``code`` and compute the confidence scores for it.

    The keyword, type, value and operator tables below hold D-language
    spellings.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
        block_comment_limit: Forwarded unchanged to
            ``NestedCommentTokenBuilder`` for ``/+ ... +/`` comments.
    """
    super().__init__()

    # Token groups passed to the 4-operand calc_operand_n_confidence check.
    operand_types = [
        'number', 'identifier', 'attribute', 'string', 'class', 'type',
        'value'
    ]

    layout_builders = [
        NewlineTokenBuilder(),
        WhitespaceTokenBuilder(),
        SingleCharacterTokenBuilder('\\', 'line continuation', False),
        SingleCharacterTokenBuilder(';', 'statement terminator', False)
    ]

    number_builders = [
        IntegerTokenBuilder("'"),
        IntegerExponentTokenBuilder("'"),
        PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF_'),
        PrefixedIntegerTokenBuilder('0b', False, '01_'),
        SuffixedIntegerTokenBuilder(['U', 'L', 'LU', 'UL'], False, None),
        RealTokenBuilder(False, False, "'"),
        RealExponentTokenBuilder(False, False, 'E', "'"),
        SuffixedRealTokenBuilder(False, False, ['f', 'l', 'i'], False, None),
        HexRealExponentTokenBuilder()
    ]

    name_builders = [
        IdentifierTokenBuilder('_', '_'),
        PrefixedIdentifierTokenBuilder('@', 'attribute', False),
        ClassTypeTokenBuilder()
    ]

    # string suffix: c,w,d
    quotes = ['"', "'", "’"]
    string_builders = [
        EscapedStringTokenBuilder(quotes, 0),
        PrefixedStringTokenBuilder('r', True, quotes),
        PrefixedStringTokenBuilder('x', True, quotes),
        EscapedStringTokenBuilder(['`'], 0),
        PrefixedStringTokenBuilder('q', True, quotes),
        # q{} string
        SuffixedStringTokenBuilder(quotes, 'cwd', False)
    ]

    comment_builders = [
        SlashSlashCommentTokenBuilder(),
        SlashStarCommentTokenBuilder(),
        NestedCommentTokenBuilder('/+', '+/', block_comment_limit)
    ]

    operator_words = [
        '/', '/=', '.', '..', '...', '&', '&=', '&&', '|', '|=', '||',
        '-', '-=', '--', '+', '+=', '++', '<', '<=', '<<', '<<=',
        '>', '>=', '>>=', '>>>=', '>>', '>>>', '!', '!=',
        '?', ',', ':', '$', '=', '==', '*', '*=', '%', '%=',
        '^', '^=', '^^', '^^=', '~', '~=', '@', '=>', '#',
        'new', 'delete', 'typeof', 'is'
    ]

    self.unary_operators = [
        '+', '-', '*',
        '!', '&', '~',
        '++', '--', ':',
        'new', 'delete', 'typeof', 'is'
    ]

    self.postfix_operators = ['++', '--', '&', ':']

    group_words = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    keyword_words = [
        'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
        'body', 'break', 'case', 'cast', 'catch', 'class', 'const',
        'continue',
        'debug', 'default', 'delegate', 'deprecated', 'do',
        'else', 'enum', 'export', 'extern',
        'final', 'finally', 'for', 'foreach', 'foreach_reverse', 'function',
        'goto',
        'if', 'immutable', 'import', 'in', 'inout', 'interface', 'invariant',
        'lazy',
        'macro', 'mixin', 'module',
        'nothrow',
        'out', 'override',
        'package', 'pragma', 'private', 'protected', 'public', 'pure',
        'ref', 'return',
        'scope', 'shared', 'static', 'struct', 'switch', 'synchronized',
        'template', 'throw', 'try', 'typeid',
        'union', 'unittest', 'version', 'while', 'with',
        '__gshared', '__traits', '__vector', '__parameters'
    ]

    type_words = [
        'bool', 'byte',
        'cdouble', 'cent', 'cfloat', 'char', 'creal',
        'dchar', 'double', 'float',
        'idouble', 'ifloat', 'int', 'ireal',
        'long', 'real', 'short',
        'ubyte', 'ucent', 'uint', 'ulong', 'ushort',
        'void', 'wchar'
    ]

    value_words = [
        'false', 'null', 'super', 'this', 'true',
        '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
        '__FUNCTION__', '__PRETTY_FUNCTION__',
        '__DATE__', '__EOF__', '__TIME__', '__TIMESTAMP__',
        '__VENDOR__', '__VERSION__'
    ]

    word_builders = [
        CaseSensitiveListTokenBuilder(keyword_words, 'keyword', False),
        CaseSensitiveListTokenBuilder(type_words, 'type', True),
        CaseSensitiveListTokenBuilder(value_words, 'value', True),
        CaseInsensitiveListTokenBuilder(group_words, 'group', False),
        CaseSensitiveListTokenBuilder(operator_words, 'operator', False)
    ]

    # The relative order of the builders is the same as in the flat list
    # this replaces.
    tokenbuilders = [
        *layout_builders,
        *number_builders,
        *word_builders,
        *name_builders,
        *string_builders,
        *comment_builders,
        self.unknown_operator_tb,
        InvalidTokenBuilder()
    ]

    stream = Tokenizer(tokenbuilders).tokenize(code)
    for group in ('invalid operator', 'invalid'):
        stream = Examiner.combine_adjacent_identical_tokens(stream, group)
    stream = Examiner.combine_identifier_colon(
        stream,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = stream
    self.convert_identifiers_to_labels()

    # Fold literal suffixes that were tokenized as identifiers into the
    # preceding number or string token.  The local ``stream`` (not
    # self.tokens) is the input here, as in the code this replaces.
    number_suffixes = [
        'f', 'F', 'i', 'I', 'u', 'U', 'l', 'L',
        'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU'
    ]
    stream = self.combine_tokens_and_adjacent_types(
        stream, 'number', 'identifier', number_suffixes)

    string_suffixes = ['c', 'w', 'd']
    self.tokens = self.combine_tokens_and_adjacent_types(
        stream, 'string', 'identifier', string_suffixes)

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the guard is assumed to cover every call that takes the
    # operator count -- confirm against the canonical formatting.
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(joined, ['number', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute the confidence scores for it.

    The keyword, function, type and operator tables below hold
    Visual Basic .NET spellings.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    # Token groups passed to the 4-operand calc_operand_n_confidence check.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()
    line_continuation_tb = SingleCharacterTokenBuilder(
        ['_'], 'line continuation', False)

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    operand_types.append('number')

    variable_tb = VisualBasicVariableTokenBuilder('$%#!')
    operand_types.append('variable')

    leads = '_'
    extras = '_'
    suffixes = '$%#!'
    identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
    operand_types.append('identifier')

    quotes = ['"']
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    remark_tb = RemarkTokenBuilder()
    comment_tb = LeadToEndOfLineTokenBuilder("'", True, 'comment')
    comment2_tb = LeadToEndOfLineTokenBuilder("’", True, 'comment')

    directives = [
        '#If', '#Else', '#ElseIf', '#End If',
        '#ExternalSource',
        '#Line',
        '#Region', '#End Region',
        '#Const'
    ]

    preprocessor_tb = CaseSensitiveListTokenBuilder(
        directives, 'preprocessor', False)

    # NOTE(review): 'Not' is listed in unary_operators below but is absent
    # from this table, so it is never tokenized as an operator -- confirm
    # whether it should be added here.
    known_operators = [
        '&', '&=', '*', '*=', '/', '/=', '\\', '\\=', '^', '^=',
        '+', '+=', '-', '-=', '>>', '>>=', '<<', '<<=',
        '.', '=', '<', '<=', '>', '>=', '<>',
        'AddressOf', 'And', 'AndAlso', 'In', 'Is', 'IsNot', 'Like',
        'Or', 'OrElse', 'Xor'
    ]

    self.unary_operators = ['+', '-', 'Not', 'IsNot']

    groupers = ['(', ')', ',', '[', ']']
    group_starts = ['(', '[', ',']
    group_mids = [',']
    group_ends = [')', ']']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # 'Do' and 'Each' are separate entries.  A missing comma between them
    # previously made Python concatenate the adjacent literals into the
    # single bogus keyword 'DoEach', dropping both real keywords.
    keywords = [
        'AddHandler', 'Alias', 'As',
        'ByRef', 'ByVal',
        'Call', 'Case', 'Catch', 'Class', 'Const', 'Continue',
        'Declare', 'Default', 'Delegate', 'Dim', 'DirectCast', 'Do',
        'Each', 'Else', 'ElseIf', 'End', 'Enum', 'Erase', 'Error', 'Event',
        'Finally', 'For', 'For Each', 'Friend', 'Function',
        'Get', 'GetType', 'GetXMLNamespace', 'Global', 'GoSub', 'GoTo',
        'Handles',
        'If', 'Implements', 'Imports', 'Inherits', 'Interface',
        'Let', 'Lib', 'Loop',
        'Module', 'MustInherit', 'MustOverride',
        'Namespace', 'Narrowing', 'New Constraint', 'New Operator', 'Next',
        'NotInheritable', 'NotOverridable',
        'Of', 'On', 'Operator', 'Option', 'Optional', 'Out', 'Overloads',
        'Overridable', 'Overrides',
        'ParamArray', 'Partial', 'Private', 'Property', 'Protected',
        'Public',
        'RaiseEvent', 'ReadOnly', 'ReDim', 'REM', 'RemoveHandler', 'Resume',
        'Return',
        'Select', 'Set', 'Shadows', 'Shared', 'Static', 'Step', 'Stop',
        'Structure', 'Sub', 'SyncLock',
        'Then', 'Throw', 'To', 'Try', 'TryCast', 'TypeOf',
        'Using',
        'Wend', 'When', 'While', 'Widening', 'With', 'WithEvents',
        'WriteOnly'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    functions = [
        'Asc', 'AscW', 'Chr', 'ChrW', 'Filter', 'Format', 'GetChar',
        'InStr', 'InStrRev', 'Join', 'LCase', 'Left', 'Len', 'LSet',
        'LTrim', 'Mid', 'Replace', 'Right', 'RSet', 'RTrim', 'Space',
        'Split', 'StrComp', 'StrConv', 'StrDup', 'StrReverse', 'Trim',
        'UCase'
    ]

    function_tb = CaseSensitiveListTokenBuilder(functions, 'function', True)

    types = [
        'Boolean', 'Byte',
        'CBool', 'CByte', 'CChar', 'CDate', 'CDbl', 'CDec', 'Char', 'CInt',
        'CLng', 'CObj', 'CSByte', 'CShort', 'CSng', 'CStr', 'CType',
        'CUInt', 'CULng', 'CUShort',
        'Date', 'Decimal', 'Double',
        'Integer', 'Long', 'Object',
        'SByte', 'Short', 'Single', 'String',
        'UInteger', 'ULong', 'UShort',
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['False', 'True', 'Nothing', 'MyBase', 'MyClass']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        groupers_tb,
        known_operator_tb,
        types_tb,
        values_tb,
        function_tb,
        variable_tb,
        identifier_tb,
        string_tb,
        remark_tb,
        comment_tb,
        comment2_tb,
        preprocessor_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)

    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens, ['newline'], [], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()
    self.convert_keywords_to_identifiers(['.'])
    self.convert_functions_to_identifiers()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the guard is assumed to cover every call that takes
    # num_operators -- confirm against the canonical formatting.
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute the confidence scores for it.

    The keyword, type, value and operator tables below hold Pascal
    spellings; the word lists are matched case-insensitively, the
    grouping characters case-sensitively.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    # Token groups passed to the 4-operand calc_operand_n_confidence check.
    operand_types = ['number', 'identifier', 'string', 'type', 'value']

    layout_builders = [
        NewlineTokenBuilder(),
        WhitespaceTokenBuilder(),
        SingleCharacterTokenBuilder(';', 'statement separator', False)
    ]

    number_builders = [
        IntegerTokenBuilder(None),
        IntegerExponentTokenBuilder(None),
        RealTokenBuilder(True, True, None),
        RealExponentTokenBuilder(True, True, 'E', None),
        PrefixedIntegerTokenBuilder('$', True, '0123456789ABCDEFabcdef'),
        PrefixedIntegerTokenBuilder('&', True, '01234567'),
        PrefixedIntegerTokenBuilder('%', True, '01'),
        PrefixedIntegerTokenBuilder('#', True, '0123456789')
    ]

    # '~' appears twice in this table; the duplicate is kept as found.
    operator_words = [
        '+', '-', '*', '/',
        '=', '<>', '>', '>=', '<', '<=',
        'and', 'or', 'not',
        '&', '|', '~', '<<', '>>',
        ':=', '^', '~', '@',
        '.', ':',
        '..',
        'div', 'mod', 'shl', 'shr', 'in'
    ]

    self.unary_operators = ['+', '-', 'not', '@', '^', '.']
    self.postfix_operators = ['^']

    group_words = ['(', ')', ',', '[', ']']
    group_starts = ['(', '[', ',']
    group_mids = [',']
    group_ends = [')', ']']

    keyword_words = [
        'begin', 'break',
        'case', 'const',
        'do', 'downto',
        'else', 'end',
        'for', 'forward', 'function',
        'goto',
        'if',
        'label',
        'of', 'otherwise',
        'packed', 'procedure', 'program',
        'repeat', 'reset',
        'then', 'to', 'type',
        'until', 'uses',
        'value', 'var',
        'while', 'with'
    ]

    type_words = [
        'array', 'boolean', 'char', 'file', 'integer', 'real', 'record',
        'set', 'string'
    ]

    value_words = ['false', 'nil', 'true']

    word_builders = [
        CaseInsensitiveListTokenBuilder(keyword_words, 'keyword', False),
        CaseInsensitiveListTokenBuilder(type_words, 'type', True),
        CaseInsensitiveListTokenBuilder(value_words, 'value', True),
        CaseInsensitiveListTokenBuilder(operator_words, 'operator', False),
        CaseSensitiveListTokenBuilder(group_words, 'group', False)
    ]

    text_builders = [
        IdentifierTokenBuilder('_', '_'),
        EscapedStringTokenBuilder(["'"], 0),
        BraceCommentTokenBuilder(),
        BlockTokenBuilder('(*', '*)', 'comment')
    ]

    # The relative order of the builders is the same as in the flat list
    # this replaces.
    tokenbuilders = [
        *layout_builders,
        *number_builders,
        *word_builders,
        *text_builders,
        self.unknown_operator_tb,
        InvalidTokenBuilder()
    ]

    stream = Tokenizer(tokenbuilders).tokenize(code)
    for group in ('invalid operator', 'invalid'):
        stream = Examiner.combine_adjacent_identical_tokens(stream, group)
    stream = self.combine_identifier_colon(
        stream,
        ['statement separator'],
        ['begin'],
        ['whitespace', 'comment', 'newline', 'line description'])
    self.tokens = stream
    self.convert_identifiers_to_labels()
    self.convert_identifiers_to_labels_2()

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the guard is assumed to cover every call that takes the
    # operator count -- confirm against the canonical formatting.
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(
        joined, ['number', 'string', 'identifier', 'variable'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['begin', 'record', 'case'], ['end'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` with the CBASIC builders and compute confidences.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    # Token groups passed to the 4-operand calc_operand_n_confidence check.
    operand_types = ['number', 'variable', 'string', 'function']

    layout_builders = [
        NewlineTokenBuilder(),
        WhitespaceTokenBuilder(),
        CBasicLineContinuationTokenBuilder(),
        SingleCharacterTokenBuilder(':', 'statement separator', False)
    ]

    number_builders = [
        IntegerTokenBuilder(None),
        IntegerExponentTokenBuilder(None),
        # The third argument is False here where the other numeric builders
        # receive None; kept exactly as found.
        RealTokenBuilder(False, False, False),
        RealExponentTokenBuilder(False, False, 'E', None),
        CBasicSuffixedIntegerTokenBuilder('0123456789ABCDEF', 'H'),
        CBasicSuffixedIntegerTokenBuilder('01', 'B')
    ]

    operator_words = [
        '+', '-', '*', '/', '^',
        '=', '>', '>=', '<', '<=', '<>',
        '#', 'NOT',
        'AND', 'EQ', 'GE', 'GT', 'LE', 'LT', 'NE', 'OR', 'XOR'
    ]

    self.unary_operators = ['+', '-', '#', 'NOT']

    group_words = ['(', ')', ',', ';']
    group_starts = ['(', ',']
    group_mids = [',']
    group_ends = [')']

    # A few entries ('MAT', 'LINE') occur twice; the list is kept as found.
    keyword_words = [
        'AS', 'BUFF', 'CALL', 'CHAIN', 'CLOSE', 'COMMON', 'CONSOLE',
        'CREATE',
        'DATA', 'DEF', 'DELETE', 'DIM', 'ELSE', 'END', 'FEND', 'FILE',
        'FOR',
        'GOSUB', 'GO', 'GOTO', 'IF', 'INITIALIZE', 'INPUT', 'INTEGER',
        'LET', 'LINE', 'LPRINTER', 'NEXT', 'ON', 'OPEN', 'OUT', 'POKE',
        'PRINT',
        'RANDOMIZE', 'READ', 'REM', 'REMARK', 'RENAME', 'RESTORE',
        'RETURN',
        'SAVEMEM', 'STEP', 'STOP', 'SUB', 'THEN', 'TO', 'USING',
        'WEND', 'WHILE', 'WIDTH',
        'GRAPHIC', 'MAT', 'FILL', 'MAT', 'MARKER', 'PLOT', 'CHARACTER',
        'HEIGHT', 'SET', 'ASK', 'COLOR', 'COUNT', 'JUSTIFY', 'LINE',
        'STYLE', 'TYPE', 'TEXT', 'ANGLE', 'BOUNDS', 'DEVICE', 'VIEWPORT',
        'WINDOW', 'BEAM', 'CLEAR', 'CLIP', 'POSITION'
    ]

    # 'TAB' occurs twice; the list is kept as found.
    function_words = [
        'ASC', 'CHR$', 'STR$', 'TAB', 'COMMAND$', 'CONCHAR%', 'CONSTAT%',
        'ATN', 'COS', 'SIN', 'TAN',
        'ABS', 'EXP', 'INT', 'FLOAT', 'LOG', 'RND', 'SGN', 'SQR',
        'LEFT$', 'LEN', 'MID$', 'RIGHT$', 'MATCH', 'VAL',
        'FRE', 'INP', 'INT%', 'PEEK', 'POS', 'TAB',
        'RECL', 'RECS', 'SADD', 'SIZE', 'UCASE$', 'VARPTR'
    ]

    directive_words = [
        '%LIST', '%NOLIST',
        '%PAGE', '%EJECT',
        '%INCLUDE', '%CHAIN'
    ]

    # The keyword list is shared by the keyword builder and the label
    # builder.
    language_builders = [
        CaseInsensitiveListTokenBuilder(keyword_words, 'keyword', False),
        CaseSensitiveListTokenBuilder(operator_words, 'operator', False),
        CaseInsensitiveListTokenBuilder(function_words, 'function', True),
        CBasicVariableTokenBuilder('%$'),
        CBasicLabelTokenBuilder(keyword_words),
        CaseInsensitiveListTokenBuilder(group_words, 'group', False),
        StuffedQuoteStringTokenBuilder(['"'], False),
        RemarkTokenBuilder(),
        LeadToEndOfLineTokenBuilder("'", False, 'comment'),
        LeadToEndOfLineTokenBuilder("’", False, 'comment'),
        CaseInsensitiveListTokenBuilder(directive_words, 'directive', False)
    ]

    # The relative order of the builders is the same as in the flat list
    # this replaces.
    tokenbuilders = [
        *layout_builders,
        *number_builders,
        *language_builders,
        self.unknown_operator_tb,
        InvalidTokenBuilder()
    ]

    stream = Tokenizer(tokenbuilders).tokenize(code)
    for group in ('invalid operator', 'invalid'):
        stream = Examiner.combine_adjacent_identical_tokens(stream, group)
    self.tokens = stream

    self.convert_numbers_to_line_numbers()

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the guard is assumed to cover every call that takes the
    # operator count -- confirm against the canonical formatting.
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(
        joined, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, year, extension):
    """Tokenize ``code`` with the COBOL builders and compute confidences.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
        year: ``None``, ``'2002'`` or ``'2014'``.  ``'2002'`` adds the 2002
            keyword and value tables; ``'2014'`` adds both the 2002 and
            the 2014 tables.
        extension: Vendor dialect name, compared case-insensitively.
            ``'acu'``, ``'ibm'`` and ``'gnu'`` each add their own keyword
            table; any other value, including ``None`` or an empty string,
            adds nothing.

    Raises:
        CodeStatException: If ``year`` is neither ``None`` nor one of the
            supported years.
    """
    super().__init__()

    if year is not None and year not in ['2002', '2014']:
        raise CodeStatException('Unknown year for language')

    # Only consumed by the operand checks, which are currently disabled
    # (see the commented-out calls near the end).
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, True, None)
    real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', None)
    identifier_tb = CobolIdentifierTokenBuilder()

    quotes = ['"', "'", "’"]
    string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
    n_string_tb = PrefixedStringTokenBuilder('N', False, quotes)
    nx_string_tb = PrefixedStringTokenBuilder('NX', False, quotes)

    picture_tb = PictureTokenBuilder()
    cr_picture_tb = CRPictureTokenBuilder()

    inline_comment_tb = LeadToEndOfLineTokenBuilder('*>', True, 'comment')
    star_comment_tb = AsteriskCommentTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(
        '.', 'statement terminator', False)

    known_operators = [
        'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE',
        '+', '-', '*', '/', '**',
        '=', '<>', '>', '>=', '<', '<=',
        'AND', 'OR', 'NOT',
        'B-AND', 'B-NOT', 'B-OR', 'B-XOR',
        ':'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', 'NOT']

    groupers = ['(', ')', ',']
    group_starts = ['(']
    group_mids = [',']
    # group_ends = [')']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    keywords = [
        'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
        'ALPHABET', 'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
        'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALSO', 'ALTER', 'ALTERNATE',
        'AND', 'ANY', 'APPLY', 'ARE', 'AREA', 'AREAS', 'ASCENDING',
        'ASSIGN', 'AT', 'AUTHOR',
        'BEFORE', 'BEGINNING', 'BELL', 'BINARY', 'BLOCK', 'BOTTOM', 'BY',
        'BYTE-LENGTH',
        'CALL', 'CANCEL', 'CBL', 'CD', 'CF', 'CH', 'CHARACTER',
        'CHARACTERS', 'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'CODE-SET',
        'COL', 'COLLATING', 'COLS', 'COLUMN', 'COMMA', 'COMMON',
        'COMMUNICATION', 'COMP', 'COMPUTATIONAL', 'COMPUTE',
        'CONFIGURATION', 'CONTAINS', 'CONTENT', 'CONTINUE', 'CONTROL',
        'CONTROLS', 'CONVERTING', 'COPY', 'CORR', 'CORRESPONDING', 'COUNT',
        'CURRENCY',
        'DATA', 'DATE', 'DATE-COMPILED', 'DATE-WRITTEN', 'DAY',
        'DAY-OF-WEEK', 'DE', 'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE',
        'DEBUG-NAME', 'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3',
        'DECIMAL-POINT', 'DECLARATIVES', 'DELETE', 'DELIMITED', 'DELIMITER',
        'DEPENDING', 'DESCENDING', 'DESTINATION', 'DISABLE', 'DIVIDE',
        'DIVISION', 'DOWN', 'DUPLICATES', 'DYNAMIC',
        'EGI', 'ELSE', 'EMI', 'ENABLE', 'END', 'END-ACCEPT', 'END-ADD',
        'END-CALL', 'END-COMPUTE', 'END-DELETE', 'END-DISPLAY',
        'END-DIVIDE', 'END-EVALUATE', 'END-EXEC', 'END-IF', 'END-MULTIPLY',
        'END-OF-PAGE', 'END-PERFORM', 'END-READ', 'END-RECEIVE',
        'END-RETURN', 'END-REWRITE', 'END-SEARCH', 'END-START',
        'END-STRING', 'END-SUBTRACT', 'END-UNSTRING', 'END-WRITE', 'ENTER',
        'ENVIRONMENT', 'EOL', 'EOP', 'EQUAL', 'ERROR', 'ESI', 'EVALUATE',
        'EVERY', 'EXCEPTION', 'EXEC', 'EXIT', 'EXTEND', 'EXTERNAL',
        'FD', 'FILE', 'FILE-CONTROL', 'FILLER', 'FINAL', 'FIRST', 'FOOTING',
        'FOR', 'FROM', 'FULL',
        'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
        'HEADING', 'HIGH-VALUE', 'HIGH-VALUES',
        'I-O', 'I-O-CONTROL', 'IDENTIFICATION', 'IF', 'IN', 'INDEX',
        'INDEXED', 'INDICATE', 'INITIAL', 'INITIALIZE', 'INITIATE', 'INPUT',
        'INPUT-OUTPUT', 'INSPECT', 'INSTALLATION', 'INTO', 'INVALID', 'IS',
        'JUST', 'JUSTIFIED',
        'KEY',
        'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH', 'LESS', 'LIMIT',
        'LIMITS', 'LINAGE', 'LINAGE-COUNTER', 'LINE', 'LINE-COUNTER',
        'LINES', 'LINKAGE', 'LOCK', 'LOW-VALUE', 'LOW-VALUES',
        'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES', 'MOVE', 'MULTIPLE',
        'MULTIPLY',
        'NATIVE', 'NEGATIVE', 'NEXT', 'NOT', 'NUMBER', 'NUMBERS', 'NUMERIC',
        'NUMERIC-EDITED',
        'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED', 'OPEN', 'OPTIONAL',
        'OR', 'ORDER', 'ORGANIZATION', 'OTHER', 'OUTPUT', 'OVERFLOW',
        'PACKED-DECIMAL', 'PADDING', 'PAGE', 'PAGE-COUNTER', 'PARAGRAPH',
        'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE', 'PLUS', 'POINTER',
        'POSITION', 'POSITIVE', 'PRINTING', 'PROCEDURE', 'PROCEDURES',
        'PROCEED', 'PROGRAM', 'PROGRAM-ID', 'PURGE',
        'QUEUE', 'QUOTE', 'QUOTES',
        'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS', 'REDEFINES',
        'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE', 'REMAINDER', 'REMOVAL',
        'RENAMES', 'REPLACE', 'REPLACING', 'REPORT', 'REPORTING', 'REPORTS',
        'RERUN', 'RESERVE', 'RESET', 'RESUME', 'RETRY', 'RETURN',
        'REVERSED', 'REWIND', 'REWRITE', 'RF', 'RH', 'RIGHT', 'ROUNDED',
        'RUN',
        'SAME', 'SD', 'SEARCH', 'SECTION', 'SECURE', 'SECURITY', 'SEGMENT',
        'SEGMENT-LIMIT', 'SELECT', 'SEND', 'SENTENCE', 'SEPARATE',
        'SEQUENCE', 'SEQUENTIAL', 'SET', 'SIGN', 'SIZE', 'SORT',
        'SORT-MERGE', 'SOURCE', 'SOURCE-COMPUTER', 'SPECIAL-NAMES',
        'STANDARD', 'STANDARD-1', 'STANDARD-2', 'START', 'STATUS', 'STOP',
        'STRING', 'SUB-QUEUE-1', 'SUB-QUEUE-2', 'SUB-QUEUE-3', 'SUBTRACT',
        'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC', 'SYNCHRONIZED',
        'TABLE', 'TALLY', 'TALLYING', 'TAPE', 'TERMINAL', 'TERMINATE',
        'TEST', 'TEXT', 'THAN', 'THEN', 'THROUGH', 'THRU', 'TIME', 'TIMES',
        'TITLE', 'TO', 'TOP', 'TRAILING', 'TYPE',
        'UNIT', 'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
        'VALUE', 'VALUES', 'VARYING',
        'WHEN', 'WITH', 'WORDS', 'WORKING-STORAGE', 'WRITE'
    ]

    keywords_2002 = [
        'ACTIVE-CLASS', 'ALIGNED', 'ALLOCATE', 'ANYCASE', 'ARITHMETIC',
        'AUTO', 'AUTOMATIC',
        'BACKGROUND-COLOR', 'BASED', 'BASIS', 'BINARY-CHAR',
        'BINARY-DOUBLE', 'BINARY-LONG', 'BINARY-SHORT', 'BIT', 'BLINK',
        'BOOLEAN',
        'CENTER', 'CLASS', 'CLASS-ID', 'CLASSIFICATION', 'COLUMNS',
        'COM-REG', 'CONDITION', 'CONSTANT', 'CRT', 'CURSOR', 'CYCLE',
        'DATA-POINTER', 'DBCS', 'DEBUGGING', 'DETAIL', 'DISPLAY',
        'DISPLAY-1', 'DISPLAY-OF',
        'EC', 'EGCS', 'EJECT', 'END-INVOKE', 'ENDING', 'ENTRY-CONVENTION',
        'ENTRY-FIELD', 'EO', 'EOS', 'ERASE', 'EXCEPTION-OBJECT',
        'EXCLUSIVE', 'EXPANDS', 'EXTERN',
        'FACTORY', 'FLOAT-EXTENDED', 'FLOAT-LONG', 'FLOAT-SHORT',
        'FOREGROUND-COLOR', 'FOREVER', 'FORMAT', 'FREE', 'FUNCTION',
        'FUNCTION-ID',
        'GET', 'GROUP-USAGE',
        'HIGHLIGHT',
        'IGNORING', 'IMPLEMENTS', 'INHERITS', 'INITIALIZED', 'INSERT',
        'INTERFACE', 'INTERFACE-ID', 'INTRINSIC', 'INVOKE',
        'KANJI',
        'LC_ALL', 'LC_COLLATE', 'LC_CTYPE', 'LC_MESSAGES', 'LC_MONEY',
        'LC_NUMERIC', 'LC_TIME', 'LOCAL-STORAGE', 'LOCALE', 'LOWLIGHT',
        'MANUAL', 'METACLASS', 'METHOD', 'METHOD-ID', 'MINUS',
        'MORE-LABELS',
        'NATIONAL', 'NATIONAL-EDITED', 'NATIONAL-OF', 'NATIVE_BINARY',
        'NESTED', 'NEW', 'NONE', 'NORMAL',
        'OBJECT', 'OBJECT-REFERENCE', 'ONLY', 'OPTIONS', 'OVERRIDE',
        'PHYSICAL', 'PRESENT', 'PREVIOUS', 'PROCEDURE-POINTER',
        'PROCESSING', 'PROGRAM-POINTER', 'PROPERTY', 'PROTOTYPE',
        'RAISE', 'RAISING', 'READY', 'RECURSIVE', 'REFERENCES', 'RELATION',
        'RELOAD', 'REPOSITORY', 'REQUIRED', 'RETURN-CODE', 'RETURNING',
        'ROUNDING',
        'SCREEN', 'SECONDS', 'SERVICE', 'SHARING', 'SHIFT-IN', 'SHIFT-OUT',
        'SIGNED', 'SKIP1', 'SKIP2', 'SKIP3', 'SORT-CONTROL',
        'SORT-CORE-SIZE', 'SORT-FILE-SIZE', 'SORT-MESSAGE',
        'SORT-MODE-SIZE', 'SORT-RETURN', 'SOURCES', 'STATEMENT', 'STEP',
        'STRONG', 'SYMBOL', 'SYSTEM-DEFAULT',
        'TRACE', 'TYPEDEF',
        'UCS-4', 'UNDERLINE', 'UNIVERSAL', 'UNLOCK', 'UNSIGNED',
        'USER-DEFAULT', 'UTF-16', 'UTF-8',
        'VAL-STATUS', 'VALID', 'VALIDATE', 'VALIDATE-STATUS',
        'WHEN-COMPILED', 'WRITE-ONLY',
        'YYYYDDD', 'YYYYMMDD',
    ]

    keywords_2014 = [
        'AWAY-FROM-ZERO',
        'NEAREST-AWAY-FROM-ZERO', 'NEAREST-EVEN', 'NEAREST-TOWARD-ZERO',
        'TOWARD-GREATER', 'TOWARD-LESSER',
        'CAPACITY',
        'FLOAT-BINARY-128', 'FLOAT-BINARY-32', 'FLOAT-BINARY-64',
        'FLOAT-DECIMAL-16', 'FLOAT-DECIMAL-34', 'FLOAT-INFINITY',
        'FLOAT-NOT-A-NUMBER',
        'FUNCTION-POINTER',
        'INTERMEDIATE',
        'PHYSICAL', 'PREFIXED', 'PROHIBITED',
        'SHORT', 'STANDARD-BINARY', 'STANDARD-DECIMAL',
        'TRUNCATION'
    ]

    keywords_ibm = ['ABSENT', 'ID', 'PASSWORD', 'UNBOUNDED']

    keywords_gnu = [
        'ARGUMENT-NUMBER', 'ARGUMENT-VALUE', 'ASCII',
        'BINARY-C-LONG', 'BINARY-SEQUENTIAL',
        'CARD-PUNCH', 'CARD-READER', 'CASSETTE', 'CHAIN', 'CHAINING',
        'COLOR', 'COMMAND-LINE', 'COMMIT', 'COMP-1', 'COMP-2', 'COMP-3',
        'COMP-4', 'COMP-5', 'COMP-6', 'COMP-X', 'COMPUTATIONAL-1',
        'COMPUTATIONAL-2', 'COMPUTATIONAL-3', 'COMPUTATIONAL-4',
        'COMPUTATIONAL-5', 'COMPUTATIONAL-6', 'COMPUTATIONAL-X',
        'CONVERSION', 'CRT-UNDER',
        'DISC', 'DISK',
        'EBCDIC', 'ECHO', 'END-CHAIN', 'ENTRY', 'ENVIRONMENT-NAME',
        'ENVIRONMENT-VALUE', 'ESCAPE',
        'F', 'FILE-ID', 'FIXED', 'FLOAT-DECIMAL-7',
        'ID', 'IGNORE',
        'KEPT', 'KEYBOARD',
        'LEFT-JUSTIFY', 'LEFTLINE', 'LINE-SEQUENTIAL', 'LOWER',
        'MAGNETIC-TAPE',
        'NAME', 'NO-ECHO', 'NOTHING',
        'OVERLINE',
        'PRINT', 'PRINTER', 'PRINTER-1', 'PROCEDURE-POINTER', 'PROCEDURES',
        'PROMPT', 'PROTECTED',
        'RECORDING', 'REVERSE', 'RIGHT-JUSTIFY', 'ROLLBACK',
        'S', 'SCROLL', 'SIGNED-INT', 'SIGNED-LONG', 'SIGNED-SHORT',
        'SPACE-FILL', 'STATIC', 'STDCALL', 'SYSTEM-OFFSET',
        'TAB', 'TIME-OUT', 'TRAILING-SIGN',
        'U', 'UNSIGNED-INT', 'UNSIGNED-LONG', 'UNSIGNED-SHORT', 'UPDATE',
        'UPPER', 'USER',
        'V', 'VARIABLE',
        'WAIT', 'WRAP',
        'ZERO-FILL'
    ]

    keywords_acu = [
        '3-D',
        'ACTION', 'ACTIVE-X', 'ADJUSTABLE-COLUMNS', 'ALIGNMENT',
        'AUTO-DECIMAL', 'AUTO-SPIN',
        'BACKGROUND-HIGH', 'BACKGROUND-LOW', 'BACKGROUND-STANDARD', 'BAR',
        'BITMAP', 'BITMAP-END', 'BITMAP-HANDLE', 'BITMAP-NUMBER',
        'BITMAP-START', 'BITMAP-TRAILING', 'BITMAP-TRANSPARENT-COLOR',
        'BITMAP-WIDTH', 'BOX', 'BOXED', 'BUSY', 'BUTTONS',
        'CALENDAR-FONT', 'CANCEL-BUTTON', 'CELL', 'CELL-COLOR',
        'CELL-DATA', 'CELL-FONT', 'CELL-PROTECTION', 'CENTERED-HEADING',
        'CENTURY-DATE', 'CHECK-BOX', 'CLEAR-SELECTION', 'CLINE', 'CLINES',
        'COLORS', 'COLUMN-COLOR', 'COLUMN-DIVIDERS', 'COLUMN-FONT',
        'COLUMN-HEADINGS', 'COLUMN-PROTECTION', 'COMBO-BOX',
        'COPY-SELECTION', 'CSIZE', 'CURSOR-COL', 'CURSOR-COLOR',
        'CURSOR-FRAME-WIDTH', 'CURSOR-ROW', 'CURSOR-X', 'CURSOR-Y',
        'CUSTOM-PRINT-TEMPLATE',
        'DASHED', 'DATA-COLUMNS', 'DATA-TYPES', 'DATE-ENTRY',
        'DEFAULT-BUTTON', 'DEFAULT-FONT', 'DESTROY', 'DISPLAY-COLUMNS',
        'DISPLAY-FORMAT', 'DOTDASH', 'DOTTED', 'DOUBLE', 'DRAG-COLOR',
        'DROP-DOWN', 'DROP-LIST',
        'END-COLOR', 'END-MODIFY', 'ENGRAVED', 'ENSURE-VISIBLE',
        'ENTRY-FIELD', 'ENTRY-REASON', 'ESCAPE-BUTTON', 'EVENT',
        'EVENT-LIST', 'EXCEPTION-VALUE', 'EXPAND', 'EXTERNAL-FORM',
        'FILE-NAME', 'FILE-POS', 'FILL-COLOR', 'FILL-COLOR-2',
        'FILL-PERCENT', 'FINISH-REASON', 'FIXED-FONT', 'FIXED-WIDTH',
        'FLAT', 'FLAT-BUTTONS', 'FLOAT', 'FLOATING', 'FONT', 'FRAME',
        'FRAMED', 'FULL-HEIGHT',
        'GRID', 'GO-BACK', 'GO-FORWARD', 'GO-HOME', 'GO-SEARCH',
        'GRAPHICAL', 'GRID', 'GROUP-VALUE',
        'HANDLE', 'HAS-CHILDREN', 'HEADING-COLOR',
        'HEADING-DIVIDER-COLOR', 'HEADING-FONT', 'HEAVY',
        'HEIGHT-IN-CELLS', 'HIDDEN-DATA', 'HIGH-COLOR', 'HOT-TRACK',
        'HSCROLL', 'HSCROLL-POS',
        'ICON', 'IDENTIFIED', 'INDEPENDENT', 'INQUIRE', 'INSERTION-INDEX',
        'INSERTION-ROWS', 'ITEM', 'ITEM-TEXT', 'ITEM-TO-ADD',
        'ITEM-TO-DELETE', 'ITEM-TO-EMPTY', 'ITEM-VALUE',
        'LABEL', 'LABEL-OFFSET', 'LARGE-FONT', 'LARGE-OFFSET', 'LAST-ROW',
        'LAYOUT-DATA', 'LAYOUT-MANAGER', 'LEADING-SHIFT', 'LEFT-TEXT',
        'LINES-AT-ROOT', 'LIST-BOX', 'LM-RESIZE', 'LONG-DATE', 'LOW-COLOR',
        'LOWERED',
        'MASS-UPDATE', 'MAX-LINES', 'MAX-PROGRESS', 'MAX-TEXT', 'MAX-VAL',
        'MEDIUM-FONT', 'MENU', 'MIN-VAL', 'MODIFY', 'MULTILINE',
        'NAVIGATE-URL', 'NEXT-ITEM', 'NO-AUTOSEL', 'NO-AUTO-DEFAULT',
        'NO-BOX', 'NO-DIVIDERS', 'NO-F4', 'NO-FOCUS', 'NO-GROUP-TAB',
        'NO-KEY-LETTER', 'NO-SEARCH', 'NO-UPDOWN', 'NOTAB', 'NOTIFY',
        'NOTIFY-CHANGE', 'NOTIFY-DBLCLICK', 'NOTIFY-SELCHANGE',
        'NUM-COL-HEADINGS', 'NUM-ROWS',
        'OK-BUTTON', 'OVERLAP-LEFT', 'OVERLAP-TOP',
        'PAGE-SETUP', 'PAGED', 'PARENT', 'PERMANENT', 'PIXEL', 'PLACEMENT',
        'POP-UP', 'POSITION-SHIFT', 'PRINT-NO-PROMPT', 'PRINT-PREVIEW',
        'PRIORITY', 'PROGRESS', 'PROPERTIES', 'PROPERTY', 'PUSH-BUTTON',
        'QUERY-INDEX',
        'RADIO-BUTTON', 'RAISED', 'READ-ONLY', 'RECORD-DATA',
        'RECORD-TO-ADD', 'RECORD-TO-DELETE', 'REFRESH', 'REGION-COLOR',
        'RESET-GRID', 'RESET-LIST', 'RESET-TABS', 'RIGHT-ALIGN', 'RIMMED',
        'ROW-COLOR', 'ROW-COLOR-PATTERN', 'ROW-DIVIDERS', 'ROW-FONT',
        'ROW-HEADINGS', 'ROW-PROTECTION',
        'SAVE-AS', 'SAVE-AS-NO-PROMPT', 'SCROLL-BAR', 'SEARCH-OPTIONS',
        'SEARCH-TEXT', 'SELECT-ALL', 'SELECTION-INDEX', 'SELECTION-TEXT',
        'SELF-ACT', 'SEPARATION', 'SHADING', 'SHADOW', 'SHORT-DATE',
        'SHOW-LINES', 'SHOW-NONE', 'SHOW-SEL-ALWAYS', 'SMALL-FONT',
        'SORT-ORDER', 'SPINNER', 'SQUARE', 'START-X', 'START-Y',
        'STATIC-LIST', 'STATUS-BAR', 'STATUS-TEXT', 'STYLE', 'SUBWINDOW',
        'TAB-TO-ADD', 'TAB-TO-DELETE', 'TEMPORARY', 'TERMINATION-VALUE',
        'THREAD', 'THREADS', 'THUMB-POSITION', 'TILED-HEADINGS', 'TITLE',
        'TITLE-POSITION', 'TRADITIONAL-FONT', 'TRAILING-SHIFT',
        'TRANSPARENT', 'TREE-VIEW',
        'UNFRAMED', 'UNSORTED', 'USE-ALT', 'USE-RETURN', 'USE TAB',
        'VALUE-FORMAT', 'VARIANT', 'VERTICAL', 'VERY-HEAVY',
        'VIRTUAL-WIDTH', 'VPADDING', 'VSCROLL', 'VSCROLL-BAR',
        'VSCROLL-POS', 'VTOP',
        'WEB-BROWSER', 'WIDTH', 'WIDTH-IN-CELLS', 'WINDOW',
        'X', 'Y'
    ]

    if year in ['2002', '2014']:
        keywords += keywords_2002

    if year == '2014':
        keywords += keywords_2014

    # Normalise the dialect name once.  ``year`` is allowed to be None, so
    # a missing extension is tolerated too instead of failing on
    # ``None.lower()``.
    dialect = extension.lower() if extension else ''
    vendor_keywords = {
        'acu': keywords_acu,
        'ibm': keywords_ibm,
        'gnu': keywords_gnu
    }
    keywords += vendor_keywords.get(dialect, [])

    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    values = [
        'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS',
        'FALSE', 'NO', 'OFF', 'ON', 'TRUE'
    ]

    values_2002 = ['NULL', 'NULLS', 'SELF', 'SUPER']

    if year in ['2002', '2014']:
        values += values_2002

    value_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    cobol_preprocessor_tb = CobolPreprocessorTokenBuilder()

    exec_tb = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        picture_tb,
        cr_picture_tb,
        keyword_tb,
        star_comment_tb,  # before operator, to catch single star as comment
        known_operator_tb,
        groupers_tb,
        value_tb,
        identifier_tb,
        string_tb,
        n_string_tb,
        nx_string_tb,
        inline_comment_tb,
        cobol_preprocessor_tb,
        exec_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)

    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')

    self.convert_numbers_to_pictures()
    self.convert_numbers_to_levels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the guard is assumed to cover every call that takes
    # num_operators -- confirm against the canonical formatting.
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        # self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # self.calc_operand_n_confidence(tokens, operand_types, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
    self.calc_picture_confidence()

    expected_keyword_confidence = self.check_expected_keywords()
    self.confidences['expected_keywords'] = expected_keyword_confidence
def __init__(self, code):
    """Tokenize ``code`` and compute this examiner's confidence metrics.

    The keyword and operator tables below look like Scala's (the class
    header is not visible from here -- confirm against the class name).

    Args:
        code: Source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    # Token groups treated as operands by the operand-adjacency checks.
    operand_types = ['number', 'identifier', 'symbol', 'string', 'value']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # Numeric literal builders, in the order the tokenizer receives them.
    number_builders = [
        IntegerTokenBuilder("'"),
        IntegerExponentTokenBuilder("'"),
        PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_'),
        SuffixedIntegerTokenBuilder('L', False, None),
        RealTokenBuilder(False, False, "'"),
        RealExponentTokenBuilder(False, False, 'E', "'"),
        SuffixedRealTokenBuilder(False, False, ['f'], False, None),
    ]

    identifier_builder = IdentifierTokenBuilder('_', '_')
    symbol_builder = PrefixedIdentifierTokenBuilder("'", 'symbol', True)

    string_quotes = ['"']
    string_builder = EscapedStringTokenBuilder(string_quotes, 0)
    triple_string_builder = TripleQuoteStringTokenBuilder(string_quotes)

    line_comment_builder = SlashSlashCommentTokenBuilder()
    block_comment_builder = SlashStarCommentTokenBuilder()

    continuation_builder = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operator_names = [
        '+', '-', '*', '/', '%',
        '&', '|', '^',
        '<<', '>>', '&&', '||',
        '=', '+=', '-=', '*=', '/=', '%=',
        '&=', '|=', '^=', '<<=', '>>=',
        '>:', '⇒', '=>', '=',
        '<%', '<:', '←', '<-', '#', '@',
        '==', '!=', '>', '<', '>=', '<=',
        '!', '~',
        '<<<', '>>>',
        '.',
        '++', '--',
        'new'
    ]

    self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']
    self.postfix_operators = ['++', '--']

    group_tokens = ['(', ')', ',', '[', ']', '{', '}', ':']
    opening_groups = ['(', '[', ',', '{']
    middle_groups = [',', ':']
    closing_groups = [')', ']', '}']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_tokens, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_names, 'operator', False)

    keyword_names = [
        'abstract', 'case', 'catch', 'class', 'def',
        'do', 'else', 'extends', 'final', 'finally',
        'for', 'forSome', 'if', 'implicit', 'import',
        'lazy', 'match', 'object', 'override', 'package',
        'private', 'protected', 'return', 'sealed', 'then',
        'throw', 'trait', 'try', 'type', 'val',
        'var', 'while', 'with', 'yield'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_names, 'keyword', False)

    value_builder = CaseSensitiveListTokenBuilder(
        ['false', 'true', 'null', 'this', 'super'], 'value', True)

    invalid_builder = InvalidTokenBuilder()

    # Builder order is preserved exactly as the tokenizer originally got it.
    builders = [nl_builder, ws_builder, continuation_builder,
                terminator_builder]
    builders += number_builders
    builders += [
        keyword_builder,
        value_builder,
        group_builder,
        operator_builder,
        identifier_builder,
        symbol_builder,
        string_builder,
        triple_string_builder,
        line_comment_builder,
        block_comment_builder,
        self.unknown_operator_tb,
        invalid_builder,
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        merged, 'invalid')

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allowed_pairs = []
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, allowed_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, closing_groups, allowed_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, operator_count, opening_groups, allowed_pairs)

    self.calc_group_confidence(joined_tokens, middle_groups)

    self.calc_operand_n_confidence(joined_tokens, ['number', 'symbol'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, version):
    """Tokenize ``code`` as MATLAB (or Octave) and compute confidences.

    Args:
        code: Source text handed to the tokenizer and to the
            line-length check.
        version: Dialect selector. When equal to ``'octave'`` the extra
            Octave operators, keywords and ``#``-style comment builders
            are enabled; any other value uses the base tables only.
    """
    super().__init__()

    is_octave = version == 'octave'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    command_tb = PrefixedIdentifierTokenBuilder('!', 'command', False)
    metaclass_tb = PrefixedIdentifierTokenBuilder('?', 'metaclass', False)

    quotes = ['"', "'", "’"]
    string_tb = MatlabStringTokenBuilder(quotes, False)
    operand_types.append('string')

    # '%' comments are always recognized; '#' comments are Octave-only.
    line_comment_m_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')
    line_comment_o_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')
    block_comment_m_tb = BlockTokenBuilder('%{', '%}', 'comment')
    block_comment_o_tb = BlockTokenBuilder('#{', '#}', 'comment')

    line_continuation_tb = KeywordTokenBuilder('...', 'line continuation')

    known_operators = [
        '+', '-', '.*', '*', './', '/', '\\', '.^', '^', ".'", "'",
        '=', '==', '~=', '>', '>=', '<', '<=',
        '&', '|', '&&', '||', '~',
        '@', '.', '.?'
    ]

    operators_octave = [
        '++', '--', '+=', '-=', '*=', '/=', '^=', '!', '!=', '**'
    ]

    if is_octave:
        known_operators += operators_octave

    self.unary_operators = ['+', '-', '~', '@']
    self.postfix_operators = ["'"]

    groupers = ['(', ')', ',', '[', ']', '{', '}', ';', ':']
    group_starts = ['(', '[', ',', '{']
    # No group_mids list: the group-confidence check is not run here.
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'break', 'case', 'catch', 'classdef', 'continue',
        'else', 'elseif', 'end', 'for', 'function', 'global',
        'if', 'otherwise', 'parfor', 'persistent',
        'return', 'spmd', 'switch', 'try', 'while'
    ]

    keywords_octave = ['endfor', 'endif', 'endwhile']

    if is_octave:
        keywords += keywords_octave

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    # The list is matched case-sensitively, so it must carry the real
    # spellings.  The previous entry 'Nan' is not a MATLAB/Octave name and
    # therefore never matched; the accepted spellings are Inf/inf and
    # NaN/nan.
    values = ['inf', 'Inf', 'nan', 'NaN']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        command_tb,
        metaclass_tb,
        string_tb,
        line_comment_m_tb,
        block_comment_m_tb
    ]

    if is_octave:
        tokenbuilders += [line_comment_o_tb, block_comment_o_tb]

    # The catch-all builders always go last.
    tokenbuilders += [self.unknown_operator_tb, invalid_token_builder]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    # Only the 4-operand adjacency check is run for this examiner.
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, tab_size, wide):
    """Tokenize ``code`` as free-format and fixed-format; keep the better.

    The source is tokenized twice: once with a plain free-format tokenizer
    and once through ``self.tokenize_code`` with the fixed-format
    tokenizers.  Each pass is scored with the same confidence checks, the
    per-check confidences are multiplied together, and the tokens,
    statistics, confidences and errors of the higher-scoring pass are
    stored on the instance.  On a tie the free-format result is kept.

    Args:
        code: Source text to examine.
        tab_size: Forwarded unchanged to ``self.tokenize_code``.
        wide: Forwarded unchanged to ``self.tokenize_code``.
    """
    super().__init__()

    self.operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False, None)
    hex_integer_tb = SuffixedIntegerTokenBuilder(['H'], False, 'ABCDEF')
    octal_integer_tb = SuffixedIntegerTokenBuilder(['O'], False, None)
    decimal_integer_tb = SuffixedIntegerTokenBuilder(['D'], False, None)
    real_tb = RealTokenBuilder(True, False, None)
    real_exponent_tb = RealExponentTokenBuilder(True, False, 'E', None)
    binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'], False, None)
    self.operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    self.operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    self.operand_types.append('string')

    label_tb = PL1LabelTokenBuilder()
    self.operand_types.append('label')

    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    directives = [
        '%ACTIVATE',
        '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY', '%DO',
        '%ELSE', '%END',
        '%FATAL',
        '%GOTO',
        '%IF', '%INCLUDE',
        '%LIST',
        '%NOLIST',
        '%PAGE', '%PROCEDURE', '%PROC',
        '%REPLACE', '%RETURN',
        '%THEN'
    ]

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    preprocessor_tb = CaseInsensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True, 'preprocessor')
    subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True, 'preprocessor')
    error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True, 'preprocessor')
    warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True, 'preprocessor')
    inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True, 'preprocessor')
    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '**',
        '>', '<', '=', '>=', '<=', '<>',
        '^>', '^<', '^=',
        '^', '~>', '~<', '~=', '~',
        '&', '&:',
        ':=',
        '|', '|:', '||', '!', '!:', '!!',
        ':',
        '@',
        'NOT', 'AND', 'OR', 'XOR', 'MINUS', 'PLUS', 'MOD'
    ]

    self.unary_operators = ['+', '-', '^', '~', '@', 'NOT']
    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    self.group_starts = ['(', '[', ',', '{']
    self.group_mids = [',']
    self.group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'ADDRESS', 'AT',
        'BASED', 'BY',
        'CALL', 'CASE', 'CLOSE',
        'DATA', 'DECLARE', 'DISABLE', 'DO',
        'ELSE', 'ENABLE', 'END', 'EOF', 'EXTERNAL',
        'GO', 'GOTO',
        'HALT',
        'IF', 'INITIAL', 'INTERRUPT',
        'LABEL', 'LITERALLY',
        'OFFSET', 'ON', 'OPEN', 'OTHERWISE', 'OTHER',
        'PROCEDURE', 'PROC', 'PUBLIC',
        'READ', 'REENTRANT', 'RETURN',
        'SELECTOR', 'STRUCTURE',
        'THEN', 'TO',
        'WHILE', 'WRITE'
    ]

    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    # Fixed: 'INT' and 'KEYED' were missing a separating comma, so Python
    # joined the adjacent literals into the single entry 'INTKEYED'.
    attributes = [
        'ALIGNED', 'ANY', 'AREA',
        'BASED', 'BUILTIN',
        'CONDITION', 'COND', 'CONTROLLED', 'CTL',
        'DEFINED', 'DEF', 'DIRECT',
        'ENTRY', 'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT',
        'FILE',
        'GLOBALDEF', 'GLOBALREF',
        'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
        'KEYED',
        'LABEL', 'LIKE', 'LIST',
        'MEMBER',
        'NONVARYING', 'NONVAR',
        'OPTIONAL', 'OPTIONS', 'OUTPUT',
        'PARAMETER', 'PARM', 'PICTURE', 'PIC', 'POSITION', 'POS',
        'PRECISION', 'PREC', 'PRINT',
        'READONLY', 'RECORD', 'REFER', 'RETURNS',
        'SEQUENTIAL', 'SEQL', 'STATIC', 'STREAM', 'STRUCTURE',
        'TRUNCATE',
        'UNALIGNED', 'UNAL', 'UNION', 'UPDATE',
        'VARIABLE', 'VARYING', 'VAR'
    ]

    attributes_tb = CaseInsensitiveListTokenBuilder(
        attributes, 'attribute', False)

    # Fixed: three missing commas fused 'INVALIDATETLBENTRY'+'INDWORD',
    # 'MOVRBIT'+'MOVRD' and 'SKIPRD'+'SKIPHW' into names that match nothing.
    # NOTE(review): 'INT SIZE' and 'INWORD SIZE' contain a space and look
    # like two names each -- left unchanged, confirm against the language
    # reference.
    functions = [
        'ABS', 'ADJUSTRPL',
        'BLOCKINPUT', 'BLOCKINWORD', 'BLOCKINDWORD',
        'BLOCKOUTPUT', 'BLOCKOUTWORD', 'BLOCKOUTDWORD',
        'BUILDPTR', 'BYTESWAP',
        'CMPD', 'CARRY', 'CAUSEINTERRUPT', 'CLEARTASKSWITCHEDFLAG',
        'CMPB', 'CMPW', 'CONTROLREGISTER',
        'DEC', 'DOUBLE', 'DEBUGREGISTER',
        'FINDB', 'FINDD', 'FINDRD', 'FINDHW', 'FINDRB', 'FINDRHW',
        'FINDRW', 'FINDW', 'FIX', 'FLAGS', 'FLOAT',
        'GETACCESSRIGHTS', 'GETREALERROR', 'GETSEGMENTLIMIT',
        'HIGH',
        'IABS', 'INHWORD', 'INITREALMATHUNITSKIPRB', 'INPUT',
        'INT SIZE', 'INWORD SIZE',
        'INVALIDATEDATACACHE', 'INVALIDATETLBENTRY', 'INDWORD',
        'LAST', 'LENGTH', 'LOCALTABLE', 'LOCKSET', 'LOW',
        'MACHINESTATUS', 'MOVB', 'MOVBIT', 'MOVD', 'MOVE', 'MOVHW',
        'MOVRB', 'MOVRBIT', 'MOVRD', 'MOVRHW', 'MOVRW', 'MOVW',
        'NIL',
        'OFFSETOF', 'OUTDWORD', 'OUTHWORD', 'OUTPUT', 'OUTWORD',
        'PARITY',
        'RESTOREGLOBALTABLE', 'RESTOREINTERRUPTABLE',
        'RESTOREREALSTATUS', 'ROL', 'ROR',
        'SAL', 'SAR', 'SAVEGLOBALTABLE', 'SAVEINTERRUPTTABLE',
        'SAVEREALSTATUS', 'SCANBIT', 'SCANRBIT', 'SCL', 'SCR',
        'SEGMENTREADABLE', 'SEGMENTWRITABLE', 'SELECTOROF',
        'SETB', 'SETHW', 'SETREALMODE', 'SETW',
        'SHL', 'SHLD', 'SHR', 'SHRD', 'SETD',
        'SIGN', 'SIGNED',
        'SKIPB', 'SKIPD', 'SKIPRD', 'SKIPHW', 'SKIPRHW', 'SKIPRW',
        'SKIPW', 'STACKBASE', 'STACKPTR',
        'TASKREGISTER', 'TESTREGISTER', 'TIME',
        'UNSIGN',
        'WAITFORINTERRUPT', 'WBINVALIDATEDATACACHE',
        'XLAT',
        'ZERO'
    ]

    function_tb = CaseInsensitiveListTokenBuilder(
        functions, 'function', True)

    format_items = [
        'A',
        'B', 'B1', 'B2', 'B3', 'B4',
        'COLUMN', 'COL',
        'E',
        'F',
        'P',
        'R',
        'TAB',
        'X'
    ]

    format_item_tb = CaseSensitiveListTokenBuilder(
        format_items, 'format', True)
    self.operand_types.append('format')

    options = [
        'APPEND',
        'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT', 'BLOCK_IO',
        'BLOCK_SIZE', 'BUCKET_SIZE', 'BY',
        'CANCEL_CONTROL_O', 'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS',
        'CONTIGUOUS_BEST_TRY', 'CREATION_DATE', 'CURRENT_POSITION',
        'DEFAULT_FILE_NAME', 'DEFERRED_WRITE', 'DELETE',
        'EDIT', 'EXPIRATION_DATE', 'EXTENSION_SIZE',
        'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO', 'FILE_SIZE',
        'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
        'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
        'FIXED_LENGTH_RECORDS', 'FROM',
        'GROUP_PROTECTION',
        'IDENT', 'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
        'INITIAL_FILL', 'INTO',
        'KEY', 'KEYFROM', 'KEYTO',
        'LINESIZE', 'LOCK_ON_READ', 'LOCK_ON_WRITE',
        'MAIN PROCEDURE', 'MANUAL_UNLOCKING', 'MATCH_GREATER',
        'MATCH_GREATER_EQUAL', 'MATCH_NEXT', 'MATCH_NEXT_EQUAL',
        'MAXIMUM_RECORD_NUMBER', 'MAXIMUM_RECORD_SIZE',
        'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
        'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
        'NO_ECHO', 'NO_FILTER', 'NO_SHARE',
        'OWNER_GROUP', 'OWNER_ID', 'OWNER_MEMBER', 'OWNER_PROTECTION',
        'PAGE', 'PAGESIZE', 'PRINTER_FORMAT', 'PROMPT',
        'PURGE_TYPE_AHEAD',
        'READ_AHEAD', 'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID',
        'RECORD_ID_ACCESS', 'RECORD_ID_TO', 'RECURSIVE', 'REPEAT',
        'RESCAN', 'RETRIEVAL_POINTERS', 'REVISION_DATE',
        'REWIND_ON_CLOSE', 'REWIND_ON_OPEN',
        'SCALARVARYING', 'SET READ', 'SHARED_READ', 'SHARED_WRITE',
        'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE', 'SYSTEM',
        'SYSTEM_PROTECTION',
        'TEMPORARY', 'TIMEOUT_PERIOD', 'TITLE', 'TO',
        'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
        'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
        'WRITE_CHECK'
    ]

    options_tb = CaseInsensitiveListTokenBuilder(options, 'option', False)

    conditions = [
        'ANYCONDITION',
        'CONVERSION', 'CONV',
        'ENDFILE', 'ENDPAGE',
        'FINISH', 'FIXEDOVERFLOW', 'FOFL',
        'OVERFLOW', 'OFL',
        'STORAGE', 'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG',
        'UNDEFINEDFILE', 'UNDF',
        'VAXCONDITION',
        'ZERODIVIDE', 'ZDIV'
    ]

    conditions_tb = CaseInsensitiveListTokenBuilder(
        conditions, 'condition', False)

    subroutines = [
        'DISPLAY',
        'EXTEND',
        'FLUSH',
        'NEXT_VOLUME',
        'RELEASE', 'RESIGNAL', 'REWIND',
        'SPACEBLOCK'
    ]

    subroutines_tb = CaseInsensitiveListTokenBuilder(
        subroutines, 'subroutine', False)

    types = [
        'ADDRESS',
        'BYTE',
        'CHARINT',
        'DWORD',
        'HWORD',
        'INTEGER',
        'LONGINT',
        'OFFSET',
        'POINTER',
        'REAL',
        'SHORTINT', 'STRUCTURE',
        'QWORD',
        'WORD'
    ]

    types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
    self.operand_types.append('type')

    values = ['SYSIN', 'SYSPRINT', 'TRUE', 'FALSE']

    values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    self.operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # The two builder lists are identical except that only the free-format
    # list contains format_item_tb (right after keyword_tb).
    leading_tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        binary_integer_tb,
        hex_integer_tb,
        octal_integer_tb,
        decimal_integer_tb,
        real_tb,
        real_exponent_tb,
        binary_real_tb,
        keyword_tb
    ]

    trailing_tokenbuilders = [
        function_tb,
        attributes_tb,
        options_tb,
        conditions_tb,
        subroutines_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        string_tb,
        label_tb,
        slash_star_comment_tb,
        preprocessor_tb,
        title_tb,
        subtitle_tb,
        error_tb,
        warn_tb,
        inform_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    def score_tokens(candidate_tokens):
        # Run the statistics and confidence checks on one tokenization and
        # hand back (statistics, confidences, errors), leaving the instance
        # accumulators empty for the next pass.
        self.tokens = candidate_tokens

        self.calc_statistics()
        statistics = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(
                tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(
                tokens, num_operators, self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(
                tokens, num_operators, self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences = self.confidences
        self.confidences = {}
        errors = self.errors
        self.errors = []

        return statistics, confidences, errors

    def overall_confidence(confidences):
        # Product of all factors; no factors at all counts as 0.0.
        if len(confidences) == 0:
            return 0.0

        product = 1.0
        for key in confidences:
            product *= confidences[key]

        return product

    # tokenize as free-format
    tokenbuilders_free = (
        leading_tokenbuilders + [format_item_tb] + trailing_tokenbuilders)

    tokenizer_free = Tokenizer(tokenbuilders_free)
    tokens_free = tokenizer_free.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')

    statistics_free, confidences_free, errors_free = score_tokens(
        tokens_free)

    # tokenize as fixed-format
    tokenbuilders_fixed = leading_tokenbuilders + trailing_tokenbuilders

    comment_start_tb = PL1CommentStartTokenBuilder()
    comment_middle_tb = PL1CommentMiddleTokenBuilder()
    comment_end_tb = PL1CommentEndTokenBuilder()

    # NOTE(review): tokenbuilders_fixed already ends with
    # invalid_token_builder, so it appears twice in each list below; kept
    # as-is because the tokenizer's tie-breaking rules are not visible here.
    type1_tokenbuilders = [comment_start_tb]
    tokenbuilders_fixed_1 = (
        tokenbuilders_fixed + type1_tokenbuilders + [invalid_token_builder])
    tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

    type2_tokenbuilders = [
        comment_start_tb, comment_middle_tb, comment_end_tb
    ]
    tokenbuilders_fixed_2 = (
        tokenbuilders_fixed + type2_tokenbuilders + [invalid_token_builder])
    tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

    tokens_fixed = self.tokenize_code(
        code, tab_size, tokenizer_fixed_1, tokenizer_fixed_2, wide)
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid operator')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'whitespace')
    tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)

    statistics_fixed, confidences_fixed, errors_fixed = score_tokens(
        tokens_fixed)

    # compute confidence for free-format and fixed-format
    confidence_free = overall_confidence(confidences_free)
    confidence_fixed = overall_confidence(confidences_fixed)

    # select the better of free-format and fixed-format (ties go to free)
    if confidence_fixed > confidence_free:
        self.tokens = tokens_fixed
        self.statistics = statistics_fixed
        self.confidences = confidences_fixed
        self.errors = errors_fixed
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code):
    """Tokenize ``code`` and compute this examiner's confidence metrics.

    The keyword, type and operator tables below look like Kotlin's (the
    class header is not visible from here -- confirm against the class
    name).

    Args:
        code: Source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    # Token groups treated as operands by the operand-adjacency checks.
    operand_types = [
        'number', 'identifier', 'decorator', 'string',
        'class', 'type', 'value'
    ]

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # Numeric literal builders, in the order the tokenizer receives them.
    number_builders = [
        IntegerTokenBuilder('_'),
        IntegerExponentTokenBuilder('_'),
        PrefixedIntegerTokenBuilder('0x', False, '0123456789ABCDEFabcdef_'),
        RealTokenBuilder(True, True, '_'),
        RealExponentTokenBuilder(True, True, 'E', '_'),
    ]

    identifier_builder = IdentifierTokenBuilder('_', '_')
    decorator_builder = PrefixedIdentifierTokenBuilder(
        '@', 'decorator', False)

    string_quotes = ['"', "'", "’"]
    string_builder = EscapedStringTokenBuilder(string_quotes, 0)
    triple_string_builder = TripleQuoteStringTokenBuilder(string_quotes)

    line_comment_builder = SlashSlashCommentTokenBuilder()
    block_comment_builder = SlashStarCommentTokenBuilder()

    class_builder = ClassTypeTokenBuilder()

    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operator_names = [
        '+', '-', '*', '/', '%',
        '=', '+=', '-=', '*=', '/=', '%=',
        '++', '--',
        '&&', '||', '!',
        '==', '!=', '===', '!==',
        '<', '>', '<=', '>=',
        '!!',
        '?.', '?:',
        '::',
        '..',
        ':', '?',
        '.'
    ]
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_names, 'operator', False)

    self.unary_operators = ['+', '-', '!', '*', '++', '--']
    self.postfix_operators = ['++', '--', ':']

    group_tokens = ['->', '(', ')', ',', '[', ']', '{', '}']
    opening_groups = ['(', '[', ',', '{']
    middle_groups = ['->', ',']
    closing_groups = [')', ']', '}']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_tokens, 'group', False)

    keyword_names = [
        'as', 'as?',
        'break',
        'class', 'continue',
        'do',
        'else',
        'for', 'fun',
        'if', 'in', '!in', 'is', '!is',
        'object',
        'package',
        'return',
        'super',
        'throw', 'try', 'typealias', 'typeof',
        'val', 'var',
        'when', 'while',
        'by',
        'catch', 'constructor',
        'delegate', 'dynamic',
        'field', 'file', 'finally',
        'get',
        'import', 'init',
        'param', 'property',
        'receiver',
        'set', 'setparam',
        'where',
        'actual', 'abstract', 'annotation',
        'companion', 'const', 'crossinline',
        'data',
        'enum', 'expect', 'external',
        'final',
        'infix', 'inline', 'inner', 'internal',
        'lateinit',
        'noinline',
        'open', 'operator', 'out', 'override',
        'private', 'protected', 'public',
        'reified',
        'sealed', 'suspend',
        'tailrec',
        'vararg'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_names, 'keyword', False)

    type_names = [
        'Byte', 'Short', 'Int', 'Long',
        'Float', 'Double',
        'Char',
        'u', 'f', 'ul',
        'UInt', 'ULong', 'UByte', 'UShort'
    ]
    type_builder = CaseSensitiveListTokenBuilder(type_names, 'type', True)

    value_builder = CaseSensitiveListTokenBuilder(
        ['false', 'null', 'this', 'true'], 'value', True)

    invalid_builder = InvalidTokenBuilder()

    # Note: operators precede groupers in this examiner's builder order.
    builders = [nl_builder, ws_builder, terminator_builder]
    builders += number_builders
    builders += [
        keyword_builder,
        type_builder,
        value_builder,
        operator_builder,
        group_builder,
        identifier_builder,
        class_builder,
        decorator_builder,
        string_builder,
        triple_string_builder,
        line_comment_builder,
        block_comment_builder,
        self.unknown_operator_tb,
        invalid_builder,
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    merged = Examiner.combine_adjacent_identical_tokens(merged, 'invalid')
    self.tokens = self.combine_numbers_and_adjacent_types(merged)

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allowed_pairs = []
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, allowed_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, closing_groups, allowed_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, operator_count, opening_groups, allowed_pairs)

    self.calc_group_confidence(joined_tokens, middle_groups)

    self.calc_operand_n_confidence(
        joined_tokens, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute this examiner's confidence metrics.

    The keyword, type and operator tables below look like Dart's (the
    class header is not visible from here -- confirm against the class
    name).

    Args:
        code: Source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    annotation_tb = PrefixedIdentifierTokenBuilder('@', 'annotation', False)
    operand_types.append('annotation')

    symbol_tb = PrefixedIdentifierTokenBuilder('#', 'symbol', True)
    operand_types.append('symbol')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '~/', '%',
        '^', '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '~/=',
        '||', '&&',
        '.', '..',
        ':', '?', '??', '??=',
        'as', 'is', 'is!',
        '++', '--',
        'new'
    ]

    # Fixed: '..' and '?.' were missing a separating comma, so Python
    # joined them into the single bogus entry '..?.' and neither operator
    # was actually in the list.
    self.unary_operators = [
        '+', '-', '*',
        '!', '~',
        '.', '..', '?.',
        '++', '--',
        'new'
    ]

    self.postfix_operators = ['++', '--']

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'abstract', 'assert', 'async', 'await',
        'break',
        'case', 'catch', 'class', 'const', 'continue', 'covariant',
        'default', 'deferred', 'do', 'dynamic',
        'else', 'enum', 'export', 'extends', 'external',
        'factory', 'final', 'finally', 'for', 'Function',
        'get',
        'hide',
        'if', 'implements', 'import', 'in', 'interface',
        'library',
        'mixin',
        'on', 'operator',
        'part',
        'rethrow', 'return',
        'set', 'show', 'static', 'switch', 'sync',
        'throw', 'try', 'typedef',
        'var', 'void',
        'while', 'with',
        'yield'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = ['int', 'double', 'String', 'List', 'bool', 'void']

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['false', 'true', 'null', 'this', 'super']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        annotation_tb,
        symbol_tb,
        class_type_tb,
        string_tb,
        raw_string_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute this examiner's confidence metrics.

    The operator, keyword and macro tables below look like Erlang's (the
    class header is not visible from here -- confirm against the class
    name).

    Args:
        code: Source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    # Token groups treated as operands by the operand-adjacency checks.
    operand_types = ['number', 'identifier', 'string', 'type', 'value']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # Numeric literal builders, in the order the tokenizer receives them.
    number_builders = [
        IntegerTokenBuilder("'"),
        IntegerExponentTokenBuilder("'"),
        RealTokenBuilder(False, False, "'"),
        RealExponentTokenBuilder(False, False, 'E', "'"),
    ]

    identifier_builder = IdentifierTokenBuilder('_@', '_@')

    string_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)

    comment_builder = LeadToEndOfLineTokenBuilder('%', False, 'comment')

    directive_names = [
        '-include', '-define', '-error', '-warning', '-module', '-compile'
    ]
    preprocessor_builder = CaseSensitiveListTokenBuilder(
        directive_names, 'preprocessor', False)

    terminator_builder = SingleCharacterTokenBuilder(
        [';', '.'], 'statement terminator', False)

    operator_names = [
        '+', '-', '*', '/', '!',
        'and', 'andalso',
        'band', 'bnot', 'bor', 'bsl', 'bsr', 'bxor',
        'div', 'not', 'of', 'or', 'orelse', 'xor',
        '++', '--', '->', '=>', '#', ':=',
        '=', '==', '/=', '=<', '<', '>=', '>', '=:=', '=/='
    ]

    self.unary_operators = ['+', '-', 'not', '#', '!']
    self.postfix_operators = []

    group_tokens = [
        '(', ')', ',', '[', ']', '{', '}', ':',
        '<', '>', '<<', '>>', '|', '||'
    ]
    # No list of group starts is defined: the operator_4 check is not run.
    closing_groups = [')', ']', '}', '>', '>>']
    middle_groups = [',', ':', '|', '||']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_tokens, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_names, 'operator', False)

    keyword_names = [
        'after', 'begin', 'case', 'catch', 'cond', 'end', 'fun', 'if',
        'let', 'receive', 'rem', 'try', 'when', 'ignore'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_names, 'keyword', False)

    type_names = [
        'integer', 'float', 'binary', 'bytes', 'bitstring', 'bits',
        'utf8', 'utf16', 'utf32',
        'signed', 'unsigned',
        'big', 'little', 'native'
    ]
    type_builder = CaseSensitiveListTokenBuilder(type_names, 'type', False)

    value_names = [
        'true', 'false',
        '?MODULE', '?MODULE_STRING', '?FILE', '?LINE', '?MACHINE',
        '?FUNCTION_NAME', '?FUNCTION_ARITY', '?OTP_RELEASE'
    ]
    value_builder = CaseSensitiveListTokenBuilder(
        value_names, 'value', True)

    invalid_builder = InvalidTokenBuilder()

    # Builder order is preserved exactly as the tokenizer originally got it.
    builders = [nl_builder, ws_builder, terminator_builder]
    builders += number_builders
    builders += [
        keyword_builder,
        type_builder,
        value_builder,
        group_builder,
        operator_builder,
        identifier_builder,
        string_builder,
        comment_builder,
        preprocessor_builder,
        self.unknown_operator_tb,
        invalid_builder,
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        merged, 'invalid')
    self.convert_keywords_to_identifiers()

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allowed_pairs = []
        self.calc_operator_2_confidence(
            joined_tokens, operator_count, allowed_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, operator_count, closing_groups, allowed_pairs)

    self.calc_group_confidence(joined_tokens, middle_groups)

    self.calc_operand_n_confidence(joined_tokens, ['number'], 2)
    self.calc_operand_n_confidence(joined_tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute this examiner's confidence scores.

    The keyword set (PLEASE, ABSTAIN, COME, FROM, ...) suggests the target
    language is INTERCAL -- confirm against the enclosing class.

    Args:
        code: source text passed to ``Tokenizer.tokenize`` and to the
            line-length confidence check.
    """
    super().__init__()

    ws_builder = WhitespaceTokenBuilder()
    eol_builder = NewlineTokenBuilder()

    # Every number-like token is a one-character prefix followed by digits.
    digits = '0123456789'
    constant_builder = PrefixedIntegerTokenBuilder('#', False, digits)
    var16_builder = PrefixedIntegerTokenBuilder('.', False, digits)
    var32_builder = PrefixedIntegerTokenBuilder(':', False, digits)
    arr16_builder = PrefixedIntegerTokenBuilder(',', False, digits)
    arr32_builder = PrefixedIntegerTokenBuilder(';', False, digits)
    operand_types = ['number']

    note_builder = LeadToEndOfLineTokenBuilder('NOTE', True, 'comment')
    label_builder = ParensLabelTokenBuilder()

    operator_names = ['~', '$', 'V', '?', '&', 'SUB', '<-']
    self.unary_operators = ['V', '?', '&']
    self.postfix_operators = []

    # The quote characters are the only grouping symbols; each one can both
    # open and close a group, and there is no group separator.
    group_symbols = ['"', "'"]
    group_starts = ['"', "'"]
    group_ends = ['"', "'"]
    group_mids = []

    group_builder = CaseInsensitiveListTokenBuilder(
        group_symbols, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_names, 'operator', False)

    # NOTE(review): 'FROM' is listed twice; kept exactly as found.
    keyword_names = [
        'DO', 'STASH', 'RETRIEVE', 'RESUME', 'FORGET', 'NEXT',
        'ABSTAIN', 'FROM', 'REINSTATE', 'IGNORE', 'REMEMBER',
        'WRITE', 'IN', 'READ', 'OUT',
        'PLEASE', 'COME', 'FROM'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_names, 'keyword', False)

    fallback_builder = InvalidTokenBuilder()

    # The order of this list is preserved as-is.
    builders = [
        eol_builder,
        ws_builder,
        constant_builder,
        var16_builder,
        var32_builder,
        arr16_builder,
        arr32_builder,
        keyword_builder,
        group_builder,
        label_builder,
        operator_builder,
        note_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    stream = Tokenizer(builders).tokenize(code)
    for category in ('invalid operator', 'invalid'):
        stream = Examiner.combine_adjacent_identical_tokens(stream, category)
    stream = Examiner.combine_identifier_colon(
        stream,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = stream
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # Confidence checks run on the source tokens with all lines joined.
    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        pair_exceptions = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined, operator_count, pair_exceptions)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, pair_exceptions)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, pair_exceptions)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(joined, ['number'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    # calc_paired_blockers_confidence(['{'], ['}']) is not run here.
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, extension):
    """Tokenize SQL ``code`` and compute this examiner's confidence scores.

    Args:
        code: source text passed to ``Tokenizer.tokenize`` and to the
            line-length confidence check.
        extension: dialect selector. ``'microsoft'`` / ``'t-sql'`` enable
            bracketed identifiers and the T-SQL keyword, value and type
            additions; ``'oracle'`` / ``'pl-sql'`` enable the PL/SQL keyword
            additions. Any other value selects the base lists only.
    """
    super().__init__()

    is_tsql = extension in ['microsoft', 't-sql']
    is_plsql = extension in ['oracle', 'pl-sql']

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(True, True, None)
    real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)

    quotes = ["'", '"']
    string_tb = StuffedQuoteStringTokenBuilder(quotes, False)

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)

    # Bracketed identifiers are only recognised for the T-SQL dialect;
    # otherwise a null builder holds the slot in the builder list.
    bracketed_identifier_tb = NullTokenBuilder()
    if is_tsql:
        bracketed_identifier_tb = SqlBracketedIdentifierTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    comment_tb = LeadToEndOfLineTokenBuilder('--', True, 'comment')

    known_operators = [
        '=', '>', '>=', '<', '<=', '<>', '!=',
        'AND', 'OR', 'NOT',
        'IN', 'EXISTS', 'LIKE', 'BETWEEN', 'ANY', 'ALL',
        '.'
    ]
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['NOT', 'EXISTS', 'ANY', 'ALL']

    groupers = ['(', ')', ',']
    group_starts = ['(', ',']
    group_mids = [',']
    group_ends = [')']
    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    # Every entry must be followed by a comma. Adjacent string literals are
    # silently concatenated by Python, which previously fused several words
    # (e.g. 'VERSIONING' 'WHEN') into one bogus keyword and dropped the
    # real ones from the list.
    keywords = [
        'ACOS', 'ASIN', 'ATAN',
        'ABSOLUTE', 'ACTION', 'ADD', 'ALL', 'ALLOCATE', 'ALTER', 'ARE',
        'ABS', 'ARRAY_AGG', 'AVG',
        'AS', 'ASC', 'ASSERTION', 'AT', 'AUTHORIZATION',
        'AFTER', 'ARRAY', 'ASENSITIVE', 'ASYMMETRIC', 'ATOMIC',
        'ARRAY_MAX_CARDINALITY',
        'BEFORE', 'BEGIN', 'BETWEEN', 'BIT_LENGTH', 'BOTH', 'BY',
        'BEGIN_FRAME', 'BEGIN_PARTITION',
        'BINARY', 'BOOLEAN', 'BREADTH',
        'CALL', 'CASCADE', 'CASCADED', 'CASE', 'CAST', 'CATALOG',
        'CALLED',
        'CHAR_LENGTH', 'CHARACTER_LENGTH',
        'CHECK', 'COALESCE', 'COLLATE', 'COLLATION',
        'COLUMN', 'COMMIT', 'CONDITION', 'CONNECT',
        'CONNECTION', 'CONSTRAINT', 'CONSTRAINTS', 'CONTAINS', 'CONTINUE',
        'CONVERT', 'CORRESPONDING', 'COUNT', 'CREATE', 'CROSS',
        'CURRENT', 'CURRENT_DATE', 'CURRENT_PATH', 'CURRENT_TIME',
        'CURRENT_TIMESTAMP', 'CURRENT_USER', 'CURSOR',
        'CLOSE', 'CONSTRUCTOR', 'CUBE',
        'CURRENT_DEFAULT_TRANSFORM_GROUP', 'CURRENT_ROLE',
        'CURRENT_TRANSFORM_GROUP_FOR_TYPE', 'CYCLE',
        'CARDINALITY', 'CEIL', 'CEILING', 'CONVERT', 'CORR', 'COVAR_POP',
        'COVAR_SAMPLE', 'CUME_DIST', 'CURRENT_CATALOG', 'CURRENT_SCHEMA',
        'CLASSIFIER', 'COS', 'COSH',
        'DAY', 'DEALLOCATE', 'DEC', 'DECLARE', 'DEFAULT',
        'DECFLOAT', 'DEFINE',
        'DEFERRABLE', 'DEFERRED', 'DELETE', 'DEPTH', 'DESC', 'DESCRIBE',
        'DENSE_RANK',
        'DESCRIPTOR', 'DETERMINISTIC', 'DIAGNOSTICS', 'DISCONNECT',
        'DISTINCT',
        'DO', 'DOMAIN', 'DROP', 'DYNAMIC',
        'ELSE', 'END', 'ESCAPE', 'EXCEPT', 'EXCEPTION',
        'ELEMENT',
        'EXEC', 'EXECUTE', 'EXISTS', 'EXIT', 'EXTERNAL', 'EXTRACT',
        'EACH', 'ELSEIF', 'EQUALS',
        'END_EXEC', 'EVERY',
        'EXP',
        'EMPTY', 'EQUALS',
        'FETCH', 'FIRST', 'FOR', 'FOREIGN', 'FOUND',
        'FROM', 'FULL', 'FUNCTION', 'FUSION',
        'FILTER', 'FREE',
        'FIRST_VALUE', 'FRAME_ROW',
        'GENERAL', 'GET', 'GLOBAL', 'GO', 'GOTO', 'GRANT', 'GROUP',
        'GROUPING',
        'GROUPS',
        'HANDLER',
        'HAVING', 'HOUR',
        'HOLD',
        'IDENTITY', 'IF', 'IMMEDIATE', 'IN', 'INDICATOR', 'INITIALLY',
        'INNER', 'INOUT', 'INPUT', 'INSENSITIVE', 'INSERT', 'INT',
        'INTERSECT',
        'INITIAL',
        'INTERVAL', 'INTO', 'IS', 'ISOLATION',
        'INTERSECTION',
        'ITERATE',
        'JOIN',
        # 'JSON_ARRY' was a misspelling of the JSON_ARRAY reserved word.
        'JSON_ARRAY', 'JSON_ARRAYAGG', 'JSON_EXISTS', 'JSON_OBJECT',
        'JSON_OBJECTAGG', 'JSON_QUERY', 'JSON_TABLE',
        'JSON_TABLE_PRIMITIVE', 'JSON_VALUE',
        'KEY',
        'LANGUAGE', 'LAST', 'LEADING', 'LEFT', 'LEVEL', 'LIKE', 'LOCAL',
        'LARGE', 'LATERAL', 'LEAVE', 'LOCALTIME', 'LOCALTIMESTAMP',
        'LOCATOR', 'LOOP',
        'LAG', 'LISTAGG', 'LOG', 'LOG10',
        'LIKE_REGEX', 'LN',
        'LOWER',
        'LAST_VALUE', 'LEAD',
        'MATCH', 'MAX', 'MIN', 'MINUTE', 'MODULE', 'MONTH',
        'MAP', 'METHOD', 'MODIFIES',
        'MATCH_NUMBER', 'MATCH_RECOGNIZE', 'MATCHES',
        'MEMBER', 'MERGE', 'MULTISET',
        'MOD',
        'NAMES', 'NATIONAL', 'NATURAL', 'NEXT', 'NO', 'NOT',
        'NULLIF', 'NUMERIC',
        'NTH_VALUE', 'NTILE',
        'NEW',
        'NORMALIZE',
        'OCTET_LENGTH', 'OF', 'ONLY', 'OPEN', 'OPTION', 'ORDER',
        'OUTPUT', 'OVERLAPS',
        'OBJECT', 'OLD', 'ORDINALITY', 'OUT', 'OUTER',
        'OCTET_LENGTH', 'OFFSET',
        'OMIT',
        'OCCURRENCES_REGEX', 'ONE', 'OVER',
        'OVERLAY',
        'PAD', 'PARAMETER', 'PARTIAL', 'PRECISION', 'PREPARE', 'PRESERVE',
        'PRIMARY', 'PRIOR', 'PRIVILEGES', 'PROCEDURE', 'PUBLIC',
        'PATTERN', 'PER', 'PTF',
        'PARTITION',
        'PERCENT_RANK', 'PERCENTILE_CONT', 'PERCENTILE_DISC', 'POSITION',
        'PERCENT', 'PERIOD', 'PORTION', 'PRECEDES',
        'POSITION_REGEX', 'POWER',
        'RANGE',
        'READ', 'REFERENCES', 'RELATIVE', 'RESTRICT',
        'RETURN', 'RETURNS', 'REVOKE', 'RIGHT', 'ROLLBACK', 'ROLLUP',
        'READS', 'ROWS',
        'RECURSIVE', 'REF', 'REFERENCING', 'RELEASE', 'REPEAT', 'REGIONAL',
        'RESULT', 'ROW',
        'RANK', 'REGR_AVGX', 'REGR_AVGY', 'REGR_COUNT', 'REGR_INTERCEPT',
        'REGR_R2', 'REGR_SLOPE', 'REGR_SXX', 'REGR_SXY', 'REGR_SYY',
        'ROW_NUMBER',
        'RUNNING',
        'SCHEMA', 'SCROLL', 'SECOND', 'SECTION', 'SELECT', 'SESSION',
        'SESSION_USER', 'SET', 'SIZE', 'SOME', 'SPACE',
        'SPECIFIC', 'SQL', 'SQLCODE', 'SQLERROR',
        'SQLEXCEPTION', 'SQLSTATE', 'SQLWARNING', 'SUBSTRING', 'SUM',
        'SQRT', 'STDDEV_POP', 'STDDEV_SAMP', 'SUBSTRING_REGEX', 'SUM',
        'SEEK', 'SHOW', 'SIN', 'SINH', 'SUBSET',
        'SUBMULTISET',
        'SYSTEM_USER',
        'SAVEPOINT', 'SCOPE', 'SEARCH', 'SENSITIVE',
        'SETS', 'SIGNAL', 'SIMILAR', 'SPECIFICTYPE', 'START', 'STATE',
        'STATIC', 'SYMMETRIC', 'SYSTEM',
        'TABLE', 'TEMPORARY', 'THEN', 'TIME', 'TIMESTAMP', 'TIMEZONE_HOUR',
        'TABLESAMPLE',
        'TAN', 'TANH',
        'TIMEZONE_MINUTE', 'TO', 'TRAILING', 'TRANSACTION', 'TRANSLATE',
        'TRANSLATION', 'TRIM',
        'TRANSLATE', 'TRANSLATE_REGEX', 'TRUNCATE',
        'TREAT', 'TRIGGER',
        'TRIM_ARRAY',
        'UNDO', 'UNION', 'UNIQUE', 'UNKNOWN', 'UPDATE', 'UPPER', 'USAGE',
        'USER', 'USING',
        'UNDER', 'UNNEST', 'UNTIL',
        'UESCAPE', 'UPPER',
        'VALUE', 'VALUES', 'VARYING', 'VIEW',
        'VAR_POP', 'VAR_SAMP',
        'VALUE_OF', 'VERSIONING',
        'WHEN', 'WHENEVER', 'WHERE', 'WITH', 'WORK', 'WRITE',
        'WHILE', 'WINDOW', 'WITHIN', 'WITHOUT',
        'WIDTH_BUCKET',
        'YEAR',
        'ZONE'
    ]

    keywords_tsql = [
        'INSTEAD', 'CASE', 'UPDLOCK', 'DATEADD', 'GETDATE',
        'TEXTIMAGE_ON', 'CLUSTERED',
        'GENERATED', 'DECLARE', 'SET',
        'BEGIN', 'END', 'BREAK', 'CONTINUE', 'GOTO', 'ELSE', 'RETURN',
        'WAITFOR', 'BULK', 'TRY', 'CATCH'
    ]

    keywords_plsql = [
        '%TYPE', 'BEFORE', 'DECODE', 'DESCRIBE', 'DUAL', 'INTERSECT',
        'MINUS', 'SYSDATE', 'USER'
    ]

    if is_tsql:
        keywords += keywords_tsql

    if is_plsql:
        keywords += keywords_plsql

    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    values = ['TRUE', 'FALSE', 'NULL', 'OFF', 'ON', 'NONE']

    values_tsql = [
        'ALLOW_ROW_LOCKS', 'ALLOW_PAGE_LOCKS', 'ALWAYS', 'IGNORE_DUP_KEY',
        'FILLFACTOR', 'HISTORY_TABLE', 'PAD_INDEX',
        'STATISTICS_NORECOMPUTE', 'SUSER_SNAME', 'SYSTEM_VERSIONING',
        'SYSTEM_TIME'
    ]

    if is_tsql:
        values += values_tsql

    values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)

    types = [
        'BIGINT', 'BIT', 'BLOB', 'CHAR', 'CHARACTER', 'CLOB',
        'DATE', 'DECIMAL', 'DOUBLE', 'FLOAT', 'INTEGER',
        'NCHAR', 'NCLOB', 'REAL', 'SMALLINT', 'VARCHAR'
    ]

    types_tsql = [
        'nvarchar', 'bigint', 'datetime', 'datetime2', 'geography'
    ]

    if is_tsql:
        types += types_tsql

    type_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)

    # Operand categories produced by the builders above. The entry for the
    # type builder used to be a second 'value'. The list is not read while
    # the calc_operand_n_confidence checks below stay disabled.
    operand_types = ['number', 'string', 'identifier', 'value', 'type']

    invalid_token_builder = InvalidTokenBuilder()

    # The order of this list is preserved as-is.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        string_tb,
        known_operator_tb,
        terminators_tb,
        groupers_tb,
        keyword_tb,
        values_tb,
        identifier_tb,
        type_tb,
        bracketed_identifier_tb,
        comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens,
        ['statement terminator', 'newline'],
        [],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # Confidence checks run on the source tokens with all lines joined.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # The operand-adjacency checks (calc_operand_n_confidence with n = 2
    # and n = 4) are not run for SQL.

    self.calc_keyword_confidence()

    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, variant):
    """Tokenize ``code`` and compute this examiner's confidence scores.

    The base operator/keyword lists look like OCaml; passing
    ``variant='fsharp'`` adds the F# comment styles, operators, groupers,
    keywords and types on top of them.

    Args:
        code: source text passed to ``Tokenizer.tokenize`` and to the
            line-length confidence check.
        variant: dialect selector; only ``'fsharp'`` changes behaviour.
    """
    super().__init__()

    fsharp = variant in ['fsharp']

    ws_builder = WhitespaceTokenBuilder()
    eol_builder = NewlineTokenBuilder()

    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(True, True, None)
    real_exp_builder = RealExponentTokenBuilder(True, True, 'E', None)

    ident_builder = IdentifierTokenBuilder('_', '_')

    class_builder = ClassTypeTokenBuilder()

    double_quote = ['"']
    str_builder = EscapedStringTokenBuilder(double_quote, 0)
    triple_str_builder = TripleQuoteStringTokenBuilder(double_quote)
    at_str_builder = PrefixedStringTokenBuilder('@', False, double_quote)
    char_builder = FsharpCharTokenBuilder(["'", "’"])

    # '//' and '///' comments exist only in the F# variant; null builders
    # hold their slots otherwise.
    line_comment_builder = NullTokenBuilder()
    block_comment_builder = BlockTokenBuilder('(*', '*)', 'comment')
    doc_comment_builder = NullTokenBuilder()
    if fsharp:
        line_comment_builder = SlashSlashCommentTokenBuilder()
        doc_comment_builder = TripleSlashCommentTokenBuilder()

    directive_names = [
        '#if', '#else', '#elif', '#endif',
        '#define', '#undef',
        '#line', '#region', '#endregion', '#pragma'
    ]
    directive_builder = CaseSensitiveListTokenBuilder(
        directive_names, 'preprocessor', False)
    warning_builder = LeadToEndOfLineTokenBuilder(
        '#warning', True, 'preprocessor')
    error_builder = LeadToEndOfLineTokenBuilder(
        '#error', True, 'preprocessor')

    operator_names = [
        'and', 'as', 'in', 'mod', 'not', 'of', 'or', 'when',
        '::', '+', '-', '*', '/', '+.', '-.', '*.', '/.',
        '=', "'", '->', '>', '<', '>=', '<=', '==', '^',
        '||', '.', '#'
    ]
    # Some entries repeat; the list is kept exactly as found.
    fsharp_operator_names = [
        'new', '!', '!=', '%', '%%', '%?', '&', '&&', '&&&',
        '(|', '|)', '*?', '**', '+?', '-?', '->', '..', '.. ..',
        '/?', ':', ':=', ':/', '<<', '<<<', '<-', '<>', '<>?', '<=?',
        '<|', '<||', '<|||', '<@', '@>', '<@@', '@@>', '=?', '==', '>?',
        '>>', '>>>', '>=?', '?', '|||', '^^^',
        '?>=', '?>', '?<=', '?<', '?=', '?<>', '?+', '?-', '?*', '?/',
        '>=?', '>?', '<=?', '<?', '=?', '<>?', '+?', '-?', '*?', '/?',
        '?>=?', '?>?', '?<=?', '?<?', '?=?', '?<>?',
        '?+?', '?-?', '?*?', '?/?',
        '@', '|>', '||>', '|||>', '~~', '~~~', '~-', '~+',
        ':>', ':?>', "'"
    ]
    if fsharp:
        operator_names.extend(fsharp_operator_names)

    self.unary_operators = ['new', 'not', "'", '-']
    self.postfix_operators = ["'"]

    operator_builder = CaseSensitiveListTokenBuilder(
        operator_names, 'operator', False)

    group_symbols = [
        '(', ')', ',', '[', ']', '{', '}', 'begin', 'end', ';', '|'
    ]
    if fsharp:
        group_symbols.extend(['[|', '|]', '[<', '>]', '^'])

    # No group_starts list is defined because calc_operator_4_confidence is
    # not run for this examiner.
    group_mids = [',', ';', '^', '|']
    group_ends = [')', ']', '}', '|]', '>]']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_symbols, 'group', False)

    keyword_names = [
        'assert', 'class', 'def', 'do', 'done', 'downto',
        'else', 'exception', 'failwith', 'for', 'fun', 'function',
        'if', 'inherit', 'lazy', 'let', 'match', 'method', 'module',
        'object', 'open', 'raise', 'rec', 'sig', 'then', 'to', 'try',
        'type', 'val', 'virtual', 'while', 'with'
    ]
    fsharp_keyword_names = [
        'abstract', 'break', 'default', 'delegate', 'downcast',
        'elif', 'extern', 'finally', 'fixed', 'global',
        'inline', 'interface', 'internal',
        'let!', 'match!', 'member', 'mutable', 'namespace',
        'override', 'private', 'public',
        'return', 'return!', 'upcast', 'use', 'use!', 'yield', 'yield!'
    ]
    if fsharp:
        keyword_names.extend(fsharp_keyword_names)

    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_names, 'keyword', False)

    type_names = [
        'bool', 'byte', 'char', 'double', 'float', 'int', 'list', 'long',
        'number', 'object', 'range', 'string', 'struct', 'union', 'unit',
        'void'
    ]
    fsharp_type_names = [
        'decimal', 'sbyte', 'short', 'uint', 'ulong', 'ushort', 'void'
    ]
    if fsharp:
        type_names.extend(fsharp_type_names)

    type_builder = CaseSensitiveListTokenBuilder(type_names, 'type', True)

    value_names = ['base', 'false', 'null', 'true', '_']
    value_builder = CaseSensitiveListTokenBuilder(
        value_names, 'value', True)

    # Operand categories produced by the builders above. Not read while the
    # n = 4 operand check below stays disabled.
    operand_types = [
        'number', 'identifier', 'class', 'string', 'type', 'value'
    ]

    fallback_builder = InvalidTokenBuilder()

    # The order of this list is preserved as-is.
    builders = [
        eol_builder,
        ws_builder,
        int_builder,
        int_exp_builder,
        real_builder,
        real_exp_builder,
        keyword_builder,
        type_builder,
        value_builder,
        operator_builder,
        group_builder,
        ident_builder,
        class_builder,
        str_builder,
        triple_str_builder,
        at_str_builder,
        char_builder,
        doc_comment_builder,
        line_comment_builder,
        block_comment_builder,
        directive_builder,
        error_builder,
        warning_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    stream = Tokenizer(builders).tokenize(code)
    for category in ('invalid operator', 'invalid'):
        stream = Examiner.combine_adjacent_identical_tokens(stream, category)
    self.tokens = stream

    self.calc_statistics()

    # Confidence checks run on the source tokens with all lines joined.
    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        pair_exceptions = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined, operator_count, pair_exceptions)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, pair_exceptions)

    self.calc_group_confidence(joined, group_mids)

    # Only the n = 2 operand check runs; the n = 4 check over operand_types
    # is not applied.
    self.calc_operand_n_confidence(joined, ['number', 'string', 'symbol'], 2)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute this examiner's confidence scores.

    The '@interface' / '@implementation' style keywords and the C
    preprocessor directives suggest the target language is Objective-C --
    confirm against the enclosing class.

    Args:
        code: source text passed to ``Tokenizer.tokenize`` and to the
            line-length confidence check.
    """
    super().__init__()

    ws_builder = WhitespaceTokenBuilder()
    eol_builder = NewlineTokenBuilder()

    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(False, False, None)
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)

    ident_builder = IdentifierTokenBuilder('_', '_')

    directive_builder = DirectiveTokenBuilder()

    quote_chars = ['"', "'", "’"]
    str_builder = EscapedStringTokenBuilder(quote_chars, 10)
    at_str_builder = PrefixedStringTokenBuilder('@', False, quote_chars)

    class_builder = ClassTypeTokenBuilder()

    line_comment_builder = SlashSlashCommentTokenBuilder()
    block_comment_builder = SlashStarCommentTokenBuilder()

    preprocessor_names = [
        '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
        '#elif', '#import', '#line', '#include'
    ]

    continuation_builder = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    preprocessor_builder = CaseSensitiveListTokenBuilder(
        preprocessor_names, 'preprocessor', False)
    warning_builder = LeadToEndOfLineTokenBuilder(
        '#warning', True, 'preprocessor')
    error_builder = LeadToEndOfLineTokenBuilder(
        '#error', True, 'preprocessor')
    pragma_builder = LeadToEndOfLineTokenBuilder(
        '#pragma', True, 'preprocessor')
    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operator_names = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '<<', '>>', '~',
        '.', '->',
        '++', '--', '&&', '||', '^',
        '?', '##'
    ]

    self.unary_operators = [
        '+', '-', '*',
        '!', '&', '^', '~',
        '++', '--', '##'
    ]
    self.postfix_operators = ['++', '--', '&', '->', '*', '^']

    group_symbols = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_symbols, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_names, 'operator', False)

    # NOTE(review): '@catch()' carries parentheses, unlike every other
    # '@' keyword in this list -- confirm it is intended.
    keyword_names = [
        'atomic',
        'break', 'bycopy', 'byref',
        'case', 'continue',
        'default', 'do',
        'else',
        'for',
        'goto',
        'if', 'IMP', 'in', 'inline', 'inout',
        'nonatomic',
        'oneway', 'out',
        'Protocol',
        'restrict', 'retain', 'return',
        'SEL', 'sizeof', 'switch',
        'typedef',
        'while',
        '@interface', '@end', '@implementation', '@protocol', '@class',
        '@public', '@protected', '@private', '@property', '@try', '@throw',
        '@catch()', '@finally', '@synthesize', '@dynamic', '@selector'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_names, 'keyword', False)

    type_names = [
        'auto',
        'char', 'const',
        'double',
        'enum', 'extern',
        'float',
        'id', 'int',
        'long',
        'register',
        'short', 'signed', 'static', 'struct',
        'union', 'unsigned',
        'void', 'volatile',
        '_Bool', '_Complex', '_Imaginary',
        'BOOL', 'Class'
    ]
    type_builder = CaseSensitiveListTokenBuilder(type_names, 'type', True)

    value_names = ['self', 'super', 'nil', 'YES', 'NO', 'NULL', '...']
    value_builder = CaseSensitiveListTokenBuilder(
        value_names, 'value', True)

    # Operand categories produced by the builders above.
    operand_types = [
        'number', 'identifier', 'string', 'class', 'type', 'value'
    ]

    fallback_builder = InvalidTokenBuilder()

    # The order of this list is preserved as-is.
    builders = [
        eol_builder,
        ws_builder,
        continuation_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        real_builder,
        real_exp_builder,
        keyword_builder,
        type_builder,
        value_builder,
        group_builder,
        operator_builder,
        directive_builder,
        ident_builder,
        class_builder,
        str_builder,
        at_str_builder,
        line_comment_builder,
        block_comment_builder,
        preprocessor_builder,
        warning_builder,
        error_builder,
        pragma_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    stream = Tokenizer(builders).tokenize(code)
    for category in ('invalid operator', 'invalid'):
        stream = Examiner.combine_adjacent_identical_tokens(stream, category)
    # Examiner.combine_identifier_colon is not applied to this token stream.
    self.tokens = stream
    self.convert_identifiers_to_labels()
    self.convert_values_to_operators()

    self.calc_statistics()

    # Confidence checks run on the source tokens with all lines joined.
    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        pair_exceptions = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined, operator_count, pair_exceptions)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, pair_exceptions)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, pair_exceptions)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(joined, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, year, extension, tab_size, wide):
    """Tokenize COBOL ``code`` and compute this examiner's confidence scores.

    Args:
        code: source text passed to ``self.tokenize_code`` and to the
            line-length confidence check.
        year: ``None`` or one of '68', '1968', '74', '1974', '85', '1985';
            selects which extra keyword and value lists are added.
        extension: accepted for interface compatibility; not read here.
        tab_size: forwarded to ``self.tokenize_code``.
        wide: forwarded to ``self.tokenize_code``; when truthy the
            line-length confidence check is skipped.

    Raises:
        CodeStatException: if ``year`` is neither ``None`` nor one of the
            recognised year strings.
    """
    super().__init__()
    self.max_expected_line = 80

    years_68 = ['68', '1968']
    years_74_onward = ['74', '1974', '85', '1985']
    years_85 = ['85', '1985']

    year_is_known = year in ['68', '1968', '74', '1974', '85', '1985']
    if not (year is None or year_is_known):
        raise CodeStatException('Unknown year for language')

    ws_builder = WhitespaceTokenBuilder()
    eol_builder = NewlineTokenBuilder()

    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(False, True, None)
    real_exp_builder = RealExponentTokenBuilder(False, True, 'E', None)

    ident_builder = CobolIdentifierTokenBuilder()

    quote_chars = ['"', "'", "’"]
    str_builder = StuffedQuoteStringTokenBuilder(quote_chars, True)

    picture_builder = PictureTokenBuilder()
    cr_picture_builder = CRPictureTokenBuilder()

    # Operand categories produced by the builders in this method. Not read
    # while the operand checks below stay disabled.
    operand_types = ['number', 'identifier', 'string', 'picture', 'value']

    # A period ends a statement in this language.
    terminator_builder = SingleCharacterTokenBuilder(
        '.', 'statement terminator', False)

    operator_names = [
        'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE',
        '+', '-', '*', '/', '**',
        '=', '<>', '>', '>=', '<', '<=',
        'AND', 'OR', 'NOT',
        ':'
    ]
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_names, 'operator', False)

    self.unary_operators = ['+', '-']

    # No group_ends list is defined because calc_operator_3_confidence is
    # not run for this examiner.
    group_symbols = ['(', ')', ',']
    group_starts = ['(']
    group_mids = [',']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_symbols, 'group', False)

    keyword_names = [
        'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
        'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
        'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALTER', 'ALTERNATE', 'AND',
        'APPLY', 'ARE', 'AREA', 'AREAS', 'ASCENDING', 'ASSIGN', 'AT',
        'AUTHOR',
        'BEFORE', 'BLOCK', 'BY',
        'CALL', 'CANCEL', 'CD', 'CF', 'CH', 'CHARACTER', 'CHARACTERS',
        'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'COLUMN', 'COMMA',
        'COMMUNICATION', 'COMP', 'COMPUTATIONAL', 'COMPUTE',
        'CONFIGURATION', 'CONTAINS', 'CONTROL', 'CONTROLS', 'COPY', 'CORR',
        'CORRESPONDING', 'COUNT', 'CURRENCY',
        'DATA', 'DATE', 'DATE-COMPILED', 'DATE-WRITTEN', 'DE',
        'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE', 'DEBUG-NAME',
        'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3', 'DECIMAL-POINT',
        'DECLARATIVES', 'DELIMITED', 'DELIMITER', 'DEPENDING',
        'DESCENDING', 'DESTINATION', 'DETAIL', 'DISABLE', 'DISPLAY',
        'DIVIDE', 'DIVISION', 'DOWN',
        'EGI', 'ELSE', 'EMI', 'ENABLE', 'END', 'ENTER', 'ENVIRONMENT',
        'EQUAL', 'ERROR', 'ESI', 'EVERY', 'EXIT', 'EXTEND',
        'FD', 'FILE', 'FILE-CONTROL', 'FILLER', 'FINAL', 'FIRST',
        'FOOTING', 'FOR', 'FROM',
        'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
        'HEADING', 'HIGH-VALUE', 'HIGH-VALUES',
        'I-O', 'I-O-CONTROL', 'IDENTIFICATION', 'IF', 'IN', 'INDEX',
        'INDEXED', 'INDICATE', 'INITIAL', 'INITIATE', 'INPUT',
        'INPUT-OUTPUT', 'INSTALLATION', 'INTO', 'INVALID', 'IS',
        'JUST', 'JUSTIFIED',
        'KEY',
        'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH', 'LESS', 'LIMIT',
        'LIMITS', 'LINE', 'LINE-COUNTER', 'LINES', 'LINKAGE', 'LOCK',
        'LOW-VALUE', 'LOW-VALUES',
        'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES', 'MOVE',
        'MULTIPLE', 'MULTIPLY',
        'NEGATIVE', 'NEXT', 'NO', 'NOT', 'NUMBER', 'NUMERIC',
        'NUMERIC-EDITED',
        'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED', 'OPEN', 'OPTIONAL',
        'OR', 'OUTPUT', 'OVERFLOW',
        'PAGE', 'PAGE-COUNTER', 'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE',
        'PLUS', 'POINTER', 'POSITION', 'POSITIVE', 'PROCEDURE', 'PROCEED',
        'PROGRAM', 'PROGRAM-ID',
        'QUEUE', 'QUOTE', 'QUOTES',
        'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS',
        'REDEFINES', 'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE',
        'REMAINDER', 'RENAMES', 'REPLACE', 'REPLACING', 'REPORT',
        'REPORTING', 'REPORTS', 'RERUN', 'RESERVE', 'RESET', 'RETURN',
        'REVERSED', 'REWIND', 'REWRITE', 'RF', 'RH', 'RIGHT', 'ROUNDED',
        'RUN',
        'SAME', 'SD', 'SEARCH', 'SECTION', 'SECURITY', 'SEGMENT',
        'SEGMENT-LIMIT', 'SELECT', 'SEND', 'SENTENCE', 'SEQUENCE',
        'SEQUENTIAL', 'SET', 'SIGN', 'SIZE', 'SORT', 'SOURCE',
        'SOURCE-COMPUTER', 'SPECIAL-NAMES', 'STANDARD', 'STATUS', 'STOP',
        'STRING', 'SUB-QUEUE-1', 'SUB-QUEUE-2', 'SUB-QUEUE-3', 'SUBTRACT',
        'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC', 'SYNCHRONIZED',
        'TABLE', 'TALLY', 'TAPE', 'TERMINAL', 'TERMINATE', 'TEST', 'TEXT',
        'THAN', 'THEN', 'THROUGH', 'THRU', 'TIME', 'TIMES', 'TITLE', 'TO',
        'TYPE',
        'UNIT', 'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
        'VALUE', 'VALUES', 'VARYING',
        'WHEN', 'WITH', 'WORDS', 'WORKING-STORAGE', 'WRITE'
    ]

    keyword_names_68_only = [
        'ACTUAL', 'FILE-LIMITS', 'NOMINAL', 'PROCESSING',
        'NOTE', 'REMARKS', 'SEEK', 'TODAY'
    ]

    keyword_names_74 = [
        'ALSO', 'BOTTOM', 'CODE-SET', 'COLLATING', 'COMMON',
        'DAY', 'DELETE', 'DEBUGGING', 'DUPLICATES', 'DYNAMIC',
        'END-OF-PAGE', 'EOP', 'EXCEPTION',
        'INSPECT', 'LINAGE', 'LINAGE-COUNTER', 'NATIVE',
        'ORGANIZATION', 'PACKED-DECIMAL', 'PADDING', 'PRINTING',
        'PROCEDURES', 'REFERENCES', 'REMOVAL',
        'SEPARATE', 'SORT-MERGE', 'STANDARD-1', 'STANDARD-2', 'START',
        'TALLYING', 'TOP', 'TRAILING'
    ]

    keyword_names_85 = [
        'ALPHABET', 'ANY', 'BINARY', 'CONTENT', 'CONTINUE', 'CONVERTING',
        'DAY-OF-WEEK',
        'END-ADD', 'END-CALL', 'END-COMPUTE', 'END-DELETE', 'END-DIVIDE',
        'END-EVALUATE', 'END-IF', 'END-MULTIPLY', 'END-PERFORM',
        'END-READ', 'END-RECEIVE', 'END-RETURN', 'END-REWRITE',
        'END-SEARCH', 'END-START', 'END-STRING', 'END-SUBTRACT',
        'END-UNSTRING', 'END-WRITE',
        'EVALUATE', 'EXTERNAL', 'INITIALIZE', 'ORDER', 'OTHER', 'PURGE'
    ]

    # The '74 additions also apply to '85; the '68-only words are dropped
    # for later years.
    if year in years_68:
        keyword_names.extend(keyword_names_68_only)
    if year in years_74_onward:
        keyword_names.extend(keyword_names_74)
    if year in years_85:
        keyword_names.extend(keyword_names_85)

    keyword_builder = CaseInsensitiveListTokenBuilder(
        keyword_names, 'keyword', False)

    value_names = [
        'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS',
        'NO', 'OFF', 'ON'
    ]
    if year in years_85:
        value_names.extend(['FALSE', 'TRUE'])

    value_builder = CaseInsensitiveListTokenBuilder(
        value_names, 'value', True)

    exec_builder = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

    fallback_builder = InvalidTokenBuilder()

    # The order of this list is preserved as-is.
    builders = [
        eol_builder,
        ws_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        real_builder,
        real_exp_builder,
        picture_builder,
        cr_picture_builder,
        keyword_builder,
        operator_builder,
        group_builder,
        value_builder,
        ident_builder,
        str_builder,
        exec_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    tokenizer = Tokenizer(builders)
    stream = self.tokenize_code(code, tab_size, tokenizer, wide)
    for category in ('invalid operator', 'invalid', 'whitespace'):
        stream = Examiner.combine_adjacent_identical_tokens(stream, category)
    self.tokens = stream

    self.convert_numbers_to_pictures()
    self.convert_numbers_to_levels()

    self.calc_statistics()

    # Confidence checks run on the source tokens with all lines joined.
    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        pair_exceptions = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(
            joined, operator_count, pair_exceptions)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, pair_exceptions)

    self.calc_group_confidence(joined, group_mids)

    # The operand-adjacency checks (calc_operand_n_confidence) are not run
    # for this examiner.

    self.calc_keyword_confidence()
    self.calc_picture_confidence()

    if not wide:
        self.calc_line_length_confidence(code, self.max_expected_line)

    self.confidences['expected_keywords'] = self.check_expected_keywords()
def __init__(self, code):
    """Tokenize Swift source text and compute its confidence factors.

    Builds the token builders for the Swift examiner, tokenizes ``code``,
    stores the merged token list on ``self.tokens`` and then runs the
    statistics and confidence calculations.

    Args:
        code: the source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    # --- literals and names -------------------------------------------
    operand_types = [
        'number', 'identifier', 'symbol', 'string', 'type', 'value'
    ]

    blank_tb = WhitespaceTokenBuilder()
    eol_tb = NewlineTokenBuilder()
    separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    int_tb = IntegerTokenBuilder('_')
    int_exp_tb = IntegerExponentTokenBuilder('_')
    float_tb = RealTokenBuilder(True, True, '_')
    float_exp_tb = RealExponentTokenBuilder(True, True, 'E', '_')

    arg_tb = SwiftArgumentTokenBuilder()

    name_tb = SuffixedIdentifierTokenBuilder('_', '_', '?')
    attr_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    sym_tb = SwiftSymbolTokenBuilder('.', 'symbol', True)

    quote_chars = ['"', "'", "’"]
    str_tb = EscapedStringTokenBuilder(quote_chars, 10)
    triple_str_tb = TripleQuoteStringTokenBuilder(quote_chars)
    line_comment_tb = SlashSlashCommentTokenBuilder()
    block_comment_tb = SlashStarCommentTokenBuilder()

    # --- operators and groupers ---------------------------------------
    operator_tb = CaseSensitiveListTokenBuilder(
        [
            '+', '-', '*', '/', '%',
            '==', '!=', '>', '<', '>=', '<=',
            '&&', '||', '!', '&', '|', '^',
            '~', '<<', '>>', '===',
            '=', '+=', '-=', '*=', '/=', '%=', '<<=', '>>=', '&=',
            '^=', '|=',
            '...', '..<', '?', ':',
            '.', '++', '--',
            '->', '??', '\\.',
            '&+', '&-', '&*'
        ],
        'operator', False)

    self.unary_operators = [
        '+', '-', '!', '~', '&', '++', '--', ':', '?'
    ]
    self.postfix_operators = ['++', '--', ':', '!', '?']

    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']
    group_tb = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

    # --- keywords, types, values --------------------------------------
    kw_tb = CaseSensitiveListTokenBuilder(
        [
            'associatedtype', 'class', 'deinit', 'enum', 'extension',
            'fileprivate', 'func', 'import', 'init', 'inout', 'internal',
            'let', 'open', 'operator', 'private', 'protocol', 'public',
            'static', 'struct', 'subscript', 'typealias', 'var',
            'break', 'case', 'continue', 'default', 'defer', 'do', 'else',
            'fallthrough', 'for', 'guard', 'if', 'in', 'repeat', 'return',
            'switch', 'where', 'while',
            'as', 'Any', 'catch', 'is', 'rethrows', 'super',
            'throw', 'throws', 'try', 'try?', 'try!',
            '#available', '#colorLiteral', '#column', '#else', '#elseif',
            '#endif', '#file', '#fileLiteral', '#function', '#if',
            '#imageLiteral', '#line', '#selector', '#sourceLocation',
            'associativity', 'convenience', 'dynamic', 'didSet', 'final',
            'get', 'infix', 'indirect', 'lazy', 'left', 'mutating', 'none',
            'nonmutating', 'optional', 'override', 'postfix', 'precedence',
            'prefix', 'Protocol', 'required', 'right', 'set', 'Type',
            'unowned', 'weak', 'willSet'
        ],
        'keyword', False)

    type_tb = CaseSensitiveListTokenBuilder(
        ['char', 'double', 'float', 'int', 'long', 'short'], 'type', True)

    value_tb = CaseSensitiveListTokenBuilder(
        ['nil', 'Self', 'false', 'true'], 'value', True)

    fallback_tb = InvalidTokenBuilder()

    # --- tokenize -----------------------------------------------------
    builders = [
        eol_tb,
        blank_tb,
        separator_tb,
        int_tb,
        int_exp_tb,
        float_tb,
        float_exp_tb,
        arg_tb,
        kw_tb,
        type_tb,
        value_tb,
        operator_tb,
        group_tb,
        name_tb,
        attr_tb,
        sym_tb,
        str_tb,
        line_comment_tb,
        block_comment_tb,
        triple_str_tb,
        self.unknown_operator_tb,
        fallback_tb
    ]

    merged = Tokenizer(builders).tokenize(code)
    # collapse runs of unrecognized operators, then runs of invalid text
    for category in ('invalid operator', 'invalid'):
        merged = Examiner.combine_adjacent_identical_tokens(
            merged, category)
    self.tokens = merged

    self.convert_keywords_to_identifiers(['.'])

    # --- statistics and confidence factors ----------------------------
    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the operator_2/3/4 checks are taken to belong inside
    # this guard -- confirm against the original layout.
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    # the list declared at the top is replaced by this narrower one
    # before any operand check reads it
    operand_types = ['number', 'string', 'symbol']
    for span in (2, 4):
        self.calc_operand_n_confidence(joined, operand_types, span)

    self.calc_keyword_confidence()

    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Haskell source text and compute its confidence factors.

    Builds the token builders for the Haskell examiner, tokenizes ``code``,
    stores the resulting tokens on ``self.tokens`` and runs the statistics
    and confidence calculations.

    Args:
        code: the source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    # Collected for parity with the other examiners; no operand check
    # reads this list in this method.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    identifier_tb = HaskellIdentifierTokenBuilder()
    operand_types.append('identifier')

    class_tb = HaskellClassTokenBuilder()
    operand_types.append('class')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    line_comment_tb = LeadToEndOfLineTokenBuilder('--', False, 'comment')
    block_comment_tb = BlockTokenBuilder('{-', '-}', 'comment')

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    operators_tb = HaskellOperatorTokenBuilder('#$%&*+./<=>?@\\^|-~')

    known_operators = ["'", '..']
    known_operators_tb = CaseInsensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.postfix_operators = ['..', "'"]

    # 'infixl' was previously spelled 'infix1' (digit one), which is not
    # a Haskell reserved word; the fixity keywords are infix/infixl/infixr.
    keywords = [
        'case', 'class',
        'data', 'deriving', 'do',
        'else',
        'if', 'import', 'in', 'infix', 'infixl', 'infixr', 'instance',
        'let',
        'module',
        'newtype',
        'of',
        'then', 'type',
        'where'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

    values = ['True', 'False', 'Nothing', '_']

    value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        groupers_tb,
        operators_tb,
        known_operators_tb,
        identifier_tb,
        value_tb,
        class_tb,
        string_tb,
        line_comment_tb,
        block_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # Identifier/colon combining and label conversion are not applied here
    # (both calls were disabled in the original).
    HaskellExaminer.convert_keywords_to_identifiers(tokens)
    self.tokens = tokens

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    # calc_token_2_confidence is not applied here (disabled in the
    # original).
    self.calc_token_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the operator_3/4 checks are taken to belong inside
    # this guard -- confirm against the original layout.
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        # calc_operator_2_confidence is not applied here (disabled in the
        # original).
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # The operand-sequence checks (calc_operand_n_confidence) are not
    # applied here (disabled in the original).

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, tab_size, wide):
    """Tokenize PL/1 source as free- and fixed-format and keep the better.

    The source is tokenized twice: once directly with the free-format
    tokenizer and once through ``self.tokenize_code`` with the two
    fixed-format tokenizers. Statistics, confidence factors and errors are
    computed for each attempt; the attempt whose confidence factors have
    the larger product is installed on ``self`` (free-format wins ties).

    Args:
        code: the source text to examine.
        tab_size: forwarded unchanged to ``self.tokenize_code``.
        wide: forwarded unchanged to ``self.tokenize_code``.
    """
    super().__init__()

    self.operand_types = []

    self.whitespace_tb = WhitespaceTokenBuilder()
    self.newline_tb = NewlineTokenBuilder()

    self.integer_tb = IntegerTokenBuilder(None)
    self.integer_exponent_tb = IntegerExponentTokenBuilder(None)
    self.binary_integer_tb = SuffixedIntegerTokenBuilder(
        ['B'], False, None)
    self.real_tb = RealTokenBuilder(False, False, None)
    self.real_exponent_tb = RealExponentTokenBuilder(
        False, False, 'E', None)
    self.binary_real_tb = SuffixedRealTokenBuilder(
        True, True, ['B'], False, None)
    self.operand_types.append('number')

    leads = '_'
    extras = '_'
    self.identifier_tb = IdentifierTokenBuilder(leads, extras)
    self.operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    self.string_tb = EscapedStringTokenBuilder(quotes, 0)
    self.operand_types.append('string')

    self.label_tb = PL1LabelTokenBuilder()
    self.operand_types.append('label')

    self.slash_star_comment_tb = SlashStarCommentTokenBuilder()

    self.jcl_tb = JCLTokenBuilder()

    directives = [
        '%ACTIVATE',
        '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY', '%DO',
        '%ELSE', '%END',
        '%FATAL',
        '%GOTO',
        '%IF', '%INCLUDE',
        '%LIST',
        '%NOLIST',
        '%PAGE', '%PROCEDURE', '%PROC',
        '%REPLACE', '%RETURN',
        '%THEN'
    ]

    self.line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    self.preprocessor_tb = CaseInsensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    self.title_tb = LeadToEndOfLineTokenBuilder(
        '%TITLE', True, 'preprocessor')
    self.subtitle_tb = LeadToEndOfLineTokenBuilder(
        '%SBTTL', True, 'preprocessor')
    self.error_tb = LeadToEndOfLineTokenBuilder(
        '%ERROR', True, 'preprocessor')
    self.warn_tb = LeadToEndOfLineTokenBuilder(
        '%WARN', True, 'preprocessor')
    self.inform_tb = LeadToEndOfLineTokenBuilder(
        '%INFORM', True, 'preprocessor')
    self.terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '**',
        '>', '<', '=', '>=', '<=',
        '¬>', '¬<', '¬=',
        '^>', '^<', '^=', '^',
        '~>', '~<', '~=', '~',
        '¬', '&', '&:',
        '|', '|:', '||',
        '!', '!:', '!!',
        ':'
    ]

    self.unary_operators = ['+', '-', '^', '~', '¬']
    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    self.group_starts = ['(', '[', ',', '{']
    self.group_mids = [',']
    self.group_ends = [')', ']', '}']

    self.groupers_tb = CaseInsensitiveListTokenBuilder(
        groupers, 'group', False)

    self.known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'ALLOCATE', 'ALLOC',
        'BEGIN',
        'CALL', 'CLOSE',
        'DECLARE', 'DCL', 'DO',
        'ELSE', 'END',
        'FORMAT', 'FREE',
        'GET', 'GOTO', 'GO TO',
        'IF',
        'LEAVE',
        'ON', 'OPEN', 'OTHERWISE', 'OTHER',
        'PROCEDURE', 'PROC', 'PUT',
        'READ', 'RETURN', 'REVERT', 'REWRITE',
        'SELECT', 'SIGNAL', 'STOP',
        'THEN',
        'WHEN', 'WRITE'
    ]

    self.keyword_tb = CaseInsensitiveListTokenBuilder(
        keywords, 'keyword', False)

    # 'INT' and 'KEYED' were previously written without a separating
    # comma, which Python concatenates into the single entry 'INTKEYED'.
    attributes = [
        'ALIGNED', 'ANY', 'AREA',
        'BASED', 'BUILTIN',
        'CONDITION', 'COND', 'CONTROLLED', 'CTL',
        'DEFINED', 'DEF', 'DIRECT',
        'ENTRY', 'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT',
        'FILE',
        'GLOBALDEF', 'GLOBALREF',
        'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
        'KEYED',
        'LABEL', 'LIKE', 'LIST',
        'MEMBER',
        'NONVARYING', 'NONVAR',
        'OPTIONAL', 'OPTIONS', 'OUTPUT',
        'PARAMETER', 'PARM', 'PICTURE', 'PIC', 'POSITION', 'POS',
        'PRECISION', 'PREC', 'PRINT',
        'READONLY', 'RECORD', 'REFER', 'RETURNS',
        'SEQUENTIAL', 'SEQL', 'STATIC', 'STREAM', 'STRUCTURE',
        'TRUNCATE',
        'UNALIGNED', 'UNAL', 'UNION', 'UPDATE',
        'VARIABLE', 'VARYING', 'VAR'
    ]

    self.attributes_tb = CaseInsensitiveListTokenBuilder(
        attributes, 'attribute', False)

    functions = [
        'ABS', 'ACOS', 'ACTUALCOUNT', 'ADD', 'ADDR', 'ADDREL',
        'ALLOCATION', 'ALLOCN', 'ASIN', 'ATAN', 'ATAND', 'ATANH',
        'AUTOMATIC', 'AUTO',
        'BINARY', 'BIN', 'BIT', 'BOOL', 'BYTE', 'BYTESIZE',
        'CEIL', 'CHARACTER', 'CHAR', 'COLLATE', 'COPY', 'COS', 'COSD',
        'COSH',
        'DATE', 'DATETIME', 'DECIMAL', 'DEC', 'DECODE', 'DESCRIPTOR',
        'DESC', 'DIMENSION', 'DIM', 'DIVIDE',
        'EMPTY', 'ENCODE', 'ERROR', 'EVERY', 'EXP',
        'FIXED', 'FLOAT', 'FLOOR',
        'HBOUND', 'HIGH',
        'INDEX', 'INFORM', 'INT',
        'LBOUND', 'LENGTH', 'LINE', 'LINENO', 'LOG', 'LOG10', 'LOG2',
        'LOW', 'LTRIM',
        'MAX', 'MAXLENGTH', 'MIN', 'MOD', 'MULTIPLY',
        'NULL',
        'OFFSET', 'ONARGSLIST', 'ONCHAR', 'ONCODE', 'ONFILE', 'ONKEY',
        'ONSOURCE',
        'PAGENO', 'POINTER', 'PTR', 'POSINT', 'PRESENT', 'PROD',
        'RANK', 'REFERENCE', 'REVERSE', 'ROUND', 'RTRIM',
        'SEARCH', 'SIGN', 'SIN', 'SIND', 'SINH', 'SIZE', 'SOME', 'SQRT',
        'STRING', 'SUBSTR', 'SUBTRACT', 'SUM',
        'TAN', 'TAND', 'TANH', 'TIME', 'TRANSLATE', 'TRIM', 'TRUNC',
        'UNSPEC',
        'VALID', 'VALUE', 'VAL', 'VARIANT', 'VERIFY',
        'WARN'
    ]

    self.function_tb = CaseInsensitiveListTokenBuilder(
        functions, 'function', True)

    format_items = [
        'A',
        'B', 'B1', 'B2', 'B3', 'B4',
        'COLUMN', 'COL',
        'E',
        'F',
        'P',
        'R',
        'TAB',
        'X'
    ]

    # Built and registered as an operand type, but not added to either
    # tokenizer's builder list below.
    self.format_item_tb = CaseSensitiveListTokenBuilder(
        format_items, 'format', True)
    self.operand_types.append('format')

    options = [
        'APPEND',
        'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT', 'BLOCK_IO',
        'BLOCK_SIZE', 'BUCKET_SIZE', 'BY',
        'CANCEL_CONTROL_O', 'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS',
        'CONTIGUOUS_BEST_TRY', 'CREATION_DATE', 'CURRENT_POSITION',
        'DEFAULT_FILE_NAME', 'DEFERRED_WRITE', 'DELETE',
        'EDIT', 'EXPIRATION_DATE', 'EXTENSION_SIZE',
        'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO', 'FILE_SIZE',
        'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
        'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
        'FIXED_LENGTH_RECORDS', 'FROM',
        'GROUP_PROTECTION',
        'IDENT', 'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
        'INITIAL_FILL', 'INTO',
        'KEY', 'KEYFROM', 'KEYTO',
        'LINESIZE', 'LOCK_ON_READ', 'LOCK_ON_WRITE',
        'MAIN PROCEDURE', 'MANUAL_UNLOCKING', 'MATCH_GREATER',
        'MATCH_GREATER_EQUAL', 'MATCH_NEXT', 'MATCH_NEXT_EQUAL',
        'MAXIMUM_RECORD_NUMBER', 'MAXIMUM_RECORD_SIZE',
        'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
        'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
        'NO_ECHO', 'NO_FILTER', 'NO_SHARE',
        'OWNER_GROUP', 'OWNER_ID', 'OWNER_MEMBER', 'OWNER_PROTECTION',
        'PAGE', 'PAGESIZE', 'PRINTER_FORMAT', 'PROMPT',
        'PURGE_TYPE_AHEAD',
        'READ_AHEAD', 'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID',
        'RECORD_ID_ACCESS', 'RECORD_ID_TO', 'RECURSIVE', 'REPEAT',
        'RESCAN', 'RETRIEVAL_POINTERS', 'REVISION_DATE',
        'REWIND_ON_CLOSE', 'REWIND_ON_OPEN',
        'SCALARVARYING', 'SET READ', 'SHARED_READ', 'SHARED_WRITE',
        'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE', 'SYSTEM',
        'SYSTEM_PROTECTION',
        'TEMPORARY', 'TIMEOUT_PERIOD', 'TITLE', 'TO',
        'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
        'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
        'WRITE_CHECK'
    ]

    self.options_tb = CaseInsensitiveListTokenBuilder(
        options, 'option', False)

    conditions = [
        'ANYCONDITION',
        'CONVERSION', 'CONV',
        'ENDFILE', 'ENDPAGE',
        'FINISH', 'FIXEDOVERFLOW', 'FOFL',
        'OVERFLOW', 'OFL',
        'STORAGE', 'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG',
        'UNDEFINEDFILE', 'UNDF',
        'VAXCONDITION',
        'ZERODIVIDE', 'ZDIV'
    ]

    self.conditions_tb = CaseInsensitiveListTokenBuilder(
        conditions, 'condition', False)

    subroutines = [
        'DISPLAY',
        'EXTEND',
        'FLUSH',
        'NEXT_VOLUME',
        'RELEASE', 'RESIGNAL', 'REWIND',
        'SPACEBLOCK'
    ]

    self.subroutines_tb = CaseInsensitiveListTokenBuilder(
        subroutines, 'subroutine', False)

    types = [
        'FIXED', 'BINARY', 'FLOAT', 'DECIMAL',
        'BIT', 'CHARACTER', 'PICTURE'
    ]

    self.types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
    self.operand_types.append('type')

    values = ['SYSIN', 'SYSPRINT']

    self.values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    self.operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    def evaluate(candidate_tokens):
        """Install the tokens, score them, and return the detached results.

        Returns the (statistics, confidences, errors) produced for
        ``candidate_tokens`` and leaves those three attributes reset on
        ``self`` so the next attempt starts clean.
        """
        self.tokens = candidate_tokens

        self.calc_statistics()
        statistics = self.statistics
        # statistics are detached before the confidence calculations run,
        # exactly as in the original sequence
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(
            ['operator', 'invalid operator'])
        # NOTE(review): the operator_2/3/4 checks are taken to belong
        # inside this guard -- confirm against the original layout.
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(
                tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(
                tokens, num_operators, self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(
                tokens, num_operators, self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences = self.confidences
        self.confidences = {}
        errors = self.errors
        self.errors = []

        return statistics, confidences, errors

    def overall(confidences):
        """Return the product of all factors, or 0.0 when there are none."""
        if not confidences:
            return 0.0
        product = 1.0
        for factor in confidences.values():
            product *= factor
        return product

    # tokenize as free-format
    tokenbuilders_free = [
        self.newline_tb,
        self.whitespace_tb,
        self.line_continuation_tb,
        self.terminators_tb,
        self.integer_tb,
        self.integer_exponent_tb,
        self.binary_integer_tb,
        self.real_tb,
        self.real_exponent_tb,
        self.binary_real_tb,
        self.keyword_tb,
        self.function_tb,
        self.attributes_tb,
        self.options_tb,
        self.conditions_tb,
        self.subroutines_tb,
        self.types_tb,
        self.values_tb,
        self.groupers_tb,
        self.known_operator_tb,
        self.identifier_tb,
        self.string_tb,
        self.label_tb,
        self.slash_star_comment_tb,
        self.preprocessor_tb,
        self.title_tb,
        self.subtitle_tb,
        self.error_tb,
        self.warn_tb,
        self.inform_tb,
        self.jcl_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer_free = Tokenizer(tokenbuilders_free)
    tokens_free = tokenizer_free.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')

    statistics_free, confidences_free, errors_free = evaluate(tokens_free)

    # tokenize as fixed-format; the base builder list is the same as the
    # free-format one
    tokenbuilders_fixed = list(tokenbuilders_free)

    comment_start_tb = PL1CommentStartTokenBuilder()
    comment_middle_tb = PL1CommentMiddleTokenBuilder()
    comment_end_tb = PL1CommentEndTokenBuilder()

    # invalid_token_builder is already the last entry of the base list;
    # the trailing copy is kept so both tokenizers receive the same
    # builder sequence as before.
    type1_tokenbuilders = [comment_start_tb]
    tokenbuilders_fixed_1 = (
        tokenbuilders_fixed + type1_tokenbuilders + [invalid_token_builder]
    )
    tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

    type2_tokenbuilders = [
        comment_start_tb, comment_middle_tb, comment_end_tb
    ]
    tokenbuilders_fixed_2 = (
        tokenbuilders_fixed + type2_tokenbuilders + [invalid_token_builder]
    )
    tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

    tokens_fixed = self.tokenize_code(
        code, tab_size, tokenizer_fixed_1, tokenizer_fixed_2, wide)
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid operator')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'whitespace')
    tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)

    statistics_fixed, confidences_fixed, errors_fixed = evaluate(
        tokens_fixed)

    # compute confidence for free-format and fixed-format
    confidence_free = overall(confidences_free)
    confidence_fixed = overall(confidences_fixed)

    # select the better of free-format and fixed-format
    if confidence_fixed > confidence_free:
        self.tokens = tokens_fixed
        self.statistics = statistics_fixed
        self.confidences = confidences_fixed
        self.errors = errors_fixed
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code):
    """Tokenize the source text and compute its confidence factors.

    This examiner treats ';' as the start of a comment, '&' as a line
    continuation and recognizes dotted directives such as '.PROC' and
    dotted relational operators such as '.EQ.'. The merged tokens are
    stored on ``self.tokens`` before statistics and confidences are
    calculated.

    Args:
        code: the source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()
    self.newlines_important = 'always'

    # categories counted as operands by the 4-token operand check
    operand_types = ['number', 'identifier', 'string']

    blank_tb = WhitespaceTokenBuilder()
    eol_tb = NewlineTokenBuilder()

    int_tb = IntegerTokenBuilder("")

    name_tb = IdentifierTokenBuilder('_$', '_$')

    str_tb = EscapedStringTokenBuilder(['"', "'", "’"], 0)

    remark_tb = LeadToEndOfLineTokenBuilder(';', False, 'comment')

    directive_tb = CaseInsensitiveListTokenBuilder(
        [
            '.PROC',
            '.LIST',
            '.NOLIST',
            '.PAGE',
            '.INCLUDE',
            '.IFDEF',
            '.ENDC',
            '.IFNDEF',
            '.END'
        ],
        'directive', False)

    continuation_tb = SingleCharacterTokenBuilder(
        '&', 'line continuation', False)

    title_tb = LeadToEndOfLineTokenBuilder('.TITLE', True, 'directive')

    self.unary_operators = ['.NOT.', '+', '-']
    self.postfix_operators = []

    group_starts = ['(', '[', ',']
    group_ends = [')', ']']
    group_mids = [',', ':']

    group_tb = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', ':'], 'group', False)

    operator_tb = CaseSensitiveListTokenBuilder(
        [
            '=',
            '+', '-', '*', '/',
            '.EQ.', '.NE.', '.LT.', '.LE.', '.GE.', '.GT.',
            '.NOT.', '.AND.', '.OR.', '.XOR.'
        ],
        'operator', False)

    kw_tb = CaseSensitiveListTokenBuilder(
        [
            'ACCEPT',
            'BEGIN', 'BY',
            'CALL', 'CLEAR', 'CLOSE', 'COMMON',
            'DELETE', 'DISPLAY', 'DO',
            'ELSE', 'END', 'ENDUSING',
            'FOR',
            'GOTO',
            'IF', 'INCR',
            'OPEN',
            'PROC',
            'READ', 'READS', 'RECORD', 'RETURN',
            'SELECT', 'STORE', 'SUBROUTINE',
            'THEN', 'THRU',
            'UNTIL', 'USING',
            'WHILE', 'WRITE', 'WRITES',
            'XCALL'
        ],
        'keyword', False)

    fallback_tb = InvalidTokenBuilder()

    builders = [
        eol_tb,
        blank_tb,
        continuation_tb,
        int_tb,
        kw_tb,
        group_tb,
        operator_tb,
        name_tb,
        remark_tb,
        str_tb,
        directive_tb,
        title_tb,
        self.unknown_operator_tb,
        fallback_tb
    ]

    merged = Tokenizer(builders).tokenize(code)
    # collapse runs of unrecognized operators, then runs of invalid text
    for category in ('invalid operator', 'invalid'):
        merged = Examiner.combine_adjacent_identical_tokens(
            merged, category)
    merged = Examiner.combine_identifier_colon(
        merged,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = merged
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence([])

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the operator_2/3/4 checks are taken to belong inside
    # this guard -- confirm against the original layout.
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    # adjacent-pair check uses numbers only; the 4-token check uses the
    # full operand list
    self.calc_operand_n_confidence(joined, ['number'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize the source text and compute its confidence factors.

    This examiner recognizes '--' comments, integers prefixed with '0x',
    '0b' and '0c', word operators such as 'and then' / 'or else', and the
    ':=' operator. The merged tokens are stored on ``self.tokens`` before
    statistics and confidences are calculated.

    Args:
        code: the source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    # categories counted as operands by the 4-token operand check
    operand_types = ['number', 'identifier', 'string', 'type', 'value']

    blank_tb = WhitespaceTokenBuilder()
    eol_tb = NewlineTokenBuilder()

    # numeric literals
    int_tb = IntegerTokenBuilder("'")
    int_exp_tb = IntegerExponentTokenBuilder("'")
    hex_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF_')
    bin_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    oct_tb = PrefixedIntegerTokenBuilder('0c', False, '01234567_')
    float_tb = RealTokenBuilder(False, False, "'")
    float_exp_tb = RealExponentTokenBuilder(False, False, 'E', "'")

    name_tb = IdentifierTokenBuilder('_', '_')

    str_tb = EscapedStringTokenBuilder(['"', "'", "’"], 0)

    remark_tb = LeadToEndOfLineTokenBuilder('--', True, 'comment')

    self.unary_operators = ['+', '-', 'not', '@', '#', '|', '&']
    self.postfix_operators = []

    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ';', ':']
    group_ends = [')', ']', '}']

    group_tb = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}', ':', ';'], 'group', False)

    operator_tb = CaseSensitiveListTokenBuilder(
        [
            ':=', '=', '/=', '<', '>', '<=', '>=',
            '+', '-', '*', '/', '//', '\\\\', '^',
            '|..|', '..',
            'and', 'or', 'xor', 'not',
            'and then', 'or else', 'implies',
            '.',
            '@', '#', '|', '&'
        ],
        'operator', False)

    kw_tb = CaseSensitiveListTokenBuilder(
        [
            'across', 'agent', 'alias', 'all', 'as', 'assign', 'attribute',
            'check', 'class', 'convert', 'create',
            'debug', 'deferred', 'do',
            'else', 'elseif', 'end', 'ensure', 'expanded', 'export',
            'external',
            'feature', 'from', 'frozen',
            'if', 'implies', 'inherit', 'inspect', 'invariant',
            'like', 'local', 'loop',
            'note',
            'obsolete', 'old', 'once', 'only',
            'redefine', 'rename', 'require', 'rescue', 'retry',
            'select', 'separate',
            'then',
            'undefine', 'until',
            'variant',
            'when'
        ],
        'keyword', False)

    type_tb = CaseSensitiveListTokenBuilder(
        ['Current', 'Precursor', 'Result', 'Void', 'TUPLE'], 'type', True)

    value_tb = CaseSensitiveListTokenBuilder(
        ['False', 'True', '?'], 'value', True)

    fallback_tb = InvalidTokenBuilder()

    builders = [
        eol_tb,
        blank_tb,
        int_tb,
        int_exp_tb,
        hex_tb,
        bin_tb,
        oct_tb,
        float_tb,
        float_exp_tb,
        kw_tb,
        type_tb,
        value_tb,
        group_tb,
        operator_tb,
        name_tb,
        str_tb,
        remark_tb,
        self.unknown_operator_tb,
        fallback_tb
    ]

    merged = Tokenizer(builders).tokenize(code)
    # collapse runs of unrecognized operators, then runs of invalid text
    for category in ('invalid operator', 'invalid'):
        merged = Examiner.combine_adjacent_identical_tokens(
            merged, category)
    self.tokens = merged

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the operator_2/3/4 checks are taken to belong inside
    # this guard -- confirm against the original layout.
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    # adjacent-pair check uses numbers only; the 4-token check uses the
    # full operand list
    self.calc_operand_n_confidence(joined, ['number'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute this examiner's confidence scores.

    The vocabulary below (``def``, ``trait``, ``?.``, ``<=>``, ``==~`` ...)
    looks like Groovy's -- TODO confirm against the enclosing class, which
    is not visible from this method.

    Args:
        code: Source text; passed to the tokenizer and to the line-length
            confidence check.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Every decimal number builder shares one digit-group separator.
    # The plain integer builder used to be given "'" while all of its
    # siblings were given '_'; underscore-grouped integers such as
    # 1_000_000 therefore did not get the same treatment as reals.
    digit_separator = '_'

    integer_tb = IntegerTokenBuilder(digit_separator)
    integer_exponent_tb = IntegerExponentTokenBuilder(digit_separator)
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '_0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(
        ['G', 'L', 'I'], False, digit_separator)
    real_tb = RealTokenBuilder(False, False, digit_separator)
    real_exponent_tb = RealExponentTokenBuilder(
        False, False, 'E', digit_separator)
    suffixed_real_tb = SuffixedRealTokenBuilder(
        False, False, ['G', 'D', 'F'], False, digit_separator)
    operand_types.append('number')

    leads = '@_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
    regex_tb = RegexTokenBuilder()
    # dollar-slash slash-dollar strings (allow newline)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    shebang_tb = SheBangTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%', '**',
        '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=', '^=',
        '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '>>>',
        '^',
        '?.', '?:', '<>', '>>>=',
        '.', '.&', '.@', '::', '=~', '==~', '*.', '*:', '..', '..<',
        '<=>',
        '++', '--',
        '->', '&&', '||', '?',
        '##',
        'as', 'in', '!in', 'instanceof', '!instanceof', 'new',
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']
    self.postfix_operators = ['++', '--', '&', '*']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    # Disabled in the original together with the calc_operator_4_confidence
    # call further down; kept for reference.
    # group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'assert',
        'break',
        'case', 'catch', 'class', 'const', 'continue',
        'def', 'default', 'do',
        'else', 'enum', 'extends',
        'finally', 'for',
        'goto',
        'if', 'implements', 'import', 'interface',
        'new',
        'package',
        'return',
        'super', 'switch',
        'throw', 'throws', 'trait', 'try',
        'var',
        'while'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'byte', 'char', 'double', 'float', 'int', 'long', 'short',
        'Java.lang.BigInteger'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['null', 'true', 'false', 'this']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # The list order is reproduced exactly from the original.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        real_exponent_tb,
        suffixed_real_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        class_type_tb,
        string_tb,
        triple_quote_string_tb,
        regex_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        shebang_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    # shebang line at start

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    # self.tokens must be assigned before the instance-level passes below.
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        # One list instance is shared by the pair-based checks.
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)