def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        RegexTokenBuilder,
        PerlIdentifierTokenBuilder,
        PerlDollarCaretIdentifierTokenBuilder,
        PerlQStringTokenBuilder,
        MRegexTokenBuilder,
        SRegexTokenBuilder,
        YRegexTokenBuilder,
        TrRegexTokenBuilder,
        PerlPrototypeTokenBuilder,
        PerlSigilBraceTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        SuffixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        SuffixedRealTokenBuilder,
        IdentifierTokenBuilder,
        PrefixedIdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        SlashSlashCommentTokenBuilder,
        SlashStarCommentTokenBuilder,
        ClassTypeTokenBuilder,
        HexRealExponentTokenBuilder,
        NestedCommentTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        PrefixedIntegerTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        ParensLabelTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        AssemblyCommentTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        LeadToEndOfLineTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        PrefixedIntegerTokenBuilder,
        BlockTokenBuilder,
        BraceCommentTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StuffedQuoteStringTokenBuilder,
        IdentifierTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        PrefixedIntegerTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        NullTokenBuilder,
        SqlBracketedIdentifierTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StuffedQuoteStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        PrefixedIntegerTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        RemarkTokenBuilder,
        CBasicVariableTokenBuilder,
        CBasicLabelTokenBuilder,
        CBasicSuffixedIntegerTokenBuilder,
        CBasicLineContinuationTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        SingleCharacterTokenBuilder,
        LabelTokenBuilder,
        AssemblyCommentTokenBuilder,
        MultilineCommentTokenBuilder,
        HashQuoteCharTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        PrefixedIdentifierTokenBuilder,
        TripleQuoteStringTokenBuilder,
        SlashSlashCommentTokenBuilder,
        SlashStarCommentTokenBuilder,
        SwiftArgumentTokenBuilder,
        SwiftSymbolTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    The original body had a stray comma after the
    ``LeadToEndOfLineTokenBuilder`` call, which made that call and the
    following one a single throw-away tuple expression.  Both calls still
    ran, but the comma was a typo; each builder is now invoked as its own
    step.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        PrefixedRawStringTokenBuilder,
        TripleQuoteStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        SuffixedRealTokenBuilder,
        PrefixedIdentifierTokenBuilder,
        SuffixedIdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        NestedCommentTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every listed token builder class, in order.

    Returns:
        The literal string 'Escape ?Z'.
    """
    # NOTE(review): the purpose of the hook is not visible from here;
    # presumably it exercises every builder class this module uses - confirm.
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StuffedQuoteStringTokenBuilder,
        IntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        SuffixedRealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        PrefixedIntegerTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        NullTokenBuilder,
        BasicVariableTokenBuilder,
        BasicLongVariableTokenBuilder,
        RemarkTokenBuilder,
        UserFunctionTokenBuilder,
        LongUserFunctionTokenBuilder,
        HardwareFunctionTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __init__(self, code, version):
    """Tokenize Matlab/Octave source text and run the confidence checks.

    Builds the token builders, tokenizes ``code``, stores the merged token
    list on ``self.tokens`` and then calls the ``calc_*`` confidence methods.

    Args:
        code: source text to tokenize.
        version: dialect selector.  When equal to 'octave' the extra
            operators, the 'endfor'/'endif'/'endwhile' keywords and the
            '#' and '#{ ... #}' comment forms are recognized as well.
    """
    super().__init__()

    is_octave = version == 'octave'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    command_tb = PrefixedIdentifierTokenBuilder('!', 'command', False)
    metaclass_tb = PrefixedIdentifierTokenBuilder('?', 'metaclass', False)

    quotes = ['"', "'", "’"]
    string_tb = MatlabStringTokenBuilder(quotes, False)
    operand_types.append('string')

    # '%' forms are always active; the '#' forms are added for Octave only.
    line_comment_m_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')
    line_comment_o_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')
    block_comment_m_tb = BlockTokenBuilder('%{', '%}', 'comment')
    block_comment_o_tb = BlockTokenBuilder('#{', '#}', 'comment')

    line_continuation_tb = KeywordTokenBuilder('...', 'line continuation')

    known_operators = [
        '+', '-', '.*', '*', './', '/', '\\', '.^', '^', ".'", "'",
        '=', '==', '~=', '>', '>=', '<', '<=',
        '&', '|', '&&', '||', '~',
        '@', '.', '.?'
    ]

    operators_octave = [
        '++', '--', '+=', '-=', '*=', '/=', '^=', '!', '!=', '**'
    ]

    if is_octave:
        known_operators += operators_octave

    self.unary_operators = ['+', '-', '~', '@']
    self.postfix_operators = ["'"]

    groupers = ['(', ')', ',', '[', ']', '{', '}', ';', ':']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'break', 'case', 'catch', 'classdef', 'continue',
        'else', 'elseif', 'end', 'for', 'function', 'global',
        'if', 'otherwise', 'parfor', 'persistent', 'return',
        'spmd', 'switch', 'try', 'while'
    ]

    keywords_octave = ['endfor', 'endif', 'endwhile']

    if is_octave:
        keywords += keywords_octave

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    # The list is matched case-sensitively, so every accepted spelling has
    # to appear.  The former entry 'Nan' matched neither 'NaN' nor 'nan'.
    values = ['inf', 'Inf', 'nan', 'NaN']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        command_tb,
        metaclass_tb,
        string_tb,
        line_comment_m_tb,
        block_comment_m_tb
    ]

    tokenbuilders_2 = [line_comment_o_tb, block_comment_o_tb]

    if is_octave:
        tokenbuilders += tokenbuilders_2

    # The catch-all builders go last in the list for both dialects.
    tokenbuilders_9 = [self.unknown_operator_tb, invalid_token_builder]

    tokenbuilders += tokenbuilders_9

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    # NOTE: calc_group_confidence and the 2-operand check were already
    # commented out in the original and are still not run here.
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, block_comment_limit):
    """Tokenize Rust source text and run the confidence checks.

    Builds the token builders, tokenizes ``code``, post-processes the
    tokens (number/type merging, operator and bar conversion), stores them
    on ``self.tokens`` and then calls the ``calc_*`` confidence methods.

    Args:
        code: source text to tokenize.
        block_comment_limit: passed through to ``NestedCommentTokenBuilder``
            for the '/* ... */' comment form.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()
    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)

    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder('_')
    real_tb = RealTokenBuilder(False, True, '_')
    real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', '_')
    octal_integer_tb = PrefixedIntegerTokenBuilder('0o', True, '01234567_')
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', True, '0123456789ABCDEFabcdef_')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', True, '01_')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # Same identifier builder, but with a single quote as the lead character.
    lifetime_tb = IdentifierTokenBuilder("'", extras)

    attribute_tb = RustAttributeTokenBuilder()

    quotes = ['"']
    string_tb = EscapedStringTokenBuilder(quotes, 10)
    bstring_tb = PrefixedStringTokenBuilder('b', True, quotes)
    rstring_tb = RustRawStringTokenBuilder()
    operand_types.append('string')

    char_tb = SingleCharStringTokenBuilder()

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = NestedCommentTokenBuilder(
        '/*', '*/', block_comment_limit)

    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # Fix: the compound-assignment row previously contained '|-' (a typo),
    # so '|=' was never recognized as one operator.
    known_operators = [
        '+', '-', '*', '/', '%', '^', '!', '&', '|', '&&', '||', '<<', '>>',
        '+=', '-=', '*=', '/=', '%=', '^=', '&=', '|=', '<<=', '>>=',
        '=', '==', '!=', '>', '<', '>=', '<=',
        '@', '.', '..', '...', '->', '#', '$', '?',
        'in', '&mut'
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '&mut']

    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::', '=>']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ':', '::', '=>']
    group_ends = [')', ']', '}', ')|']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # Fix: a missing comma used to fuse 'crate' and 'else' into the single
    # string 'crateelse' (implicit literal concatenation), so neither word
    # was treated as a keyword.
    keywords = [
        'as', 'break', 'const', 'continue', 'crate',
        'else', 'enum', 'extern', 'fn', 'for',
        'if', 'impl', 'let', 'loop', 'match', 'mod', 'move', 'mut',
        'pub', 'ref', 'return', 'static', 'struct',
        'trait', 'type', 'unsafe', 'use', 'where', 'while'
    ]

    # NOTE(review): 'static' repeats an entry already present in `keywords`.
    keywords_2018 = ['dyn', 'union', 'static']

    keywords_future = [
        'abstract', 'become', 'box', 'do', 'final', 'macro', 'override',
        'priv', 'typeof', 'unsized', 'virtual', 'yield',
        'async', 'await', 'try'
    ]

    keywords += keywords_2018
    keywords += keywords_future

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'Self',
        'u8', 'i8',
        'u16', 'i16',
        'u32', 'i32',
        'u64', 'i64',
        'u128', 'i128',
        'usize', 'isize',
        'f32', 'f64'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['self', 'true', 'false', 'super', '_']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        octal_integer_tb,
        hex_integer_tb,
        binary_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        char_tb,
        lifetime_tb,
        class_type_tb,
        attribute_tb,
        string_tb,
        bstring_tb,
        rstring_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = self.combine_numbers_and_adjacent_types(tokens)

    self.convert_operators_to_identifiers()
    self.convert_bars_to_groups()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, extension):
    """Tokenize AWK source text and run the confidence checks.

    Args:
        code: source text to tokenize.
        extension: when equal to 'gnu', the additional GNU-only variable
            names are recognized as 'variable' tokens too.
    """
    super().__init__()

    # Only the (disabled) 4-operand check at the end would consume this list.
    operand_types = ['number', 'variable', 'regex', 'identifier', 'string']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(False, False, None)
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)

    # '$' followed by decimal digits
    field_builder = PrefixedIntegerTokenBuilder('$', False, '0123456789')

    builtin_names = [
        'ARGC', 'ARGV', 'ENVIRON', 'FILENAME', 'FS', 'NF', 'NR', 'FNR',
        'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP',
    ]
    if extension == 'gnu':
        builtin_names.extend([
            'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'IGNORECASE',
            'LINT', 'PROCINFO', 'TEXTDOMAIN',
        ])
    builtin_builder = CaseSensitiveListTokenBuilder(
        builtin_names, 'variable', True)

    regex_builder = RegexTokenBuilder()
    ident_builder = IdentifierTokenBuilder('_', '_')
    str_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
    comment_builder = LeadToEndOfLineTokenBuilder('#', False, 'comment')
    continuation_builder = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operators = [
        '=', '+', '-', '*', '/', '%', '^',
        '++', '--', '==', '+=', '-=', '*=', '/=', '%=', '^=',
        '!=', '>', '>=', '<', '<=',
        '&&', '||', '|', '!', '?', ':',
        '~', '!~',
    ]

    self.unary_operators = ['+', '-', '!', '~', '++', '--']
    self.postfix_operators = ['++', '--']

    group_tokens = ['(', ')', ',', '[', ']', '{', '}']
    opening_groups = ['(', '[', ',', '{']
    middle_groups = [',']
    closing_groups = [')', ']', '}']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_tokens, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        operators, 'operator', False)

    keyword_builder = CaseSensitiveListTokenBuilder(
        [
            'BEGIN', 'END',
            'if', 'else', 'while', 'do', 'for',
            'break', 'continue', 'delete', 'next', 'nextfile',
            'function', 'func', 'exit',
        ],
        'keyword', False)

    invalid_builder = InvalidTokenBuilder()

    # The sequence of this list is kept exactly as in the original.
    tokenizer = Tokenizer([
        nl_builder,
        ws_builder,
        continuation_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        builtin_builder,
        field_builder,
        real_builder,
        real_exp_builder,
        keyword_builder,
        operator_builder,
        group_builder,
        regex_builder,
        ident_builder,
        str_builder,
        comment_builder,
        self.unknown_operator_tb,
        invalid_builder,
    ])

    raw_tokens = tokenizer.tokenize(code)
    raw_tokens = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid')

    self.calc_statistics()

    joined = Examiner.join_parens_continued_lines(self.source_tokens())
    joined = Examiner.join_operator_continued_lines(
        joined, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        no_pairs = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(joined, operator_count, no_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, closing_groups, no_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, opening_groups, no_pairs)

    self.calc_group_confidence(joined, middle_groups)

    self.calc_operand_n_confidence(
        joined, ['number', 'variable', 'regex'], 2)
    # self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Kotlin source text and run the confidence checks.

    Args:
        code: source text to tokenize.
    """
    super().__init__()

    # Token groups that count as operands for the 4-operand check below.
    operand_types = [
        'number', 'identifier', 'decorator', 'string',
        'class', 'type', 'value',
    ]

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    int_builder = IntegerTokenBuilder('_')
    int_exp_builder = IntegerExponentTokenBuilder('_')
    hex_builder = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789ABCDEFabcdef_')
    real_builder = RealTokenBuilder(True, True, '_')
    real_exp_builder = RealExponentTokenBuilder(True, True, 'E', '_')

    ident_builder = IdentifierTokenBuilder('_', '_')
    decorator_builder = PrefixedIdentifierTokenBuilder(
        '@', 'decorator', False)

    quote_chars = ['"', "'", "’"]
    str_builder = EscapedStringTokenBuilder(quote_chars, 0)
    triple_str_builder = TripleQuoteStringTokenBuilder(quote_chars)

    line_comment_builder = SlashSlashCommentTokenBuilder()
    block_comment_builder = SlashStarCommentTokenBuilder()

    class_builder = ClassTypeTokenBuilder()

    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operators = [
        '+', '-', '*', '/', '%',
        '=', '+=', '-=', '*=', '/=', '%=',
        '++', '--', '&&', '||', '!',
        '==', '!=', '===', '!==',
        '<', '>', '<=', '>=',
        '!!', '?.', '?:', '::', '..',
        ':', '?', '.',
    ]
    operator_builder = CaseSensitiveListTokenBuilder(
        operators, 'operator', False)

    self.unary_operators = ['+', '-', '!', '*', '++', '--']
    self.postfix_operators = ['++', '--', ':']

    group_tokens = ['->', '(', ')', ',', '[', ']', '{', '}']
    opening_groups = ['(', '[', ',', '{']
    middle_groups = ['->', ',']
    closing_groups = [')', ']', '}']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_tokens, 'group', False)

    keyword_builder = CaseSensitiveListTokenBuilder(
        [
            'as', 'as?', 'break', 'class', 'continue', 'do', 'else',
            'for', 'fun', 'if', 'in', '!in', 'is', '!is', 'object',
            'package', 'return', 'super', 'throw', 'try', 'typealias',
            'typeof', 'val', 'var', 'when', 'while',
            'by', 'catch', 'constructor', 'delegate', 'dynamic', 'field',
            'file', 'finally', 'get', 'import', 'init', 'param',
            'property', 'receiver', 'set', 'setparam', 'where',
            'actual', 'abstract', 'annotation', 'companion', 'const',
            'crossinline', 'data', 'enum', 'expect', 'external', 'final',
            'infix', 'inline', 'inner', 'internal', 'lateinit', 'noinline',
            'open', 'operator', 'out', 'override', 'private', 'protected',
            'public', 'reified', 'sealed', 'suspend', 'tailrec', 'vararg',
        ],
        'keyword', False)

    type_builder = CaseSensitiveListTokenBuilder(
        [
            'Byte', 'Short', 'Int', 'Long',
            'Float', 'Double',
            'Char',
            'u', 'f', 'ul',
            'UInt', 'ULong', 'UByte', 'UShort',
        ],
        'type', True)

    value_builder = CaseSensitiveListTokenBuilder(
        ['false', 'null', 'this', 'true'], 'value', True)

    invalid_builder = InvalidTokenBuilder()

    # The sequence of this list is kept exactly as in the original.
    tokenizer = Tokenizer([
        nl_builder,
        ws_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        hex_builder,
        real_builder,
        real_exp_builder,
        keyword_builder,
        type_builder,
        value_builder,
        operator_builder,
        group_builder,
        ident_builder,
        class_builder,
        decorator_builder,
        str_builder,
        triple_str_builder,
        line_comment_builder,
        block_comment_builder,
        self.unknown_operator_tb,
        invalid_builder,
    ])

    raw_tokens = tokenizer.tokenize(code)
    raw_tokens = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    raw_tokens = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid')
    self.tokens = self.combine_numbers_and_adjacent_types(raw_tokens)

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        no_pairs = []
        self.calc_operator_confidence(operator_count)
        self.calc_operator_2_confidence(joined, operator_count, no_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, closing_groups, no_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, opening_groups, no_pairs)

    self.calc_group_confidence(joined, middle_groups)

    self.calc_operand_n_confidence(
        joined, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, tab_size):
    """Tokenize assembly source two ways and keep the better-scoring result.

    The code is tokenized once as free-format text and once as
    space-format (column-oriented) text via ``Tokenizer.tokenize_asm_code``.
    Each result is scored; the product of the confidence factors decides
    which tokens, statistics, confidences and errors are left on ``self``.
    The free-format result is kept unless the space-format product is
    strictly greater.

    Args:
        code: source text to tokenize.
        tab_size: passed through to ``Tokenizer.tokenize_asm_code``.
    """
    super().__init__()

    operand_types = ['number', 'identifier', 'string', 'value']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    hex_digits = '0123456789abcdefABCDEF'
    int_builder = IntegerTokenBuilder("'")
    int_exp_builder = IntegerExponentTokenBuilder("'")
    real_builder = RealTokenBuilder(True, True, None)
    hex_dollar_builder = PrefixedIntegerTokenBuilder('$', False, hex_digits)
    hex_hash_dollar_builder = PrefixedIntegerTokenBuilder(
        '#$', False, hex_digits)
    hex_amp_builder = PrefixedIntegerTokenBuilder('&', False, hex_digits)
    hex_h_builder = SuffixedIntegerTokenBuilder(
        ['h'], False, 'abcdefABCDEF')
    bin_builder = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_int_builder = SuffixedIntegerTokenBuilder(
        ['Q', 'A', 'O', 'D', 'B'], False, None)

    ident_builder = IbmAsmIdentifierTokenBuilder('_$#.', '_$#.')

    quote_chars = ['"', "'", "’"]
    str_builder = EscapedStringTokenBuilder(quote_chars, 0)
    hex_str_builder = PrefixedStringTokenBuilder('X', False, quote_chars)
    char_str_builder = PrefixedStringTokenBuilder('C', False, quote_chars)

    known_operators = ['+', '-', '*', '/', '=', '&', '#', '?']
    self.unary_operators = ['+', '-', '=', '&', '#', '?']
    self.postfix_operators = []

    group_tokens = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
    group_starts = ['(', '[', ',', '{', '<']
    group_ends = [')', ']', '}', '>']
    group_mids = [',']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_tokens, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # No keyword or type lists are defined for this examiner.

    value_builder = CaseSensitiveListTokenBuilder(['*'], 'value', True)

    comment_builder = AssemblyCommentTokenBuilder(';*')

    title_builder = LeadToEndOfLineTokenBuilder(
        'TITLE', False, 'directive')
    subtitle_builder = LeadToEndOfLineTokenBuilder(
        'SUBTTL', False, 'directive')
    include_builder = LeadToEndOfLineTokenBuilder(
        'INCLUDE', False, 'directive')

    invalid_builder = InvalidTokenBuilder()

    free_builders = [
        nl_builder,
        ws_builder,
        int_builder,
        int_exp_builder,
        hex_dollar_builder,
        hex_hash_dollar_builder,
        hex_amp_builder,
        hex_h_builder,
        bin_builder,
        suffixed_int_builder,
        real_builder,
        value_builder,
        group_builder,
        operator_builder,
        title_builder,
        subtitle_builder,
        include_builder,
        ident_builder,
        str_builder,
        hex_str_builder,
        char_str_builder,
        comment_builder,
        self.unknown_operator_tb,
        invalid_builder,
    ]

    opcode_builders = [ident_builder, invalid_builder]

    args_builders = [
        int_builder,
        int_exp_builder,
        hex_dollar_builder,
        hex_hash_dollar_builder,
        hex_amp_builder,
        hex_h_builder,
        bin_builder,
        suffixed_int_builder,
        real_builder,
        value_builder,
        group_builder,
        operator_builder,
        ident_builder,
        str_builder,
        hex_str_builder,
        char_str_builder,
        comment_builder,
        self.unknown_operator_tb,
        invalid_builder,
    ]

    tokenizer = Tokenizer(free_builders)
    opcode_tokenizer = Tokenizer(opcode_builders)
    args_tokenizer = Tokenizer(args_builders)

    def analyze(candidate_tokens, indents):
        """Score candidate_tokens; return (statistics, confidences, errors).

        Each result container is detached from self and replaced with an
        empty one, so the next pass starts clean.
        """
        self.tokens = candidate_tokens
        self.convert_asm_identifiers_to_labels()

        self.calc_statistics()
        statistics = self.statistics
        self.statistics = {}

        self.calc_confidences(
            operand_types, group_starts, group_mids, group_ends, indents)
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences = self.confidences
        self.confidences = {}

        errors = self.errors
        self.errors = []
        return statistics, confidences, errors

    def overall(confidences):
        """Multiply all confidence factors together, starting from 1.0."""
        product = 1.0
        for factor in confidences.values():
            product *= factor
        return product

    # pass 1: free-format
    tokens_free = tokenizer.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')
    tokens_free = Examiner.convert_values_to_operators(
        tokens_free, known_operators)
    statistics_free, confidences_free, errors_free = analyze(
        tokens_free, None)

    # pass 2: space-format
    tokens_space, indents = Tokenizer.tokenize_asm_code(
        code, tab_size,
        opcode_tokenizer, '.&=,()+-*/',
        args_tokenizer,
        '.&$@', '.&$#@', ':,',
        '*;!', '',
        False)
    tokens_space = Examiner.combine_adjacent_identical_tokens(
        tokens_space, 'invalid operator')
    tokens_space = Examiner.combine_adjacent_identical_tokens(
        tokens_space, 'invalid')
    tokens_space = Examiner.combine_identifier_colon(
        tokens_space, ['newline'], [], [])
    tokens_space = Tokenizer.combine_number_and_adjacent_identifier(
        tokens_space)
    tokens_space = Examiner.convert_values_to_operators(
        tokens_space, known_operators)
    statistics_space, confidences_space, errors_space = analyze(
        tokens_space, indents)

    # keep whichever reading scored higher; ties go to free-format
    confidence_free = overall(confidences_free)
    confidence_space = overall(confidences_space)

    if confidence_space > confidence_free:
        chosen = (tokens_space, statistics_space,
                  confidences_space, errors_space)
    else:
        chosen = (tokens_free, statistics_free,
                  confidences_free, errors_free)

    self.tokens, self.statistics, self.confidences, self.errors = chosen
def __init__(self, code, block_comment_limit):
    """Tokenize ``code`` as Julia source and run the confidence checks.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
        block_comment_limit: Forwarded to the ``#= ... =#`` nested-comment
            token builder.
    """
    super().__init__()
    self.newlines_important = 'parens'

    # Token groups passed to the four-operand confidence check below.
    operand_types = [
        'number', 'identifier', 'symbol', 'attribute', 'string', 'type',
        'value',
    ]

    # Whitespace and line structure.
    ws_tb = WhitespaceTokenBuilder()
    nl_tb = NewlineTokenBuilder()
    continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminator_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # Numeric literals.
    int_tb = IntegerTokenBuilder(None)
    int_exp_tb = IntegerExponentTokenBuilder(None)
    hex_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    float_tb = RealTokenBuilder(False, False, None)
    float_exp_tb = RealExponentTokenBuilder(False, False, 'E', None)
    imag_tb = SuffixedRealTokenBuilder(
        False, False, ['im', 'cx'], True, None)

    # Identifiers and identifier-like tokens.
    ident_tb = SuffixedIdentifierTokenBuilder('_', '_', '!')
    sym_tb = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
    attr_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    dollar_tb = SingleCharacterTokenBuilder('$', 'identifier', True)

    # Lowercase Greek letters only (U+03B1..U+03C9); the capitals
    # (U+0391..U+03A9) are not listed.
    greek_tb = CaseSensitiveListTokenBuilder(
        [
            'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν',
            'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω',
        ],
        'identifier', True)

    # String literals.
    quote_chars = ['"', "'", "’"]
    str_tb = EscapedStringTokenBuilder(quote_chars, 0)
    raw_str_tb = PrefixedRawStringTokenBuilder('raw', True, quote_chars)
    byte_str_tb = PrefixedStringTokenBuilder('b', True, quote_chars)
    triple_str_tb = TripleQuoteStringTokenBuilder(quote_chars)

    # Comments.
    line_comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')
    block_comment_tb = NestedCommentTokenBuilder(
        '#=', '=#', block_comment_limit)

    # Operators.
    operator_tb = CaseSensitiveListTokenBuilder(
        [
            'where', 'in', 'isa', '′', "'",
            '+', '-', '*', '/', '\\', '^', '%', '//',
            '<<', '>>', '<<<', '>>>',
            ':', '=', '==', '!=', '===', '!==',
            '+=', '-=', '*=', '/=', '^=', '%=',
            '<', '>', '<=', '>=',
            '~', '&', '|', '!', '&&', '||',
            '?', '.', '<:', '>:', '::', '->', '...', '..',
            '∀', '≤', '≥', '⊻', '⊽', '⊼',
        ],
        'operator', False)
    self.unary_operators = [
        'isa', '+', '-', '~', '!', '.', ':', '::', "'", '<:', '>:', 'in',
        '..',
    ]
    self.postfix_operators = ['...', '′']

    # Grouping characters.
    group_mids = [',']
    group_ends = [')', ']', '}']
    group_tb = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

    # Reserved words, built-in types and literal values.
    kw_tb = CaseSensitiveListTokenBuilder(
        [
            'baremodule', 'begin', 'break', 'catch', 'const', 'continue',
            'do', 'else', 'elseif', 'end', 'export', 'finally', 'for',
            'function', 'global', 'if', 'import', 'let', 'local', 'macro',
            'module', 'quote', 'return', 'struct', 'try', 'using', 'while',
            'abstract', 'mutable', 'primitive', 'type',
        ],
        'keyword', False)
    type_tb = CaseSensitiveListTokenBuilder(
        [
            'Int8', 'UInt8', 'Int16', 'UInt16', 'Int32', 'UInt32', 'Int64',
            'UInt64', 'Int128', 'UInt128', 'Float16', 'Float32', 'Float64',
            'Bool', 'Char',
        ],
        'type', True)
    value_tb = CaseSensitiveListTokenBuilder(
        ['false', 'true'], 'value', True)

    invalid_tb = InvalidTokenBuilder()

    # The order of this list is preserved from the original.
    builders = [
        nl_tb, ws_tb, continuation_tb, terminator_tb,
        int_tb, int_exp_tb, hex_tb, float_tb, float_exp_tb, imag_tb,
        kw_tb, type_tb, value_tb,
        group_tb, operator_tb,
        ident_tb, sym_tb, attr_tb, dollar_tb, greek_tb,
        str_tb, raw_str_tb, byte_str_tb, triple_str_tb,
        line_comment_tb, block_comment_tb,
        self.unknown_operator_tb, invalid_tb,
    ]

    tokens = Tokenizer(builders).tokenize(code)
    for kind in ('invalid operator', 'invalid'):
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, kind)
    self.tokens = JuliaExaminer.split_symbols_to_operators_identifiers(
        tokens, group_ends)
    self.convert_keywords_to_identifiers()

    self.calc_statistics()

    # The confidence checks below work on the source tokens with
    # continued lines joined.
    tokens = Examiner.join_parens_continued_lines(self.source_tokens())
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(
            tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            tokens, operator_count, group_ends, permitted_pairs)
        # NOTE: calc_operator_4_confidence (the group-start check) is
        # disabled for this examiner.

    self.calc_group_confidence(tokens, group_mids)

    self.calc_operand_confidence(tokens, ['number', 'identifier', 'symbol'])
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and run the confidence checks for this examiner.

    The operator, keyword and literal tables look like Groovy
    (NOTE(review): the class name is outside this block — confirm).

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Every numeric builder in this examiner uses '_' as the extra
    # (digit-separator) character.  The plain integer builder previously
    # used "'" instead, which was inconsistent with the rest.
    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder('_')
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '_0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(
        ['G', 'L', 'I'], False, '_')
    real_tb = RealTokenBuilder(False, False, '_')
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
    suffixed_real_tb = SuffixedRealTokenBuilder(
        False, False, ['G', 'D', 'F'], False, '_')
    operand_types.append('number')

    leads = '@_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
    regex_tb = RegexTokenBuilder()
    # NOTE(review): original reminder — "dollar-slash slash-dollar strings
    # (allow newline)"; no builder for them is created here.
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    shebang_tb = SheBangTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%', '**',
        '=', '==', '!=', '===', '!==',
        '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=', '^=',
        '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '>>>', '^',
        '?.', '?:', '<>', '>>>=',
        '.', '.&', '.@', '::',
        '=~', '==~',
        '*.', '*:',
        '..', '..<',
        '<=>',
        '++', '--',
        '->',
        '&&', '||', '?',
        '##',
        'as', 'in', '!in', 'instanceof', '!instanceof',
        'new',
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']
    self.postfix_operators = ['++', '--', '&', '*']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
        'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
        'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
        'package', 'return', 'super', 'switch', 'throw', 'throws', 'trait',
        'try', 'var', 'while',
    ]
    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'byte', 'char', 'double', 'float', 'int', 'long', 'short',
        'Java.lang.BigInteger',
    ]
    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['null', 'true', 'false', 'this']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # The order of this list is preserved from the original.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        real_exponent_tb,
        suffixed_real_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        class_type_tb,
        string_tb,
        triple_quote_string_tb,
        regex_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        shebang_tb,
        self.unknown_operator_tb,
        invalid_token_builder,
    ]

    # shebang line at start

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens, ['statement terminator', 'newline'], ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # The confidence checks below work on the source tokens with all
    # lines joined.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        # NOTE: calc_operator_4_confidence (the group-start check) is
        # disabled for this examiner.

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and run the confidence checks for this examiner.

    The operator and keyword tables look like Scala
    (NOTE(review): the class name is outside this block — confirm).

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    # Token groups passed to the four-operand confidence check below.
    operand_types = ['number', 'identifier', 'symbol', 'string', 'value']

    # Whitespace and line structure.
    ws_tb = WhitespaceTokenBuilder()
    nl_tb = NewlineTokenBuilder()

    # Numeric literals.
    int_tb = IntegerTokenBuilder("'")
    int_exp_tb = IntegerExponentTokenBuilder("'")
    hex_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF_')
    long_tb = SuffixedIntegerTokenBuilder('L', False, None)
    float_tb = RealTokenBuilder(False, False, "'")
    float_exp_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    f_suffix_tb = SuffixedRealTokenBuilder(
        False, False, ['f'], False, None)

    # Identifiers and quote-prefixed symbols.
    ident_tb = IdentifierTokenBuilder('_', '_')
    sym_tb = PrefixedIdentifierTokenBuilder("'", 'symbol', True)

    # String literals (double quote only).
    quote_chars = ['"']
    str_tb = EscapedStringTokenBuilder(quote_chars, 0)
    triple_str_tb = TripleQuoteStringTokenBuilder(quote_chars)

    # Comments.
    line_comment_tb = SlashSlashCommentTokenBuilder()
    block_comment_tb = SlashStarCommentTokenBuilder()

    continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminator_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # Operators ('=' appears twice, exactly as in the original table).
    operator_tb = CaseSensitiveListTokenBuilder(
        [
            '+', '-', '*', '/', '%',
            '&', '|', '^',
            '<<', '>>', '&&', '||',
            '=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=',
            '<<=', '>>=',
            '>:', '⇒', '=>', '=', '<%', '<:', '←', '<-', '#', '@',
            '==', '!=', '>', '<', '>=', '<=',
            '!', '~',
            '<<<', '>>>',
            '.',
            '++', '--',
            'new',
        ],
        'operator', False)
    self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']
    self.postfix_operators = ['++', '--']

    # Grouping characters.
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ':']
    group_ends = [')', ']', '}']
    group_tb = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}', ':'], 'group', False)

    # Reserved words and literal values.
    kw_tb = CaseSensitiveListTokenBuilder(
        [
            'abstract', 'case', 'catch', 'class', 'def', 'do', 'else',
            'extends', 'final', 'finally', 'for', 'forSome', 'if',
            'implicit', 'import', 'lazy', 'match', 'object', 'override',
            'package', 'private', 'protected', 'return', 'sealed', 'then',
            'throw', 'trait', 'try', 'type', 'val', 'var', 'while', 'with',
            'yield',
        ],
        'keyword', False)
    value_tb = CaseSensitiveListTokenBuilder(
        ['false', 'true', 'null', 'this', 'super'], 'value', True)

    invalid_tb = InvalidTokenBuilder()

    # The order of this list is preserved from the original.
    builders = [
        nl_tb, ws_tb, continuation_tb, terminator_tb,
        int_tb, int_exp_tb, hex_tb, long_tb,
        float_tb, float_exp_tb, f_suffix_tb,
        kw_tb, value_tb,
        group_tb, operator_tb,
        ident_tb, sym_tb,
        str_tb, triple_str_tb,
        line_comment_tb, block_comment_tb,
        self.unknown_operator_tb, invalid_tb,
    ]

    tokens = Tokenizer(builders).tokenize(code)
    for kind in ('invalid operator', 'invalid'):
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, kind)
    self.tokens = tokens

    self.calc_statistics()

    # The confidence checks below work on the source tokens with all
    # lines joined.
    tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(
            tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            tokens, operator_count, group_ends, permitted_pairs)
        self.calc_operator_4_confidence(
            tokens, operator_count, group_starts, permitted_pairs)

    self.calc_group_confidence(tokens, group_mids)

    self.calc_operand_n_confidence(tokens, ['number', 'symbol'], 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` as Lua source and run the confidence checks.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()
    self.newlines_important = 'parens'

    # Token groups passed to the four-operand confidence check below.
    operand_types = ['number', 'identifier', 'string', 'value']

    # Whitespace and line structure.
    ws_tb = WhitespaceTokenBuilder()
    nl_tb = NewlineTokenBuilder()
    terminator_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # Numeric literals.
    int_tb = IntegerTokenBuilder("'")
    int_exp_tb = IntegerExponentTokenBuilder("'")
    hex_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    bin_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    float_tb = RealTokenBuilder(False, False, "'")
    float_exp_tb = RealExponentTokenBuilder(False, False, 'E', "'")

    ident_tb = IdentifierTokenBuilder('_', '_')

    # String literals.
    str_tb = EscapedStringTokenBuilder(['"', "'", "’"], 0)
    bracket_str_tb = DoubleBracketStringTokenBuilder()

    # Operators.
    operator_tb = CaseSensitiveListTokenBuilder(
        [
            '+', '-', '*', '/', '^',
            '<', '>', '<=', '>=', '==', '~=',
            '=', '..', '.', '#', ':',
            'and', 'not', 'or',
        ],
        'operator', False)
    self.unary_operators = ['+', '-', '#', 'not']
    self.postfix_operators = []

    # Grouping characters.
    group_mids = [',']
    group_ends = [')', ']', '}']
    group_tb = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

    # Reserved words and literal values.
    kw_tb = CaseSensitiveListTokenBuilder(
        [
            'break', 'do', 'else', 'elseif', 'end', 'for', 'function',
            'if', 'in', 'local', 'repeat', 'return', 'then', 'until',
            'while',
        ],
        'keyword', False)
    value_tb = CaseSensitiveListTokenBuilder(
        ['false', 'true', 'nil', '...'], 'value', True)

    # Comments.
    line_comment_tb = LeadToEndOfLineTokenBuilder('--', True, 'comment')
    block_comment_tb = LuaBlockCommentTokenBuilder()

    invalid_tb = InvalidTokenBuilder()

    # The order of this list is preserved from the original.
    builders = [
        nl_tb, ws_tb, terminator_tb,
        int_tb, int_exp_tb, hex_tb, bin_tb, float_tb, float_exp_tb,
        kw_tb, value_tb,
        group_tb, operator_tb,
        ident_tb,
        str_tb, bracket_str_tb,
        line_comment_tb, block_comment_tb,
        self.unknown_operator_tb, invalid_tb,
    ]

    tokens = Tokenizer(builders).tokenize(code)
    for kind in ('invalid operator', 'invalid'):
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, kind)
    self.tokens = tokens

    self.calc_statistics()

    # The confidence checks below work on the source tokens with
    # continued lines joined.
    tokens = Examiner.join_parens_continued_lines(self.source_tokens())
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(
            tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            tokens, operator_count, group_ends, permitted_pairs)
        # NOTE: calc_operator_4_confidence (the group-start check) is
        # disabled for this examiner.

    self.calc_group_confidence(tokens, group_mids)

    self.calc_operand_n_confidence(tokens, ['number', 'identifier'], 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and run the confidence checks for this examiner.

    The operator, keyword and type tables look like Dart
    (NOTE(review): the class name is outside this block — confirm).

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    annotation_tb = PrefixedIdentifierTokenBuilder('@', 'annotation', False)
    operand_types.append('annotation')

    symbol_tb = PrefixedIdentifierTokenBuilder('#', 'symbol', True)
    operand_types.append('symbol')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '~/', '%', '^',
        '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '~/=',
        '||', '&&',
        '.', '..',
        ':', '?', '??', '??=',
        'as', 'is', 'is!',
        '++', '--',
        'new',
    ]

    # Each operator is its own element.  The original had no comma between
    # '..' and '?.', so Python's implicit literal concatenation produced
    # the single bogus entry '..?.' and dropped both operators.
    self.unary_operators = [
        '+', '-', '*', '!', '~',
        '.', '..', '?.',
        '++', '--',
        'new',
    ]
    self.postfix_operators = ['++', '--']

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'abstract', 'assert', 'async', 'await', 'break', 'case', 'catch',
        'class', 'const', 'continue', 'covariant', 'default', 'deferred',
        'do', 'dynamic', 'else', 'enum', 'export', 'extends', 'external',
        'factory', 'final', 'finally', 'for', 'Function', 'get', 'hide',
        'if', 'implements', 'import', 'in', 'interface', 'library',
        'mixin', 'on', 'operator', 'part', 'rethrow', 'return', 'set',
        'show', 'static', 'switch', 'sync', 'throw', 'try', 'typedef',
        'var', 'void', 'while', 'with', 'yield',
    ]
    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = ['int', 'double', 'String', 'List', 'bool', 'void']
    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['false', 'true', 'null', 'this', 'super']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # The order of this list is preserved from the original.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        annotation_tb,
        symbol_tb,
        class_type_tb,
        string_tb,
        raw_string_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder,
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens, ['statement terminator', 'newline'], ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # The confidence checks below work on the source tokens with all
    # lines joined.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, block_comment_limit):
    """Tokenize ``code`` and run the confidence checks for this examiner.

    The keyword, type and literal tables look like the D language
    (NOTE(review): the class name is outside this block — confirm).

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
        block_comment_limit: Forwarded to the ``/+ ... +/`` nested-comment
            token builder.
    """
    super().__init__()

    # Token groups passed to the four-operand confidence check below.
    operand_types = [
        'number', 'identifier', 'attribute', 'string', 'class', 'type',
        'value',
    ]

    # Whitespace and line structure.
    ws_tb = WhitespaceTokenBuilder()
    nl_tb = NewlineTokenBuilder()

    # Numeric literals.
    int_tb = IntegerTokenBuilder("'")
    int_exp_tb = IntegerExponentTokenBuilder("'")
    hex_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF_')
    bin_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    int_suffix_tb = SuffixedIntegerTokenBuilder(
        ['U', 'L', 'LU', 'UL'], False, None)
    float_tb = RealTokenBuilder(False, False, "'")
    float_suffix_tb = SuffixedRealTokenBuilder(
        False, False, ['f', 'l', 'i'], False, None)
    float_exp_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    hex_float_tb = HexRealExponentTokenBuilder()

    # Identifiers and '@' attributes.
    ident_tb = IdentifierTokenBuilder('_', '_')
    attr_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)

    # String literals; strings may carry a c/w/d suffix.
    quote_chars = ['"', "'", "’"]
    str_tb = EscapedStringTokenBuilder(quote_chars, 0)
    r_str_tb = PrefixedStringTokenBuilder('r', True, quote_chars)
    backtick_str_tb = EscapedStringTokenBuilder(['`'], 0)
    x_str_tb = PrefixedStringTokenBuilder('x', True, quote_chars)
    q_str_tb = PrefixedStringTokenBuilder('q', True, quote_chars)
    # NOTE(review): the original carries a "q{} string" reminder here; no
    # dedicated builder is created for that form.
    suffixed_str_tb = SuffixedStringTokenBuilder(quote_chars, 'cwd', False)

    class_tb = ClassTypeTokenBuilder()

    # Comments.
    line_comment_tb = SlashSlashCommentTokenBuilder()
    block_comment_tb = SlashStarCommentTokenBuilder()
    nested_comment_tb = NestedCommentTokenBuilder(
        '/+', '+/', block_comment_limit)

    continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminator_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # Operators.
    operator_tb = CaseSensitiveListTokenBuilder(
        [
            '/', '/=', '.', '..', '...',
            '&', '&=', '&&',
            '|', '|=', '||',
            '-', '-=', '--',
            '+', '+=', '++',
            '<', '<=', '<<', '<<=',
            '>', '>=', '>>=', '>>>=', '>>', '>>>',
            '!', '!=',
            '?', ',', ':', '$',
            '=', '==',
            '*', '*=',
            '%', '%=',
            '^', '^=', '^^', '^^=',
            '~', '~=',
            '@', '=>', '#',
            'new', 'delete', 'typeof', 'is',
        ],
        'operator', False)
    self.unary_operators = [
        '+', '-', '*', '!', '&', '~', '++', '--', ':',
        'new', 'delete', 'typeof', 'is',
    ]
    self.postfix_operators = ['++', '--', '&', ':']

    # Grouping characters.
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']
    group_tb = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

    # Reserved words, built-in types and literal values.
    kw_tb = CaseSensitiveListTokenBuilder(
        [
            'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
            'body', 'break',
            'case', 'cast', 'catch', 'class', 'const', 'continue',
            'debug', 'default', 'delegate', 'deprecated', 'do',
            'else', 'enum', 'export', 'extern',
            'final', 'finally', 'for', 'foreach', 'foreach_reverse',
            'function',
            'goto',
            'if', 'immutable', 'import', 'in', 'inout', 'interface',
            'invariant',
            'lazy',
            'macro', 'mixin', 'module',
            'nothrow',
            'out', 'override',
            'package', 'pragma', 'private', 'protected', 'public', 'pure',
            'ref', 'return',
            'scope', 'shared', 'static', 'struct', 'switch', 'synchronized',
            'template', 'throw', 'try', 'typeid',
            'union', 'unittest',
            'version',
            'while', 'with',
            '__gshared', '__traits', '__vector', '__parameters',
        ],
        'keyword', False)
    type_tb = CaseSensitiveListTokenBuilder(
        [
            'bool', 'byte', 'cdouble', 'cent', 'cfloat', 'char', 'creal',
            'dchar', 'double', 'float', 'idouble', 'ifloat', 'int', 'ireal',
            'long', 'real', 'short', 'ubyte', 'ucent', 'uint', 'ulong',
            'ushort', 'void', 'wchar',
        ],
        'type', True)
    value_tb = CaseSensitiveListTokenBuilder(
        [
            'false', 'null', 'super', 'this', 'true',
            '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
            '__FUNCTION__', '__PRETTY_FUNCTION__',
            '__DATE__', '__EOF__', '__TIME__', '__TIMESTAMP__',
            '__VENDOR__', '__VERSION__',
        ],
        'value', True)

    invalid_tb = InvalidTokenBuilder()

    # The order of this list is preserved from the original.
    builders = [
        nl_tb, ws_tb, continuation_tb, terminator_tb,
        int_tb, int_exp_tb, hex_tb, bin_tb, int_suffix_tb,
        float_tb, float_exp_tb, float_suffix_tb, hex_float_tb,
        kw_tb, type_tb, value_tb,
        group_tb, operator_tb,
        ident_tb, attr_tb, class_tb,
        str_tb, r_str_tb, x_str_tb, backtick_str_tb, q_str_tb,
        suffixed_str_tb,
        line_comment_tb, block_comment_tb, nested_comment_tb,
        self.unknown_operator_tb, invalid_tb,
    ]

    tokens = Tokenizer(builders).tokenize(code)
    for kind in ('invalid operator', 'invalid'):
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, kind)
    tokens = Examiner.combine_identifier_colon(
        tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    # Merge number and string tokens with an adjacent suffix identifier.
    # The local ``tokens`` list (not self.tokens) is the input, exactly as
    # in the original statement order.
    tokens = self.combine_tokens_and_adjacent_types(
        tokens, 'number', 'identifier',
        [
            'f', 'F', 'i', 'I', 'u', 'U', 'l', 'L',
            'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU',
        ])
    self.tokens = self.combine_tokens_and_adjacent_types(
        tokens, 'string', 'identifier', ['c', 'w', 'd'])

    self.calc_statistics()

    # The confidence checks below work on the source tokens with all
    # lines joined.
    tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(
            tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            tokens, operator_count, group_ends, permitted_pairs)
        self.calc_operator_4_confidence(
            tokens, operator_count, group_starts, permitted_pairs)

    self.calc_group_confidence(tokens, group_mids)

    self.calc_operand_n_confidence(tokens, ['number', 'symbol'], 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and run the confidence checks for this examiner.

    The keyword and operator tables look like Pascal
    (NOTE(review): the class name is outside this block — confirm).

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    # Token groups passed to the four-operand confidence check below.
    operand_types = ['number', 'identifier', 'string', 'type', 'value']

    # Whitespace and line structure.
    ws_tb = WhitespaceTokenBuilder()
    nl_tb = NewlineTokenBuilder()
    separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    # Numeric literals, including prefixed constants.
    int_tb = IntegerTokenBuilder(None)
    int_exp_tb = IntegerExponentTokenBuilder(None)
    float_tb = RealTokenBuilder(True, True, None)
    float_exp_tb = RealExponentTokenBuilder(True, True, 'E', None)
    hex_tb = PrefixedIntegerTokenBuilder(
        '$', True, '0123456789ABCDEFabcdef')
    octal_tb = PrefixedIntegerTokenBuilder('&', True, '01234567')
    bin_tb = PrefixedIntegerTokenBuilder('%', True, '01')
    char_tb = PrefixedIntegerTokenBuilder('#', True, '0123456789')

    ident_tb = IdentifierTokenBuilder('_', '_')

    # String literals (single quote only).
    str_tb = EscapedStringTokenBuilder(["'"], 0)

    # Comments.
    brace_comment_tb = BraceCommentTokenBuilder()
    paren_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

    # Operators are matched case-insensitively ('~' appears twice, exactly
    # as in the original table).
    operator_tb = CaseInsensitiveListTokenBuilder(
        [
            '+', '-', '*', '/',
            '=', '<>', '>', '>=', '<', '<=',
            'and', 'or', 'not',
            '&', '|', '~', '<<', '>>',
            ':=', '^', '~', '@',
            '.', ':', '..',
            'div', 'mod', 'shl', 'shr', 'in',
        ],
        'operator', False)
    self.unary_operators = ['+', '-', 'not', '@', '^', '.']
    self.postfix_operators = ['^']

    # Grouping characters.
    group_starts = ['(', '[', ',']
    group_mids = [',']
    group_ends = [')', ']']
    group_tb = CaseSensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']'], 'group', False)

    # Reserved words, types and values, all matched case-insensitively.
    kw_tb = CaseInsensitiveListTokenBuilder(
        [
            'begin', 'break', 'case', 'const', 'do', 'downto', 'else',
            'end', 'for', 'forward', 'function', 'goto', 'if', 'label',
            'of', 'otherwise', 'packed', 'procedure', 'program', 'repeat',
            'reset', 'then', 'to', 'type', 'until', 'uses', 'value', 'var',
            'while', 'with',
        ],
        'keyword', False)
    type_tb = CaseInsensitiveListTokenBuilder(
        [
            'array', 'boolean', 'char', 'file', 'integer', 'real', 'record',
            'set', 'string',
        ],
        'type', True)
    value_tb = CaseInsensitiveListTokenBuilder(
        ['false', 'nil', 'true'], 'value', True)

    invalid_tb = InvalidTokenBuilder()

    # The order of this list is preserved from the original.
    builders = [
        nl_tb, ws_tb, separator_tb,
        int_tb, int_exp_tb, float_tb, float_exp_tb,
        hex_tb, octal_tb, bin_tb, char_tb,
        kw_tb, type_tb, value_tb,
        operator_tb, group_tb,
        ident_tb, str_tb,
        brace_comment_tb, paren_comment_tb,
        self.unknown_operator_tb, invalid_tb,
    ]

    tokens = Tokenizer(builders).tokenize(code)
    for kind in ('invalid operator', 'invalid'):
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, kind)
    self.tokens = self.combine_identifier_colon(
        tokens,
        ['statement separator'],
        ['begin'],
        ['whitespace', 'comment', 'newline', 'line description'])
    self.convert_identifiers_to_labels()
    self.convert_identifiers_to_labels_2()

    self.calc_statistics()

    # The confidence checks below work on the source tokens with all
    # lines joined.
    tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(
            tokens, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            tokens, operator_count, group_ends, permitted_pairs)
        self.calc_operator_4_confidence(
            tokens, operator_count, group_starts, permitted_pairs)

    self.calc_group_confidence(tokens, group_mids)

    self.calc_operand_n_confidence(
        tokens, ['number', 'string', 'identifier', 'variable'], 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(
        ['begin', 'record', 'case'], ['end'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute its statistics and confidences.

    The keyword, type and operator tables look like a TypeScript token
    set (presumably this is the TypeScript examiner; confirm against
    the enclosing class).

    Args:
        code: Source text passed to ``Tokenizer.tokenize`` and to the
            line-length confidence calculation.
    """
    super().__init__()

    # Token groups handed to calc_operand_n_confidence(..., 4) below.
    operand_groups = ['number', 'identifier', 'string', 'type', 'value']

    # --- layout ---------------------------------------------------
    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # --- numeric literals -----------------------------------------
    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(False, False, None)
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)
    hex_builder = PrefixedIntegerTokenBuilder(
        '0H', False, '0123456789ABCDEFabcdef')
    octal_builder = PrefixedIntegerTokenBuilder('0O', False, '01234567')
    binary_builder = PrefixedIntegerTokenBuilder('0B', False, '01')

    # --- identifiers, strings, comments ---------------------------
    ident_builder = IdentifierTokenBuilder('_$', '_$')
    str_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
    line_comment_builder = SlashSlashCommentTokenBuilder()
    block_comment_builder = SlashStarCommentTokenBuilder()

    terminator_builder = CaseInsensitiveListTokenBuilder(
        [';'], 'statement terminator', False)

    # --- operators ------------------------------------------------
    operator_list = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>',
        '=>',
        '^',
        '.',
        ':',
        '++', '--', '&&', '||',
        '?', '$', '?.',
        'new', 'delete'
    ]
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_list, 'operator', False)

    self.unary_operators = [
        '+', '-',
        '!', '~',
        '++', '--', ':',
        '$',
        'new', 'delete'
    ]
    self.postfix_operators = ['++', '--', ':']

    # --- grouping symbols -----------------------------------------
    openers = ['(', '[', ',', '{']
    separators = [',']
    closers = [')', ']', '}']
    group_builder = CaseSensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

    regex_builder = RegexTokenBuilder()

    # --- keywords, types, values ----------------------------------
    keyword_list = [
        'break', 'case', 'catch', 'class', 'const', 'continue',
        'debugger', 'default', 'do', 'else', 'enum', 'export',
        'extends', 'finally', 'for', 'function', 'if', 'import', 'in',
        'instanceof', 'return', 'switch', 'throw', 'try', 'typeof',
        'while', 'with', 'as', 'implements', 'interface', 'let',
        'package', 'private', 'protected', 'public', 'static', 'yield',
        'constructor', 'declare', 'get', 'module', 'require', 'set',
        'type', 'from', 'of'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_list, 'keyword', False)

    type_list = [
        'any', 'boolean', 'byte', 'char', 'number', 'string', 'symbol',
        'void', 'never', 'object'
    ]
    type_builder = CaseSensitiveListTokenBuilder(type_list, 'type', True)

    value_list = ['this', 'super', 'null', 'true', 'false', 'undefined']
    value_builder = CaseSensitiveListTokenBuilder(
        value_list, 'value', True)

    fallback_builder = InvalidTokenBuilder()

    # The list order is passed unchanged to the Tokenizer.
    builders = [
        nl_builder,
        ws_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        real_builder,
        real_exp_builder,
        hex_builder,
        octal_builder,
        binary_builder,
        keyword_builder,
        type_builder,
        value_builder,
        operator_builder,
        group_builder,
        regex_builder,
        ident_builder,
        str_builder,
        line_comment_builder,
        block_comment_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    # Merge runs of invalid operators first, then runs of invalid tokens.
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        Examiner.combine_adjacent_identical_tokens(
            raw_tokens, 'invalid operator'),
        'invalid')
    self.convert_keywords_to_identifiers(['.'])

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        # One shared list is handed to all three pairwise checks.
        permitted_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, openers, permitted_pairs)

    self.calc_group_confidence(joined, separators)

    self.calc_operand_n_confidence(
        joined, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_groups, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, tab_size, processor):
    """Tokenize IBM assembly ``code`` two ways and keep the better result.

    The source is tokenized once as free-format text and once as
    column/space-format text. Statistics, confidences and errors are
    computed for each pass; the pass whose confidence factors multiply
    to the larger product is stored on ``self`` (ties keep the
    free-format pass).

    Args:
        code: Source text to examine.
        tab_size: Passed to ``Tokenizer.tokenize_asm_code`` for the
            space-format pass.
        processor: One of ``'360'``, ``'370'``, ``'390'`` or
            ``'system-z'``. Each level includes the opcode tables of
            the levels before it; any other value leaves the opcode
            list empty.
    """
    super().__init__()
    self.newlines_important = 'always'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    real_tb = RealTokenBuilder(True, True, None)
    hex_digits = '0123456789abcdefABCDEF'
    hex_integer_1_tb = PrefixedIntegerTokenBuilder('$', False, hex_digits)
    hex_integer_2_tb = PrefixedIntegerTokenBuilder('#$', False, hex_digits)
    hex_integer_3_tb = PrefixedIntegerTokenBuilder('&', False, hex_digits)
    hex_integer_h_tb = SuffixedIntegerTokenBuilder(
        ['h'], False, 'abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(
        ['Q', 'A', 'O', 'D', 'B'], False, None)
    operand_types.append('number')

    leads = '$#.@&'
    extras = '$#.@&'
    identifier_tb = IbmAsmIdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    hex_string_tb = PrefixedStringTokenBuilder('X', False, quotes)
    char_string_tb = PrefixedStringTokenBuilder('C', False, quotes)
    operand_types.append('string')

    known_operators = [
        '+', '-', '*', '/', '=', '&', '#', '?', "'"
    ]

    self.unary_operators = [
        '+', '-', '=', '&', '#', '?', "'"
    ]

    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
    group_starts = ['(', '[', ',', '{', '<']
    group_ends = [')', ']', '}', '>']
    group_mids = [',']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    preprocessors = [
        'MACRO', 'MEND'
    ]

    # Group name was misspelled 'preprocesssor'; use the same spelling
    # as the other 'preprocessor' token builders.
    preprocessor_tb = CaseInsensitiveListTokenBuilder(
        preprocessors, 'preprocessor', False)

    directives = [
        'CSECT',
        'DC', 'DROP', 'DS',
        'EJECT', 'END', 'ENTRY', 'EQU', 'EXTRN',
        'FREEMAIN',
        'GETMAIN', 'GLOBAL',
        'NAM', 'NAME',
        'ORG',
        'PAGE', 'PARAM', 'PROC', 'PUBLIC',
        'RETURN',
        'STIMER',
        'TITLE', 'SUBTTL',
        'USING'
    ]

    directive_tb = CaseInsensitiveListTokenBuilder(
        directives, 'directive', False)

    keywords = []

    keywords_360 = [
        'A', 'ABEND', 'AD', 'ADR', 'AE', 'AER', 'AH', 'AL', 'ALR', 'AP',
        'AR', 'AU', 'AUR', 'AW', 'AWR', 'AXR',
        'B', 'BAL', 'BALR', 'BAS', 'BASR', 'BC', 'BCR', 'BCT', 'BCTR',
        'BE', 'BH', 'BL', 'BM', 'BNE', 'BNH', 'BNL', 'BNM', 'BNP', 'BNO',
        'BNZ', 'BO', 'BP', 'BR', 'BXH', 'BXLE', 'BZ',
        'C', 'CD', 'CDR', 'CE', 'CER', 'CH', 'CL', 'CLC', 'CLI', 'CLR',
        'CP', 'CR', 'CVB', 'CVD',
        'D', 'DD', 'DDR', 'DE', 'DER', 'DIAGNOSE', 'DP', 'DR',
        'ED', 'EDMK', 'EX',
        'HDR', 'HER', 'HIO',
        'IC', 'ISK',
        'L', 'LA', 'LCR', 'LCDR', 'LCER', 'LD', 'LDR', 'LE', 'LER', 'LH',
        'LM', 'LNDR', 'LNER', 'LNR', 'LPDR', 'LPER', 'LPR', 'LPSW', 'LR',
        'LRDR', 'LRER', 'LTDR', 'LTER', 'LTR',
        'M', 'MD', 'MDR', 'ME', 'MER', 'MH', 'MP', 'MR', 'MVC', 'MVI',
        'MVN', 'MVO', 'MVZ', 'MXD', 'MXDR', 'MXR',
        'N', 'NC', 'NI', 'NOP', 'NOPR', 'NR',
        'O', 'OC', 'OI', 'OR',
        'PACK',
        'RDD',
        'S', 'SD', 'SDR', 'SE', 'SER', 'SH', 'SIO', 'SL', 'SLA', 'SLDA',
        'SLDL', 'SLL', 'SLR', 'SP', 'SPM', 'SR', 'SRA', 'SRDL', 'SRP',
        'SSK', 'SSM', 'SRDA', 'SRL', 'ST', 'STC', 'STD', 'STE', 'STH',
        'STM', 'SU', 'SUR', 'SVC', 'SW', 'SWR', 'SXR',
        'TCH', 'TIO', 'TM', 'TR', 'TRT', 'TS',
        'UNPK', 'UNPKU',
        'WRD',
        'X', 'XC', 'XI', 'XR',
        'ZAP'
    ]

    keywords_370 = [
        'BRXH', 'BRXLE',
        'CLCL',
        'HDV',
        'LAM', 'LEDR',
        'MS', 'MVCL',
        'RIO',
        'SIOF', 'STAM',
        'VA', 'VACD', 'VACDR', 'VACE', 'VACER',
        'VAD', 'VADQ', 'VADR', 'VADS',
        'VAE', 'VAEQ', 'VAER', 'VAES',
        'VAQ', 'VAR', 'VAS',
        'VC', 'VCD', 'VCDQ', 'VCDR', 'VCDS',
        'VCE', 'VCEQ', 'VCER', 'VCES',
        'VCQ', 'VCR', 'VCS',
        'VDD', 'VDDQ', 'VDDR', 'VDDS',
        'VDE', 'VDEQ', 'VDER', 'VDES',
        'VL', 'VLCDR', 'VLCER', 'VLCR',
        'VLD', 'VLDQ', 'VLDR', 'VLEQ', 'VLH', 'VLINT',
        'VLM', 'VLMD', 'VLMDQ', 'VLMDR', 'VLMEQ', 'VLMQ', 'VLMR',
        'VLNDR', 'VLNER', 'VLNR',
        'VLPDR', 'VLPER', 'VLPR',
        'VLQ', 'VLR', 'VLY', 'VLYD', 'VLZDR', 'VLZR',
        'VM', 'VMAD', 'VMADQ', 'VMADS', 'VMAE', 'VMAEQ', 'VMAES',
        'VMCD', 'VMCE', 'VMCER',
        'VMD', 'VMDQ', 'VMDR', 'VMDS',
        'VME', 'VMEQ', 'VMER', 'VMES',
        'VMQ', 'VMR', 'VMS',
        'VMSD', 'VMSDQ', 'VMSDS', 'VMSE', 'VMSEQ', 'VMSES',
        'VN', 'VNQ', 'VNR', 'VNS',
        'VO', 'VOQ', 'VOR', 'VOS',
        'VS', 'VSD', 'VSDQ', 'VSDR', 'VSDS',
        'VSE', 'VSEQ', 'VSER', 'VSES',
        'VSQD', 'VSQDR', 'VSQE', 'VSQER',
        'VSQ', 'VSR', 'VSS',
        'VST', 'VSTD', 'VSTE', 'VSTH', 'VSTKD', 'VSTMD',
        'VTAD', 'VTAE', 'VTSD', 'VTSE',
        'VX', 'VXQ', 'VXR', 'VXS',
        'VMXSE', 'VMNSE', 'VMXAE', 'VLELE', 'VSELE',
        'VMXDS', 'VMNSD', 'VMXAD', 'VLELD', 'VXELD',
        'VSPSD', 'VAPSD',
        'VTVM', 'VCVM', 'VCZVM', 'VCOVM', 'VXVC', 'VXVMM',
        'VRRS', 'VRSVC', 'VRSV',
        'VLVM', 'VLCVM', 'VSTVM', 'VNVM', 'VOVM', 'VXVM',
        # Was ' VSRSV' (leading space), which could never match.
        'VSRSV', 'VMRSV', 'VSRRS',
        'VLVCA', 'VRCL', 'VSVMM', 'VLVXA', 'VSVTP', 'VACSV', 'VACRS',
        'STNSM', 'SOTSM', 'SIOP',
        'MC', 'LRA', 'CONCS', 'DISCS', 'STIDP',
        'SCK', 'SPT', 'STPT', 'SPKA', 'IPK', 'PTLB', 'SPX', 'STPX',
        'STAP', 'RRB',
        'PC', 'SAC', 'IPTE', 'IVSK', 'IAC', 'SSAR', 'EPAR', 'ESAR', 'PT',
        'ISKE', 'RRBE', 'SSKE', 'TB',
        'STCTL', 'LCTL', 'CS', 'CDS', 'CLM', 'STCM', 'ICM',
        'MVCK', 'MVCP', 'MVCS',
        'VLI', 'VSTI', 'VLID', 'VSTID', 'VSRL', 'VSLL', 'VLBIX',
        'LASP', 'TPROT', 'STRAG',
        'MVCSK', 'MVCDK', 'DPFET',
        'MVHHI', 'MVGHI', 'MVHI',
        'CHHSI', 'CLHHSI', 'CGHSI', 'CLGHSI', 'CHSI', 'CLFHSI',
        'TBEGIN', 'TBEGINC',
        'MVCIN', 'UNPKA'
    ]

    keywords_390 = [
        'BASSM', 'BSG', 'BSM',
        'CLRCH', 'CMPS', 'CLRIO', 'CMSG',
        'LAE', 'LXDR',
        'MDE',
        'PFPO', 'PR', 'PTFF',
        'SAM24', 'SAM31', 'SCKPF',
        'TAM', 'TMPS', 'TMSG', 'TRACE', 'TRAP2',
        # Was ' TMLH' (leading space), which could never match.
        'TMH', 'TMLH', 'TML', 'TMLL', 'TMHH', 'TMHL',
        'BRC', 'BRAS', 'BRCT', 'BRCTG',
        'LHI', 'LGHI', 'AHI', 'AGHI', 'MHI', 'MGHI', 'CHI', 'CGHI',
        'MVCLE', 'CLCLE',
        'UPT',
        'SIE', 'PCF', 'CFC', 'DEP', 'DCTP', 'MAD', 'MUN', 'STCAP',
        'SERVC', 'IPM', 'DXR', 'PGIN', 'PGOUT',
        'CSCH', 'HSCH', 'MSCH', 'SSCH', 'STSCH', 'TSCH', 'TPI', 'SAL',
        'RSCH', 'STCRW', 'STCPS', 'RCHP', 'SCHM',
        'STZP', 'SZP', 'TPZI',
        'BAKR', 'CKSM', 'MADS', 'SQDR', 'STURA', 'MSTA', 'PALB', 'EREG',
        'ESTA', 'LURA', 'TAR', 'SQDR', 'SAR', 'EAR', 'CSP', 'MSR',
        'MVPG', 'MVST', 'CUSE', 'BSG', 'CLST', 'SRST', 'XSCH', 'RP',
        'STCKE', 'SACF', 'STSI', 'SRNM', 'STFPC', 'LFPC', 'TRE',
        'CUUTF', 'CUTFU', 'STFL', 'LPSWE', 'TRAP4',
        'LPEBR', 'LNEBR', 'LTEBR', 'LCEBR', 'LDEBR', 'LXDBR', 'LDEBR',
        'MXDBR', 'KEBR', 'CEBR', 'AEBR', 'SEBR', 'MDEBR', 'DEBR',
        'MAEBR', 'MSEBR',
        'LPDBR', 'LCDBR', 'SQEBR', 'MEEBR', 'KDBR', 'CDBR', 'ADBR',
        'MDBR', 'DDBR', 'SDBR',
        'LDER', 'LXDR', 'MAER', 'MSER', 'SQXR', 'MEER', 'MADR', 'MSDR',
        'LPXBR', 'LNXBR', 'LTXBR', 'LCXBR', 'LCXBR', 'LEDBR', 'LDXBR',
        'LEXBR', 'FIXBR', 'KXBR', 'CXBR', 'AXBR', 'SXBR', 'MXBR', 'DXBR',
        'TBEDR', 'TBDR', 'DIEBR', 'FIEBR', 'THDER', 'DIDBR', 'FIDBR',
        'LPXR', 'LNXR', 'LTXR', 'LCXR', 'LXR', 'LEXR', 'FIXR', 'CXR',
        'LZER', 'LZDR', 'LZXR', 'FIER', 'FIDR', 'SFPC', 'EFPC',
        'CEFBR', 'CDFBR', 'CXFBR', 'CEGBR',
        'CEFR', 'CDFR', 'CXFR', 'CFDR', 'CFXR',
        'CEGR', 'CDGR', 'CXGR', 'CGER', 'CGDR', 'CGXR',
        'CDGBR', 'CXGBR', 'CGDBR', 'CGEBR', 'CGXBR',
        'LMC', 'LPGR', 'LNGR', 'LTGR', 'LCGR', 'LGC', 'LURAG',
        'AGR', 'SGR', 'ALGR', 'SLGR', 'MSGR', 'DSGR', 'EREGG', 'LRVGR',
        'LPGFR', 'LNGFR', 'LTGFR', 'LCGFR', 'LGFR', 'LLGFR', 'LLGTR',
        'AGFR', 'SGFR', 'ALGFR', 'SLGFR', 'MSGFR', 'DSGFR',
        'LRVR', 'CGR', 'CLGR', 'STURG', 'CGFR', 'CLGFR', 'BCTGR',
        'NGR', 'OGR', 'XGR', 'MLGR', 'DLGR', 'ALCGR', 'SLBGR',
        'EPSW', 'TRTT', 'TRTO', 'TROT', 'TROO',
        'MLR', 'DLR', 'ALCR', 'SLBR', 'ESEA',
        'LARL', 'LGFI', 'BRCL', 'BRASL',
        'XIHF', 'XILF', 'IIHF', 'IILF', 'NIHF', 'NILF', 'OIHF', 'OILF',
        'LLIHF', 'LLILF', 'LLHRL', 'LGHRL', 'LHRL',
        'AGFI', 'AFI', 'ALGFI', 'ALFI', 'CGFI', 'CFI',
        'LLGFRL', 'STRL', 'EXRL', 'PFDRL',
        'CGHRL', 'CHRL', 'CLGHRL', 'CLHRL', 'CGRL', 'CLGRL', 'CRL',
        'CLGFRL', 'CLRL',
        'MVCOS', 'ECTG', 'CSST', 'PKU',
        'LRAG', 'LG', 'AG', 'SG', 'ALG', 'SLG', 'MSG', 'DSG', 'CVBG',
        'LRVG', 'LGF', 'LGH', 'LLGF', 'LLGT',
        'AGF', 'SGF', 'ALGF', 'SLGF', 'MSGF', 'DSGF',
        'LRV', 'LRVH', 'CG', 'CLG', 'STG', 'CVDG', 'STRVG',
        'CGF', 'CLGF', 'STRV', 'STRVH', 'BCTG',
        'NG', 'OG', 'XG', 'MLG', 'DLG', 'ALCG', 'SLBG',
        'STPQ', 'LPQ', 'LLGC', 'LLGH',
        'ML', 'DL', 'ALC', 'SLB', 'PKA',
        'DIL', 'BDIL', 'ANUM', 'COMP', 'MCPU', 'MIO', 'BIFLAG', 'MULDIV',
        'LMG', 'SRAG', 'SLAG', 'SRLG', 'SLLG', 'TRACG', 'RLLG', 'RLL',
        'CLMH', 'CLMY', 'CLT', 'CLTH', 'CLTL', 'CLTNE', 'CLTE', 'CLTNL',
        'CLTNH',
        'STMG', 'STCTG', 'STMH', 'STCMH', 'LCTLG', 'CSG', 'CDSG',
        'BXHG', 'BXLEG', 'ICMH', 'MVCLU', 'CLCLU', 'LMH', 'LMY', 'TP',
        # The original listed 'SRLK' twice; the second entry is SLLK,
        # mirroring SRAG/SLAG/SRLG/SLLG above.
        'SRAK', 'SLAK', 'SRLK', 'SLLK',
        'LOCG', 'BRXHG', 'BRXLG',
        'LDEB', 'LXDB', 'LXEB', 'MXDB', 'KEB', 'CEB', 'AEB', 'SEB',
        'MDEB', 'DEB', 'MAEB', 'MSEB', 'TCEB', 'TCDB', 'TCXB',
        'SQEB', 'SQDB', 'MEEB', 'KDB', 'CDB', 'ADB', 'SDB', 'MDB', 'DDB',
        'MADB', 'MSDB',
        'LDE', 'LXD', 'LXE', 'SQE', 'SQD', 'MEE',
        'PLO', 'LMD'
    ]

    keywords_z = [
        'IIHH', 'IIHL', 'IILH', 'IILL',
        'LLIHH', 'LLIHL', 'LLILH', 'LLILL',
        'NIHH', 'NIHL', 'NILH', 'NILL',
        'OIHH', 'OIHL', 'OILH', 'OILL',
        'SAM64'
    ]

    # Opcode tables are cumulative: each processor level adds to the
    # tables of the earlier levels.
    if processor in ['360', '370', '390', 'system-z']:
        keywords += keywords_360

    if processor in ['370', '390', 'system-z']:
        keywords += keywords_370

    if processor in ['390', 'system-z']:
        keywords += keywords_390

    if processor in ['system-z']:
        keywords += keywords_z

    opcode_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    registers = [
        'R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10',
        'R11', 'R12', 'R13', 'R14', 'R15',
        'FP0', 'FP2', 'FP4', 'FP6'
    ]

    register_tb = CaseInsensitiveListTokenBuilder(
        registers, 'register', True)

    values = ['*']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    comment_tb = LeadToEndOfLineTokenBuilder('!', False, 'comment')
    line_comment_tb = AssemblyCommentTokenBuilder('*')
    include_directive_tb = LeadToEndOfLineTokenBuilder(
        'INCLUDE', False, 'directive')

    invalid_token_builder = InvalidTokenBuilder()

    # Builders for the free-format pass (whole source in one go).
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_1_tb,
        hex_integer_2_tb,
        hex_integer_3_tb,
        hex_integer_h_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        register_tb,
        opcode_tb,
        directive_tb,
        include_directive_tb,
        preprocessor_tb,
        identifier_tb,
        string_tb,
        hex_string_tb,
        char_string_tb,
        comment_tb,
        line_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    # Builders used for the opcode field in the space-format pass.
    opcode_tokenbuilders = [
        whitespace_tb,
        opcode_tb,
        directive_tb,
        include_directive_tb,
        preprocessor_tb,
        identifier_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    # Builders used for the argument field in the space-format pass.
    args_tokenbuilders = [
        whitespace_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_1_tb,
        hex_integer_2_tb,
        hex_integer_3_tb,
        hex_integer_h_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        register_tb,
        identifier_tb,
        string_tb,
        hex_string_tb,
        char_string_tb,
        comment_tb,
        line_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
    args_tokenizer = Tokenizer(args_tokenbuilders)

    # tokenize as free-format
    tokens_free = tokenizer.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')
    tokens_free = AssemblyIBMExaminer.convert_keywords_to_identifiers(
        tokens_free)
    tokens_free = Examiner.convert_values_to_operators(
        tokens_free, known_operators)
    self.tokens = tokens_free
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    # Save each result and reset the attribute so the second pass
    # starts from an empty container.
    self.calc_statistics()
    statistics1 = self.statistics
    self.statistics = {}

    self.calc_confidences(
        operand_types, group_starts, group_mids, group_ends, None)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_free = self.confidences
    self.confidences = {}
    errors_free = self.errors
    self.errors = []

    # tokenize as space-format
    opcode_extras = '.&=,()+-*/'
    label_leads = '.&$@'
    label_mids = '.&$#@'
    label_ends = ':,'
    comment_leads = '!'
    line_comment_leads = '*'
    use_line_id = True
    tokens_space, indents = Tokenizer.tokenize_asm_code(
        code, tab_size, opcode_tokenizer, opcode_extras, args_tokenizer,
        label_leads, label_mids, label_ends, comment_leads,
        line_comment_leads, use_line_id)
    tokens_space = Examiner.combine_adjacent_identical_tokens(
        tokens_space, 'invalid operator')
    tokens_space = Examiner.combine_adjacent_identical_tokens(
        tokens_space, 'invalid')
    tokens_space = Examiner.combine_identifier_colon(
        tokens_space, ['newline'], [], [])
    tokens_space = Tokenizer.combine_number_and_adjacent_identifier(
        tokens_space)
    tokens_space = AssemblyIBMExaminer.convert_opcodes_to_keywords(
        tokens_space, keywords)
    tokens_space = AssemblyIBMExaminer.convert_keywords_to_identifiers(
        tokens_space)
    tokens_space = Examiner.convert_values_to_operators(
        tokens_space, known_operators)
    self.tokens = tokens_space
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics2 = self.statistics
    self.statistics = {}

    self.calc_confidences(
        operand_types, group_starts, group_mids, group_ends, indents)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_space = self.confidences
    self.confidences = {}
    errors_space = self.errors
    self.errors = []

    # select the better of free-format and spaced-format
    confidence_free = 1.0
    for factor in confidences_free.values():
        confidence_free *= factor

    confidence_space = 1.0
    for factor in confidences_space.values():
        confidence_space *= factor

    if confidence_space > confidence_free:
        self.tokens = tokens_space
        self.statistics = statistics2
        self.confidences = confidences_space
        self.errors = errors_space
    else:
        self.tokens = tokens_free
        self.statistics = statistics1
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code):
    """Tokenize ``code`` and compute its statistics and confidences.

    The builders and tables below look like a Perl token set
    (presumably this is the Perl examiner; confirm against the
    enclosing class). Tokenizing is called with ``['__END__']`` as its
    second argument.

    Args:
        code: Source text passed to the tokenizer and to the
            line-length confidence calculation.
    """
    super().__init__()

    # Token groups handed to calc_operand_n_confidence(..., 4) below.
    operand_groups = ['number', 'identifier', 'string', 'regex', 'value']

    # --- layout ---------------------------------------------------
    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # --- numeric literals -----------------------------------------
    int_builder = IntegerTokenBuilder('_')
    int_exp_builder = IntegerExponentTokenBuilder('_')
    hex_builder = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    real_builder = RealTokenBuilder(False, False, "'")
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', "'")

    # --- identifiers ----------------------------------------------
    ident_builder = IdentifierTokenBuilder('_', '_')
    perl_ident_builder = PerlIdentifierTokenBuilder()

    special_names = [
        '$_', '@_', '$$', '$"', '$(', '$)', '$>', '$<', '$;', '$]',
        '$[', '$&', '$`', "$'", '$+', '@+', '%+', '@-', '%-', '$,',
        '$.', '$/', '$\\', '$|', '$%', '$-', '$:', '$=', '$^', '$~',
        '$!', '$?', '$@', '$#', '$*'
    ]
    special_builder = CaseInsensitiveListTokenBuilder(
        special_names, 'identifier', True)

    dollar_caret_builder = PerlDollarCaretIdentifierTokenBuilder()
    sigil_brace_builder = PerlSigilBraceTokenBuilder()

    # --- strings and regexes --------------------------------------
    str_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
    q_str_builder = PerlQStringTokenBuilder()

    regex_builder = RegexTokenBuilder()
    m_regex_builder = MRegexTokenBuilder()
    s_regex_builder = SRegexTokenBuilder()
    y_regex_builder = YRegexTokenBuilder()
    tr_regex_builder = TrRegexTokenBuilder()

    proto_builder = PerlPrototypeTokenBuilder()

    # --- comments, continuation, directives, terminators ----------
    comment_builder = LeadToEndOfLineTokenBuilder('#', False, 'comment')
    continuation_builder = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    directive_builder = CaseSensitiveListTokenBuilder(
        ['#line'], 'preprocessor', False)
    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # --- operators ------------------------------------------------
    operator_list = [
        '+', '-', '*', '**', '/', '%', '=', '==', '!=', '>', '>=', '<',
        '<=',
        '**=', '+=', '*=', '&=', '&.=', '<<=', '&&=',
        '-=', '/=', '|=', '|.=', '>>=', '||=',
        '.=', '%=', '^=', '^.=', '//=', 'x=',
        'ne', 'gt', 'ge', 'le', 'lt', 'eq',
        '!', '&', '|', '~', '<<', '>>',
        '^',
        '.', '..', '...',
        '++', '--', '->', '=>', '&&', '||',
        '?', '<->', '<=>',
        'and', 'cmp', 'or', 'xor'
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']
    self.postfix_operators = ['++', '--']

    # --- grouping symbols -----------------------------------------
    openers = ['(', '[', ',', '{']
    separators = [',', ':', '::']
    closers = [')', ']', '}']
    group_builder = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}', ':', '::'], 'group', False)

    operator_builder = CaseSensitiveListTokenBuilder(
        operator_list, 'operator', False)

    # --- keywords and values --------------------------------------
    keyword_list = [
        'bless', 'break',
        'continue',
        'die', 'do',
        'else', 'elsif', 'eval', 'exit', 'exp',
        'for', 'foreach',
        'if',
        'last', 'lock',
        'my',
        'next', 'no',
        'our',
        'package',
        'redo', 'return',
        'say', 'sub',
        'taint',
        'undef', 'unless', 'until', 'use',
        'wantarray', 'while'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_list, 'keyword', True)

    value_builder = CaseSensitiveListTokenBuilder(
        ['NULL'], 'value', True)

    fallback_builder = InvalidTokenBuilder()

    # The list order is passed unchanged to the Tokenizer.
    builders = [
        nl_builder,
        ws_builder,
        continuation_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        hex_builder,
        real_builder,
        real_exp_builder,
        keyword_builder,
        value_builder,
        group_builder,
        operator_builder,
        proto_builder,
        ident_builder,
        perl_ident_builder,
        special_builder,
        dollar_caret_builder,
        sigil_brace_builder,
        str_builder,
        q_str_builder,
        regex_builder,
        m_regex_builder,
        s_regex_builder,
        y_regex_builder,
        tr_regex_builder,
        directive_builder,
        comment_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    stream = Tokenizer(builders).tokenize(code, ['__END__'])
    # Merge runs of invalid operators first, then runs of invalid tokens.
    stream = Examiner.combine_adjacent_identical_tokens(
        Examiner.combine_adjacent_identical_tokens(
            stream, 'invalid operator'),
        'invalid')
    self.tokens = Examiner.combine_identifier_colon(
        stream,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment', 'line description'])
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        # One shared list is handed to all three pairwise checks.
        permitted_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, openers, permitted_pairs)

    self.calc_group_confidence(joined, separators)

    self.calc_operand_n_confidence(joined, ['number'], 2)
    self.calc_operand_n_confidence(joined, operand_groups, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Haskell ``code`` and compute statistics and confidences.

    Builds the Haskell token builders, tokenizes ``code``, stores the
    result on ``self.tokens`` and then runs the statistics and
    confidence calculations. Several checks used by other examiners
    are left disabled here (kept below as comments).

    Args:
        code: Source text passed to ``Tokenizer.tokenize`` and to the
            line-length confidence calculation.
    """
    super().__init__()

    # Only read by the disabled operand checks further down.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    identifier_tb = HaskellIdentifierTokenBuilder()
    operand_types.append('identifier')

    class_tb = HaskellClassTokenBuilder()
    operand_types.append('class')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    line_comment_tb = LeadToEndOfLineTokenBuilder('--', False, 'comment')
    block_comment_tb = BlockTokenBuilder('{-', '-}', 'comment')

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    operators_tb = HaskellOperatorTokenBuilder('#$%&*+./<=>?@\\^|-~')

    known_operators = ["'", '..']

    known_operators_tb = CaseInsensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.postfix_operators = ['..', "'"]

    keywords = [
        'case', 'class',
        'data', 'deriving', 'do',
        'else',
        'if', 'import', 'in',
        # Fixity declarations are infix / infixl / infixr; the list
        # previously had 'infix1' (digit one) instead of 'infixl'.
        'infix', 'infixl', 'infixr',
        'instance',
        'let',
        'module',
        'newtype',
        'of',
        'then', 'type',
        'where'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

    values = ['True', 'False', 'Nothing', '_']

    value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        groupers_tb,
        operators_tb,
        known_operators_tb,
        identifier_tb,
        value_tb,
        class_tb,
        string_tb,
        line_comment_tb,
        block_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    # NOTE(review): the return value is discarded, so this presumably
    # rewrites the tokens in place -- confirm against the helper.
    HaskellExaminer.convert_keywords_to_identifiers(tokens)
    self.tokens = tokens
    # self.convert_identifiers_to_labels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    # self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        # self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # operand_types_2 = ['number']
    # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    # operand_types = ['number', 'string', 'symbol', 'identifier', 'variable']
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute its statistics and confidences.

    The keyword and operator tables look like an Eiffel token set
    (presumably this is the Eiffel examiner; confirm against the
    enclosing class).

    Args:
        code: Source text passed to ``Tokenizer.tokenize`` and to the
            line-length confidence calculation.
    """
    super().__init__()

    # Token groups handed to calc_operand_n_confidence(..., 4) below.
    operand_groups = ['number', 'identifier', 'string', 'type', 'value']

    # --- layout ---------------------------------------------------
    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # --- numeric literals -----------------------------------------
    int_builder = IntegerTokenBuilder("'")
    int_exp_builder = IntegerExponentTokenBuilder("'")
    hex_builder = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF_')
    binary_builder = PrefixedIntegerTokenBuilder('0b', False, '01_')
    octal_builder = PrefixedIntegerTokenBuilder('0c', False, '01234567_')
    real_builder = RealTokenBuilder(False, False, "'")
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', "'")

    # --- identifiers, strings, comments ---------------------------
    ident_builder = IdentifierTokenBuilder('_', '_')
    str_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
    comment_builder = LeadToEndOfLineTokenBuilder('--', True, 'comment')

    # --- operators ------------------------------------------------
    operator_list = [
        ':=', '=', '/=', '<', '>', '<=', '>=',
        '+', '-', '*', '/', '//', '\\\\', '^',
        '|..|', '..',
        'and', 'or', 'xor', 'not',
        'and then', 'or else', 'implies',
        '.',
        '@', '#', '|', '&'
    ]

    self.unary_operators = ['+', '-', 'not', '@', '#', '|', '&']
    self.postfix_operators = []

    # --- grouping symbols -----------------------------------------
    openers = ['(', '[', ',', '{']
    separators = [',', ';', ':']
    closers = [')', ']', '}']
    group_builder = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}', ':', ';'], 'group', False)

    operator_builder = CaseSensitiveListTokenBuilder(
        operator_list, 'operator', False)

    # --- keywords, types, values ----------------------------------
    keyword_list = [
        'across', 'agent', 'alias', 'all', 'as', 'assign', 'attribute',
        'check', 'class', 'convert', 'create',
        'debug', 'deferred', 'do',
        'else', 'elseif', 'end', 'ensure', 'expanded', 'export',
        'external',
        'feature', 'from', 'frozen',
        'if', 'implies', 'inherit', 'inspect', 'invariant',
        'like', 'local', 'loop',
        'note',
        'obsolete', 'old', 'once', 'only',
        'redefine', 'rename', 'require', 'rescue', 'retry',
        'select', 'separate',
        'then',
        'undefine', 'until',
        'variant',
        'when'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_list, 'keyword', False)

    type_builder = CaseSensitiveListTokenBuilder(
        ['Current', 'Precursor', 'Result', 'Void', 'TUPLE'],
        'type', True)

    value_builder = CaseSensitiveListTokenBuilder(
        ['False', 'True', '?'], 'value', True)

    fallback_builder = InvalidTokenBuilder()

    # The list order is passed unchanged to the Tokenizer.
    builders = [
        nl_builder,
        ws_builder,
        int_builder,
        int_exp_builder,
        hex_builder,
        binary_builder,
        octal_builder,
        real_builder,
        real_exp_builder,
        keyword_builder,
        type_builder,
        value_builder,
        group_builder,
        operator_builder,
        ident_builder,
        str_builder,
        comment_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    # Merge runs of invalid operators first, then runs of invalid tokens.
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        Examiner.combine_adjacent_identical_tokens(
            raw_tokens, 'invalid operator'),
        'invalid')

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        # One shared list is handed to all three pairwise checks.
        permitted_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, openers, permitted_pairs)

    self.calc_group_confidence(joined, separators)

    self.calc_operand_n_confidence(joined, ['number'], 2)
    self.calc_operand_n_confidence(joined, operand_groups, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and run this examiner's confidence checks on it.

    The constructor assembles the token builders for this language (the
    keyword set looks like INTERCAL -- confirm against the class name),
    tokenizes ``code``, post-processes the token list, stores it on
    ``self.tokens`` and then invokes the inherited ``calc_*`` scoring
    methods.

    Args:
        code: Source text passed to the tokenizer and to the line-length
            check.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Every numeric-style token is a one-character prefix followed by
    # decimal digits; only the prefix differs between the builders.
    digits = '0123456789'
    integer_tb = PrefixedIntegerTokenBuilder('#', False, digits)
    variable16_tb = PrefixedIntegerTokenBuilder('.', False, digits)
    variable32_tb = PrefixedIntegerTokenBuilder(':', False, digits)
    array16_tb = PrefixedIntegerTokenBuilder(',', False, digits)
    array32_tb = PrefixedIntegerTokenBuilder(';', False, digits)
    operand_types.append('number')

    comment_tb = LeadToEndOfLineTokenBuilder('NOTE', True, 'comment')

    label_tb = ParensLabelTokenBuilder()

    known_operators = ['~', '$', 'V', '?', '&', 'SUB', '<-']

    self.unary_operators = ['V', '?', '&']
    self.postfix_operators = []

    # Both quote characters are listed as openers and as closers.
    groupers = ['"', "'"]
    group_starts = ['"', "'"]
    group_ends = ['"', "'"]
    group_mids = []

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # Each keyword is listed once.
    keywords = [
        'DO', 'STASH', 'RETRIEVE', 'RESUME', 'FORGET', 'NEXT', 'ABSTAIN',
        'FROM', 'REINSTATE', 'IGNORE', 'REMEMBER', 'WRITE', 'IN', 'READ',
        'OUT', 'PLEASE', 'COME',
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    invalid_token_builder = InvalidTokenBuilder()

    # The list order may matter to the tokenizer when several builders
    # match, so it is kept stable.
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        integer_tb,
        variable16_tb,
        variable32_tb,
        array16_tb,
        array32_tb,
        keyword_tb,
        groupers_tb,
        label_tb,
        known_operator_tb,
        comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder,
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)

    # Merge runs of invalid operators first, then runs of invalid tokens.
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # NOTE(review): presumably merges "identifier :" sequences into label
    # tokens -- confirm against Examiner.combine_identifier_colon.
    tokens = Examiner.combine_identifier_colon(
        tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # The remaining checks work on the source tokens with lines joined.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): the extent of this conditional was reconstructed from
    # whitespace-collapsed source; it covers the operator-only checks.
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_line_length_confidence(code, self.max_expected_line)