def __escape_z__():
  InvalidTokenBuilder.__escape_z__()
  WhitespaceTokenBuilder.__escape_z__()
  NewlineTokenBuilder.__escape_z__()
  EscapedStringTokenBuilder.__escape_z__()
  PrefixedStringTokenBuilder.__escape_z__()
  SuffixedStringTokenBuilder.__escape_z__()
  IntegerTokenBuilder.__escape_z__()
  IntegerExponentTokenBuilder.__escape_z__()
  PrefixedIntegerTokenBuilder.__escape_z__()
  SuffixedIntegerTokenBuilder.__escape_z__()
  RealTokenBuilder.__escape_z__()
  RealExponentTokenBuilder.__escape_z__()
  SuffixedRealTokenBuilder.__escape_z__()
  IdentifierTokenBuilder.__escape_z__()
  PrefixedIdentifierTokenBuilder.__escape_z__()
  CaseInsensitiveListTokenBuilder.__escape_z__()
  CaseSensitiveListTokenBuilder.__escape_z__()
  SingleCharacterTokenBuilder.__escape_z__()
  SlashSlashCommentTokenBuilder.__escape_z__()
  SlashStarCommentTokenBuilder.__escape_z__()
  ClassTypeTokenBuilder.__escape_z__()
  HexRealExponentTokenBuilder.__escape_z__()
  NestedCommentTokenBuilder.__escape_z__()
  return 'Escape ?Z'


def __escape_z__():
  InvalidTokenBuilder.__escape_z__()
  WhitespaceTokenBuilder.__escape_z__()
  NewlineTokenBuilder.__escape_z__()
  EscapedStringTokenBuilder.__escape_z__()
  IntegerTokenBuilder.__escape_z__()
  IntegerExponentTokenBuilder.__escape_z__()
  RealTokenBuilder.__escape_z__()
  RealExponentTokenBuilder.__escape_z__()
  CaseInsensitiveListTokenBuilder.__escape_z__()
  CaseSensitiveListTokenBuilder.__escape_z__()
  SingleCharacterTokenBuilder.__escape_z__()
  LeadToEndOfLineTokenBuilder.__escape_z__()
  PrefixedIdentifierTokenBuilder.__escape_z__()
  RegexTokenBuilder.__escape_z__()
  RubyIdentifierTokenBuilder.__escape_z__()
  HereDocTokenBuilder.__escape_z__()
  return 'Escape ?Z'


def __escape_z__():
  InvalidTokenBuilder.__escape_z__()
  WhitespaceTokenBuilder.__escape_z__()
  NewlineTokenBuilder.__escape_z__()
  IntegerTokenBuilder.__escape_z__()
  IntegerExponentTokenBuilder.__escape_z__()
  PrefixedIntegerTokenBuilder.__escape_z__()
  RealTokenBuilder.__escape_z__()
  RealExponentTokenBuilder.__escape_z__()
  IdentifierTokenBuilder.__escape_z__()
  PrefixedIdentifierTokenBuilder.__escape_z__()
  CaseInsensitiveListTokenBuilder.__escape_z__()
  CaseSensitiveListTokenBuilder.__escape_z__()
  LeadToEndOfLineTokenBuilder.__escape_z__()
  BlockTokenBuilder.__escape_z__()
  KeywordTokenBuilder.__escape_z__()
  MatlabStringTokenBuilder.__escape_z__()
  return 'Escape ?Z'


def __escape_z__():
  InvalidTokenBuilder.__escape_z__()
  WhitespaceTokenBuilder.__escape_z__()
  NewlineTokenBuilder.__escape_z__()
  EscapedStringTokenBuilder.__escape_z__()
  PrefixedStringTokenBuilder.__escape_z__()
  PrefixedRawStringTokenBuilder.__escape_z__()
  IntegerTokenBuilder.__escape_z__()
  IntegerExponentTokenBuilder.__escape_z__()
  RealTokenBuilder.__escape_z__()
  RealExponentTokenBuilder.__escape_z__()
  IdentifierTokenBuilder.__escape_z__()
  PrefixedIdentifierTokenBuilder.__escape_z__()
  CaseInsensitiveListTokenBuilder.__escape_z__()
  CaseSensitiveListTokenBuilder.__escape_z__()
  SingleCharacterTokenBuilder.__escape_z__()
  LeadToEndOfLineTokenBuilder.__escape_z__()
  TripleQuoteStringTokenBuilder.__escape_z__()
  RawTripleQuoteCommentTokenBuilder.__escape_z__()
  return 'Escape ?Z'


def __escape_z__():
  InvalidTokenBuilder.__escape_z__()
  WhitespaceTokenBuilder.__escape_z__()
  NewlineTokenBuilder.__escape_z__()
  EscapedStringTokenBuilder.__escape_z__()
  IntegerTokenBuilder.__escape_z__()
  IntegerExponentTokenBuilder.__escape_z__()
  RealTokenBuilder.__escape_z__()
  RealExponentTokenBuilder.__escape_z__()
  PrefixedIntegerTokenBuilder.__escape_z__()
  SuffixedIdentifierTokenBuilder.__escape_z__()
  CaseInsensitiveListTokenBuilder.__escape_z__()
  CaseSensitiveListTokenBuilder.__escape_z__()
  SingleCharacterTokenBuilder.__escape_z__()
  PrefixedIdentifierTokenBuilder.__escape_z__()
  TripleQuoteStringTokenBuilder.__escape_z__()
  SlashSlashCommentTokenBuilder.__escape_z__()
  SlashStarCommentTokenBuilder.__escape_z__()
  SwiftArgumentTokenBuilder.__escape_z__()
  SwiftSymbolTokenBuilder.__escape_z__()
  return 'Escape ?Z'
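
# The __escape_z__ functions above appear to exist only so that every imported
# token-builder class is referenced at least once, presumably to keep static
# analysis from flagging the imports as unused; the 'Escape ?Z' return value
# looks like a sentinel and is presumably never consumed by callers.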
# Examiner for Swift source code.
def __init__(self, code):
  super().__init__()

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  stmt_separator_tb = SingleCharacterTokenBuilder(';', 'statement separator', False)

  integer_tb = IntegerTokenBuilder('_')
  integer_exponent_tb = IntegerExponentTokenBuilder('_')
  real_tb = RealTokenBuilder(True, True, '_')
  real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
  operand_types.append('number')

  argument_tb = SwiftArgumentTokenBuilder()

  leads = '_'
  extras = '_'
  suffixes = '?'
  identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
  operand_types.append('identifier')

  attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)

  symbol_tb = SwiftSymbolTokenBuilder('.', 'symbol', True)
  operand_types.append('symbol')

  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 10)
  triple_quote_comment_tb = TripleQuoteStringTokenBuilder(quotes)

  slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
  slash_star_comment_tb = SlashStarCommentTokenBuilder()
  operand_types.append('string')

  known_operators = [
    '+', '-', '*', '/', '%',
    '==', '!=', '>', '<', '>=', '<=',
    '&&', '||', '!', '&', '|', '^', '~', '<<', '>>', '===',
    '=', '+=', '-=', '*=', '/=', '%=', '<<=', '>>=', '&=', '^=', '|=',
    '...', '..<', '?', ':', '.', '++', '--', '->', '??', '\\.',
    '&+', '&-', '&*'
  ]

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  self.unary_operators = ['+', '-', '!', '~', '&', '++', '--', ':', '?']
  self.postfix_operators = ['++', '--', ':', '!', '?']

  groupers = ['(', ')', ',', '[', ']', '{', '}']
  group_starts = ['(', '[', ',', '{']
  group_mids = [',']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  keywords = [
    'associatedtype', 'class', 'deinit', 'enum', 'extension', 'fileprivate',
    'func', 'import', 'init', 'inout', 'internal', 'let', 'open', 'operator',
    'private', 'protocol', 'public', 'static', 'struct', 'subscript',
    'typealias', 'var',
    'break', 'case', 'continue', 'default', 'defer', 'do', 'else',
    'fallthrough', 'for', 'guard', 'if', 'in', 'repeat', 'return', 'switch',
    'where', 'while',
    'as', 'Any', 'catch', 'is', 'rethrows', 'super', 'throw', 'throws',
    'try', 'try?', 'try!',
    '#available', '#colorLiteral', '#column', '#else', '#elseif', '#endif',
    '#file', '#fileLiteral', '#function', '#if', '#imageLiteral', '#line',
    '#selector', '#sourceLocation',
    'associativity', 'convenience', 'dynamic', 'didSet', 'final', 'get',
    'infix', 'indirect', 'lazy', 'left', 'mutating', 'none', 'nonmutating',
    'optional', 'override', 'postfix', 'precedence', 'prefix', 'Protocol',
    'required', 'right', 'set', 'Type', 'unowned', 'weak', 'willSet'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  types = [
    'char', 'double', 'float', 'int', 'long', 'short',
  ]

  types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
  operand_types.append('type')

  values = ['nil', 'Self', 'false', 'true']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, stmt_separator_tb,
    integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
    argument_tb, keyword_tb, types_tb, values_tb,
    known_operator_tb, groupers_tb, identifier_tb, attribute_tb, symbol_tb,
    string_tb, slash_slash_comment_tb, slash_star_comment_tb,
    triple_quote_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
  self.convert_keywords_to_identifiers(['.'])

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_all_lines(tokens)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types = ['number', 'string', 'symbol']
  self.calc_operand_n_confidence(tokens, operand_types, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_line_length_confidence(code, self.max_expected_line)
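
# The examiner constructors in this section all appear to follow the pattern
# shown above for Swift: build a prioritized list of token builders, tokenize
# with Tokenizer(tokenbuilders) and tokenizer.tokenize(code), merge adjacent
# 'invalid' tokens, apply language-specific fixups, then call the calc_*
# methods to score token, operator, operand, keyword, and line-length
# confidence. (Assumption: those calc_* and convert_* helpers, along with
# self.unknown_operator_tb and self.max_expected_line, come from the Examiner
# base class, which is not part of this excerpt.)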
# Examiner for Ruby source code.
def __init__(self, code):
  super().__init__()

  self.newlines_important = 'parens'

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  stmt_separator_tb = SingleCharacterTokenBuilder(';', 'statement separator', False)

  integer_tb = IntegerTokenBuilder('_')
  integer_exponent_tb = IntegerExponentTokenBuilder('_')
  real_tb = RealTokenBuilder(True, True, '_')
  real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
  operand_types.append('number')

  identifier_tb = RubyIdentifierTokenBuilder()
  operand_types.append('identifier')

  symbol_tb = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
  operand_types.append('symbol')

  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 10)
  operand_types.append('string')

  regex_tb = RegexTokenBuilder()
  operand_types.append('regex')

  heredoc_tb = HereDocTokenBuilder('<<-')

  hash_comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

  known_operators = [
    '!', '~', '**', '*', '/', '%', '+', '-', '<<', '>>', '&', '|', '^',
    '<', '<=', '>', '>=', '==', '===', '!=', '=~', '!~', '<=>', '&&', '||',
    '..', '...', '?', ':',
    '=', '**=', '*=', '/=', '%=', '+=', '-=', '<<=', '>>=',
    '&&=', '&=', '||=', '|=', '^=',
    'not', 'and', 'or', 'in',
    '.', '.:', '=>', '::', '<<-'
  ]

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  self.unary_operators = ['+', '-', '!', '~', '&', '*', '**', '<<-']
  self.postfix_operators = ['++', '--']

  groupers = ['(', ')', ',', '[', ']', '{', '}']
  group_starts = ['(', '[', ',', '{']
  group_mids = [',']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  keywords = [
    'BEGIN', 'END', 'alias', 'begin', 'break', 'case', 'class', 'def',
    'defined?', 'do', 'else', 'elsif', 'end', 'ensure', 'for', 'if',
    'module', 'next', 'redo', 'rescue', 'retry', 'return', 'then',
    'undef', 'unless', 'until', 'when', 'while', 'yield'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  values = ['nil', 'self', 'true', 'false', 'super']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  array_markers = ['%w', '%q', '%Q', '%i', '%s', '%x']

  array_marker_tb = CaseSensitiveListTokenBuilder(
    array_markers, 'identifier', True)

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, stmt_separator_tb,
    integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
    keyword_tb, values_tb, symbol_tb,
    known_operator_tb, groupers_tb, regex_tb, identifier_tb, array_marker_tb,
    string_tb, heredoc_tb, hash_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
  self.convert_bars_to_groups()
  self.convert_keywords_to_identifiers(['.'])
  self.convert_operators_to_identifiers()

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_parens_continued_lines(tokens)
  tokens = Examiner.join_operator_continued_lines(tokens, self.postfix_operators)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = ['number', 'string', 'symbol']
  self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  # self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  openers = ['begin', 'def', 'do', 'class', 'module']
  closers = ['end']
  self.calc_paired_blockers_confidence(openers, closers)

  self.calc_line_length_confidence(code, self.max_expected_line)
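
# Note: this Ruby examiner sets self.newlines_important = 'parens' and joins
# only paren- and operator-continued lines (rather than join_all_lines),
# presumably because Ruby statements normally end at a newline.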
# Examiner for Java source code.
def __init__(self, code):
  super().__init__()

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  integer_tb = IntegerTokenBuilder('_')
  integer_exponent_tb = IntegerExponentTokenBuilder('_')
  real_tb = RealTokenBuilder(False, False, '_')
  real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
  operand_types.append('number')

  leads = '_'
  extras = '_'
  identifier_tb = IdentifierTokenBuilder(leads, extras)
  operand_types.append('identifier')

  decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)

  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 0)
  operand_types.append('string')

  class_type_tb = ClassTypeTokenBuilder()
  operand_types.append('class')

  slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
  slash_star_comment_tb = SlashStarCommentTokenBuilder()

  terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

  known_operators = [
    '+', '-', '*', '/', '%',
    '=', '==', '!=', '>', '>=', '<', '<=',
    '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
    '!', '&', '|', '~', '<<', '>>', '>>>', '>>>=', '^',
    '.', '::', '++', '--', '&&', '||', '?', '->', 'new'
  ]

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  self.unary_operators = ['+', '-', '!', '~', '++', '--', 'new']
  self.postfix_operators = ['++', '--']

  groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
  group_starts = ['(', '[', ',', '{']
  group_ends = [')', ']', '}']
  group_mids = [',', ':']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  keywords = [
    'abstract', 'assert', 'break', 'case', 'catch', 'class', 'const',
    'continue', 'default', 'do', 'else', 'enum', 'extends', 'final',
    'finally', 'for', 'goto', 'if', 'implements', 'import', 'instanceof',
    'interface', 'native', 'package', 'private', 'protected', 'public',
    'return', 'static', 'strictfp', 'super', 'switch', 'synchronized',
    'throw', 'throws', 'transient', 'try', 'volatile', 'while'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  types = [
    'boolean', 'byte', 'char', 'double', 'float', 'int', 'long', 'short',
    'string', 'void',
    'Integer', 'String', 'StringBuilder', 'File', 'Exception', 'IOException'
  ]

  types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
  operand_types.append('type')

  values = ['false', 'null', 'this', 'true']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, terminators_tb,
    integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
    keyword_tb, types_tb, values_tb,
    known_operator_tb, groupers_tb, identifier_tb, class_type_tb, decorator_tb,
    string_tb, slash_slash_comment_tb, slash_star_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
  tokens = Examiner.combine_identifier_colon(
    tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
  self.tokens = tokens
  self.convert_identifiers_to_labels()
  self.convert_keywords_to_identifiers(['::', '.'])
  self.convert_operators_to_identifiers(['::', '.'])

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_all_lines(tokens)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = ['number', 'string', 'symbol']
  self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_paired_blockers_confidence(['{'], ['}'])

  self.calc_line_length_confidence(code, self.max_expected_line)
# Examiner for Kotlin source code.
def __init__(self, code):
  super().__init__()

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  integer_tb = IntegerTokenBuilder('_')
  integer_exponent_tb = IntegerExponentTokenBuilder('_')
  hex_integer_tb = PrefixedIntegerTokenBuilder(
    '0x', False, '0123456789ABCDEFabcdef_')
  real_tb = RealTokenBuilder(True, True, '_')
  real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
  operand_types.append('number')

  leads = '_'
  extras = '_'
  identifier_tb = IdentifierTokenBuilder(leads, extras)
  operand_types.append('identifier')

  decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)
  operand_types.append('decorator')

  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 0)
  triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
  operand_types.append('string')

  slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
  slash_star_comment_tb = SlashStarCommentTokenBuilder()

  class_tb = ClassTypeTokenBuilder()
  operand_types.append('class')

  terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

  known_operators = [
    '+', '-', '*', '/', '%',
    '=', '+=', '-=', '*=', '/=', '%=',
    '++', '--', '&&', '||', '!',
    '==', '!=', '===', '!==', '<', '>', '<=', '>=',
    '!!', '?.', '?:', '::', '..', ':', '?', '.'
  ]

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  self.unary_operators = ['+', '-', '!', '*', '++', '--']
  self.postfix_operators = ['++', '--', ':']

  groupers = ['->', '(', ')', ',', '[', ']', '{', '}']
  group_starts = ['(', '[', ',', '{']
  group_mids = ['->', ',']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  keywords = [
    'as', 'as?', 'break', 'class', 'continue', 'do', 'else', 'for', 'fun',
    'if', 'in', '!in', 'is', '!is', 'object', 'package', 'return', 'super',
    'throw', 'try', 'typealias', 'typeof', 'val', 'var', 'when', 'while',
    'by', 'catch', 'constructor', 'delegate', 'dynamic', 'field', 'file',
    'finally', 'get', 'import', 'init', 'param', 'property', 'receiver',
    'set', 'setparam', 'where',
    'actual', 'abstract', 'annotation', 'companion', 'const', 'crossinline',
    'data', 'enum', 'expect', 'external', 'final', 'infix', 'inline',
    'inner', 'internal', 'lateinit', 'noinline', 'open', 'operator', 'out',
    'override', 'private', 'protected', 'public', 'reified', 'sealed',
    'suspend', 'tailrec', 'vararg'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  types = [
    'Byte', 'Short', 'Int', 'Long', 'Float', 'Double', 'Char',
    'u', 'f', 'ul', 'UInt', 'ULong', 'UByte', 'UShort'
  ]

  type_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
  operand_types.append('type')

  values = ['false', 'null', 'this', 'true']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, terminators_tb,
    integer_tb, integer_exponent_tb, hex_integer_tb,
    real_tb, real_exponent_tb,
    keyword_tb, type_tb, values_tb,
    known_operator_tb, groupers_tb, identifier_tb, class_tb, decorator_tb,
    string_tb, triple_quote_string_tb,
    slash_slash_comment_tb, slash_star_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
  self.tokens = self.combine_numbers_and_adjacent_types(tokens)

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_all_lines(tokens)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = ['number', 'string', 'symbol']
  self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_paired_blockers_confidence(['{'], ['}'])

  self.calc_line_length_confidence(code, self.max_expected_line)
# Examiner for Python source code.
def __init__(self, code):
  super().__init__()

  self.newlines_important = 'always'

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  stmt_separator_tb = SingleCharacterTokenBuilder(';', 'statement separator', False)

  integer_tb = IntegerTokenBuilder('_')
  integer_exponent_tb = IntegerExponentTokenBuilder('_')
  real_tb = RealTokenBuilder(False, False, '_')
  real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
  operand_types.append('number')

  leads = '_'
  extras = '_'
  identifier_tb = IdentifierTokenBuilder(leads, extras)
  operand_types.append('identifier')

  decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)

  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 0)
  raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
  byte_string_tb = PrefixedStringTokenBuilder('b', True, quotes)
  unicode_string_tb = PrefixedStringTokenBuilder('u', True, quotes)
  fast_string_tb = PrefixedStringTokenBuilder('f', True, quotes)
  operand_types.append('string')

  triple_quote_comment_tb = TripleQuoteStringTokenBuilder(quotes)
  raw_triple_quote_comment_tb = RawTripleQuoteCommentTokenBuilder()

  hash_comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')

  known_operators = [
    '+', '-', '*', '/', '%', '@',
    '=', ':=', '==', '>', '>=', '<', '<=',
    '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
    '&', '|', '~', '<<', '>>', '**',
    '.', ':', '++', '--',
    'and', 'or', 'in', 'is', 'not'
  ]

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  self.unary_operators = ['+', '-', 'not', '~', '++', '--', '.']
  self.postfix_operators = ['++', '--', ':']
  self.adjective_operators = ['not']
  self.keyword_postfix = [':']

  groupers = ['(', ')', ',', '[', ']', '{', '}']
  group_starts = ['(', '[', ',', '{']
  group_mids = [',']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  continuation_chars = ['\\']
  line_continuation_tb = CaseInsensitiveListTokenBuilder(
    continuation_chars, 'line continuation', False)

  keywords = [
    'as', 'assert', 'break', 'case', 'class', 'continue', 'def', 'del',
    'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if',
    'import', 'lambda', 'match', 'nonlocal', 'pass', 'print', 'raise',
    'return', 'try', 'while', 'with', 'yield'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  values = ['False', 'None', 'True']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, line_continuation_tb, stmt_separator_tb,
    integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
    keyword_tb, values_tb,
    known_operator_tb, groupers_tb, identifier_tb, decorator_tb,
    string_tb, raw_string_tb, byte_string_tb, unicode_string_tb, fast_string_tb,
    hash_comment_tb, triple_quote_comment_tb, raw_triple_quote_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_parens_continued_lines(tokens)
  tokens = Examiner.join_operator_continued_lines(tokens, self.postfix_operators)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = [['not', 'in']]
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = [
    'number', 'string', 'identifier', 'variable', 'symbol'
  ]
  self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_line_format_confidence()
  self.calc_line_length_confidence(code, self.max_expected_line)
# Examiner for Dart source code.
def __init__(self, code):
  super().__init__()

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  integer_tb = IntegerTokenBuilder("'")
  integer_exponent_tb = IntegerExponentTokenBuilder("'")
  hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF')
  binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
  real_tb = RealTokenBuilder(False, False, "'")
  real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
  operand_types.append('number')

  leads = '_'
  extras = '_'
  identifier_tb = IdentifierTokenBuilder(leads, extras)
  operand_types.append('identifier')

  annotation_tb = PrefixedIdentifierTokenBuilder('@', 'annotation', False)
  operand_types.append('annotation')

  symbol_tb = PrefixedIdentifierTokenBuilder('#', 'symbol', True)
  operand_types.append('symbol')

  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 0)
  raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
  operand_types.append('string')

  class_type_tb = ClassTypeTokenBuilder()
  operand_types.append('class')

  slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
  slash_star_comment_tb = SlashStarCommentTokenBuilder()

  line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)

  terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

  known_operators = [
    '+', '-', '*', '/', '~/', '%', '^',
    '=', '==', '!=', '>', '>=', '<', '<=',
    '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
    '!', '&', '|', '~', '<<', '>>', '~/=',
    '||', '&&', '.', '..', ':', '?', '??', '??=',
    'as', 'is', 'is!', '++', '--', 'new'
  ]

  self.unary_operators = [
    '+', '-', '*', '!', '~', '.', '..', '?.', '++', '--', 'new'
  ]

  self.postfix_operators = ['++', '--']

  groupers = ['(', ')', ',', '[', ']', '{', '}']
  group_starts = ['(', '[', ',', '{']
  group_mids = [',']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  keywords = [
    'abstract', 'assert', 'async', 'await', 'break', 'case', 'catch',
    'class', 'const', 'continue', 'covariant', 'default', 'deferred', 'do',
    'dynamic', 'else', 'enum', 'export', 'extends', 'external', 'factory',
    'final', 'finally', 'for', 'Function', 'get', 'hide', 'if',
    'implements', 'import', 'in', 'interface', 'library', 'mixin', 'on',
    'operator', 'part', 'rethrow', 'return', 'set', 'show', 'static',
    'switch', 'sync', 'throw', 'try', 'typedef', 'var', 'void', 'while',
    'with', 'yield'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  types = ['int', 'double', 'String', 'List', 'bool', 'void']

  types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
  operand_types.append('type')

  values = ['false', 'true', 'null', 'this', 'super']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
    integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
    real_tb, real_exponent_tb,
    keyword_tb, types_tb, values_tb,
    groupers_tb, known_operator_tb,
    identifier_tb, annotation_tb, symbol_tb, class_type_tb,
    string_tb, raw_string_tb,
    slash_slash_comment_tb, slash_star_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
  tokens = Examiner.combine_identifier_colon(
    tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
  self.tokens = tokens
  self.convert_identifiers_to_labels()

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_all_lines(tokens)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = ['number', 'symbol']
  self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_paired_blockers_confidence(['{'], ['}'])

  self.calc_line_length_confidence(code, self.max_expected_line)
# Examiner for D source code; block_comment_limit is passed to the
# NestedCommentTokenBuilder for '/+ +/' comments.
def __init__(self, code, block_comment_limit):
  super().__init__()

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  integer_tb = IntegerTokenBuilder("'")
  integer_exponent_tb = IntegerExponentTokenBuilder("'")
  hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
  binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
  suffixed_integer_tb = SuffixedIntegerTokenBuilder(['U', 'L', 'LU', 'UL'], False, None)
  real_tb = RealTokenBuilder(False, False, "'")
  suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l', 'i'], False, None)
  real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
  hex_real_tb = HexRealExponentTokenBuilder()
  operand_types.append('number')

  leads = '_'
  extras = '_'
  identifier_tb = IdentifierTokenBuilder(leads, extras)
  operand_types.append('identifier')

  attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
  operand_types.append('attribute')

  # string suffix: c,w,d
  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 0)
  r_string_tb = PrefixedStringTokenBuilder('r', True, quotes)
  backtick_string_tb = EscapedStringTokenBuilder(['`'], 0)
  x_string_tb = PrefixedStringTokenBuilder('x', True, quotes)
  q_string_tb = PrefixedStringTokenBuilder('q', True, quotes)  # q{} string
  cwd_string_tb = SuffixedStringTokenBuilder(quotes, 'cwd', False)
  operand_types.append('string')

  class_type_tb = ClassTypeTokenBuilder()
  operand_types.append('class')

  slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
  slash_star_comment_tb = SlashStarCommentTokenBuilder()
  slash_plus_comment_tb = NestedCommentTokenBuilder('/+', '+/', block_comment_limit)

  line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)

  terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

  known_operators = [
    '/', '/=', '.', '..', '...', '&', '&=', '&&', '|', '|=', '||',
    '-', '-=', '--', '+', '+=', '++',
    '<', '<=', '<<', '<<=', '>', '>=', '>>=', '>>>=', '>>', '>>>',
    '!', '!=', '?', ',', ':', '$', '=', '==',
    '*', '*=', '%', '%=', '^', '^=', '^^', '^^=', '~', '~=',
    '@', '=>', '#',
    'new', 'delete', 'typeof', 'is'
  ]

  self.unary_operators = [
    '+', '-', '*', '!', '&', '~', '++', '--', ':',
    'new', 'delete', 'typeof', 'is'
  ]

  self.postfix_operators = [
    '++', '--', '&', ':'
  ]

  groupers = ['(', ')', ',', '[', ']', '{', '}']
  group_starts = ['(', '[', ',', '{']
  group_mids = [',']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

  keywords = [
    'abstract', 'alias', 'align', 'asm', 'assert', 'auto', 'body', 'break',
    'case', 'cast', 'catch', 'class', 'const', 'continue', 'debug',
    'default', 'delegate', 'deprecated', 'do', 'else', 'enum', 'export',
    'extern', 'final', 'finally', 'for', 'foreach', 'foreach_reverse',
    'function', 'goto', 'if', 'immutable', 'import', 'in', 'inout',
    'interface', 'invariant', 'lazy', 'macro', 'mixin', 'module', 'nothrow',
    'out', 'override', 'package', 'pragma', 'private', 'protected',
    'public', 'pure', 'ref', 'return', 'scope', 'shared', 'static',
    'struct', 'switch', 'synchronized', 'template', 'throw', 'try',
    'typeid', 'union', 'unittest', 'version', 'while', 'with',
    '__gshared', '__traits', '__vector', '__parameters'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  types = [
    'bool', 'byte', 'cdouble', 'cent', 'cfloat', 'char', 'creal', 'dchar',
    'double', 'float', 'idouble', 'ifloat', 'int', 'ireal', 'long', 'real',
    'short', 'ubyte', 'ucent', 'uint', 'ulong', 'ushort', 'void', 'wchar'
  ]

  types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
  operand_types.append('type')

  values = [
    'false', 'null', 'super', 'this', 'true',
    '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
    '__FUNCTION__', '__PRETTY_FUNCTION__',
    '__DATE__', '__EOF__', '__TIME__', '__TIMESTAMP__',
    '__VENDOR__', '__VERSION__'
  ]

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
    integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
    suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
    hex_real_tb,
    keyword_tb, types_tb, values_tb,
    groupers_tb, known_operator_tb,
    identifier_tb, attribute_tb, class_type_tb,
    string_tb, r_string_tb, x_string_tb, backtick_string_tb,
    q_string_tb, cwd_string_tb,
    slash_slash_comment_tb, slash_star_comment_tb, slash_plus_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
  tokens = Examiner.combine_identifier_colon(
    tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
  self.tokens = tokens
  self.convert_identifiers_to_labels()

  number_suffixes = ['f', 'F', 'i', 'I', 'u', 'U', 'l', 'L',
                     'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU']
  tokens = self.combine_tokens_and_adjacent_types(
    tokens, 'number', 'identifier', number_suffixes)
  string_suffixes = ['c', 'w', 'd']
  self.tokens = self.combine_tokens_and_adjacent_types(
    tokens, 'string', 'identifier', string_suffixes)

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_all_lines(tokens)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = ['number', 'symbol']
  self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_paired_blockers_confidence(['{'], ['}'])

  self.calc_line_length_confidence(code, self.max_expected_line)
# Examiner for Julia source code; block_comment_limit is passed to the
# NestedCommentTokenBuilder for '#= =#' comments.
def __init__(self, code, block_comment_limit):
  super().__init__()

  self.newlines_important = 'parens'

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  integer_tb = IntegerTokenBuilder(None)
  integer_exponent_tb = IntegerExponentTokenBuilder(None)
  hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF')
  real_tb = RealTokenBuilder(False, False, None)
  real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
  imaginary_tb = SuffixedRealTokenBuilder(False, False, ['im', 'cx'], True, None)
  operand_types.append('number')

  leads = '_'
  extras = '_'
  suffixes = '!'
  identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
  operand_types.append('identifier')

  symbol_tb = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
  operand_types.append('symbol')

  attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
  operand_types.append('attribute')

  dollar_sign_tb = SingleCharacterTokenBuilder('$', 'identifier', True)

  quotes = ['"', "'", "’"]
  string_tb = EscapedStringTokenBuilder(quotes, 0)
  raw_string_tb = PrefixedRawStringTokenBuilder('raw', True, quotes)
  b_string_tb = PrefixedStringTokenBuilder('b', True, quotes)
  triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
  operand_types.append('string')

  comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')
  nested_comment_tb = NestedCommentTokenBuilder('#=', '=#', block_comment_limit)

  line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)

  terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

  known_operators = [
    'where', 'in', 'isa', '′', "'",
    '+', '-', '*', '/', '\\', '^', '%', '//',
    '<<', '>>', '<<<', '>>>',
    ':', '=', '==', '!=', '===', '!==',
    '+=', '-=', '*=', '/=', '^=', '%=',
    '<', '>', '<=', '>=',
    '~', '&', '|', '!', '&&', '||',
    '?', '.', '<:', '>:', '::', '->', '...', '..',
    '∀', '≤', '≥', '⊻', '⊽', '⊼'
  ]

  # 0x391 through 0x3a9 (capital)
  # 0x3b1 through 0x3c9 (small)
  greek_letters = [
    'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν',
    'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω'
  ]

  greek_letter_tb = CaseSensitiveListTokenBuilder(
    greek_letters, 'identifier', True)

  self.unary_operators = [
    'isa', '+', '-', '~', '!', '.', ':', '::', "'", '<:', '>:', 'in', '..'
  ]

  self.postfix_operators = ['...', '′']

  groupers = ['(', ')', ',', '[', ']', '{', '}']
  # group_starts = ['(', '[', ',', '{']
  group_mids = [',']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  keywords = [
    'baremodule', 'begin', 'break', 'catch', 'const', 'continue', 'do',
    'else', 'elseif', 'end', 'export', 'finally', 'for', 'function',
    'global', 'if', 'import', 'let', 'local', 'macro', 'module', 'quote',
    'return', 'struct', 'try', 'using', 'while',
    'abstract', 'mutable', 'primitive', 'type'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  types = [
    'Int8', 'UInt8', 'Int16', 'UInt16', 'Int32', 'UInt32', 'Int64', 'UInt64',
    'Int128', 'UInt128', 'Float16', 'Float32', 'Float64', 'Bool', 'Char'
  ]

  types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
  operand_types.append('type')

  values = ['false', 'true']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
    integer_tb, integer_exponent_tb, hex_integer_tb,
    real_tb, real_exponent_tb, imaginary_tb,
    keyword_tb, types_tb, values_tb,
    groupers_tb, known_operator_tb,
    identifier_tb, symbol_tb, attribute_tb, dollar_sign_tb, greek_letter_tb,
    string_tb, raw_string_tb, b_string_tb, triple_quote_string_tb,
    comment_tb, nested_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
  tokens = JuliaExaminer.split_symbols_to_operators_identifiers(tokens, group_ends)
  self.tokens = tokens
  self.convert_keywords_to_identifiers()

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_parens_continued_lines(tokens)
  tokens = Examiner.join_operator_continued_lines(tokens, self.postfix_operators)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = ['number', 'identifier', 'symbol']
  self.calc_operand_confidence(tokens, operand_types_2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_paired_blockers_confidence(['{'], ['}'])

  self.calc_line_length_confidence(code, self.max_expected_line)
# Examiner for Scala source code.
def __init__(self, code):
  super().__init__()

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  integer_tb = IntegerTokenBuilder("'")
  integer_exponent_tb = IntegerExponentTokenBuilder("'")
  hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
  long_integer_tb = SuffixedIntegerTokenBuilder('L', False, None)
  real_tb = RealTokenBuilder(False, False, "'")
  real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
  float_real_tb = SuffixedRealTokenBuilder(False, False, ['f'], False, None)
  operand_types.append('number')

  leads = '_'
  extras = '_'
  identifier_tb = IdentifierTokenBuilder(leads, extras)
  operand_types.append('identifier')

  symbol_tb = PrefixedIdentifierTokenBuilder("'", 'symbol', True)
  operand_types.append('symbol')

  quotes = ['"']
  string_tb = EscapedStringTokenBuilder(quotes, 0)
  triple_string_tb = TripleQuoteStringTokenBuilder(quotes)
  operand_types.append('string')

  slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
  slash_star_comment_tb = SlashStarCommentTokenBuilder()

  line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)

  terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

  known_operators = [
    '+', '-', '*', '/', '%',
    '&', '|', '^', '<<', '>>', '&&', '||',
    '=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
    '>:', '⇒', '=>', '<%', '<:', '←', '<-', '#', '@',
    '==', '!=', '>', '<', '>=', '<=',
    '!', '~', '<<<', '>>>', '.', '++', '--', 'new'
  ]

  self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']
  self.postfix_operators = ['++', '--']

  groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
  group_starts = ['(', '[', ',', '{']
  group_mids = [',', ':']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  keywords = [
    'abstract', 'case', 'catch', 'class', 'def', 'do', 'else', 'extends',
    'final', 'finally', 'for', 'forSome', 'if', 'implicit', 'import',
    'lazy', 'match', 'object', 'override', 'package', 'private',
    'protected', 'return', 'sealed', 'then', 'throw', 'trait', 'try',
    'type', 'val', 'var', 'while', 'with', 'yield'
  ]

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  values = ['false', 'true', 'null', 'this', 'super']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
    integer_tb, integer_exponent_tb, hex_integer_tb, long_integer_tb,
    real_tb, real_exponent_tb, float_real_tb,
    keyword_tb, values_tb,
    groupers_tb, known_operator_tb,
    identifier_tb, symbol_tb,
    string_tb, triple_string_tb,
    slash_slash_comment_tb, slash_star_comment_tb,
    self.unknown_operator_tb, invalid_token_builder
  ]

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_all_lines(tokens)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  self.calc_group_confidence(tokens, group_mids)

  operand_types_2 = ['number', 'symbol']
  self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_paired_blockers_confidence(['{'], ['}'])

  self.calc_line_length_confidence(code, self.max_expected_line)
# Examiner for MATLAB source code; version == 'octave' enables the
# Octave-specific operators, keywords, and comment styles.
def __init__(self, code, version):
  super().__init__()

  operand_types = []

  whitespace_tb = WhitespaceTokenBuilder()
  newline_tb = NewlineTokenBuilder()

  integer_tb = IntegerTokenBuilder("'")
  integer_exponent_tb = IntegerExponentTokenBuilder("'")
  hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF')
  binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
  real_tb = RealTokenBuilder(False, False, "'")
  real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
  operand_types.append('number')

  leads = '_'
  extras = '_'
  identifier_tb = IdentifierTokenBuilder(leads, extras)
  operand_types.append('identifier')

  command_tb = PrefixedIdentifierTokenBuilder('!', 'command', False)
  metaclass_tb = PrefixedIdentifierTokenBuilder('?', 'metaclass', False)

  quotes = ['"', "'", "’"]
  string_tb = MatlabStringTokenBuilder(quotes, False)
  operand_types.append('string')

  line_comment_m_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')
  line_comment_o_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')
  block_comment_m_tb = BlockTokenBuilder('%{', '%}', 'comment')
  block_comment_o_tb = BlockTokenBuilder('#{', '#}', 'comment')

  line_continuation_tb = KeywordTokenBuilder('...', 'line continuation')

  known_operators = [
    '+', '-', '.*', '*', './', '/', '\\', '.^', '^', ".'", "'",
    '=', '==', '~=', '>', '>=', '<', '<=',
    '&', '|', '&&', '||', '~', '@', '.', '.?'
  ]

  operators_octave = [
    '++', '--', '+=', '-=', '*=', '/=', '^=', '!', '!=', '**'
  ]

  if version == 'octave':
    known_operators += operators_octave

  self.unary_operators = ['+', '-', '~', '@']
  self.postfix_operators = ["'"]

  groupers = ['(', ')', ',', '[', ']', '{', '}', ';', ':']
  group_starts = ['(', '[', ',', '{']
  # group_mids = [',', ';', ':']
  group_ends = [')', ']', '}']

  groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

  known_operator_tb = CaseSensitiveListTokenBuilder(
    known_operators, 'operator', False)

  keywords = [
    'break', 'case', 'catch', 'classdef', 'continue', 'else', 'elseif',
    'end', 'for', 'function', 'global', 'if', 'otherwise', 'parfor',
    'persistent', 'return', 'spmd', 'switch', 'try', 'while'
  ]

  keywords_octave = ['endfor', 'endif', 'endwhile']

  if version == 'octave':
    keywords += keywords_octave

  keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

  values = ['inf', 'Nan']

  values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
  operand_types.append('value')

  invalid_token_builder = InvalidTokenBuilder()

  tokenbuilders = [
    newline_tb, whitespace_tb, line_continuation_tb,
    integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
    real_tb, real_exponent_tb,
    keyword_tb, values_tb,
    groupers_tb, known_operator_tb,
    identifier_tb, command_tb, metaclass_tb,
    string_tb, line_comment_m_tb, block_comment_m_tb
  ]

  tokenbuilders_2 = [line_comment_o_tb, block_comment_o_tb]

  if version == 'octave':
    tokenbuilders += tokenbuilders_2

  tokenbuilders_9 = [self.unknown_operator_tb, invalid_token_builder]
  tokenbuilders += tokenbuilders_9

  tokenizer = Tokenizer(tokenbuilders)
  tokens = tokenizer.tokenize(code)
  tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
  self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')

  self.calc_statistics()

  tokens = self.source_tokens()
  tokens = Examiner.join_all_lines(tokens)

  self.calc_token_confidence()
  self.calc_token_2_confidence()

  num_operators = self.count_my_tokens(['operator', 'invalid operator'])
  if num_operators > 0:
    self.calc_operator_confidence(num_operators)
    allow_pairs = []
    self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
    self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
    self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

  # self.calc_group_confidence(tokens, group_mids)

  # operand_types_2 = ['number']
  # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
  self.calc_operand_n_confidence(tokens, operand_types, 4)

  self.calc_keyword_confidence()

  self.calc_paired_blockers_confidence(['{'], ['}'])

  self.calc_line_length_confidence(code, self.max_expected_line)
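
# Illustrative sketch only: a miniature, self-contained version of the
# builder-based tokenizing idea used by the examiners above. None of these
# names (MiniBuilder, mini_tokenize) come from the original code; they are
# hypothetical stand-ins showing how a tokenizer can try several builders at
# one position and keep the longest match, falling back to an 'invalid' token.
import re


class MiniBuilder:
  def __init__(self, pattern, group):
    self.regex = re.compile(pattern)
    self.group = group          # token category, e.g. 'number' or 'identifier'

  def match_length(self, text, pos):
    # return how many characters this builder would accept at pos (0 = no match)
    m = self.regex.match(text, pos)
    return len(m.group(0)) if m else 0


def mini_tokenize(text, builders):
  tokens = []
  pos = 0
  while pos < len(text):
    # ask every builder for a match length and keep the longest one
    best_len, best_group = 0, 'invalid'
    for builder in builders:
      length = builder.match_length(text, pos)
      if length > best_len:
        best_len, best_group = length, builder.group
    if best_len == 0:
      best_len = 1              # consume one character as 'invalid'
    tokens.append((best_group, text[pos:pos + best_len]))
    pos += best_len
  return tokens


if __name__ == '__main__':
  builders = [
    MiniBuilder(r'\s+', 'whitespace'),
    MiniBuilder(r'\d+', 'number'),
    MiniBuilder(r'[A-Za-z_][A-Za-z0-9_]*', 'identifier'),
    MiniBuilder(r'[+\-*/=]', 'operator'),
  ]
  print(mini_tokenize('total = count + 42', builders))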