Example #1
0
 def __escape_z__():
   InvalidTokenBuilder.__escape_z__()
   WhitespaceTokenBuilder.__escape_z__()
   NewlineTokenBuilder.__escape_z__()
   EscapedStringTokenBuilder.__escape_z__()
   PrefixedStringTokenBuilder.__escape_z__()
   SuffixedStringTokenBuilder.__escape_z__()
   IntegerTokenBuilder.__escape_z__()
   IntegerExponentTokenBuilder.__escape_z__()
   PrefixedIntegerTokenBuilder.__escape_z__()
   SuffixedIntegerTokenBuilder.__escape_z__()
   RealTokenBuilder.__escape_z__()
   RealExponentTokenBuilder.__escape_z__()
   SuffixedRealTokenBuilder.__escape_z__()
   IdentifierTokenBuilder.__escape_z__()
   PrefixedIdentifierTokenBuilder.__escape_z__()
   CaseInsensitiveListTokenBuilder.__escape_z__()
   CaseSensitiveListTokenBuilder.__escape_z__()
   SingleCharacterTokenBuilder.__escape_z__()
   SlashSlashCommentTokenBuilder.__escape_z__()
   SlashStarCommentTokenBuilder.__escape_z__()
   ClassTypeTokenBuilder.__escape_z__()
   HexRealExponentTokenBuilder.__escape_z__()
   NestedCommentTokenBuilder.__escape_z__()
   return 'Escape ?Z'
Example #2
0
 def __escape_z__():
     InvalidTokenBuilder.__escape_z__()
     WhitespaceTokenBuilder.__escape_z__()
     NewlineTokenBuilder.__escape_z__()
     EscapedStringTokenBuilder.__escape_z__()
     PrefixedStringTokenBuilder.__escape_z__()
     IntegerTokenBuilder.__escape_z__()
     IntegerExponentTokenBuilder.__escape_z__()
     RealTokenBuilder.__escape_z__()
     RealExponentTokenBuilder.__escape_z__()
     IdentifierTokenBuilder.__escape_z__()
     CaseInsensitiveListTokenBuilder.__escape_z__()
     CaseSensitiveListTokenBuilder.__escape_z__()
     BlockTokenBuilder.__escape_z__()
     TripleQuoteStringTokenBuilder.__escape_z__()
     SlashSlashCommentTokenBuilder.__escape_z__()
     TripleSlashCommentTokenBuilder.__escape_z__()
     LeadToEndOfLineTokenBuilder.__escape_z__()
     NullTokenBuilder.__escape_z__()
     ClassTypeTokenBuilder.__escape_z__()
     return 'Escape ?Z'
Example #3
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(False, False, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '~', '<<', '>>', '>>>', '>>>=', '^', '.', '::', '++',
            '--', '&&', '||', '?', '->', 'new'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '!', '~', '++', '--', 'new']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'abstract', 'assert', 'break', 'case', 'catch', 'class', 'const',
            'continue', 'default', 'do', 'else', 'enum', 'extends', 'final',
            'finally', 'for', 'goto', 'if', 'implements', 'import',
            'instanceof', 'interface', 'native', 'package', 'private',
            'protected', 'public', 'return', 'static', 'strictfp', 'super',
            'switch', 'synchronized', 'throw', 'throws', 'transient', 'try',
            'volatile', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'boolean', 'byte', 'char', 'double', 'float', 'int', 'long',
            'short', 'string', 'void', 'Integer', 'String', 'StringBuilder',
            'File', 'Exception', 'IOException'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'null', 'this', 'true']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, keyword_tb,
            types_tb, values_tb, known_operator_tb, groupers_tb, identifier_tb,
            class_type_tb, decorator_tb, string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.convert_keywords_to_identifiers(['::', '.'])
        self.convert_operators_to_identifiers(['::', '.'])

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #4
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        directive_tb = DirectiveTokenBuilder()

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 10)
        prefixed_string_tb = PrefixedStringTokenBuilder('@', False, quotes)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        directives = [
            '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
            '#elif', '#import', '#line', '#include'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        c_preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')
        c_pragma_tb = LeadToEndOfLineTokenBuilder('#pragma', True,
                                                  'preprocessor')
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '<<', '>>', '~', '.', '->', '++', '--', '&&', '||', '^',
            '?', '##'
        ]

        self.unary_operators = [
            '+', '-', '*', '!', '&', '^', '~', '++', '--', '##'
        ]

        self.postfix_operators = ['++', '--', '&', '->', '*', '^']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'atomic', 'break', 'bycopy', 'byref', 'case', 'continue',
            'default', 'do', 'else', 'for', 'goto', 'if', 'IMP', 'in',
            'inline', 'inout', 'nonatomic', 'oneway', 'out', 'Protocol',
            'restrict', 'retain', 'return', 'SEL', 'sizeof', 'switch',
            'typedef', 'while', '@interface', '@end', '@implementation',
            '@protocol', '@class', '@public', '@protected', '@private',
            '@property', '@try', '@throw', '@catch()', '@finally',
            '@synthesize', '@dynamic', '@selector'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'auto', 'char', 'const', 'double', 'enum', 'extern', 'float', 'id',
            'int', 'long', 'register', 'short', 'signed', 'static', 'struct',
            'union', 'unsigned', 'void', 'volatile', '_Bool', '_Complex',
            '_Imaginary', 'BOOL', 'Class'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['self', 'super', 'nil', 'YES', 'NO', 'NULL', '...']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            directive_tb, identifier_tb, class_type_tb, string_tb,
            prefixed_string_tb, slash_slash_comment_tb, slash_star_comment_tb,
            c_preprocessor_tb, c_warning_tb, c_error_tb, c_pragma_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()
        self.convert_values_to_operators()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #5
0
    def __init__(self, code, block_comment_limit):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(False, True, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', '_')
        octal_integer_tb = PrefixedIntegerTokenBuilder('0o', True, '01234567_')
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', True, '0123456789ABCDEFabcdef_')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', True, '01_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        lifetime_tb = IdentifierTokenBuilder("'", extras)

        attribute_tb = RustAttributeTokenBuilder()

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 10)
        bstring_tb = PrefixedStringTokenBuilder('b', True, quotes)
        rstring_tb = RustRawStringTokenBuilder()
        operand_types.append('string')

        char_tb = SingleCharStringTokenBuilder()

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = NestedCommentTokenBuilder(
            '/*', '*/', block_comment_limit)

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '^', '!', '&', '|', '&&', '||', '<<',
            '>>', '+=', '-=', '*=', '/=', '%=', '^=', '&=', '|-', '<<=', '>>=',
            '=', '==', '!=', '>', '<', '>=', '<=', '@', '.', '..', '...', '->',
            '#', '$', '?', 'in', '&mut'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '&mut']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::', '=>']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '::', '=>']
        group_ends = [')', ']', '}', ')|']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'as', 'break', 'const', 'continue', 'crate'
            'else', 'enum', 'extern', 'fn', 'for', 'if', 'impl', 'let', 'loop',
            'match', 'mod', 'move', 'mut', 'pub', 'ref', 'return', 'static',
            'struct', 'trait', 'type', 'unsafe', 'use', 'where', 'while'
        ]

        keywords_2018 = ['dyn', 'union', 'static']

        keywords_future = [
            'abstract', 'become', 'box', 'do', 'final', 'macro', 'override',
            'priv', 'typeof', 'unsized', 'virtual', 'yield', 'async', 'await',
            'try'
        ]

        keywords += keywords_2018
        keywords += keywords_future

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'Self', 'u8', 'i8', 'u16', 'i16', 'u32', 'i32', 'u64', 'i64',
            'u128', 'i128', 'usize', 'isize', 'f32', 'f64'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['self', 'true', 'false', 'super', '_']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, octal_integer_tb, hex_integer_tb,
            binary_integer_tb, real_tb, real_exponent_tb, keyword_tb, types_tb,
            values_tb, groupers_tb, known_operator_tb, identifier_tb, char_tb,
            lifetime_tb, class_type_tb, attribute_tb, string_tb, bstring_tb,
            rstring_tb, slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = self.combine_numbers_and_adjacent_types(tokens)
        self.convert_operators_to_identifiers()
        self.convert_bars_to_groups()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #6
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789ABCDEFabcdef_')
        real_tb = RealTokenBuilder(True, True, '_')
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)
        operand_types.append('decorator')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        class_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '+=', '-=', '*=', '/=', '%=', '++',
            '--', '&&', '||', '!', '==', '!=', '===', '!==', '<', '>', '<=',
            '>=', '!!', '?.', '?:', '::', '..', ':', '?', '.'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '!', '*', '++', '--']

        self.postfix_operators = ['++', '--', ':']

        groupers = ['->', '(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = ['->', ',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'as', 'as?', 'break', 'class', 'continue', 'do', 'else', 'for',
            'fun', 'if', 'in', '!in', 'is', '!is', 'object', 'package',
            'return', 'super', 'throw', 'try', 'typealias', 'typeof', 'val',
            'var', 'when', 'while', 'by', 'catch', 'constructor', 'delegate',
            'dynamic', 'field', 'file', 'finally', 'get', 'import', 'init',
            'param', 'property', 'receiver', 'set', 'setparam', 'where',
            'actual', 'abstract', 'annotation', 'companion', 'const',
            'crossinline', 'data', 'enum', 'expect', 'external', 'final',
            'infix', 'inline', 'inner', 'internal', 'lateinit', 'noinline',
            'open', 'operator', 'out', 'override', 'private', 'protected',
            'public', 'reified', 'sealed', 'suspend', 'tailrec', 'vararg'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'Byte', 'Short', 'Int', 'Long', 'Float', 'Double', 'Char', 'u',
            'f', 'ul', 'UInt', 'ULong', 'UByte', 'UShort'
        ]

        type_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'null', 'this', 'true']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, real_tb, real_exponent_tb,
            keyword_tb, type_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, class_tb, decorator_tb, string_tb,
            triple_quote_string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = self.combine_numbers_and_adjacent_types(tokens)

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #7
0
    def __init__(self, code, variant):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        prefixed_string_tb = PrefixedStringTokenBuilder('@', False, quotes)
        char_tb = FsharpCharTokenBuilder(["'", "’"])
        operand_types.append('string')

        slash_slash_comment_tb = NullTokenBuilder()
        parens_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')
        triple_slash_comment_tb = NullTokenBuilder()
        if variant in ['fsharp']:
            slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
            triple_slash_comment_tb = TripleSlashCommentTokenBuilder()

        directives = [
            '#if', '#else', '#elif', '#endif', '#define', '#undef', '#line',
            '#region', '#endregion', '#pragma'
        ]

        preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')

        known_operators = [
            'and', 'as', 'in', 'mod', 'not', 'of', 'or', 'when', '::', '+',
            '-', '*', '/', '+.', '-.', '*.', '/.', '=', "'", '->', '>', '<',
            '>=', '<=', '==', '^', '||', '.', '#'
        ]

        known_operators_fsharp = [
            'new', '!', '!=', '%', '%%', '%?', '&', '&&', '&&&', '(|', '|)',
            '*?', '**', '+?', '-?', '->', '..', '.. ..', '/?', ':', ':=', ':/',
            '<<', '<<<', '<-', '<>', '<>?', '<=?', '<|', '<||', '<|||', '<@',
            '@>', '<@@', '@@>', '=?', '==', '>?', '>>', '>>>', '>=?', '?',
            '|||', '^^^', '?>=', '?>', '?<=', '?<', '?=', '?<>', '?+', '?-',
            '?*', '?/', '>=?', '>?', '<=?', '<?', '=?', '<>?', '+?', '-?',
            '*?', '/?', '?>=?', '?>?', '?<=?', '?<?', '?=?', '?<>?', '?+?',
            '?-?', '?*?', '?/?', '@', '|>', '||>', '|||>', '~~', '~~~', '~-',
            '~+', ':>', ':?>', "'"
        ]

        if variant in ['fsharp']:
            known_operators += known_operators_fsharp

        self.unary_operators = ['new', 'not', "'", '-']

        self.postfix_operators = ["'"]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        groupers = [
            '(', ')', ',', '[', ']', '{', '}', 'begin', 'end', ';', '|'
        ]

        groupers_fsharp = ['[|', '|]', '[<', '>]', '^']

        if variant in ['fsharp']:
            groupers += groupers_fsharp

        # group_starts = ['(', '[', ',', '{', '[|', '[<']
        group_mids = [',', ';', '^', '|']
        group_ends = [')', ']', '}', '|]', '>]']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'assert', 'class', 'def', 'do', 'done', 'downto', 'else',
            'exception', 'failwith', 'for', 'fun', 'function', 'if', 'inherit',
            'lazy', 'let', 'match', 'method', 'module', 'object', 'open',
            'raise', 'rec', 'sig', 'then', 'to', 'try', 'type', 'val',
            'virtual', 'while', 'with'
        ]

        keywords_fsharp = [
            'abstract', 'break', 'default', 'delegate', 'downcast', 'elif',
            'extern', 'finally', 'fixed', 'global', 'inline', 'interface',
            'internal', 'let!', 'match!', 'member', 'mutable', 'namespace',
            'override', 'private', 'public', 'return', 'return!', 'upcast',
            'use', 'use!', 'yield', 'yield!'
        ]

        if variant in ['fsharp']:
            keywords += keywords_fsharp

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'bool', 'byte', 'char', 'double', 'float', 'int', 'list', 'long',
            'number', 'object', 'range', 'string', 'struct', 'union', 'unit',
            'void'
        ]

        types_fsharp = [
            'decimal', 'sbyte', 'short', 'uint', 'ulong', 'ushort', 'void'
        ]

        if variant in ['fsharp']:
            types += types_fsharp

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['base', 'false', 'null', 'true', '_']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, integer_exponent_tb,
            real_tb, real_exponent_tb, keyword_tb, types_tb, values_tb,
            known_operator_tb, groupers_tb, identifier_tb, class_type_tb,
            string_tb, triple_quote_string_tb, prefixed_string_tb, char_tb,
            triple_slash_comment_tb, slash_slash_comment_tb,
            parens_star_comment_tb, preprocessor_tb, c_error_tb, c_warning_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #8
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        annotation_tb = PrefixedIdentifierTokenBuilder('@', 'annotation',
                                                       False)
        operand_types.append('annotation')

        symbol_tb = PrefixedIdentifierTokenBuilder('#', 'symbol', True)
        operand_types.append('symbol')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '~/', '%', '^', '=', '==', '!=', '>', '>=',
            '<', '<=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=',
            '>>=', '!', '&', '|', '~', '<<', '>>', '~/=', '||', '&&', '.',
            '..', ':', '?', '??', '??=', 'as', 'is', 'is!', '++', '--', 'new'
        ]

        self.unary_operators = [
            '+', '-', '*', '!', '~', '.', '..'
            '?.', '++', '--', 'new'
        ]

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'abstract', 'assert', 'async', 'await', 'break', 'case', 'catch',
            'class', 'const', 'continue', 'covariant', 'default', 'deferred',
            'do', 'dynamic', 'else', 'enum', 'export', 'extends', 'external',
            'factory', 'final', 'finally', 'for', 'Function', 'get', 'hide',
            'if', 'implements', 'import', 'in', 'interface', 'library',
            'mixin', 'on', 'operator', 'part', 'rethrow', 'return', 'set',
            'show', 'static', 'switch', 'sync', 'throw', 'try', 'typedef',
            'var', 'void', 'while', 'with', 'yield'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = ['int', 'double', 'String', 'List', 'bool', 'void']

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'true', 'null', 'this', 'super']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            real_tb, real_exponent_tb, keyword_tb, types_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, annotation_tb,
            symbol_tb, class_type_tb, string_tb, raw_string_tb,
            slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #9
0
  def __init__(self, code, block_comment_limit):
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['U', 'L', 'LU', 'UL'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l', 'i'], False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    hex_real_tb = HexRealExponentTokenBuilder()
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    operand_types.append('attribute')

    # string suffix: c,w,d
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    r_string_tb = PrefixedStringTokenBuilder('r', True, quotes)
    backtick_string_tb = EscapedStringTokenBuilder(['`'], 0)
    x_string_tb = PrefixedStringTokenBuilder('x', True, quotes)
    q_string_tb = PrefixedStringTokenBuilder('q', True, quotes)
    # q{} string
    cwd_string_tb = SuffixedStringTokenBuilder(quotes, 'cwd', False)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    slash_plus_comment_tb = NestedCommentTokenBuilder('/+', '+/', block_comment_limit)

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
      '/', '/=', '.', '..', '...', '&', '&=', '&&', '|', '|=', '||',
      '-', '-=', '--', '+', '+=', '++', '<', '<=', '<<', '<<=', '>', '>=',
      '>>=', '>>>=', '>>', '>>>', '!', '!=', '?', ',', ':', '$',
      '=', '==', '*', '*=', '%', '%=', '^', '^=', '^^', '^^=', '~', '~=',
      '@', '=>', '#',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.unary_operators = [
      '+', '-', '*',
      '!', '&', '~',
      '++', '--', ':',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.postfix_operators = [
      '++', '--', '&', ':'
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
      'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
      'body', 'break', 'case', 'cast', 'catch', 'class', 'const', 'continue',
      'debug', 'default', 'delegate', 'deprecated', 'do',
      'else', 'enum', 'export', 'extern',
      'final', 'finally', 'for', 'foreach', 'foreach_reverse', 'function',
      'goto',
      'if', 'immutable', 'import', 'in', 'inout', 'interface', 'invariant',
      'lazy',
      'macro', 'mixin', 'module',
      'nothrow',
      'out', 'override',
      'package', 'pragma', 'private', 'protected', 'public', 'pure',
      'ref', 'return',
      'scope', 'shared', 'static', 'struct', 'switch', 'synchronized',
      'template', 'throw', 'try', 'typeid',
      'union', 'unittest', 'version', 'while', 'with',
      '__gshared', '__traits', '__vector', '__parameters'
]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
      'bool', 'byte', 'cdouble', 'cent', 'cfloat', 'char', 'creal',
      'dchar', 'double', 'float', 'idouble', 'ifloat', 'int', 'ireal',
      'long', 'real', 'short', 'ubyte', 'ucent', 'uint', 'ulong', 'ushort',
      'void', 'wchar'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = [
      'false', 'null', 'super', 'this', 'true',
      '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
      '__FUNCTION__', '__PRETTY_FUNCTION__',
      '__DATE__', '__EOF__', '__TIME__','__TIMESTAMP__',
      '__VENDOR__', '__VERSION__'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      line_continuation_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      real_exponent_tb,
      suffixed_real_tb,
      hex_real_tb,
      keyword_tb,
      types_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      identifier_tb,
      attribute_tb,
      class_type_tb,
      string_tb,
      r_string_tb,
      x_string_tb,
      backtick_string_tb,
      q_string_tb,
      cwd_string_tb,
      slash_slash_comment_tb,
      slash_star_comment_tb,
      slash_plus_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    number_suffixes = ['f', 'F', 'i', 'I', 'u', 'U', 'l', 'L', 'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU']
    tokens = self.combine_tokens_and_adjacent_types(tokens, 'number', 'identifier', number_suffixes)

    string_suffixes = ['c', 'w', 'd']
    self.tokens = self.combine_tokens_and_adjacent_types(tokens, 'string', 'identifier', string_suffixes)

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
Example #10
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        hex_constant_tb = PrefixedIntegerTokenBuilder(
            '$', True, '0123456789ABCDEFabcdef')
        octal_constant_tb = PrefixedIntegerTokenBuilder('&', True, '01234567')
        binary_constant_tb = PrefixedIntegerTokenBuilder('%', True, '01')
        char_constant_tb = PrefixedIntegerTokenBuilder('#', True, '0123456789')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        class_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        string_tb = EscapedStringTokenBuilder(["'"], 0)
        operand_types.append('string')

        brace_comment_tb = BraceCommentTokenBuilder()
        paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')
        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()

        known_operators = [
            '+', '-', '*', '/', '=', '<>', '>', '>=', '<', '<=', 'and', 'or',
            'not', 'xor', '&', '|', '~', '<<', '>>', ':=', '^', '~', '@', '.',
            '..', 'div', 'mod', 'shl', 'shr', 'is', 'in'
        ]

        known_operator_tb = CaseInsensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'not', '@', '^', '.']

        self.postfix_operators = ['^']

        groupers = ['(', ')', ',', '[', ']', ':']
        group_starts = ['(', '[', ',']
        group_mids = [',', ':']
        group_ends = [')', ']']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'as', 'begin', 'break', 'case', 'class', 'const', 'constructor',
            'default', 'destructor', 'do', 'downto', 'else', 'end', 'except',
            'finally', 'for', 'forward', 'function', 'goto', 'if',
            'implementation', 'inherited', 'interface', 'label', 'object',
            'of', 'on', 'otherwise', 'out', 'packed', 'private', 'procedure',
            'program', 'property', 'protected', 'public', 'raise', 'read',
            'repeat', 'resourcestring', 'strict', 'then', 'threadvar', 'to',
            'try', 'type', 'unit', 'until', 'uses', 'var', 'while', 'with',
            'write'
        ]

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        types = [
            'array', 'boolean', 'char', 'file', 'integer', 'real', 'record',
            'set', 'string'
        ]

        types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'nil', 'true']

        values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, char_constant_tb,
            keyword_tb, types_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, class_tb, string_tb, brace_comment_tb,
            paren_star_comment_tb, slash_slash_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = self.combine_identifier_colon(
            tokens, ['statement separator'], ['begin'],
            ['whitespace', 'comment', 'newline'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()
        self.convert_identifiers_to_labels_2()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = [
            'number', 'string', 'identifier', 'variable', 'symbol'
        ]
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        pair_starters = ['unit', 'class', 'begin', 'record', 'case', 'try']
        pair_enders = ['end']
        self.calc_paired_blockers_confidence(pair_starters, pair_enders)
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #11
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", '`', "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '&', '!', '^', '<<', '>>', '&^', '=',
            '+=', '-=', '*=', '<<=', '>>=', '&^=', '&&', '||', '<-', '++',
            '--', '==', '!=', '<=', '>=', ':=', '...', '.', ':', '<', '>'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '<-', ':']

        self.postfix_operators = ['++', '--', ':']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        # group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'break', 'case', 'chan', 'const', 'continue', 'default', 'defer',
            'else', 'fallthrough', 'for', 'func', 'go', 'goto', 'if', 'import',
            'interface', 'map', 'package', 'range', 'return'
            'select', 'struct', 'switch', 'type', 'var'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32',
            'int64', 'float32', 'float64', 'complex64', 'complex128', 'byte',
            'rune', 'string', 'uint', 'int', 'uintptr'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['nil', 'true', 'false']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            identifier_tb, class_type_tb, string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # tokens = Examiner.combine_identifier_colon(tokens, ['newline'], ['{'], ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #12
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("_")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '_0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder([
            'G',
            'L',
            'I',
        ], False, '_')
        real_tb = RealTokenBuilder(False, False, "_")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "_")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False,
                                                    ['G', 'D', 'F'], False,
                                                    '_')
        operand_types.append('number')

        leads = '@_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        regex_tb = RegexTokenBuilder()
        # dollar-slash slash-dollar strings (allow newline)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        shebang_tb = SheBangTokenBuilder()
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+',
            '-',
            '*',
            '/',
            '%',
            '**',
            '=',
            '==',
            '!=',
            '===',
            '!==',
            '>',
            '>=',
            '<',
            '<=',
            '+=',
            '-=',
            '*=',
            '/=',
            '%=',
            '**=',
            '&=',
            '|=',
            '^=',
            '<<=',
            '>>=',
            '!',
            '&',
            '|',
            '~',
            '<<',
            '>>',
            '>>>',
            '^',
            '?.',
            '?:',
            '<>',
            '>>>=',
            '.',
            '.&',
            '.@',
            '::',
            '=~',
            '==~',
            '*.',
            '*:',
            '..',
            '..<',
            '<=>',
            '++',
            '--',
            '->',
            '&&',
            '||',
            '?',
            '##',
            'as',
            'in',
            '!in',
            'instanceof',
            '!instanceof',
            'new',
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        # group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
            'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
            'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
            'package', 'return', 'super', 'switch', 'throw', 'throws', 'trait',
            'try', 'var', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'byte', 'char', 'double', 'float', 'int', 'long', 'short',
            'Java.lang.BigInteger'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['null', 'true', 'false', 'this']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            identifier_tb, class_type_tb, string_tb, triple_quote_string_tb,
            regex_tb, slash_slash_comment_tb, slash_star_comment_tb,
            shebang_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        # shebang line at start

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #13
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(
            ['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l'],
                                                    False, None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        directives = [
            '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
            '#elif', '#line', '#include', '#pragma'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        c_preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '~', '<<', '>>', '^', '.', '++', '--', '->', '&&', '||',
            '?', '##', '::', '<=>', '.*', '->*', 'new', 'delete', 'and',
            'and_eq', 'bitand', 'bitor', 'compl', 'not', 'not_eq', 'or',
            'or_eq', 'xor', 'xor_eq'
        ]

        self.unary_operators = [
            '+', '-', '*', '!', '&', '~', '++', '--', 'new', 'delete', 'compl',
            'not'
        ]

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'alignas',
            'alignof',
            'asm',
            'atomic_cancel',
            'atomic_commit',
            'atomic_noexcept',
            'audit',
            'auto',
            'axiom',
            'break',
            'case',
            'catch',
            'class',
            'concept',
            'const',
            'consteval',
            'constexpr',
            'const_cast',
            'continue',
            'co_await',
            'co_return',
            'co_yield',
            'decltype',
            'default',
            'do',
            'dynamic_cast',
            'else',
            'enum',
            'explicit',
            'export',
            'extern',
            'final',
            'for',
            'friend',
            'goto',
            'if',
            'import',
            'inline',
            'module',
            'mutable',
            'namespace',
            'noexcept',
            'nullptr',
            'operator',
            'override',
            'private',
            'protected',
            'public',
            'private:',
            'protected:',
            'public:',
            'reflexpr',
            'register',
            'reinterpret_cast',
            'requires',
            'return',
            'signed',
            'sizeof',
            'static',
            'static_assert',
            'static_cast',
            'struct',
            'switch',
            'synchronized',
            'template',
            'thread_local',
            'throw',
            'transaction_safe',
            'transaction_safe_dynamic'
            'try',
            'typedef',
            'typeid',
            'typename',
            'union',
            'unsigned',
            'using',
            'virtual',
            'volatile',
            'while',
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'bool', 'char', 'char8_t', 'char16_t', 'char32_t', 'double',
            'float', 'int', 'long', 'short', 'void', 'wchar_t'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'this', 'true', 'cout', 'cin']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
            keyword_tb, types_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, class_type_tb, string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, c_preprocessor_tb, c_error_tb, c_warning_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #14
0
    def __init__(self, code, year):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(
            ['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l'],
                                                    False, None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        directives = [
            '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
            '#elif', '#line', '#include', '#pragma'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        c_preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '~', '<<', '>>', '^', '.', '++', '--', '->', '&&', '||',
            '?', '##'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'auto', 'break', 'case', 'const', 'continue', 'default', 'do',
            'else', 'enum', 'extern', 'for', 'goto', 'if', 'inline',
            'register', 'return', 'signed', 'sizeof', 'static', 'struct',
            'switch', 'typedef', 'union', 'unsigned', 'volatile', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = ['char', 'double', 'float', 'int', 'long', 'short']

        types_89 = ['void']

        types_99 = ['bool', 'complex']

        if year in ['89', '99']:
            types += types_89

        if year in ['99']:
            types += types_99

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['NULL']

        values_89 = []

        values_99 = ['...', 'true', 'false']

        if year in ['89', '99']:
            values += values_89

        if year in ['99']:
            values += values_99

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb,
            whitespace_tb,
            line_continuation_tb,
            terminators_tb,
            integer_tb,
            integer_exponent_tb,
            hex_integer_tb,
            binary_integer_tb,
            suffixed_integer_tb,
            real_tb,
            real_exponent_tb,
            suffixed_real_tb,
            keyword_tb,
            types_tb,
            values_tb,
            groupers_tb,
            known_operator_tb,
            identifier_tb,
            class_type_tb,
            string_tb,
        ]

        if year in ['99']:
            tokenbuilders += [
                slash_slash_comment_tb,
            ]

        tokenbuilders += [
            slash_star_comment_tb, c_preprocessor_tb, c_error_tb, c_warning_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)