def calc_confidences(self, operand_types, group_starts, group_mids,
                     group_ends, indents):
        """Compute the full set of confidence measures for this examiner.

        Args:
            operand_types: token categories counted as operands.
            group_starts: tokens that open a group (e.g. '(' '[').
            group_mids: tokens that separate group members (e.g. ',').
            group_ends: tokens that close a group (e.g. ')' ']').
            indents: per-line indent data, or None to skip indent scoring.
        """
        joined = Examiner.join_all_lines(self.source_tokens())

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-based measures only make sense when operators exist
        operator_count = self.count_my_tokens(['operator', 'invalid operator'])
        if operator_count > 0:
            pairs = []
            self.calc_operator_confidence(operator_count)
            self.calc_operator_2_confidence(joined, operator_count, pairs)
            self.calc_operator_3_confidence(joined, operator_count,
                                            group_ends, pairs)
            self.calc_operator_4_confidence(joined, operator_count,
                                            group_starts, pairs)

        self.calc_group_confidence(joined, group_mids)

        # adjacent-operand checks: strict (numbers only) then general
        self.calc_operand_n_confidence(joined, ['number'], 2)
        self.calc_operand_n_confidence(joined, operand_types, 4)

        # self.calc_keyword_confidence()

        if indents is not None:
            self.calc_indent_confidence(indents)
# --- Esempio n. 2 (scraped example separator; stray "0" was a site vote count) ---
    def __init__(self, code):
        """Tokenize Go source text and compute language-confidence statistics.

        Args:
            code: the source text to examine, as a single string.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # backtick is Go's raw-string delimiter; the curly quote guards
        # against word-processor-mangled sources
        quotes = ['"', "'", '`', "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '&', '!', '^', '<<', '>>', '&^', '=',
            '+=', '-=', '*=', '<<=', '>>=', '&^=', '&&', '||', '<-', '++',
            '--', '==', '!=', '<=', '>=', ':=', '...', '.', ':', '<', '>'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '<-', ':']

        self.postfix_operators = ['++', '--', ':']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        # group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # FIX: a missing comma after 'return' merged it with 'select' into
        # the single bogus keyword 'returnselect', dropping both real ones
        keywords = [
            'break', 'case', 'chan', 'const', 'continue', 'default', 'defer',
            'else', 'fallthrough', 'for', 'func', 'go', 'goto', 'if', 'import',
            'interface', 'map', 'package', 'range', 'return',
            'select', 'struct', 'switch', 'type', 'var'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32',
            'int64', 'float32', 'float64', 'complex64', 'complex128', 'byte',
            'rune', 'string', 'uint', 'int', 'uintptr'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['nil', 'true', 'false']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably gives earlier builders
        # priority in the Tokenizer — confirm before reordering
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            identifier_tb, class_type_tb, string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # tokens = Examiner.combine_identifier_colon(tokens, ['newline'], ['{'], ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-based confidences apply only when operators are present
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
# --- Esempio n. 3 (scraped example separator; stray "0" was a site vote count) ---
    def __init__(self, code):
        """Tokenize Perl source text and compute language-confidence statistics.

        Args:
            code: the source text to examine, as a single string.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # Perl numeric literals may use '_' as a digit separator
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        perl_identfier_tb = PerlIdentifierTokenBuilder()
        operand_types.append('identifier')

        # Perl punctuation variables ($_, @_, $!, $0-style specials),
        # classified as plain identifiers
        specials = [
            '$_', '@_', '$$', '$"', '$(', '$)', '$>', '$<', '$;', '$]', '$[',
            '$&', '$`', "$'", '$+', '@+', '%+', '@-', '%-', '$,', '$.', '$/',
            '$\\', '$|', '$%', '$-', '$:', '$=', '$^', '$~', '$!', '$?', '$@',
            '$#', '$*'
        ]

        specials_tb = CaseInsensitiveListTokenBuilder(specials, 'identifier',
                                                      True)

        dollar_carat_tb = PerlDollarCaretIdentifierTokenBuilder()

        sigilbrace_tb = PerlSigilBraceTokenBuilder()

        # curly quote guards against word-processor-mangled sources
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        q_string_tb = PerlQStringTokenBuilder()

        # regex literal forms: bare //, m//, s///, y///, tr///
        regex_tb = RegexTokenBuilder()
        m_regex_tb = MRegexTokenBuilder()
        s_regex_tb = SRegexTokenBuilder()
        y_regex_tb = YRegexTokenBuilder()
        tr_regex_tb = TrRegexTokenBuilder()
        operand_types.append('regex')

        prototype_tb = PerlPrototypeTokenBuilder()

        comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        directives = ['#line']

        preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '**', '/', '%', '=', '==', '!=', '>', '>=', '<',
            '<=', '**=', '+=', '*=', '&=', '&.=', '<<=', '&&=', '-=', '/=',
            '|=', '|.=', '>>=', '||=', '.=', '%=', '^=', '^.=', '//=', 'x=',
            'ne', 'gt', 'ge', 'le', 'lt', 'eq', '!', '&', '|', '~', '<<', '>>',
            '^', '.', '..', '...', '++', '--', '->', '=>', '&&', '||', '?',
            '<->', '<=>', 'and', 'cmp', 'or', 'xor'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '::']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'bless', 'break', 'continue', 'die', 'do', 'else', 'elsif', 'eval',
            'exit', 'exp', 'for', 'foreach', 'if', 'last', 'lock', 'my',
            'next', 'no', 'our', 'package', 'redo', 'return', 'say', 'sub',
            'taint', 'undef', 'unless', 'until', 'use', 'wantarray', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

        values = ['NULL']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably gives earlier builders
        # priority in the Tokenizer — confirm before reordering
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, values_tb, groupers_tb,
            known_operator_tb, prototype_tb, identifier_tb, perl_identfier_tb,
            specials_tb, dollar_carat_tb, sigilbrace_tb, string_tb,
            q_string_tb, regex_tb, m_regex_tb, s_regex_tb, y_regex_tb,
            tr_regex_tb, preprocessor_tb, comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        # '__END__' presumably stops scanning at Perl's data section —
        # confirm against Tokenizer.tokenize
        tokens = tokenizer.tokenize(code, ['__END__'])
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        # operator-based confidences apply only when operators are present
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# --- Esempio n. 4 (scraped example separator; stray "0" was a site vote count) ---
    def __init__(self, code):
        """Tokenize TypeScript source text and compute confidence statistics.

        Args:
            code: the source text to examine, as a single string.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        # NOTE(review): prefixes '0H'/'0O'/'0B' — TypeScript uses 0x/0o/0b;
        # the builder's second argument presumably controls case folding —
        # confirm against PrefixedIntegerTokenBuilder
        hex_constant_tb = PrefixedIntegerTokenBuilder(
            '0H', False, '0123456789ABCDEFabcdef')
        octal_constant_tb = PrefixedIntegerTokenBuilder(
            '0O', False, '01234567')
        binary_constant_tb = PrefixedIntegerTokenBuilder('0B', False, '01')
        operand_types.append('number')

        # identifiers may start with or contain '_' and '$'
        leads = '_$'
        extras = '_$'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # curly quote guards against word-processor-mangled sources
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        terminators_tb = CaseInsensitiveListTokenBuilder(
            [';'], 'statement terminator', False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '===', '!==', '>', '>=',
            '<', '<=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=',
            '>>=', '!', '&', '|', '~', '<<', '>>', '=>', '^', '.', ':', '++',
            '--', '&&', '||', '?', '$', '?.', 'new', 'delete'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = [
            '+', '-', '!', '~', '++', '--', ':', '$', 'new', 'delete'
        ]

        self.postfix_operators = ['++', '--', ':']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        regex_tb = RegexTokenBuilder()

        keywords = [
            'break', 'case', 'catch', 'class', 'const', 'continue', 'debugger',
            'default', 'do', 'else', 'enum', 'export', 'extends', 'finally',
            'for', 'function', 'if', 'import', 'in', 'instanceof', 'return',
            'switch', 'throw', 'try', 'typeof', 'while', 'with', 'as',
            'implements', 'interface', 'let', 'package', 'private',
            'protected', 'public', 'static', 'yield', 'constructor', 'declare',
            'get', 'module', 'require', 'set', 'type', 'from', 'of'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'any', 'boolean', 'byte', 'char', 'number', 'string', 'symbol',
            'void', 'never', 'object'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['this', 'super', 'null', 'true', 'false', 'undefined']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably gives earlier builders
        # priority in the Tokenizer — confirm before reordering
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, keyword_tb, types_tb,
            values_tb, known_operator_tb, groupers_tb, regex_tb, identifier_tb,
            string_tb, slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')
        # a keyword after '.' is a property access, not a keyword
        self.convert_keywords_to_identifiers(['.'])

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-based confidences apply only when operators are present
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# --- Esempio n. 5 (scraped example separator; stray "0" was a site vote count) ---
    def __init__(self, code):
        """Tokenize Java source text and compute language-confidence statistics.

        Args:
            code: the source text to examine, as a single string.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # Java numeric literals may use '_' as a digit separator
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(False, False, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # annotations such as @Override
        decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)
        # curly quote guards against word-processor-mangled sources
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '~', '<<', '>>', '>>>', '>>>=', '^', '.', '::', '++',
            '--', '&&', '||', '?', '->', 'new'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '!', '~', '++', '--', 'new']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'abstract', 'assert', 'break', 'case', 'catch', 'class', 'const',
            'continue', 'default', 'do', 'else', 'enum', 'extends', 'final',
            'finally', 'for', 'goto', 'if', 'implements', 'import',
            'instanceof', 'interface', 'native', 'package', 'private',
            'protected', 'public', 'return', 'static', 'strictfp', 'super',
            'switch', 'synchronized', 'throw', 'throws', 'transient', 'try',
            'volatile', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # primitive types plus a few very common JDK classes
        types = [
            'boolean', 'byte', 'char', 'double', 'float', 'int', 'long',
            'short', 'string', 'void', 'Integer', 'String', 'StringBuilder',
            'File', 'Exception', 'IOException'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'null', 'this', 'true']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably gives earlier builders
        # priority in the Tokenizer — confirm before reordering
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, keyword_tb,
            types_tb, values_tb, known_operator_tb, groupers_tb, identifier_tb,
            class_type_tb, decorator_tb, string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        # keywords/operators after '::' or '.' are member accesses
        self.convert_keywords_to_identifiers(['::', '.'])
        self.convert_operators_to_identifiers(['::', '.'])

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-based confidences apply only when operators are present
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# --- Esempio n. 6 (scraped example separator; stray "0" was a site vote count) ---
    def __init__(self, code):
        """Tokenize Modula-2 source text and compute confidence statistics.

        Args:
            code: the source text to examine, as a single string.
        """
        super().__init__()
        # NOTE(review): this string looks empty — it likely originally held a
        # Ctrl-Z (SUB, \x1a) character lost in transcription; verify against
        # TrimCtrlZText's expectations
        ctrlz_char = ''
        code = self.TrimCtrlZText(code, ctrlz_char)

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        # Modula-2 suffixed radix literals: 0FFH (hex), 17C (octal), 101B (binary)
        hex_constant_tb = SuffixedIntegerTokenBuilder(
            'H', True, '0123456789ABCDEFabcdef')
        octal_constant_tb = SuffixedIntegerTokenBuilder('C', True, '01234567')
        binary_constant_tb = SuffixedIntegerTokenBuilder('B', True, '01')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ["'", '"']
        string_tb = StringTokenBuilder(quotes, 0)
        operand_types.append('string')

        # Modula-2 block comments: (* ... *)
        paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

        known_operators = [
            ':=', '=', '>', '>=', '<', '<=', '#', '<>', '+', '-', '*', '/',
            'DIV', 'MOD', 'AND', 'OR', 'NOT', '^', '.', '..', 'IN', '&'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'NOT', '@', '^', '.']

        # '^' is pointer dereference in postfix position
        self.postfix_operators = ['^']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '|']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '|']
        group_ends = [')', ']', '}']

        groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'BEGIN', 'BY', 'CASE', 'CONST', 'DEFINITION', 'DO', 'ELSE',
            'ELSIF', 'END', 'EXCEPT', 'EXIT', 'EXPORT', 'FINALLY', 'FOR',
            'FROM', 'IF', 'IMPLEMENTATION', 'IMPORT', 'LOOP', 'MODULE', 'OF',
            'PROCEDURE', 'QUALIFIED', 'REPEAT', 'THEN', 'TO', 'TYPE', 'VAR',
            'WITH', 'WHILE'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'ARRAY', 'BOOLEAN', 'CARDINAL', 'CHAR', 'INTEGER', 'POINTER',
            'REAL', 'RECORD', 'SET'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['FALSE', 'NIL', 'TRUE']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably gives earlier builders
        # priority in the Tokenizer — confirm before reordering
        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, keyword_tb, types_tb,
            values_tb, known_operator_tb, groupers_tb, identifier_tb,
            string_tb, paren_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = tokens

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-based confidences apply only when operators are present
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'identifier', 'variable']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        # Modula-2 blocks close with END; REPEAT closes with UNTIL
        self.calc_paired_blockers_confidence(
            ['BEGIN', 'RECORD', 'CASE', 'DO', 'IF', 'WHILE'], ['END'])
        self.calc_paired_blockers_confidence(['REPEAT'], ['UNTIL'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# --- Esempio n. 7 (scraped example separator; stray "0" was a site vote count) ---
    def __init__(self, code, block_comment_limit):
        """Tokenize Rust source text and compute language-confidence statistics.

        Args:
            code: the source text to examine, as a single string.
            block_comment_limit: nesting limit for /* */ comments (Rust
                block comments nest).
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        # Rust numeric literals may use '_' as a digit separator
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(False, True, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', '_')
        octal_integer_tb = PrefixedIntegerTokenBuilder('0o', True, '01234567_')
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', True, '0123456789ABCDEFabcdef_')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', True, '01_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # lifetimes look like identifiers with a leading apostrophe ('a)
        lifetime_tb = IdentifierTokenBuilder("'", extras)

        attribute_tb = RustAttributeTokenBuilder()

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 10)
        bstring_tb = PrefixedStringTokenBuilder('b', True, quotes)
        rstring_tb = RustRawStringTokenBuilder()
        operand_types.append('string')

        char_tb = SingleCharStringTokenBuilder()

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = NestedCommentTokenBuilder(
            '/*', '*/', block_comment_limit)

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        # FIX: '|-' is not a Rust operator; it was a typo for the compound
        # assignment '|=' (compare the neighboring '^=', '&=', '<<=', '>>=')
        known_operators = [
            '+', '-', '*', '/', '%', '^', '!', '&', '|', '&&', '||', '<<',
            '>>', '+=', '-=', '*=', '/=', '%=', '^=', '&=', '|=', '<<=', '>>=',
            '=', '==', '!=', '>', '<', '>=', '<=', '@', '.', '..', '...', '->',
            '#', '$', '?', 'in', '&mut'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '&mut']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::', '=>']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '::', '=>']
        # NOTE(review): ')|' as a group end looks odd — possibly related to
        # convert_bars_to_groups below; confirm intent
        group_ends = [')', ']', '}', ')|']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # FIX: a missing comma after 'crate' merged it with 'else' into the
        # single bogus keyword 'crateelse', dropping both real ones
        keywords = [
            'as', 'break', 'const', 'continue', 'crate',
            'else', 'enum', 'extern', 'fn', 'for', 'if', 'impl', 'let', 'loop',
            'match', 'mod', 'move', 'mut', 'pub', 'ref', 'return', 'static',
            'struct', 'trait', 'type', 'unsafe', 'use', 'where', 'while'
        ]

        keywords_2018 = ['dyn', 'union', 'static']

        keywords_future = [
            'abstract', 'become', 'box', 'do', 'final', 'macro', 'override',
            'priv', 'typeof', 'unsized', 'virtual', 'yield', 'async', 'await',
            'try'
        ]

        keywords += keywords_2018
        keywords += keywords_future

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'Self', 'u8', 'i8', 'u16', 'i16', 'u32', 'i32', 'u64', 'i64',
            'u128', 'i128', 'usize', 'isize', 'f32', 'f64'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['self', 'true', 'false', 'super', '_']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably gives earlier builders
        # priority in the Tokenizer — confirm before reordering
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, octal_integer_tb, hex_integer_tb,
            binary_integer_tb, real_tb, real_exponent_tb, keyword_tb, types_tb,
            values_tb, groupers_tb, known_operator_tb, identifier_tb, char_tb,
            lifetime_tb, class_type_tb, attribute_tb, string_tb, bstring_tb,
            rstring_tb, slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = self.combine_numbers_and_adjacent_types(tokens)
        self.convert_operators_to_identifiers()
        self.convert_bars_to_groups()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-based confidences apply only when operators are present
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 8
0
  def __init__(self, code):
    """Tokenize classic Visual Basic source and score how VB-like it is.

    Builds the token builders for VB6-era syntax, tokenizes ``code``,
    normalizes the token stream (labels, keyword-after-dot fixups), and
    then runs the inherited Examiner confidence calculations.

    Args:
      code: the source text to examine, as a single string.
    """
    super().__init__()

    # Token-group names that count as operands; consumed at the end by
    # calc_operand_n_confidence().
    operand_types =[]

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()
    # Classic VB continues a statement with a trailing underscore.
    line_continuation_tb = SingleCharacterTokenBuilder(['_'], 'line continuation', False)

    # Numeric literals: no digit-group separator (None).
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    operand_types.append('number')

    # VB type-declaration sigils: $ (string), % (integer), # (double), ! (single).
    variable_tb = VisualBasicVariableTokenBuilder('$%#!')
    operand_types.append('variable')

    leads = '_'
    extras = '_'
    suffixes = '$%#!'
    identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
    operand_types.append('identifier')

    quotes = ['"']
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    # Comments: REM statements, plus apostrophe (and the typographic
    # apostrophe, which appears in copy-pasted source) to end of line.
    remark_tb = RemarkTokenBuilder()
    comment_tb = LeadToEndOfLineTokenBuilder("'", True, 'comment')
    comment2_tb = LeadToEndOfLineTokenBuilder("’", True, 'comment')

    known_operators = [
      '+', '-', '*', '/', '\\', 'Mod', '^', '&',
      '>', '>=', '<', '<=', '<>', '=',
      'And', 'Or', 'Eqv', 'Is', 'Imp', 'Like', 'Not', 'Xor',
      '.'
    ]

    self.unary_operators = [
      '+', '-', 'Not'
    ]

    groupers = ['(', ')', ',', '[', ']']
    group_starts = ['(', '[', ',']
    group_mids = [',']
    group_ends = [')', ']']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    # NOTE(review): word operators and keywords are matched case-sensitively
    # (canonical casing) although VB itself is case-insensitive -- confirm
    # this is intentional for detection purposes.
    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
      'Access', 'Alias', 'Any',
      'AppActivate', 'Append', 'AppendChunk', 'Arrange', 'As',
      'Beep', 'BeginTrans', 'ByVal', 'Call', 'Case', 'Circle', 'Clear',
      'Close', 'Cls', 'CommitTrans',
      'Compare', 'Const', 'Controls', 'CreateDynaset',
      'Data',
      'DateSerial', 'DateValue', 'Declare', 'DefCur', 'DefDbl',
      'DefInt', 'DefLng', 'DefSng', 'DefStr', 'DefVar', 'Delete', 'Dim',
      'Do', 'DoEvents', 'Drag', 'Edit',
      'Else', 'ElseIf', 'End', 'EndDoc', 'EndIf',
      'Erase', 'ExecuteSQL', 'Exit',
      'Explicit', 'FieldSize', 'FileAttr', 'FileCopy', 'FileDateTime',
      'Fix', 'For', 'Form', 'Format', 'Format$', 'Forms',
      'Function', 'Get', 'GetAttr', 'GetChunk', 'GetData',
      'GetFormat', 'GetText', 'Global', 'GoSub', 'GoTo',
      'Hide', 'If', 'Input', 'Input$', 'InputBox', 'InputBox$',
      'Kill',
      'Let', 'Lib', 'Line', 'LinkExecute', 'LinkPoke',
      'LinkRequest', 'LinkSend', 'Load', 'LoadPicture', 'Loc', 'Local',
      'Lock', 'LOF', 'Loop', 'LSet',
      'MkDir', 'Move',
      'MoveFirst', 'MoveLast', 'MoveNext', 'MovePrevious', 'MoveRelative',
      'MsgBox', 'Name', 'New', 'NewPage', 'Next', 'NextBlock',
      'On', 'Open', 'OpenDataBase', 'Option',
      'Output', 'Point', 'Preserve', 'Print', 'PrintForm',
      'Private', 'PSet', 'Put', 'QBColor', 'Random', 'Randomize', 'Read',
      'ReDim', 'Refresh', 'RegisterDataBase', 'Rem', 'RemoveItem', 'Reset',
      'Restore', 'Resume', 'Return', 'RmDir',
      'Rollback', 'RSet', 'SavePicture', 'Scale',
      'Seek', 'Select', 'SendKeys', 'Set', 'SetAttr', 'SetData', 'SetFocus',
      'SetText', 'Shared', 'Shell', 'Show', 'Static', 'Step', 'Stop',
      'Sub', 'System', 'Text',
      'TextHeight', 'TextWidth', 'Then', 'Timer',
      'TimeSerial', 'TimeValue', 'To', 'Type',
      'TypeOf', 'Unload', 'Unlock', 'Until',
      'Update', 'Using', 'VarType', 'Weekday', 'Wend',
      'While', 'Width', 'Write', 'ZOrder'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    functions = [
      'Abs', 'AddItem', 'AddNew', 'Asc', 'Atn',
      'CCur',
      'CDbl', 'ChDir', 'ChDrive', 'Chr', 'Chr$', 'CInt',
      'CLng', 'Command', 'Command$', 'Cos', 'CSng',
      'CStr', 'CurDir$', 'CVar', 'CVDate', 'Date', 'Date$', 'Day',
      'Dir', 'Dir$', 'Environ$', 'EOF', 'Error', 'Error$', 'Exp',
      'FileLen', 'FreeFile',
      'Hex', 'Hex$', 'Hour',
      'InStr', 'Int', 'InStrRev', 'IsDate', 'IsEmpty', 'IsNull',
      'IsNumeric',
      'Join',
      'LBound', 'LCase', 'LCase$', 'Left', 'Left$',
      'Len', 'Log', 'LTrim', 'LTrim$',
      'Mid', 'Mid$', 'Minute', 'Mod', 'Month',
      'Now', 'Oct', 'Oct$', 'RGB', 'Right', 'Right$', 'Rnd',
      'RTrim', 'RTrim$', 'Second', 'Sgn', 'Sin', 'Space',
      'Space$', 'Spc', 'Split', 'Sqr', 'Str', 'Str$', 'StrComp',
      'String$',
      'Tab', 'Tan', 'Time', 'Time$', 'Trim', 'Trim$',
      'UBound', 'UCase', 'UCase$',
      'Val', 'Year'
    ]

    function_tb = CaseSensitiveListTokenBuilder(functions, 'function', True)
    operand_types.append('function')

    types = [
      'Binary', 'Control', 'Currency', 'Double', 'Dynaset', 'Integer',
      'Long', 'Single', 'String', 'Variant'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = [
      'False', 'True', 'App', 'Base', 'Clipboard', 'Debug', 'Erl', 'Err',
      'Printer', 'Me', 'Nothing', 'Null'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Builder order matters: earlier entries get first chance at each
    # position; the invalid builder is the last resort.
    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      line_continuation_tb,
      integer_tb,
      integer_exponent_tb,
      real_tb,
      real_exponent_tb,
      keyword_tb,
      groupers_tb,
      known_operator_tb,
      types_tb,
      values_tb,
      function_tb,
      variable_tb,
      identifier_tb,
      string_tb,
      remark_tb,
      comment_tb,
      comment2_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['newline'], [], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()
    # A keyword immediately after '.' is a member access, not a keyword.
    self.convert_keywords_to_identifiers(['.'])

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 9
0
  def __init__(self, code, block_comment_limit):
    """Tokenize D source and score how D-like it is.

    Builds the token builders for D syntax (including nested '/+ +/'
    comments and suffixed number/string literals), tokenizes ``code``,
    normalizes the token stream, and runs the inherited Examiner
    confidence calculations.

    Args:
      code: the source text to examine, as a single string.
      block_comment_limit: nesting-depth limit passed to the '/+ +/'
        nested-comment builder.
    """
    super().__init__()

    # Token-group names that count as operands; consumed at the end by
    # calc_operand_n_confidence().
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # NOTE(review): integer/real builders accept "'" as a digit separator,
    # but D uses '_' for digit grouping (the hex/binary builders below do
    # allow '_') -- confirm the "'" separator is intentional.
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['U', 'L', 'LU', 'UL'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l', 'i'], False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    hex_real_tb = HexRealExponentTokenBuilder()
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    operand_types.append('attribute')

    # string suffix: c,w,d
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    r_string_tb = PrefixedStringTokenBuilder('r', True, quotes)
    backtick_string_tb = EscapedStringTokenBuilder(['`'], 0)
    x_string_tb = PrefixedStringTokenBuilder('x', True, quotes)
    q_string_tb = PrefixedStringTokenBuilder('q', True, quotes)
    # q{} string
    cwd_string_tb = SuffixedStringTokenBuilder(quotes, 'cwd', False)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    # D has three comment forms: //, /* */, and nestable /+ +/.
    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    slash_plus_comment_tb = NestedCommentTokenBuilder('/+', '+/', block_comment_limit)

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
      '/', '/=', '.', '..', '...', '&', '&=', '&&', '|', '|=', '||',
      '-', '-=', '--', '+', '+=', '++', '<', '<=', '<<', '<<=', '>', '>=',
      '>>=', '>>>=', '>>', '>>>', '!', '!=', '?', ',', ':', '$',
      '=', '==', '*', '*=', '%', '%=', '^', '^=', '^^', '^^=', '~', '~=',
      '@', '=>', '#',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.unary_operators = [
      '+', '-', '*',
      '!', '&', '~',
      '++', '--', ':',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.postfix_operators = [
      '++', '--', '&', ':'
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
      'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
      'body', 'break', 'case', 'cast', 'catch', 'class', 'const', 'continue',
      'debug', 'default', 'delegate', 'deprecated', 'do',
      'else', 'enum', 'export', 'extern',
      'final', 'finally', 'for', 'foreach', 'foreach_reverse', 'function',
      'goto',
      'if', 'immutable', 'import', 'in', 'inout', 'interface', 'invariant',
      'lazy',
      'macro', 'mixin', 'module',
      'nothrow',
      'out', 'override',
      'package', 'pragma', 'private', 'protected', 'public', 'pure',
      'ref', 'return',
      'scope', 'shared', 'static', 'struct', 'switch', 'synchronized',
      'template', 'throw', 'try', 'typeid',
      'union', 'unittest', 'version', 'while', 'with',
      '__gshared', '__traits', '__vector', '__parameters'
]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
      'bool', 'byte', 'cdouble', 'cent', 'cfloat', 'char', 'creal',
      'dchar', 'double', 'float', 'idouble', 'ifloat', 'int', 'ireal',
      'long', 'real', 'short', 'ubyte', 'ucent', 'uint', 'ulong', 'ushort',
      'void', 'wchar'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = [
      'false', 'null', 'super', 'this', 'true',
      '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
      '__FUNCTION__', '__PRETTY_FUNCTION__',
      '__DATE__', '__EOF__', '__TIME__','__TIMESTAMP__',
      '__VENDOR__', '__VERSION__'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Builder order matters: earlier entries get first chance at each
    # position; the invalid builder is the last resort.
    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      line_continuation_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      real_exponent_tb,
      suffixed_real_tb,
      hex_real_tb,
      keyword_tb,
      types_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      identifier_tb,
      attribute_tb,
      class_type_tb,
      string_tb,
      r_string_tb,
      x_string_tb,
      backtick_string_tb,
      q_string_tb,
      cwd_string_tb,
      slash_slash_comment_tb,
      slash_star_comment_tb,
      slash_plus_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    # Glue literal-suffix identifiers back onto the preceding number/string
    # token (e.g. 10 + UL, "abc" + w).
    number_suffixes = ['f', 'F', 'i', 'I', 'u', 'U', 'l', 'L', 'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU']
    tokens = self.combine_tokens_and_adjacent_types(tokens, 'number', 'identifier', number_suffixes)

    string_suffixes = ['c', 'w', 'd']
    self.tokens = self.combine_tokens_and_adjacent_types(tokens, 'string', 'identifier', string_suffixes)

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 10
0
    def __init__(self, code):
        """Tokenize Visual Basic .NET source and score how VB.NET-like it is.

        Builds the token builders for VB.NET syntax (including '#'
        preprocessor directives), tokenizes *code*, normalizes the token
        stream, and runs the inherited Examiner confidence calculations.

        Args:
            code: the source text to examine, as a single string.
        """
        super().__init__()

        # Token-group names that count as operands; consumed at the end by
        # calc_operand_n_confidence().
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        # VB continues a statement with a trailing underscore.
        line_continuation_tb = SingleCharacterTokenBuilder(['_'],
                                                           'line continuation',
                                                           False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        operand_types.append('number')

        # VB type-declaration sigils: $ (string), % (integer), # (double),
        # ! (single).
        variable_tb = VisualBasicVariableTokenBuilder('$%#!')
        operand_types.append('variable')

        leads = '_'
        extras = '_'
        suffixes = '$%#!'
        identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
        operand_types.append('identifier')

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        # Comments: REM statements, plus apostrophe (and the typographic
        # apostrophe, seen in copy-pasted source) to end of line.
        remark_tb = RemarkTokenBuilder()
        comment_tb = LeadToEndOfLineTokenBuilder("'", True, 'comment')
        comment2_tb = LeadToEndOfLineTokenBuilder("’", True, 'comment')

        directives = [
            '#If', '#Else', '#ElseIf', '#End If', '#ExternalSource', '#Line',
            '#Region', '#End Region', '#Const'
        ]

        preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)

        known_operators = [
            '&', '&=', '*', '*=', '/', '/=', '\\', '\\=', '^', '^=', '+', '+=',
            '-', '-=', '>>', '>>=', '<<', '<<=', '.', '=', '<', '<=', '>',
            '>=', '<>', 'AddressOf', 'And', 'AndAlso', 'In', 'Is', 'IsNot',
            'Like', 'Or', 'OrElse', 'Xor'
        ]

        self.unary_operators = ['+', '-', 'Not', 'IsNot']

        groupers = ['(', ')', ',', '[', ']']
        group_starts = ['(', '[', ',']
        group_mids = [',']
        group_ends = [')', ']']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # NOTE(review): the multi-word entries ('For Each', 'New Constraint',
        # 'New Operator') can never match a single token -- confirm intent.
        keywords = [
            'AddHandler', 'Alias', 'As', 'ByRef', 'ByVal', 'Call', 'Case',
            'Catch', 'Class', 'Const', 'Continue', 'Declare', 'Default',
            # FIX: comma added after 'Do'; it previously concatenated with
            # 'Each' into the bogus keyword 'DoEach', so neither matched.
            'Delegate', 'Dim', 'DirectCast', 'Do',
            'Each', 'Else', 'ElseIf', 'End', 'Enum', 'Erase', 'Error', 'Event',
            'Finally', 'For', 'For Each', 'Friend', 'Function', 'Get',
            'GetType', 'GetXMLNamespace', 'Global', 'GoSub', 'GoTo', 'Handles',
            'If', 'Implements', 'Imports', 'Inherits', 'Interface', 'Let',
            'Lib', 'Loop', 'Module', 'MustInherit', 'MustOverride',
            'Namespace', 'Narrowing', 'New Constraint', 'New Operator', 'Next',
            'NotInheritable', 'NotOverridable', 'Of', 'On', 'Operator',
            'Option', 'Optional', 'Out', 'Overloads', 'Overridable',
            'Overrides', 'ParamArray', 'Partial', 'Private', 'Property',
            'Protected', 'Public', 'RaiseEvent', 'ReadOnly', 'ReDim', 'REM',
            'RemoveHandler', 'Resume', 'Return', 'Select', 'Set', 'Shadows',
            'Shared', 'Static', 'Step', 'Stop', 'Structure', 'Sub', 'SyncLock',
            'Then', 'Throw', 'To', 'Try', 'TryCast', 'TypeOf', 'Using', 'Wend',
            'When', 'While', 'Widening', 'With', 'WithEvents', 'WriteOnly'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        functions = [
            'Asc', 'AscW', 'Chr', 'ChrW', 'Filter', 'Format', 'GetChar',
            'InStr', 'InStrRev', 'Join', 'LCase', 'Left', 'Len', 'LSet',
            'LTrim', 'Mid', 'Replace', 'Right', 'RSet', 'RTrim', 'Space',
            'Split', 'StrComp', 'StrConv', 'StrDup', 'StrReverse', 'Trim',
            'UCase'
        ]

        function_tb = CaseSensitiveListTokenBuilder(functions, 'function',
                                                    True)

        types = [
            'Boolean',
            'Byte',
            'CBool',
            'CByte',
            'CChar',
            'CDate',
            'CDbl',
            'CDec',
            'Char',
            'CInt',
            'CLng',
            'CObj',
            'CSByte',
            'CShort',
            'CSng',
            'CStr',
            'CType',
            'CUInt',
            'CULng',
            'CUShort',
            'Date',
            'Decimal',
            'Double',
            'Integer',
            'Long',
            'Object',
            'SByte',
            'Short',
            'Single',
            'String',
            'UInteger',
            'ULong',
            'UShort',
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['False', 'True', 'Nothing', 'MyBase', 'MyClass']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Builder order matters: earlier entries get first chance at each
        # position; the invalid builder is the last resort.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, keyword_tb,
            groupers_tb, known_operator_tb, types_tb, values_tb, function_tb,
            variable_tb, identifier_tb, string_tb, remark_tb, comment_tb,
            comment2_tb, preprocessor_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(tokens, ['newline'], [],
                                                   ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()
        # A keyword or function name after '.' is a member access.
        self.convert_keywords_to_identifiers(['.'])
        self.convert_functions_to_identifiers()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 11
0
    def __init__(self, code):
        """Tokenize Pascal source and score how Pascal-like it is.

        Builds the token builders for Pascal syntax (case-insensitive
        keywords and word operators, '{ }' and '(* *)' comments), tokenizes
        *code*, normalizes the token stream, and runs the inherited Examiner
        confidence calculations.

        Args:
            code: the source text to examine, as a single string.
        """
        super().__init__()

        # Token-group names that count as operands; consumed at the end by
        # calc_operand_n_confidence().
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        # Turbo/Free Pascal literal prefixes: $ hex, & octal, % binary,
        # # character ordinal.
        hex_constant_tb = PrefixedIntegerTokenBuilder(
            '$', True, '0123456789ABCDEFabcdef')
        octal_constant_tb = PrefixedIntegerTokenBuilder('&', True, '01234567')
        binary_constant_tb = PrefixedIntegerTokenBuilder('%', True, '01')
        char_constant_tb = PrefixedIntegerTokenBuilder('#', True, '0123456789')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ["'"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        brace_comment_tb = BraceCommentTokenBuilder()
        paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

        # FIX: removed a duplicate '~' entry (it appeared both after '|'
        # and after '^'); the repeated entry was redundant.
        known_operators = [
            '+', '-', '*', '/', '=', '<>', '>', '>=', '<', '<=', 'and', 'or',
            'not', '&', '|', '~', '<<', '>>', ':=', '^', '@', '.', ':',
            '..', 'div', 'mod', 'shl', 'shr', 'in'
        ]

        known_operator_tb = CaseInsensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'not', '@', '^', '.']

        # '^' also dereferences a pointer when written after the operand.
        self.postfix_operators = ['^']

        groupers = ['(', ')', ',', '[', ']']
        group_starts = ['(', '[', ',']
        group_mids = [',']
        group_ends = [')', ']']

        groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'begin', 'break', 'case', 'const', 'do', 'downto', 'else', 'end',
            'for', 'forward', 'function', 'goto', 'if', 'label', 'of',
            'otherwise', 'packed', 'procedure', 'program', 'repeat', 'reset',
            'then', 'to', 'type', 'until', 'uses', 'value', 'var', 'while',
            'with'
        ]

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        types = [
            'array', 'boolean', 'char', 'file', 'integer', 'real', 'record',
            'set', 'string'
        ]

        types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'nil', 'true']

        values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Builder order matters: earlier entries get first chance at each
        # position; the invalid builder is the last resort.
        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, char_constant_tb,
            keyword_tb, types_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, string_tb, brace_comment_tb, paren_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = self.combine_identifier_colon(
            tokens, ['statement separator'], ['begin'],
            ['whitespace', 'comment', 'newline', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()
        self.convert_identifiers_to_labels_2()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'identifier', 'variable']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['begin', 'record', 'case'],
                                             ['end'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 12
0
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        line_continuation_tb = CBasicLineContinuationTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, False)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        hex_constant_tb = CBasicSuffixedIntegerTokenBuilder(
            '0123456789ABCDEF', 'H')
        binary_constant_tb = CBasicSuffixedIntegerTokenBuilder('01', 'B')
        operand_types.append('number')

        variable_tb = CBasicVariableTokenBuilder('%$')
        operand_types.append('variable')

        quotes = ['"']
        string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
        operand_types.append('string')

        remark_tb = RemarkTokenBuilder()
        comment_tb = LeadToEndOfLineTokenBuilder("'", False, 'comment')
        comment2_tb = LeadToEndOfLineTokenBuilder("’", False, 'comment')

        stmt_separator_tb = SingleCharacterTokenBuilder(
            ':', 'statement separator', False)

        known_operators = [
            '+', '-', '*', '/', '^', '=', '>', '>=', '<', '<=', '<>', '#',
            'NOT', 'AND', 'EQ', 'GE', 'GT', 'LE', 'LT', 'NE', 'OR', 'XOR'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '#', 'NOT']

        groupers = ['(', ')', ',', ';']
        group_starts = ['(', ',']
        group_mids = [',']
        group_ends = [')']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'AS', 'BUFF', 'CALL', 'CHAIN', 'CLOSE', 'COMMON', 'CONSOLE',
            'CREATE', 'DATA', 'DEF', 'DELETE', 'DIM', 'ELSE', 'END', 'FEND',
            'FILE', 'FOR', 'GOSUB', 'GO', 'GOTO', 'IF', 'INITIALIZE', 'INPUT',
            'INTEGER', 'LET', 'LINE', 'LPRINTER', 'NEXT', 'ON', 'OPEN', 'OUT',
            'POKE', 'PRINT', 'RANDOMIZE', 'READ', 'REM', 'REMARK', 'RENAME',
            'RESTORE', 'RETURN', 'SAVEMEM', 'STEP', 'STOP', 'SUB', 'THEN',
            'TO', 'USING', 'WEND', 'WHILE', 'WIDTH', 'GRAPHIC', 'MAT', 'FILL',
            'MAT', 'MARKER', 'PLOT', 'CHARACTER', 'HEIGHT', 'SET', 'ASK',
            'COLOR', 'COUNT', 'JUSTIFY', 'LINE', 'STYLE', 'TYPE', 'TEXT',
            'ANGLE', 'BOUNDS', 'DEVICE', 'VIEWPORT', 'WINDOW', 'BEAM', 'CLEAR',
            'CLIP', 'POSITION'
        ]

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        label_tb = CBasicLabelTokenBuilder(keywords)

        functions = [
            'ASC', 'CHR$', 'STR$', 'TAB', 'COMMAND$', 'CONCHAR%', 'CONSTAT%',
            'ATN', 'COS', 'SIN', 'TAN', 'ABS', 'EXP', 'INT', 'FLOAT', 'LOG',
            'RND', 'SGN', 'SQR', 'LEFT$', 'LEN', 'MID$', 'RIGHT$', 'MATCH',
            'VAL', 'FRE', 'INP', 'INT%', 'PEEK', 'POS', 'TAB', 'RECL', 'RECS',
            'SADD', 'SIZE', 'UCASE$', 'VARPTR'
        ]

        function_tb = CaseInsensitiveListTokenBuilder(functions, 'function',
                                                      True)
        operand_types.append('function')

        directives = [
            '%LIST', '%NOLIST', '%PAGE', '%EJECT', '%INCLUDE', '%CHAIN'
        ]

        directive_tb = CaseInsensitiveListTokenBuilder(directives, 'directive',
                                                       False)

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, stmt_separator_tb,
            integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
            hex_constant_tb, binary_constant_tb, keyword_tb, known_operator_tb,
            function_tb, variable_tb, label_tb, groupers_tb, string_tb,
            remark_tb, comment_tb, comment2_tb, directive_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.convert_numbers_to_line_numbers()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
# Esempio n. 13 ("Example 13" — separator text left over from the source aggregator)
    def __init__(self, code, year, extension):
        """Tokenize COBOL source text and compute language-confidence scores.

        code: the source text to examine.
        year: COBOL standard year ('2002' or '2014'), or None for earlier
            standards; any other value raises CodeStatException.
        extension: file extension used as a dialect hint; 'acu', 'ibm', and
            'gnu' (case-insensitive) each enable an extra keyword set.
        """
        super().__init__()

        if year is not None and year not in ['2002', '2014']:
            raise CodeStatException('Unknown year for language')

        operand_types = []

        # Builders for whitespace and numeric literals.
        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, True, None)
        real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', None)

        identifier_tb = CobolIdentifierTokenBuilder()

        # COBOL strings use doubled ("stuffed") quotes for escaping; the
        # N/NX prefixes mark national and hex-national literals.
        quotes = ['"', "'", "’"]
        string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
        n_string_tb = PrefixedStringTokenBuilder('N', False, quotes)
        nx_string_tb = PrefixedStringTokenBuilder('NX', False, quotes)

        # PICTURE clauses (including the trailing-CR variant) get their own
        # token builders so they are not misread as identifiers/numbers.
        picture_tb = PictureTokenBuilder()
        cr_picture_tb = CRPictureTokenBuilder()

        inline_comment_tb = LeadToEndOfLineTokenBuilder('*>', True, 'comment')
        star_comment_tb = AsteriskCommentTokenBuilder()

        # The period ends a COBOL sentence.
        terminators_tb = SingleCharacterTokenBuilder('.',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE', '+', '-', '*', '/', '**',
            '=', '<>', '>', '>=', '<', '<=', 'AND', 'OR', 'NOT', 'B-AND',
            'B-NOT', 'B-OR', 'B-XOR', ':'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'NOT']

        groupers = ['(', ')', ',']
        group_starts = ['(']
        group_mids = [',']
        # group_ends = [')']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        # Base (COBOL-85-era) reserved words; dialect/standard lists below
        # are appended according to 'year' and 'extension'.
        keywords = [
            'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
            'ALPHABET', 'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
            'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALSO', 'ALTER',
            'ALTERNATE', 'AND', 'ANY', 'APPLY', 'ARE', 'AREA', 'AREAS',
            'ASCENDING', 'ASSIGN', 'AT', 'AUTHOR', 'BEFORE', 'BEGINNING',
            'BELL', 'BINARY', 'BLOCK', 'BOTTOM', 'BY', 'BYTE-LENGTH', 'CALL',
            'CANCEL', 'CBL', 'CD', 'CF', 'CH', 'CHARACTER', 'CHARACTERS',
            'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'CODE-SET', 'COL',
            'COLLATING', 'COLS', 'COLUMN', 'COMMA', 'COMMON', 'COMMUNICATION',
            'COMP', 'COMPUTATIONAL', 'COMPUTE', 'CONFIGURATION', 'CONTAINS',
            'CONTENT', 'CONTINUE', 'CONTROL', 'CONTROLS', 'CONVERTING', 'COPY',
            'CORR', 'CORRESPONDING', 'COUNT', 'CURRENCY', 'DATA', 'DATE',
            'DATE-COMPILED', 'DATE-WRITTEN', 'DAY', 'DAY-OF-WEEK', 'DE',
            'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE', 'DEBUG-NAME',
            'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3', 'DECIMAL-POINT',
            'DECLARATIVES', 'DELETE', 'DELIMITED', 'DELIMITER', 'DEPENDING',
            'DESCENDING', 'DESTINATION', 'DISABLE', 'DIVIDE', 'DIVISION',
            'DOWN', 'DUPLICATES', 'DYNAMIC', 'EGI', 'ELSE', 'EMI', 'ENABLE',
            'END', 'END-ACCEPT', 'END-ADD', 'END-CALL', 'END-COMPUTE',
            'END-DELETE', 'END-DISPLAY', 'END-DIVIDE', 'END-EVALUATE',
            'END-EXEC', 'END-IF', 'END-MULTIPLY', 'END-OF-PAGE', 'END-PERFORM',
            'END-READ', 'END-RECEIVE', 'END-RETURN', 'END-REWRITE',
            'END-SEARCH', 'END-START', 'END-STRING', 'END-SUBTRACT',
            'END-UNSTRING', 'END-WRITE', 'ENTER', 'ENVIRONMENT', 'EOL', 'EOP',
            'EQUAL', 'ERROR', 'ESI', 'EVALUATE', 'EVERY', 'EXCEPTION', 'EXEC',
            'EXIT', 'EXTEND', 'EXTERNAL', 'FD', 'FILE', 'FILE-CONTROL',
            'FILLER', 'FINAL', 'FIRST', 'FOOTING', 'FOR', 'FROM', 'FULL',
            'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
            'HEADING', 'HIGH-VALUE', 'HIGH-VALUES', 'I-O', 'I-O-CONTROL',
            'IDENTIFICATION', 'IF', 'IN', 'INDEX', 'INDEXED', 'INDICATE',
            'INITIAL', 'INITIALIZE', 'INITIATE', 'INPUT', 'INPUT-OUTPUT',
            'INSPECT', 'INSTALLATION', 'INTO', 'INVALID', 'IS', 'JUST',
            'JUSTIFIED', 'KEY', 'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH',
            'LESS', 'LIMIT', 'LIMITS', 'LINAGE', 'LINAGE-COUNTER', 'LINE',
            'LINE-COUNTER', 'LINES', 'LINKAGE', 'LOCK', 'LOW-VALUE',
            'LOW-VALUES', 'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES',
            'MOVE', 'MULTIPLE', 'MULTIPLY', 'NATIVE', 'NEGATIVE', 'NEXT',
            'NOT', 'NUMBER', 'NUMBERS', 'NUMERIC', 'NUMERIC-EDITED',
            'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED', 'OPEN', 'OPTIONAL',
            'OR', 'ORDER', 'ORGANIZATION', 'OTHER', 'OUTPUT', 'OVERFLOW',
            'PACKED-DECIMAL', 'PADDING', 'PAGE', 'PAGE-COUNTER', 'PARAGRAPH',
            'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE', 'PLUS', 'POINTER',
            'POSITION', 'POSITIVE', 'PRINTING', 'PROCEDURE', 'PROCEDURES',
            'PROCEED', 'PROGRAM', 'PROGRAM-ID', 'PURGE', 'QUEUE', 'QUOTE',
            'QUOTES', 'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS',
            'REDEFINES', 'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE',
            'REMAINDER', 'REMOVAL', 'RENAMES', 'REPLACE', 'REPLACING',
            'REPORT', 'REPORTING', 'REPORTS', 'RERUN', 'RESERVE', 'RESET',
            'RESUME', 'RETRY', 'RETURN', 'REVERSED', 'REWIND', 'REWRITE', 'RF',
            'RH', 'RIGHT', 'ROUNDED', 'RUN', 'SAME', 'SD', 'SEARCH', 'SECTION',
            'SECURE', 'SECURITY', 'SEGMENT', 'SEGMENT-LIMIT', 'SELECT', 'SEND',
            'SENTENCE', 'SEPARATE', 'SEQUENCE', 'SEQUENTIAL', 'SET', 'SIGN',
            'SIZE', 'SORT', 'SORT-MERGE', 'SOURCE', 'SOURCE-COMPUTER',
            'SPECIAL-NAMES', 'STANDARD', 'STANDARD-1', 'STANDARD-2', 'START',
            'STATUS', 'STOP', 'STRING', 'SUB-QUEUE-1', 'SUB-QUEUE-2',
            'SUB-QUEUE-3', 'SUBTRACT', 'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC',
            'SYNCHRONIZED', 'TABLE', 'TALLY', 'TALLYING', 'TAPE', 'TERMINAL',
            'TERMINATE', 'TEST', 'TEXT', 'THAN', 'THEN', 'THROUGH', 'THRU',
            'TIME', 'TIMES', 'TITLE', 'TO', 'TOP', 'TRAILING', 'TYPE', 'UNIT',
            'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
            'VALUE', 'VALUES', 'VARYING', 'WHEN', 'WITH', 'WORDS',
            'WORKING-STORAGE', 'WRITE'
        ]

        # Words added by the COBOL 2002 standard.
        # NOTE(review): 'NATIVE_BINARY' uses an underscore; COBOL reserved
        # words are hyphenated, so this looks like it should be
        # 'NATIVE-BINARY' — confirm against the 2002 word list.
        keywords_2002 = [
            'ACTIVE-CLASS',
            'ALIGNED',
            'ALLOCATE',
            'ANYCASE',
            'ARITHMETIC',
            'AUTO',
            'AUTOMATIC',
            'BACKGROUND-COLOR',
            'BASED',
            'BASIS',
            'BINARY-CHAR',
            'BINARY-DOUBLE',
            'BINARY-LONG',
            'BINARY-SHORT',
            'BIT',
            'BLINK',
            'BOOLEAN',
            'CENTER',
            'CLASS',
            'CLASS-ID',
            'CLASSIFICATION',
            'COLUMNS',
            'COM-REG',
            'CONDITION',
            'CONSTANT',
            'CRT',
            'CURSOR',
            'CYCLE',
            'DATA-POINTER',
            'DBCS',
            'DEBUGGING',
            'DETAIL',
            'DISPLAY',
            'DISPLAY-1',
            'DISPLAY-OF',
            'EC',
            'EGCS',
            'EJECT',
            'END-INVOKE',
            'ENDING',
            'ENTRY-CONVENTION',
            'ENTRY-FIELD',
            'EO',
            'EOS',
            'ERASE',
            'EXCEPTION-OBJECT',
            'EXCLUSIVE',
            'EXPANDS',
            'EXTERN',
            'FACTORY',
            'FLOAT-EXTENDED',
            'FLOAT-LONG',
            'FLOAT-SHORT',
            'FOREGROUND-COLOR',
            'FOREVER',
            'FORMAT',
            'FREE',
            'FUNCTION',
            'FUNCTION-ID',
            'GET',
            'GROUP-USAGE',
            'HIGHLIGHT',
            'IGNORING',
            'IMPLEMENTS',
            'INHERITS',
            'INITIALIZED',
            'INSERT',
            'INTERFACE',
            'INTERFACE-ID',
            'INTRINSIC',
            'INVOKE',
            'KANJI',
            'LC_ALL',
            'LC_COLLATE',
            'LC_CTYPE',
            'LC_MESSAGES',
            'LC_MONEY',
            'LC_NUMERIC',
            'LC_TIME',
            'LOCAL-STORAGE',
            'LOCALE',
            'LOWLIGHT',
            'MANUAL',
            'METACLASS',
            'METHOD',
            'METHOD-ID',
            'MINUS',
            'MORE-LABELS',
            'NATIONAL',
            'NATIONAL-EDITED',
            'NATIONAL-OF',
            'NATIVE_BINARY',
            'NESTED',
            'NEW',
            'NONE',
            'NORMAL',
            'OBJECT',
            'OBJECT-REFERENCE',
            'ONLY',
            'OPTIONS',
            'OVERRIDE',
            'PHYSICAL',
            'PRESENT',
            'PREVIOUS',
            'PROCEDURE-POINTER',
            'PROCESSING',
            'PROGRAM-POINTER',
            'PROPERTY',
            'PROTOTYPE',
            'RAISE',
            'RAISING',
            'READY',
            'RECURSIVE',
            'REFERENCES',
            'RELATION',
            'RELOAD',
            'REPOSITORY',
            'REQUIRED',
            'RETURN-CODE',
            'RETURNING',
            'ROUNDING',
            'SCREEN',
            'SECONDS',
            'SERVICE',
            'SHARING',
            'SHIFT-IN',
            'SHIFT-OUT',
            'SIGNED',
            'SKIP1',
            'SKIP2',
            'SKIP3',
            'SORT-CONTROL',
            'SORT-CORE-SIZE',
            'SORT-FILE-SIZE',
            'SORT-MESSAGE',
            'SORT-MODE-SIZE',
            'SORT-RETURN',
            'SOURCES',
            'STATEMENT',
            'STEP',
            'STRONG',
            'SYMBOL',
            'SYSTEM-DEFAULT',
            'TRACE',
            'TYPEDEF',
            'UCS-4',
            'UNDERLINE',
            'UNIVERSAL',
            'UNLOCK',
            'UNSIGNED',
            'USER-DEFAULT',
            'UTF-16',
            'UTF-8',
            'VAL-STATUS',
            'VALID',
            'VALIDATE',
            'VALIDATE-STATUS',
            'WHEN-COMPILED',
            'WRITE-ONLY',
            'YYYYDDD',
            'YYYYMMDD',
        ]

        # Words added by the COBOL 2014 standard.
        keywords_2014 = [
            'AWAY-FROM-ZERO', 'NEAREST-AWAY-FROM-ZERO', 'NEAREST-EVEN',
            'NEAREST-TOWARD-ZERO', 'TOWARD-GREATER', 'TOWARD-LESSER',
            'CAPACITY', 'FLOAT-BINARY-128', 'FLOAT-BINARY-32',
            'FLOAT-BINARY-64', 'FLOAT-DECIMAL-16', 'FLOAT-DECIMAL-34',
            'FLOAT-INFINITY', 'FLOAT-NOT-A-NUMBER', 'FUNCTION-POINTER',
            'INTERMEDIATE', 'PHYSICAL', 'PREFIXED', 'PROHIBITED', 'SHORT',
            'STANDARD-BINARY', 'STANDARD-DECIMAL', 'TRUNCATION'
        ]

        # Dialect-specific extensions: IBM, GnuCOBOL, and ACUCOBOL-GT.
        keywords_ibm = ['ABSENT', 'ID', 'PASSWORD', 'UNBOUNDED']

        keywords_gnu = [
            'ARGUMENT-NUMBER', 'ARGUMENT-VALUE', 'ASCII', 'BINARY-C-LONG',
            'BINARY-SEQUENTIAL', 'CARD-PUNCH', 'CARD-READER', 'CASSETTE',
            'CHAIN', 'CHAINING', 'COLOR', 'COMMAND-LINE', 'COMMIT', 'COMP-1',
            'COMP-2', 'COMP-3', 'COMP-4', 'COMP-5', 'COMP-6', 'COMP-X',
            'COMPUTATIONAL-1', 'COMPUTATIONAL-2', 'COMPUTATIONAL-3',
            'COMPUTATIONAL-4', 'COMPUTATIONAL-5', 'COMPUTATIONAL-6',
            'COMPUTATIONAL-X', 'CONVERSION', 'CRT-UNDER', 'DISC', 'DISK',
            'EBCDIC', 'ECHO', 'END-CHAIN', 'ENTRY', 'ENVIRONMENT-NAME',
            'ENVIRONMENT-VALUE', 'ESCAPE', 'F', 'FILE-ID', 'FIXED',
            'FLOAT-DECIMAL-7', 'ID', 'IGNORE', 'KEPT', 'KEYBOARD',
            'LEFT-JUSTIFY', 'LEFTLINE', 'LINE-SEQUENTIAL', 'LOWER',
            'MAGNETIC-TAPE', 'NAME', 'NO-ECHO', 'NOTHING', 'OVERLINE', 'PRINT',
            'PRINTER', 'PRINTER-1', 'PROCEDURE-POINTER', 'PROCEDURES',
            'PROMPT', 'PROTECTED', 'RECORDING', 'REVERSE', 'RIGHT-JUSTIFY',
            'ROLLBACK', 'S', 'SCROLL', 'SIGNED-INT', 'SIGNED-LONG',
            'SIGNED-SHORT', 'SPACE-FILL', 'STATIC', 'STDCALL', 'SYSTEM-OFFSET',
            'TAB', 'TIME-OUT', 'TRAILING-SIGN', 'U', 'UNSIGNED-INT',
            'UNSIGNED-LONG', 'UNSIGNED-SHORT', 'UPDATE', 'UPPER', 'USER', 'V',
            'VARIABLE', 'WAIT', 'WRAP', 'ZERO-FILL'
        ]

        keywords_acu = [
            '3-D', 'ACTION', 'ACTIVE-X', 'ADJUSTABLE-COLUMNS', 'ALIGNMENT',
            'AUTO-DECIMAL', 'AUTO-SPIN', 'BACKGROUND-HIGH', 'BACKGROUND-LOW',
            'BACKGROUND-STANDARD', 'BAR', 'BITMAP', 'BITMAP-END',
            'BITMAP-HANDLE', 'BITMAP-NUMBER', 'BITMAP-START',
            'BITMAP-TRAILING', 'BITMAP-TRANSPARENT-COLOR', 'BITMAP-WIDTH',
            'BOX', 'BOXED', 'BUSY', 'BUTTONS', 'CALENDAR-FONT',
            'CANCEL-BUTTON', 'CELL', 'CELL-COLOR', 'CELL-DATA', 'CELL-FONT',
            'CELL-PROTECTION', 'CENTERED-HEADING', 'CENTURY-DATE', 'CHECK-BOX',
            'CLEAR-SELECTION', 'CLINE', 'CLINES', 'COLORS', 'COLUMN-COLOR',
            'COLUMN-DIVIDERS', 'COLUMN-FONT', 'COLUMN-HEADINGS',
            'COLUMN-PROTECTION', 'COMBO-BOX', 'COPY-SELECTION', 'CSIZE',
            'CURSOR-COL', 'CURSOR-COLOR', 'CURSOR-FRAME-WIDTH', 'CURSOR-ROW',
            'CURSOR-X', 'CURSOR-Y', 'CUSTOM-PRINT-TEMPLATE', 'DASHED',
            'DATA-COLUMNS', 'DATA-TYPES', 'DATE-ENTRY', 'DEFAULT-BUTTON',
            'DEFAULT-FONT', 'DESTROY', 'DISPLAY-COLUMNS', 'DISPLAY-FORMAT',
            'DOTDASH', 'DOTTED', 'DOUBLE', 'DRAG-COLOR', 'DROP-DOWN',
            'DROP-LIST', 'END-COLOR', 'END-MODIFY', 'ENGRAVED',
            'ENSURE-VISIBLE', 'ENTRY-FIELD', 'ENTRY-REASON', 'ESCAPE-BUTTON',
            'EVENT', 'EVENT-LIST', 'EXCEPTION-VALUE', 'EXPAND',
            'EXTERNAL-FORM', 'FILE-NAME', 'FILE-POS', 'FILL-COLOR',
            'FILL-COLOR-2', 'FILL-PERCENT', 'FINISH-REASON', 'FIXED-FONT',
            'FIXED-WIDTH', 'FLAT', 'FLAT-BUTTONS', 'FLOAT', 'FLOATING', 'FONT',
            'FRAME', 'FRAMED', 'FULL-HEIGHT', 'GRID', 'GO-BACK', 'GO-FORWARD',
            'GO-HOME', 'GO-SEARCH', 'GRAPHICAL', 'GRID', 'GROUP-VALUE',
            'HANDLE', 'HAS-CHILDREN', 'HEADING-COLOR', 'HEADING-DIVIDER-COLOR',
            'HEADING-FONT', 'HEAVY', 'HEIGHT-IN-CELLS', 'HIDDEN-DATA',
            'HIGH-COLOR', 'HOT-TRACK', 'HSCROLL', 'HSCROLL-POS', 'ICON',
            'IDENTIFIED', 'INDEPENDENT', 'INQUIRE', 'INSERTION-INDEX',
            'INSERTION-ROWS', 'ITEM', 'ITEM-TEXT', 'ITEM-TO-ADD',
            'ITEM-TO-DELETE', 'ITEM-TO-EMPTY', 'ITEM-VALUE', 'LABEL',
            'LABEL-OFFSET', 'LARGE-FONT', 'LARGE-OFFSET', 'LAST-ROW',
            'LAYOUT-DATA', 'LAYOUT-MANAGER', 'LEADING-SHIFT', 'LEFT-TEXT',
            'LINES-AT-ROOT', 'LIST-BOX', 'LM-RESIZE', 'LONG-DATE', 'LOW-COLOR',
            'LOWERED', 'MASS-UPDATE', 'MAX-LINES', 'MAX-PROGRESS', 'MAX-TEXT',
            'MAX-VAL', 'MEDIUM-FONT', 'MENU', 'MIN-VAL', 'MODIFY', 'MULTILINE',
            'NAVIGATE-URL', 'NEXT-ITEM', 'NO-AUTOSEL', 'NO-AUTO-DEFAULT',
            'NO-BOX', 'NO-DIVIDERS', 'NO-F4', 'NO-FOCUS', 'NO-GROUP-TAB',
            'NO-KEY-LETTER', 'NO-SEARCH', 'NO-UPDOWN', 'NOTAB', 'NOTIFY',
            'NOTIFY-CHANGE', 'NOTIFY-DBLCLICK', 'NOTIFY-SELCHANGE',
            'NUM-COL-HEADINGS', 'NUM-ROWS', 'OK-BUTTON', 'OVERLAP-LEFT',
            'OVERLAP-TOP', 'PAGE-SETUP', 'PAGED', 'PARENT', 'PERMANENT',
            'PIXEL', 'PLACEMENT', 'POP-UP', 'POSITION-SHIFT',
            'PRINT-NO-PROMPT', 'PRINT-PREVIEW', 'PRIORITY', 'PROGRESS',
            'PROPERTIES', 'PROPERTY', 'PUSH-BUTTON', 'QUERY-INDEX',
            'RADIO-BUTTON', 'RAISED', 'READ-ONLY', 'RECORD-DATA',
            'RECORD-TO-ADD', 'RECORD-TO-DELETE', 'REFRESH', 'REGION-COLOR',
            'RESET-GRID', 'RESET-LIST', 'RESET-TABS', 'RIGHT-ALIGN', 'RIMMED',
            'ROW-COLOR', 'ROW-COLOR-PATTERN', 'ROW-DIVIDERS', 'ROW-FONT',
            'ROW-HEADINGS', 'ROW-PROTECTION', 'SAVE-AS', 'SAVE-AS-NO-PROMPT',
            'SCROLL-BAR', 'SEARCH-OPTIONS', 'SEARCH-TEXT', 'SELECT-ALL',
            'SELECTION-INDEX', 'SELECTION-TEXT', 'SELF-ACT', 'SEPARATION',
            'SHADING', 'SHADOW', 'SHORT-DATE', 'SHOW-LINES', 'SHOW-NONE',
            'SHOW-SEL-ALWAYS', 'SMALL-FONT', 'SORT-ORDER', 'SPINNER', 'SQUARE',
            'START-X', 'START-Y', 'STATIC-LIST', 'STATUS-BAR', 'STATUS-TEXT',
            'STYLE', 'SUBWINDOW', 'TAB-TO-ADD', 'TAB-TO-DELETE', 'TEMPORARY',
            'TERMINATION-VALUE', 'THREAD', 'THREADS', 'THUMB-POSITION',
            'TILED-HEADINGS', 'TITLE', 'TITLE-POSITION', 'TRADITIONAL-FONT',
            'TRAILING-SHIFT', 'TRANSPARENT', 'TREE-VIEW', 'UNFRAMED',
            'UNSORTED', 'USE-ALT', 'USE-RETURN', 'USE TAB', 'VALUE-FORMAT',
            'VARIANT', 'VERTICAL', 'VERY-HEAVY', 'VIRTUAL-WIDTH', 'VPADDING',
            'VSCROLL', 'VSCROLL-BAR', 'VSCROLL-POS', 'VTOP', 'WEB-BROWSER',
            'WIDTH', 'WIDTH-IN-CELLS', 'WINDOW', 'X', 'Y'
        ]

        # Merge standard-year and dialect keyword sets into the base list.
        # 2014 builds on 2002, so both lists are added for year '2014'.
        if year in ['2002', '2014']:
            keywords += keywords_2002

        if year == '2014':
            keywords += keywords_2014

        if extension.lower() == 'acu':
            keywords += keywords_acu

        if extension.lower() == 'ibm':
            keywords += keywords_ibm

        if extension.lower() == 'gnu':
            keywords += keywords_gnu

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        # Figurative constants and condition values.
        values = [
            'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS', 'FALSE',
            'NO', 'OFF', 'ON', 'TRUE'
        ]

        values_2002 = ['NULL', 'NULLS', 'SELF', 'SUPER']

        if year in ['2002', '2014']:
            values += values_2002

        value_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        cobol_preprocessor_tb = CobolPreprocessorTokenBuilder()

        # EXEC ... END-EXEC embedded blocks (e.g. SQL/CICS) are one token.
        exec_tb = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

        invalid_token_builder = InvalidTokenBuilder()

        # Order matters: earlier builders win when several could match.
        tokenbuilders = [
            newline_tb,
            whitespace_tb,
            terminators_tb,
            integer_tb,
            integer_exponent_tb,
            real_tb,
            real_exponent_tb,
            picture_tb,
            cr_picture_tb,
            keyword_tb,
            star_comment_tb,  # before operator, to catch single star as comment
            known_operator_tb,
            groupers_tb,
            value_tb,
            identifier_tb,
            string_tb,
            n_string_tb,
            nx_string_tb,
            inline_comment_tb,
            cobol_preprocessor_tb,
            exec_tb,
            self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Collapse runs of invalid tokens so each run counts once.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        # Reclassify numeric tokens that are really PICTURE strings or
        # level numbers.
        self.convert_numbers_to_pictures()
        self.convert_numbers_to_levels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence calculations; results accumulate in self.confidences.
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            # self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # self.calc_operand_n_confidence(tokens, operand_types, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)

        self.calc_picture_confidence()
        expected_keyword_confidence = self.check_expected_keywords()
        self.confidences['expected_keywords'] = expected_keyword_confidence
# Esempio n. 14 ("Example 14" — separator text left over from the source aggregator)
    def __init__(self, code):
        """Tokenize Scala source text and compute language-confidence scores.

        code: the source text to examine.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # Numeric literals; Scala allows the apostrophe as a digit separator
        # here, plus 0x hex, L-suffixed longs, and f-suffixed floats.
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF_')
        long_integer_tb = SuffixedIntegerTokenBuilder('L', False, None)
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        float_real_tb = SuffixedRealTokenBuilder(False, False, ['f'], False,
                                                 None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # Scala symbol literals: 'name
        symbol_tb = PrefixedIdentifierTokenBuilder("'", 'symbol', True)
        operand_types.append('symbol')

        # Double-quoted strings with escapes, and triple-quoted raw strings.
        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        # Includes both ASCII and Unicode forms of arrow operators
        # ('=>'/'⇒', '<-'/'←').
        # Fix: the original list contained '=' twice; the duplicate entry
        # has been removed (membership lists need each operator only once).
        known_operators = [
            '+', '-', '*', '/', '%', '&', '|', '^', '<<', '>>', '&&', '||',
            '=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
            '>:', '⇒', '=>', '<%', '<:', '←', '<-', '#', '@', '==', '!=',
            '>', '<', '>=', '<=', '!', '~', '<<<', '>>>', '.', '++', '--',
            'new'
        ]

        self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'abstract', 'case', 'catch', 'class', 'def', 'do', 'else',
            'extends', 'final', 'finally', 'for', 'forSome', 'if', 'implicit',
            'import', 'lazy', 'match', 'object', 'override', 'package',
            'private', 'protected', 'return', 'sealed', 'then', 'throw',
            'trait', 'try', 'type', 'val', 'var', 'while', 'with', 'yield'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        values = ['false', 'true', 'null', 'this', 'super']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Order matters: earlier builders win when several could match.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, long_integer_tb,
            real_tb, real_exponent_tb, float_real_tb, keyword_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, symbol_tb,
            string_tb, triple_string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Collapse runs of invalid tokens so each run counts once.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence calculations; results accumulate in self.confidences.
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# Esempio n. 15 ("Example 15" — separator text left over from the source aggregator)
    def __init__(self, code, version):
        """Tokenize MATLAB/Octave source and compute confidence scores.

        code: the source text to examine.
        version: 'octave' enables Octave-only operators, keywords, and
            '#'-style comments; any other value means plain MATLAB.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # '!cmd' shell escapes and '?Class' metaclass references.
        command_tb = PrefixedIdentifierTokenBuilder('!', 'command', False)

        metaclass_tb = PrefixedIdentifierTokenBuilder('?', 'metaclass', False)

        quotes = ['"', "'", "’"]
        string_tb = MatlabStringTokenBuilder(quotes, False)
        operand_types.append('string')

        # '%' comments are MATLAB; '#' comments are Octave-only and added
        # to the builder list conditionally below.
        line_comment_m_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')
        line_comment_o_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')
        block_comment_m_tb = BlockTokenBuilder('%{', '%}', 'comment')
        block_comment_o_tb = BlockTokenBuilder('#{', '#}', 'comment')

        line_continuation_tb = KeywordTokenBuilder('...', 'line continuation')

        known_operators = [
            '+', '-', '.*', '*', './', '/', '\\', '.^', '^', ".'", "'", '=',
            '==', '~=', '>', '>=', '<', '<=', '&', '|', '&&', '||', '~', '@',
            '.', '.?'
        ]

        operators_octave = [
            '++', '--', '+=', '-=', '*=', '/=', '^=', '!', '!=', '**'
        ]

        if version == 'octave':
            known_operators += operators_octave

        self.unary_operators = ['+', '-', '~', '@']

        # The quote is the transpose operator when used postfix.
        self.postfix_operators = ["'"]

        groupers = ['(', ')', ',', '[', ']', '{', '}', ';', ':']
        group_starts = ['(', '[', ',', '{']
        # group_mids = [',', ';', ':']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'break', 'case', 'catch', 'classdef', 'continue', 'else', 'elseif',
            'end', 'for', 'function', 'global', 'if', 'otherwise', 'parfor',
            'persistent', 'return', 'spmd', 'switch', 'try', 'while'
        ]

        keywords_octave = ['endfor', 'endif', 'endwhile']

        if version == 'octave':
            keywords += keywords_octave

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # Fix: the list previously contained 'Nan', which MATLAB does not
        # define; the builder is case-sensitive, so real 'NaN' tokens were
        # never recognized. MATLAB spells these 'Inf'/'inf' and 'NaN'/'nan'.
        values = ['inf', 'Inf', 'nan', 'NaN']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Order matters: earlier builders win when several could match.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, values_tb, groupers_tb,
            known_operator_tb, identifier_tb, command_tb, metaclass_tb,
            string_tb, line_comment_m_tb, block_comment_m_tb
        ]

        tokenbuilders_2 = [line_comment_o_tb, block_comment_o_tb]

        if version == 'octave':
            tokenbuilders += tokenbuilders_2

        # Fallback builders always go last.
        tokenbuilders_9 = [self.unknown_operator_tb, invalid_token_builder]

        tokenbuilders += tokenbuilders_9

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Collapse runs of invalid tokens so each run counts once.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence calculations; results accumulate in self.confidences.
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        # self.calc_group_confidence(tokens, group_mids)

        # operand_types_2 = ['number']
        # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 16
0
    def __init__(self, code, tab_size, wide):
        """Examine PL/I-family source text.

        The code is tokenized twice -- once as free-format text and once
        as fixed-format (column-aware) text -- and token statistics and
        confidence values are computed for each pass.  The pass whose
        confidence factors multiply to the higher product supplies the
        final tokens, statistics, confidences, and errors.

        Args:
            code: source text to examine
            tab_size: tab expansion width for the fixed-format pass
            wide: flag forwarded to the fixed-format tokenizer
        """
        super().__init__()

        self.operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numeric literals: plain and exponent forms, plus B/H/O/D-suffixed
        # binary/hex/octal/decimal integers and B-suffixed binary reals
        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False, None)
        hex_integer_tb = SuffixedIntegerTokenBuilder(['H'], False, 'ABCDEF')
        octal_integer_tb = SuffixedIntegerTokenBuilder(['O'], False, None)
        decimal_integer_tb = SuffixedIntegerTokenBuilder(['D'], False, None)
        real_tb = RealTokenBuilder(True, False, None)
        real_exponent_tb = RealExponentTokenBuilder(True, False, 'E', None)
        binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'], False,
                                                  None)
        self.operand_types.append('number')

        # identifiers may start with and contain underscores
        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        self.operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        self.operand_types.append('string')

        label_tb = PL1LabelTokenBuilder()
        self.operand_types.append('label')

        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        # preprocessor directives recognized as single tokens
        directives = [
            '%ACTIVATE', '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY',
            '%DO', '%ELSE', '%END', '%FATAL', '%GOTO', '%IF', '%INCLUDE',
            '%LIST', '%NOLIST', '%PAGE', '%PROCEDURE', '%PROC', '%REPLACE',
            '%RETURN', '%THEN'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        preprocessor_tb = CaseInsensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        # directives that consume the remainder of their line
        title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True, 'preprocessor')
        subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True,
                                                  'preprocessor')
        error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True, 'preprocessor')
        warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True, 'preprocessor')
        inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True,
                                                'preprocessor')
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '**', '>', '<', '=', '>=', '<=', '<>', '^>',
            '^<', '^=', '^', '~>', '~<', '~=', '~', '&', '&:', ':=', '|', '|:',
            '||', '!', '!:', '!!', ':', '@', 'NOT', 'AND', 'OR', 'XOR',
            'MINUS', 'PLUS', 'MOD'
        ]

        self.unary_operators = ['+', '-', '^', '~', '@', 'NOT']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        self.group_starts = ['(', '[', ',', '{']
        self.group_mids = [',']
        self.group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'ADDRESS', 'AT', 'BASED', 'BY', 'CALL', 'CASE', 'CLOSE', 'DATA',
            'DECLARE', 'DISABLE', 'DO', 'ELSE', 'ENABLE', 'END', 'EOF',
            'EXTERNAL', 'GO', 'GOTO', 'HALT', 'IF', 'INITIAL', 'INTERRUPT',
            'LABEL', 'LITERALLY', 'OFFSET', 'ON', 'OPEN', 'OTHERWISE', 'OTHER',
            'PROCEDURE', 'PROC', 'PUBLIC', 'READ', 'REENTRANT', 'RETURN',
            'SELECTOR', 'STRUCTURE', 'THEN', 'TO', 'WHILE', 'WRITE'
        ]

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        attributes = [
            'ALIGNED', 'ANY', 'AREA', 'BASED', 'BUILTIN', 'CONDITION', 'COND',
            'CONTROLLED', 'CTL', 'DEFINED', 'DEF', 'DIRECT', 'ENTRY',
            'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT', 'FILE', 'GLOBALDEF',
            # NOTE(fix): comma added after 'INT'; the missing comma made
            # Python concatenate 'INT' and 'KEYED' into a bogus 'INTKEYED'
            'GLOBALREF', 'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
            'KEYED', 'LABEL', 'LIKE', 'LIST', 'MEMBER', 'NONVARYING', 'NONVAR',
            'OPTIONAL', 'OPTIONS', 'OUTPUT', 'PARAMETER', 'PARM', 'PICTURE',
            'PIC', 'POSITION', 'POS', 'PRECISION', 'PREC', 'PRINT', 'READONLY',
            'RECORD', 'REFER', 'RETURNS', 'SEQUENTIAL', 'SEQL', 'STATIC',
            'STREAM', 'STRUCTURE', 'TRUNCATE', 'UNALIGNED', 'UNAL', 'UNION',
            'UPDATE', 'VARIABLE', 'VARYING', 'VAR'
        ]

        attributes_tb = CaseInsensitiveListTokenBuilder(
            attributes, 'attribute', False)

        # NOTE(review): 'INT SIZE' and 'INWORD SIZE' contain embedded
        # spaces -- presumably intentional multi-word names; confirm
        functions = [
            'ABS', 'ADJUSTRPL', 'BLOCKINPUT', 'BLOCKINWORD', 'BLOCKINDWORD',
            'BLOCKOUTPUT', 'BLOCKOUTWORD', 'BLOCKOUTDWORD', 'BUILDPTR',
            'BYTESWAP', 'CMPD', 'CARRY', 'CAUSEINTERRUPT',
            'CLEARTASKSWITCHEDFLAG', 'CMPB', 'CMPW', 'CONTROLREGISTER', 'DEC',
            'DOUBLE', 'DEBUGREGISTER', 'FINDB', 'FINDD', 'FINDRD', 'FINDHW',
            'FINDRB', 'FINDRHW', 'FINDRW', 'FINDW', 'FIX', 'FLAGS', 'FLOAT',
            'GETACCESSRIGHTS', 'GETREALERROR', 'GETSEGMENTLIMIT', 'HIGH',
            'IABS', 'INHWORD', 'INITREALMATHUNITSKIPRB', 'INPUT', 'INT SIZE',
            # NOTE(fix): commas added after 'INVALIDATETLBENTRY', 'MOVRBIT',
            # and 'SKIPRD'; the missing commas silently fused adjacent names
            'INWORD SIZE', 'INVALIDATEDATACACHE', 'INVALIDATETLBENTRY',
            'INDWORD', 'LAST', 'LENGTH', 'LOCALTABLE', 'LOCKSET', 'LOW',
            'MACHINESTATUS', 'MOVB', 'MOVBIT', 'MOVD', 'MOVE', 'MOVHW',
            'MOVRB', 'MOVRBIT',
            'MOVRD', 'MOVRHW', 'MOVRW', 'MOVW', 'NIL', 'OFFSETOF', 'OUTDWORD',
            'OUTHWORD', 'OUTPUT', 'OUTWORD', 'PARITY', 'RESTOREGLOBALTABLE',
            'RESTOREINTERRUPTABLE', 'RESTOREREALSTATUS', 'ROL', 'ROR', 'SAL',
            'SAR', 'SAVEGLOBALTABLE', 'SAVEINTERRUPTTABLE', 'SAVEREALSTATUS',
            'SCANBIT', 'SCANRBIT', 'SCL', 'SCR', 'SEGMENTREADABLE',
            'SEGMENTWRITABLE', 'SELECTOROF', 'SETB', 'SETHW', 'SETREALMODE',
            'SETW', 'SHL', 'SHLD', 'SHR', 'SHRD', 'SETD', 'SIGN', 'SIGNED',
            'SKIPB', 'SKIPD', 'SKIPRD',
            'SKIPHW', 'SKIPRHW', 'SKIPRW', 'SKIPW', 'STACKBASE', 'STACKPTR',
            'TASKREGISTER', 'TESTREGISTER', 'TIME', 'UNSIGN',
            'WAITFORINTERRUPT', 'WBINVALIDATEDATACACHE', 'XLAT', 'ZERO'
        ]

        function_tb = CaseInsensitiveListTokenBuilder(functions, 'function',
                                                      True)

        format_items = [
            'A', 'B', 'B1', 'B2', 'B3', 'B4', 'COLUMN', 'COL', 'E', 'F', 'P',
            'R', 'TAB', 'X'
        ]

        format_item_tb = CaseSensitiveListTokenBuilder(format_items, 'format',
                                                       True)
        self.operand_types.append('format')

        options = [
            'APPEND', 'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT',
            'BLOCK_IO', 'BLOCK_SIZE', 'BUCKET_SIZE', 'BY', 'CANCEL_CONTROL_O',
            'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS', 'CONTIGUOUS_BEST_TRY',
            'CREATION_DATE', 'CURRENT_POSITION', 'DEFAULT_FILE_NAME',
            'DEFERRED_WRITE', 'DELETE', 'EDIT', 'EXPIRATION_DATE',
            'EXTENSION_SIZE', 'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO',
            'FILE_SIZE', 'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
            'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
            'FIXED_LENGTH_RECORDS', 'FROM', 'GROUP_PROTECTION', 'IDENT',
            'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
            'INITIAL_FILL', 'INTO', 'KEY', 'KEYFROM', 'KEYTO', 'LINESIZE',
            'LOCK_ON_READ', 'LOCK_ON_WRITE', 'MAIN PROCEDURE',
            'MANUAL_UNLOCKING', 'MATCH_GREATER', 'MATCH_GREATER_EQUAL',
            'MATCH_NEXT', 'MATCH_NEXT_EQUAL', 'MAXIMUM_RECORD_NUMBER',
            'MAXIMUM_RECORD_SIZE', 'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
            'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
            'NO_ECHO', 'NO_FILTER', 'NO_SHARE', 'OWNER_GROUP', 'OWNER_ID',
            'OWNER_MEMBER', 'OWNER_PROTECTION', 'PAGE', 'PAGESIZE',
            'PRINTER_FORMAT', 'PROMPT', 'PURGE_TYPE_AHEAD', 'READ_AHEAD',
            'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID', 'RECORD_ID_ACCESS',
            'RECORD_ID_TO', 'RECURSIVE', 'REPEAT', 'RESCAN',
            'RETRIEVAL_POINTERS', 'REVISION_DATE', 'REWIND_ON_CLOSE',
            'REWIND_ON_OPEN', 'SCALARVARYING', 'SET READ', 'SHARED_READ',
            'SHARED_WRITE', 'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE',
            'SYSTEM', 'SYSTEM_PROTECTION', 'TEMPORARY', 'TIMEOUT_PERIOD',
            'TITLE', 'TO', 'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
            'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
            'WRITE_CHECK'
        ]

        options_tb = CaseInsensitiveListTokenBuilder(options, 'option', False)

        conditions = [
            'ANYCONDITION', 'CONVERSION', 'CONV', 'ENDFILE', 'ENDPAGE',
            'FINISH', 'FIXEDOVERFLOW', 'FOFL', 'OVERFLOW', 'OFL', 'STORAGE',
            'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG', 'UNDEFINEDFILE',
            'UNDF', 'VAXCONDITION', 'ZERODIVIDE', 'ZDIV'
        ]

        conditions_tb = CaseInsensitiveListTokenBuilder(
            conditions, 'condition', False)

        subroutines = [
            'DISPLAY', 'EXTEND', 'FLUSH', 'NEXT_VOLUME', 'RELEASE', 'RESIGNAL',
            'REWIND', 'SPACEBLOCK'
        ]

        subroutines_tb = CaseInsensitiveListTokenBuilder(
            subroutines, 'subroutine', False)

        types = [
            'ADDRESS', 'BYTE', 'CHARINT', 'DWORD', 'HWORD', 'INTEGER',
            'LONGINT', 'OFFSET', 'POINTER', 'REAL', 'SHORTINT', 'STRUCTURE',
            'QWORD', 'WORD'
        ]

        types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        self.operand_types.append('type')

        values = ['SYSIN', 'SYSPRINT', 'TRUE', 'FALSE']

        values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        self.operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # tokenize as free-format
        tokenbuilders_free = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, binary_integer_tb, hex_integer_tb,
            octal_integer_tb, decimal_integer_tb, real_tb, real_exponent_tb,
            binary_real_tb, keyword_tb, format_item_tb, function_tb,
            attributes_tb, options_tb, conditions_tb, subroutines_tb, types_tb,
            values_tb, groupers_tb, known_operator_tb, identifier_tb,
            string_tb, label_tb, slash_star_comment_tb, preprocessor_tb,
            title_tb, subtitle_tb, error_tb, warn_tb, inform_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer_free = Tokenizer(tokenbuilders_free)
        tokens_free = tokenizer_free.tokenize(code)
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid operator')
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid')
        self.tokens = tokens_free

        # compute statistics for the free-format pass, then stash them
        # so the fixed-format pass can reuse self.statistics
        self.calc_statistics()
        statistics_free = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        # stash free-format confidences and errors for the final comparison
        confidences_free = self.confidences
        self.confidences = {}
        errors_free = self.errors
        self.errors = []

        # tokenize as fixed-format (no format items in this pass)
        tokenbuilders_fixed = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, binary_integer_tb, hex_integer_tb,
            octal_integer_tb, decimal_integer_tb, real_tb, real_exponent_tb,
            binary_real_tb, keyword_tb, function_tb, attributes_tb, options_tb,
            conditions_tb, subroutines_tb, types_tb, values_tb, groupers_tb,
            known_operator_tb, identifier_tb, string_tb, label_tb,
            slash_star_comment_tb, preprocessor_tb, title_tb, subtitle_tb,
            error_tb, warn_tb, inform_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        comment_start_tb = PL1CommentStartTokenBuilder()
        comment_middle_tb = PL1CommentMiddleTokenBuilder()
        comment_end_tb = PL1CommentEndTokenBuilder()

        # two fixed-format tokenizers: one that only recognizes comment
        # starts, one that also recognizes middles and ends
        type1_tokenbuilders = [comment_start_tb]
        tokenbuilders_fixed_1 = tokenbuilders_fixed + type1_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

        type2_tokenbuilders = [
            comment_start_tb, comment_middle_tb, comment_end_tb
        ]
        tokenbuilders_fixed_2 = tokenbuilders_fixed + type2_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

        tokens_fixed = self.tokenize_code(code, tab_size, tokenizer_fixed_1,
                                          tokenizer_fixed_2, wide)
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid operator')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'whitespace')
        tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)
        self.tokens = tokens_fixed

        self.calc_statistics()
        statistics_fixed = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_fixed = self.confidences
        self.confidences = {}
        errors_fixed = self.errors
        self.errors = []

        # overall confidence for each pass is the product of its factors;
        # an empty factor set counts as zero confidence
        confidence_free = 1.0
        if len(confidences_free) == 0:
            confidence_free = 0.0
        else:
            for key in confidences_free:
                factor = confidences_free[key]
                confidence_free *= factor

        confidence_fixed = 1.0
        if len(confidences_fixed) == 0:
            confidence_fixed = 0.0
        else:
            for key in confidences_fixed:
                factor = confidences_fixed[key]
                confidence_fixed *= factor

        # select the better of free-format and fixed-format
        # (ties favor free-format)
        if confidence_fixed > confidence_free:
            self.tokens = tokens_fixed
            self.statistics = statistics_fixed
            self.confidences = confidences_fixed
            self.errors = errors_fixed
        else:
            self.tokens = tokens_free
            self.statistics = statistics_free
            self.confidences = confidences_free
            self.errors = errors_free
Esempio n. 17
0
    def __init__(self, code):
        """Tokenize *code* and compute token statistics and confidence
        values.

        The keyword, type, and value lists below match the Kotlin
        language, so this examiner scores how plausibly *code* is
        Kotlin source.
        """
        super().__init__()

        # operand token categories, used for operand-run confidence checks
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numeric literals; '_' is accepted as a digit separator
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789ABCDEFabcdef_')
        real_tb = RealTokenBuilder(True, True, '_')
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
        operand_types.append('number')

        # identifiers may start with and contain underscores
        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)
        operand_types.append('decorator')

        # the right single quote '’' is included, presumably to tolerate
        # smart-quoted source -- TODO confirm
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        class_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '+=', '-=', '*=', '/=', '%=', '++',
            '--', '&&', '||', '!', '==', '!=', '===', '!==', '<', '>', '<=',
            '>=', '!!', '?.', '?:', '::', '..', ':', '?', '.'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '!', '*', '++', '--']

        self.postfix_operators = ['++', '--', ':']

        groupers = ['->', '(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = ['->', ',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        # hard keywords plus soft/modifier keywords, all matched
        # case-sensitively
        keywords = [
            'as', 'as?', 'break', 'class', 'continue', 'do', 'else', 'for',
            'fun', 'if', 'in', '!in', 'is', '!is', 'object', 'package',
            'return', 'super', 'throw', 'try', 'typealias', 'typeof', 'val',
            'var', 'when', 'while', 'by', 'catch', 'constructor', 'delegate',
            'dynamic', 'field', 'file', 'finally', 'get', 'import', 'init',
            'param', 'property', 'receiver', 'set', 'setparam', 'where',
            'actual', 'abstract', 'annotation', 'companion', 'const',
            'crossinline', 'data', 'enum', 'expect', 'external', 'final',
            'infix', 'inline', 'inner', 'internal', 'lateinit', 'noinline',
            'open', 'operator', 'out', 'override', 'private', 'protected',
            'public', 'reified', 'sealed', 'suspend', 'tailrec', 'vararg'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # built-in types plus numeric-literal suffixes ('u', 'f', 'ul')
        types = [
            'Byte', 'Short', 'Int', 'Long', 'Float', 'Double', 'Char', 'u',
            'f', 'ul', 'UInt', 'ULong', 'UByte', 'UShort'
        ]

        type_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'null', 'this', 'true']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order appears significant for tokenizer
        # matching priority -- confirm against Tokenizer
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, real_tb, real_exponent_tb,
            keyword_tb, type_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, class_tb, decorator_tb, string_tb,
            triple_quote_string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # merge runs of invalid tokens before recording the token stream
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = self.combine_numbers_and_adjacent_types(tokens)

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # confidence measures: token validity, operator placement,
        # grouping, operand runs, keyword presence, brace pairing,
        # and line lengths
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 18
0
    def __init__(self, code):
        """Tokenize *code* and compute token statistics and confidence
        values.

        The keyword, type, and value lists below match the Dart
        language, so this examiner scores how plausibly *code* is Dart
        source.
        """
        super().__init__()

        # operand token categories, used for operand-run confidence checks
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numeric literals; "'" is accepted as a digit separator
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        # identifiers may start with and contain underscores
        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        annotation_tb = PrefixedIdentifierTokenBuilder('@', 'annotation',
                                                       False)
        operand_types.append('annotation')

        symbol_tb = PrefixedIdentifierTokenBuilder('#', 'symbol', True)
        operand_types.append('symbol')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '~/', '%', '^', '=', '==', '!=', '>', '>=',
            '<', '<=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=',
            '>>=', '!', '&', '|', '~', '<<', '>>', '~/=', '||', '&&', '.',
            '..', ':', '?', '??', '??=', 'as', 'is', 'is!', '++', '--', 'new'
        ]

        # NOTE(fix): comma added after '..'; the missing comma made Python
        # concatenate '..' and '?.' into the bogus operator '..?.'
        self.unary_operators = [
            '+', '-', '*', '!', '~', '.', '..',
            '?.', '++', '--', 'new'
        ]

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'abstract', 'assert', 'async', 'await', 'break', 'case', 'catch',
            'class', 'const', 'continue', 'covariant', 'default', 'deferred',
            'do', 'dynamic', 'else', 'enum', 'export', 'extends', 'external',
            'factory', 'final', 'finally', 'for', 'Function', 'get', 'hide',
            'if', 'implements', 'import', 'in', 'interface', 'library',
            'mixin', 'on', 'operator', 'part', 'rethrow', 'return', 'set',
            'show', 'static', 'switch', 'sync', 'throw', 'try', 'typedef',
            'var', 'void', 'while', 'with', 'yield'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = ['int', 'double', 'String', 'List', 'bool', 'void']

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'true', 'null', 'this', 'super']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            real_tb, real_exponent_tb, keyword_tb, types_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, annotation_tb,
            symbol_tb, class_type_tb, string_tb, raw_string_tb,
            slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # merge runs of invalid tokens, then recognize 'identifier:' labels
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # confidence measures: token validity, operator placement,
        # grouping, operand runs, keyword presence, brace pairing,
        # and line lengths
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 19
0
  def __init__(self, code):
    """Tokenize *code* and compute token statistics and confidence values.

    The directive, keyword, and value lists below (e.g. '-module',
    'receive', '?MODULE') match the Erlang language, so this examiner
    scores how plausibly *code* is Erlang source.
    """
    super().__init__()

    # operand token categories, used for operand-run confidence checks
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # numeric literals; "'" is accepted as a digit separator
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    # identifiers may start with and contain '_' and '@'
    leads = '_@'
    extras = '_@'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    # '%' starts a comment that runs to end of line
    comment_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')

    # module attributes / compiler directives
    directives = [
      '-include', '-define', '-error', '-warning',
      '-module', '-compile'
    ]

    c_preprocessor_tb = CaseSensitiveListTokenBuilder(directives, 'preprocessor', False)
    # both ';' and '.' terminate statements here
    terminators_tb = SingleCharacterTokenBuilder([';', '.'], 'statement terminator', False)

    known_operators = [
      '+', '-', '*', '/', '!',
      'and', 'andalso',
      'band', 'bnot', 'bor', 'bsl', 'bsr', 'bxor',
      'div',
      'not',
      'of', 'or', 'orelse',
      'xor',
      '++', '--', '->', '=>', '#', ':=',
      '=', '==', '/=', '=<', '<', '>=', '>', '=:=', '=/='
    ]

    self.unary_operators = [
      '+', '-', 'not', '#', '!'
    ]

    self.postfix_operators = []

    # '<<' and '>>' delimit bitstrings; '|' appears in list patterns
    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>', '<<', '>>', '|', '||']
    # group_starts = ['(', '[', ',', '{', '<', '<<']
    group_ends = [')', ']', '}', '>', '>>']
    group_mids = [',', ':', '|', '||']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
      'after', 'begin',
      'case', 'catch', 'cond', 'end', 'fun', 'if', 'let',
      'receive', 'rem', 'try', 'when',
      'ignore'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    # bit-syntax type specifiers
    types = [
      'integer', 'float', 'binary', 'bytes', 'bitstring', 'bits',
      'utf8', 'utf16', 'utf32',
      'signed', 'unsigned', 'big', 'little', 'native'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', False)
    operand_types.append('type')

    # booleans plus predefined macros
    values = [
      'true', 'false',
      '?MODULE', '?MODULE_STRING', '?FILE', '?LINE', '?MACHINE',
      '?FUNCTION_NAME', '?FUNCTION_ARITY', '?OTP_RELEASE'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # NOTE(review): list order appears significant for tokenizer matching
    # priority -- confirm against Tokenizer
    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      real_tb,
      real_exponent_tb,
      keyword_tb,
      types_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      identifier_tb,
      string_tb,
      comment_tb,
      c_preprocessor_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of invalid tokens before recording the token stream
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = tokens
    self.convert_keywords_to_identifiers()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    # '*' and ';' are passed as exceptions here, unlike sibling examiners
    # -- presumably to tolerate doubled tokens; confirm
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 20
0
    def __init__(self, code):
        """Tokenize INTERCAL-style source text and compute confidence metrics."""
        super().__init__()

        # Operand categories used by the operand-count confidence checks.
        operand_kinds = ['number']

        ws_tb = WhitespaceTokenBuilder()
        nl_tb = NewlineTokenBuilder()

        # Numeric tokens are distinguished solely by their prefix character.
        hash_int_tb = PrefixedIntegerTokenBuilder('#', False, '0123456789')
        dot_int_tb = PrefixedIntegerTokenBuilder('.', False, '0123456789')
        colon_int_tb = PrefixedIntegerTokenBuilder(':', False, '0123456789')
        comma_int_tb = PrefixedIntegerTokenBuilder(',', False, '0123456789')
        semi_int_tb = PrefixedIntegerTokenBuilder(';', False, '0123456789')

        note_comment_tb = LeadToEndOfLineTokenBuilder('NOTE', True, 'comment')
        paren_label_tb = ParensLabelTokenBuilder()

        operator_list = ['~', '$', 'V', '?', '&', 'SUB', '<-']
        self.unary_operators = ['V', '?', '&']
        self.postfix_operators = []

        group_tokens = ['"', "'"]
        open_groups = ['"', "'"]
        close_groups = ['"', "'"]
        mid_groups = []

        group_tb = CaseInsensitiveListTokenBuilder(group_tokens, 'group', False)
        operator_tb = CaseSensitiveListTokenBuilder(operator_list, 'operator', False)

        keyword_list = [
            'DO', 'STASH', 'RETRIEVE', 'RESUME', 'FORGET', 'NEXT', 'ABSTAIN',
            'FROM', 'REINSTATE', 'IGNORE', 'REMEMBER', 'WRITE', 'IN', 'READ',
            'OUT', 'PLEASE', 'COME', 'FROM'
        ]
        keyword_builder = CaseSensitiveListTokenBuilder(keyword_list, 'keyword', False)

        invalid_tb = InvalidTokenBuilder()

        # Builder order matters: earlier entries take precedence when
        # more than one builder matches the same text.
        builders = [
            nl_tb,
            ws_tb,
            hash_int_tb,
            dot_int_tb,
            colon_int_tb,
            comma_int_tb,
            semi_int_tb,
            keyword_builder,
            group_tb,
            paren_label_tb,
            operator_tb,
            note_comment_tb,
            self.unknown_operator_tb,
            invalid_tb
        ]

        toks = Tokenizer(builders).tokenize(code)
        toks = Examiner.combine_adjacent_identical_tokens(toks, 'invalid operator')
        toks = Examiner.combine_adjacent_identical_tokens(toks, 'invalid')
        toks = Examiner.combine_identifier_colon(
            toks, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = toks
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        toks = self.source_tokens()
        toks = Examiner.join_all_lines(toks)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        operator_count = self.count_my_tokens(['operator', 'invalid operator'])
        if operator_count > 0:
            self.calc_operator_confidence(operator_count)
            permitted_pairs = []
            self.calc_operator_2_confidence(toks, operator_count, permitted_pairs)
            self.calc_operator_3_confidence(toks, operator_count, close_groups,
                                            permitted_pairs)
            self.calc_operator_4_confidence(toks, operator_count, open_groups,
                                            permitted_pairs)

        self.calc_group_confidence(toks, mid_groups)

        self.calc_operand_n_confidence(toks, ['number'], 2)
        self.calc_operand_n_confidence(toks, operand_kinds, 4)

        self.calc_keyword_confidence()

        # Paired-blocker confidence is intentionally not computed here.
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 21
0
    def __init__(self, code, extension):
        """Tokenize SQL source text and compute token/confidence statistics.

        Args:
            code: source text to examine.
            extension: dialect selector; 'microsoft'/'t-sql' enables T-SQL
                keywords, values, and types, and 'oracle'/'pl-sql' enables
                PL/SQL keywords.
        """
        super().__init__()

        # Token categories that may act as operands; kept for parity with the
        # other examiners even though the operand-count checks below are
        # currently disabled.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        operand_types.append('number')

        quotes = ["'", '"']
        string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
        operand_types.append('string')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        # [bracketed identifiers] are only recognized in the T-SQL dialect;
        # otherwise the builder is a no-op.
        bracketed_identifier_tb = NullTokenBuilder()

        if extension in ['microsoft', 't-sql']:
            bracketed_identifier_tb = SqlBracketedIdentifierTokenBuilder()

        operand_types.append('identifier')

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        comment_tb = LeadToEndOfLineTokenBuilder('--', True, 'comment')

        known_operators = [
            '=', '>', '>=', '<', '<=', '<>', '!=', 'AND', 'OR', 'NOT', 'IN',
            'EXISTS', 'LIKE', 'BETWEEN', 'ANY', 'ALL', '.'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['NOT', 'EXISTS', 'ANY', 'ALL']

        groupers = ['(', ')', ',']
        group_starts = ['(', ',']
        group_mids = [',']
        group_ends = [')']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        # FIX: commas were missing after 'TABLESAMPLE', 'TANH', 'VERSIONING',
        # 'WITHOUT', and 'WIDTH_BUCKET'; Python's implicit string-literal
        # concatenation silently produced bogus keywords such as
        # 'TABLESAMPLETAN', 'TANHTIMEZONE_MINUTE', 'VERSIONINGWHEN', and
        # 'WITHOUTWIDTH_BUCKETYEAR', so the intended keywords were never
        # matched.
        # NOTE(review): 'JSON_ARRY' looks like a typo for 'JSON_ARRAY';
        # left unchanged pending confirmation against the reference list.
        keywords = [
            'ACOS', 'ASIN', 'ATAN', 'ABSOLUTE', 'ACTION', 'ADD', 'ALL',
            'ALLOCATE', 'ALTER', 'ARE', 'ABS', 'ARRAY_AGG', 'AVG', 'AS', 'ASC',
            'ASSERTION', 'AT', 'AUTHORIZATION', 'AFTER', 'ARRAY', 'ASENSITIVE',
            'ASYMMETRIC', 'ATOMIC', 'ARRAY_MAX_CARDINALITY', 'BEFORE', 'BEGIN',
            'BETWEEN', 'BIT_LENGTH', 'BOTH', 'BY', 'BEGIN_FRAME',
            'BEGIN_PARTITION', 'BINARY', 'BOOLEAN', 'BREADTH', 'CALL',
            'CASCADE', 'CASCADED', 'CASE', 'CAST', 'CATALOG', 'CALLED',
            'CHAR_LENGTH', 'CHARACTER_LENGTH', 'CHECK', 'COALESCE', 'COLLATE',
            'COLLATION', 'COLUMN', 'COMMIT', 'CONDITION', 'CONNECT',
            'CONNECTION', 'CONSTRAINT', 'CONSTRAINTS', 'CONTAINS', 'CONTINUE',
            'CONVERT', 'CORRESPONDING', 'COUNT', 'CREATE', 'CROSS', 'CURRENT',
            'CURRENT_DATE', 'CURRENT_PATH', 'CURRENT_TIME',
            'CURRENT_TIMESTAMP', 'CURRENT_USER', 'CURSOR', 'CLOSE',
            'CONSTRUCTOR', 'CUBE', 'CURRENT_DEFAULT_TRANSFORM_GROUP',
            'CURRENT_ROLE', 'CURRENT_TRANSFORM_GROUP_FOR_TYPE', 'CYCLE',
            'CARDINALITY', 'CEIL', 'CEILING', 'CONVERT', 'CORR', 'COVAR_POP',
            'COVAR_SAMPLE', 'CUME_DIST', 'CURRENT_CATALOG', 'CURRENT_SCHEMA',
            'CLASSIFIER', 'COS', 'COSH', 'DAY', 'DEALLOCATE', 'DEC', 'DECLARE',
            'DEFAULT', 'DECFLOAT', 'DEFINE', 'DEFERRABLE', 'DEFERRED',
            'DELETE', 'DEPTH', 'DESC', 'DESCRIBE', 'DENSE_RANK', 'DESCRIPTOR',
            'DETERMINISTIC', 'DIAGNOSTICS', 'DISCONNECT', 'DISTINCT', 'DO',
            'DOMAIN', 'DROP', 'DYNAMIC', 'ELSE', 'END', 'ESCAPE', 'EXCEPT',
            'EXCEPTION', 'ELEMENT', 'EXEC', 'EXECUTE', 'EXISTS', 'EXIT',
            'EXTERNAL', 'EXTRACT', 'EACH', 'ELSEIF', 'EQUALS', 'END_EXEC',
            'EVERY', 'EXP', 'EMPTY', 'EQUALS', 'FETCH', 'FIRST', 'FOR',
            'FOREIGN', 'FOUND', 'FROM', 'FULL', 'FUNCTION', 'FUSION', 'FILTER',
            'FREE', 'FIRST_VALUE', 'FRAME_ROW', 'GENERAL', 'GET', 'GLOBAL',
            'GO', 'GOTO', 'GRANT', 'GROUP', 'GROUPING', 'GROUPS', 'HANDLER',
            'HAVING', 'HOUR', 'HOLD', 'IDENTITY', 'IF', 'IMMEDIATE', 'IN',
            'INDICATOR', 'INITIALLY', 'INNER', 'INOUT', 'INPUT', 'INSENSITIVE',
            'INSERT', 'INT', 'INTERSECT', 'INITIAL', 'INTERVAL', 'INTO', 'IS',
            'ISOLATION', 'INTERSECTION', 'ITERATE', 'JOIN', 'JSON_ARRY',
            'JSON_ARRAYAGG', 'JSON_EXISTS', 'JSON_OBJECT', 'JSON_OBJECTAGG',
            'JSON_QUERY', 'JSON_TABLE', 'JSON_TABLE_PRIMITIVE', 'JSON_VALUE',
            'KEY', 'LANGUAGE', 'LAST', 'LEADING', 'LEFT', 'LEVEL', 'LIKE',
            'LOCAL', 'LARGE', 'LATERAL', 'LEAVE', 'LOCALTIME',
            'LOCALTIMESTAMP', 'LOCATOR', 'LOOP', 'LAG', 'LISTAGG', 'LOG',
            'LOG10', 'LIKE_REGEX', 'LN', 'LOWER', 'LAST_VALUE', 'LEAD',
            'MATCH', 'MAX', 'MIN', 'MINUTE', 'MODULE', 'MONTH', 'MAP',
            'METHOD', 'MODIFIES', 'MATCH_NUMBER', 'MATCH_RECOGNIZE', 'MATCHES',
            'MEMBER', 'MERGE', 'MULTISET', 'MOD', 'NAMES', 'NATIONAL',
            'NATURAL', 'NEXT', 'NO', 'NOT', 'NULLIF', 'NUMERIC', 'NTH_VALUE',
            'NTILE', 'NEW', 'NORMALIZE', 'OCTET_LENGTH', 'OF', 'ONLY', 'OPEN',
            'OPTION', 'ORDER', 'OUTPUT', 'OVERLAPS', 'OBJECT', 'OLD',
            'ORDINALITY', 'OUT', 'OUTER', 'OCTET_LENGTH', 'OFFSET', 'OMIT',
            'OCCURRENCES_REGEX', 'ONE', 'OVER', 'OVERLAY', 'PAD', 'PARAMETER',
            'PARTIAL', 'PRECISION', 'PREPARE', 'PRESERVE', 'PRIMARY', 'PRIOR',
            'PRIVILEGES', 'PROCEDURE', 'PUBLIC', 'PATTERN', 'PER', 'PTF',
            'PARTITION', 'PERCENT_RANK', 'PERCENTILE_CONT', 'PERCENTILE_DISC',
            'POSITION', 'PERCENT', 'PERIOD', 'PORTION', 'PRECEDES',
            'POSITION_REGEX', 'POWER', 'RANGE', 'READ', 'REFERENCES',
            'RELATIVE', 'RESTRICT', 'RETURN', 'RETURNS', 'REVOKE', 'RIGHT',
            'ROLLBACK', 'ROLLUP', 'READS', 'ROWS', 'RECURSIVE', 'REF',
            'REFERENCING', 'RELEASE', 'REPEAT', 'REGIONAL', 'RESULT', 'ROW',
            'RANK', 'REGR_AVGX', 'REGR_AVGY', 'REGR_COUNT', 'REGR_INTERCEPT',
            'REGR_R2', 'REGR_SLOPE', 'REGR_SXX', 'REGR_SXY', 'REGR_SYY',
            'ROW_NUMBER', 'RUNNING', 'SCHEMA', 'SCROLL', 'SECOND', 'SECTION',
            'SELECT', 'SESSION', 'SESSION_USER', 'SET', 'SIZE', 'SOME',
            'SPACE', 'SPECIFIC', 'SQL', 'SQLCODE', 'SQLERROR', 'SQLEXCEPTION',
            'SQLSTATE', 'SQLWARNING', 'SUBSTRING', 'SUM', 'SQRT', 'STDDEV_POP',
            'STDDEV_SAMP', 'SUBSTRING_REGEX', 'SUM', 'SEEK', 'SHOW', 'SIN',
            'SINH', 'SUBSET', 'SUBMULTISET', 'SYSTEM_USER', 'SAVEPOINT',
            'SCOPE', 'SEARCH', 'SENSITIVE', 'SETS', 'SIGNAL', 'SIMILAR',
            'SPECIFICTYPE', 'START', 'STATE', 'STATIC', 'SYMMETRIC', 'SYSTEM',
            'TABLE', 'TEMPORARY', 'THEN', 'TIME', 'TIMESTAMP', 'TIMEZONE_HOUR',
            'TABLESAMPLE',
            'TAN', 'TANH',
            'TIMEZONE_MINUTE', 'TO', 'TRAILING', 'TRANSACTION', 'TRANSLATE',
            'TRANSLATION', 'TRIM', 'TRANSLATE', 'TRANSLATE_REGEX', 'TRUNCATE',
            'TREAT', 'TRIGGER', 'TRIM_ARRAY', 'UNDO', 'UNION', 'UNIQUE',
            'UNKNOWN', 'UPDATE', 'UPPER', 'USAGE', 'USER', 'USING', 'UNDER',
            'UNNEST', 'UNTIL', 'UESCAPE', 'UPPER', 'VALUE', 'VALUES',
            'VARYING', 'VIEW', 'VAR_POP', 'VAR_SAMP', 'VALUE_OF', 'VERSIONING',
            'WHEN', 'WHENEVER', 'WHERE', 'WITH', 'WORK', 'WRITE', 'WHILE',
            'WINDOW', 'WITHIN', 'WITHOUT',
            'WIDTH_BUCKET',
            'YEAR', 'ZONE'
        ]

        keywords_tsql = [
            'INSTEAD', 'CASE', 'UPDLOCK', 'DATEADD', 'GETDATE', 'TEXTIMAGE_ON',
            'CLUSTERED', 'GENERATED', 'DECLARE', 'SET', 'BEGIN', 'END',
            'BREAK', 'CONTINUE', 'GOTO', 'ELSE', 'RETURN', 'WAITFOR', 'BULK',
            'TRY', 'CATCH'
        ]

        keywords_plsql = [
            '%TYPE', 'BEFORE', 'DECODE', 'DESCRIBE', 'DUAL', 'INTERSECT',
            'MINUS', 'SYSDATE', 'USER'
        ]

        if extension in ['microsoft', 't-sql']:
            keywords += keywords_tsql

        if extension in ['oracle', 'pl-sql']:
            keywords += keywords_plsql

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        values = ['TRUE', 'FALSE', 'NULL', 'OFF', 'ON', 'NONE']

        values_tsql = [
            'ALLOW_ROW_LOCKS', 'ALLOW_PAGE_LOCKS', 'ALWAYS', 'IGNORE_DUP_KEY',
            'FILLFACTOR', 'HISTORY_TABLE', 'PAD_INDEX',
            'STATISTICS_NORECOMPUTE', 'SUSER_SNAME', 'SYSTEM_VERSIONING',
            'SYSTEM_TIME'
        ]

        if extension in ['microsoft', 't-sql']:
            values += values_tsql

        values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        types = [
            'BIGINT', 'BIT', 'BLOB', 'CHAR', 'CHARACTER', 'CLOB', 'DATE',
            'DECIMAL', 'DOUBLE', 'FLOAT', 'INTEGER', 'NCHAR', 'NCLOB', 'REAL',
            'SMALLINT', 'VARCHAR'
        ]

        types_tsql = [
            'nvarchar', 'bigint', 'datetime', 'datetime2', 'geography'
        ]

        if extension in ['microsoft', 't-sql']:
            types += types_tsql

        type_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        # FIX: was append('value') — a copy-paste slip; 'value' was already
        # appended above, and the sibling examiners append 'type' after
        # building their type token builder.
        operand_types.append('type')

        invalid_token_builder = InvalidTokenBuilder()

        # Builder order matters: earlier entries take precedence when more
        # than one builder matches the same text.
        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, integer_exponent_tb,
            real_tb, real_exponent_tb, string_tb, known_operator_tb,
            terminators_tb, groupers_tb, keyword_tb, values_tb, identifier_tb,
            type_tb, bracketed_identifier_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], [],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # operand_types_2 = ['number', 'string', 'symbol']
        # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 22
0
    def __init__(self, code, variant):
        """Tokenize ML-family source and compute token/confidence statistics.

        Args:
            code: source text to examine.
            variant: language variant selector; 'fsharp' enables the
                F#-specific comments, operators, groupers, keywords,
                and types in addition to the baseline set.
        """
        super().__init__()

        # Token categories that may act as operands (used by the
        # operand-count confidence checks near the end).
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        prefixed_string_tb = PrefixedStringTokenBuilder('@', False, quotes)
        char_tb = FsharpCharTokenBuilder(["'", "’"])
        operand_types.append('string')

        # Slash-style comments are no-ops unless the F# variant is selected;
        # the (* ... *) block comment is always active.
        slash_slash_comment_tb = NullTokenBuilder()
        parens_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')
        triple_slash_comment_tb = NullTokenBuilder()
        if variant in ['fsharp']:
            slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
            triple_slash_comment_tb = TripleSlashCommentTokenBuilder()

        directives = [
            '#if', '#else', '#elif', '#endif', '#define', '#undef', '#line',
            '#region', '#endregion', '#pragma'
        ]

        preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')

        known_operators = [
            'and', 'as', 'in', 'mod', 'not', 'of', 'or', 'when', '::', '+',
            '-', '*', '/', '+.', '-.', '*.', '/.', '=', "'", '->', '>', '<',
            '>=', '<=', '==', '^', '||', '.', '#'
        ]

        known_operators_fsharp = [
            'new', '!', '!=', '%', '%%', '%?', '&', '&&', '&&&', '(|', '|)',
            '*?', '**', '+?', '-?', '->', '..', '.. ..', '/?', ':', ':=', ':/',
            '<<', '<<<', '<-', '<>', '<>?', '<=?', '<|', '<||', '<|||', '<@',
            '@>', '<@@', '@@>', '=?', '==', '>?', '>>', '>>>', '>=?', '?',
            '|||', '^^^', '?>=', '?>', '?<=', '?<', '?=', '?<>', '?+', '?-',
            '?*', '?/', '>=?', '>?', '<=?', '<?', '=?', '<>?', '+?', '-?',
            '*?', '/?', '?>=?', '?>?', '?<=?', '?<?', '?=?', '?<>?', '?+?',
            '?-?', '?*?', '?/?', '@', '|>', '||>', '|||>', '~~', '~~~', '~-',
            '~+', ':>', ':?>', "'"
        ]

        if variant in ['fsharp']:
            known_operators += known_operators_fsharp

        self.unary_operators = ['new', 'not', "'", '-']

        self.postfix_operators = ["'"]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        groupers = [
            '(', ')', ',', '[', ']', '{', '}', 'begin', 'end', ';', '|'
        ]

        groupers_fsharp = ['[|', '|]', '[<', '>]', '^']

        if variant in ['fsharp']:
            groupers += groupers_fsharp

        # group_starts = ['(', '[', ',', '{', '[|', '[<']
        group_mids = [',', ';', '^', '|']
        group_ends = [')', ']', '}', '|]', '>]']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'assert', 'class', 'def', 'do', 'done', 'downto', 'else',
            'exception', 'failwith', 'for', 'fun', 'function', 'if', 'inherit',
            'lazy', 'let', 'match', 'method', 'module', 'object', 'open',
            'raise', 'rec', 'sig', 'then', 'to', 'try', 'type', 'val',
            'virtual', 'while', 'with'
        ]

        keywords_fsharp = [
            'abstract', 'break', 'default', 'delegate', 'downcast', 'elif',
            'extern', 'finally', 'fixed', 'global', 'inline', 'interface',
            'internal', 'let!', 'match!', 'member', 'mutable', 'namespace',
            'override', 'private', 'public', 'return', 'return!', 'upcast',
            'use', 'use!', 'yield', 'yield!'
        ]

        if variant in ['fsharp']:
            keywords += keywords_fsharp

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'bool', 'byte', 'char', 'double', 'float', 'int', 'list', 'long',
            'number', 'object', 'range', 'string', 'struct', 'union', 'unit',
            'void'
        ]

        types_fsharp = [
            'decimal', 'sbyte', 'short', 'uint', 'ulong', 'ushort', 'void'
        ]

        if variant in ['fsharp']:
            types += types_fsharp

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['base', 'false', 'null', 'true', '_']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Builder order matters: earlier entries take precedence when more
        # than one builder matches the same text.
        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, integer_exponent_tb,
            real_tb, real_exponent_tb, keyword_tb, types_tb, values_tb,
            known_operator_tb, groupers_tb, identifier_tb, class_type_tb,
            string_tb, triple_quote_string_tb, prefixed_string_tb, char_tb,
            triple_slash_comment_tb, slash_slash_comment_tb,
            parens_star_comment_tb, preprocessor_tb, c_error_tb, c_warning_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Collapse runs of invalid tokens so they count once in statistics.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Esempio n. 23
0
    def __init__(self, code):
        """Tokenize Objective-C source and compute token/confidence statistics.

        Args:
            code: source text to examine.
        """
        super().__init__()

        # Token categories that may act as operands (used by the
        # operand-count confidence checks near the end).
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        directive_tb = DirectiveTokenBuilder()

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 10)
        prefixed_string_tb = PrefixedStringTokenBuilder('@', False, quotes)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        # C-preprocessor directives recognized as distinct tokens.
        directives = [
            '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
            '#elif', '#import', '#line', '#include'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        c_preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')
        c_pragma_tb = LeadToEndOfLineTokenBuilder('#pragma', True,
                                                  'preprocessor')
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '<<', '>>', '~', '.', '->', '++', '--', '&&', '||', '^',
            '?', '##'
        ]

        self.unary_operators = [
            '+', '-', '*', '!', '&', '^', '~', '++', '--', '##'
        ]

        self.postfix_operators = ['++', '--', '&', '->', '*', '^']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'atomic', 'break', 'bycopy', 'byref', 'case', 'continue',
            'default', 'do', 'else', 'for', 'goto', 'if', 'IMP', 'in',
            'inline', 'inout', 'nonatomic', 'oneway', 'out', 'Protocol',
            'restrict', 'retain', 'return', 'SEL', 'sizeof', 'switch',
            'typedef', 'while', '@interface', '@end', '@implementation',
            '@protocol', '@class', '@public', '@protected', '@private',
            '@property', '@try', '@throw', '@catch()', '@finally',
            '@synthesize', '@dynamic', '@selector'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'auto', 'char', 'const', 'double', 'enum', 'extern', 'float', 'id',
            'int', 'long', 'register', 'short', 'signed', 'static', 'struct',
            'union', 'unsigned', 'void', 'volatile', '_Bool', '_Complex',
            '_Imaginary', 'BOOL', 'Class'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['self', 'super', 'nil', 'YES', 'NO', 'NULL', '...']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Builder order matters: earlier entries take precedence when more
        # than one builder matches the same text.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            directive_tb, identifier_tb, class_type_tb, string_tb,
            prefixed_string_tb, slash_slash_comment_tb, slash_star_comment_tb,
            c_preprocessor_tb, c_warning_tb, c_error_tb, c_pragma_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Collapse runs of invalid tokens so they count once in statistics.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()
        self.convert_values_to_operators()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
  def __init__(self, code, year, extension, tab_size, wide):
    """Tokenize COBOL source and compute language-confidence measures.

    Parameters:
      code: the source text to examine.
      year: COBOL standard year; one of '68'/'1968', '74'/'1974',
        '85'/'1985', or None. Any other value raises CodeStatException.
      extension: file extension.
        NOTE(review): accepted but never read in this method — confirm
        whether callers rely on it.
      tab_size: tab expansion width passed to tokenize_code().
      wide: when True, the line-length confidence check is skipped.
    """
    super().__init__()
    # Classic COBOL fixed-format sources are limited to 80 columns.
    self.max_expected_line = 80

    # Reject unsupported standard years up front.
    if year is not None and year not in ['68', '1968', '74', '1974', '85', '1985']:
      raise CodeStatException('Unknown year for language')

    # Token categories that can appear as operands; appended to as the
    # corresponding token builders are created below.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Numeric literals (no digit-separator character).
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, True, None)
    real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', None)
    operand_types.append('number')

    identifier_tb = CobolIdentifierTokenBuilder()
    operand_types.append('identifier')

    # Includes the typographic apostrophe sometimes found in scraped/
    # word-processed sources.
    quotes = ['"', "'", "’"]
    # COBOL escapes a quote inside a string by doubling ("stuffing") it.
    string_tb = StuffedQuoteStringTokenBuilder(quotes, True)
    operand_types.append('string')

    # PICTURE clauses (e.g. PIC 9(5)V99) get their own token type.
    picture_tb = PictureTokenBuilder()
    cr_picture_tb = CRPictureTokenBuilder()
    operand_types.append('picture')

    # The period ends a COBOL sentence.
    terminators_tb = SingleCharacterTokenBuilder('.', 'statement terminator', False)

    known_operators = [
      'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE',
      '+', '-', '*', '/', '**',
      '=', '<>', '>', '>=', '<', '<=',
      'AND', 'OR', 'NOT',
      ':'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    self.unary_operators = [
      '+', '-'
    ]

    groupers = ['(', ')', ',']
    group_starts = ['(']
    group_mids = [',']
    # group_ends = [')']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    # Keywords common to all supported COBOL standards; year-specific
    # keywords are appended below.
    keywords = [
      'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
      'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
      'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALTER', 'ALTERNATE', 'AND',
      'APPLY', 'ARE', 'AREA', 'AREAS', 'ASCENDING', 'ASSIGN', 'AT', 'AUTHOR',
      'BEFORE', 'BLOCK', 'BY',
      'CALL', 'CANCEL', 'CD', 'CF', 'CH', 'CHARACTER', 'CHARACTERS',
      'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'COLUMN', 'COMMA',
      'COMMUNICATION', 'COMP', 'COMPUTATIONAL', 'COMPUTE', 'CONFIGURATION',
      'CONTAINS', 'CONTROL', 'CONTROLS', 'COPY', 'CORR', 'CORRESPONDING',
      'COUNT', 'CURRENCY',
      'DATA', 'DATE', 'DATE-COMPILED', 'DATE-WRITTEN',
      'DE', 'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE', 'DEBUG-NAME',
      'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3',
      'DECIMAL-POINT', 'DECLARATIVES', 'DELIMITED', 'DELIMITER', 'DEPENDING',
      'DESCENDING', 'DESTINATION', 'DETAIL', 'DISABLE', 'DISPLAY',
      'DIVIDE', 'DIVISION', 'DOWN',
      'EGI', 'ELSE', 'EMI', 'ENABLE', 'END', 'ENTER', 'ENVIRONMENT', 'EQUAL',
      'ERROR', 'ESI', 'EVERY', 'EXIT', 'EXTEND',
      'FD', 'FILE', 'FILE-CONTROL', 'FILLER', 'FINAL', 'FIRST', 'FOOTING', 'FOR',
      'FROM',
      'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
      'HEADING', 'HIGH-VALUE', 'HIGH-VALUES',
      'I-O', 'I-O-CONTROL',
      'IDENTIFICATION', 'IF', 'IN', 'INDEX', 'INDEXED', 'INDICATE', 'INITIAL',
      'INITIATE', 'INPUT', 'INPUT-OUTPUT', 'INSTALLATION', 'INTO', 'INVALID',
      'IS',
      'JUST', 'JUSTIFIED',
      'KEY',
      'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH', 'LESS', 'LIMIT', 'LIMITS',
      'LINE', 'LINE-COUNTER', 'LINES', 'LINKAGE',
      'LOCK', 'LOW-VALUE', 'LOW-VALUES',
      'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES', 'MOVE', 'MULTIPLE',
      'MULTIPLY',
      'NEGATIVE', 'NEXT', 'NO', 'NOT', 'NUMBER', 'NUMERIC', 'NUMERIC-EDITED',
      'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED',
      'OPEN', 'OPTIONAL', 'OR', 'OUTPUT', 'OVERFLOW',
      'PAGE', 'PAGE-COUNTER', 'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE',
      'PLUS', 'POINTER', 'POSITION', 'POSITIVE', 'PROCEDURE', 'PROCEED',
      'PROGRAM', 'PROGRAM-ID',
      'QUEUE', 'QUOTE', 'QUOTES',
      'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS', 'REDEFINES',
      'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE', 'REMAINDER',
      'RENAMES', 'REPLACE', 'REPLACING', 'REPORT', 'REPORTING', 'REPORTS',
      'RERUN', 'RESERVE', 'RESET', 'RETURN', 'REVERSED', 'REWIND', 'REWRITE',
      'RF', 'RH', 'RIGHT', 'ROUNDED', 'RUN',
      'SAME', 'SD', 'SEARCH', 'SECTION', 'SECURITY', 'SEGMENT', 'SEGMENT-LIMIT',
      'SELECT', 'SEND', 'SENTENCE', 'SEQUENCE', 'SEQUENTIAL', 'SET', 'SIGN', 'SIZE',
      'SORT', 'SOURCE', 'SOURCE-COMPUTER', 'SPECIAL-NAMES', 'STANDARD', 'STATUS',
      'STOP', 'STRING','SUB-QUEUE-1', 'SUB-QUEUE-2', 'SUB-QUEUE-3', 'SUBTRACT',
      'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC', 'SYNCHRONIZED',
      'TABLE', 'TALLY', 'TAPE', 'TERMINAL', 'TERMINATE', 'TEST', 'TEXT', 'THAN',
      'THEN', 'THROUGH', 'THRU', 'TIME', 'TIMES', 'TITLE', 'TO', 'TYPE',
      'UNIT', 'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
      'VALUE', 'VALUES', 'VARYING',
      'WHEN',
      'WITH', 'WORDS', 'WORKING-STORAGE', 'WRITE'
    ]

    # Keywords present only in COBOL-68 (dropped by later standards).
    keywords_68_only = [
      'ACTUAL',
      'FILE-LIMITS',
      'NOMINAL',
      'PROCESSING',
      'NOTE',
      'REMARKS',
      'SEEK',
      'TODAY'
    ]

    # Keywords introduced in COBOL-74 (also valid in COBOL-85).
    keywords_74 = [
      'ALSO',
      'BOTTOM',
      'CODE-SET', 'COLLATING', 'COMMON',
      'DAY', 'DELETE', 'DEBUGGING', 'DUPLICATES', 'DYNAMIC',
      'END-OF-PAGE', 'EOP', 'EXCEPTION',
      'INSPECT',
      'LINAGE', 'LINAGE-COUNTER',
      'NATIVE',
      'ORGANIZATION',
      'PACKED-DECIMAL', 'PADDING', 'PRINTING', 'PROCEDURES',
      'REFERENCES', 'REMOVAL',
      'SEPARATE', 'SORT-MERGE', 'STANDARD-1', 'STANDARD-2', 'START',
      'TALLYING', 'TOP', 'TRAILING'
    ]

    # Keywords introduced in COBOL-85 (scope terminators, EVALUATE, ...).
    keywords_85 = [
      'ALPHABET', 'ANY',
      'BINARY',
      'CONTENT', 'CONTINUE', 'CONVERTING',
      'DAY-OF-WEEK',
      'END-ADD', 'END-CALL', 'END-COMPUTE', 'END-DELETE', 'END-DIVIDE',
      'END-EVALUATE', 'END-IF', 'END-MULTIPLY', 'END-PERFORM', 'END-READ',
      'END-RECEIVE', 'END-RETURN', 'END-REWRITE', 'END-SEARCH', 'END-START',
      'END-STRING', 'END-SUBTRACT', 'END-UNSTRING', 'END-WRITE',
      'EVALUATE', 'EXTERNAL',
      'INITIALIZE',
      'ORDER', 'OTHER',
      'PURGE'
    ]

    # Merge in the year-specific keyword sets.
    # NOTE(review): when year is None, only the common keywords are used.
    if year in ['68', '1968']:
      keywords += keywords_68_only

    if year in ['74', '1974', '85', '1985']:
      keywords += keywords_74

    if year in ['85', '1985']:
      keywords += keywords_85

    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    # Figurative constants and switch values.
    values = [
      'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS',
      'NO', 'OFF', 'ON'
    ]

    values_85 = ['FALSE', 'TRUE']

    if year in ['85', '1985']:
      values += values_85

    value_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    # Embedded EXEC ... END-EXEC blocks (e.g. EXEC SQL) as one token.
    exec_tb = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

    invalid_token_builder = InvalidTokenBuilder()

    # Builder list passed to the tokenizer; presumably earlier/longer
    # matches take precedence — confirm against Tokenizer semantics.
    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      real_tb,
      real_exponent_tb,
      picture_tb,
      cr_picture_tb,
      keyword_tb,
      known_operator_tb,
      groupers_tb,
      value_tb,
      identifier_tb,
      string_tb,
      exec_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)

    # Tokenize, then collapse runs of identical invalid/whitespace tokens.
    tokens = self.tokenize_code(code, tab_size, tokenizer, wide)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'whitespace')

    # COBOL-specific reclassification of numeric tokens.
    self.convert_numbers_to_pictures()
    self.convert_numbers_to_levels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    # Confidence measures: each calc_* call contributes a factor to
    # self.confidences.
    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      # self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # self.calc_operand_n_confidence(tokens, operand_types, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_picture_confidence()

    # Line-length check only applies to fixed-width (non-wide) sources.
    if not wide:
      self.calc_line_length_confidence(code, self.max_expected_line)

    expected_keyword_confidence = self.check_expected_keywords()
    self.confidences['expected_keywords'] = expected_keyword_confidence
# ---- Esempio n. 25 ----
    def __init__(self, code):
        """Tokenize Swift source and compute language-confidence measures.

        Parameters:
            code: the source text to examine.
        """
        super().__init__()

        # Token categories that can appear as operands; appended to as
        # the corresponding token builders are created below.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        # Swift numeric literals allow '_' as a digit separator.
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(True, True, '_')
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
        operand_types.append('number')

        argument_tb = SwiftArgumentTokenBuilder()

        leads = '_'
        extras = '_'
        suffixes = '?'
        identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
        operand_types.append('identifier')

        # '@attribute' markers such as @available, @objc.
        attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)

        symbol_tb = SwiftSymbolTokenBuilder('.', 'symbol', True)
        operand_types.append('symbol')

        # Includes the typographic apostrophe sometimes found in
        # scraped/word-processed sources.
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 10)
        triple_quote_comment_tb = TripleQuoteStringTokenBuilder(quotes)
        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()
        operand_types.append('string')

        known_operators = [
            '+', '-', '*', '/', '%', '==', '!=', '>', '<', '>=', '<=', '&&',
            '||', '!', '&', '|', '^', '~', '<<', '>>', '===', '=', '+=', '-=',
            '*=', '/=', '%=', '<<=', '>>=', '&=', '^=', '|=', '...', '..<',
            '?', ':', '.', '++', '--', '->', '??', '\\.', '&+', '&-', '&*'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '!', '~', '&', '++', '--', ':', '?']

        self.postfix_operators = ['++', '--', ':', '!', '?']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'associatedtype', 'class', 'deinit', 'enum', 'extension',
            'fileprivate', 'func', 'import', 'init', 'inout', 'internal',
            'let', 'open', 'operator', 'private', 'protocol', 'public',
            'static', 'struct', 'subscript', 'typealias', 'var', 'break',
            'case', 'continue', 'default', 'defer', 'do', 'else',
            'fallthrough', 'for', 'guard', 'if', 'in', 'repeat', 'return',
            'switch', 'where', 'while', 'as', 'Any', 'catch', 'is', 'rethrows',
            'super', 'throw', 'throws', 'try', 'try?', 'try!', '#available',
            '#colorLiteral', '#column', '#else', '#elseif', '#endif', '#file',
            '#fileLiteral', '#function', '#if', '#imageLiteral', '#line',
            '#selector', '#sourceLocation', 'associativity', 'convenience',
            'dynamic', 'didSet', 'final', 'get', 'infix', 'indirect', 'lazy',
            'left', 'mutating', 'none', 'nonmutating', 'optional', 'override',
            'postfix', 'precedence', 'prefix', 'Protocol', 'required', 'right',
            'set', 'Type', 'unowned', 'weak', 'willSet'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'char',
            'double',
            'float',
            'int',
            'long',
            'short',
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['nil', 'Self', 'false', 'true']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, argument_tb,
            keyword_tb, types_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, attribute_tb, symbol_tb, string_tb,
            slash_slash_comment_tb, slash_star_comment_tb,
            triple_quote_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Collapse runs of identical invalid tokens before analysis.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')
        # Keywords following '.' are member accesses, not keywords.
        self.convert_keywords_to_identifiers(['.'])

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence measures: each calc_* call contributes a factor.
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # FIX: use a separate list for the 2-operand check instead of
        # rebinding operand_types, which discarded the accumulated
        # categories ('identifier', 'type', 'value', ...) before the
        # 4-operand check.  This matches the sibling examiners, which
        # use operand_types_2 for n=2 and the full list for n=4.
        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
# ---- Esempio n. 26 ----
    def __init__(self, code):
        """Tokenize Haskell source and compute language-confidence measures.

        Parameters:
            code: the source text to examine.
        """
        super().__init__()

        # Token categories that can appear as operands; appended to as
        # the corresponding token builders are created below.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # Haskell allows the prime character in numeric-adjacent names;
        # here "'" is configured as the digit-separator character.
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        identifier_tb = HaskellIdentifierTokenBuilder()
        operand_types.append('identifier')

        class_tb = HaskellClassTokenBuilder()
        operand_types.append('class')

        # Includes the typographic apostrophe sometimes found in
        # scraped/word-processed sources.
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        # '--' line comments and '{- ... -}' block comments.
        line_comment_tb = LeadToEndOfLineTokenBuilder('--', False, 'comment')
        block_comment_tb = BlockTokenBuilder('{-', '-}', 'comment')

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        # User-definable operators are any run of these symbol characters.
        operators_tb = HaskellOperatorTokenBuilder('#$%&*+./<=>?@\\^|-~')

        known_operators = ["'", '..']

        known_operators_tb = CaseInsensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.postfix_operators = ['..', "'"]

        # Haskell reserved words.
        # FIX: 'infix1' -> 'infixl' (the left-associative fixity keyword;
        # 'infix1' with a digit one is not a Haskell reserved word).
        keywords = [
            'case', 'class', 'data', 'deriving', 'do', 'else', 'if', 'import',
            'in', 'infix', 'infixl', 'infixr', 'instance', 'let', 'module',
            'newtype', 'of', 'then', 'type', 'where'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

        values = ['True', 'False', 'Nothing', '_']

        value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, real_tb, real_exponent_tb,
            keyword_tb, groupers_tb, operators_tb, known_operators_tb,
            identifier_tb, value_tb, class_tb, string_tb, line_comment_tb,
            block_comment_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Collapse runs of identical invalid tokens before analysis.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
        HaskellExaminer.convert_keywords_to_identifiers(tokens)
        self.tokens = tokens
        # self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence measures: each calc_* call contributes a factor.
        self.calc_token_confidence()
        # self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            # self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # operand_types_2 = ['number']
        # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # operand_types = ['number', 'string', 'symbol', 'identifier', 'variable']
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# ---- Esempio n. 27 ----
    def __init__(self, code, tab_size, wide):
        """Tokenize PL/1 source and compute language-confidence measures.

        The source is tokenized twice — once as free-format and once as
        fixed-format — and whichever interpretation yields the higher
        overall confidence product is kept.

        Parameters:
            code: the source text to examine.
            tab_size: tab expansion width for the fixed-format pass.
            wide: wide-line flag passed through to tokenize_code().
        """
        super().__init__()

        # Token categories that can appear as operands; appended to as
        # the corresponding token builders are created below.
        self.operand_types = []

        self.whitespace_tb = WhitespaceTokenBuilder()
        self.newline_tb = NewlineTokenBuilder()

        # Numeric literals, including PL/1 binary literals with a 'B'
        # suffix (e.g. 1011B).
        self.integer_tb = IntegerTokenBuilder(None)
        self.integer_exponent_tb = IntegerExponentTokenBuilder(None)
        self.binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False,
                                                             None)
        self.real_tb = RealTokenBuilder(False, False, None)
        self.real_exponent_tb = RealExponentTokenBuilder(
            False, False, 'E', None)
        self.binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'],
                                                       False, None)
        self.operand_types.append('number')

        leads = '_'
        extras = '_'
        self.identifier_tb = IdentifierTokenBuilder(leads, extras)
        self.operand_types.append('identifier')

        # Includes the typographic apostrophe sometimes found in
        # scraped/word-processed sources.
        quotes = ['"', "'", "’"]
        self.string_tb = EscapedStringTokenBuilder(quotes, 0)
        self.operand_types.append('string')

        self.label_tb = PL1LabelTokenBuilder()
        self.operand_types.append('label')

        self.slash_star_comment_tb = SlashStarCommentTokenBuilder()

        # JCL lines may be interleaved with mainframe PL/1 source.
        self.jcl_tb = JCLTokenBuilder()

        # Preprocessor (compile-time) statements.
        directives = [
            '%ACTIVATE', '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY',
            '%DO', '%ELSE', '%END', '%FATAL', '%GOTO', '%IF', '%INCLUDE',
            '%LIST', '%NOLIST', '%PAGE', '%PROCEDURE', '%PROC', '%REPLACE',
            '%RETURN', '%THEN'
        ]

        self.line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        self.preprocessor_tb = CaseInsensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        # Directives that consume the rest of their line.
        self.title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True,
                                                    'preprocessor')
        self.subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True,
                                                       'preprocessor')
        self.error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True,
                                                    'preprocessor')
        self.warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True,
                                                   'preprocessor')
        self.inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True,
                                                     'preprocessor')
        self.terminators_tb = SingleCharacterTokenBuilder(
            ';', 'statement terminator', False)

        # PL/1 spells NOT as '¬', '^', or '~' depending on code page.
        known_operators = [
            '+', '-', '*', '/', '**', '>', '<', '=', '>=', '<=', '¬>', '¬<',
            '¬=', '^>', '^<', '^=', '^', '~>', '~<', '~=', '~', '¬', '&', '&:',
            '|', '|:', '||', '!', '!:', '!!', ':'
        ]

        self.unary_operators = ['+', '-', '^', '~', '¬']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        self.group_starts = ['(', '[', ',', '{']
        self.group_mids = [',']
        self.group_ends = [')', ']', '}']

        self.groupers_tb = CaseInsensitiveListTokenBuilder(
            groupers, 'group', False)

        self.known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'ALLOCATE', 'ALLOC', 'BEGIN', 'CALL', 'CLOSE', 'DECLARE', 'DCL',
            'DO', 'ELSE', 'END', 'FORMAT', 'FREE', 'GET', 'GOTO', 'GO TO',
            'IF', 'LEAVE', 'ON', 'OPEN', 'OTHERWISE', 'OTHER', 'PROCEDURE',
            'PROC', 'PUT', 'READ', 'RETURN', 'REVERT', 'REWRITE', 'SELECT',
            'SIGNAL', 'STOP', 'THEN', 'WHEN', 'WRITE'
        ]

        self.keyword_tb = CaseInsensitiveListTokenBuilder(
            keywords, 'keyword', False)

        # Declaration attributes.
        # FIX: added the missing comma after 'INT' — the original relied
        # on implicit string concatenation and produced the single bogus
        # entry 'INTKEYED', silently dropping both INT and KEYED.
        attributes = [
            'ALIGNED', 'ANY', 'AREA', 'BASED', 'BUILTIN', 'CONDITION', 'COND',
            'CONTROLLED', 'CTL', 'DEFINED', 'DEF', 'DIRECT', 'ENTRY',
            'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT', 'FILE', 'GLOBALDEF',
            'GLOBALREF', 'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
            'KEYED', 'LABEL', 'LIKE', 'LIST', 'MEMBER', 'NONVARYING', 'NONVAR',
            'OPTIONAL', 'OPTIONS', 'OUTPUT', 'PARAMETER', 'PARM', 'PICTURE',
            'PIC', 'POSITION', 'POS', 'PRECISION', 'PREC', 'PRINT', 'READONLY',
            'RECORD', 'REFER', 'RETURNS', 'SEQUENTIAL', 'SEQL', 'STATIC',
            'STREAM', 'STRUCTURE', 'TRUNCATE', 'UNALIGNED', 'UNAL', 'UNION',
            'UPDATE', 'VARIABLE', 'VARYING', 'VAR'
        ]

        self.attributes_tb = CaseInsensitiveListTokenBuilder(
            attributes, 'attribute', False)

        # Built-in functions.
        functions = [
            'ABS', 'ACOS', 'ACTUALCOUNT', 'ADD', 'ADDR', 'ADDREL',
            'ALLOCATION', 'ALLOCN', 'ASIN', 'ATAN', 'ATAND', 'ATANH',
            'AUTOMATIC', 'AUTO', 'BINARY', 'BIN', 'BIT', 'BOOL', 'BYTE',
            'BYTESIZE', 'CEIL', 'CHARACTER', 'CHAR', 'COLLATE', 'COPY', 'COS',
            'COSD', 'COSH', 'DATE', 'DATETIME', 'DECIMAL', 'DEC', 'DECODE',
            'DESCRIPTOR', 'DESC', 'DIMENSION', 'DIM', 'DIVIDE', 'EMPTY',
            'ENCODE', 'ERROR', 'EVERY', 'EXP', 'FIXED', 'FLOAT', 'FLOOR',
            'HBOUND', 'HIGH', 'INDEX', 'INFORM', 'INT', 'LBOUND', 'LENGTH',
            'LINE', 'LINENO', 'LOG', 'LOG10', 'LOG2', 'LOW', 'LTRIM', 'MAX',
            'MAXLENGTH', 'MIN', 'MOD', 'MULTIPLY', 'NULL', 'OFFSET',
            'ONARGSLIST', 'ONCHAR', 'ONCODE', 'ONFILE', 'ONKEY', 'ONSOURCE',
            'PAGENO', 'POINTER', 'PTR', 'POSINT', 'PRESENT', 'PROD', 'RANK',
            'REFERENCE', 'REVERSE', 'ROUND', 'RTRIM', 'SEARCH', 'SIGN', 'SIN',
            'SIND', 'SINH', 'SIZE', 'SOME', 'SQRT', 'STRING', 'SUBSTR',
            'SUBTRACT', 'SUM', 'TAN', 'TAND', 'TANH', 'TIME', 'TRANSLATE',
            'TRIM', 'TRUNC', 'UNSPEC', 'VALID', 'VALUE', 'VAL', 'VARIANT',
            'VERIFY', 'WARN'
        ]

        self.function_tb = CaseInsensitiveListTokenBuilder(
            functions, 'function', True)

        # Edit/format items for GET/PUT EDIT.
        format_items = [
            'A', 'B', 'B1', 'B2', 'B3', 'B4', 'COLUMN', 'COL', 'E', 'F', 'P',
            'R', 'TAB', 'X'
        ]

        self.format_item_tb = CaseSensitiveListTokenBuilder(
            format_items, 'format', True)
        self.operand_types.append('format')

        # OPEN/CLOSE and ENVIRONMENT options (largely VAX PL/1).
        options = [
            'APPEND', 'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT',
            'BLOCK_IO', 'BLOCK_SIZE', 'BUCKET_SIZE', 'BY', 'CANCEL_CONTROL_O',
            'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS', 'CONTIGUOUS_BEST_TRY',
            'CREATION_DATE', 'CURRENT_POSITION', 'DEFAULT_FILE_NAME',
            'DEFERRED_WRITE', 'DELETE', 'EDIT', 'EXPIRATION_DATE',
            'EXTENSION_SIZE', 'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO',
            'FILE_SIZE', 'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
            'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
            'FIXED_LENGTH_RECORDS', 'FROM', 'GROUP_PROTECTION', 'IDENT',
            'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
            'INITIAL_FILL', 'INTO', 'KEY', 'KEYFROM', 'KEYTO', 'LINESIZE',
            'LOCK_ON_READ', 'LOCK_ON_WRITE', 'MAIN PROCEDURE',
            'MANUAL_UNLOCKING', 'MATCH_GREATER', 'MATCH_GREATER_EQUAL',
            'MATCH_NEXT', 'MATCH_NEXT_EQUAL', 'MAXIMUM_RECORD_NUMBER',
            'MAXIMUM_RECORD_SIZE', 'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
            'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
            'NO_ECHO', 'NO_FILTER', 'NO_SHARE', 'OWNER_GROUP', 'OWNER_ID',
            'OWNER_MEMBER', 'OWNER_PROTECTION', 'PAGE', 'PAGESIZE',
            'PRINTER_FORMAT', 'PROMPT', 'PURGE_TYPE_AHEAD', 'READ_AHEAD',
            'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID', 'RECORD_ID_ACCESS',
            'RECORD_ID_TO', 'RECURSIVE', 'REPEAT', 'RESCAN',
            'RETRIEVAL_POINTERS', 'REVISION_DATE', 'REWIND_ON_CLOSE',
            'REWIND_ON_OPEN', 'SCALARVARYING', 'SET READ', 'SHARED_READ',
            'SHARED_WRITE', 'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE',
            'SYSTEM', 'SYSTEM_PROTECTION', 'TEMPORARY', 'TIMEOUT_PERIOD',
            'TITLE', 'TO', 'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
            'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
            'WRITE_CHECK'
        ]

        self.options_tb = CaseInsensitiveListTokenBuilder(
            options, 'option', False)

        # ON-unit condition names.
        conditions = [
            'ANYCONDITION', 'CONVERSION', 'CONV', 'ENDFILE', 'ENDPAGE',
            'FINISH', 'FIXEDOVERFLOW', 'FOFL', 'OVERFLOW', 'OFL', 'STORAGE',
            'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG', 'UNDEFINEDFILE',
            'UNDF', 'VAXCONDITION', 'ZERODIVIDE', 'ZDIV'
        ]

        self.conditions_tb = CaseInsensitiveListTokenBuilder(
            conditions, 'condition', False)

        subroutines = [
            'DISPLAY', 'EXTEND', 'FLUSH', 'NEXT_VOLUME', 'RELEASE', 'RESIGNAL',
            'REWIND', 'SPACEBLOCK'
        ]

        self.subroutines_tb = CaseInsensitiveListTokenBuilder(
            subroutines, 'subroutine', False)

        types = [
            'FIXED', 'BINARY', 'FLOAT', 'DECIMAL', 'BIT', 'CHARACTER',
            'PICTURE'
        ]

        self.types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        self.operand_types.append('type')

        values = ['SYSIN', 'SYSPRINT']

        self.values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        self.operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # tokenize as free-format
        tokenbuilders_free = [
            self.newline_tb, self.whitespace_tb, self.line_continuation_tb,
            self.terminators_tb, self.integer_tb, self.integer_exponent_tb,
            self.binary_integer_tb, self.real_tb, self.real_exponent_tb,
            self.binary_real_tb, self.keyword_tb, self.function_tb,
            self.attributes_tb, self.options_tb, self.conditions_tb,
            self.subroutines_tb, self.types_tb, self.values_tb,
            self.groupers_tb, self.known_operator_tb, self.identifier_tb,
            self.string_tb, self.label_tb, self.slash_star_comment_tb,
            self.preprocessor_tb, self.title_tb, self.subtitle_tb,
            self.error_tb, self.warn_tb, self.inform_tb, self.jcl_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer_free = Tokenizer(tokenbuilders_free)
        tokens_free = tokenizer_free.tokenize(code)
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid operator')
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid')
        self.tokens = tokens_free

        # Compute and stash free-format statistics, then reset so the
        # fixed-format pass starts clean.
        self.calc_statistics()
        statistics_free = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence measures for the free-format interpretation.
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        # Stash free-format confidences/errors and reset for pass two.
        confidences_free = self.confidences
        self.confidences = {}
        errors_free = self.errors
        self.errors = []

        # tokenize as fixed-format
        tokenbuilders_fixed = [
            self.newline_tb, self.whitespace_tb, self.line_continuation_tb,
            self.terminators_tb, self.integer_tb, self.integer_exponent_tb,
            self.binary_integer_tb, self.real_tb, self.real_exponent_tb,
            self.binary_real_tb, self.keyword_tb, self.function_tb,
            self.attributes_tb, self.options_tb, self.conditions_tb,
            self.subroutines_tb, self.types_tb, self.values_tb,
            self.groupers_tb, self.known_operator_tb, self.identifier_tb,
            self.string_tb, self.label_tb, self.slash_star_comment_tb,
            self.preprocessor_tb, self.title_tb, self.subtitle_tb,
            self.error_tb, self.warn_tb, self.inform_tb, self.jcl_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        # Comments split across fixed-format line boundaries need
        # dedicated start/middle/end builders.
        comment_start_tb = PL1CommentStartTokenBuilder()
        comment_middle_tb = PL1CommentMiddleTokenBuilder()
        comment_end_tb = PL1CommentEndTokenBuilder()

        type1_tokenbuilders = [comment_start_tb]
        tokenbuilders_fixed_1 = tokenbuilders_fixed + type1_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

        type2_tokenbuilders = [
            comment_start_tb, comment_middle_tb, comment_end_tb
        ]
        tokenbuilders_fixed_2 = tokenbuilders_fixed + type2_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

        tokens_fixed = self.tokenize_code(code, tab_size, tokenizer_fixed_1,
                                          tokenizer_fixed_2, wide)
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid operator')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'whitespace')
        tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)
        self.tokens = tokens_fixed

        self.calc_statistics()
        statistics_fixed = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence measures for the fixed-format interpretation.
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_fixed = self.confidences
        self.confidences = {}
        errors_fixed = self.errors
        self.errors = []

        # compute confidence for free-format and fixed-format
        # (overall confidence is the product of the individual factors)
        confidence_free = 1.0
        if len(confidences_free) == 0:
            confidence_free = 0.0
        else:
            for key in confidences_free:
                factor = confidences_free[key]
                confidence_free *= factor

        confidence_fixed = 1.0
        if len(confidences_fixed) == 0:
            confidence_fixed = 0.0
        else:
            for key in confidences_fixed:
                factor = confidences_fixed[key]
                confidence_fixed *= factor

        # select the better of free-format and spaced-format
        if confidence_fixed > confidence_free:
            self.tokens = tokens_fixed
            self.statistics = statistics_fixed
            self.confidences = confidences_fixed
            self.errors = errors_fixed
        else:
            self.tokens = tokens_free
            self.statistics = statistics_free
            self.confidences = confidences_free
            self.errors = errors_free
# Esempio n. 28
# 0
    def __init__(self, code):
        """Tokenize the source text and compute language-confidence measures.

        Builds the token-builder set, tokenizes ``code``, post-processes the
        token stream, then runs the inherited ``calc_*`` confidence passes.
        The keyword set (XCALL, READS, WRITES, ENDUSING, INCR) looks like
        DIBOL/DBL -- TODO confirm against the examiner registry.
        """
        super().__init__()
        self.newlines_important = 'always'

        operand_types = []

        # layout tokens
        ws_builder = WhitespaceTokenBuilder()
        nl_builder = NewlineTokenBuilder()

        # numeric literals (no digit-separator character)
        number_builder = IntegerTokenBuilder("")
        operand_types.append('number')

        # identifiers may start with and contain '_' or '$'
        ident_builder = IdentifierTokenBuilder('_$', '_$')
        operand_types.append('identifier')

        text_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
        operand_types.append('string')

        # ';' starts a comment that runs to end of line
        remark_builder = LeadToEndOfLineTokenBuilder(';', False, 'comment')

        directive_words = [
            '.PROC', '.LIST', '.NOLIST', '.PAGE', '.INCLUDE', '.IFDEF',
            '.ENDC', '.IFNDEF', '.END'
        ]
        directive_builder = CaseInsensitiveListTokenBuilder(
            directive_words, 'directive', False)

        continuation_builder = SingleCharacterTokenBuilder(
            '&', 'line continuation', False)
        title_builder = LeadToEndOfLineTokenBuilder('.TITLE', True,
                                                    'directive')

        operator_words = [
            '=', '+', '-', '*', '/', '.EQ.', '.NE.', '.LT.', '.LE.', '.GE.',
            '.GT.', '.NOT.', '.AND.', '.OR.', '.XOR.'
        ]

        self.unary_operators = ['.NOT.', '+', '-']
        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', ':']
        group_starts = ['(', '[', ',']
        group_ends = [')', ']']
        group_mids = [',', ':']

        group_builder = CaseInsensitiveListTokenBuilder(groupers, 'group',
                                                        False)
        operator_builder = CaseSensitiveListTokenBuilder(
            operator_words, 'operator', False)

        keyword_words = [
            'ACCEPT', 'BEGIN', 'BY', 'CALL', 'CLEAR', 'CLOSE', 'COMMON',
            'DELETE', 'DISPLAY', 'DO', 'ELSE', 'END', 'ENDUSING', 'FOR',
            'GOTO', 'IF', 'INCR', 'OPEN', 'PROC', 'READ', 'READS', 'RECORD',
            'RETURN', 'SELECT', 'STORE', 'SUBROUTINE', 'THEN', 'THRU', 'UNTIL',
            'USING', 'WHILE', 'WRITE', 'WRITES', 'XCALL'
        ]
        keyword_builder = CaseSensitiveListTokenBuilder(keyword_words,
                                                        'keyword', False)

        bad_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders win ties during tokenization
        builders = [
            nl_builder, ws_builder, continuation_builder, number_builder,
            keyword_builder, group_builder, operator_builder, ident_builder,
            remark_builder, text_builder, directive_builder, title_builder,
            self.unknown_operator_tb, bad_token_builder
        ]

        tokens = Tokenizer(builders).tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # presumably merges identifier + ':' into label candidates -- helper
        # defined elsewhere in the project
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = Examiner.join_all_lines(self.source_tokens())

        self.calc_token_confidence()
        self.calc_token_2_confidence([])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        self.calc_operand_n_confidence(tokens, ['number'], 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# Esempio n. 29
# 0
    def __init__(self, code):
        """Tokenize the source text and compute language-confidence measures.

        Builds the token-builder set, tokenizes ``code``, then runs the
        inherited ``calc_*`` confidence passes.  The token set (':=', '/=',
        'and then', 'or else', keywords such as 'deferred', 'ensure',
        'invariant') looks like Eiffel -- TODO confirm against the examiner
        registry.
        """
        super().__init__()

        operand_types = []

        ws_builder = WhitespaceTokenBuilder()
        nl_builder = NewlineTokenBuilder()

        # numeric literals; "'" is the digit-separator character
        int_builder = IntegerTokenBuilder("'")
        int_exp_builder = IntegerExponentTokenBuilder("'")
        hex_builder = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF_')
        bin_builder = PrefixedIntegerTokenBuilder('0b', False, '01_')
        oct_builder = PrefixedIntegerTokenBuilder('0c', False, '01234567_')
        real_builder = RealTokenBuilder(False, False, "'")
        real_exp_builder = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        # identifiers may start with and contain '_'
        ident_builder = IdentifierTokenBuilder('_', '_')
        operand_types.append('identifier')

        text_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
        operand_types.append('string')

        # '--' starts a comment that runs to end of line
        remark_builder = LeadToEndOfLineTokenBuilder('--', True, 'comment')

        operator_words = [
            ':=', '=', '/=', '<', '>', '<=', '>=',
            '+', '-', '*', '/', '//', '\\\\', '^',
            '|..|', '..', 'and', 'or', 'xor', 'not',
            'and then', 'or else', 'implies',
            '.', '@', '#', '|', '&'
        ]

        self.unary_operators = ['+', '-', 'not', '@', '#', '|', '&']
        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', ';']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ';', ':']
        group_ends = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(groupers, 'group',
                                                        False)
        operator_builder = CaseSensitiveListTokenBuilder(
            operator_words, 'operator', False)

        keyword_words = [
            'across', 'agent', 'alias', 'all', 'as', 'assign', 'attribute',
            'check', 'class', 'convert', 'create', 'debug', 'deferred', 'do',
            'else', 'elseif', 'end', 'ensure', 'expanded', 'export',
            'external', 'feature', 'from', 'frozen', 'if', 'implies',
            'inherit', 'inspect', 'invariant', 'like', 'local', 'loop', 'note',
            'obsolete', 'old', 'once', 'only', 'redefine', 'rename', 'require',
            'rescue', 'retry', 'select', 'separate', 'then', 'undefine',
            'until', 'variant', 'when'
        ]
        keyword_builder = CaseSensitiveListTokenBuilder(
            keyword_words, 'keyword', False)

        type_builder = CaseSensitiveListTokenBuilder(
            ['Current', 'Precursor', 'Result', 'Void', 'TUPLE'], 'type', True)
        operand_types.append('type')

        value_builder = CaseSensitiveListTokenBuilder(
            ['False', 'True', '?'], 'value', True)
        operand_types.append('value')

        bad_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders win ties during tokenization
        builders = [
            nl_builder, ws_builder, int_builder, int_exp_builder,
            hex_builder, bin_builder, oct_builder, real_builder,
            real_exp_builder, keyword_builder, type_builder, value_builder,
            group_builder, operator_builder, ident_builder, text_builder,
            remark_builder, self.unknown_operator_tb, bad_token_builder
        ]

        tokens = Tokenizer(builders).tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = Examiner.join_all_lines(self.source_tokens())

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        self.calc_operand_n_confidence(tokens, ['number'], 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
# Esempio n. 30
# 0
    def __init__(self, code):
        """Tokenize the source text and compute language-confidence measures.

        Builds the token-builder set, tokenizes ``code``, post-processes the
        token stream, then runs the inherited ``calc_*`` confidence passes.
        The operator set ('?.', '?:', '<=>', '=~', '..<') plus the shebang,
        triple-quote-string, and regex builders look like Groovy -- TODO
        confirm against the examiner registry.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # FIX: Groovy (Java-style) numeric literals use '_' as the digit
        # separator (e.g. 1_000_000), not the C++14-style apostrophe.  Every
        # other numeric builder below already uses '_'; the original "'"
        # here looked like a copy-paste slip from an apostrophe-separator
        # examiner elsewhere in this file.
        integer_tb = IntegerTokenBuilder("_")
        integer_exponent_tb = IntegerExponentTokenBuilder("_")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '_0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(['G', 'L', 'I'],
                                                          False, '_')
        real_tb = RealTokenBuilder(False, False, "_")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "_")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False,
                                                    ['G', 'D', 'F'], False,
                                                    '_')
        operand_types.append('number')

        # identifiers may start with '@' or '_' and contain '_'
        leads = '@_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        regex_tb = RegexTokenBuilder()
        # dollar-slash slash-dollar strings (allow newline)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        shebang_tb = SheBangTokenBuilder()
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            # arithmetic
            '+', '-', '*', '/', '%', '**',
            # assignment / comparison / identity
            '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=', '<=>', '<>',
            # compound assignment
            '+=', '-=', '*=', '/=', '%=', '**=',
            '&=', '|=', '^=', '<<=', '>>=', '>>>=',
            # logical / bitwise / shifts
            '!', '&', '|', '~', '^', '<<', '>>', '>>>', '&&', '||',
            # member access / navigation / spread
            '?.', '?:', '.', '.&', '.@', '::', '*.', '*:',
            # regex match
            '=~', '==~',
            # ranges
            '..', '..<',
            # misc
            '++', '--', '->', '?', '##',
            # word operators
            'as', 'in', '!in', 'instanceof', '!instanceof', 'new',
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        # group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
            'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
            'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
            'package', 'return', 'super', 'switch', 'throw', 'throws', 'trait',
            'try', 'var', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'byte', 'char', 'double', 'float', 'int', 'long', 'short',
            'Java.lang.BigInteger'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['null', 'true', 'false', 'this']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders win ties during tokenization
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            identifier_tb, class_type_tb, string_tb, triple_quote_string_tb,
            regex_tb, slash_slash_comment_tb, slash_star_comment_tb,
            shebang_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        # shebang line at start

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # presumably merges identifier + ':' into label candidates -- helper
        # defined elsewhere in the project
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)