Example #1
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder class listed below.

     The classes are visited in the order given and their return values
     are discarded.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         RegexTokenBuilder,
         PerlIdentifierTokenBuilder,
         PerlDollarCaretIdentifierTokenBuilder,
         PerlQStringTokenBuilder,
         MRegexTokenBuilder,
         SRegexTokenBuilder,
         YRegexTokenBuilder,
         TrRegexTokenBuilder,
         PerlPrototypeTokenBuilder,
         PerlSigilBraceTokenBuilder,
     )
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #2
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder class listed below.

     The classes are visited in the order given and their return values
     are discarded.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         RegexTokenBuilder,
     )
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #3
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder class listed below.

     The classes are visited in the order given and their return values
     are discarded.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         PrefixedIntegerTokenBuilder,
         RegexTokenBuilder,
         SlashSlashCommentTokenBuilder,
         SlashStarCommentTokenBuilder,
     )
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #4
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder class listed below.

     The classes are visited in the order given and their return values
     are discarded.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         TripleQuoteStringTokenBuilder,
         RegexTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         SuffixedIntegerTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         SuffixedRealTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         SheBangTokenBuilder,
         SlashSlashCommentTokenBuilder,
         SlashStarCommentTokenBuilder,
         ClassTypeTokenBuilder,
     )
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #5
0
    def __init__(self, code):
        """Tokenize ``code`` with the Perl token builders and score the result.

        The constructor creates the token builders, tokenizes ``code``,
        post-processes the token list, stores it in ``self.tokens`` and then
        runs the ``calc_*`` confidence methods in a fixed order.

        Args:
            code: Source text to examine (presumably a ``str``). It is passed
                to the tokenizer and to the line-length confidence check.
        """
        super().__init__()

        # Token groups later handed to the 4-operand confidence check.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # NOTE(review): the last argument of the numeric builders appears to be
        # the extra character accepted between digits -- confirm against the
        # builder classes. Perl separates digits with '_' (e.g. 1_000.5),
        # never with an apostrophe, so the real-number builders now pass the
        # same '_' that the integer builders already used.
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        real_tb = RealTokenBuilder(False, False, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        perl_identifier_tb = PerlIdentifierTokenBuilder()
        operand_types.append('identifier')

        # Punctuation variables, tokenized as identifiers.
        specials = [
            '$_', '@_', '$$', '$"', '$(', '$)', '$>', '$<', '$;', '$]', '$[',
            '$&', '$`', "$'", '$+', '@+', '%+', '@-', '%-', '$,', '$.', '$/',
            '$\\', '$|', '$%', '$-', '$:', '$=', '$^', '$~', '$!', '$?', '$@',
            '$#', '$*'
        ]

        specials_tb = CaseInsensitiveListTokenBuilder(specials, 'identifier',
                                                      True)

        dollar_carat_tb = PerlDollarCaretIdentifierTokenBuilder()

        sigilbrace_tb = PerlSigilBraceTokenBuilder()

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        q_string_tb = PerlQStringTokenBuilder()

        regex_tb = RegexTokenBuilder()
        m_regex_tb = MRegexTokenBuilder()
        s_regex_tb = SRegexTokenBuilder()
        y_regex_tb = YRegexTokenBuilder()
        tr_regex_tb = TrRegexTokenBuilder()
        operand_types.append('regex')

        prototype_tb = PerlPrototypeTokenBuilder()

        comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        directives = ['#line']

        preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '**', '/', '%', '=', '==', '!=', '>', '>=', '<',
            '<=', '**=', '+=', '*=', '&=', '&.=', '<<=', '&&=', '-=', '/=',
            '|=', '|.=', '>>=', '||=', '.=', '%=', '^=', '^.=', '//=', 'x=',
            'ne', 'gt', 'ge', 'le', 'lt', 'eq', '!', '&', '|', '~', '<<', '>>',
            '^', '.', '..', '...', '++', '--', '->', '=>', '&&', '||', '?',
            '<->', '<=>', 'and', 'cmp', 'or', 'xor'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '::']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'bless', 'break', 'continue', 'die', 'do', 'else', 'elsif', 'eval',
            'exit', 'exp', 'for', 'foreach', 'if', 'last', 'lock', 'my',
            'next', 'no', 'our', 'package', 'redo', 'return', 'say', 'sub',
            'taint', 'undef', 'unless', 'until', 'use', 'wantarray', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

        values = ['NULL']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # The list order is kept exactly as before; it is handed to the
        # Tokenizer unchanged (presumably it sets builder priority -- confirm).
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, values_tb, groupers_tb,
            known_operator_tb, prototype_tb, identifier_tb, perl_identifier_tb,
            specials_tb, dollar_carat_tb, sigilbrace_tb, string_tb,
            q_string_tb, regex_tb, m_regex_tb, s_regex_tb, y_regex_tb,
            tr_regex_tb, preprocessor_tb, comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        # NOTE(review): the second argument presumably tells the tokenizer to
        # stop at the __END__ marker -- confirm against Tokenizer.tokenize.
        tokens = tokenizer.tokenize(code, ['__END__'])
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        # From here on ``tokens`` is the joined source-token view used by the
        # confidence checks; ``self.tokens`` keeps the full list.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #6
0
    def __init__(self, code):
        """Tokenize ``code`` and compute the confidence scores for it.

        The keyword, type and value lists below look like TypeScript
        (the enclosing class is not visible here -- confirm). The constructor
        creates the token builders, tokenizes ``code``, stores the cleaned
        token list in ``self.tokens`` and runs the ``calc_*`` confidence
        methods in a fixed order.

        Args:
            code: Source text to examine (presumably a ``str``). It is passed
                to the tokenizer and to the line-length confidence check.
        """
        super().__init__()

        # Token groups later handed to the 4-operand confidence check.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        # JavaScript/TypeScript hexadecimal literals start with 0x / 0X; the
        # previous '0H' prefix could never match one.
        # NOTE(review): the False flag presumably makes the prefix match
        # case-insensitively, as for '0O' and '0B' below -- confirm.
        hex_constant_tb = PrefixedIntegerTokenBuilder(
            '0X', False, '0123456789ABCDEFabcdef')
        octal_constant_tb = PrefixedIntegerTokenBuilder(
            '0O', False, '01234567')
        binary_constant_tb = PrefixedIntegerTokenBuilder('0B', False, '01')
        operand_types.append('number')

        leads = '_$'
        extras = '_$'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        terminators_tb = CaseInsensitiveListTokenBuilder(
            [';'], 'statement terminator', False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '===', '!==', '>', '>=',
            '<', '<=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=',
            '>>=', '!', '&', '|', '~', '<<', '>>', '=>', '^', '.', ':', '++',
            '--', '&&', '||', '?', '$', '?.', 'new', 'delete'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = [
            '+', '-', '!', '~', '++', '--', ':', '$', 'new', 'delete'
        ]

        self.postfix_operators = ['++', '--', ':']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        regex_tb = RegexTokenBuilder()

        keywords = [
            'break', 'case', 'catch', 'class', 'const', 'continue', 'debugger',
            'default', 'do', 'else', 'enum', 'export', 'extends', 'finally',
            'for', 'function', 'if', 'import', 'in', 'instanceof', 'return',
            'switch', 'throw', 'try', 'typeof', 'while', 'with', 'as',
            'implements', 'interface', 'let', 'package', 'private',
            'protected', 'public', 'static', 'yield', 'constructor', 'declare',
            'get', 'module', 'require', 'set', 'type', 'from', 'of'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'any', 'boolean', 'byte', 'char', 'number', 'string', 'symbol',
            'void', 'never', 'object'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['this', 'super', 'null', 'true', 'false', 'undefined']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # The list order is kept exactly as before; it is handed to the
        # Tokenizer unchanged (presumably it sets builder priority -- confirm).
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, keyword_tb, types_tb,
            values_tb, known_operator_tb, groupers_tb, regex_tb, identifier_tb,
            string_tb, slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')
        self.convert_keywords_to_identifiers(['.'])

        self.calc_statistics()

        # From here on ``tokens`` is the joined source-token view used by the
        # confidence checks; ``self.tokens`` keeps the full list.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #7
0
    def __init__(self, code):
        """Create the token builders, tokenize ``code`` and score the result.

        The cleaned token list is stored in ``self.tokens``; afterwards the
        ``calc_*`` confidence methods are run in a fixed order.

        Args:
            code: Source text handed to the tokenizer and to the line-length
                confidence check.
        """
        super().__init__()

        self.newlines_important = 'parens'

        # Only the disabled 4-operand check near the end would read this list.
        operand_types = [
            'number', 'identifier', 'symbol', 'string', 'regex', 'value'
        ]

        blank_builder = WhitespaceTokenBuilder()
        eol_builder = NewlineTokenBuilder()
        separator_builder = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        int_builder = IntegerTokenBuilder('_')
        int_exp_builder = IntegerExponentTokenBuilder('_')
        float_builder = RealTokenBuilder(True, True, '_')
        float_exp_builder = RealExponentTokenBuilder(True, True, 'E', '_')

        name_builder = RubyIdentifierTokenBuilder()
        symbol_builder = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
        text_builder = EscapedStringTokenBuilder(['"', "'", "’"], 10)
        pattern_builder = RegexTokenBuilder()
        heredoc_builder = HereDocTokenBuilder('<<-')
        comment_builder = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        operator_builder = CaseSensitiveListTokenBuilder(
            [
                '!', '~', '**', '*', '/', '%', '+', '-', '<<', '>>', '&', '|',
                '^', '<', '<=', '>', '>=', '==', '===', '!=', '=~', '!~',
                '<=>', '&&', '||', '..', '...', '?', ':', '=', '**=', '*=',
                '/=', '%=', '+=', '-=', '<<=', '>>=', '&&=', '&=', '||=',
                '|=', '^=', 'not', 'and', 'or', 'in', '.', '.:', '=>', '::',
                '<<-'
            ],
            'operator', False)

        self.unary_operators = ['+', '-', '!', '~', '&', '*', '**', '<<-']
        self.postfix_operators = ['++', '--']

        opening_groups = ['(', '[', ',', '{']
        middle_groups = [',']
        closing_groups = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

        keyword_builder = CaseSensitiveListTokenBuilder(
            [
                'BEGIN', 'END', 'alias', 'begin', 'break', 'case', 'class',
                'def', 'defined?', 'do', 'else', 'elsif', 'end', 'ensure',
                'for', 'if', 'module', 'next', 'redo', 'rescue', 'retry',
                'return', 'then', 'undef', 'unless', 'until', 'when', 'while',
                'yield'
            ],
            'keyword', False)

        value_builder = CaseSensitiveListTokenBuilder(
            ['nil', 'self', 'true', 'false', 'super'], 'value', True)

        array_marker_builder = CaseSensitiveListTokenBuilder(
            ['%w', '%q', '%Q', '%i', '%s', '%x'], 'identifier', True)

        fallback_builder = InvalidTokenBuilder()

        # Builder order is preserved exactly as the tokenizer receives it.
        tokens = Tokenizer([
            eol_builder,
            blank_builder,
            separator_builder,
            int_builder,
            int_exp_builder,
            float_builder,
            float_exp_builder,
            keyword_builder,
            value_builder,
            symbol_builder,
            operator_builder,
            group_builder,
            pattern_builder,
            name_builder,
            array_marker_builder,
            text_builder,
            heredoc_builder,
            comment_builder,
            self.unknown_operator_tb,
            fallback_builder,
        ]).tokenize(code)

        # Merge runs of invalid operators first, then runs of invalid tokens.
        for group_name in ('invalid operator', 'invalid'):
            tokens = Examiner.combine_adjacent_identical_tokens(
                tokens, group_name)
        self.tokens = tokens

        self.convert_bars_to_groups()
        self.convert_keywords_to_identifiers(['.'])
        self.convert_operators_to_identifiers()

        self.calc_statistics()

        # The confidence checks below work on the joined source-token view.
        source_view = Examiner.join_parens_continued_lines(
            self.source_tokens())
        source_view = Examiner.join_operator_continued_lines(
            source_view, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            self.calc_operator_confidence(operator_count)
            # One list object is shared by the three checks below.
            permitted_pairs = []
            self.calc_operator_2_confidence(source_view, operator_count,
                                            permitted_pairs)
            self.calc_operator_3_confidence(source_view, operator_count,
                                            closing_groups, permitted_pairs)
            self.calc_operator_4_confidence(source_view, operator_count,
                                            opening_groups, permitted_pairs)

        self.calc_group_confidence(source_view, middle_groups)

        self.calc_operand_n_confidence(source_view,
                                       ['number', 'string', 'symbol'], 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(
            ['begin', 'def', 'do', 'class', 'module'], ['end'])

        self.calc_line_length_confidence(code, self.max_expected_line)
Example #8
0
    def __init__(self, code, extension):
        """Create the token builders, tokenize ``code`` and score the result.

        The cleaned token list is stored in ``self.tokens``; afterwards the
        ``calc_*`` confidence methods are run in a fixed order.

        Args:
            code: Source text handed to the tokenizer and to the line-length
                confidence check.
            extension: When equal to ``'gnu'`` the additional variable names
                listed below are also recognised as variables.
        """
        super().__init__()

        # Only the disabled 4-operand check near the end would read this list.
        operand_types = ['number', 'variable', 'regex', 'identifier', 'string']

        blank_builder = WhitespaceTokenBuilder()
        eol_builder = NewlineTokenBuilder()

        int_builder = IntegerTokenBuilder(None)
        int_exp_builder = IntegerExponentTokenBuilder(None)
        float_builder = RealTokenBuilder(False, False, None)
        float_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)

        field_builder = PrefixedIntegerTokenBuilder('$', False, '0123456789')

        variable_names = [
            'ARGC', 'ARGV', 'ENVIRON', 'FILENAME', 'FS', 'NF', 'NR', 'FNR',
            'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
        ]
        if extension == 'gnu':
            variable_names.extend([
                'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'IGNORECASE',
                'LINT', 'PROCINFO', 'TEXTDOMAIN'
            ])

        variable_builder = CaseSensitiveListTokenBuilder(
            variable_names, 'variable', True)

        pattern_builder = RegexTokenBuilder()
        name_builder = IdentifierTokenBuilder('_', '_')
        text_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
        comment_builder = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        continuation_builder = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminator_builder = SingleCharacterTokenBuilder(
            ';', 'statement terminator', False)

        self.unary_operators = ['+', '-', '!', '~', '++', '--']
        self.postfix_operators = ['++', '--']

        opening_groups = ['(', '[', ',', '{']
        middle_groups = [',']
        closing_groups = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

        operator_builder = CaseSensitiveListTokenBuilder(
            [
                '=', '+', '-', '*', '/', '%', '^', '++', '--', '==', '+=',
                '-=', '*=', '/=', '%=', '^=', '!=', '>', '>=', '<', '<=',
                '&&', '||', '|', '!', '?', ':', '~', '!~'
            ],
            'operator', False)

        keyword_builder = CaseSensitiveListTokenBuilder(
            [
                'BEGIN', 'END', 'if', 'else', 'while', 'do', 'for', 'break',
                'continue', 'delete', 'next', 'nextfile', 'function', 'func',
                'exit'
            ],
            'keyword', False)

        fallback_builder = InvalidTokenBuilder()

        # Builder order is preserved exactly as the tokenizer receives it.
        tokens = Tokenizer([
            eol_builder,
            blank_builder,
            continuation_builder,
            terminator_builder,
            int_builder,
            int_exp_builder,
            variable_builder,
            field_builder,
            float_builder,
            float_exp_builder,
            keyword_builder,
            operator_builder,
            group_builder,
            pattern_builder,
            name_builder,
            text_builder,
            comment_builder,
            self.unknown_operator_tb,
            fallback_builder,
        ]).tokenize(code)

        # Merge runs of invalid operators first, then runs of invalid tokens.
        for group_name in ('invalid operator', 'invalid'):
            tokens = Examiner.combine_adjacent_identical_tokens(
                tokens, group_name)
        self.tokens = tokens

        self.calc_statistics()

        # The confidence checks below work on the joined source-token view.
        source_view = Examiner.join_parens_continued_lines(
            self.source_tokens())
        source_view = Examiner.join_operator_continued_lines(
            source_view, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            self.calc_operator_confidence(operator_count)
            # One list object is shared by the three checks below.
            permitted_pairs = []
            self.calc_operator_2_confidence(source_view, operator_count,
                                            permitted_pairs)
            self.calc_operator_3_confidence(source_view, operator_count,
                                            closing_groups, permitted_pairs)
            self.calc_operator_4_confidence(source_view, operator_count,
                                            opening_groups, permitted_pairs)

        self.calc_group_confidence(source_view, middle_groups)

        self.calc_operand_n_confidence(source_view,
                                       ['number', 'variable', 'regex'], 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #9
0
    def __init__(self, code):
        """Tokenize ``code`` and compute the confidence scores for it.

        The operator, keyword and type lists below look like Groovy (the
        enclosing class is not visible here -- confirm). The constructor
        creates the token builders, tokenizes ``code``, post-processes the
        token list, stores it in ``self.tokens`` and runs the ``calc_*``
        confidence methods in a fixed order.

        Args:
            code: Source text to examine (presumably a ``str``). It is passed
                to the tokenizer and to the line-length confidence check.
        """
        super().__init__()

        # Token groups later handed to the 4-operand confidence check.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # NOTE(review): the last argument of the numeric builders appears to be
        # the extra character accepted between digits -- confirm against the
        # builder classes. Every other numeric builder here passes '_', and
        # Groovy separates digits with underscores, so the plain integer
        # builder now passes '_' too instead of an apostrophe.
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '_0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(['G', 'L', 'I'],
                                                          False, '_')
        real_tb = RealTokenBuilder(False, False, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False,
                                                    ['G', 'D', 'F'], False,
                                                    '_')
        operand_types.append('number')

        leads = '@_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        regex_tb = RegexTokenBuilder()
        # dollar-slash slash-dollar strings (allow newline)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        shebang_tb = SheBangTokenBuilder()
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '**', '=', '==', '!=', '===', '!==', '>',
            '>=', '<', '<=', '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=',
            '^=', '<<=', '>>=', '!', '&', '|', '~', '<<', '>>', '>>>', '^',
            '?.', '?:', '<>', '>>>=', '.', '.&', '.@', '::', '=~', '==~',
            '*.', '*:', '..', '..<', '<=>', '++', '--', '->', '&&', '||', '?',
            '##', 'as', 'in', '!in', 'instanceof', '!instanceof', 'new',
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        # group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
            'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
            'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
            'package', 'return', 'super', 'switch', 'throw', 'throws', 'trait',
            'try', 'var', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # NOTE(review): 'Java.lang.BigInteger' is capitalised and names the
        # lang package, whereas the JDK class is java.math.BigInteger; the
        # entry is left unchanged here -- confirm the intended spelling.
        types = [
            'byte', 'char', 'double', 'float', 'int', 'long', 'short',
            'Java.lang.BigInteger'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['null', 'true', 'false', 'this']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # The list order is kept exactly as before; it is handed to the
        # Tokenizer unchanged (presumably it sets builder priority -- confirm).
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            identifier_tb, class_type_tb, string_tb, triple_quote_string_tb,
            regex_tb, slash_slash_comment_tb, slash_star_comment_tb,
            shebang_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        # shebang line at start

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        # From here on ``tokens`` is the joined source-token view used by the
        # confidence checks; ``self.tokens`` keeps the full list.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #10
0
    def __init__(self, code):
        """Tokenize ``code`` and compute the confidence measures for it.

        Builds the set of token builders (numbers, identifiers, strings,
        comments, operators, groupers, keywords, values), runs the tokenizer,
        stores the resulting tokens on ``self.tokens`` and then invokes the
        ``calc_*`` methods that score how well the tokens fit this language.

        NOTE(review): the operator/keyword vocabulary ('isnt', 'unless',
        'yes', 'on', ...) looks like CoffeeScript -- confirm against the
        enclosing class name.

        Args:
            code: source text to examine; it is passed to the tokenizer and
                to the line-length check.
        """
        super().__init__()

        # Token groups that count as operands; filled in as builders are made.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # Numeric literals: plain, exponent, n/N-suffixed, real, and
        # prefixed (0X / 0O / 0B) forms.
        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        big_integer_tb = SuffixedIntegerTokenBuilder(['n', 'N'], False, '_')
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        hex_constant_tb = PrefixedIntegerTokenBuilder(
            '0X', False, '0123456789ABCDEFabcdef')
        octal_constant_tb = PrefixedIntegerTokenBuilder(
            '0O', False, '01234567')
        binary_constant_tb = PrefixedIntegerTokenBuilder('0B', False, '01')
        operand_types.append('number')

        # Identifiers; a lone '$' is also tokenized as an identifier.
        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        dollar_sign_tb = SingleCharacterTokenBuilder('$', 'identifier', False)
        operand_types.append('identifier')

        # Quoted strings and backtick-delimited template strings.
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        template_string_tb = EscapedStringTokenBuilder(['`'], 10)
        operand_types.append('string')

        # '#' starts a comment that runs to the end of the line.
        comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '===', '!==', '>', '>=',
            '<', '<=', '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=', '^=',
            '<<=', '>>=', '!', '&', '|', '~', '<<', '>>', '>>>', '>>>=', '^',
            '**', '->', '=>', '.', ':', '...', '++', '--', '&&', '||', '?',
            '?=', '?.', 'in', 'of', 'is', 'isnt', 'and', 'or', 'not', '@',
            '//', '%%', '::', 'new', 'delete'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # Stored on the instance; not read again inside this method.
        self.unary_operators = [
            '+', '-', '!', '~', '++', '--', ':', 'not', '->', '=>', '.', '@',
            'new', 'delete'
        ]

        self.postfix_operators = ['++', '--', ':', '...']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        # group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        regex_tb = RegexTokenBuilder()
        operand_types.append('regex')

        keywords = [
            'for', 'while', 'loop', 'by', 'break', 'continue', 'if', 'then',
            'else', 'unless', 'switch', 'when', 'default', 'return', 'do',
            'throw', 'try', 'catch', 'finally', 'class', 'extends', 'typeof',
            'instanceof', 'await', 'defer', 'yield', 'export', 'import',
            'package', 'let', 'case', 'debugger', 'function', 'var', 'with',
            'private', 'protected', 'public', 'native', 'static', 'const',
            'implements', 'interface', 'enum'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # The list is matched case-sensitively, so the not-a-number constant
        # must be spelled exactly 'NaN' (it was previously 'Nan', which never
        # matched the real constant).
        values = [
            'true', 'yes', 'on', 'false', 'no', 'off', 'super', 'this',
            'arguments', 'null', 'undefined', 'Infinity', 'NaN'
        ]

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Builder order is kept exactly as it was; it may influence how the
        # tokenizer resolves competing matches.
        tokenbuilders = [
            whitespace_tb, newline_tb, terminators_tb, integer_tb,
            integer_exponent_tb, big_integer_tb, real_tb, real_exponent_tb,
            hex_constant_tb, octal_constant_tb, binary_constant_tb, keyword_tb,
            values_tb, known_operator_tb, groupers_tb, regex_tb, identifier_tb,
            dollar_sign_tb, string_tb, template_string_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Merge runs of adjacent 'invalid operator' and 'invalid' tokens.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        # From here on `tokens` is the joined source-token view used by the
        # confidence checks; self.tokens keeps the full token list.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # Operator checks are skipped when the code has no operators at all.
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # Operand checks: a narrow type list with n=2, then all operand
        # types with n=4.
        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)