Example #1
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         RegexTokenBuilder,
         PerlIdentifierTokenBuilder,
         PerlDollarCaretIdentifierTokenBuilder,
         PerlQStringTokenBuilder,
         MRegexTokenBuilder,
         SRegexTokenBuilder,
         YRegexTokenBuilder,
         TrRegexTokenBuilder,
         PerlPrototypeTokenBuilder,
         PerlSigilBraceTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #2
0
 def __escape_z__():
   """Invoke __escape_z__() on every token-builder class used here.

   Returns the escape marker string.
   """
   builder_classes = (
     InvalidTokenBuilder,
     WhitespaceTokenBuilder,
     NewlineTokenBuilder,
     EscapedStringTokenBuilder,
     PrefixedStringTokenBuilder,
     SuffixedStringTokenBuilder,
     IntegerTokenBuilder,
     IntegerExponentTokenBuilder,
     PrefixedIntegerTokenBuilder,
     SuffixedIntegerTokenBuilder,
     RealTokenBuilder,
     RealExponentTokenBuilder,
     SuffixedRealTokenBuilder,
     IdentifierTokenBuilder,
     PrefixedIdentifierTokenBuilder,
     CaseInsensitiveListTokenBuilder,
     CaseSensitiveListTokenBuilder,
     SingleCharacterTokenBuilder,
     SlashSlashCommentTokenBuilder,
     SlashStarCommentTokenBuilder,
     ClassTypeTokenBuilder,
     HexRealExponentTokenBuilder,
     NestedCommentTokenBuilder,
   )
   # same classes, same order as the original call sequence
   for builder_class in builder_classes:
     builder_class.__escape_z__()
   return 'Escape ?Z'
Example #3
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         PrefixedIntegerTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         ParensLabelTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #4
0
 def __escape_z__():
   """Invoke __escape_z__() on every token-builder class used here.

   Returns the escape marker string.
   """
   builder_classes = (
     InvalidTokenBuilder,
     WhitespaceTokenBuilder,
     NewlineTokenBuilder,
     EscapedStringTokenBuilder,
     PrefixedStringTokenBuilder,
     IntegerTokenBuilder,
     IntegerExponentTokenBuilder,
     PrefixedIntegerTokenBuilder,
     SuffixedIntegerTokenBuilder,
     RealTokenBuilder,
     AssemblyCommentTokenBuilder,
   )
   # same classes, same order as the original call sequence
   for builder_class in builder_classes:
     builder_class.__escape_z__()
   return 'Escape ?Z'
Example #5
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         LeadToEndOfLineTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #6
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         PrefixedIntegerTokenBuilder,
         BlockTokenBuilder,
         BraceCommentTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #7
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         StuffedQuoteStringTokenBuilder,
         IdentifierTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         PrefixedIntegerTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         NullTokenBuilder,
         SqlBracketedIdentifierTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #8
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         StuffedQuoteStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         PrefixedIntegerTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         RemarkTokenBuilder,
         CBasicVariableTokenBuilder,
         CBasicLabelTokenBuilder,
         CBasicSuffixedIntegerTokenBuilder,
         CBasicLineContinuationTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #9
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         PrefixedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         SuffixedIntegerTokenBuilder,
         RealTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         SingleCharacterTokenBuilder,
         LabelTokenBuilder,
         AssemblyCommentTokenBuilder,
         MultilineCommentTokenBuilder,
         HashQuoteCharTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #10
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         SuffixedIdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         PrefixedIdentifierTokenBuilder,
         TripleQuoteStringTokenBuilder,
         SlashSlashCommentTokenBuilder,
         SlashStarCommentTokenBuilder,
         SwiftArgumentTokenBuilder,
         SwiftSymbolTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #11
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     InvalidTokenBuilder.__escape_z__()
     WhitespaceTokenBuilder.__escape_z__()
     NewlineTokenBuilder.__escape_z__()
     EscapedStringTokenBuilder.__escape_z__()
     PrefixedStringTokenBuilder.__escape_z__()
     PrefixedRawStringTokenBuilder.__escape_z__()
     TripleQuoteStringTokenBuilder.__escape_z__()
     IntegerTokenBuilder.__escape_z__()
     IntegerExponentTokenBuilder.__escape_z__()
     PrefixedIntegerTokenBuilder.__escape_z__()
     RealTokenBuilder.__escape_z__()
     RealExponentTokenBuilder.__escape_z__()
     SuffixedRealTokenBuilder.__escape_z__()
     PrefixedIdentifierTokenBuilder.__escape_z__()
     SuffixedIdentifierTokenBuilder.__escape_z__()
     CaseInsensitiveListTokenBuilder.__escape_z__()
     CaseSensitiveListTokenBuilder.__escape_z__()
     SingleCharacterTokenBuilder.__escape_z__()
     # Fixed: a stray trailing comma after this call wrapped its result in a
     # throwaway 1-tuple, inconsistent with every sibling statement.
     LeadToEndOfLineTokenBuilder.__escape_z__()
     NestedCommentTokenBuilder.__escape_z__()
     return 'Escape ?Z'
Example #12
0
 def __escape_z__():
     """Invoke __escape_z__() on every token-builder class used here.

     Returns the escape marker string.
     """
     builder_classes = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         StuffedQuoteStringTokenBuilder,
         IntegerTokenBuilder,
         SuffixedIntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         SuffixedRealTokenBuilder,
         RealExponentTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         PrefixedIntegerTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         NullTokenBuilder,
         BasicVariableTokenBuilder,
         BasicLongVariableTokenBuilder,
         RemarkTokenBuilder,
         UserFunctionTokenBuilder,
         LongUserFunctionTokenBuilder,
         HardwareFunctionTokenBuilder,
     )
     # same classes, same order as the original call sequence
     for builder_class in builder_classes:
         builder_class.__escape_z__()
     return 'Escape ?Z'
Example #13
0
    def __init__(self, code, version):
        """Tokenize MATLAB/Octave source and compute confidence statistics.

        Args:
            code: source text to examine.
            version: dialect selector; 'octave' enables Octave-only
                operators, keywords, and '#'-style comments.
        """
        super().__init__()

        # token categories counted as operands in the confidence checks
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numeric literals; "'" is accepted as a digit-group separator
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # '!'-prefixed shell-command escapes and '?'-prefixed metaclass names
        command_tb = PrefixedIdentifierTokenBuilder('!', 'command', False)

        metaclass_tb = PrefixedIdentifierTokenBuilder('?', 'metaclass', False)

        # the typographic quote catches pasted "smart" quotes
        quotes = ['"', "'", "’"]
        string_tb = MatlabStringTokenBuilder(quotes, False)
        operand_types.append('string')

        # '%' comments are MATLAB; '#' comments are Octave-only (added below)
        line_comment_m_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')
        line_comment_o_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')
        block_comment_m_tb = BlockTokenBuilder('%{', '%}', 'comment')
        block_comment_o_tb = BlockTokenBuilder('#{', '#}', 'comment')

        line_continuation_tb = KeywordTokenBuilder('...', 'line continuation')

        known_operators = [
            '+', '-', '.*', '*', './', '/', '\\', '.^', '^', ".'", "'", '=',
            '==', '~=', '>', '>=', '<', '<=', '&', '|', '&&', '||', '~', '@',
            '.', '.?'
        ]

        operators_octave = [
            '++', '--', '+=', '-=', '*=', '/=', '^=', '!', '!=', '**'
        ]

        if version == 'octave':
            known_operators += operators_octave

        self.unary_operators = ['+', '-', '~', '@']

        # "'" doubles as the transpose postfix operator
        self.postfix_operators = ["'"]

        groupers = ['(', ')', ',', '[', ']', '{', '}', ';', ':']
        group_starts = ['(', '[', ',', '{']
        # group_mids = [',', ';', ':']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'break', 'case', 'catch', 'classdef', 'continue', 'else', 'elseif',
            'end', 'for', 'function', 'global', 'if', 'otherwise', 'parfor',
            'persistent', 'return', 'spmd', 'switch', 'try', 'while'
        ]

        keywords_octave = ['endfor', 'endif', 'endwhile']

        if version == 'octave':
            keywords += keywords_octave

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # Fixed: the original ['inf', 'Nan'] misspelled NaN, so with the
        # case-sensitive builder below neither 'NaN' nor 'nan' ever matched.
        # Both MATLAB casings of each constant are now recognized.
        values = ['Inf', 'inf', 'NaN', 'nan']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders win ties during tokenization
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, values_tb, groupers_tb,
            known_operator_tb, identifier_tb, command_tb, metaclass_tb,
            string_tb, line_comment_m_tb, block_comment_m_tb
        ]

        tokenbuilders_2 = [line_comment_o_tb, block_comment_o_tb]

        if version == 'octave':
            tokenbuilders += tokenbuilders_2

        # catch-alls go last so real builders get first chance
        tokenbuilders_9 = [self.unknown_operator_tb, invalid_token_builder]

        tokenbuilders += tokenbuilders_9

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        # self.calc_group_confidence(tokens, group_mids)

        # operand_types_2 = ['number']
        # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #14
0
    def __init__(self, code, block_comment_limit):
        """Tokenize Rust source and compute confidence statistics.

        Args:
            code: source text to examine.
            block_comment_limit: nesting limit passed to the nested
                /* */ comment builder.
        """
        super().__init__()

        # token categories counted as operands in the confidence checks
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        # numeric literals; '_' is the digit-group separator
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(False, True, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', '_')
        octal_integer_tb = PrefixedIntegerTokenBuilder('0o', True, '01234567_')
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', True, '0123456789ABCDEFabcdef_')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', True, '01_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # lifetimes ('a) look like identifiers led by an apostrophe
        lifetime_tb = IdentifierTokenBuilder("'", extras)

        attribute_tb = RustAttributeTokenBuilder()

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 10)
        bstring_tb = PrefixedStringTokenBuilder('b', True, quotes)
        rstring_tb = RustRawStringTokenBuilder()
        operand_types.append('string')

        char_tb = SingleCharStringTokenBuilder()

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = NestedCommentTokenBuilder(
            '/*', '*/', block_comment_limit)

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        # Fixed: '|-' is not a Rust operator; the or-assign operator '|='
        # was intended (cf. the neighboring '&=', '^=', '<<=').
        known_operators = [
            '+', '-', '*', '/', '%', '^', '!', '&', '|', '&&', '||', '<<',
            '>>', '+=', '-=', '*=', '/=', '%=', '^=', '&=', '|=', '<<=', '>>=',
            '=', '==', '!=', '>', '<', '>=', '<=', '@', '.', '..', '...', '->',
            '#', '$', '?', 'in', '&mut'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '&mut']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::', '=>']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '::', '=>']
        # ')|' presumably relates to closure-bar handling in
        # convert_bars_to_groups below — TODO confirm
        group_ends = [')', ']', '}', ')|']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # Fixed: missing comma after 'crate' made Python concatenate the
        # adjacent literals into 'crateelse', losing both keywords.
        keywords = [
            'as', 'break', 'const', 'continue', 'crate',
            'else', 'enum', 'extern', 'fn', 'for', 'if', 'impl', 'let', 'loop',
            'match', 'mod', 'move', 'mut', 'pub', 'ref', 'return', 'static',
            'struct', 'trait', 'type', 'unsafe', 'use', 'where', 'while'
        ]

        keywords_2018 = ['dyn', 'union', 'static']

        keywords_future = [
            'abstract', 'become', 'box', 'do', 'final', 'macro', 'override',
            'priv', 'typeof', 'unsized', 'virtual', 'yield', 'async', 'await',
            'try'
        ]

        keywords += keywords_2018
        keywords += keywords_future

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'Self', 'u8', 'i8', 'u16', 'i16', 'u32', 'i32', 'u64', 'i64',
            'u128', 'i128', 'usize', 'isize', 'f32', 'f64'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['self', 'true', 'false', 'super', '_']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders win ties; catch-alls go last
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, octal_integer_tb, hex_integer_tb,
            binary_integer_tb, real_tb, real_exponent_tb, keyword_tb, types_tb,
            values_tb, groupers_tb, known_operator_tb, identifier_tb, char_tb,
            lifetime_tb, class_type_tb, attribute_tb, string_tb, bstring_tb,
            rstring_tb, slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = self.combine_numbers_and_adjacent_types(tokens)
        self.convert_operators_to_identifiers()
        self.convert_bars_to_groups()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #15
0
    def __init__(self, code, extension):
        """Tokenize Awk-style source (BEGIN/END, NR/NF built-ins) and
        compute confidence statistics.

        Args:
            code: source text to examine.
            extension: dialect selector; 'gnu' adds gawk-specific
                built-in variables.
        """
        super().__init__()

        # token categories counted as operands in the confidence checks
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numeric literals; None means no digit-group separator
        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        operand_types.append('number')

        # positional field variables: $0, $1, ...
        num_variable_tb = PrefixedIntegerTokenBuilder('$', False, '0123456789')
        operand_types.append('variable')

        known_variables = [
            'ARGC',
            'ARGV',
            'ENVIRON',
            'FILENAME',
            'FS',
            'NF',
            'NR',
            'FNR',
            'OFMT',
            'OFS',
            'ORS',
            'RLENGTH',
            'RS',
            'RSTART',
            'SUBSEP',
        ]

        known_variables_gnu = [
            'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'IGNORECASE', 'LINT',
            'PROCINFO', 'TEXTDOMAIN'
        ]

        if extension == 'gnu':
            known_variables += known_variables_gnu

        variable_tb = CaseSensitiveListTokenBuilder(known_variables,
                                                    'variable', True)

        regex_tb = RegexTokenBuilder()
        operand_types.append('regex')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # the typographic quote catches pasted "smart" quotes
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        hash_comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '=', '+', '-', '*', '/', '%', '^', '++', '--', '==', '+=', '-=',
            '*=', '/=', '%=', '^=', '!=', '>', '>=', '<', '<=', '&&', '||',
            '|', '!', '?', ':', '~', '!~'
        ]

        self.unary_operators = ['+', '-', '!', '~', '++', '--']

        self.postfix_operators = [
            '++',
            '--',
        ]

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'BEGIN', 'END', 'if', 'else', 'while', 'do', 'for', 'break',
            'continue', 'delete', 'next', 'nextfile', 'function', 'func',
            'exit'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders win ties; catch-alls go last
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, variable_tb, num_variable_tb,
            real_tb, real_exponent_tb, keyword_tb, known_operator_tb,
            groupers_tb, regex_tb, identifier_tb, string_tb, hash_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        # rejoin lines split by parens or trailing operators before the
        # operator/operand confidence passes
        tokens = self.source_tokens()
        tokens = Examiner.join_parens_continued_lines(tokens)
        tokens = Examiner.join_operator_continued_lines(
            tokens, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'variable', 'regex']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #16
0
    def __init__(self, code):
        """Tokenize Kotlin-style source ('fun', 'val', 'typealias', UInt
        types) and compute confidence statistics.

        Args:
            code: source text to examine.
        """
        super().__init__()

        # token categories counted as operands in the confidence checks
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numeric literals; '_' is the digit-group separator
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789ABCDEFabcdef_')
        real_tb = RealTokenBuilder(True, True, '_')
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # '@'-prefixed annotations/labels
        decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)
        operand_types.append('decorator')

        # the typographic quote catches pasted "smart" quotes
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        class_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '+=', '-=', '*=', '/=', '%=', '++',
            '--', '&&', '||', '!', '==', '!=', '===', '!==', '<', '>', '<=',
            '>=', '!!', '?.', '?:', '::', '..', ':', '?', '.'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '!', '*', '++', '--']

        self.postfix_operators = ['++', '--', ':']

        groupers = ['->', '(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = ['->', ',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        # hard and soft/modifier keywords, treated uniformly
        keywords = [
            'as', 'as?', 'break', 'class', 'continue', 'do', 'else', 'for',
            'fun', 'if', 'in', '!in', 'is', '!is', 'object', 'package',
            'return', 'super', 'throw', 'try', 'typealias', 'typeof', 'val',
            'var', 'when', 'while', 'by', 'catch', 'constructor', 'delegate',
            'dynamic', 'field', 'file', 'finally', 'get', 'import', 'init',
            'param', 'property', 'receiver', 'set', 'setparam', 'where',
            'actual', 'abstract', 'annotation', 'companion', 'const',
            'crossinline', 'data', 'enum', 'expect', 'external', 'final',
            'infix', 'inline', 'inner', 'internal', 'lateinit', 'noinline',
            'open', 'operator', 'out', 'override', 'private', 'protected',
            'public', 'reified', 'sealed', 'suspend', 'tailrec', 'vararg'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # NOTE(review): the lowercase 'u', 'f', 'ul' entries look like
        # literal suffixes rather than type names — confirm intent
        types = [
            'Byte', 'Short', 'Int', 'Long', 'Float', 'Double', 'Char', 'u',
            'f', 'ul', 'UInt', 'ULong', 'UByte', 'UShort'
        ]

        type_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'null', 'this', 'true']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders win ties; catch-alls go last
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, real_tb, real_exponent_tb,
            keyword_tb, type_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, class_tb, decorator_tb, string_tb,
            triple_quote_string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = self.combine_numbers_and_adjacent_types(tokens)

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
    def __init__(self, code, tab_size):
        """Examine assembly source two ways and keep the better result.

        The code is tokenized once as free-format text and once as
        space/column-format assembly; statistics and confidence values are
        computed for each pass, and whichever pass yields the higher overall
        confidence (product of its confidence factors) is kept.

        Args:
            code: source text to examine.
            tab_size: tab width passed to the space-format asm tokenizer.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numbers: "'" is accepted as a digit separator; hex integers may be
        # written with '$', '#$', or '&' prefixes or an 'h' suffix
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        real_tb = RealTokenBuilder(True, True, None)
        hex_integer_1_tb = PrefixedIntegerTokenBuilder(
            '$', False, '0123456789abcdefABCDEF')
        hex_integer_2_tb = PrefixedIntegerTokenBuilder(
            '#$', False, '0123456789abcdefABCDEF')
        hex_integer_3_tb = PrefixedIntegerTokenBuilder(
            '&', False, '0123456789abcdefABCDEF')
        hex_integer_h_tb = SuffixedIntegerTokenBuilder(['h'], False,
                                                       'abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(
            ['Q', 'A', 'O', 'D', 'B'], False, None)
        operand_types.append('number')

        # identifiers may begin with and contain '_', '$', '#', and '.'
        leads = '_$#.'
        extras = '_$#.'
        identifier_tb = IbmAsmIdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # strings: plain quoted, plus X'...' and C'...' prefixed forms
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        hex_string_tb = PrefixedStringTokenBuilder('X', False, quotes)
        char_string_tb = PrefixedStringTokenBuilder('C', False, quotes)
        operand_types.append('string')

        known_operators = ['+', '-', '*', '/', '=', '&', '#', '?']

        self.unary_operators = ['+', '-', '=', '&', '#', '?']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
        group_starts = ['(', '[', ',', '{', '<']
        group_ends = [')', ']', '}', '>']
        group_mids = [',']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # keywords = []

        # keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # types = []

        # types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)

        # '*' alone can be a value (e.g. current location counter);
        # convert_values_to_operators below reclassifies it when it is
        # used as an operator
        values = ['*']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        # ';*' presumably lists the accepted comment lead characters —
        # confirm against AssemblyCommentTokenBuilder
        comment_tb = AssemblyCommentTokenBuilder(';*')

        # TITLE/SUBTTL/INCLUDE consume the rest of their line as a directive
        title_directive_tb = LeadToEndOfLineTokenBuilder(
            'TITLE', False, 'directive')
        subtitle_directive_tb = LeadToEndOfLineTokenBuilder(
            'SUBTTL', False, 'directive')
        include_directive_tb = LeadToEndOfLineTokenBuilder(
            'INCLUDE', False, 'directive')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, integer_exponent_tb,
            hex_integer_1_tb, hex_integer_2_tb, hex_integer_3_tb,
            hex_integer_h_tb, binary_integer_tb, suffixed_integer_tb, real_tb,
            values_tb, groupers_tb, known_operator_tb, title_directive_tb,
            subtitle_directive_tb, include_directive_tb, identifier_tb,
            string_tb, hex_string_tb, char_string_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        # separate tokenizers for the opcode field and the argument field
        # of space-format lines
        opcode_tokenbuilders = [identifier_tb, invalid_token_builder]

        args_tokenbuilders = [
            integer_tb, integer_exponent_tb, hex_integer_1_tb,
            hex_integer_2_tb, hex_integer_3_tb, hex_integer_h_tb,
            binary_integer_tb, suffixed_integer_tb, real_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, string_tb,
            hex_string_tb, char_string_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
        args_tokenizer = Tokenizer(args_tokenbuilders)

        # tokenize as free-format
        tokens_free = tokenizer.tokenize(code)
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid operator')
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid')
        tokens_free = Examiner.convert_values_to_operators(
            tokens_free, known_operators)
        self.tokens = tokens_free
        self.convert_asm_identifiers_to_labels()

        # compute statistics for the free-format pass, then clear the
        # instance fields so the next pass starts fresh
        self.calc_statistics()
        statistics_free = self.statistics
        self.statistics = {}

        self.calc_confidences(operand_types, group_starts, group_mids,
                              group_ends, None)
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences_free = self.confidences
        self.confidences = {}
        errors_free = self.errors
        self.errors = []

        # tokenize as space-format
        opcode_extras = '.&=,()+-*/'
        label_leads = '.&$@'
        label_mids = '.&$#@'
        label_ends = ':,'
        comment_leads = '*;!'
        line_comment_leads = ''
        use_line_id = False
        tokens_space, indents = Tokenizer.tokenize_asm_code(
            code, tab_size, opcode_tokenizer, opcode_extras, args_tokenizer,
            label_leads, label_mids, label_ends, comment_leads,
            line_comment_leads, use_line_id)
        tokens_space = Examiner.combine_adjacent_identical_tokens(
            tokens_space, 'invalid operator')
        tokens_space = Examiner.combine_adjacent_identical_tokens(
            tokens_space, 'invalid')
        tokens_space = Examiner.combine_identifier_colon(
            tokens_space, ['newline'], [], [])
        tokens_space = Tokenizer.combine_number_and_adjacent_identifier(
            tokens_space)
        tokens_space = Examiner.convert_values_to_operators(
            tokens_space, known_operators)
        self.tokens = tokens_space
        self.convert_asm_identifiers_to_labels()

        # compute statistics for the space-format pass, saved the same way
        self.calc_statistics()
        statistics_space = self.statistics
        self.statistics = {}

        self.calc_confidences(operand_types, group_starts, group_mids,
                              group_ends, indents)
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences_space = self.confidences
        self.confidences = {}
        errors_space = self.errors
        self.errors = []

        # select the better of free-format and spaced-format
        # (overall confidence is the product of the individual factors)
        confidence_free = 1.0
        for key in confidences_free:
            factor = confidences_free[key]
            confidence_free *= factor

        confidence_space = 1.0
        for key in confidences_space:
            factor = confidences_space[key]
            confidence_space *= factor

        if confidence_space > confidence_free:
            self.tokens = tokens_space
            self.statistics = statistics_space
            self.confidences = confidences_space
            self.errors = errors_space
        else:
            self.tokens = tokens_free
            self.statistics = statistics_free
            self.confidences = confidences_free
            self.errors = errors_free
Example #18
0
    def __init__(self, code, block_comment_limit):
        """Tokenize Julia source and compute token/operator/operand
        confidence statistics.

        Args:
            code: source text to examine.
            block_comment_limit: nesting limit passed to the '#= =#'
                nested-comment token builder.
        """
        super().__init__()
        # newlines are significant except inside parentheses
        self.newlines_important = 'parens'

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numbers: 'im'/'cx' suffixes mark imaginary/complex literals
        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        real_tb = RealTokenBuilder(False, False, None)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
        imaginary_tb = SuffixedRealTokenBuilder(False, False, ['im', 'cx'],
                                                True, None)
        operand_types.append('number')

        # identifiers may carry a trailing '!' (mutating-function convention)
        leads = '_'
        extras = '_'
        suffixes = '!'
        identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
        operand_types.append('identifier')

        # :name symbols and @name macro/attribute references
        symbol_tb = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
        operand_types.append('symbol')

        attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
        operand_types.append('attribute')

        dollar_sign_tb = SingleCharacterTokenBuilder('$', 'identifier', True)

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        raw_string_tb = PrefixedRawStringTokenBuilder('raw', True, quotes)
        b_string_tb = PrefixedStringTokenBuilder('b', True, quotes)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        # '#' line comments and nestable '#= =#' block comments
        comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')
        nested_comment_tb = NestedCommentTokenBuilder('#=', '=#',
                                                      block_comment_limit)

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            'where', 'in', 'isa', '′', "'", '+', '-', '*', '/', '\\', '^', '%',
            '//', '<<', '>>', '<<<', '>>>', ':', '=', '==', '!=', '===', '!==',
            '+=', '-=', '*=', '/=', '^=', '%=', '<', '>', '<=', '>=', '~', '&',
            '|', '!', '&&', '||', '?', '.', '<:', '>:', '::', '->', '...',
            '..', '∀', '≤', '≥', '⊻', '⊽', '⊼'
        ]

        # 0x391 through 0x3a9 (capital)
        # 0x3b1 through 0x3c9 (small)
        greek_letters = [
            'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν',
            'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω'
        ]

        greek_letter_tb = CaseSensitiveListTokenBuilder(
            greek_letters, 'identifier', True)

        self.unary_operators = [
            'isa', '+', '-', '~', '!', '.', ':', '::', "'", '<:', '>:', 'in',
            '..'
        ]

        self.postfix_operators = ['...', '′']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        # group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'baremodule', 'begin', 'break', 'catch', 'const', 'continue', 'do',
            'else', 'elseif', 'end', 'export', 'finally', 'for', 'function',
            'global', 'if', 'import', 'let', 'local', 'macro', 'module',
            'quote', 'return', 'struct', 'try', 'using', 'while', 'abstract',
            'mutable', 'primitive', 'type'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'Int8', 'UInt8', 'Int16', 'UInt16', 'Int32', 'UInt32', 'Int64',
            'UInt64', 'Int128', 'UInt128', 'Float16', 'Float32', 'Float64',
            'Bool', 'Char'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'true']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders take precedence in the tokenizer
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, real_tb,
            real_exponent_tb, imaginary_tb, keyword_tb, types_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, symbol_tb,
            attribute_tb, dollar_sign_tb, greek_letter_tb, string_tb,
            raw_string_tb, b_string_tb, triple_quote_string_tb, comment_tb,
            nested_comment_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = JuliaExaminer.split_symbols_to_operators_identifiers(
            tokens, group_ends)
        self.tokens = tokens
        self.convert_keywords_to_identifiers()

        self.calc_statistics()

        # join continued lines before computing operator/operand confidences
        tokens = self.source_tokens()
        tokens = Examiner.join_parens_continued_lines(tokens)
        tokens = Examiner.join_operator_continued_lines(
            tokens, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'identifier', 'symbol']
        self.calc_operand_confidence(tokens, operand_types_2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #19
0
    def __init__(self, code):
        """Tokenize Groovy source and compute token/operator/operand
        confidence statistics.

        Args:
            code: source text to examine.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # NOTE(review): integer_tb uses "'" as digit separator while the
        # exponent/real/suffixed builders use '_' (the Groovy literal
        # separator) — confirm "'" is intended here
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("_")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '_0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder([
            'G',
            'L',
            'I',
        ], False, '_')
        real_tb = RealTokenBuilder(False, False, "_")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "_")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False,
                                                    ['G', 'D', 'F'], False,
                                                    '_')
        operand_types.append('number')

        leads = '@_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        regex_tb = RegexTokenBuilder()
        # dollar-slash slash-dollar strings (allow newline)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        shebang_tb = SheBangTokenBuilder()
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        # one operator per line for easier maintenance of this long list
        known_operators = [
            '+',
            '-',
            '*',
            '/',
            '%',
            '**',
            '=',
            '==',
            '!=',
            '===',
            '!==',
            '>',
            '>=',
            '<',
            '<=',
            '+=',
            '-=',
            '*=',
            '/=',
            '%=',
            '**=',
            '&=',
            '|=',
            '^=',
            '<<=',
            '>>=',
            '!',
            '&',
            '|',
            '~',
            '<<',
            '>>',
            '>>>',
            '^',
            '?.',
            '?:',
            '<>',
            '>>>=',
            '.',
            '.&',
            '.@',
            '::',
            '=~',
            '==~',
            '*.',
            '*:',
            '..',
            '..<',
            '<=>',
            '++',
            '--',
            '->',
            '&&',
            '||',
            '?',
            '##',
            'as',
            'in',
            '!in',
            'instanceof',
            '!instanceof',
            'new',
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        # group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
            'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
            'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
            'package', 'return', 'super', 'switch', 'throw', 'throws', 'trait',
            'try', 'var', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'byte', 'char', 'double', 'float', 'int', 'long', 'short',
            'Java.lang.BigInteger'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['null', 'true', 'false', 'this']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders take precedence in the tokenizer
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            identifier_tb, class_type_tb, string_tb, triple_quote_string_tb,
            regex_tb, slash_slash_comment_tb, slash_star_comment_tb,
            shebang_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        # shebang line at start

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # merge 'identifier:' pairs into labels
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #20
0
    def __init__(self, code):
        """Tokenize Scala source and compute token/operator/operand
        confidence statistics.

        Args:
            code: source text to examine.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numbers: "'" is the accepted digit separator; 'L' and 'f'
        # suffixes mark long and float literals
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF_')
        long_integer_tb = SuffixedIntegerTokenBuilder('L', False, None)
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        float_real_tb = SuffixedRealTokenBuilder(False, False, ['f'], False,
                                                 None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # Scala symbol literals: 'name
        symbol_tb = PrefixedIdentifierTokenBuilder("'", 'symbol', True)
        operand_types.append('symbol')

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        # '⇒'/'←' are the Unicode forms of '=>'/'<-'
        # (fix: the original list contained '=' twice)
        known_operators = [
            '+', '-', '*', '/', '%', '&', '|', '^', '<<', '>>', '&&', '||',
            '=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
            '>:', '⇒', '=>', '<%', '<:', '←', '<-', '#', '@', '==', '!=',
            '>', '<', '>=', '<=', '!', '~', '<<<', '>>>', '.', '++', '--',
            'new'
        ]

        self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'abstract', 'case', 'catch', 'class', 'def', 'do', 'else',
            'extends', 'final', 'finally', 'for', 'forSome', 'if', 'implicit',
            'import', 'lazy', 'match', 'object', 'override', 'package',
            'private', 'protected', 'return', 'sealed', 'then', 'throw',
            'trait', 'try', 'type', 'val', 'var', 'while', 'with', 'yield'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        values = ['false', 'true', 'null', 'this', 'super']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders take precedence in the tokenizer
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, long_integer_tb,
            real_tb, real_exponent_tb, float_real_tb, keyword_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, symbol_tb,
            string_tb, triple_string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #21
0
    def __init__(self, code):
        """Tokenize Lua source and compute token/operator/operand
        confidence statistics.

        Args:
            code: source text to examine.
        """
        super().__init__()
        # newlines are significant except inside parentheses
        self.newlines_important = 'parens'

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numbers: "'" is the accepted digit separator
        # NOTE(review): standard Lua has no 0b binary literals — confirm
        # the binary builder is intended (e.g. for a Lua dialect)
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # strings: quoted plus [[...]] long-bracket strings
        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        bracket_string_tb = DoubleBracketStringTokenBuilder()
        operand_types.append('string')

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        # fix: added '%' (modulo, Lua 5.1+) and '//' (floor division,
        # Lua 5.3+), which the original list omitted; without them those
        # operators were classified as invalid
        known_operators = [
            '+', '-', '*', '/', '//', '%', '^', '<', '>', '<=', '>=', '==',
            '~=', '=', '..', '.', '#', ':', 'and', 'not', 'or'
        ]

        self.unary_operators = ['+', '-', '#', 'not']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        # group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'break', 'do', 'else', 'elseif', 'end', 'for', 'function', 'if',
            'in', 'local', 'repeat', 'return', 'then', 'until', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        values = ['false', 'true', 'nil', '...']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        # '--' line comments and '--[[ ]]' block comments
        line_comment_tb = LeadToEndOfLineTokenBuilder('--', True, 'comment')
        block_comment_tb = LuaBlockCommentTokenBuilder()

        invalid_token_builder = InvalidTokenBuilder()

        # order matters: earlier builders take precedence in the tokenizer
        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, values_tb, groupers_tb,
            known_operator_tb, identifier_tb, string_tb, bracket_string_tb,
            line_comment_tb, block_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        # join continued lines before computing operator/operand confidences
        tokens = self.source_tokens()
        tokens = Examiner.join_parens_continued_lines(tokens)
        tokens = Examiner.join_operator_continued_lines(
            tokens, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'identifier']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #22
0
    def __init__(self, code):
        """
        Tokenize the given source text and compute confidence statistics.

        'code' is the raw source text.  Tokens are stored on self.tokens
        and the various calc_* helpers record confidence values on the
        examiner instance.
        """
        super().__init__()

        # token categories that may act as operands
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # numeric literals; "'" is accepted as a digit-group separator
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # '@'-prefixed annotations
        annotation_tb = PrefixedIdentifierTokenBuilder('@', 'annotation',
                                                       False)
        operand_types.append('annotation')

        # '#'-prefixed symbol literals
        symbol_tb = PrefixedIdentifierTokenBuilder('#', 'symbol', True)
        operand_types.append('symbol')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '~/', '%', '^', '=', '==', '!=', '>', '>=',
            '<', '<=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=',
            '>>=', '!', '&', '|', '~', '<<', '>>', '~/=', '||', '&&', '.',
            '..', ':', '?', '??', '??=', 'as', 'is', 'is!', '++', '--', 'new'
        ]

        # Fix: a comma was missing after '..'; implicit string-literal
        # concatenation silently merged '..' and '?.' into the bogus
        # entry '..?.', dropping both real operators from the list.
        self.unary_operators = [
            '+', '-', '*', '!', '~', '.', '..',
            '?.', '++', '--', 'new'
        ]

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'abstract', 'assert', 'async', 'await', 'break', 'case', 'catch',
            'class', 'const', 'continue', 'covariant', 'default', 'deferred',
            'do', 'dynamic', 'else', 'enum', 'export', 'extends', 'external',
            'factory', 'final', 'finally', 'for', 'Function', 'get', 'hide',
            'if', 'implements', 'import', 'in', 'interface', 'library',
            'mixin', 'on', 'operator', 'part', 'rethrow', 'return', 'set',
            'show', 'static', 'switch', 'sync', 'throw', 'try', 'typedef',
            'var', 'void', 'while', 'with', 'yield'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = ['int', 'double', 'String', 'List', 'bool', 'void']

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'true', 'null', 'this', 'super']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably sets tokenizer precedence —
        # confirm against Tokenizer before reordering.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            real_tb, real_exponent_tb, keyword_tb, types_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, annotation_tb,
            symbol_tb, class_type_tb, string_tb, raw_string_tb,
            slash_slash_comment_tb, slash_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # merge runs of invalid tokens so they count once
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-sequence confidences only make sense when operators exist
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #23
0
  def __init__(self, code, block_comment_limit):
    """
    Tokenize the given source text and compute confidence statistics.

    'code' is the raw source text; 'block_comment_limit' bounds nesting
    of '/+ ... +/' block comments (passed to NestedCommentTokenBuilder).
    """
    super().__init__()

    # token categories that may act as operands
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # numeric literals; "'" is accepted as a digit-group separator,
    # '_' is allowed inside hex/binary digit runs
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['U', 'L', 'LU', 'UL'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l', 'i'], False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    hex_real_tb = HexRealExponentTokenBuilder()
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # '@'-prefixed attributes
    attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    operand_types.append('attribute')

    # string suffix: c,w,d
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    r_string_tb = PrefixedStringTokenBuilder('r', True, quotes)
    backtick_string_tb = EscapedStringTokenBuilder(['`'], 0)
    x_string_tb = PrefixedStringTokenBuilder('x', True, quotes)
    q_string_tb = PrefixedStringTokenBuilder('q', True, quotes)
    # q{} string
    cwd_string_tb = SuffixedStringTokenBuilder(quotes, 'cwd', False)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    # '//', '/* */', and nestable '/+ +/' comments
    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    slash_plus_comment_tb = NestedCommentTokenBuilder('/+', '+/', block_comment_limit)

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
      '/', '/=', '.', '..', '...', '&', '&=', '&&', '|', '|=', '||',
      '-', '-=', '--', '+', '+=', '++', '<', '<=', '<<', '<<=', '>', '>=',
      '>>=', '>>>=', '>>', '>>>', '!', '!=', '?', ',', ':', '$',
      '=', '==', '*', '*=', '%', '%=', '^', '^=', '^^', '^^=', '~', '~=',
      '@', '=>', '#',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.unary_operators = [
      '+', '-', '*',
      '!', '&', '~',
      '++', '--', ':',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.postfix_operators = [
      '++', '--', '&', ':'
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
      'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
      'body', 'break', 'case', 'cast', 'catch', 'class', 'const', 'continue',
      'debug', 'default', 'delegate', 'deprecated', 'do',
      'else', 'enum', 'export', 'extern',
      'final', 'finally', 'for', 'foreach', 'foreach_reverse', 'function',
      'goto',
      'if', 'immutable', 'import', 'in', 'inout', 'interface', 'invariant',
      'lazy',
      'macro', 'mixin', 'module',
      'nothrow',
      'out', 'override',
      'package', 'pragma', 'private', 'protected', 'public', 'pure',
      'ref', 'return',
      'scope', 'shared', 'static', 'struct', 'switch', 'synchronized',
      'template', 'throw', 'try', 'typeid',
      'union', 'unittest', 'version', 'while', 'with',
      '__gshared', '__traits', '__vector', '__parameters'
]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
      'bool', 'byte', 'cdouble', 'cent', 'cfloat', 'char', 'creal',
      'dchar', 'double', 'float', 'idouble', 'ifloat', 'int', 'ireal',
      'long', 'real', 'short', 'ubyte', 'ucent', 'uint', 'ulong', 'ushort',
      'void', 'wchar'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = [
      'false', 'null', 'super', 'this', 'true',
      '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
      '__FUNCTION__', '__PRETTY_FUNCTION__',
      '__DATE__', '__EOF__', '__TIME__','__TIMESTAMP__',
      '__VENDOR__', '__VERSION__'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # NOTE(review): list order presumably sets tokenizer precedence —
    # confirm against Tokenizer before reordering.
    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      line_continuation_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      real_exponent_tb,
      suffixed_real_tb,
      hex_real_tb,
      keyword_tb,
      types_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      identifier_tb,
      attribute_tb,
      class_type_tb,
      string_tb,
      r_string_tb,
      x_string_tb,
      backtick_string_tb,
      q_string_tb,
      cwd_string_tb,
      slash_slash_comment_tb,
      slash_star_comment_tb,
      slash_plus_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of invalid tokens so they count once
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    # fold numeric/string suffixes that tokenized as separate identifiers
    # back into the literal they follow
    number_suffixes = ['f', 'F', 'i', 'I', 'u', 'U', 'l', 'L', 'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU']
    tokens = self.combine_tokens_and_adjacent_types(tokens, 'number', 'identifier', number_suffixes)

    string_suffixes = ['c', 'w', 'd']
    self.tokens = self.combine_tokens_and_adjacent_types(tokens, 'string', 'identifier', string_suffixes)

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    # operator-sequence confidences only make sense when operators exist
    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
Example #24
0
    def __init__(self, code):
        """
        Tokenize the given source text and compute confidence statistics.

        'code' is the raw source text.  Tokens are stored on self.tokens
        and the various calc_* helpers record confidence values on the
        examiner instance.
        """
        super().__init__()

        # token categories that may act as operands
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        # numeric literals (no digit-group separator)
        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        hex_constant_tb = PrefixedIntegerTokenBuilder(
            '$', True, '0123456789ABCDEFabcdef')
        octal_constant_tb = PrefixedIntegerTokenBuilder('&', True, '01234567')
        binary_constant_tb = PrefixedIntegerTokenBuilder('%', True, '01')
        char_constant_tb = PrefixedIntegerTokenBuilder('#', True, '0123456789')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ["'"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        # '{ ... }' and '(* ... *)' comments
        brace_comment_tb = BraceCommentTokenBuilder()
        paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

        # Fix: '~' appeared twice in this list; the redundant duplicate
        # has been removed (membership is unchanged).
        known_operators = [
            '+', '-', '*', '/', '=', '<>', '>', '>=', '<', '<=', 'and', 'or',
            'not', '&', '|', '~', '<<', '>>', ':=', '^', '@', '.', ':',
            '..', 'div', 'mod', 'shl', 'shr', 'in'
        ]

        known_operator_tb = CaseInsensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'not', '@', '^', '.']

        self.postfix_operators = ['^']

        groupers = ['(', ')', ',', '[', ']']
        group_starts = ['(', '[', ',']
        group_mids = [',']
        group_ends = [')', ']']

        groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'begin', 'break', 'case', 'const', 'do', 'downto', 'else', 'end',
            'for', 'forward', 'function', 'goto', 'if', 'label', 'of',
            'otherwise', 'packed', 'procedure', 'program', 'repeat', 'reset',
            'then', 'to', 'type', 'until', 'uses', 'value', 'var', 'while',
            'with'
        ]

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        types = [
            'array', 'boolean', 'char', 'file', 'integer', 'real', 'record',
            'set', 'string'
        ]

        types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'nil', 'true']

        values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # NOTE(review): list order presumably sets tokenizer precedence —
        # confirm against Tokenizer before reordering.
        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, char_constant_tb,
            keyword_tb, types_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, string_tb, brace_comment_tb, paren_star_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # merge runs of invalid tokens so they count once
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = self.combine_identifier_colon(
            tokens, ['statement separator'], ['begin'],
            ['whitespace', 'comment', 'newline', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()
        self.convert_identifiers_to_labels_2()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-sequence confidences only make sense when operators exist
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'identifier', 'variable']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        # block structure is begin/record/case ... end
        self.calc_paired_blockers_confidence(['begin', 'record', 'case'],
                                             ['end'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #25
0
    def __init__(self, code):
        """
        Tokenize the given source text and compute confidence statistics.

        'code' is the raw source text.  Tokens are stored on self.tokens
        and the calc_* helpers record confidence values on the instance.
        """
        super().__init__()

        # token categories that may act as operands
        operand_kinds = []

        ws_tb = WhitespaceTokenBuilder()
        nl_tb = NewlineTokenBuilder()

        # numeric literals (no digit-group separator)
        int_tb = IntegerTokenBuilder(None)
        int_exp_tb = IntegerExponentTokenBuilder(None)
        float_tb = RealTokenBuilder(False, False, None)
        float_exp_tb = RealExponentTokenBuilder(False, False, 'E', None)
        hex_tb = PrefixedIntegerTokenBuilder('0H', False,
                                             '0123456789ABCDEFabcdef')
        octal_tb = PrefixedIntegerTokenBuilder('0O', False, '01234567')
        binary_tb = PrefixedIntegerTokenBuilder('0B', False, '01')
        operand_kinds.append('number')

        # identifiers may start with or contain '_' and '$'
        name_tb = IdentifierTokenBuilder('_$', '_$')
        operand_kinds.append('identifier')

        string_tb = EscapedStringTokenBuilder(['"', "'", "’"], 0)
        operand_kinds.append('string')

        comment_line_tb = SlashSlashCommentTokenBuilder()
        comment_block_tb = SlashStarCommentTokenBuilder()

        terminator_tb = CaseInsensitiveListTokenBuilder(
            [';'], 'statement terminator', False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '===', '!==', '>', '>=',
            '<', '<=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=',
            '>>=', '!', '&', '|', '~', '<<', '>>', '=>', '^', '.', ':', '++',
            '--', '&&', '||', '?', '$', '?.', 'new', 'delete'
        ]
        operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = [
            '+', '-', '!', '~', '++', '--', ':', '$', 'new', 'delete'
        ]
        self.postfix_operators = ['++', '--', ':']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']
        group_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        regex_tb = RegexTokenBuilder()

        keywords = [
            'break', 'case', 'catch', 'class', 'const', 'continue', 'debugger',
            'default', 'do', 'else', 'enum', 'export', 'extends', 'finally',
            'for', 'function', 'if', 'import', 'in', 'instanceof', 'return',
            'switch', 'throw', 'try', 'typeof', 'while', 'with', 'as',
            'implements', 'interface', 'let', 'package', 'private',
            'protected', 'public', 'static', 'yield', 'constructor', 'declare',
            'get', 'module', 'require', 'set', 'type', 'from', 'of'
        ]
        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        type_names = [
            'any', 'boolean', 'byte', 'char', 'number', 'string', 'symbol',
            'void', 'never', 'object'
        ]
        type_tb = CaseSensitiveListTokenBuilder(type_names, 'type', True)
        operand_kinds.append('type')

        value_names = ['this', 'super', 'null', 'true', 'false', 'undefined']
        value_tb = CaseSensitiveListTokenBuilder(value_names, 'value', True)
        operand_kinds.append('value')

        invalid_tb = InvalidTokenBuilder()

        # NOTE(review): list order presumably sets tokenizer precedence —
        # confirm against Tokenizer before reordering.
        builders = [
            nl_tb, ws_tb, terminator_tb, int_tb,
            int_exp_tb, float_tb, float_exp_tb, hex_tb,
            octal_tb, binary_tb, keyword_tb, type_tb,
            value_tb, operator_tb, group_tb, regex_tb, name_tb,
            string_tb, comment_line_tb, comment_block_tb,
            self.unknown_operator_tb, invalid_tb
        ]

        # tokenize, then merge runs of invalid tokens so they count once
        tokens = Tokenizer(builders).tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')
        self.convert_keywords_to_identifiers(['.'])

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        # operator-sequence confidences only make sense when operators exist
        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        self.calc_operand_n_confidence(tokens, ['number', 'string', 'symbol'],
                                       2)
        self.calc_operand_n_confidence(tokens, operand_kinds, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #26
0
  def __init__(self, code, tab_size, processor):
    super().__init__()

    self.newlines_important = 'always'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    real_tb = RealTokenBuilder(True, True, None)
    hex_integer_1_tb = PrefixedIntegerTokenBuilder('$', False, '0123456789abcdefABCDEF')
    hex_integer_2_tb = PrefixedIntegerTokenBuilder('#$', False, '0123456789abcdefABCDEF')
    hex_integer_3_tb = PrefixedIntegerTokenBuilder('&', False, '0123456789abcdefABCDEF')
    hex_integer_h_tb = SuffixedIntegerTokenBuilder(['h'], False, 'abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['Q', 'A', 'O', 'D', 'B'], False, None)
    operand_types.append('number')

    leads = '$#.@&'
    extras = '$#.@&'
    identifier_tb = IbmAsmIdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    hex_string_tb = PrefixedStringTokenBuilder('X', False, quotes)
    char_string_tb = PrefixedStringTokenBuilder('C', False, quotes)
    operand_types.append('string')

    known_operators = [
      '+', '-', '*', '/', '=', '&', '#', '?', "'"
    ]

    self.unary_operators = [
      '+', '-', '=', '&', '#', '?', "'"
    ]

    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
    group_starts = ['(', '[', ',', '{', '<']
    group_ends = [')', ']', '}', '>']
    group_mids = [',']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    preprocessors = [
      'MACRO', 'MEND'
    ]

    preprocessor_tb = CaseInsensitiveListTokenBuilder(preprocessors, 'preprocesssor', False)

    directives = [
      'CSECT',
      'DC', 'DROP', 'DS', 
      'EJECT', 'END', 'ENTRY', 'EQU', 'EXTRN',
      'FREEMAIN',
      'GETMAIN', 'GLOBAL',
      'NAM', 'NAME',
      'ORG',
      'PAGE', 'PARAM', 'PROC', 'PUBLIC',
      'RETURN',
      'STIMER',
      'TITLE', 'SUBTTL',
      'USING'
    ]

    directive_tb = CaseInsensitiveListTokenBuilder(directives, 'directive', False)

    keywords = []

    keywords_360 = [
      'A', 'ABEND', 'AD', 'ADR', 'AE', 'AER', 'AH', 'AL', 'ALR', 'AP', 'AR',
      'AU', 'AUR', 'AW', 'AWR', 'AXR',
      'B', 'BAL', 'BALR', 'BAS', 'BASR', 'BC', 'BCR', 'BCT', 'BCTR',
      'BE', 'BH', 'BL', 'BM', 'BNE', 'BNH', 'BNL', 'BNM', 'BNP', 'BNO', 'BNZ',
      'BO', 'BP', 'BR', 'BXH', 'BXLE', 'BZ',
      'C', 'CD', 'CDR', 'CE', 'CER', 'CH', 'CL', 'CLC', 'CLI', 'CLR', 'CP',
      'CR', 'CVB', 'CVD',
      'D', 'DD', 'DDR', 'DE', 'DER', 'DIAGNOSE', 'DP', 'DR',
      'ED', 'EDMK', 'EX',
      'HDR', 'HER', 'HIO',
      'IC', 'ISK',
      'L',
      'LA', 'LCR', 'LCDR', 'LCER', 'LD', 'LDR',
      'LE', 'LER', 'LH', 'LM',
      'LNDR', 'LNER', 'LNR',
      'LPDR', 'LPER', 'LPR', 'LPSW', 'LR', 'LRDR', 'LRER',
      'LTDR', 'LTER', 'LTR',
      'M', 'MD', 'MDR', 'ME', 'MER', 'MH', 'MP', 'MR', 'MVC', 'MVI',
      'MVN', 'MVO', 'MVZ', 'MXD', 'MXDR', 'MXR',
      'N', 'NC', 'NI', 'NOP', 'NOPR', 'NR',
      'O', 'OC', 'OI', 'OR',
      'PACK',
      'RDD',
      'S', 'SD', 'SDR', 'SE', 'SER', 'SH', 'SIO',
      'SL', 'SLA', 'SLDA', 'SLDL', 'SLL', 'SLR',
      'SP', 'SPM',
      'SR', 'SRA', 'SRDL', 'SRP',
      'SSK', 'SSM', 'SRDA', 'SRL',
      'ST', 'STC', 'STD', 'STE', 'STH', 'STM', 'SU', 'SUR', 'SVC',
      'SW', 'SWR', 'SXR',
      'TCH', 'TIO', 'TM', 'TR', 'TRT', 'TS',
      'UNPK', 'UNPKU',
      'WRD',
      'X', 'XC', 'XI', 'XR',
      'ZAP'
    ]

    keywords_370 = [
      'BRXH', 'BRXLE',
      'CLCL',
      'HDV',
      'LAM', 'LEDR',
      'MS', 'MVCL',
      'RIO',
      'SIOF', 'STAM',
      'VA', 'VACD', 'VACDR', 'VACE', 'VACER',
      'VAD', 'VADQ', 'VADR', 'VADS',
      'VAE', 'VAEQ', 'VAER', 'VAES',
      'VAQ', 'VAR', 'VAS',
      'VC', 'VCD', 'VCDQ', 'VCDR', 'VCDS',
      'VCE', 'VCEQ', 'VCER', 'VCES',
      'VCQ', 'VCR', 'VCS',
      'VDD', 'VDDQ', 'VDDR', 'VDDS',
      'VDE', 'VDEQ', 'VDER', 'VDES',
      'VL', 'VLCDR', 'VLCER', 'VLCR',
      'VLD', 'VLDQ', 'VLDR', 'VLEQ', 'VLH', 'VLINT',
      'VLM', 'VLMD', 'VLMDQ', 'VLMDR', 'VLMEQ', 'VLMQ', 'VLMR',
      'VLNDR', 'VLNER', 'VLNR', 'VLPDR', 'VLPER', 'VLPR',
      'VLQ', 'VLR', 'VLY', 'VLYD', 'VLZDR', 'VLZR',
      'VM', 'VMAD', 'VMADQ', 'VMADS', 'VMAE', 'VMAEQ', 'VMAES',
      'VMCD', 'VMCE', 'VMCER',
      'VMD', 'VMDQ', 'VMDR', 'VMDS',
      'VME', 'VMEQ', 'VMER', 'VMES',
      'VMQ', 'VMR', 'VMS', 'VMSD', 'VMSDQ', 'VMSDS', 'VMSE', 'VMSEQ', 'VMSES',
      'VN', 'VNQ', 'VNR', 'VNS',
      'VO', 'VOQ', 'VOR', 'VOS',
      'VS', 'VSD', 'VSDQ', 'VSDR', 'VSDS',
      'VSE', 'VSEQ', 'VSER', 'VSES',
      'VSQD', 'VSQDR', 'VSQE', 'VSQER',
      'VSQ', 'VSR', 'VSS', 'VST', 'VSTD', 'VSTE', 'VSTH', 'VSTKD', 'VSTMD',
      'VTAD', 'VTAE', 'VTSD', 'VTSE',
      'VX', 'VXQ', 'VXR', 'VXS',
      'VMXSE', 'VMNSE', 'VMXAE', 'VLELE', 'VSELE', 'VMXDS', 'VMNSD', 'VMXAD',
      'VLELD', 'VXELD', 'VSPSD', 'VAPSD', 'VTVM', 'VCVM', 'VCZVM', 'VCOVM',
      'VXVC', 'VXVMM', 'VRRS', 'VRSVC', 'VRSV', 'VLVM', 'VLCVM', 'VSTVM', 'VNVM',
      'VOVM', 'VXVM', ' VSRSV', 'VMRSV', 'VSRRS', 'VLVCA', 'VRCL', 'VSVMM',
      'VLVXA', 'VSVTP', 'VACSV', 'VACRS',
      'STNSM', 'SOTSM', 'SIOP', 'MC', 'LRA', 'CONCS', 'DISCS', 'STIDP', 'SCK',
      'SPT', 'STPT', 'SPKA', 'IPK', 'PTLB', 'SPX', 'STPX', 'STAP', 'RRB',
      'PC', 'SAC', 'IPTE',
      'IVSK', 'IAC', 'SSAR', 'EPAR', 'ESAR', 'PT', 'ISKE', 'RRBE', 'SSKE', 'TB',
      'STCTL', 'LCTL', 'CS', 'CDS', 'CLM', 'STCM', 'ICM',
      'MVCK', 'MVCP', 'MVCS', 'VLI', 'VSTI', 'VLID', 'VSTID', 'VSRL',
      'VSLL', 'VLBIX', 'LASP', 'TPROT', 'STRAG',
      'MVCSK', 'MVCDK', 'DPFET', 'MVHHI', 'MVGHI', 'MVHI', 'CHHSI', 'CLHHSI',
      'CGHSI', 'CLGHSI', 'CHSI', 'CLFHSI', 'TBEGIN', 'TBEGINC', 'MVCIN', 'UNPKA'
    ]

    keywords_390 = [
      'BASSM', 'BSG', 'BSM',
      'CLRCH', 'CMPS', 'CLRIO', 'CMSG',
      'LAE', 'LXDR',
      'MDE',
      'PFPO', 'PR', 'PTFF',
      'SAM24', 'SAM31', 'SCKPF',
      'TAM', 'TMPS', 'TMSG', 'TRACE', 'TRAP2',
      'TMH',' TMLH', 'TML', 'TMLL', 'TMHH', 'TMHL',
      'BRC', 'BRAS', 'BRCT', 'BRCTG',
      'LHI', 'LGHI',
      'AHI', 'AGHI',
      'MHI', 'MGHI',
      'CHI', 'CGHI',
      'MVCLE', 'CLCLE',
      'UPT',
      'SIE', 'PCF', 'CFC', 'DEP', 'DCTP', 'MAD', 'MUN', 'STCAP', 'SERVC',
      'IPM', 'DXR', 'PGIN', 'PGOUT', 'CSCH', 'HSCH', 'MSCH', 'SSCH', 'STSCH', 'TSCH',
      'TPI', 'SAL', 'RSCH', 'STCRW', 'STCPS', 'RCHP', 'SCHM', 'STZP', 'SZP',
      'TPZI', 'BAKR', 'CKSM', 'MADS', 'SQDR', 'STURA', 'MSTA', 'PALB', 'EREG',
      'ESTA', 'LURA', 'TAR', 'SQDR', 'SAR', 'EAR', 'CSP', 'MSR', 'MVPG', 'MVST',
      'CUSE', 'BSG', 'CLST', 'SRST', 'XSCH', 'RP', 'STCKE', 'SACF', 'STSI',
      'SRNM', 'STFPC', 'LFPC', 'TRE', 'CUUTF', 'CUTFU', 'STFL', 'LPSWE',
      'TRAP4', 'LPEBR', 'LNEBR', 'LTEBR', 'LCEBR', 'LDEBR', 'LXDBR', 'LDEBR',
      'MXDBR', 'KEBR', 'CEBR', 'AEBR', 'SEBR', 'MDEBR', 'DEBR', 'MAEBR',
      'MSEBR', 'LPDBR', 'LCDBR', 'SQEBR', 'MEEBR', 'KDBR', 'CDBR', 'ADBR',
      'MDBR', 'DDBR', 'SDBR', 'LDER', 'LXDR', 'MAER', 'MSER', 'SQXR', 'MEER',
      'MADR', 'MSDR', 'LPXBR', 'LNXBR', 'LTXBR', 'LCXBR', 'LCXBR', 'LEDBR',
      'LDXBR', 'LEXBR', 'FIXBR', 'KXBR', 'CXBR', 'AXBR', 'SXBR', 'MXBR', 'DXBR',
      'TBEDR', 'TBDR', 'DIEBR', 'FIEBR', 'THDER', 'DIDBR', 'FIDBR', 'LPXR',
      'LNXR', 'LTXR', 'LCXR', 'LXR', 'LEXR', 'FIXR', 'CXR', 'LZER', 'LZDR',
      'LZXR', 'FIER', 'FIDR', 'SFPC', 'EFPC', 'CEFBR', 'CDFBR', 'CXFBR', 'CEGBR',
      'CEFR', 'CDFR', 'CXFR', 'CFDR', 'CFXR', 'CEGR', 'CDGR', 'CXGR', 'CGER', 'CGDR', 'CGXR',
      'CDGBR', 'CXGBR', 'CGDBR', 'CGEBR', 'CGXBR',
      'LMC', 'LPGR', 'LNGR', 'LTGR', 'LCGR', 'LGC', 'LURAG', 'AGR', 'SGR',
      'ALGR', 'SLGR', 'MSGR', 'DSGR', 'EREGG', 'LRVGR', 'LPGFR', 'LNGFR',
      'LTGFR', 'LCGFR', 'LGFR', 'LLGFR', 'LLGTR', 'AGFR', 'SGFR', 'ALGFR',
      'SLGFR', 'MSGFR', 'DSGFR', 'LRVR', 'CGR', 'CLGR', 'STURG', 'CGFR',
      'CLGFR', 'BCTGR', 'NGR', 'OGR', 'XGR', 'MLGR', 'DLGR', 'ALCGR', 'SLBGR',
      'EPSW', 'TRTT', 'TRTO', 'TROT', 'TROO', 'MLR', 'DLR', 'ALCR', 'SLBR', 'ESEA',
      'LARL', 'LGFI', 'BRCL', 'BRASL', 'XIHF', 'XILF', 'IIHF', 'IILF',
      'NIHF', 'NILF', 'OIHF', 'OILF', 'LLIHF', 'LLILF', 'LLHRL', 'LGHRL',
      'LHRL', 'AGFI', 'AFI', 'ALGFI', 'ALFI', 'CGFI', 'CFI', 'LLGFRL', 'STRL',
      'EXRL', 'PFDRL', 'CGHRL','CHRL', 'CLGHRL', 'CLHRL', 'CGRL', 'CLGRL',
      'CRL', 'CLGFRL', 'CLRL', 'MVCOS', 'ECTG', 'CSST', 'PKU',
      'LRAG', 'LG', 'AG', 'SG', 'ALG', 'SLG', 'MSG', 'DSG', 'CVBG',
      'LRVG', 'LGF', 'LGH', 'LLGF', 'LLGT', 'AGF', 'SGF', 'ALGF', 'SLGF',
      'MSGF', 'DSGF', 'LRV', 'LRVH', 'CG', 'CLG', 'STG', 'CVDG', 'STRVG',
      'CGF', 'CLGF', 'STRV', 'STRVH', 'BCTG', 'NG', 'OG', 'XG', 'MLG',
      'DLG', 'ALCG', 'SLBG', 'STPQ', 'LPQ', 'LLGC', 'LLGH', 'ML', 'DL',
      'ALC', 'SLB', 'PKA',
      'DIL', 'BDIL', 'ANUM', 'COMP', 'MCPU', 'MIO', 'BIFLAG', 'MULDIV',
      'LMG', 'SRAG', 'SLAG', 'SRLG', 'SLLG', 'TRACG', 'RLLG', 'RLL',
      'CLMH', 'CLMY', 'CLT', 'CLTH', 'CLTL', 'CLTNE', 'CLTE', 'CLTNL',
      'CLTNH', 'STMG', 'STCTG', 'STMH', 'STCMH', 'LCTLG', 'CSG', 'CDSG',
      'BXHG', 'BXLEG', 'ICMH', 'MVCLU', 'CLCLU', 'LMH', 'LMY', 'TP',
      'SRAK', 'SLAK', 'SRLK', 'SRLK', 'LOCG', 'BRXHG', 'BRXLG', 'LDEB',
      'LXDB', 'LXEB', 'MXDB', 'KEB', 'CEB', 'AEB', 'SEB', 'MDEB', 'DEB',
      'MAEB', 'MSEB', 'TCEB', 'TCDB', 'TCXB', 'SQEB', 'SQDB', 'MEEB',
      'KDB', 'CDB', 'ADB', 'SDB', 'MDB', 'DDB', 'MADB', 'MSDB', 'LDE',
      'LXD', 'LXE', 'SQE', 'SQD', 'MEE', 'PLO', 'LMD'
    ]

    keywords_z = [
      'IIHH', 'IIHL', 'IILH', 'IILL',
      'LLIHH', 'LLIHL', 'LLILH', 'LLILL',
      'NIHH', 'NIHL', 'NILH', 'NILL',
      'OIHH', 'OIHL', 'OILH', 'OILL',
      'SAM64'
    ]

    if processor in ['360', '370', '390', 'system-z']:
      keywords += keywords_360

    if processor in ['370', '390', 'system-z']:
      keywords += keywords_370

    if processor in ['390', 'system-z']:
      keywords += keywords_390

    if processor in ['system-z']:
      keywords += keywords_z

    opcode_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    registers = [
      'R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10',
      'R11', 'R12', 'R13', 'R14', 'R15',
      'FP0', 'FP2', 'FP4', 'FP6'
    ]

    register_tb = CaseInsensitiveListTokenBuilder(registers, 'register', True)

    values = ['*']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    comment_tb = LeadToEndOfLineTokenBuilder('!', False, 'comment')
    line_comment_tb = AssemblyCommentTokenBuilder('*')

    include_directive_tb = LeadToEndOfLineTokenBuilder('INCLUDE', False, 'directive')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_1_tb,
      hex_integer_2_tb,
      hex_integer_3_tb,
      hex_integer_h_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      register_tb,
      opcode_tb,
      directive_tb,
      include_directive_tb,
      preprocessor_tb,
      identifier_tb,
      string_tb,
      hex_string_tb,
      char_string_tb,
      comment_tb,
      line_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    opcode_tokenbuilders = [
      whitespace_tb,
      opcode_tb,
      directive_tb,
      include_directive_tb,
      preprocessor_tb,
      identifier_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    args_tokenbuilders = [
      whitespace_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_1_tb,
      hex_integer_2_tb,
      hex_integer_3_tb,
      hex_integer_h_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      register_tb,
      identifier_tb,
      string_tb,
      hex_string_tb,
      char_string_tb,
      comment_tb,
      line_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
    args_tokenizer = Tokenizer(args_tokenbuilders)

    # tokenize as free-format
    tokens_free = tokenizer.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(tokens_free, 'invalid')
    tokens_free = AssemblyIBMExaminer.convert_keywords_to_identifiers(tokens_free)
    tokens_free = Examiner.convert_values_to_operators(tokens_free, known_operators)
    self.tokens = tokens_free
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics1 = self.statistics
    self.statistics = {}

    self.calc_confidences(operand_types, group_starts, group_mids, group_ends, None)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_free = self.confidences
    self.confidences = {}
    errors_free = self.errors
    self.errors = []

    # tokenize as space-format
    opcode_extras = '.&=,()+-*/'
    label_leads = '.&$@'
    label_mids = '.&$#@'
    label_ends = ':,'
    comment_leads = '!'
    line_comment_leads = '*'
    use_line_id = True
    tokens_space, indents = Tokenizer.tokenize_asm_code(code, tab_size, opcode_tokenizer, opcode_extras, args_tokenizer, label_leads, label_mids, label_ends, comment_leads, line_comment_leads, use_line_id)
    tokens_space = Examiner.combine_adjacent_identical_tokens(tokens_space, 'invalid operator')
    tokens_space = Examiner.combine_adjacent_identical_tokens(tokens_space, 'invalid')
    tokens_space = Examiner.combine_identifier_colon(tokens_space, ['newline'], [], [])
    tokens_space = Tokenizer.combine_number_and_adjacent_identifier(tokens_space)
    tokens_space = AssemblyIBMExaminer.convert_opcodes_to_keywords(tokens_space, keywords)
    tokens_space = AssemblyIBMExaminer.convert_keywords_to_identifiers(tokens_space)
    tokens_space = Examiner.convert_values_to_operators(tokens_space, known_operators)
    self.tokens = tokens_space
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics2 = self.statistics
    self.statistics = {}

    self.calc_confidences(operand_types, group_starts, group_mids, group_ends, indents)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_space = self.confidences
    self.confidences = {}
    errors_space = self.errors
    self.errors = []

    # select the better of free-format and spaced-format

    confidence_free = 1.0
    for key in confidences_free:
      factor = confidences_free[key]
      confidence_free *= factor

    confidence_space = 1.0
    for key in confidences_space:
      factor = confidences_space[key]
      confidence_space *= factor

    if confidence_space > confidence_free:
      self.tokens = tokens_space
      self.statistics = statistics2
      self.confidences = confidences_space
      self.errors = errors_space
    else:
      self.tokens = tokens_free
      self.statistics = statistics1
      self.confidences = confidences_free
      self.errors = errors_free
Example #27
0
    def __init__(self, code):
        """Tokenize Perl source *code* and compute confidence statistics.

        Builds the full set of token builders for Perl (numbers,
        identifiers, sigil specials, q-strings, regexes, prototypes),
        tokenizes the code, then runs the examiner confidence
        calculations inherited from the base class.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # Perl allows '_' as a digit separator in numeric literals.
        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        # Sigil-prefixed identifiers ($foo, @bar, %baz).
        # NOTE(review): renamed from misspelled 'perl_identfier_tb'.
        perl_identifier_tb = PerlIdentifierTokenBuilder()
        operand_types.append('identifier')

        # Perl punctuation special variables (see perlvar).
        specials = [
            '$_', '@_', '$$', '$"', '$(', '$)', '$>', '$<', '$;', '$]', '$[',
            '$&', '$`', "$'", '$+', '@+', '%+', '@-', '%-', '$,', '$.', '$/',
            '$\\', '$|', '$%', '$-', '$:', '$=', '$^', '$~', '$!', '$?', '$@',
            '$#', '$*'
        ]

        specials_tb = CaseInsensitiveListTokenBuilder(specials, 'identifier',
                                                      True)

        dollar_carat_tb = PerlDollarCaretIdentifierTokenBuilder()

        sigilbrace_tb = PerlSigilBraceTokenBuilder()

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        # q/qq/qw/qr-style quoted constructs.
        q_string_tb = PerlQStringTokenBuilder()

        # Plain //, m//, s///, y///, tr/// regex forms.
        regex_tb = RegexTokenBuilder()
        m_regex_tb = MRegexTokenBuilder()
        s_regex_tb = SRegexTokenBuilder()
        y_regex_tb = YRegexTokenBuilder()
        tr_regex_tb = TrRegexTokenBuilder()
        operand_types.append('regex')

        prototype_tb = PerlPrototypeTokenBuilder()

        comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        directives = ['#line']

        preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)

        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '**', '/', '%', '=', '==', '!=', '>', '>=', '<',
            '<=', '**=', '+=', '*=', '&=', '&.=', '<<=', '&&=', '-=', '/=',
            '|=', '|.=', '>>=', '||=', '.=', '%=', '^=', '^.=', '//=', 'x=',
            'ne', 'gt', 'ge', 'le', 'lt', 'eq', '!', '&', '|', '~', '<<', '>>',
            '^', '.', '..', '...', '++', '--', '->', '=>', '&&', '||', '?',
            '<->', '<=>', 'and', 'cmp', 'or', 'xor'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '::']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'bless', 'break', 'continue', 'die', 'do', 'else', 'elsif', 'eval',
            'exit', 'exp', 'for', 'foreach', 'if', 'last', 'lock', 'my',
            'next', 'no', 'our', 'package', 'redo', 'return', 'say', 'sub',
            'taint', 'undef', 'unless', 'until', 'use', 'wantarray', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

        values = ['NULL']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Order matters: earlier builders take precedence during tokenizing.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, values_tb, groupers_tb,
            known_operator_tb, prototype_tb, identifier_tb, perl_identifier_tb,
            specials_tb, dollar_carat_tb, sigilbrace_tb, string_tb,
            q_string_tb, regex_tb, m_regex_tb, s_regex_tb, y_regex_tb,
            tr_regex_tb, preprocessor_tb, comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        # '__END__' stops tokenizing; text after it is not code in Perl.
        tokens = tokenizer.tokenize(code, ['__END__'])
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence calculations; each adjusts self.confidences.
        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #28
0
    def __init__(self, code):
        """Tokenize Haskell source *code* and compute confidence statistics.

        Builds Haskell-specific token builders (identifiers, class names,
        operator runs, '{-' block comments), tokenizes the code, then runs
        the examiner confidence calculations inherited from the base class.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        identifier_tb = HaskellIdentifierTokenBuilder()
        operand_types.append('identifier')

        class_tb = HaskellClassTokenBuilder()
        operand_types.append('class')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        line_comment_tb = LeadToEndOfLineTokenBuilder('--', False, 'comment')
        block_comment_tb = BlockTokenBuilder('{-', '-}', 'comment')

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        # Haskell operators are arbitrary runs of these symbol characters.
        operators_tb = HaskellOperatorTokenBuilder('#$%&*+./<=>?@\\^|-~')

        known_operators = ["'", '..']

        known_operators_tb = CaseInsensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.postfix_operators = ['..', "'"]

        # Haskell reserved words; fixity declarations are
        # 'infix', 'infixl', 'infixr' (was misspelled 'infix1').
        keywords = [
            'case', 'class', 'data', 'deriving', 'do', 'else', 'if', 'import',
            'in', 'infix', 'infixl', 'infixr', 'instance', 'let', 'module',
            'newtype', 'of', 'then', 'type', 'where'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

        values = ['True', 'False', 'Nothing', '_']

        value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Order matters: earlier builders take precedence during tokenizing.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, real_tb, real_exponent_tb,
            keyword_tb, groupers_tb, operators_tb, known_operators_tb,
            identifier_tb, value_tb, class_tb, string_tb, line_comment_tb,
            block_comment_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
        HaskellExaminer.convert_keywords_to_identifiers(tokens)
        self.tokens = tokens
        # self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence calculations; each adjusts self.confidences.
        self.calc_token_confidence()
        # self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            # self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # operand_types_2 = ['number']
        # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # operand_types = ['number', 'string', 'symbol', 'identifier', 'variable']
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #29
0
    def __init__(self, code):
        """Tokenize Eiffel source *code* and compute confidence statistics.

        Builds Eiffel-specific token builders (numbers with apostrophe
        separators, '--' comments, word operators such as 'and then'),
        tokenizes the code, then runs the examiner confidence
        calculations inherited from the base class.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        # Numeric literals; "'" is accepted as a digit separator.
        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF_')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
        octal_integer_tb = PrefixedIntegerTokenBuilder('0c', False,
                                                       '01234567_')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        # Eiffel comments run from '--' to end of line.
        comment_tb = LeadToEndOfLineTokenBuilder('--', True, 'comment')

        known_operators = [
            ':=', '=', '/=', '<', '>', '<=', '>=', '+', '-', '*', '/', '//',
            '\\\\', '^', '|..|', '..', 'and', 'or', 'xor', 'not', 'and then',
            'or else', 'implies', '.', '@', '#', '|', '&'
        ]

        self.unary_operators = ['+', '-', 'not', '@', '#', '|', '&']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', ';']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ';', ':']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'across', 'agent', 'alias', 'all', 'as', 'assign', 'attribute',
            'check', 'class', 'convert', 'create', 'debug', 'deferred', 'do',
            'else', 'elseif', 'end', 'ensure', 'expanded', 'export',
            'external', 'feature', 'from', 'frozen', 'if', 'implies',
            'inherit', 'inspect', 'invariant', 'like', 'local', 'loop', 'note',
            'obsolete', 'old', 'once', 'only', 'redefine', 'rename', 'require',
            'rescue', 'retry', 'select', 'separate', 'then', 'undefine',
            'until', 'variant', 'when'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = ['Current', 'Precursor', 'Result', 'Void', 'TUPLE']

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['False', 'True', '?']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # Order matters: earlier builders take precedence during tokenizing.
        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, integer_exponent_tb,
            hex_integer_tb, binary_integer_tb, octal_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, types_tb, values_tb, groupers_tb,
            known_operator_tb, identifier_tb, string_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence calculations; each adjusts self.confidences.
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #30
0
    def __init__(self, code):
        """Tokenize INTERCAL source *code* and compute confidence statistics.

        INTERCAL operands are prefixed numbers ('#' constants, '.'/',' 16-bit
        and ':'/';' 32-bit variables/arrays); comments start with 'NOTE'.
        Tokenizes the code, then runs the examiner confidence calculations
        inherited from the base class.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = PrefixedIntegerTokenBuilder('#', False, '0123456789')
        variable16_tb = PrefixedIntegerTokenBuilder('.', False, '0123456789')
        variable32_tb = PrefixedIntegerTokenBuilder(':', False, '0123456789')
        array16_tb = PrefixedIntegerTokenBuilder(',', False, '0123456789')
        array32_tb = PrefixedIntegerTokenBuilder(';', False, '0123456789')
        operand_types.append('number')

        comment_tb = LeadToEndOfLineTokenBuilder('NOTE', True, 'comment')

        # Labels are parenthesized numbers, e.g. '(10)'.
        label_tb = ParensLabelTokenBuilder()

        known_operators = ['~', '$', 'V', '?', '&', 'SUB', '<-']

        self.unary_operators = ['V', '?', '&']

        self.postfix_operators = []

        groupers = ['"', "'"]
        group_starts = ['"', "'"]
        group_ends = ['"', "'"]
        group_mids = []

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # Statement words, covering ABSTAIN FROM and COME FROM;
        # 'FROM' listed once (was duplicated).
        keywords = [
            'DO', 'STASH', 'RETRIEVE', 'RESUME', 'FORGET', 'NEXT', 'ABSTAIN',
            'FROM', 'REINSTATE', 'IGNORE', 'REMEMBER', 'WRITE', 'IN', 'READ',
            'OUT', 'PLEASE', 'COME'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        invalid_token_builder = InvalidTokenBuilder()

        # Order matters: earlier builders take precedence during tokenizing.
        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, variable16_tb,
            variable32_tb, array16_tb, array32_tb, keyword_tb, groupers_tb,
            label_tb, known_operator_tb, comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        # Confidence calculations; each adjusts self.confidences.
        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        # self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)