Example #1
0
 def __escape_z__():
   InvalidTokenBuilder.__escape_z__()
   WhitespaceTokenBuilder.__escape_z__()
   NewlineTokenBuilder.__escape_z__()
   EscapedStringTokenBuilder.__escape_z__()
   PrefixedStringTokenBuilder.__escape_z__()
   SuffixedStringTokenBuilder.__escape_z__()
   IntegerTokenBuilder.__escape_z__()
   IntegerExponentTokenBuilder.__escape_z__()
   PrefixedIntegerTokenBuilder.__escape_z__()
   SuffixedIntegerTokenBuilder.__escape_z__()
   RealTokenBuilder.__escape_z__()
   RealExponentTokenBuilder.__escape_z__()
   SuffixedRealTokenBuilder.__escape_z__()
   IdentifierTokenBuilder.__escape_z__()
   PrefixedIdentifierTokenBuilder.__escape_z__()
   CaseInsensitiveListTokenBuilder.__escape_z__()
   CaseSensitiveListTokenBuilder.__escape_z__()
   SingleCharacterTokenBuilder.__escape_z__()
   SlashSlashCommentTokenBuilder.__escape_z__()
   SlashStarCommentTokenBuilder.__escape_z__()
   ClassTypeTokenBuilder.__escape_z__()
   HexRealExponentTokenBuilder.__escape_z__()
   NestedCommentTokenBuilder.__escape_z__()
   return 'Escape ?Z'
Example #2
0
 def __escape_z__():
   InvalidTokenBuilder.__escape_z__()
   WhitespaceTokenBuilder.__escape_z__()
   NewlineTokenBuilder.__escape_z__()
   EscapedStringTokenBuilder.__escape_z__()
   PrefixedStringTokenBuilder.__escape_z__()
   IntegerTokenBuilder.__escape_z__()
   IntegerExponentTokenBuilder.__escape_z__()
   PrefixedIntegerTokenBuilder.__escape_z__()
   SuffixedIntegerTokenBuilder.__escape_z__()
   RealTokenBuilder.__escape_z__()
   AssemblyCommentTokenBuilder.__escape_z__()
   return 'Escape ?Z'
Example #3
0
 def __escape_z__():
     InvalidTokenBuilder.__escape_z__()
     WhitespaceTokenBuilder.__escape_z__()
     NewlineTokenBuilder.__escape_z__()
     StringTokenBuilder.__escape_z__()
     IntegerTokenBuilder.__escape_z__()
     IntegerExponentTokenBuilder.__escape_z__()
     RealTokenBuilder.__escape_z__()
     RealExponentTokenBuilder.__escape_z__()
     IdentifierTokenBuilder.__escape_z__()
     CaseInsensitiveListTokenBuilder.__escape_z__()
     CaseSensitiveListTokenBuilder.__escape_z__()
     SingleCharacterTokenBuilder.__escape_z__()
     SuffixedIntegerTokenBuilder.__escape_z__()
     BlockTokenBuilder.__escape_z__()
     return 'Escape ?Z'
Example #4
0
 def __escape_z__():
     InvalidTokenBuilder.__escape_z__()
     WhitespaceTokenBuilder.__escape_z__()
     NewlineTokenBuilder.__escape_z__()
     EscapedStringTokenBuilder.__escape_z__()
     PrefixedStringTokenBuilder.__escape_z__()
     IntegerTokenBuilder.__escape_z__()
     IntegerExponentTokenBuilder.__escape_z__()
     PrefixedIntegerTokenBuilder.__escape_z__()
     SuffixedIntegerTokenBuilder.__escape_z__()
     RealTokenBuilder.__escape_z__()
     IdentifierTokenBuilder.__escape_z__()
     CaseInsensitiveListTokenBuilder.__escape_z__()
     CaseSensitiveListTokenBuilder.__escape_z__()
     LeadToEndOfLineTokenBuilder.__escape_z__()
     SingleCharacterTokenBuilder.__escape_z__()
     LabelTokenBuilder.__escape_z__()
     AssemblyCommentTokenBuilder.__escape_z__()
     MultilineCommentTokenBuilder.__escape_z__()
     HashQuoteCharTokenBuilder.__escape_z__()
     return 'Escape ?Z'
Example #5
0
 def __escape_z__():
     InvalidTokenBuilder.__escape_z__()
     WhitespaceTokenBuilder.__escape_z__()
     NewlineTokenBuilder.__escape_z__()
     StuffedQuoteStringTokenBuilder.__escape_z__()
     IntegerTokenBuilder.__escape_z__()
     SuffixedIntegerTokenBuilder.__escape_z__()
     IntegerExponentTokenBuilder.__escape_z__()
     RealTokenBuilder.__escape_z__()
     SuffixedRealTokenBuilder.__escape_z__()
     RealExponentTokenBuilder.__escape_z__()
     CaseInsensitiveListTokenBuilder.__escape_z__()
     CaseSensitiveListTokenBuilder.__escape_z__()
     SingleCharacterTokenBuilder.__escape_z__()
     PrefixedIntegerTokenBuilder.__escape_z__()
     LeadToEndOfLineTokenBuilder.__escape_z__()
     NullTokenBuilder.__escape_z__()
     BasicVariableTokenBuilder.__escape_z__()
     BasicLongVariableTokenBuilder.__escape_z__()
     RemarkTokenBuilder.__escape_z__()
     UserFunctionTokenBuilder.__escape_z__()
     LongUserFunctionTokenBuilder.__escape_z__()
     HardwareFunctionTokenBuilder.__escape_z__()
     return 'Escape ?Z'
Example #6
0
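  # Examiner constructor: registers the token builders below, tokenizes the input code,
  # then derives statistics and a set of confidence measures for the language match.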
  def __init__(self, code):
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    big_integer_tb = SuffixedIntegerTokenBuilder(['n', 'N'], False, '_')
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    hex_constant_tb = PrefixedIntegerTokenBuilder('0X', False, '0123456789ABCDEFabcdef')
    octal_constant_tb = PrefixedIntegerTokenBuilder('0O', False, '01234567')
    binary_constant_tb = PrefixedIntegerTokenBuilder('0B', False, '01')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    template_string_tb = EscapedStringTokenBuilder(['`'], 10)
    operand_types.append('string')

    regex_tb = RegexTokenBuilder()
    operand_types.append('regex')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
      '+', '-', '*', '/', '%',
      '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=',
      '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=', '^=', '<<=', '>>=',
      '!', '!!', '&', '|', '~', '<<', '>>', '>>>', '>>>=',
      '^', '**',
      '.', ':',
      '++', '--', '&&', '||',
      '?', '?.',
      'new', 'delete'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    self.unary_operators = [
      '+', '-',
      '!', '!!', '~',
      '++', '--', ':',
      'new', 'delete'
    ]

    self.postfix_operators = [
      '++', '--', ':'
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    # group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    keywords = [
      'abstract',
      'break',
      'case', 'catch', 'class', 'const', 'continue',
      'debugger', 'default', 'do',
      'else', 'export', 'extends',
      'final', 'finally', 'for', 'function',
      'goto',
      'if', 'import', 'in', 'instanceof',
      'let',
      'native', 'new',
      'return',
      'switch', 'synchronized',
      'throw', 'throws', 'transient', 'try', 'typeof',
      'var', 'void', 'volatile',
      'while', 'with',
      'yield'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    values = [
      'this', 'super', 'null', 'true', 'false'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
      whitespace_tb,
      newline_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      big_integer_tb,
      real_tb,
      real_exponent_tb,
      hex_constant_tb,
      octal_constant_tb,
      binary_constant_tb,
      keyword_tb,
      values_tb,
      known_operator_tb,
      groupers_tb,
      regex_tb,
      identifier_tb,
      string_tb,
      template_string_tb,
      slash_slash_comment_tb,
      slash_star_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

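    # confidence measures: token mix, operator placement, grouping, operand runs,
    # keyword presence, paired braces, and line lengths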
    self.calc_token_confidence()
    self.calc_token_2_confidence([';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
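
A minimal, hypothetical driver for the constructor above. The wrapping class is not shown in the snippet, so the name JavaScriptExaminer is only an assumed placeholder; the sketch reads only attributes that the constructor sets (tokens) or that the later examples read back (confidences).

# hypothetical driver; the Examiner subclass name below is assumed, not taken from the snippet
with open('sample.js') as source_file:
    code = source_file.read()

examiner = JavaScriptExaminer(code)   # assumed wrapper class for the __init__ above
print(examiner.tokens)                # token list stored by the constructor
print(examiner.confidences)           # per-measure confidences, as read in the later examples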
Example #7
0
    def __init__(self, code, tab_size):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        real_tb = RealTokenBuilder(True, True, None)
        hex_integer_1_tb = PrefixedIntegerTokenBuilder(
            '$', False, '0123456789abcdefABCDEF')
        hex_integer_2_tb = PrefixedIntegerTokenBuilder(
            '#$', False, '0123456789abcdefABCDEF')
        hex_integer_3_tb = PrefixedIntegerTokenBuilder(
            '&', False, '0123456789abcdefABCDEF')
        hex_integer_h_tb = SuffixedIntegerTokenBuilder(['h'], False,
                                                       'abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(
            ['Q', 'A', 'O', 'D', 'B'], False, None)
        operand_types.append('number')

        leads = '_$#.'
        extras = '_$#.'
        identifier_tb = IbmAsmIdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        hex_string_tb = PrefixedStringTokenBuilder('X', False, quotes)
        char_string_tb = PrefixedStringTokenBuilder('C', False, quotes)
        operand_types.append('string')

        known_operators = ['+', '-', '*', '/', '=', '&', '#', '?']

        self.unary_operators = ['+', '-', '=', '&', '#', '?']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
        group_starts = ['(', '[', ',', '{', '<']
        group_ends = [')', ']', '}', '>']
        group_mids = [',']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # keywords = []

        # keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # types = []

        # types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)

        values = ['*']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        comment_tb = AssemblyCommentTokenBuilder(';*')

        title_directive_tb = LeadToEndOfLineTokenBuilder(
            'TITLE', False, 'directive')
        subtitle_directive_tb = LeadToEndOfLineTokenBuilder(
            'SUBTTL', False, 'directive')
        include_directive_tb = LeadToEndOfLineTokenBuilder(
            'INCLUDE', False, 'directive')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, integer_exponent_tb,
            hex_integer_1_tb, hex_integer_2_tb, hex_integer_3_tb,
            hex_integer_h_tb, binary_integer_tb, suffixed_integer_tb, real_tb,
            values_tb, groupers_tb, known_operator_tb, title_directive_tb,
            subtitle_directive_tb, include_directive_tb, identifier_tb,
            string_tb, hex_string_tb, char_string_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        opcode_tokenbuilders = [identifier_tb, invalid_token_builder]

        args_tokenbuilders = [
            integer_tb, integer_exponent_tb, hex_integer_1_tb,
            hex_integer_2_tb, hex_integer_3_tb, hex_integer_h_tb,
            binary_integer_tb, suffixed_integer_tb, real_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, string_tb,
            hex_string_tb, char_string_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
        args_tokenizer = Tokenizer(args_tokenbuilders)

        # tokenize as free-format
        tokens_free = tokenizer.tokenize(code)
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid operator')
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid')
        tokens_free = Examiner.convert_values_to_operators(
            tokens_free, known_operators)
        self.tokens = tokens_free
        self.convert_asm_identifiers_to_labels()

        self.calc_statistics()
        statistics_free = self.statistics
        self.statistics = {}

        self.calc_confidences(operand_types, group_starts, group_mids,
                              group_ends, None)
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences_free = self.confidences
        self.confidences = {}
        errors_free = self.errors
        self.errors = []

        # tokenize as space-format
        opcode_extras = '.&=,()+-*/'
        label_leads = '.&$@'
        label_mids = '.&$#@'
        label_ends = ':,'
        comment_leads = '*;!'
        line_comment_leads = ''
        use_line_id = False
        tokens_space, indents = Tokenizer.tokenize_asm_code(
            code, tab_size, opcode_tokenizer, opcode_extras, args_tokenizer,
            label_leads, label_mids, label_ends, comment_leads,
            line_comment_leads, use_line_id)
        tokens_space = Examiner.combine_adjacent_identical_tokens(
            tokens_space, 'invalid operator')
        tokens_space = Examiner.combine_adjacent_identical_tokens(
            tokens_space, 'invalid')
        tokens_space = Examiner.combine_identifier_colon(
            tokens_space, ['newline'], [], [])
        tokens_space = Tokenizer.combine_number_and_adjacent_identifier(
            tokens_space)
        tokens_space = Examiner.convert_values_to_operators(
            tokens_space, known_operators)
        self.tokens = tokens_space
        self.convert_asm_identifiers_to_labels()

        self.calc_statistics()
        statistics_space = self.statistics
        self.statistics = {}

        self.calc_confidences(operand_types, group_starts, group_mids,
                              group_ends, indents)
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences_space = self.confidences
        self.confidences = {}
        errors_space = self.errors
        self.errors = []

        # select the better of free-format and spaced-format
        confidence_free = 1.0
        for key in confidences_free:
            factor = confidences_free[key]
            confidence_free *= factor

        confidence_space = 1.0
        for key in confidences_space:
            factor = confidences_space[key]
            confidence_space *= factor

        if confidence_space > confidence_free:
            self.tokens = tokens_space
            self.statistics = statistics_space
            self.confidences = confidences_space
            self.errors = errors_space
        else:
            self.tokens = tokens_free
            self.statistics = statistics_free
            self.confidences = confidences_free
            self.errors = errors_free
Example #8
0
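  # IBM mainframe assembly examiner (see the AssemblyIBMExaminer calls below): the source is
  # tokenized twice, once free-format and once with opcode/label-aware spacing, and the
  # higher-confidence result is kept.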
  def __init__(self, code, tab_size, processor):
    super().__init__()

    self.newlines_important = 'always'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    real_tb = RealTokenBuilder(True, True, None)
    hex_integer_1_tb = PrefixedIntegerTokenBuilder('$', False, '0123456789abcdefABCDEF')
    hex_integer_2_tb = PrefixedIntegerTokenBuilder('#$', False, '0123456789abcdefABCDEF')
    hex_integer_3_tb = PrefixedIntegerTokenBuilder('&', False, '0123456789abcdefABCDEF')
    hex_integer_h_tb = SuffixedIntegerTokenBuilder(['h'], False, 'abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['Q', 'A', 'O', 'D', 'B'], False, None)
    operand_types.append('number')

    leads = '$#.@&'
    extras = '$#.@&'
    identifier_tb = IbmAsmIdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    hex_string_tb = PrefixedStringTokenBuilder('X', False, quotes)
    char_string_tb = PrefixedStringTokenBuilder('C', False, quotes)
    operand_types.append('string')

    known_operators = [
      '+', '-', '*', '/', '=', '&', '#', '?', "'"
    ]

    self.unary_operators = [
      '+', '-', '=', '&', '#', '?', "'"
    ]

    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
    group_starts = ['(', '[', ',', '{', '<']
    group_ends = [')', ']', '}', '>']
    group_mids = [',']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    preprocessors = [
      'MACRO', 'MEND'
    ]

    preprocessor_tb = CaseInsensitiveListTokenBuilder(preprocessors, 'preprocessor', False)

    directives = [
      'CSECT',
      'DC', 'DROP', 'DS', 
      'EJECT', 'END', 'ENTRY', 'EQU', 'EXTRN',
      'FREEMAIN',
      'GETMAIN', 'GLOBAL',
      'NAM', 'NAME',
      'ORG',
      'PAGE', 'PARAM', 'PROC', 'PUBLIC',
      'RETURN',
      'STIMER',
      'TITLE', 'SUBTTL',
      'USING'
    ]

    directive_tb = CaseInsensitiveListTokenBuilder(directives, 'directive', False)

    keywords = []

    keywords_360 = [
      'A', 'ABEND', 'AD', 'ADR', 'AE', 'AER', 'AH', 'AL', 'ALR', 'AP', 'AR',
      'AU', 'AUR', 'AW', 'AWR', 'AXR',
      'B', 'BAL', 'BALR', 'BAS', 'BASR', 'BC', 'BCR', 'BCT', 'BCTR',
      'BE', 'BH', 'BL', 'BM', 'BNE', 'BNH', 'BNL', 'BNM', 'BNP', 'BNO', 'BNZ',
      'BO', 'BP', 'BR', 'BXH', 'BXLE', 'BZ',
      'C', 'CD', 'CDR', 'CE', 'CER', 'CH', 'CL', 'CLC', 'CLI', 'CLR', 'CP',
      'CR', 'CVB', 'CVD',
      'D', 'DD', 'DDR', 'DE', 'DER', 'DIAGNOSE', 'DP', 'DR',
      'ED', 'EDMK', 'EX',
      'HDR', 'HER', 'HIO',
      'IC', 'ISK',
      'L',
      'LA', 'LCR', 'LCDR', 'LCER', 'LD', 'LDR',
      'LE', 'LER', 'LH', 'LM',
      'LNDR', 'LNER', 'LNR',
      'LPDR', 'LPER', 'LPR', 'LPSW', 'LR', 'LRDR', 'LRER',
      'LTDR', 'LTER', 'LTR',
      'M', 'MD', 'MDR', 'ME', 'MER', 'MH', 'MP', 'MR', 'MVC', 'MVI',
      'MVN', 'MVO', 'MVZ', 'MXD', 'MXDR', 'MXR',
      'N', 'NC', 'NI', 'NOP', 'NOPR', 'NR',
      'O', 'OC', 'OI', 'OR',
      'PACK',
      'RDD',
      'S', 'SD', 'SDR', 'SE', 'SER', 'SH', 'SIO',
      'SL', 'SLA', 'SLDA', 'SLDL', 'SLL', 'SLR',
      'SP', 'SPM',
      'SR', 'SRA', 'SRDL', 'SRP',
      'SSK', 'SSM', 'SRDA', 'SRL',
      'ST', 'STC', 'STD', 'STE', 'STH', 'STM', 'SU', 'SUR', 'SVC',
      'SW', 'SWR', 'SXR',
      'TCH', 'TIO', 'TM', 'TR', 'TRT', 'TS',
      'UNPK', 'UNPKU',
      'WRD',
      'X', 'XC', 'XI', 'XR',
      'ZAP'
    ]

    keywords_370 = [
      'BRXH', 'BRXLE',
      'CLCL',
      'HDV',
      'LAM', 'LEDR',
      'MS', 'MVCL',
      'RIO',
      'SIOF', 'STAM',
      'VA', 'VACD', 'VACDR', 'VACE', 'VACER',
      'VAD', 'VADQ', 'VADR', 'VADS',
      'VAE', 'VAEQ', 'VAER', 'VAES',
      'VAQ', 'VAR', 'VAS',
      'VC', 'VCD', 'VCDQ', 'VCDR', 'VCDS',
      'VCE', 'VCEQ', 'VCER', 'VCES',
      'VCQ', 'VCR', 'VCS',
      'VDD', 'VDDQ', 'VDDR', 'VDDS',
      'VDE', 'VDEQ', 'VDER', 'VDES',
      'VL', 'VLCDR', 'VLCER', 'VLCR',
      'VLD', 'VLDQ', 'VLDR', 'VLEQ', 'VLH', 'VLINT',
      'VLM', 'VLMD', 'VLMDQ', 'VLMDR', 'VLMEQ', 'VLMQ', 'VLMR',
      'VLNDR', 'VLNER', 'VLNR', 'VLPDR', 'VLPER', 'VLPR',
      'VLQ', 'VLR', 'VLY', 'VLYD', 'VLZDR', 'VLZR',
      'VM', 'VMAD', 'VMADQ', 'VMADS', 'VMAE', 'VMAEQ', 'VMAES',
      'VMCD', 'VMCE', 'VMCER',
      'VMD', 'VMDQ', 'VMDR', 'VMDS',
      'VME', 'VMEQ', 'VMER', 'VMES',
      'VMQ', 'VMR', 'VMS', 'VMSD', 'VMSDQ', 'VMSDS', 'VMSE', 'VMSEQ', 'VMSES',
      'VN', 'VNQ', 'VNR', 'VNS',
      'VO', 'VOQ', 'VOR', 'VOS',
      'VS', 'VSD', 'VSDQ', 'VSDR', 'VSDS',
      'VSE', 'VSEQ', 'VSER', 'VSES',
      'VSQD', 'VSQDR', 'VSQE', 'VSQER',
      'VSQ', 'VSR', 'VSS', 'VST', 'VSTD', 'VSTE', 'VSTH', 'VSTKD', 'VSTMD',
      'VTAD', 'VTAE', 'VTSD', 'VTSE',
      'VX', 'VXQ', 'VXR', 'VXS',
      'VMXSE', 'VMNSE', 'VMXAE', 'VLELE', 'VSELE', 'VMXDS', 'VMNSD', 'VMXAD',
      'VLELD', 'VXELD', 'VSPSD', 'VAPSD', 'VTVM', 'VCVM', 'VCZVM', 'VCOVM',
      'VXVC', 'VXVMM', 'VRRS', 'VRSVC', 'VRSV', 'VLVM', 'VLCVM', 'VSTVM', 'VNVM',
      'VOVM', 'VXVM', 'VSRSV', 'VMRSV', 'VSRRS', 'VLVCA', 'VRCL', 'VSVMM',
      'VLVXA', 'VSVTP', 'VACSV', 'VACRS',
      'STNSM', 'SOTSM', 'SIOP', 'MC', 'LRA', 'CONCS', 'DISCS', 'STIDP', 'SCK',
      'SPT', 'STPT', 'SPKA', 'IPK', 'PTLB', 'SPX', 'STPX', 'STAP', 'RRB',
      'PC', 'SAC', 'IPTE',
      'IVSK', 'IAC', 'SSAR', 'EPAR', 'ESAR', 'PT', 'ISKE', 'RRBE', 'SSKE', 'TB',
      'STCTL', 'LCTL', 'CS', 'CDS', 'CLM', 'STCM', 'ICM',
      'MVCK', 'MVCP', 'MVCS', 'VLI', 'VSTI', 'VLID', 'VSTID', 'VSRL',
      'VSLL', 'VLBIX', 'LASP', 'TPROT', 'STRAG',
      'MVCSK', 'MVCDK', 'DPFET', 'MVHHI', 'MVGHI', 'MVHI', 'CHHSI', 'CLHHSI',
      'CGHSI', 'CLGHSI', 'CHSI', 'CLFHSI', 'TBEGIN', 'TBEGINC', 'MVCIN', 'UNPKA'
    ]

    keywords_390 = [
      'BASSM', 'BSG', 'BSM',
      'CLRCH', 'CMPS', 'CLRIO', 'CMSG',
      'LAE', 'LXDR',
      'MDE',
      'PFPO', 'PR', 'PTFF',
      'SAM24', 'SAM31', 'SCKPF',
      'TAM', 'TMPS', 'TMSG', 'TRACE', 'TRAP2',
      'TMH', 'TMLH', 'TML', 'TMLL', 'TMHH', 'TMHL',
      'BRC', 'BRAS', 'BRCT', 'BRCTG',
      'LHI', 'LGHI',
      'AHI', 'AGHI',
      'MHI', 'MGHI',
      'CHI', 'CGHI',
      'MVCLE', 'CLCLE',
      'UPT',
      'SIE', 'PCF', 'CFC', 'DEP', 'DCTP', 'MAD', 'MUN', 'STCAP', 'SERVC',
      'IPM', 'DXR', 'PGIN', 'PGOUT', 'CSCH', 'HSCH', 'MSCH', 'SSCH', 'STSCH', 'TSCH',
      'TPI', 'SAL', 'RSCH', 'STCRW', 'STCPS', 'RCHP', 'SCHM', 'STZP', 'SZP',
      'TPZI', 'BAKR', 'CKSM', 'MADS', 'SQDR', 'STURA', 'MSTA', 'PALB', 'EREG',
      'ESTA', 'LURA', 'TAR', 'SQDR', 'SAR', 'EAR', 'CSP', 'MSR', 'MVPG', 'MVST',
      'CUSE', 'BSG', 'CLST', 'SRST', 'XSCH', 'RP', 'STCKE', 'SACF', 'STSI',
      'SRNM', 'STFPC', 'LFPC', 'TRE', 'CUUTF', 'CUTFU', 'STFL', 'LPSWE',
      'TRAP4', 'LPEBR', 'LNEBR', 'LTEBR', 'LCEBR', 'LDEBR', 'LXDBR', 'LDEBR',
      'MXDBR', 'KEBR', 'CEBR', 'AEBR', 'SEBR', 'MDEBR', 'DEBR', 'MAEBR',
      'MSEBR', 'LPDBR', 'LCDBR', 'SQEBR', 'MEEBR', 'KDBR', 'CDBR', 'ADBR',
      'MDBR', 'DDBR', 'SDBR', 'LDER', 'LXDR', 'MAER', 'MSER', 'SQXR', 'MEER',
      'MADR', 'MSDR', 'LPXBR', 'LNXBR', 'LTXBR', 'LCXBR', 'LCXBR', 'LEDBR',
      'LDXBR', 'LEXBR', 'FIXBR', 'KXBR', 'CXBR', 'AXBR', 'SXBR', 'MXBR', 'DXBR',
      'TBEDR', 'TBDR', 'DIEBR', 'FIEBR', 'THDER', 'DIDBR', 'FIDBR', 'LPXR',
      'LNXR', 'LTXR', 'LCXR', 'LXR', 'LEXR', 'FIXR', 'CXR', 'LZER', 'LZDR',
      'LZXR', 'FIER', 'FIDR', 'SFPC', 'EFPC', 'CEFBR', 'CDFBR', 'CXFBR', 'CEGBR',
      'CEFR', 'CDFR', 'CXFR', 'CFDR', 'CFXR', 'CEGR', 'CDGR', 'CXGR', 'CGER', 'CGDR', 'CGXR',
      'CDGBR', 'CXGBR', 'CGDBR', 'CGEBR', 'CGXBR',
      'LMC', 'LPGR', 'LNGR', 'LTGR', 'LCGR', 'LGC', 'LURAG', 'AGR', 'SGR',
      'ALGR', 'SLGR', 'MSGR', 'DSGR', 'EREGG', 'LRVGR', 'LPGFR', 'LNGFR',
      'LTGFR', 'LCGFR', 'LGFR', 'LLGFR', 'LLGTR', 'AGFR', 'SGFR', 'ALGFR',
      'SLGFR', 'MSGFR', 'DSGFR', 'LRVR', 'CGR', 'CLGR', 'STURG', 'CGFR',
      'CLGFR', 'BCTGR', 'NGR', 'OGR', 'XGR', 'MLGR', 'DLGR', 'ALCGR', 'SLBGR',
      'EPSW', 'TRTT', 'TRTO', 'TROT', 'TROO', 'MLR', 'DLR', 'ALCR', 'SLBR', 'ESEA',
      'LARL', 'LGFI', 'BRCL', 'BRASL', 'XIHF', 'XILF', 'IIHF', 'IILF',
      'NIHF', 'NILF', 'OIHF', 'OILF', 'LLIHF', 'LLILF', 'LLHRL', 'LGHRL',
      'LHRL', 'AGFI', 'AFI', 'ALGFI', 'ALFI', 'CGFI', 'CFI', 'LLGFRL', 'STRL',
      'EXRL', 'PFDRL', 'CGHRL', 'CHRL', 'CLGHRL', 'CLHRL', 'CGRL', 'CLGRL',
      'CRL', 'CLGFRL', 'CLRL', 'MVCOS', 'ECTG', 'CSST', 'PKU',
      'LRAG', 'LG', 'AG', 'SG', 'ALG', 'SLG', 'MSG', 'DSG', 'CVBG',
      'LRVG', 'LGF', 'LGH', 'LLGF', 'LLGT', 'AGF', 'SGF', 'ALGF', 'SLGF',
      'MSGF', 'DSGF', 'LRV', 'LRVH', 'CG', 'CLG', 'STG', 'CVDG', 'STRVG',
      'CGF', 'CLGF', 'STRV', 'STRVH', 'BCTG', 'NG', 'OG', 'XG', 'MLG',
      'DLG', 'ALCG', 'SLBG', 'STPQ', 'LPQ', 'LLGC', 'LLGH', 'ML', 'DL',
      'ALC', 'SLB', 'PKA',
      'DIL', 'BDIL', 'ANUM', 'COMP', 'MCPU', 'MIO', 'BIFLAG', 'MULDIV',
      'LMG', 'SRAG', 'SLAG', 'SRLG', 'SLLG', 'TRACG', 'RLLG', 'RLL',
      'CLMH', 'CLMY', 'CLT', 'CLTH', 'CLTL', 'CLTNE', 'CLTE', 'CLTNL',
      'CLTNH', 'STMG', 'STCTG', 'STMH', 'STCMH', 'LCTLG', 'CSG', 'CDSG',
      'BXHG', 'BXLEG', 'ICMH', 'MVCLU', 'CLCLU', 'LMH', 'LMY', 'TP',
      'SRAK', 'SLAK', 'SRLK', 'SRLK', 'LOCG', 'BRXHG', 'BRXLG', 'LDEB',
      'LXDB', 'LXEB', 'MXDB', 'KEB', 'CEB', 'AEB', 'SEB', 'MDEB', 'DEB',
      'MAEB', 'MSEB', 'TCEB', 'TCDB', 'TCXB', 'SQEB', 'SQDB', 'MEEB',
      'KDB', 'CDB', 'ADB', 'SDB', 'MDB', 'DDB', 'MADB', 'MSDB', 'LDE',
      'LXD', 'LXE', 'SQE', 'SQD', 'MEE', 'PLO', 'LMD'
    ]

    keywords_z = [
      'IIHH', 'IIHL', 'IILH', 'IILL',
      'LLIHH', 'LLIHL', 'LLILH', 'LLILL',
      'NIHH', 'NIHL', 'NILH', 'NILL',
      'OIHH', 'OIHL', 'OILH', 'OILL',
      'SAM64'
    ]

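    # keyword sets are cumulative: each later processor generation also accepts the
    # mnemonics of the earlier generations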
    if processor in ['360', '370', '390', 'system-z']:
      keywords += keywords_360

    if processor in ['370', '390', 'system-z']:
      keywords += keywords_370

    if processor in ['390', 'system-z']:
      keywords += keywords_390

    if processor in ['system-z']:
      keywords += keywords_z

    opcode_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    registers = [
      'R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10',
      'R11', 'R12', 'R13', 'R14', 'R15',
      'FP0', 'FP2', 'FP4', 'FP6'
    ]

    register_tb = CaseInsensitiveListTokenBuilder(registers, 'register', True)

    values = ['*']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    comment_tb = LeadToEndOfLineTokenBuilder('!', False, 'comment')
    line_comment_tb = AssemblyCommentTokenBuilder('*')

    include_directive_tb = LeadToEndOfLineTokenBuilder('INCLUDE', False, 'directive')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_1_tb,
      hex_integer_2_tb,
      hex_integer_3_tb,
      hex_integer_h_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      register_tb,
      opcode_tb,
      directive_tb,
      include_directive_tb,
      preprocessor_tb,
      identifier_tb,
      string_tb,
      hex_string_tb,
      char_string_tb,
      comment_tb,
      line_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    opcode_tokenbuilders = [
      whitespace_tb,
      opcode_tb,
      directive_tb,
      include_directive_tb,
      preprocessor_tb,
      identifier_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    args_tokenbuilders = [
      whitespace_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_1_tb,
      hex_integer_2_tb,
      hex_integer_3_tb,
      hex_integer_h_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      register_tb,
      identifier_tb,
      string_tb,
      hex_string_tb,
      char_string_tb,
      comment_tb,
      line_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
    args_tokenizer = Tokenizer(args_tokenbuilders)

    # tokenize as free-format
    tokens_free = tokenizer.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(tokens_free, 'invalid')
    tokens_free = AssemblyIBMExaminer.convert_keywords_to_identifiers(tokens_free)
    tokens_free = Examiner.convert_values_to_operators(tokens_free, known_operators)
    self.tokens = tokens_free
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics1 = self.statistics
    self.statistics = {}

    self.calc_confidences(operand_types, group_starts, group_mids, group_ends, None)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_free = self.confidences
    self.confidences = {}
    errors_free = self.errors
    self.errors = []

    # tokenize as space-format
    opcode_extras = '.&=,()+-*/'
    label_leads = '.&$@'
    label_mids = '.&$#@'
    label_ends = ':,'
    comment_leads = '!'
    line_comment_leads = '*'
    use_line_id = True
    tokens_space, indents = Tokenizer.tokenize_asm_code(code, tab_size, opcode_tokenizer, opcode_extras, args_tokenizer, label_leads, label_mids, label_ends, comment_leads, line_comment_leads, use_line_id)
    tokens_space = Examiner.combine_adjacent_identical_tokens(tokens_space, 'invalid operator')
    tokens_space = Examiner.combine_adjacent_identical_tokens(tokens_space, 'invalid')
    tokens_space = Examiner.combine_identifier_colon(tokens_space, ['newline'], [], [])
    tokens_space = Tokenizer.combine_number_and_adjacent_identifier(tokens_space)
    tokens_space = AssemblyIBMExaminer.convert_opcodes_to_keywords(tokens_space, keywords)
    tokens_space = AssemblyIBMExaminer.convert_keywords_to_identifiers(tokens_space)
    tokens_space = Examiner.convert_values_to_operators(tokens_space, known_operators)
    self.tokens = tokens_space
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics2 = self.statistics
    self.statistics = {}

    self.calc_confidences(operand_types, group_starts, group_mids, group_ends, indents)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_space = self.confidences
    self.confidences = {}
    errors_space = self.errors
    self.errors = []

    # select the better of free-format and spaced-format

    confidence_free = 1.0
    for key in confidences_free:
      factor = confidences_free[key]
      confidence_free *= factor

    confidence_space = 1.0
    for key in confidences_space:
      factor = confidences_space[key]
      confidence_space *= factor

    if confidence_space > confidence_free:
      self.tokens = tokens_space
      self.statistics = statistics2
      self.confidences = confidences_space
      self.errors = errors_space
    else:
      self.tokens = tokens_free
      self.statistics = statistics1
      self.confidences = confidences_free
      self.errors = errors_free
Example #9
0
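    # Tokenizes the source both as free-format and as fixed-format (column-sensitive, using
    # the PL1Comment*TokenBuilder classes below) and keeps whichever yields the higher
    # combined confidence.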
    def __init__(self, code, tab_size, wide):
        super().__init__()

        self.operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False, None)
        hex_integer_tb = SuffixedIntegerTokenBuilder(['H'], False, 'ABCDEF')
        octal_integer_tb = SuffixedIntegerTokenBuilder(['O'], False, None)
        decimal_integer_tb = SuffixedIntegerTokenBuilder(['D'], False, None)
        real_tb = RealTokenBuilder(True, False, None)
        real_exponent_tb = RealExponentTokenBuilder(True, False, 'E', None)
        binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'], False,
                                                  None)
        self.operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        self.operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        self.operand_types.append('string')

        label_tb = PL1LabelTokenBuilder()
        self.operand_types.append('label')

        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        directives = [
            '%ACTIVATE', '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY',
            '%DO', '%ELSE', '%END', '%FATAL', '%GOTO', '%IF', '%INCLUDE',
            '%LIST', '%NOLIST', '%PAGE', '%PROCEDURE', '%PROC', '%REPLACE',
            '%RETURN', '%THEN'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        preprocessor_tb = CaseInsensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True, 'preprocessor')
        subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True,
                                                  'preprocessor')
        error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True, 'preprocessor')
        warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True, 'preprocessor')
        inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True,
                                                'preprocessor')
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '**', '>', '<', '=', '>=', '<=', '<>', '^>',
            '^<', '^=', '^', '~>', '~<', '~=', '~', '&', '&:', ':=', '|', '|:',
            '||', '!', '!:', '!!', ':', '@', 'NOT', 'AND', 'OR', 'XOR',
            'MINUS', 'PLUS', 'MOD'
        ]

        self.unary_operators = ['+', '-', '^', '~', '@', 'NOT']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        self.group_starts = ['(', '[', ',', '{']
        self.group_mids = [',']
        self.group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'ADDRESS', 'AT', 'BASED', 'BY', 'CALL', 'CASE', 'CLOSE', 'DATA',
            'DECLARE', 'DISABLE', 'DO', 'ELSE', 'ENABLE', 'END', 'EOF',
            'EXTERNAL', 'GO', 'GOTO', 'HALT', 'IF', 'INITIAL', 'INTERRUPT',
            'LABEL', 'LITERALLY', 'OFFSET', 'ON', 'OPEN', 'OTHERWISE', 'OTHER',
            'PROCEDURE', 'PROC', 'PUBLIC', 'READ', 'REENTRANT', 'RETURN',
            'SELECTOR', 'STRUCTURE', 'THEN', 'TO', 'WHILE', 'WRITE'
        ]

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        attributes = [
            'ALIGNED', 'ANY', 'AREA', 'BASED', 'BUILTIN', 'CONDITION', 'COND',
            'CONTROLLED', 'CTL', 'DEFINED', 'DEF', 'DIRECT', 'ENTRY',
            'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT', 'FILE', 'GLOBALDEF',
            'GLOBALREF', 'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
            'KEYED', 'LABEL', 'LIKE', 'LIST', 'MEMBER', 'NONVARYING', 'NONVAR',
            'OPTIONAL', 'OPTIONS', 'OUTPUT', 'PARAMETER', 'PARM', 'PICTURE',
            'PIC', 'POSITION', 'POS', 'PRECISION', 'PREC', 'PRINT', 'READONLY',
            'RECORD', 'REFER', 'RETURNS', 'SEQUENTIAL', 'SEQL', 'STATIC',
            'STREAM', 'STRUCTURE', 'TRUNCATE', 'UNALIGNED', 'UNAL', 'UNION',
            'UPDATE', 'VARIABLE', 'VARYING', 'VAR'
        ]

        attributes_tb = CaseInsensitiveListTokenBuilder(
            attributes, 'attribute', False)

        functions = [
            'ABS', 'ADJUSTRPL', 'BLOCKINPUT', 'BLOCKINWORD', 'BLOCKINDWORD',
            'BLOCKOUTPUT', 'BLOCKOUTWORD', 'BLOCKOUTDWORD', 'BUILDPTR',
            'BYTESWAP', 'CMPD', 'CARRY', 'CAUSEINTERRUPT',
            'CLEARTASKSWITCHEDFLAG', 'CMPB', 'CMPW', 'CONTROLREGISTER', 'DEC',
            'DOUBLE', 'DEBUGREGISTER', 'FINDB', 'FINDD', 'FINDRD', 'FINDHW',
            'FINDRB', 'FINDRHW', 'FINDRW', 'FINDW', 'FIX', 'FLAGS', 'FLOAT',
            'GETACCESSRIGHTS', 'GETREALERROR', 'GETSEGMENTLIMIT', 'HIGH',
            'IABS', 'INHWORD', 'INITREALMATHUNITSKIPRB', 'INPUT', 'INT SIZE',
            'INWORD SIZE', 'INVALIDATEDATACACHE', 'INVALIDATETLBENTRY',
            'INDWORD', 'LAST', 'LENGTH', 'LOCALTABLE', 'LOCKSET', 'LOW',
            'MACHINESTATUS', 'MOVB', 'MOVBIT', 'MOVD', 'MOVE', 'MOVHW',
            'MOVRB', 'MOVRBIT',
            'MOVRD', 'MOVRHW', 'MOVRW', 'MOVW', 'NIL', 'OFFSETOF', 'OUTDWORD',
            'OUTHWORD', 'OUTPUT', 'OUTWORD', 'PARITY', 'RESTOREGLOBALTABLE',
            'RESTOREINTERRUPTABLE', 'RESTOREREALSTATUS', 'ROL', 'ROR', 'SAL',
            'SAR', 'SAVEGLOBALTABLE', 'SAVEINTERRUPTTABLE', 'SAVEREALSTATUS',
            'SCANBIT', 'SCANRBIT', 'SCL', 'SCR', 'SEGMENTREADABLE',
            'SEGMENTWRITABLE', 'SELECTOROF', 'SETB', 'SETHW', 'SETREALMODE',
            'SETW', 'SHL', 'SHLD', 'SHR', 'SHRD', 'SETD', 'SIGN', 'SIGNED',
            'SKIPB', 'SKIPD', 'SKIPRD',
            'SKIPHW', 'SKIPRHW', 'SKIPRW', 'SKIPW', 'STACKBASE', 'STACKPTR',
            'TASKREGISTER', 'TESTREGISTER', 'TIME', 'UNSIGN',
            'WAITFORINTERRUPT', 'WBINVALIDATEDATACACHE', 'XLAT', 'ZERO'
        ]

        function_tb = CaseInsensitiveListTokenBuilder(functions, 'function',
                                                      True)

        format_items = [
            'A', 'B', 'B1', 'B2', 'B3', 'B4', 'COLUMN', 'COL', 'E', 'F', 'P',
            'R', 'TAB', 'X'
        ]

        format_item_tb = CaseSensitiveListTokenBuilder(format_items, 'format',
                                                       True)
        self.operand_types.append('format')

        options = [
            'APPEND', 'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT',
            'BLOCK_IO', 'BLOCK_SIZE', 'BUCKET_SIZE', 'BY', 'CANCEL_CONTROL_O',
            'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS', 'CONTIGUOUS_BEST_TRY',
            'CREATION_DATE', 'CURRENT_POSITION', 'DEFAULT_FILE_NAME',
            'DEFERRED_WRITE', 'DELETE', 'EDIT', 'EXPIRATION_DATE',
            'EXTENSION_SIZE', 'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO',
            'FILE_SIZE', 'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
            'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
            'FIXED_LENGTH_RECORDS', 'FROM', 'GROUP_PROTECTION', 'IDENT',
            'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
            'INITIAL_FILL', 'INTO', 'KEY', 'KEYFROM', 'KEYTO', 'LINESIZE',
            'LOCK_ON_READ', 'LOCK_ON_WRITE', 'MAIN PROCEDURE',
            'MANUAL_UNLOCKING', 'MATCH_GREATER', 'MATCH_GREATER_EQUAL',
            'MATCH_NEXT', 'MATCH_NEXT_EQUAL', 'MAXIMUM_RECORD_NUMBER',
            'MAXIMUM_RECORD_SIZE', 'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
            'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
            'NO_ECHO', 'NO_FILTER', 'NO_SHARE', 'OWNER_GROUP', 'OWNER_ID',
            'OWNER_MEMBER', 'OWNER_PROTECTION', 'PAGE', 'PAGESIZE',
            'PRINTER_FORMAT', 'PROMPT', 'PURGE_TYPE_AHEAD', 'READ_AHEAD',
            'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID', 'RECORD_ID_ACCESS',
            'RECORD_ID_TO', 'RECURSIVE', 'REPEAT', 'RESCAN',
            'RETRIEVAL_POINTERS', 'REVISION_DATE', 'REWIND_ON_CLOSE',
            'REWIND_ON_OPEN', 'SCALARVARYING', 'SET READ', 'SHARED_READ',
            'SHARED_WRITE', 'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE',
            'SYSTEM', 'SYSTEM_PROTECTION', 'TEMPORARY', 'TIMEOUT_PERIOD',
            'TITLE', 'TO', 'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
            'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
            'WRITE_CHECK'
        ]

        options_tb = CaseInsensitiveListTokenBuilder(options, 'option', False)

        conditions = [
            'ANYCONDITION', 'CONVERSION', 'CONV', 'ENDFILE', 'ENDPAGE',
            'FINISH', 'FIXEDOVERFLOW', 'FOFL', 'OVERFLOW', 'OFL', 'STORAGE',
            'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG', 'UNDEFINEDFILE',
            'UNDF', 'VAXCONDITION', 'ZERODIVIDE', 'ZDIV'
        ]

        conditions_tb = CaseInsensitiveListTokenBuilder(
            conditions, 'condition', False)

        subroutines = [
            'DISPLAY', 'EXTEND', 'FLUSH', 'NEXT_VOLUME', 'RELEASE', 'RESIGNAL',
            'REWIND', 'SPACEBLOCK'
        ]

        subroutines_tb = CaseInsensitiveListTokenBuilder(
            subroutines, 'subroutine', False)

        types = [
            'ADDRESS', 'BYTE', 'CHARINT', 'DWORD', 'HWORD', 'INTEGER',
            'LONGINT', 'OFFSET', 'POINTER', 'REAL', 'SHORTINT', 'STRUCTURE',
            'QWORD', 'WORD'
        ]

        types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        self.operand_types.append('type')

        values = ['SYSIN', 'SYSPRINT', 'TRUE', 'FALSE']

        values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        self.operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # tokenize as free-format
        tokenbuilders_free = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, binary_integer_tb, hex_integer_tb,
            octal_integer_tb, decimal_integer_tb, real_tb, real_exponent_tb,
            binary_real_tb, keyword_tb, format_item_tb, function_tb,
            attributes_tb, options_tb, conditions_tb, subroutines_tb, types_tb,
            values_tb, groupers_tb, known_operator_tb, identifier_tb,
            string_tb, label_tb, slash_star_comment_tb, preprocessor_tb,
            title_tb, subtitle_tb, error_tb, warn_tb, inform_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer_free = Tokenizer(tokenbuilders_free)
        tokens_free = tokenizer_free.tokenize(code)
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid operator')
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid')
        self.tokens = tokens_free

        self.calc_statistics()
        statistics_free = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_free = self.confidences
        self.confidences = {}
        errors_free = self.errors
        self.errors = []

        # tokenize as fixed-format
        tokenbuilders_fixed = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, binary_integer_tb, hex_integer_tb,
            octal_integer_tb, decimal_integer_tb, real_tb, real_exponent_tb,
            binary_real_tb, keyword_tb, function_tb, attributes_tb, options_tb,
            conditions_tb, subroutines_tb, types_tb, values_tb, groupers_tb,
            known_operator_tb, identifier_tb, string_tb, label_tb,
            slash_star_comment_tb, preprocessor_tb, title_tb, subtitle_tb,
            error_tb, warn_tb, inform_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        comment_start_tb = PL1CommentStartTokenBuilder()
        comment_middle_tb = PL1CommentMiddleTokenBuilder()
        comment_end_tb = PL1CommentEndTokenBuilder()

        type1_tokenbuilders = [comment_start_tb]
        tokenbuilders_fixed_1 = tokenbuilders_fixed + type1_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

        type2_tokenbuilders = [
            comment_start_tb, comment_middle_tb, comment_end_tb
        ]
        tokenbuilders_fixed_2 = tokenbuilders_fixed + type2_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

        tokens_fixed = self.tokenize_code(code, tab_size, tokenizer_fixed_1,
                                          tokenizer_fixed_2, wide)
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid operator')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'whitespace')
        tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)
        self.tokens = tokens_fixed

        self.calc_statistics()
        statistics_fixed = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_fixed = self.confidences
        self.confidences = {}
        errors_fixed = self.errors
        self.errors = []

        # compute confidence for free-format and fixed-format
        confidence_free = 1.0
        if len(confidences_free) == 0:
            confidence_free = 0.0
        else:
            for key in confidences_free:
                factor = confidences_free[key]
                confidence_free *= factor

        confidence_fixed = 1.0
        if len(confidences_fixed) == 0:
            confidence_fixed = 0.0
        else:
            for key in confidences_fixed:
                factor = confidences_fixed[key]
                confidence_fixed *= factor

        # select the better of free-format and spaced-format
        if confidence_fixed > confidence_free:
            self.tokens = tokens_fixed
            self.statistics = statistics_fixed
            self.confidences = confidences_fixed
            self.errors = errors_fixed
        else:
            self.tokens = tokens_free
            self.statistics = statistics_free
            self.confidences = confidences_free
            self.errors = errors_free
Example #10
0
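    # Single-pass examiner: tokenizes once, then computes statistics and confidence measures.
    # Symbol literals with a leading quote, triple-quoted strings, and keywords such as
    # 'forSome', 'implicit', and 'sealed' suggest a Scala-style source language.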
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF_')
        long_integer_tb = SuffixedIntegerTokenBuilder('L', False, None)
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        float_real_tb = SuffixedRealTokenBuilder(False, False, ['f'], False,
                                                 None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        symbol_tb = PrefixedIdentifierTokenBuilder("'", 'symbol', True)
        operand_types.append('symbol')

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '&', '|', '^', '<<', '>>', '&&', '||',
            '=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
            '>:', '⇒', '=>', '=', '<%', '<:', '←', '<-', '#', '@', '==', '!=',
            '>', '<', '>=', '<=', '!', '~', '<<<', '>>>', '.', '++', '--',
            'new'
        ]

        self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']

        self.postfix_operators = ['++', '--']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'abstract', 'case', 'catch', 'class', 'def', 'do', 'else',
            'extends', 'final', 'finally', 'for', 'forSome', 'if', 'implicit',
            'import', 'lazy', 'match', 'object', 'override', 'package',
            'private', 'protected', 'return', 'sealed', 'then', 'throw',
            'trait', 'try', 'type', 'val', 'var', 'while', 'with', 'yield'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        values = ['false', 'true', 'null', 'this', 'super']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, long_integer_tb,
            real_tb, real_exponent_tb, float_real_tb, keyword_tb, values_tb,
            groupers_tb, known_operator_tb, identifier_tb, symbol_tb,
            string_tb, triple_string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #11
0
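    # Single-pass examiner with a shebang builder and operators such as '?.', '?:', '<=>',
    # '..', and '..<', which suggest a Groovy-style source language.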
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("_")
        hex_integer_tb = PrefixedIntegerTokenBuilder(
            '0x', False, '_0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder([
            'G',
            'L',
            'I',
        ], False, '_')
        real_tb = RealTokenBuilder(False, False, "_")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "_")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False,
                                                    ['G', 'D', 'F'], False,
                                                    '_')
        operand_types.append('number')

        leads = '@_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        regex_tb = RegexTokenBuilder()
        # dollar-slash slash-dollar strings (allow newline)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        shebang_tb = SheBangTokenBuilder()
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+',
            '-',
            '*',
            '/',
            '%',
            '**',
            '=',
            '==',
            '!=',
            '===',
            '!==',
            '>',
            '>=',
            '<',
            '<=',
            '+=',
            '-=',
            '*=',
            '/=',
            '%=',
            '**=',
            '&=',
            '|=',
            '^=',
            '<<=',
            '>>=',
            '!',
            '&',
            '|',
            '~',
            '<<',
            '>>',
            '>>>',
            '^',
            '?.',
            '?:',
            '<>',
            '>>>=',
            '.',
            '.&',
            '.@',
            '::',
            '=~',
            '==~',
            '*.',
            '*:',
            '..',
            '..<',
            '<=>',
            '++',
            '--',
            '->',
            '&&',
            '||',
            '?',
            '##',
            'as',
            'in',
            '!in',
            'instanceof',
            '!instanceof',
            'new',
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        # group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
            'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
            'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
            'package', 'return', 'super', 'switch', 'throw', 'throws', 'trait',
            'try', 'var', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'byte', 'char', 'double', 'float', 'int', 'long', 'short',
            'Java.lang.BigInteger'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['null', 'true', 'false', 'this']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, terminators_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
            keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
            identifier_tb, class_type_tb, string_tb, triple_quote_string_tb,
            regex_tb, slash_slash_comment_tb, slash_star_comment_tb,
            shebang_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        # shebang line at start

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
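A minimal driver sketch for the examiner above (hedged: the class name GroovyExaminer, the file path, and reading per-check factors from a confidences dict are assumptions inferred from these listings, not confirmed by this excerpt):

# Hypothetical usage. GroovyExaminer is an assumed name for the class whose
# __init__ appears above; 'sample.groovy' is a placeholder path. Constructing
# the examiner tokenizes the code and runs the calc_* confidence steps; each
# step is assumed to record a factor in self.confidences, as the assembly
# example further below does.
with open('sample.groovy') as src:
    examiner = GroovyExaminer(src.read())

overall = 1.0
for factor in examiner.confidences.values():
    overall *= factor  # combine per-check factors, mirroring the free/space comparison below
print(len(examiner.tokens), 'tokens, confidence', overall)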
Example #12
    def __init__(self, code):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(
            ['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l'],
                                                    False, None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        directives = [
            '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
            '#elif', '#line', '#include', '#pragma'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        c_preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')
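        # '#warning' and '#error' take the rest of the line as a single 'preprocessor' token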
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '~', '<<', '>>', '^', '.', '++', '--', '->', '&&', '||',
            '?', '##', '::', '<=>', '.*', '->*', 'new', 'delete', 'and',
            'and_eq', 'bitand', 'bitor', 'compl', 'not', 'not_eq', 'or',
            'or_eq', 'xor', 'xor_eq'
        ]

        self.unary_operators = [
            '+', '-', '*', '!', '&', '~', '++', '--', 'new', 'delete', 'compl',
            'not'
        ]

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'alignas',
            'alignof',
            'asm',
            'atomic_cancel',
            'atomic_commit',
            'atomic_noexcept',
            'audit',
            'auto',
            'axiom',
            'break',
            'case',
            'catch',
            'class',
            'concept',
            'const',
            'consteval',
            'constexpr',
            'const_cast',
            'continue',
            'co_await',
            'co_return',
            'co_yield',
            'decltype',
            'default',
            'do',
            'dynamic_cast',
            'else',
            'enum',
            'explicit',
            'export',
            'extern',
            'final',
            'for',
            'friend',
            'goto',
            'if',
            'import',
            'inline',
            'module',
            'mutable',
            'namespace',
            'noexcept',
            'nullptr',
            'operator',
            'override',
            'private',
            'protected',
            'public',
            'private:',
            'protected:',
            'public:',
            'reflexpr',
            'register',
            'reinterpret_cast',
            'requires',
            'return',
            'signed',
            'sizeof',
            'static',
            'static_assert',
            'static_cast',
            'struct',
            'switch',
            'synchronized',
            'template',
            'thread_local',
            'throw',
            'transaction_safe',
            'transaction_safe_dynamic',
            'try',
            'typedef',
            'typeid',
            'typename',
            'union',
            'unsigned',
            'using',
            'virtual',
            'volatile',
            'while',
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'bool', 'char', 'char8_t', 'char16_t', 'char32_t', 'double',
            'float', 'int', 'long', 'short', 'void', 'wchar_t'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['false', 'this', 'true', 'cout', 'cin']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
            integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
            suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
            keyword_tb, types_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, class_type_tb, string_tb, slash_slash_comment_tb,
            slash_star_comment_tb, c_preprocessor_tb, c_error_tb, c_warning_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #13
    def __init__(self, code, version):
        super().__init__()
        self.newlines_important = 'always'

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder(False)
        real_tb = RealTokenBuilder(False, False, False)
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
        double_exponent_tb = NullTokenBuilder()
        integer_suffix_tb = SuffixedIntegerTokenBuilder(
            ['%', '&', 'S', 'I', 'L', 'F', 'D', 'R', 'US', 'UI', 'UL'], True,
            '_')
        float_suffix_tb = SuffixedRealTokenBuilder(False, False,
                                                   ['!', '#', 'F', 'D', 'R'],
                                                   True, '_')
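        # the Microsoft 8-bit dialects below swap in a 'D' double-precision exponent
        # and a narrower set of numeric type suffixes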

        if version in ['basic-80', 'basica', 'gw-basic']:
            double_exponent_tb = RealExponentTokenBuilder(
                False, False, 'D', '_')
            integer_suffix_tb = SuffixedIntegerTokenBuilder(['%'], False, '_')
            float_suffix_tb = SuffixedRealTokenBuilder(False, False,
                                                       ['!', '#'], True, '_')

        hex_constant_tb = PrefixedIntegerTokenBuilder(
            '&H', True, '0123456789ABCDEFabcdef_')
        octal_constant_tb = PrefixedIntegerTokenBuilder(
            '&O', True, '01234567_')
        binary_constant_tb = PrefixedIntegerTokenBuilder('&B', True, '01_')

        operand_types.append('number')

        variable_tb = BasicVariableTokenBuilder('%#!$&')

        if version in ['basic-80', 'basica', 'gw-basic']:
            variable_tb = BasicLongVariableTokenBuilder('%#!$&')

        operand_types.append('variable')

        quotes = ['"']
        string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
        operand_types.append('string')

        remark_tb = RemarkTokenBuilder()
        comment_tb = LeadToEndOfLineTokenBuilder("'", False, 'comment')
        comment2_tb = LeadToEndOfLineTokenBuilder("’", False, 'comment')

        stmt_separator_tb = SingleCharacterTokenBuilder(
            ':', 'statement separator', False)

        known_operators = [
            '+', '-', '*', '/', '^', '=', '>', '>=', '<', '<=', '<>', '#',
            '\\', 'AND', 'OR', 'NOT'
        ]

        known_operators_ms = ['=>', '=<', 'IMP', 'EQV', 'XOR', 'MOD']

        if version in ['basic-80', 'basica', 'gw-basic']:
            known_operators += known_operators_ms

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', '#', 'NOT']

        groupers = ['(', ')', ',', ';']
        group_starts = ['(']
        group_mids = [',', ';']
        group_ends = [')']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'CLOSE', 'CHAIN', 'DATA', 'DEF', 'DIM', 'ELSE', 'END', 'ERROR',
            'FILE', 'FOR', 'GOSUB', 'GOTO', 'IF', 'INPUT', 'LET', 'LINE',
            'MAT', 'NEXT', 'ON', 'ONERR', 'OPEN', 'OUTPUT', 'POKE', 'PRINT',
            'RANDOMIZE', 'READ', 'REM', 'REMARK', 'RESTORE', 'RETURN', 'STEP',
            'STOP', 'THEN', 'TO', 'USING'
        ]

        keywords_plain = ['AS', 'GO']

        keywords_ms = [
            # 'AS',  ## promoted from variable after FIELD
            # 'BASE',  ## promoted from variable after OPTION
            'CALL',
            'CLEAR',
            'CLS',
            'COMMON',
            'DEFDBL',
            'DEFINT',
            'DEFSNG',
            'DEFSTR',
            'ELSE',
            'END',
            'ERASE',
            'ERRLN',
            'ERRNO',
            'ERROR',
            'FIELD',
            'FILES',
            'GET',
            'KILL',
            'LOAD',
            'LPRINT',
            'LSET',
            'MERGE',
            'NULL',
            'ONERR',
            'OPTION',
            'OUT',
            'PUT',
            'RESET',
            'RESUME',
            'RETURN',
            'RSET',
            'RUN',
            'SET',
            'SWAP',
            'SYSTEM',
            'TRON',
            'TROFF',
            'WAIT',
            'WHILE',
            'WEND',
            'WIDTH',
            'WRITE'
        ]

        plus_keywords = ['CHANGE']

        if version in ['']:
            keywords += keywords_plain

        if version in ['basic-80', 'basica', 'gw-basic']:
            keywords += keywords_ms

        keywords_basica = [
            'COLOR', 'KEY', 'LOCATE', 'PAINT', 'PLAY', 'SCREEN', 'SOUND'
        ]

        if version in ['basica', 'gw-basic']:
            keywords += keywords_basica

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        values = ['OFF', 'ON']

        values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        functions = [
            'ABS', 'ASC', 'ATN', 'CHR', 'CHR$', 'CON', 'COS', 'DET', 'ERL',
            'ERR', 'EXP', 'IDN', 'INSTR', 'INT', 'INV', 'LEFT', 'LEFT$', 'LEN',
            'LOG', 'MID', 'MID$', 'POS', 'RIGHT', 'RIGHT$', 'RND', 'SGN',
            'SIN', 'SQR', 'STR$', 'TAB', 'TAN', 'TRN', 'VAL', 'ZER'
        ]

        functions_ms = [
            'CDBL', 'CINT', 'CSNG', 'CVI', 'CVD', 'CVS', 'DATE$', 'EOF', 'FIX',
            'FRE', 'HEX$', 'INKEY', 'INP', 'INPUT$', 'INSTR', 'LOC', 'LOF',
            'LPOS', 'MKI$', 'MKD$', 'MKS$', 'OCT$', 'PEEK', 'SPACE$', 'SPC',
            'STRING$', 'TIME$', 'USR', 'VARPTR'
        ]

        if version in ['basic-80', 'basica', 'gw-basic']:
            functions += functions_ms

        function_tb = CaseInsensitiveListTokenBuilder(functions, 'function',
                                                      True)
        user_function_tb = UserFunctionTokenBuilder('%#!$&')
        hardware_function_tb = NullTokenBuilder()

        if version in ['basic-80', 'basica', 'gw-basic']:
            user_function_tb = LongUserFunctionTokenBuilder('%#!$&')
            hardware_function_tb = HardwareFunctionTokenBuilder()

        operand_types.append('function')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, float_suffix_tb, integer_suffix_tb, real_tb,
            real_exponent_tb, double_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, keyword_tb,
            known_operator_tb, function_tb, user_function_tb,
            hardware_function_tb, values_tb, variable_tb, groupers_tb,
            string_tb, remark_tb, comment_tb, comment2_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        operand_types = [
            'number', 'string', 'symbol', 'identifier', 'variable', 'function'
        ]
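        # note: this fixed list supersedes the operand_types entries appended earlier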

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = BasicExaminer.convert_numbers_to_line_numbers(tokens)

        if version in ['basic-80', 'basica', 'gw-basic']:
            tokens = BasicExaminer.extract_keywords_from_identifiers(
                tokens, keywords, known_operators)
            tokens = BasicExaminer.convert_as_to_keyword(tokens)
            tokens = BasicExaminer.convert_base_to_keyword(tokens)
            tokens = BasicExaminer.convert_operators_to_values(tokens)

        self.tokens = tokens

        self.calc_statistics()

        tokens = self.source_tokens()

        self.calc_statistics()
        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'variable', 'symbol']

        if version not in ['basic-80', 'basica', 'gw-basic']:
            self.calc_operand_n_confidence(tokens, operand_types_2, 2)
            self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Example #14
    def __init__(self, code, tab_size, processor):
        super().__init__()

        self.newlines_important = 'always'

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        comment_tb = LeadToEndOfLineTokenBuilder(';', True, 'comment')

        if processor in ['pdp-8']:
            comment_tb = LeadToEndOfLineTokenBuilder('/', True, 'comment')

        comment_2_tb = NullTokenBuilder()

        if processor in ['1802']:
            comment_2_tb = LeadToEndOfLineTokenBuilder('..', True, 'comment')

        line_comment_star_tb = AssemblyCommentTokenBuilder('*')
        line_comment_hash_tb = NullTokenBuilder()

        if processor in ['68000']:
            line_comment_hash_tb = AssemblyCommentTokenBuilder('#')

        stmt_separator_tb = NullTokenBuilder()

        if processor in ['pdp-8']:
            stmt_separator_tb = SingleCharacterTokenBuilder(
                ';', 'statement separator', False)

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        integer_1_tb = NullTokenBuilder()
        integer_2_tb = NullTokenBuilder()
        prefixed_integer_tb = PrefixedIntegerTokenBuilder(
            '#', True, '0123456789')
        if processor in ['pdp-11']:
            integer_1_tb = SuffixedIntegerTokenBuilder('$', True, '0123456789')
        if processor in ['z80']:
            integer_1_tb = SuffixedIntegerTokenBuilder('O', True, '0123456789')
            integer_2_tb = SuffixedIntegerTokenBuilder('D', True, '0123456789')

        hex_integer_1_tb = PrefixedIntegerTokenBuilder(
            '&', True, '0123456789abcdefABCDEF')
        hex_integer_2_tb = SuffixedIntegerTokenBuilder(
            'h', False, '0123456789abcdefABCDEF')
        hex_integer_3_tb = PrefixedIntegerTokenBuilder(
            '$', True, '0123456789abcdefABCDEF')
        hex_integer_4_tb = PrefixedIntegerTokenBuilder(
            '#$', True, '0123456789abcdefABCDEF')
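        # together these accept '&'-prefixed, 'h'-suffixed, '$'-prefixed, and '#$'-prefixed hex literals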

        hash_quote_value_tb = NullTokenBuilder()

        if processor in ['pdp-11']:
            hash_quote_value_tb = HashQuoteCharTokenBuilder()

        operand_types.append('number')

        leads = '_.$@#'
        extras = '_.$@#'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        label_tb = LabelTokenBuilder(leads, extras, ':')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        known_operators = [
            '+', '-', '*', '/', '&', '|', '=', '??', '#', '@', "'", '!'
        ]

        self.unary_operators = ['+', '-', '??', '#', '@', "'"]

        self.postfix_operators = ['+']

        groupers = ['(', ')', ',', '[', ']', '<', '>', ':']
        group_starts = ['(', '[', ',', '<']
        group_ends = [')', ']', '>']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        preprocessors = [
            'if', 'ifne', 'ifeq', 'else', 'endif', 'endc', 'error'
        ]

        preprocessors_68000 = ['MACRO', 'ENDM']

        preprocessors_8080 = ['MACRO', 'ENDM']

        preprocessors_8086 = [
            'ELSE', 'ELSEIF', 'ELSEIF2', 'ENDM', 'EXITM', 'FOR', 'FORC',
            'GOTO', 'IF', 'IF2', 'IFB', 'IFNB', 'IFDEF', 'IFNDEF', 'IFDIF',
            'IFDIF[[I]]', 'IFE', 'IFIDN', 'IFIDN[[I]]', 'LOCAL', 'MACRO',
            'PURGE', '.BREAK', '.CONTINUE', '.ELSE', '.ELSEIF', '.ENDIF',
            '.ERR', '.ERR2', '.ERRB', '.ERRDEF', '.ERRDIF', '.ERRDIF[[I]]',
            '.ERRE', '.ERRIDN', '.ERRIDN[[I]]', '.ERRNB', '.ERRNDEF', '.ERRNZ',
            '.EXIT', '.IF', '.REPEAT', '.UNTIL', '.UNTILCXZ', '.WHILE'
        ]

        if processor in ['68000']:
            preprocessors += preprocessors_68000

        if processor in ['8080']:
            preprocessors += preprocessors_8080

        if processor in ['8086']:
            preprocessors += preprocessors_8086

        preprocessor_tb = CaseInsensitiveListTokenBuilder(
            preprocessors, 'preprocessor', False)

        directives = [
            'DB', 'DW', 'DS', 'EJECT', 'END', 'EQU', 'EXTRN', 'INCLUDE',
            'NAME', 'ORG', 'PAGE', 'SECTION', 'SEGMENT', 'START', 'SUBTITLE',
            'TEXT'
        ]

        directives_6502 = ['DFB', 'DFW']

        directives_6800 = ['CPU', 'NAM']

        directives_68000 = ['=', 'EVEN', 'ODD']

        directives_8080 = [
            'ASEG',
            'CPU',
            'LOCAL',
            'TITLE',
            '.8080',
            '.8086',
            '.6800',
            '.6502',
            ".386",
        ]

        directives_z80 = ['DEFB', 'DEFS', 'DEFW']

        directives_8086 = [
            '=',
            'ABSOLUTE',
            'ALIAS',
            'ALIGN',
            'AS',
            'ASSUME',
            'AT',
            'BITS',
            'BYTE',
            'COMM',
            'COMMON',
            'CPU',
            'CSEG',
            'DEFAULT',
            'DSEG',
            'DWORD',
            'ECHO',
            'ENDP',
            'ENDS',
            'EVEN',
            'EXTERNDEF',
            'FWORD',
            'FORMAT',
            'GLOBAL',
            'GROUP',
            'INCLUDELIB',
            'INS86',
            'INVOKE',
            'LABEL',
            'MMWORD',
            'OPTION',
            'POPCONTEXT',
            'PROC',
            'PROTO',
            'PUBLIC',
            'PUSHCONTEXT',
            'SEGMENT',
            'QWORD',
            'REAL4',
            'REAL8',
            'REAL10',
            'RECORD',
            'STRUCT',
            'TEXTEQU',
            'TBYTE',
            'TYPEDEF',
            'WORD',
            'SBYTE',
            'SDWORD',
            'SWORD',
            'SECT',
            'SECTION',
            'SEGMENT',
            'STATIC',
            'UNION',
            'USE16',
            'USE32',
            'USE64',
            'VIRTUAL',
            'XMMWORD',
            'YMMWORD',
            '.386',
            '.386P',
            '.387',
            '.486',
            '.486P',
            '.586',
            '.586P',
            '.686',
            '.686P',
            '.K3D',
            '.ALLOCSTACK',
            '.ALPHA',
            '.CODE',
            '.CONST',
            '.CREF',
            '.DATA',
            '.DATA?',
            '.DOSSEG',
            '.ENDW',
            '.ENDPROLOG',
            '.FARDATA',
            '.FARDATA?',
            '.FPO',
            '.LIST',
            '.LISTALL',
            '.LISTIF',
            '.LISTMACRO',
            '.LISTMACROALL',
            '.MODEL',
            '.MMX',
            '.NOCREF',
            '.NOLIST',
            '.NOLISTIF',
            '.NOLISTMACRO',
            '.PUSHFRAME',
            '.PUSHREG',
            '.RADIX',
            '.SAFESEH',
            '.SALL',
            '.SAVEREG',
            '.SAVEXMM128',
            '.STACK',
            '.STARTUP',
            '.SEQ',
            '.SETFRAME',
            '.TFCOND',
            '.XLIST',
            '.XMM',
        ]

        directives_80386 = [
            'ALIGN',
            'BITS',
            'GLOBAL',
            'PROC',
            'SECTION',
            'RESB',
            'RESD',
            '.386',
            '.CODE',
            '.DATA',
            '.MODEL',
            '.TEXT',
            '%INCLUDE',
        ]

        directives_pdp8 = ['=']

        directives_pdp11 = [
            '=', 'BYTE', 'WORD', '.odd', '.even', '.blkb', '.blkw', '.byte',
            '.word', '.ascii', '.asciz', '.end', '.hex', '.radix', '.ident',
            '.if', '.ift', '.endc', '.psect', '.mcall', '.macro', '.endm',
            '.restore', '.print', '.error', '.list', '.nlist'
        ]

        if processor in ['6502']:
            directives += directives_6502

        if processor in ['6800']:
            directives += directives_6800

        if processor in ['68000']:
            directives += directives_68000

        if processor in ['8080']:
            directives += directives_8080

        if processor in ['z80']:
            directives += directives_z80

        if processor in ['8086']:
            directives += directives_8086

        if processor in ['80386']:
            directives += directives_80386

        if processor in ['pdp-8']:
            directives += directives_pdp8

        if processor in ['pdp-11']:
            directives += directives_pdp11

        directive_tb = CaseInsensitiveListTokenBuilder(directives, 'directive',
                                                       False)

        title_directive_tb = LeadToEndOfLineTokenBuilder(
            'TITLE', False, 'directive')
        title_directive_2_tb = LeadToEndOfLineTokenBuilder(
            '.TITLE', False, 'directive')
        subtitle_directive_tb = LeadToEndOfLineTokenBuilder(
            'SUBTTL', False, 'directive')
        subtitle_directive_2_tb = LeadToEndOfLineTokenBuilder(
            '.SUBTTL', False, 'directive')
        subtitle_directive_3_tb = LeadToEndOfLineTokenBuilder(
            '.SBTTL', False, 'directive')
        include_directive_tb = LeadToEndOfLineTokenBuilder(
            'INCLUDE', False, 'directive')
        include_directive_2_tb = LeadToEndOfLineTokenBuilder(
            '.INCLUDE', False, 'directive')

        multiline_comment_tb = MultilineCommentTokenBuilder()

        opcodes_1802 = [
            'IDL', 'LDN', 'INC', 'DEC', 'BR', 'BO', 'BZ', 'BDF', 'BPZ', 'BGE',
            'B1', 'B2', 'B3', 'B4', 'SKP', 'NBR', 'BNO', 'BNZ', 'BNF', 'BM',
            'BL', 'BN1', 'BN2', 'BN3', 'BN4', 'LDA', 'STR', 'IRX', 'OUT',
            'INP', 'RET', 'DIS', 'LDXA', 'STXD', 'ADC', 'SDB', 'SHRC', 'RSHR',
            'SMB', 'SAV', 'MARK', 'REQ', 'SEQ', 'ADCI', 'SDBI', 'SHLC', 'RSHL',
            'SMBI', 'GLO', 'GHI', 'PLO', 'PHI', 'LBO', 'LBZ', 'LBDF', 'NOP',
            'LSNO', 'LSNZ', 'LSNF', 'LSKP', 'NLBR', 'LBNQ', 'LBNZ', 'LBNF',
            'LSIE', 'LSQ', 'LSZ', 'LSDF', 'SEP', 'SEX', 'LDX', 'OR', 'AND',
            'XOR', 'ADD', 'SD', 'SHR', 'SM', 'LDI', 'ORI', 'ANI', 'XRI', 'ADI',
            'SDI', 'SHL', 'SMI'
        ]

        registers_1802 = []

        opcodes_6502 = [
            'ADC', 'AND', 'ASL', 'AST', 'BCC', 'BCS', 'BEQ', 'BIT', 'BMI',
            'BNE', 'BPL', 'BRK', 'BVC', 'BVS', 'CLC', 'CLD', 'CLI', 'CLV',
            'CMP', 'CPR', 'CPX', 'CPY', 'DEC', 'DEX', 'DEY', 'EOR', 'INC',
            'INX', 'INY', 'JMP', 'JSR', 'LDA', 'LDX', 'LDY', 'LSR', 'NOP',
            'ORA', 'PHA', 'PHP', 'PLA', 'PLP', 'ROL', 'ROR', 'RTI', 'RTS',
            'SBC', 'SEC', 'SED', 'SEI', 'STA', 'STX', 'STY', 'TAX', 'TAY',
            'TSX', 'TXA', 'TXS', 'TYA'
        ]

        registers_6502 = ['A', 'X', 'Y', 'P', 'S']

        opcodes_6800 = [
            'ABA', 'ADC', 'ADCA', 'ADCB', 'ADD', 'AND', 'ASL', 'ASR', 'BCC',
            'BCS', 'BEQ', 'BGE', 'BGT', 'BHI', 'BIT', 'BLE', 'BLS', 'BLT',
            'BMI', 'BNE', 'BPL', 'BRA', 'BSR', 'BVC', 'BVS', 'CBA', 'CLC',
            'CLI', 'CLR', 'CLRA', 'CLRB', 'CLV', 'CMP', 'COM', 'CPX', 'DAA',
            'DEC', 'DES', 'DEX', 'EOR', 'EORA', 'EORB', 'INC', 'INS', 'INX',
            'JMP', 'JSR', 'LDA', 'LDAA', 'LDAB', 'LDS', 'LDX', 'LSR', 'NEG',
            'NOP', 'ORA', 'PSH', 'PUL', 'ROL', 'ROR', 'RTI', 'RTS', 'SBA',
            'SBC', 'SEC', 'SEI', 'SEV', 'STA', 'STAA', 'STAB', 'STS', 'STX',
            'SUB', 'SWI', 'TAB', 'TAP', 'TBA', 'TPA', 'TST', 'TSX', 'TXS',
            'WAI'
        ]

        registers_6800 = ['A', 'B', 'IX', 'PC', 'SP']

        opcodes_68000 = [
            'AND', 'ANDI', 'EOR', 'EORI', 'NOT', 'OR', 'ORI', 'CLR', 'BCHG',
            'BCLR', 'BSET', 'BTST', 'EXT', 'EXTB', 'MOVE', 'MOVEA', 'MOVEM',
            'MOVEP', 'MOVEQ', 'CMP', 'CMPA', 'CMPI', 'CMPM', 'CMP2', 'LEA',
            'PEA', 'TAS', 'CHK', 'ADD', 'ADDA', 'ADDI', 'ADDQ', 'ADDX', 'SUB',
            'SUBA', 'SUBI', 'SUBQ', 'SUBX', 'MULS', 'MULU', 'DIVS', 'DIVU',
            'NEG', 'NEGX', 'ASL', 'ASR', 'LSL', 'LSR', 'ROL', 'ROR', 'ROXL',
            'ROXR', 'DBCC', 'SWAP', 'TST', 'ANDB', 'ANDIB', 'EORB', 'EORIB',
            'NOTB', 'ORB', 'ORIB', 'CLRB', 'BCHGB', 'BCLRB', 'BSETB', 'BTSTB',
            'EXTB', 'EXTBB', 'MOVEB', 'MOVEAB', 'MOVEMB', 'MOVEPB', 'MOVEQB',
            'CMPB', 'CMPAB', 'CMPIB', 'CMPMB', 'CMP2B', 'LEAB', 'PEAB', 'TASB',
            'CHKB', 'ADDB', 'ADDAB', 'ADDIB', 'ADDQB', 'ADDXB', 'SUBB',
            'SUBAB', 'SUBIB', 'SUBQB', 'SUBXB', 'MULSB', 'MULUB', 'DIVSB',
            'DIVUB', 'NEGB', 'NEGXB', 'ASLB', 'ASRB', 'LSLB', 'LSRB', 'ROLB',
            'RORB', 'ROXLB', 'ROXRB', 'DBCCB', 'SWAPB', 'TSTB', 'ANDW',
            'ANDIW', 'EORW', 'EORIW', 'NOTW', 'ORW', 'ORIW', 'CLRW', 'BCHGW',
            'BCLRW', 'BSETW', 'BTSTW', 'EXTW', 'EXTBW', 'MOVEW', 'MOVEAW',
            'MOVEMW', 'MOVEPW', 'MOVEQW', 'CMPW', 'CMPAW', 'CMPIW', 'CMPMW',
            'CMP2W', 'LEAW', 'PEAW', 'TASW', 'CHKW', 'ADDW', 'ADDAW', 'ADDIW',
            'ADDQW', 'ADDXW', 'SUBW', 'SUBAW', 'SUBIW', 'SUBQW', 'SUBXW',
            'MULSW', 'MULUW', 'DIVSW', 'DIVUW', 'NEGW', 'NEGXW', 'ASLW',
            'ASRW', 'LSLW', 'LSRW', 'ROLW', 'RORW', 'ROXLW', 'ROXRW', 'DBCCW',
            'SWAPW', 'TSTW', 'ANDL', 'ANDIL', 'EORL', 'EORIL', 'NOTL', 'ORL',
            'ORIL', 'CLRL', 'BCHGL', 'BCLRL', 'BSETL', 'BTSTL', 'EXTL',
            'EXTBL', 'MOVEL', 'MOVEAL', 'MOVEML', 'MOVEPL', 'MOVEQL', 'CMPL',
            'CMPAL', 'CMPIL', 'CMPML', 'CMP2L', 'LEAL', 'PEAL', 'TASL', 'CHKL',
            'ADDL', 'ADDAL', 'ADDIL', 'ADDQL', 'ADDXL', 'SUBL', 'SUBAL',
            'SUBIL', 'SUBQL', 'SUBXL', 'MULSL', 'MULUL', 'DIVSL', 'DIVUL',
            'NEGL', 'NEGXL', 'ASLL', 'ASRL', 'LSLL', 'LSRL', 'ROLL', 'RORL',
            'ROXLL', 'ROXRL', 'DBCCL', 'SWAPL', 'TSTL', 'ABCD', 'NBCD', 'PACK',
            'SBCD', 'UNPK', 'BSR', 'BRA', 'BT', 'BF', 'BEQ', 'BNE', 'BLS',
            'BLT', 'BLE', 'BGT', 'BGE', 'BCC', 'BCS', 'BPL', 'BMI', 'BHI',
            'BVC', 'BVS', 'BSRS', 'BRAS', 'BEQS', 'BNES', 'BLSS', 'BLTS',
            'BLES', 'BGTS', 'BGES', 'BCCS', 'BCSS', 'BPLS', 'BMIS', 'BHIS',
            'BVCS', 'BVSS', 'DBSR', 'DBRA', 'DBT', 'DBF', 'DBEQ', 'DBNE',
            'DBLS', 'DBLT', 'DBLE', 'DBGT', 'DBGE', 'DBCC', 'DBCS', 'DBPL',
            'DBMI', 'DBHI', 'DBVC', 'DBVS', 'JSR', 'JMP', 'TRAP', 'HALT',
            'STOP', 'RTD', 'RTE', 'RTR', 'RTS', 'TRAP', 'HALT', 'STOP', 'NOP',
            'MOVE16', 'EXG', 'BFCHG', 'BFCLR', 'BFEXTS', 'BFEXTU', 'BFFFO',
            'BFINS', 'BFSET', 'BFTST', 'FNOP', 'FABS', 'FACOS', 'FASIN',
            'FATAN', 'FCOS', 'FCOSH', 'FETOX', 'FETOXM1', 'FGETMAN', 'FINT',
            'FINTRZ', 'FLOGN', 'FLOGNP1', 'FLOG10', 'FLOG2', 'FNEG', 'FSIN',
            'FSINH', 'FSQRT', 'FTAN', 'FTANH', 'FTENTOX', 'FTWOTOX', 'FTST',
            'DSB', 'DSW', 'DSL', 'DCB', 'DCW', 'DCL', 'AND.B', 'ANDI.B',
            'EOR.B', 'EORI.B', 'NOT.B', 'OR.B', 'ORI.B', 'CLR.B', 'BCHG.B',
            'BCLR.B', 'BSET.B', 'BTST.B', 'EXT.B', 'EXTB.B', 'MOVE.B',
            'MOVEA.B', 'MOVEM.B', 'MOVEP.B', 'MOVEQ.B', 'CMP.B', 'CMPA.B',
            'CMPI.B', 'CMPM.B', 'CMP2.B', 'LEA.B', 'PEA.B', 'TAS.B', 'CHK.B',
            'ADD.B', 'ADDA.B', 'ADDI.B', 'ADDQ.B', 'ADDX.B', 'SUB.B', 'SUBA.B',
            'SUBI.B', 'SUBQ.B', 'SUBX.B', 'MULS.B', 'MULU.B', 'DIVS.B',
            'DIVU.B', 'NEG.B', 'NEGX.B', 'ASL.B', 'ASR.B', 'LSL.B', 'LSR.B',
            'ROL.B', 'ROR.B', 'ROXL.B', 'ROXR.B', 'DBCC.B', 'SWAP.B', 'TST.B',
            'AND.W', 'ANDI.W', 'EOR.W', 'EORI.W', 'NOT.W', 'OR.W', 'ORI.W',
            'CLR.W', 'BCHG.W', 'BCLR.W', 'BSET.W', 'BTST.W', 'EXT.W', 'EXTB.W',
            'MOVE.W', 'MOVEA.W', 'MOVEM.W', 'MOVEP.W', 'MOVEQ.W', 'CMP.W',
            'CMPA.W', 'CMPI.W', 'CMPM.W', 'CMP2.W', 'LEA.W', 'PEA.W', 'TAS.W',
            'CHK.W', 'ADD.W', 'ADDA.W', 'ADDI.W', 'ADDQ.W', 'ADDX.W', 'SUB.W',
            'SUBA.W', 'SUBI.W', 'SUBQ.W', 'SUBX.W', 'MULS.W', 'MULU.W',
            'DIVS.W', 'DIVU.W', 'NEG.W', 'NEGX.W', 'ASL.W', 'ASR.W', 'LSL.W',
            'LSR.W', 'ROL.W', 'ROR.W', 'ROXL.W', 'ROXR.W', 'DBCC.W', 'SWAP.W',
            'TST.W', 'AND.L', 'ANDI.L', 'EOR.L', 'EORI.L', 'NOT.L', 'OR.L',
            'ORI.L', 'CLR.L', 'BCHG.L', 'BCLR.L', 'BSET.L', 'BTST.L', 'EXT.L',
            'EXTB.L', 'MOVE.L', 'MOVEA.L', 'MOVEM.L', 'MOVEP.L', 'MOVEQ.L',
            'CMP.L', 'CMPA.L', 'CMPI.L', 'CMPM.L', 'CMP2.L', 'LEA.L', 'PEA.L',
            'TAS.L', 'CHK.L', 'ADD.L', 'ADDA.L', 'ADDI.L', 'ADDQ.L', 'ADDX.L',
            'SUB.L', 'SUBA.L', 'SUBI.L', 'SUBQ.L', 'SUBX.L', 'MULS.L',
            'MULU.L', 'DIVS.L', 'DIVU.L', 'NEG.L', 'NEGX.L', 'ASL.L', 'ASR.L',
            'LSL.L', 'LSR.L', 'ROL.L', 'ROR.L', 'ROXL.L', 'ROXR.L', 'DBCC.L',
            'SWAP.L', 'TST.L', 'BSR.S', 'BRA.S', 'BT.S', 'BF.S', 'BEQ.S',
            'BNE.S', 'BLS.S', 'BLT.S', 'BLE.S', 'BGT.S', 'BGE.S', 'BCC.S',
            'BCS.S', 'BPL.S', 'BMI.S', 'BHI.S', 'BVC.S', 'BVS.S', 'DS.B',
            'DS.W', 'DS.L', 'DC.B', 'DC.W', 'DC.L'
        ]

        registers_68000 = [
            'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'A0', 'A1', 'A2',
            'A3', 'A4', 'A5', 'A6', 'A7', 'FP0', 'FP1', 'FP2', 'FP3', 'FP4',
            'FP5', 'FP6', 'FP7', 'PC', 'SR'
        ]

        opcodes_8080 = [
            'ACI',
            'ADC',
            'ADD',
            'ADI',
            'ANA',
            'ANI',
            'CALL',
            'CC',
            'CM',
            'CMA',
            'CMC',
            'CMP',
            'CNC',
            'CNZ',
            'CP',
            'CPE',
            'CPI',
            'CPO',
            'CZ',
            'DAA',
            'DAD',
            'DCR',
            'DCX',
            'DI',
            'EI',
            'HLT',
            'IN',
            'INR',
            'INX',
            'JC',
            'JM',
            'JMP',
            'JNC',
            'JNZ',
            'JP',
            'JPE',
            'JPO',
            'JZ',
            'LDAX',
            'LHLD',
            'LXI',
            'MOV',
            'MVI',
            'NOP',
            'ORA',
            'ORI',
            'OUT',
            'PCHL',
            'POP',
            'PUSH',
            'RAL',
            'RAR',
            'RC',
            'RIM',
            'RLC',
            'RET',
            'RM',
            'RNC',
            'RNZ',
            'RP',
            'RPE',
            'RPO',
            'RRC',
            'RST',
            'RZ',
            'SBB',
            'SBI',
            'SHLD',
            'SIM',
            'SPHL',
            'STA',
            'STC',
            'STAX',
            'SUB',
            'SUI',
            'XCHG',
            'XRA',
            'XRI',
            'XTHL',
        ]

        registers_8080 = ['A', 'B', 'C', 'D', 'E', 'H', 'L', 'M', 'PSW', 'F']

        opcodes_z80 = [
            'ADC', 'ADD', 'AND', 'BIT', 'CALL', 'CCF', 'CP', 'CPD', 'CPDR',
            'CPI', 'CPIR', 'CPL', 'DAA', 'DEC', 'DI', 'DJNZ', 'EI', 'EX',
            'EXX', 'HALT', 'IM', 'IN', 'INC', 'IND', 'INDR', 'INI', 'INIR',
            'JP', 'JR', 'LD', 'LDD', 'LDDR', 'LDI', 'LDIR', 'NEG', 'NOP', 'OR',
            'OTDR', 'OTIR', 'OUT', 'OUTD', 'OUTI', 'POP', 'PUSH', 'RES', 'RET',
            'RETI', 'RETN', 'RL', 'RLA', 'RLC', 'RLCA', 'RLD', 'RR', 'RRA',
            'RRC', 'RRCA', 'RRD', 'RST', 'SBC', 'SCF', 'SET', 'SLA', 'SRA',
            'SRL', 'SUB', 'XOR'
        ]

        registers_z80 = [
            'A', 'B', 'C', 'D', 'E', 'H', 'L', 'F', 'AF', 'BC', 'DE', 'HL',
            "A'", "B'", "C'", "D'", "E'", "H'", "L'", "AF'", "F'", "BC'",
            "DE'", "HL'", 'IX', 'IY', 'PSW', 'M'
        ]

        opcodes_8086 = [
            'AAA',
            'AAD',
            'AAM',
            'AAS',
            'ADC',
            'ADD',
            'AND',
            'CALL',
            'CBW',
            'CLC',
            'CLD',
            'CLI',
            'CMC',
            'CMP',
            'CMPS',
            'CMPSB',
            'CMPW',
            'CMPXCHG',
            'CWD',
            'DAA',
            'DAS',
            'DEC',
            'DIV',
            'ESC',
            'FWAIT',
            'F2XM1',
            'FABS',
            'FADD',
            'FADDP',
            'FBLD',
            'FBSTP',
            'FCHS',
            'FCLEX',
            'FCOM',
            'FCOMP',
            'FCOMPP',
            'FCOS',
            'FDECSTP',
            'FDISI',
            'FDIV',
            'FDIVP',
            'FDIVR',
            'FDIVRP',
            'FENI',
            'FFREE',
            'FIADD',
            'FICOM',
            'FICOMP',
            'FIDIV',
            'FIDIVR',
            'FILD',
            'FIMUL',
            'FINCSTP',
            'FINIT',
            'FIST',
            'FISTP',
            'FISUB',
            'FISUBR',
            'FLD',
            'FLD1',
            'FLDCW',
            'FLDENV',
            'FLDL2E',
            'FLDL2T',
            'FLDLG2',
            'FLDLN2',
            'FLDPI',
            'FLDZ',
            'FMUL',
            'FMULP',
            'FNCLEX',
            'FNDISI',
            'FNENI',
            'FNINIT',
            'FNOP',
            'FNSAVE',
            'FNSTCW',
            'FNSTENV',
            'FNSTSW',
            'FPATAN',
            'FPREM',
            'FPREM1',
            'FPTAN',
            'FRNDINT',
            'FRSTOR',
            'FSAVE',
            'FSCALE',
            'FSETPM',
            'FSIN',
            'FSINCOS',
            'FSQRT',
            'FST',
            'FSTCW',
            'FSTENV',
            'FSTP',
            'FSTSW',
            'FSUB',
            'FSUBP',
            'FSUBRP',
            'FTST',
            'FUCOM',
            'FUCOMP',
            'FUCOMPP',
            'FXAM',
            'FXCH',
            'FXTRACT',
            'FYL2X',
            'FYL2XP1',
            'HLT',
            'IDIV',
            'IMUL',
            'IN',
            'INC',
            'INT',
            'INTO',
            'INVD',
            'IRET',
            'IRETD',
            'JA',
            'JAE',
            'JB',
            'JBE',
            'JC',
            'JCXZ',
            'JE',
            'JECXZ',
            'JG',
            'JGE',
            'JL',
            'JLE',
            'JMP',
            'JNA',
            'JNAE',
            'JNB',
            'JNBE',
            'JNC',
            'JNE',
            'JNG',
            'JNGE',
            'JNL',
            'JNLE',
            'JNO',
            'JNP',
            'JNS',
            'JO',
            'JP',
            'JPE',
            'JPO',
            'JNZ',
            'JS',
            'JZ',
            'LAHF',
            'LAR',
            'LDS',
            'LEA',
            'LES',
            'LOCK',
            'LODS',
            'LODSB',
            'LODSW',
            'LOOP',
            'LOOPE',
            'LOOPNE',
            'LOOPNZ',
            'LOOPZ',
            'MOV',
            'MOVS',
            'MOVSB',
            'MOVSW',
            'MUL',
            'NEG',
            'NOP',
            'NOT',
            'OR',
            'OUT',
            'POP',
            'POPF',
            'POPFD',
            'PUSH',
            'PUSHF',
            'PUSHFD',
            'RCL',
            'RCR',
            'REP',
            'REPE',
            'REPNE',
            'REPNZ',
            'REPZ',
            'RET',
            'RETF',
            'ROL',
            'ROR',
            'SAHF',
            'SAL',
            'SAR',
            'SBB',
            'SCAS',
            'SCASB',
            'SCASW',
            'SHL',
            'SHR',
            'STC',
            'STD',
            'STI',
            'STOS',
            'STOSB',
            'STOSW',
            'SUB',
            'TEST',
            'WAIT',
            'WBINVD',
            'XCHG',
            'XLAT',
            'XLATB',
            'XOR',
        ]

        registers_8086 = [
            'AL', 'AH', 'BL', 'BH', 'CL', 'CH', 'DL', 'DH', 'AX', 'BX', 'CX',
            'DX', 'CS', 'DS', 'SS', 'ES', 'IP', 'SI', 'DI', 'BP', 'SP', 'FLAGS'
        ]

        opcodes_80186 = [
            'BOUND', 'ENTER', 'INS', 'LEAVE', 'OUTS', 'POPA', 'POPAD', 'PUSHA',
            'PUSHAD'
        ]

        opcodes_80286 = [
            'ARPL', 'CLTS', 'LGDT', 'LIDT', 'LLDT', 'LMSW', 'LSL', 'LSS',
            'SGDT', 'SIDT', 'SLDT', 'SMSW', 'STR', 'VERR', 'VERW'
        ]

        registers_80286 = ['TR']

        opcodes_80386 = [
            'BSF', 'BSR', 'BT', 'BTC', 'BTR', 'BTS', 'CDQ', 'CWDE', 'LFS',
            'LGS', 'LSS', 'MOVSX', 'MOVZX', 'SETAE', 'SETB', 'SETC', 'SETNAE',
            'SETNB', 'SETNE', 'SETNZ', 'SETG', 'SETGE', 'SETL', 'SETLE',
            'SETNC', 'SETNG', 'SETNGE', 'SETNL', 'SETNLE', 'SETNO', 'SETNP',
            'SETNS', 'SETE', 'SETO', 'SETP', 'SETPE', 'SETPO', 'SETS', 'SETZ',
            'SHLD', 'SHRD'
        ]

        registers_80386 = [
            'EAX', 'EBX', 'ECX', 'EDX', 'ESI', 'EDI', 'EBP', 'ESP', 'FS', 'GS',
            'EFLAGS'
        ]

        opcodes_80486 = ['BSWAP', 'INVLPG']

        opcodes_pdp8 = [
            'AND', 'TAD', 'ISZ', 'DCA', 'JMS', 'JMP', 'CDF', 'CIF', 'RDF',
            'RIF', 'RIB', 'RMF', 'CLA', 'CLL', 'CMA', 'CML', 'IAC', 'RAR',
            'RAL', 'RTR', 'RTL', 'BSW', 'SMA', 'SZA', 'SNL', 'SPA', 'SNA',
            'SZL', 'OSR', 'HLT', 'MQA', 'MQL', 'SEL', 'LCD', 'XDR', 'STR',
            'SER', 'SDN', 'INTR', 'INIT', 'DILC', 'DICD', 'DISD', 'DILX',
            'DILY', 'DIXY', 'DILE', 'DIRE', 'RCSF', 'RCRA', 'RCRB', 'RCNO',
            'RCRC', 'RCNI', 'RCSD', 'RCSE', 'RCRD', 'RCSI', 'RCTF', 'RPE',
            'RSF', 'RRB', 'RFC', 'PCE', 'PSF', 'PCF', 'PPC', 'PLS', 'KCF',
            'KSF', 'KCC', 'KRS', 'KIE', 'KRB', 'TFL', 'TSF', 'TCF', 'TPC',
            'TSK', 'TLS'
        ]

        opcodes_pdp11 = [
            'CLR', 'CLRB', 'COM', 'COMB', 'INC', 'INCB', 'DEC', 'DECB', 'NEG',
            'NEGB', 'NOP', 'TST', 'TSTB', 'TSTSET', 'WRTLCK', 'ASR', 'ASRB',
            'ASL', 'ASLB', 'ROR', 'RORB', 'ROL', 'ROLB', 'SWAB', 'ADC', 'ADCB',
            'SBC', 'SBCB', 'SXT', 'MOV', 'MOVB', 'ADD', 'SUB', 'CMP', 'CMPB',
            'ASH', 'ASHC', 'MUL', 'DIV', 'BIT', 'BITB', 'BIC', 'BICB', 'BIS',
            'BISB', 'XOR', 'CLR', 'CLRB', 'BR', 'BNE', 'BPL', 'BEQ', 'BMI',
            'BVC', 'BVS', 'BCC', 'BCS', 'BGE', 'BLT', 'BGT', 'BLE', 'SOB',
            'BHI', 'BLOS', 'BHIS', 'BLO', 'JMP', 'JSR', 'RTS', 'MARK', 'EMT',
            'TRAP', 'BPT', 'IOT', 'CSM', 'RTI', 'RTT', 'HALT', 'WAIT', 'RESET',
            'MTPD', 'MTPI', 'MFPD', 'MTPS', 'MFPS', 'MFPT', 'CLC', 'CLV',
            'CLZ', 'CLN', 'CCC', 'SEC', 'SEV', 'SEZ', 'SEN', 'SCC', 'FADD',
            'FSUB', 'FMUL', 'FDIV', 'DIV', 'MUL'
        ]

        registers_pdp11 = ['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7']

        opcodes = []
        registers = []
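        # build the opcode and register vocabulary for the selected processor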

        if processor in ['1802']:
            opcodes += opcodes_1802
            registers += registers_1802

        if processor in ['6502']:
            opcodes += opcodes_6502
            registers += registers_6502

        if processor in ['6800']:
            opcodes += opcodes_6800
            registers += registers_6800

        if processor in ['68000']:
            opcodes += opcodes_68000
            registers += registers_68000

        if processor in ['8080']:
            opcodes += opcodes_8080
            registers += registers_8080

        if processor in ['z80']:
            opcodes += opcodes_z80
            registers += registers_z80

        if processor in ['8086', '80186', '80286', '80386', '80486']:
            opcodes += opcodes_8086
            registers += registers_8086

        if processor in ['80286', '80386', '80486']:
            opcodes += opcodes_80186
            opcodes += opcodes_80286
            registers += registers_80286

        if processor in ['80386', '80486']:
            opcodes += opcodes_80386
            registers += registers_80386

        if processor in ['80486']:
            opcodes += opcodes_80486

        if processor in ['pdp-8']:
            opcodes += opcodes_pdp8
            # registers += registers_pdp8

        if processor in ['pdp-11']:
            opcodes += opcodes_pdp11
            registers += registers_pdp11

        opcode_tb = CaseInsensitiveListTokenBuilder(opcodes, 'keyword', False)
        register_tb = CaseInsensitiveListTokenBuilder(registers, 'register',
                                                      True)

        values = ['*', '$', '.']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, integer_1_tb, integer_2_tb,
            prefixed_integer_tb, hex_integer_1_tb, hex_integer_2_tb,
            hex_integer_3_tb, hex_integer_4_tb, hash_quote_value_tb, values_tb,
            groupers_tb, register_tb, opcode_tb, directive_tb,
            title_directive_tb, title_directive_2_tb, subtitle_directive_tb,
            subtitle_directive_2_tb, subtitle_directive_3_tb,
            include_directive_tb, include_directive_2_tb, multiline_comment_tb,
            preprocessor_tb, identifier_tb, label_tb, string_tb, comment_tb,
            comment_2_tb, line_comment_star_tb, line_comment_hash_tb,
            known_operator_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        opcode_tokenbuilders = [
            opcode_tb, directive_tb, title_directive_tb, subtitle_directive_tb,
            include_directive_tb, preprocessor_tb, invalid_token_builder
        ]

        args_tokenbuilders = [
            integer_tb, integer_exponent_tb, hex_integer_1_tb,
            hex_integer_2_tb, hex_integer_3_tb, hex_integer_4_tb, values_tb,
            groupers_tb, known_operator_tb, register_tb, identifier_tb,
            label_tb, string_tb, comment_tb, line_comment_star_tb,
            line_comment_hash_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
        args_tokenizer = Tokenizer(args_tokenbuilders)
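        # two tokenization passes follow: a free-format pass here and, except for
        # PDP-8 and PDP-11, a space/column-format pass; whichever yields the higher
        # confidence product is kept at the end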

        # tokenize as free-format
        tokens_free = tokenizer.tokenize(code)
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid operator')
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid')
        tokens_free = Examiner.combine_identifier_colon(
            tokens_free, ['newline'], [], [])
        tokens_free = Tokenizer.combine_number_and_adjacent_identifier(
            tokens_free)
        tokens_free = Examiner.convert_values_to_operators(
            tokens_free, known_operators)
        self.tokens = tokens_free
        self.convert_asm_identifiers_to_labels()
        self.convert_asm_keywords_to_operators()
        self.convert_asm_keywords_to_identifiers()

        self.calc_statistics()
        statistics_free = self.statistics
        self.statistics = {}

        self.calc_confidences(operand_types, group_starts, group_mids,
                              group_ends, None)
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_free = self.confidences
        self.confidences = {}
        errors_free = self.errors
        self.errors = []

        if processor in ['pdp-8', 'pdp-11']:
            # do not try space-format, it never exists for these processors
            tokens_space = []
            statistics_space = {}
            confidences_space = {}
            errors_space = []
        else:
            # tokenize as space-format
            opcode_extras = '.&=,()+-*/'
            label_leads = '.&$@#'
            label_mids = '.&$#@_'
            label_ends = ':'
            comment_leads = '*;'
            line_comment_leads = ''
            use_line_id = False
            tokens_space, indents = Tokenizer.tokenize_asm_code(
                code, tab_size, opcode_tokenizer, opcode_extras,
                args_tokenizer, label_leads, label_mids, label_ends,
                comment_leads, line_comment_leads, use_line_id)
            tokens_space = Examiner.combine_adjacent_identical_tokens(
                tokens_space, 'invalid operator')
            tokens_space = Examiner.combine_adjacent_identical_tokens(
                tokens_space, 'invalid')
            tokens_space = Examiner.combine_identifier_colon(
                tokens_space, ['newline'], [], [])
            tokens_space = Tokenizer.combine_number_and_adjacent_identifier(
                tokens_space)
            tokens_space = Examiner.convert_values_to_operators(
                tokens_space, known_operators)
            self.tokens = tokens_space
            self.convert_asm_identifiers_to_labels()

            self.calc_statistics()
            statistics_space = self.statistics
            self.statistics = {}

            self.calc_confidences(operand_types, group_starts, group_mids,
                                  group_ends, indents)
            self.calc_line_length_confidence(code, self.max_expected_line)
            confidences_space = self.confidences
            self.confidences = {}
            errors_space = self.errors
            self.errors = []

        # compute confidence for free-format and spaced-format
        confidence_free = 1.0
        if len(confidences_free) == 0:
            confidence_free = 0.0
        else:
            for key in confidences_free:
                factor = confidences_free[key]
                confidence_free *= factor

        confidence_space = 1.0
        if len(confidences_space) == 0:
            confidence_space = 0.0
        else:
            for key in confidences_space:
                factor = confidences_space[key]
                confidence_space *= factor

        # select the better of free-format and spaced-format
        if confidence_space > confidence_free:
            self.tokens = tokens_space
            self.statistics = statistics_space
            self.confidences = confidences_space
            self.errors = errors_space
        else:
            self.tokens = tokens_free
            self.statistics = statistics_free
            self.confidences = confidences_free
            self.errors = errors_free
Example #15
    def __init__(self, code):
        super().__init__()
        ctrlz_char = '\x1a'  # Ctrl-Z (SUB); assumed value
        code = self.TrimCtrlZText(code, ctrlz_char)

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
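        # radix-suffixed constants: 'H' (hex), 'C' (octal), 'B' (binary)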
        hex_constant_tb = SuffixedIntegerTokenBuilder(
            'H', True, '0123456789ABCDEFabcdef')
        octal_constant_tb = SuffixedIntegerTokenBuilder('C', True, '01234567')
        binary_constant_tb = SuffixedIntegerTokenBuilder('B', True, '01')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ["'", '"']
        string_tb = StringTokenBuilder(quotes, 0)
        operand_types.append('string')

        paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

        known_operators = [
            ':=', '=', '>', '>=', '<', '<=', '#', '<>', '+', '-', '*', '/',
            'DIV', 'MOD', 'AND', 'OR', 'NOT', '^', '.', '..', 'IN', '&'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'NOT', '@', '^', '.']

        self.postfix_operators = ['^']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '|']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '|']
        group_ends = [')', ']', '}']

        groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'BEGIN', 'BY', 'CASE', 'CONST', 'DEFINITION', 'DO', 'ELSE',
            'ELSIF', 'END', 'EXCEPT', 'EXIT', 'EXPORT', 'FINALLY', 'FOR',
            'FROM', 'IF', 'IMPLEMENTATION', 'IMPORT', 'LOOP', 'MODULE', 'OF',
            'PROCEDURE', 'QUALIFIED', 'REPEAT', 'THEN', 'TO', 'TYPE', 'VAR',
            'WITH', 'WHILE'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'ARRAY', 'BOOLEAN', 'CARDINAL', 'CHAR', 'INTEGER', 'POINTER',
            'REAL', 'RECORD', 'SET'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['FALSE', 'NIL', 'TRUE']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, keyword_tb, types_tb,
            values_tb, known_operator_tb, groupers_tb, identifier_tb,
            string_tb, paren_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = tokens

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'identifier', 'variable']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(
            ['BEGIN', 'RECORD', 'CASE', 'DO', 'IF', 'WHILE'], ['END'])
        self.calc_paired_blockers_confidence(['REPEAT'], ['UNTIL'])
        self.calc_line_length_confidence(code, self.max_expected_line)
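Example #15 closes by checking paired blockers: BEGIN, RECORD, CASE, DO, IF, and WHILE must each eventually be closed by END, and REPEAT by UNTIL. The project's calc_paired_blockers_confidence is not shown above; the following is only a rough sketch of the kind of balance check it presumably performs, using a placeholder token shape with a text attribute:

def paired_blockers_balance(tokens, openers, closers):
    # crude sketch: the score drops as openers and closers fall out of balance;
    # the real calc_paired_blockers_confidence is presumably more careful
    num_open = sum(1 for tok in tokens if tok.text.upper() in openers)
    num_close = sum(1 for tok in tokens if tok.text.upper() in closers)
    if num_open == 0 and num_close == 0:
        return 1.0
    return min(num_open, num_close) / max(num_open, num_close)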
Example #16
0
  def __init__(self, code, block_comment_limit):
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['U', 'L', 'LU', 'UL'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l', 'i'], False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    hex_real_tb = HexRealExponentTokenBuilder()
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    operand_types.append('attribute')

    # string suffix: c,w,d
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    r_string_tb = PrefixedStringTokenBuilder('r', True, quotes)
    backtick_string_tb = EscapedStringTokenBuilder(['`'], 0)
    x_string_tb = PrefixedStringTokenBuilder('x', True, quotes)
    q_string_tb = PrefixedStringTokenBuilder('q', True, quotes)
    # q{} string
    cwd_string_tb = SuffixedStringTokenBuilder(quotes, 'cwd', False)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    slash_plus_comment_tb = NestedCommentTokenBuilder('/+', '+/', block_comment_limit)

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
      '/', '/=', '.', '..', '...', '&', '&=', '&&', '|', '|=', '||',
      '-', '-=', '--', '+', '+=', '++', '<', '<=', '<<', '<<=', '>', '>=',
      '>>=', '>>>=', '>>', '>>>', '!', '!=', '?', ',', ':', '$',
      '=', '==', '*', '*=', '%', '%=', '^', '^=', '^^', '^^=', '~', '~=',
      '@', '=>', '#',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.unary_operators = [
      '+', '-', '*',
      '!', '&', '~',
      '++', '--', ':',
      'new', 'delete',
      'typeof', 'is'
    ]

    self.postfix_operators = [
      '++', '--', '&', ':'
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
      'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
      'body', 'break', 'case', 'cast', 'catch', 'class', 'const', 'continue',
      'debug', 'default', 'delegate', 'deprecated', 'do',
      'else', 'enum', 'export', 'extern',
      'final', 'finally', 'for', 'foreach', 'foreach_reverse', 'function',
      'goto',
      'if', 'immutable', 'import', 'in', 'inout', 'interface', 'invariant',
      'lazy',
      'macro', 'mixin', 'module',
      'nothrow',
      'out', 'override',
      'package', 'pragma', 'private', 'protected', 'public', 'pure',
      'ref', 'return',
      'scope', 'shared', 'static', 'struct', 'switch', 'synchronized',
      'template', 'throw', 'try', 'typeid',
      'union', 'unittest', 'version', 'while', 'with',
      '__gshared', '__traits', '__vector', '__parameters'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
      'bool', 'byte', 'cdouble', 'cent', 'cfloat', 'char', 'creal',
      'dchar', 'double', 'float', 'idouble', 'ifloat', 'int', 'ireal',
      'long', 'real', 'short', 'ubyte', 'ucent', 'uint', 'ulong', 'ushort',
      'void', 'wchar'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = [
      'false', 'null', 'super', 'this', 'true',
      '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
      '__FUNCTION__', '__PRETTY_FUNCTION__',
      '__DATE__', '__EOF__', '__TIME__', '__TIMESTAMP__',
      '__VENDOR__', '__VERSION__'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      line_continuation_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      hex_integer_tb,
      binary_integer_tb,
      suffixed_integer_tb,
      real_tb,
      real_exponent_tb,
      suffixed_real_tb,
      hex_real_tb,
      keyword_tb,
      types_tb,
      values_tb,
      groupers_tb,
      known_operator_tb,
      identifier_tb,
      attribute_tb,
      class_type_tb,
      string_tb,
      r_string_tb,
      x_string_tb,
      backtick_string_tb,
      q_string_tb,
      cwd_string_tb,
      slash_slash_comment_tb,
      slash_star_comment_tb,
      slash_plus_comment_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    number_suffixes = ['f', 'F', 'i', 'I', 'u', 'U', 'l', 'L', 'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU']
    tokens = self.combine_tokens_and_adjacent_types(tokens, 'number', 'identifier', number_suffixes)

    string_suffixes = ['c', 'w', 'd']
    self.tokens = self.combine_tokens_and_adjacent_types(tokens, 'string', 'identifier', string_suffixes)

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
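Example #16 (D) lets the integer and real builders stop before a type suffix and then glues the pieces back together with combine_tokens_and_adjacent_types, so that 100 followed by the identifier UL becomes a single number token. That method's implementation is not shown above; the sketch below is only a guess at the behaviour implied by the call site:

def merge_adjacent_suffix(tokens, base_group, suffix_group, suffixes):
    # tokens are assumed to carry 'group' and 'text' attributes and to be mutable
    merged = []
    for token in tokens:
        if (merged
                and merged[-1].group == base_group
                and token.group == suffix_group
                and token.text in suffixes):
            merged[-1].text += token.text   # e.g. '100' + 'UL' -> '100UL'
        else:
            merged.append(token)
    return merged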
Example #17
0
    def __init__(self, code, tab_size, wide):
        super().__init__()

        self.operand_types = []

        self.whitespace_tb = WhitespaceTokenBuilder()
        self.newline_tb = NewlineTokenBuilder()

        self.integer_tb = IntegerTokenBuilder(None)
        self.integer_exponent_tb = IntegerExponentTokenBuilder(None)
        self.binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False,
                                                             None)
        self.real_tb = RealTokenBuilder(False, False, None)
        self.real_exponent_tb = RealExponentTokenBuilder(
            False, False, 'E', None)
        self.binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'],
                                                       False, None)
        self.operand_types.append('number')

        leads = '_'
        extras = '_'
        self.identifier_tb = IdentifierTokenBuilder(leads, extras)
        self.operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        self.string_tb = EscapedStringTokenBuilder(quotes, 0)
        self.operand_types.append('string')

        self.label_tb = PL1LabelTokenBuilder()
        self.operand_types.append('label')

        self.slash_star_comment_tb = SlashStarCommentTokenBuilder()

        self.jcl_tb = JCLTokenBuilder()

        directives = [
            '%ACTIVATE', '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY',
            '%DO', '%ELSE', '%END', '%FATAL', '%GOTO', '%IF', '%INCLUDE',
            '%LIST', '%NOLIST', '%PAGE', '%PROCEDURE', '%PROC', '%REPLACE',
            '%RETURN', '%THEN'
        ]

        self.line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        self.preprocessor_tb = CaseInsensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        self.title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True,
                                                    'preprocessor')
        self.subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True,
                                                       'preprocessor')
        self.error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True,
                                                    'preprocessor')
        self.warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True,
                                                   'preprocessor')
        self.inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True,
                                                     'preprocessor')
        self.terminators_tb = SingleCharacterTokenBuilder(
            ';', 'statement terminator', False)

        known_operators = [
            '+', '-', '*', '/', '**', '>', '<', '=', '>=', '<=', '¬>', '¬<',
            '¬=', '^>', '^<', '^=', '^', '~>', '~<', '~=', '~', '¬', '&', '&:',
            '|', '|:', '||', '!', '!:', '!!', ':'
        ]

        self.unary_operators = ['+', '-', '^', '~', '¬']

        self.postfix_operators = []

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        self.group_starts = ['(', '[', ',', '{']
        self.group_mids = [',']
        self.group_ends = [')', ']', '}']

        self.groupers_tb = CaseInsensitiveListTokenBuilder(
            groupers, 'group', False)

        self.known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'ALLOCATE', 'ALLOC', 'BEGIN', 'CALL', 'CLOSE', 'DECLARE', 'DCL',
            'DO', 'ELSE', 'END', 'FORMAT', 'FREE', 'GET', 'GOTO', 'GO TO',
            'IF', 'LEAVE', 'ON', 'OPEN', 'OTHERWISE', 'OTHER', 'PROCEDURE',
            'PROC', 'PUT', 'READ', 'RETURN', 'REVERT', 'REWRITE', 'SELECT',
            'SIGNAL', 'STOP', 'THEN', 'WHEN', 'WRITE'
        ]

        self.keyword_tb = CaseInsensitiveListTokenBuilder(
            keywords, 'keyword', False)

        attributes = [
            'ALIGNED', 'ANY', 'AREA', 'BASED', 'BUILTIN', 'CONDITION', 'COND',
            'CONTROLLED', 'CTL', 'DEFINED', 'DEF', 'DIRECT', 'ENTRY',
            'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT', 'FILE', 'GLOBALDEF',
            'GLOBALREF', 'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
            'KEYED', 'LABEL', 'LIKE', 'LIST', 'MEMBER', 'NONVARYING', 'NONVAR',
            'OPTIONAL', 'OPTIONS', 'OUTPUT', 'PARAMETER', 'PARM', 'PICTURE',
            'PIC', 'POSITION', 'POS', 'PRECISION', 'PREC', 'PRINT', 'READONLY',
            'RECORD', 'REFER', 'RETURNS', 'SEQUENTIAL', 'SEQL', 'STATIC',
            'STREAM', 'STRUCTURE', 'TRUNCATE', 'UNALIGNED', 'UNAL', 'UNION',
            'UPDATE', 'VARIABLE', 'VARYING', 'VAR'
        ]

        self.attributes_tb = CaseInsensitiveListTokenBuilder(
            attributes, 'attribute', False)

        functions = [
            'ABS', 'ACOS', 'ACTUALCOUNT', 'ADD', 'ADDR', 'ADDREL',
            'ALLOCATION', 'ALLOCN', 'ASIN', 'ATAN', 'ATAND', 'ATANH',
            'AUTOMATIC', 'AUTO', 'BINARY', 'BIN', 'BIT', 'BOOL', 'BYTE',
            'BYTESIZE', 'CEIL', 'CHARACTER', 'CHAR', 'COLLATE', 'COPY', 'COS',
            'COSD', 'COSH', 'DATE', 'DATETIME', 'DECIMAL', 'DEC', 'DECODE',
            'DESCRIPTOR', 'DESC', 'DIMENSION', 'DIM', 'DIVIDE', 'EMPTY',
            'ENCODE', 'ERROR', 'EVERY', 'EXP', 'FIXED', 'FLOAT', 'FLOOR',
            'HBOUND', 'HIGH', 'INDEX', 'INFORM', 'INT', 'LBOUND', 'LENGTH',
            'LINE', 'LINENO', 'LOG', 'LOG10', 'LOG2', 'LOW', 'LTRIM', 'MAX',
            'MAXLENGTH', 'MIN', 'MOD', 'MULTIPLY', 'NULL', 'OFFSET',
            'ONARGSLIST', 'ONCHAR', 'ONCODE', 'ONFILE', 'ONKEY', 'ONSOURCE',
            'PAGENO', 'POINTER', 'PTR', 'POSINT', 'PRESENT', 'PROD', 'RANK',
            'REFERENCE', 'REVERSE', 'ROUND', 'RTRIM', 'SEARCH', 'SIGN', 'SIN',
            'SIND', 'SINH', 'SIZE', 'SOME', 'SQRT', 'STRING', 'SUBSTR',
            'SUBTRACT', 'SUM', 'TAN', 'TAND', 'TANH', 'TIME', 'TRANSLATE',
            'TRIM', 'TRUNC', 'UNSPEC', 'VALID', 'VALUE', 'VAL', 'VARIANT',
            'VERIFY', 'WARN'
        ]

        self.function_tb = CaseInsensitiveListTokenBuilder(
            functions, 'function', True)

        format_items = [
            'A', 'B', 'B1', 'B2', 'B3', 'B4', 'COLUMN', 'COL', 'E', 'F', 'P',
            'R', 'TAB', 'X'
        ]

        self.format_item_tb = CaseSensitiveListTokenBuilder(
            format_items, 'format', True)
        self.operand_types.append('format')

        options = [
            'APPEND', 'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT',
            'BLOCK_IO', 'BLOCK_SIZE', 'BUCKET_SIZE', 'BY', 'CANCEL_CONTROL_O',
            'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS', 'CONTIGUOUS_BEST_TRY',
            'CREATION_DATE', 'CURRENT_POSITION', 'DEFAULT_FILE_NAME',
            'DEFERRED_WRITE', 'DELETE', 'EDIT', 'EXPIRATION_DATE',
            'EXTENSION_SIZE', 'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO',
            'FILE_SIZE', 'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
            'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
            'FIXED_LENGTH_RECORDS', 'FROM', 'GROUP_PROTECTION', 'IDENT',
            'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
            'INITIAL_FILL', 'INTO', 'KEY', 'KEYFROM', 'KEYTO', 'LINESIZE',
            'LOCK_ON_READ', 'LOCK_ON_WRITE', 'MAIN PROCEDURE',
            'MANUAL_UNLOCKING', 'MATCH_GREATER', 'MATCH_GREATER_EQUAL',
            'MATCH_NEXT', 'MATCH_NEXT_EQUAL', 'MAXIMUM_RECORD_NUMBER',
            'MAXIMUM_RECORD_SIZE', 'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
            'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
            'NO_ECHO', 'NO_FILTER', 'NO_SHARE', 'OWNER_GROUP', 'OWNER_ID',
            'OWNER_MEMBER', 'OWNER_PROTECTION', 'PAGE', 'PAGESIZE',
            'PRINTER_FORMAT', 'PROMPT', 'PURGE_TYPE_AHEAD', 'READ_AHEAD',
            'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID', 'RECORD_ID_ACCESS',
            'RECORD_ID_TO', 'RECURSIVE', 'REPEAT', 'RESCAN',
            'RETRIEVAL_POINTERS', 'REVISION_DATE', 'REWIND_ON_CLOSE',
            'REWIND_ON_OPEN', 'SCALARVARYING', 'SET READ', 'SHARED_READ',
            'SHARED_WRITE', 'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE',
            'SYSTEM', 'SYSTEM_PROTECTION', 'TEMPORARY', 'TIMEOUT_PERIOD',
            'TITLE', 'TO', 'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
            'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
            'WRITE_CHECK'
        ]

        self.options_tb = CaseInsensitiveListTokenBuilder(
            options, 'option', False)

        conditions = [
            'ANYCONDITION', 'CONVERSION', 'CONV', 'ENDFILE', 'ENDPAGE',
            'FINISH', 'FIXEDOVERFLOW', 'FOFL', 'OVERFLOW', 'OFL', 'STORAGE',
            'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG', 'UNDEFINEDFILE',
            'UNDF', 'VAXCONDITION', 'ZERODIVIDE', 'ZDIV'
        ]

        self.conditions_tb = CaseInsensitiveListTokenBuilder(
            conditions, 'condition', False)

        subroutines = [
            'DISPLAY', 'EXTEND', 'FLUSH', 'NEXT_VOLUME', 'RELEASE', 'RESIGNAL',
            'REWIND', 'SPACEBLOCK'
        ]

        self.subroutines_tb = CaseInsensitiveListTokenBuilder(
            subroutines, 'subroutine', False)

        types = [
            'FIXED', 'BINARY', 'FLOAT', 'DECIMAL', 'BIT', 'CHARACTER',
            'PICTURE'
        ]

        self.types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
        self.operand_types.append('type')

        values = ['SYSIN', 'SYSPRINT']

        self.values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        self.operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # tokenize as free-format
        tokenbuilders_free = [
            self.newline_tb, self.whitespace_tb, self.line_continuation_tb,
            self.terminators_tb, self.integer_tb, self.integer_exponent_tb,
            self.binary_integer_tb, self.real_tb, self.real_exponent_tb,
            self.binary_real_tb, self.keyword_tb, self.function_tb,
            self.attributes_tb, self.options_tb, self.conditions_tb,
            self.subroutines_tb, self.types_tb, self.values_tb,
            self.groupers_tb, self.known_operator_tb, self.identifier_tb,
            self.string_tb, self.label_tb, self.slash_star_comment_tb,
            self.preprocessor_tb, self.title_tb, self.subtitle_tb,
            self.error_tb, self.warn_tb, self.inform_tb, self.jcl_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer_free = Tokenizer(tokenbuilders_free)
        tokens_free = tokenizer_free.tokenize(code)
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid operator')
        tokens_free = Examiner.combine_adjacent_identical_tokens(
            tokens_free, 'invalid')
        self.tokens = tokens_free

        self.calc_statistics()
        statistics_free = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_free = self.confidences
        self.confidences = {}
        errors_free = self.errors
        self.errors = []

        # tokenize as fixed-format
        tokenbuilders_fixed = [
            self.newline_tb, self.whitespace_tb, self.line_continuation_tb,
            self.terminators_tb, self.integer_tb, self.integer_exponent_tb,
            self.binary_integer_tb, self.real_tb, self.real_exponent_tb,
            self.binary_real_tb, self.keyword_tb, self.function_tb,
            self.attributes_tb, self.options_tb, self.conditions_tb,
            self.subroutines_tb, self.types_tb, self.values_tb,
            self.groupers_tb, self.known_operator_tb, self.identifier_tb,
            self.string_tb, self.label_tb, self.slash_star_comment_tb,
            self.preprocessor_tb, self.title_tb, self.subtitle_tb,
            self.error_tb, self.warn_tb, self.inform_tb, self.jcl_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        comment_start_tb = PL1CommentStartTokenBuilder()
        comment_middle_tb = PL1CommentMiddleTokenBuilder()
        comment_end_tb = PL1CommentEndTokenBuilder()
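        # the three builders above presumably cover comments that are broken
        # across fixed-format lines; convert_broken_comments_to_comments()
        # below re-joins the pieces into ordinary comment tokens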

        type1_tokenbuilders = [comment_start_tb]
        tokenbuilders_fixed_1 = tokenbuilders_fixed + type1_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

        type2_tokenbuilders = [
            comment_start_tb, comment_middle_tb, comment_end_tb
        ]
        tokenbuilders_fixed_2 = tokenbuilders_fixed + type2_tokenbuilders + [
            invalid_token_builder
        ]
        tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

        tokens_fixed = self.tokenize_code(code, tab_size, tokenizer_fixed_1,
                                          tokenizer_fixed_2, wide)
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid operator')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'invalid')
        tokens_fixed = Examiner.combine_adjacent_identical_tokens(
            tokens_fixed, 'whitespace')
        tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)
        self.tokens = tokens_fixed

        self.calc_statistics()
        statistics_fixed = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators,
                                            self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_fixed = self.confidences
        self.confidences = {}
        errors_fixed = self.errors
        self.errors = []

        # compute confidence for free-format and fixed-format
        confidence_free = 1.0
        if len(confidences_free) == 0:
            confidence_free = 0.0
        else:
            for key in confidences_free:
                factor = confidences_free[key]
                confidence_free *= factor

        confidence_fixed = 1.0
        if len(confidences_fixed) == 0:
            confidence_fixed = 0.0
        else:
            for key in confidences_fixed:
                factor = confidences_fixed[key]
                confidence_fixed *= factor

        # select the better of free-format and fixed-format
        if confidence_fixed > confidence_free:
            self.tokens = tokens_fixed
            self.statistics = statistics_fixed
            self.confidences = confidences_fixed
            self.errors = errors_fixed
        else:
            self.tokens = tokens_free
            self.statistics = statistics_free
            self.confidences = confidences_free
            self.errors = errors_free
Example #18
0
    def __init__(self, code, year):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        suffixed_integer_tb = SuffixedIntegerTokenBuilder(
            ['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l'],
                                                    False, None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
        slash_star_comment_tb = SlashStarCommentTokenBuilder()

        directives = [
            '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
            '#elif', '#line', '#include', '#pragma'
        ]

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        c_preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')
        terminators_tb = SingleCharacterTokenBuilder(';',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '~', '<<', '>>', '^', '.', '++', '--', '->', '&&', '||',
            '?', '##'
        ]

        self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']

        self.postfix_operators = ['++', '--', '&', '*']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'auto', 'break', 'case', 'const', 'continue', 'default', 'do',
            'else', 'enum', 'extern', 'for', 'goto', 'if', 'inline',
            'register', 'return', 'signed', 'sizeof', 'static', 'struct',
            'switch', 'typedef', 'union', 'unsigned', 'volatile', 'while'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = ['char', 'double', 'float', 'int', 'long', 'short']

        types_89 = ['void']

        types_99 = ['bool', 'complex']

        if year in ['89', '99']:
            types += types_89

        if year in ['99']:
            types += types_99

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['NULL']

        values_89 = []

        values_99 = ['...', 'true', 'false']

        if year in ['89', '99']:
            values += values_89

        if year in ['99']:
            values += values_99

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb,
            whitespace_tb,
            line_continuation_tb,
            terminators_tb,
            integer_tb,
            integer_exponent_tb,
            hex_integer_tb,
            binary_integer_tb,
            suffixed_integer_tb,
            real_tb,
            real_exponent_tb,
            suffixed_real_tb,
            keyword_tb,
            types_tb,
            values_tb,
            groupers_tb,
            known_operator_tb,
            identifier_tb,
            class_type_tb,
            string_tb,
        ]

        if year in ['99']:
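            # '//' line comments entered the C standard with C99,
            # so the slash-slash builder is added only for that year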
            tokenbuilders += [
                slash_slash_comment_tb,
            ]

        tokenbuilders += [
            slash_star_comment_tb, c_preprocessor_tb, c_error_tb, c_warning_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['statement terminator', 'newline'], ['{'],
            ['whitespace', 'comment'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
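Example #18 takes the standard year as a constructor argument: '89' adds void to the base types, while '99' additionally recognizes bool, complex, the true/false values, and // line comments. A hedged usage sketch follows; CExaminer is a placeholder name for the class whose constructor is shown above, and only the confidences dictionary populated by the calc_* calls is assumed:

source = '''
// C99-style line comment
int main(void) {
    bool ok = true;
    return ok ? 0 : 1;
}
'''

for year in ['89', '99']:
    examiner = CExaminer(source, year)   # placeholder class name
    score = 1.0
    for factor in examiner.confidences.values():
        score *= factor
    print(year, score)   # the '99' pass should score at least as well here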