Exemple #1
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder listed below, in order.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builders = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         IdentifierTokenBuilder,
         EscapedStringTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         TripleQuoteStringTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         BlockTokenBuilder,
         BraceCommentTokenBuilder,
         SlashSlashCommentTokenBuilder,
         SlashStarCommentTokenBuilder,
         GenericNumberTokenBuilder,
     )
     for builder in builders:
         builder.__escape_z__()
     return 'Escape ?Z'
Exemple #2
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder listed below, in order.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builders = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         StringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         SuffixedIntegerTokenBuilder,
         BlockTokenBuilder,
     )
     for builder in builders:
         builder.__escape_z__()
     return 'Escape ?Z'
Exemple #3
0
 def __escape_z__():
   """Call ``__escape_z__`` on each token builder listed below, in order.

   Returns:
     The literal string ``'Escape ?Z'``.
   """
   builders = (
     InvalidTokenBuilder,
     WhitespaceTokenBuilder,
     NewlineTokenBuilder,
     StuffedQuoteStringTokenBuilder,
     IntegerTokenBuilder,
     IntegerExponentTokenBuilder,
     RealTokenBuilder,
     RealExponentTokenBuilder,
     CaseInsensitiveListTokenBuilder,
     CaseSensitiveListTokenBuilder,
     BlockTokenBuilder,
     HTMLIdentifierTokenBuilder,
     HTMLListTokenBuilder,
     HTMLAttributeTokenBuilder,
     HTMLUnicodeTokenBuilder,
   )
   for builder in builders:
     builder.__escape_z__()
   return 'Escape ?Z'
Exemple #4
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder listed below, in order.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builders = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         BlockTokenBuilder,
         HaskellClassTokenBuilder,
         HaskellIdentifierTokenBuilder,
     )
     for builder in builders:
         builder.__escape_z__()
     return 'Escape ?Z'
Exemple #5
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder listed below, in order.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builders = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         PrefixedIntegerTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         PrefixedIdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         BlockTokenBuilder,
         KeywordTokenBuilder,
         MatlabStringTokenBuilder,
     )
     for builder in builders:
         builder.__escape_z__()
     return 'Escape ?Z'
Exemple #6
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder listed below, in order.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builders = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         EscapedStringTokenBuilder,
         PrefixedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         IdentifierTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         BlockTokenBuilder,
         TripleQuoteStringTokenBuilder,
         SlashSlashCommentTokenBuilder,
         TripleSlashCommentTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         NullTokenBuilder,
         ClassTypeTokenBuilder,
     )
     for builder in builders:
         builder.__escape_z__()
     return 'Escape ?Z'
Exemple #7
0
 def __escape_z__():
     """Call ``__escape_z__`` on each token builder listed below, in order.

     Returns:
         The literal string ``'Escape ?Z'``.
     """
     builders = (
         InvalidTokenBuilder,
         WhitespaceTokenBuilder,
         NewlineTokenBuilder,
         StuffedQuoteStringTokenBuilder,
         PrefixedStringTokenBuilder,
         IntegerTokenBuilder,
         IntegerExponentTokenBuilder,
         RealTokenBuilder,
         RealExponentTokenBuilder,
         CaseInsensitiveListTokenBuilder,
         CaseSensitiveListTokenBuilder,
         SingleCharacterTokenBuilder,
         LeadToEndOfLineTokenBuilder,
         BlockTokenBuilder,
         CobolIdentifierTokenBuilder,
         PictureTokenBuilder,
         CRPictureTokenBuilder,
         CobolPreprocessorTokenBuilder,
         AsteriskCommentTokenBuilder,
     )
     for builder in builders:
         builder.__escape_z__()
     return 'Escape ?Z'
Exemple #8
0
    def __init__(self, code):
        """Tokenize ``code`` and run this examiner's confidence calculations.

        The token builders are assembled in a fixed order, the resulting
        tokens are stored on ``self.tokens``, and the ``calc_*`` methods
        are then invoked in sequence.

        Args:
            code: Source text to examine. It is first passed through
                ``self.TrimCtrlZText`` together with the Ctrl-Z character.
        """
        super().__init__()
        # Ctrl-Z (ASCII SUB, 0x1A) written as an explicit escape so the
        # character cannot be lost by editors or copy/paste; an empty string
        # here would hand TrimCtrlZText nothing to look for.
        # NOTE(review): TrimCtrlZText is defined elsewhere; presumably it
        # strips a trailing Ctrl-Z end-of-file marker -- confirm.
        ctrlz_char = '\x1a'
        code = self.TrimCtrlZText(code, ctrlz_char)

        # Token groups treated as operands by calc_operand_n_confidence.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        # Suffixed integer constants: H (hex digits), C (octal), B (binary).
        hex_constant_tb = SuffixedIntegerTokenBuilder(
            'H', True, '0123456789ABCDEFabcdef')
        octal_constant_tb = SuffixedIntegerTokenBuilder('C', True, '01234567')
        binary_constant_tb = SuffixedIntegerTokenBuilder('B', True, '01')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ["'", '"']
        string_tb = StringTokenBuilder(quotes, 0)
        operand_types.append('string')

        paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

        known_operators = [
            ':=', '=', '>', '>=', '<', '<=', '#', '<>', '+', '-', '*', '/',
            'DIV', 'MOD', 'AND', 'OR', 'NOT', '^', '.', '..', 'IN', '&'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'NOT', '@', '^', '.']

        self.postfix_operators = ['^']

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '|']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', ':', '|']
        group_ends = [')', ']', '}']

        groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'BEGIN', 'BY', 'CASE', 'CONST', 'DEFINITION', 'DO', 'ELSE',
            'ELSIF', 'END', 'EXCEPT', 'EXIT', 'EXPORT', 'FINALLY', 'FOR',
            'FROM', 'IF', 'IMPLEMENTATION', 'IMPORT', 'LOOP', 'MODULE', 'OF',
            'PROCEDURE', 'QUALIFIED', 'REPEAT', 'THEN', 'TO', 'TYPE', 'VAR',
            'WITH', 'WHILE'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'ARRAY', 'BOOLEAN', 'CARDINAL', 'CHAR', 'INTEGER', 'POINTER',
            'REAL', 'RECORD', 'SET'
        ]

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['FALSE', 'NIL', 'TRUE']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # The list order is kept exactly as written; the Tokenizer receives
        # the builders in this order.
        tokenbuilders = [
            newline_tb, whitespace_tb, stmt_separator_tb, integer_tb,
            integer_exponent_tb, real_tb, real_exponent_tb, hex_constant_tb,
            octal_constant_tb, binary_constant_tb, keyword_tb, types_tb,
            values_tb, known_operator_tb, groupers_tb, identifier_tb,
            string_tb, paren_star_comment_tb, self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        self.tokens = tokens

        self.calc_statistics()

        # From here on, `tokens` is the joined source-token view rather than
        # the raw tokenizer output stored on self.tokens.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        # Operator checks are skipped entirely when no operators were found.
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'identifier', 'variable']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(
            ['BEGIN', 'RECORD', 'CASE', 'DO', 'IF', 'WHILE'], ['END'])
        # NOTE(review): 'UNTIL' is used as a closer here but is not in the
        # `keywords` list above -- confirm this pairing can ever match.
        self.calc_paired_blockers_confidence(['REPEAT'], ['UNTIL'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemple #9
0
    def __init__(self, code):
        """Tokenize ``code`` with the Haskell token builders and score it.

        The token builders are assembled in a fixed order, the resulting
        tokens are stored on ``self.tokens``, and the enabled ``calc_*``
        confidence methods are then invoked in sequence.

        Args:
            code: Source text to examine.
        """
        super().__init__()

        # Only appended to here; the operand-confidence calls that would
        # consume it are currently commented out below.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        identifier_tb = HaskellIdentifierTokenBuilder()
        operand_types.append('identifier')

        class_tb = HaskellClassTokenBuilder()
        operand_types.append('class')

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        line_comment_tb = LeadToEndOfLineTokenBuilder('--', False, 'comment')
        block_comment_tb = BlockTokenBuilder('{-', '-}', 'comment')

        line_continuation_tb = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)

        groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
        group_starts = ['(', '[', ',', '{']
        group_ends = [')', ']', '}']
        group_mids = [',', ':']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        operators_tb = HaskellOperatorTokenBuilder('#$%&*+./<=>?@\\^|-~')

        known_operators = ["'", '..']

        known_operators_tb = CaseInsensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.postfix_operators = ['..', "'"]

        # Fixity declarations are 'infix', 'infixl' and 'infixr'; the list
        # previously contained 'infix1' (digit one) in place of 'infixl'.
        keywords = [
            'case', 'class', 'data', 'deriving', 'do', 'else', 'if', 'import',
            'in', 'infix', 'infixl', 'infixr', 'instance', 'let', 'module',
            'newtype', 'of', 'then', 'type', 'where'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

        values = ['True', 'False', 'Nothing', '_']

        value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # The list order is kept exactly as written; the Tokenizer receives
        # the builders in this order.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, real_tb, real_exponent_tb,
            keyword_tb, groupers_tb, operators_tb, known_operators_tb,
            identifier_tb, value_tb, class_tb, string_tb, line_comment_tb,
            block_comment_tb, self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        # tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
        # The return value is not used here.
        # NOTE(review): presumably this rewrites `tokens` in place -- confirm.
        HaskellExaminer.convert_keywords_to_identifiers(tokens)
        self.tokens = tokens
        # self.convert_identifiers_to_labels()

        self.calc_statistics()

        # From here on, `tokens` is the joined source-token view rather than
        # the raw tokenizer output stored on self.tokens.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        # self.calc_token_2_confidence(['*', ';'])

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        # Operator checks are skipped entirely when no operators were found.
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            # self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # operand_types_2 = ['number']
        # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # operand_types = ['number', 'string', 'symbol', 'identifier', 'variable']
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
  def __init__(self, code, year, extension, tab_size, wide):
    """Tokenize ``code`` with the Cobol token builders and compute confidences.

    Args:
      code: Source text; passed to ``self.tokenize_code`` and to the
        line-length confidence check.
      year: ``None`` or one of '68', '1968', '74', '1974', '85', '1985'.
        Selects which year-specific keyword/value lists are added to the
        base lists. ``None`` adds none of them.
      extension: Not referenced anywhere in this method.
      tab_size: Forwarded to ``self.tokenize_code``.
      wide: Forwarded to ``self.tokenize_code``; when truthy, the
        line-length confidence check is skipped.

    Raises:
      CodeStatException: If ``year`` is not ``None`` and is not one of the
        accepted spellings listed above.
    """
    super().__init__()
    self.max_expected_line = 80

    if year is not None and year not in ['68', '1968', '74', '1974', '85', '1985']:
      raise CodeStatException('Unknown year for language')

    # Only appended to in this method; the operand-confidence calls that
    # would consume it are commented out below.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, True, None)
    real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', None)
    operand_types.append('number')

    identifier_tb = CobolIdentifierTokenBuilder()
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = StuffedQuoteStringTokenBuilder(quotes, True)
    operand_types.append('string')

    picture_tb = PictureTokenBuilder()
    cr_picture_tb = CRPictureTokenBuilder()
    operand_types.append('picture')

    terminators_tb = SingleCharacterTokenBuilder('.', 'statement terminator', False)

    known_operators = [
      'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE',
      '+', '-', '*', '/', '**',
      '=', '<>', '>', '>=', '<', '<=',
      'AND', 'OR', 'NOT',
      ':'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    self.unary_operators = [
      '+', '-'
    ]

    groupers = ['(', ')', ',']
    group_starts = ['(']
    group_mids = [',']
    # group_ends = [')']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    # Base keyword list, used for every accepted value of `year`
    # (including None); year-specific lists are appended further down.
    keywords = [
      'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
      'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
      'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALTER', 'ALTERNATE', 'AND',
      'APPLY', 'ARE', 'AREA', 'AREAS', 'ASCENDING', 'ASSIGN', 'AT', 'AUTHOR',
      'BEFORE', 'BLOCK', 'BY',
      'CALL', 'CANCEL', 'CD', 'CF', 'CH', 'CHARACTER', 'CHARACTERS',
      'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'COLUMN', 'COMMA',
      'COMMUNICATION', 'COMP', 'COMPUTATIONAL', 'COMPUTE', 'CONFIGURATION',
      'CONTAINS', 'CONTROL', 'CONTROLS', 'COPY', 'CORR', 'CORRESPONDING',
      'COUNT', 'CURRENCY',
      'DATA', 'DATE', 'DATE-COMPILED', 'DATE-WRITTEN',
      'DE', 'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE', 'DEBUG-NAME',
      'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3',
      'DECIMAL-POINT', 'DECLARATIVES', 'DELIMITED', 'DELIMITER', 'DEPENDING',
      'DESCENDING', 'DESTINATION', 'DETAIL', 'DISABLE', 'DISPLAY',
      'DIVIDE', 'DIVISION', 'DOWN',
      'EGI', 'ELSE', 'EMI', 'ENABLE', 'END', 'ENTER', 'ENVIRONMENT', 'EQUAL',
      'ERROR', 'ESI', 'EVERY', 'EXIT', 'EXTEND',
      'FD', 'FILE', 'FILE-CONTROL', 'FILLER', 'FINAL', 'FIRST', 'FOOTING', 'FOR',
      'FROM',
      'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
      'HEADING', 'HIGH-VALUE', 'HIGH-VALUES',
      'I-O', 'I-O-CONTROL',
      'IDENTIFICATION', 'IF', 'IN', 'INDEX', 'INDEXED', 'INDICATE', 'INITIAL',
      'INITIATE', 'INPUT', 'INPUT-OUTPUT', 'INSTALLATION', 'INTO', 'INVALID',
      'IS',
      'JUST', 'JUSTIFIED',
      'KEY',
      'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH', 'LESS', 'LIMIT', 'LIMITS',
      'LINE', 'LINE-COUNTER', 'LINES', 'LINKAGE',
      'LOCK', 'LOW-VALUE', 'LOW-VALUES',
      'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES', 'MOVE', 'MULTIPLE',
      'MULTIPLY',
      'NEGATIVE', 'NEXT', 'NO', 'NOT', 'NUMBER', 'NUMERIC', 'NUMERIC-EDITED',
      'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED',
      'OPEN', 'OPTIONAL', 'OR', 'OUTPUT', 'OVERFLOW',
      'PAGE', 'PAGE-COUNTER', 'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE',
      'PLUS', 'POINTER', 'POSITION', 'POSITIVE', 'PROCEDURE', 'PROCEED',
      'PROGRAM', 'PROGRAM-ID',
      'QUEUE', 'QUOTE', 'QUOTES',
      'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS', 'REDEFINES',
      'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE', 'REMAINDER',
      'RENAMES', 'REPLACE', 'REPLACING', 'REPORT', 'REPORTING', 'REPORTS',
      'RERUN', 'RESERVE', 'RESET', 'RETURN', 'REVERSED', 'REWIND', 'REWRITE',
      'RF', 'RH', 'RIGHT', 'ROUNDED', 'RUN',
      'SAME', 'SD', 'SEARCH', 'SECTION', 'SECURITY', 'SEGMENT', 'SEGMENT-LIMIT',
      'SELECT', 'SEND', 'SENTENCE', 'SEQUENCE', 'SEQUENTIAL', 'SET', 'SIGN', 'SIZE',
      'SORT', 'SOURCE', 'SOURCE-COMPUTER', 'SPECIAL-NAMES', 'STANDARD', 'STATUS',
      'STOP', 'STRING','SUB-QUEUE-1', 'SUB-QUEUE-2', 'SUB-QUEUE-3', 'SUBTRACT',
      'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC', 'SYNCHRONIZED',
      'TABLE', 'TALLY', 'TAPE', 'TERMINAL', 'TERMINATE', 'TEST', 'TEXT', 'THAN',
      'THEN', 'THROUGH', 'THRU', 'TIME', 'TIMES', 'TITLE', 'TO', 'TYPE',
      'UNIT', 'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
      'VALUE', 'VALUES', 'VARYING',
      'WHEN',
      'WITH', 'WORDS', 'WORKING-STORAGE', 'WRITE'
    ]

    # Added only when year is '68' or '1968'.
    keywords_68_only = [
      'ACTUAL',
      'FILE-LIMITS',
      'NOMINAL',
      'PROCESSING',
      'NOTE',
      'REMARKS',
      'SEEK',
      'TODAY'
    ]

    # Added when year is any of '74', '1974', '85', '1985'.
    keywords_74 = [
      'ALSO',
      'BOTTOM',
      'CODE-SET', 'COLLATING', 'COMMON',
      'DAY', 'DELETE', 'DEBUGGING', 'DUPLICATES', 'DYNAMIC',
      'END-OF-PAGE', 'EOP', 'EXCEPTION',
      'INSPECT',
      'LINAGE', 'LINAGE-COUNTER',
      'NATIVE',
      'ORGANIZATION',
      'PACKED-DECIMAL', 'PADDING', 'PRINTING', 'PROCEDURES',
      'REFERENCES', 'REMOVAL',
      'SEPARATE', 'SORT-MERGE', 'STANDARD-1', 'STANDARD-2', 'START',
      'TALLYING', 'TOP', 'TRAILING'
    ]

    # Added (on top of keywords_74) when year is '85' or '1985'.
    keywords_85 = [
      'ALPHABET', 'ANY',
      'BINARY',
      'CONTENT', 'CONTINUE', 'CONVERTING',
      'DAY-OF-WEEK',
      'END-ADD', 'END-CALL', 'END-COMPUTE', 'END-DELETE', 'END-DIVIDE',
      'END-EVALUATE', 'END-IF', 'END-MULTIPLY', 'END-PERFORM', 'END-READ',
      'END-RECEIVE', 'END-RETURN', 'END-REWRITE', 'END-SEARCH', 'END-START',
      'END-STRING', 'END-SUBTRACT', 'END-UNSTRING', 'END-WRITE',
      'EVALUATE', 'EXTERNAL',
      'INITIALIZE',
      'ORDER', 'OTHER',
      'PURGE'
    ]

    if year in ['68', '1968']:
      keywords += keywords_68_only

    if year in ['74', '1974', '85', '1985']:
      keywords += keywords_74

    if year in ['85', '1985']:
      keywords += keywords_85

    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    # NOTE(review): 'NO' appears both here and in `keywords` above; which
    # builder wins depends on the Tokenizer, which is not visible here.
    values = [
      'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS',
      'NO', 'OFF', 'ON'
    ]

    values_85 = ['FALSE', 'TRUE']

    if year in ['85', '1985']:
      values += values_85

    value_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    exec_tb = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

    invalid_token_builder = InvalidTokenBuilder()

    # The builders are handed to the Tokenizer in exactly this order.
    tokenbuilders = [
      newline_tb,
      whitespace_tb,
      terminators_tb,
      integer_tb,
      integer_exponent_tb,
      real_tb,
      real_exponent_tb,
      picture_tb,
      cr_picture_tb,
      keyword_tb,
      known_operator_tb,
      groupers_tb,
      value_tb,
      identifier_tb,
      string_tb,
      exec_tb,
      self.unknown_operator_tb,
      invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)

    # Tokenization goes through self.tokenize_code (defined elsewhere)
    # rather than calling tokenizer.tokenize directly.
    tokens = self.tokenize_code(code, tab_size, tokenizer, wide)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'whitespace')

    # These run after self.tokens is set and before statistics are computed.
    self.convert_numbers_to_pictures()
    self.convert_numbers_to_levels()

    self.calc_statistics()

    # From here on, `tokens` is the joined source-token view.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    # Operator checks are skipped entirely when no operators were found.
    if num_operators > 0:
      self.calc_operator_confidence(num_operators)
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
      # self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
      self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # self.calc_operand_n_confidence(tokens, operand_types, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_picture_confidence()

    # Line length is only scored when `wide` is falsy.
    if not wide:
      self.calc_line_length_confidence(code, self.max_expected_line)

    # Stored directly under the 'expected_keywords' key of self.confidences.
    expected_keyword_confidence = self.check_expected_keywords()
    self.confidences['expected_keywords'] = expected_keyword_confidence
Exemple #11
0
    def __init__(self, code, comment):
        """Tokenize ``code`` with generic token builders and compute confidences.

        Args:
            code: Source text to examine.
            comment: Name of the comment style to recognize. The values
                'ada', 'hash', 'bang', 'cobol-inline', 'percent', 'basic',
                'c', 'cpp' and 'pascal' each select one or two comment
                token builders. 'cobol' and 'fortran' are matched but add
                none, and any other value likewise leaves the comment
                builder list empty.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        number_tb = GenericNumberTokenBuilder()
        operand_types.append('number')

        leads = ''
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        quotes = ['"', "'", "’", '`']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_string_tb = TripleQuoteStringTokenBuilder(quotes)
        operand_types.append('string')

        # Stays empty unless one of the branches below assigns builders.
        comment_tbs = []

        if comment == 'ada':
            comment_tbs = [LeadToEndOfLineTokenBuilder('--', True, 'comment')]
        if comment == 'hash':
            comment_tbs = [LeadToEndOfLineTokenBuilder('#', True, 'comment')]
        if comment == 'bang':
            comment_tbs = [LeadToEndOfLineTokenBuilder('!', True, 'comment')]
        if comment == 'cobol-inline':
            comment_tbs = [LeadToEndOfLineTokenBuilder('*>', True, 'comment')]
        if comment == 'percent':
            comment_tbs = [LeadToEndOfLineTokenBuilder('%', True, 'comment')]
        # 'cobol' and 'fortran' are recognized but intentionally add nothing.
        if comment == 'cobol':
            pass
        if comment == 'fortran':
            pass
        if comment == 'basic':
            comment_tbs = [
                LeadToEndOfLineTokenBuilder("REM", False, 'comment'),
                LeadToEndOfLineTokenBuilder("'", True, 'comment')
            ]
        if comment == 'c':
            comment_tbs = [SlashStarCommentTokenBuilder()]
        if comment == 'cpp':
            comment_tbs = [
                SlashSlashCommentTokenBuilder(),
                SlashStarCommentTokenBuilder()
            ]
        if comment == 'pascal':
            comment_tbs = [
                BraceCommentTokenBuilder(),
                BlockTokenBuilder('(*', '*)', 'comment')
            ]

        known_operators = [
            '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
            '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '!',
            '&', '|', '~', '<<', '>>', '^', '.', '..', '...', ':', '++', '--',
            '->', '&&', '||', '?', '##', '\\', '_', '@', '#', '$', '`', '```'
        ]

        groupers = ['(', ')', ',', '[', ']', '{', '}', ';']
        # group_starts = ['(', '[', ',', '{']
        group_mids = [',', ';']
        # group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders1 = [
            whitespace_tb, newline_tb, number_tb, groupers_tb,
            known_operator_tb, identifier_tb, string_tb, triple_string_tb
        ]

        tokenbuilders2 = [self.unknown_operator_tb, invalid_token_builder]

        # Comment builders sit between the regular builders and the
        # unknown-operator / invalid fallbacks.
        tokenbuilders = tokenbuilders1 + comment_tbs + tokenbuilders2

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
        tokens = Examiner.combine_identifier_colon(
            tokens, ['newline', 'statement separator'], ['{'],
            ['whitespace', 'comment', 'line description'])
        self.tokens = tokens
        self.convert_identifiers_to_labels()

        self.calc_statistics()

        # NOTE(review): with the two lines below disabled, the local
        # `tokens` passed to the later calc_* calls is still the list built
        # before convert_identifiers_to_labels(), not the joined
        # source-token view -- confirm this is intended.
        # tokens = self.source_tokens()
        # tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        # Only the basic operator confidence is computed; the pairwise
        # operator checks are commented out.
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            # allow_pairs = []
            # self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            # self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # self.calc_operand_n_confidence(tokens, operand_types, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        # self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemple #12
0
    def __init__(self, code, variant):
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(True, True, None)
        real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        class_type_tb = ClassTypeTokenBuilder()
        operand_types.append('class')

        quotes = ['"']
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
        prefixed_string_tb = PrefixedStringTokenBuilder('@', False, quotes)
        char_tb = FsharpCharTokenBuilder(["'", "’"])
        operand_types.append('string')

        slash_slash_comment_tb = NullTokenBuilder()
        parens_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')
        triple_slash_comment_tb = NullTokenBuilder()
        if variant in ['fsharp']:
            slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
            triple_slash_comment_tb = TripleSlashCommentTokenBuilder()

        directives = [
            '#if', '#else', '#elif', '#endif', '#define', '#undef', '#line',
            '#region', '#endregion', '#pragma'
        ]

        preprocessor_tb = CaseSensitiveListTokenBuilder(
            directives, 'preprocessor', False)
        c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True,
                                                   'preprocessor')
        c_error_tb = LeadToEndOfLineTokenBuilder('#error', True,
                                                 'preprocessor')

        known_operators = [
            'and', 'as', 'in', 'mod', 'not', 'of', 'or', 'when', '::', '+',
            '-', '*', '/', '+.', '-.', '*.', '/.', '=', "'", '->', '>', '<',
            '>=', '<=', '==', '^', '||', '.', '#'
        ]

        known_operators_fsharp = [
            'new', '!', '!=', '%', '%%', '%?', '&', '&&', '&&&', '(|', '|)',
            '*?', '**', '+?', '-?', '->', '..', '.. ..', '/?', ':', ':=', ':/',
            '<<', '<<<', '<-', '<>', '<>?', '<=?', '<|', '<||', '<|||', '<@',
            '@>', '<@@', '@@>', '=?', '==', '>?', '>>', '>>>', '>=?', '?',
            '|||', '^^^', '?>=', '?>', '?<=', '?<', '?=', '?<>', '?+', '?-',
            '?*', '?/', '>=?', '>?', '<=?', '<?', '=?', '<>?', '+?', '-?',
            '*?', '/?', '?>=?', '?>?', '?<=?', '?<?', '?=?', '?<>?', '?+?',
            '?-?', '?*?', '?/?', '@', '|>', '||>', '|||>', '~~', '~~~', '~-',
            '~+', ':>', ':?>', "'"
        ]

        if variant in ['fsharp']:
            known_operators += known_operators_fsharp

        self.unary_operators = ['new', 'not', "'", '-']

        self.postfix_operators = ["'"]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        groupers = [
            '(', ')', ',', '[', ']', '{', '}', 'begin', 'end', ';', '|'
        ]

        groupers_fsharp = ['[|', '|]', '[<', '>]', '^']

        if variant in ['fsharp']:
            groupers += groupers_fsharp

        # group_starts = ['(', '[', ',', '{', '[|', '[<']
        group_mids = [',', ';', '^', '|']
        group_ends = [')', ']', '}', '|]', '>]']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'assert', 'class', 'def', 'do', 'done', 'downto', 'else',
            'exception', 'failwith', 'for', 'fun', 'function', 'if', 'inherit',
            'lazy', 'let', 'match', 'method', 'module', 'object', 'open',
            'raise', 'rec', 'sig', 'then', 'to', 'try', 'type', 'val',
            'virtual', 'while', 'with'
        ]

        keywords_fsharp = [
            'abstract', 'break', 'default', 'delegate', 'downcast', 'elif',
            'extern', 'finally', 'fixed', 'global', 'inline', 'interface',
            'internal', 'let!', 'match!', 'member', 'mutable', 'namespace',
            'override', 'private', 'public', 'return', 'return!', 'upcast',
            'use', 'use!', 'yield', 'yield!'
        ]

        if variant in ['fsharp']:
            keywords += keywords_fsharp

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        types = [
            'bool', 'byte', 'char', 'double', 'float', 'int', 'list', 'long',
            'number', 'object', 'range', 'string', 'struct', 'union', 'unit',
            'void'
        ]

        types_fsharp = [
            'decimal', 'sbyte', 'short', 'uint', 'ulong', 'ushort', 'void'
        ]

        if variant in ['fsharp']:
            types += types_fsharp

        types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
        operand_types.append('type')

        values = ['base', 'false', 'null', 'true', '_']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, integer_tb, integer_exponent_tb,
            real_tb, real_exponent_tb, keyword_tb, types_tb, values_tb,
            known_operator_tb, groupers_tb, identifier_tb, class_type_tb,
            string_tb, triple_quote_string_tb, prefixed_string_tb, char_tb,
            triple_slash_comment_tb, slash_slash_comment_tb,
            parens_star_comment_tb, preprocessor_tb, c_error_tb, c_warning_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = ['number', 'string', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemple #13
0
    def __init__(self, code):
        """Tokenize `code` with the token set defined here and score it.

        The token set uses Pascal-style keywords, operators and comment
        delimiters.  The method builds one token builder per lexical
        category, tokenizes `code`, merges adjacent invalid tokens, runs
        `combine_identifier_colon` and the two identifier-to-label passes,
        and finally calls the statistics and confidence calculations
        provided by the base class.

        `code` is the source text to examine.
        """
        super().__init__()

        # Token groups treated as operands by the 4-wide operand check below.
        operand_kinds = ['number', 'identifier', 'string', 'type', 'value']

        ws_builder = WhitespaceTokenBuilder()
        nl_builder = NewlineTokenBuilder()
        separator_builder = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        # Plain numeric literals.
        int_builder = IntegerTokenBuilder(None)
        int_exp_builder = IntegerExponentTokenBuilder(None)
        real_builder = RealTokenBuilder(True, True, None)
        real_exp_builder = RealExponentTokenBuilder(True, True, 'E', None)

        # Prefixed integer literals: a prefix string plus the allowed digits.
        hex_builder = PrefixedIntegerTokenBuilder(
            '$', True, '0123456789ABCDEFabcdef')
        octal_builder = PrefixedIntegerTokenBuilder('&', True, '01234567')
        binary_builder = PrefixedIntegerTokenBuilder('%', True, '01')
        char_code_builder = PrefixedIntegerTokenBuilder(
            '#', True, '0123456789')

        ident_builder = IdentifierTokenBuilder('_', '_')

        string_builder = EscapedStringTokenBuilder(["'"], 0)

        brace_comment_builder = BraceCommentTokenBuilder()
        paren_comment_builder = BlockTokenBuilder('(*', '*)', 'comment')

        # NOTE(review): '~' appears twice in this list; kept as-is so the
        # builder receives exactly the same input as before.
        operator_words = [
            '+', '-', '*', '/', '=', '<>', '>', '>=', '<', '<=', 'and', 'or',
            'not', '&', '|', '~', '<<', '>>', ':=', '^', '~', '@', '.', ':',
            '..', 'div', 'mod', 'shl', 'shr', 'in'
        ]
        operator_builder = CaseInsensitiveListTokenBuilder(
            operator_words, 'operator', False)

        self.unary_operators = ['+', '-', 'not', '@', '^', '.']
        self.postfix_operators = ['^']

        group_starts = ['(', '[', ',']
        group_mids = [',']
        group_ends = [')', ']']
        group_builder = CaseSensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']'], 'group', False)

        keyword_builder = CaseInsensitiveListTokenBuilder(
            [
                'begin', 'break', 'case', 'const', 'do', 'downto', 'else',
                'end', 'for', 'forward', 'function', 'goto', 'if', 'label',
                'of', 'otherwise', 'packed', 'procedure', 'program', 'repeat',
                'reset', 'then', 'to', 'type', 'until', 'uses', 'value', 'var',
                'while', 'with'
            ],
            'keyword', False)

        type_builder = CaseInsensitiveListTokenBuilder(
            [
                'array', 'boolean', 'char', 'file', 'integer', 'real',
                'record', 'set', 'string'
            ],
            'type', True)

        value_builder = CaseInsensitiveListTokenBuilder(
            ['false', 'nil', 'true'], 'value', True)

        fallback_builder = InvalidTokenBuilder()

        # The builder order is preserved from the original list.
        lexer = Tokenizer([
            nl_builder, ws_builder, separator_builder, int_builder,
            int_exp_builder, real_builder, real_exp_builder, hex_builder,
            octal_builder, binary_builder, char_code_builder,
            keyword_builder, type_builder, value_builder, operator_builder,
            group_builder, ident_builder, string_builder,
            brace_comment_builder, paren_comment_builder,
            self.unknown_operator_tb, fallback_builder
        ])

        token_stream = lexer.tokenize(code)
        # Merge runs of invalid operators first, then runs of invalid tokens.
        for invalid_group in ('invalid operator', 'invalid'):
            token_stream = Examiner.combine_adjacent_identical_tokens(
                token_stream, invalid_group)

        self.tokens = self.combine_identifier_colon(
            token_stream, ['statement separator'], ['begin'],
            ['whitespace', 'comment', 'newline', 'line description'])
        self.convert_identifiers_to_labels()
        self.convert_identifiers_to_labels_2()

        self.calc_statistics()

        # The remaining checks work on the source tokens joined into lines.
        joined_tokens = Examiner.join_all_lines(self.source_tokens())

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            allowed_pairs = []
            self.calc_operator_confidence(operator_count)
            self.calc_operator_2_confidence(joined_tokens, operator_count,
                                            allowed_pairs)
            self.calc_operator_3_confidence(joined_tokens, operator_count,
                                            group_ends, allowed_pairs)
            self.calc_operator_4_confidence(joined_tokens, operator_count,
                                            group_starts, allowed_pairs)

        self.calc_group_confidence(joined_tokens, group_mids)

        self.calc_operand_n_confidence(
            joined_tokens, ['number', 'string', 'identifier', 'variable'], 2)
        self.calc_operand_n_confidence(joined_tokens, operand_kinds, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['begin', 'record', 'case'],
                                             ['end'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemple #14
0
    def __init__(self, code, year, extension):
        """Tokenize COBOL source text and compute the confidence measures.

        Parameters:
            code: the source text to examine.
            year: language standard selector; `None`, '2002' or '2014'.
                '2002' and '2014' both add the 2002 keywords and values,
                '2014' additionally adds the 2014 keywords.
            extension: vendor keyword set to add; 'acu', 'ibm' or 'gnu'
                (compared case-insensitively).  Any other value, including
                `None` or an empty string, adds no vendor keywords.

        Raises:
            CodeStatException: if `year` is neither `None` nor a known year.
        """
        super().__init__()

        if year is not None and year not in ['2002', '2014']:
            raise CodeStatException('Unknown year for language')

        # Normalise once.  A missing extension simply selects no vendor
        # keywords instead of failing on `None.lower()`.
        extension_lc = extension.lower() if extension else ''

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder(None)
        integer_exponent_tb = IntegerExponentTokenBuilder(None)
        real_tb = RealTokenBuilder(False, True, None)
        real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', None)

        identifier_tb = CobolIdentifierTokenBuilder()

        # Plain strings plus the N- and NX-prefixed string forms.
        quotes = ['"', "'", "’"]
        string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
        n_string_tb = PrefixedStringTokenBuilder('N', False, quotes)
        nx_string_tb = PrefixedStringTokenBuilder('NX', False, quotes)

        picture_tb = PictureTokenBuilder()
        cr_picture_tb = CRPictureTokenBuilder()

        inline_comment_tb = LeadToEndOfLineTokenBuilder('*>', True, 'comment')
        star_comment_tb = AsteriskCommentTokenBuilder()

        terminators_tb = SingleCharacterTokenBuilder('.',
                                                     'statement terminator',
                                                     False)

        known_operators = [
            'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE', '+', '-', '*', '/', '**',
            '=', '<>', '>', '>=', '<', '<=', 'AND', 'OR', 'NOT', 'B-AND',
            'B-NOT', 'B-OR', 'B-XOR', ':'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'NOT']

        groupers = ['(', ')', ',']
        group_starts = ['(']
        group_mids = [',']
        # group_ends = [')']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        # Base keyword set, always active.
        keywords = [
            'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
            'ALPHABET', 'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
            'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALSO', 'ALTER',
            'ALTERNATE', 'AND', 'ANY', 'APPLY', 'ARE', 'AREA', 'AREAS',
            'ASCENDING', 'ASSIGN', 'AT', 'AUTHOR', 'BEFORE', 'BEGINNING',
            'BELL', 'BINARY', 'BLOCK', 'BOTTOM', 'BY', 'BYTE-LENGTH', 'CALL',
            'CANCEL', 'CBL', 'CD', 'CF', 'CH', 'CHARACTER', 'CHARACTERS',
            'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'CODE-SET', 'COL',
            'COLLATING', 'COLS', 'COLUMN', 'COMMA', 'COMMON', 'COMMUNICATION',
            'COMP', 'COMPUTATIONAL', 'COMPUTE', 'CONFIGURATION', 'CONTAINS',
            'CONTENT', 'CONTINUE', 'CONTROL', 'CONTROLS', 'CONVERTING', 'COPY',
            'CORR', 'CORRESPONDING', 'COUNT', 'CURRENCY', 'DATA', 'DATE',
            'DATE-COMPILED', 'DATE-WRITTEN', 'DAY', 'DAY-OF-WEEK', 'DE',
            'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE', 'DEBUG-NAME',
            'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3', 'DECIMAL-POINT',
            'DECLARATIVES', 'DELETE', 'DELIMITED', 'DELIMITER', 'DEPENDING',
            'DESCENDING', 'DESTINATION', 'DISABLE', 'DIVIDE', 'DIVISION',
            'DOWN', 'DUPLICATES', 'DYNAMIC', 'EGI', 'ELSE', 'EMI', 'ENABLE',
            'END', 'END-ACCEPT', 'END-ADD', 'END-CALL', 'END-COMPUTE',
            'END-DELETE', 'END-DISPLAY', 'END-DIVIDE', 'END-EVALUATE',
            'END-EXEC', 'END-IF', 'END-MULTIPLY', 'END-OF-PAGE', 'END-PERFORM',
            'END-READ', 'END-RECEIVE', 'END-RETURN', 'END-REWRITE',
            'END-SEARCH', 'END-START', 'END-STRING', 'END-SUBTRACT',
            'END-UNSTRING', 'END-WRITE', 'ENTER', 'ENVIRONMENT', 'EOL', 'EOP',
            'EQUAL', 'ERROR', 'ESI', 'EVALUATE', 'EVERY', 'EXCEPTION', 'EXEC',
            'EXIT', 'EXTEND', 'EXTERNAL', 'FD', 'FILE', 'FILE-CONTROL',
            'FILLER', 'FINAL', 'FIRST', 'FOOTING', 'FOR', 'FROM', 'FULL',
            'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
            'HEADING', 'HIGH-VALUE', 'HIGH-VALUES', 'I-O', 'I-O-CONTROL',
            'IDENTIFICATION', 'IF', 'IN', 'INDEX', 'INDEXED', 'INDICATE',
            'INITIAL', 'INITIALIZE', 'INITIATE', 'INPUT', 'INPUT-OUTPUT',
            'INSPECT', 'INSTALLATION', 'INTO', 'INVALID', 'IS', 'JUST',
            'JUSTIFIED', 'KEY', 'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH',
            'LESS', 'LIMIT', 'LIMITS', 'LINAGE', 'LINAGE-COUNTER', 'LINE',
            'LINE-COUNTER', 'LINES', 'LINKAGE', 'LOCK', 'LOW-VALUE',
            'LOW-VALUES', 'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES',
            'MOVE', 'MULTIPLE', 'MULTIPLY', 'NATIVE', 'NEGATIVE', 'NEXT',
            'NOT', 'NUMBER', 'NUMBERS', 'NUMERIC', 'NUMERIC-EDITED',
            'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED', 'OPEN', 'OPTIONAL',
            'OR', 'ORDER', 'ORGANIZATION', 'OTHER', 'OUTPUT', 'OVERFLOW',
            'PACKED-DECIMAL', 'PADDING', 'PAGE', 'PAGE-COUNTER', 'PARAGRAPH',
            'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE', 'PLUS', 'POINTER',
            'POSITION', 'POSITIVE', 'PRINTING', 'PROCEDURE', 'PROCEDURES',
            'PROCEED', 'PROGRAM', 'PROGRAM-ID', 'PURGE', 'QUEUE', 'QUOTE',
            'QUOTES', 'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS',
            'REDEFINES', 'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE',
            'REMAINDER', 'REMOVAL', 'RENAMES', 'REPLACE', 'REPLACING',
            'REPORT', 'REPORTING', 'REPORTS', 'RERUN', 'RESERVE', 'RESET',
            'RESUME', 'RETRY', 'RETURN', 'REVERSED', 'REWIND', 'REWRITE', 'RF',
            'RH', 'RIGHT', 'ROUNDED', 'RUN', 'SAME', 'SD', 'SEARCH', 'SECTION',
            'SECURE', 'SECURITY', 'SEGMENT', 'SEGMENT-LIMIT', 'SELECT', 'SEND',
            'SENTENCE', 'SEPARATE', 'SEQUENCE', 'SEQUENTIAL', 'SET', 'SIGN',
            'SIZE', 'SORT', 'SORT-MERGE', 'SOURCE', 'SOURCE-COMPUTER',
            'SPECIAL-NAMES', 'STANDARD', 'STANDARD-1', 'STANDARD-2', 'START',
            'STATUS', 'STOP', 'STRING', 'SUB-QUEUE-1', 'SUB-QUEUE-2',
            'SUB-QUEUE-3', 'SUBTRACT', 'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC',
            'SYNCHRONIZED', 'TABLE', 'TALLY', 'TALLYING', 'TAPE', 'TERMINAL',
            'TERMINATE', 'TEST', 'TEXT', 'THAN', 'THEN', 'THROUGH', 'THRU',
            'TIME', 'TIMES', 'TITLE', 'TO', 'TOP', 'TRAILING', 'TYPE', 'UNIT',
            'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
            'VALUE', 'VALUES', 'VARYING', 'WHEN', 'WITH', 'WORDS',
            'WORKING-STORAGE', 'WRITE'
        ]

        # Added when year is '2002' or '2014'.
        keywords_2002 = [
            'ACTIVE-CLASS',
            'ALIGNED',
            'ALLOCATE',
            'ANYCASE',
            'ARITHMETIC',
            'AUTO',
            'AUTOMATIC',
            'BACKGROUND-COLOR',
            'BASED',
            'BASIS',
            'BINARY-CHAR',
            'BINARY-DOUBLE',
            'BINARY-LONG',
            'BINARY-SHORT',
            'BIT',
            'BLINK',
            'BOOLEAN',
            'CENTER',
            'CLASS',
            'CLASS-ID',
            'CLASSIFICATION',
            'COLUMNS',
            'COM-REG',
            'CONDITION',
            'CONSTANT',
            'CRT',
            'CURSOR',
            'CYCLE',
            'DATA-POINTER',
            'DBCS',
            'DEBUGGING',
            'DETAIL',
            'DISPLAY',
            'DISPLAY-1',
            'DISPLAY-OF',
            'EC',
            'EGCS',
            'EJECT',
            'END-INVOKE',
            'ENDING',
            'ENTRY-CONVENTION',
            'ENTRY-FIELD',
            'EO',
            'EOS',
            'ERASE',
            'EXCEPTION-OBJECT',
            'EXCLUSIVE',
            'EXPANDS',
            'EXTERN',
            'FACTORY',
            'FLOAT-EXTENDED',
            'FLOAT-LONG',
            'FLOAT-SHORT',
            'FOREGROUND-COLOR',
            'FOREVER',
            'FORMAT',
            'FREE',
            'FUNCTION',
            'FUNCTION-ID',
            'GET',
            'GROUP-USAGE',
            'HIGHLIGHT',
            'IGNORING',
            'IMPLEMENTS',
            'INHERITS',
            'INITIALIZED',
            'INSERT',
            'INTERFACE',
            'INTERFACE-ID',
            'INTRINSIC',
            'INVOKE',
            'KANJI',
            'LC_ALL',
            'LC_COLLATE',
            'LC_CTYPE',
            'LC_MESSAGES',
            'LC_MONEY',
            'LC_NUMERIC',
            'LC_TIME',
            'LOCAL-STORAGE',
            'LOCALE',
            'LOWLIGHT',
            'MANUAL',
            'METACLASS',
            'METHOD',
            'METHOD-ID',
            'MINUS',
            'MORE-LABELS',
            'NATIONAL',
            'NATIONAL-EDITED',
            'NATIONAL-OF',
            'NATIVE_BINARY',
            'NESTED',
            'NEW',
            'NONE',
            'NORMAL',
            'OBJECT',
            'OBJECT-REFERENCE',
            'ONLY',
            'OPTIONS',
            'OVERRIDE',
            'PHYSICAL',
            'PRESENT',
            'PREVIOUS',
            'PROCEDURE-POINTER',
            'PROCESSING',
            'PROGRAM-POINTER',
            'PROPERTY',
            'PROTOTYPE',
            'RAISE',
            'RAISING',
            'READY',
            'RECURSIVE',
            'REFERENCES',
            'RELATION',
            'RELOAD',
            'REPOSITORY',
            'REQUIRED',
            'RETURN-CODE',
            'RETURNING',
            'ROUNDING',
            'SCREEN',
            'SECONDS',
            'SERVICE',
            'SHARING',
            'SHIFT-IN',
            'SHIFT-OUT',
            'SIGNED',
            'SKIP1',
            'SKIP2',
            'SKIP3',
            'SORT-CONTROL',
            'SORT-CORE-SIZE',
            'SORT-FILE-SIZE',
            'SORT-MESSAGE',
            'SORT-MODE-SIZE',
            'SORT-RETURN',
            'SOURCES',
            'STATEMENT',
            'STEP',
            'STRONG',
            'SYMBOL',
            'SYSTEM-DEFAULT',
            'TRACE',
            'TYPEDEF',
            'UCS-4',
            'UNDERLINE',
            'UNIVERSAL',
            'UNLOCK',
            'UNSIGNED',
            'USER-DEFAULT',
            'UTF-16',
            'UTF-8',
            'VAL-STATUS',
            'VALID',
            'VALIDATE',
            'VALIDATE-STATUS',
            'WHEN-COMPILED',
            'WRITE-ONLY',
            'YYYYDDD',
            'YYYYMMDD',
        ]

        # Added only when year is '2014'.
        keywords_2014 = [
            'AWAY-FROM-ZERO', 'NEAREST-AWAY-FROM-ZERO', 'NEAREST-EVEN',
            'NEAREST-TOWARD-ZERO', 'TOWARD-GREATER', 'TOWARD-LESSER',
            'CAPACITY', 'FLOAT-BINARY-128', 'FLOAT-BINARY-32',
            'FLOAT-BINARY-64', 'FLOAT-DECIMAL-16', 'FLOAT-DECIMAL-34',
            'FLOAT-INFINITY', 'FLOAT-NOT-A-NUMBER', 'FUNCTION-POINTER',
            'INTERMEDIATE', 'PHYSICAL', 'PREFIXED', 'PROHIBITED', 'SHORT',
            'STANDARD-BINARY', 'STANDARD-DECIMAL', 'TRUNCATION'
        ]

        # Vendor keyword sets, selected by `extension`.
        keywords_ibm = ['ABSENT', 'ID', 'PASSWORD', 'UNBOUNDED']

        keywords_gnu = [
            'ARGUMENT-NUMBER', 'ARGUMENT-VALUE', 'ASCII', 'BINARY-C-LONG',
            'BINARY-SEQUENTIAL', 'CARD-PUNCH', 'CARD-READER', 'CASSETTE',
            'CHAIN', 'CHAINING', 'COLOR', 'COMMAND-LINE', 'COMMIT', 'COMP-1',
            'COMP-2', 'COMP-3', 'COMP-4', 'COMP-5', 'COMP-6', 'COMP-X',
            'COMPUTATIONAL-1', 'COMPUTATIONAL-2', 'COMPUTATIONAL-3',
            'COMPUTATIONAL-4', 'COMPUTATIONAL-5', 'COMPUTATIONAL-6',
            'COMPUTATIONAL-X', 'CONVERSION', 'CRT-UNDER', 'DISC', 'DISK',
            'EBCDIC', 'ECHO', 'END-CHAIN', 'ENTRY', 'ENVIRONMENT-NAME',
            'ENVIRONMENT-VALUE', 'ESCAPE', 'F', 'FILE-ID', 'FIXED',
            'FLOAT-DECIMAL-7', 'ID', 'IGNORE', 'KEPT', 'KEYBOARD',
            'LEFT-JUSTIFY', 'LEFTLINE', 'LINE-SEQUENTIAL', 'LOWER',
            'MAGNETIC-TAPE', 'NAME', 'NO-ECHO', 'NOTHING', 'OVERLINE', 'PRINT',
            'PRINTER', 'PRINTER-1', 'PROCEDURE-POINTER', 'PROCEDURES',
            'PROMPT', 'PROTECTED', 'RECORDING', 'REVERSE', 'RIGHT-JUSTIFY',
            'ROLLBACK', 'S', 'SCROLL', 'SIGNED-INT', 'SIGNED-LONG',
            'SIGNED-SHORT', 'SPACE-FILL', 'STATIC', 'STDCALL', 'SYSTEM-OFFSET',
            'TAB', 'TIME-OUT', 'TRAILING-SIGN', 'U', 'UNSIGNED-INT',
            'UNSIGNED-LONG', 'UNSIGNED-SHORT', 'UPDATE', 'UPPER', 'USER', 'V',
            'VARIABLE', 'WAIT', 'WRAP', 'ZERO-FILL'
        ]

        # 'USE-TAB' was previously spelled 'USE TAB' (with a space), unlike
        # its hyphenated neighbours 'USE-ALT' and 'USE-RETURN'.
        keywords_acu = [
            '3-D', 'ACTION', 'ACTIVE-X', 'ADJUSTABLE-COLUMNS', 'ALIGNMENT',
            'AUTO-DECIMAL', 'AUTO-SPIN', 'BACKGROUND-HIGH', 'BACKGROUND-LOW',
            'BACKGROUND-STANDARD', 'BAR', 'BITMAP', 'BITMAP-END',
            'BITMAP-HANDLE', 'BITMAP-NUMBER', 'BITMAP-START',
            'BITMAP-TRAILING', 'BITMAP-TRANSPARENT-COLOR', 'BITMAP-WIDTH',
            'BOX', 'BOXED', 'BUSY', 'BUTTONS', 'CALENDAR-FONT',
            'CANCEL-BUTTON', 'CELL', 'CELL-COLOR', 'CELL-DATA', 'CELL-FONT',
            'CELL-PROTECTION', 'CENTERED-HEADING', 'CENTURY-DATE', 'CHECK-BOX',
            'CLEAR-SELECTION', 'CLINE', 'CLINES', 'COLORS', 'COLUMN-COLOR',
            'COLUMN-DIVIDERS', 'COLUMN-FONT', 'COLUMN-HEADINGS',
            'COLUMN-PROTECTION', 'COMBO-BOX', 'COPY-SELECTION', 'CSIZE',
            'CURSOR-COL', 'CURSOR-COLOR', 'CURSOR-FRAME-WIDTH', 'CURSOR-ROW',
            'CURSOR-X', 'CURSOR-Y', 'CUSTOM-PRINT-TEMPLATE', 'DASHED',
            'DATA-COLUMNS', 'DATA-TYPES', 'DATE-ENTRY', 'DEFAULT-BUTTON',
            'DEFAULT-FONT', 'DESTROY', 'DISPLAY-COLUMNS', 'DISPLAY-FORMAT',
            'DOTDASH', 'DOTTED', 'DOUBLE', 'DRAG-COLOR', 'DROP-DOWN',
            'DROP-LIST', 'END-COLOR', 'END-MODIFY', 'ENGRAVED',
            'ENSURE-VISIBLE', 'ENTRY-FIELD', 'ENTRY-REASON', 'ESCAPE-BUTTON',
            'EVENT', 'EVENT-LIST', 'EXCEPTION-VALUE', 'EXPAND',
            'EXTERNAL-FORM', 'FILE-NAME', 'FILE-POS', 'FILL-COLOR',
            'FILL-COLOR-2', 'FILL-PERCENT', 'FINISH-REASON', 'FIXED-FONT',
            'FIXED-WIDTH', 'FLAT', 'FLAT-BUTTONS', 'FLOAT', 'FLOATING', 'FONT',
            'FRAME', 'FRAMED', 'FULL-HEIGHT', 'GRID', 'GO-BACK', 'GO-FORWARD',
            'GO-HOME', 'GO-SEARCH', 'GRAPHICAL', 'GRID', 'GROUP-VALUE',
            'HANDLE', 'HAS-CHILDREN', 'HEADING-COLOR', 'HEADING-DIVIDER-COLOR',
            'HEADING-FONT', 'HEAVY', 'HEIGHT-IN-CELLS', 'HIDDEN-DATA',
            'HIGH-COLOR', 'HOT-TRACK', 'HSCROLL', 'HSCROLL-POS', 'ICON',
            'IDENTIFIED', 'INDEPENDENT', 'INQUIRE', 'INSERTION-INDEX',
            'INSERTION-ROWS', 'ITEM', 'ITEM-TEXT', 'ITEM-TO-ADD',
            'ITEM-TO-DELETE', 'ITEM-TO-EMPTY', 'ITEM-VALUE', 'LABEL',
            'LABEL-OFFSET', 'LARGE-FONT', 'LARGE-OFFSET', 'LAST-ROW',
            'LAYOUT-DATA', 'LAYOUT-MANAGER', 'LEADING-SHIFT', 'LEFT-TEXT',
            'LINES-AT-ROOT', 'LIST-BOX', 'LM-RESIZE', 'LONG-DATE', 'LOW-COLOR',
            'LOWERED', 'MASS-UPDATE', 'MAX-LINES', 'MAX-PROGRESS', 'MAX-TEXT',
            'MAX-VAL', 'MEDIUM-FONT', 'MENU', 'MIN-VAL', 'MODIFY', 'MULTILINE',
            'NAVIGATE-URL', 'NEXT-ITEM', 'NO-AUTOSEL', 'NO-AUTO-DEFAULT',
            'NO-BOX', 'NO-DIVIDERS', 'NO-F4', 'NO-FOCUS', 'NO-GROUP-TAB',
            'NO-KEY-LETTER', 'NO-SEARCH', 'NO-UPDOWN', 'NOTAB', 'NOTIFY',
            'NOTIFY-CHANGE', 'NOTIFY-DBLCLICK', 'NOTIFY-SELCHANGE',
            'NUM-COL-HEADINGS', 'NUM-ROWS', 'OK-BUTTON', 'OVERLAP-LEFT',
            'OVERLAP-TOP', 'PAGE-SETUP', 'PAGED', 'PARENT', 'PERMANENT',
            'PIXEL', 'PLACEMENT', 'POP-UP', 'POSITION-SHIFT',
            'PRINT-NO-PROMPT', 'PRINT-PREVIEW', 'PRIORITY', 'PROGRESS',
            'PROPERTIES', 'PROPERTY', 'PUSH-BUTTON', 'QUERY-INDEX',
            'RADIO-BUTTON', 'RAISED', 'READ-ONLY', 'RECORD-DATA',
            'RECORD-TO-ADD', 'RECORD-TO-DELETE', 'REFRESH', 'REGION-COLOR',
            'RESET-GRID', 'RESET-LIST', 'RESET-TABS', 'RIGHT-ALIGN', 'RIMMED',
            'ROW-COLOR', 'ROW-COLOR-PATTERN', 'ROW-DIVIDERS', 'ROW-FONT',
            'ROW-HEADINGS', 'ROW-PROTECTION', 'SAVE-AS', 'SAVE-AS-NO-PROMPT',
            'SCROLL-BAR', 'SEARCH-OPTIONS', 'SEARCH-TEXT', 'SELECT-ALL',
            'SELECTION-INDEX', 'SELECTION-TEXT', 'SELF-ACT', 'SEPARATION',
            'SHADING', 'SHADOW', 'SHORT-DATE', 'SHOW-LINES', 'SHOW-NONE',
            'SHOW-SEL-ALWAYS', 'SMALL-FONT', 'SORT-ORDER', 'SPINNER', 'SQUARE',
            'START-X', 'START-Y', 'STATIC-LIST', 'STATUS-BAR', 'STATUS-TEXT',
            'STYLE', 'SUBWINDOW', 'TAB-TO-ADD', 'TAB-TO-DELETE', 'TEMPORARY',
            'TERMINATION-VALUE', 'THREAD', 'THREADS', 'THUMB-POSITION',
            'TILED-HEADINGS', 'TITLE', 'TITLE-POSITION', 'TRADITIONAL-FONT',
            'TRAILING-SHIFT', 'TRANSPARENT', 'TREE-VIEW', 'UNFRAMED',
            'UNSORTED', 'USE-ALT', 'USE-RETURN', 'USE-TAB', 'VALUE-FORMAT',
            'VARIANT', 'VERTICAL', 'VERY-HEAVY', 'VIRTUAL-WIDTH', 'VPADDING',
            'VSCROLL', 'VSCROLL-BAR', 'VSCROLL-POS', 'VTOP', 'WEB-BROWSER',
            'WIDTH', 'WIDTH-IN-CELLS', 'WINDOW', 'X', 'Y'
        ]

        if year in ['2002', '2014']:
            keywords += keywords_2002

        if year == '2014':
            keywords += keywords_2014

        if extension_lc == 'acu':
            keywords += keywords_acu

        if extension_lc == 'ibm':
            keywords += keywords_ibm

        if extension_lc == 'gnu':
            keywords += keywords_gnu

        keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword',
                                                     False)

        values = [
            'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS', 'FALSE',
            'NO', 'OFF', 'ON', 'TRUE'
        ]

        values_2002 = ['NULL', 'NULLS', 'SELF', 'SUPER']

        if year in ['2002', '2014']:
            values += values_2002

        value_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        cobol_preprocessor_tb = CobolPreprocessorTokenBuilder()

        exec_tb = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb,
            whitespace_tb,
            terminators_tb,
            integer_tb,
            integer_exponent_tb,
            real_tb,
            real_exponent_tb,
            picture_tb,
            cr_picture_tb,
            keyword_tb,
            star_comment_tb,  # before operator, to catch single star as comment
            known_operator_tb,
            groupers_tb,
            value_tb,
            identifier_tb,
            string_tb,
            n_string_tb,
            nx_string_tb,
            inline_comment_tb,
            cobol_preprocessor_tb,
            exec_tb,
            self.unknown_operator_tb,
            invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Merge runs of invalid operators first, then runs of invalid tokens.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.convert_numbers_to_pictures()
        self.convert_numbers_to_levels()

        self.calc_statistics()

        # The remaining checks work on the source tokens joined into lines.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            # self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        # self.calc_operand_n_confidence(tokens, operand_types, 2)
        # self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)

        self.calc_picture_confidence()
        expected_keyword_confidence = self.check_expected_keywords()
        self.confidences['expected_keywords'] = expected_keyword_confidence
Exemple #15
0
    def __init__(self, code, version):
        """Tokenize MATLAB/Octave source text and compute confidences.

        Parameters:
            code: the source text to examine.
            version: when equal to 'octave', the Octave-only operators,
                keywords and '#'-style comments are recognised in addition
                to the base set; any other value uses the base set only.
        """
        super().__init__()

        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()

        integer_tb = IntegerTokenBuilder("'")
        integer_exponent_tb = IntegerExponentTokenBuilder("'")
        hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                     '0123456789abcdefABCDEF')
        binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
        real_tb = RealTokenBuilder(False, False, "'")
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        # '!'-prefixed and '?'-prefixed identifiers get their own groups.
        command_tb = PrefixedIdentifierTokenBuilder('!', 'command', False)

        metaclass_tb = PrefixedIdentifierTokenBuilder('?', 'metaclass', False)

        quotes = ['"', "'", "’"]
        string_tb = MatlabStringTokenBuilder(quotes, False)
        operand_types.append('string')

        # The *_m_* builders are always active; the *_o_* builders are only
        # added to the tokenizer when version is 'octave'.
        line_comment_m_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')
        line_comment_o_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')
        block_comment_m_tb = BlockTokenBuilder('%{', '%}', 'comment')
        block_comment_o_tb = BlockTokenBuilder('#{', '#}', 'comment')

        line_continuation_tb = KeywordTokenBuilder('...', 'line continuation')

        known_operators = [
            '+', '-', '.*', '*', './', '/', '\\', '.^', '^', ".'", "'", '=',
            '==', '~=', '>', '>=', '<', '<=', '&', '|', '&&', '||', '~', '@',
            '.', '.?'
        ]

        operators_octave = [
            '++', '--', '+=', '-=', '*=', '/=', '^=', '!', '!=', '**'
        ]

        if version == 'octave':
            known_operators += operators_octave

        self.unary_operators = ['+', '-', '~', '@']

        self.postfix_operators = ["'"]

        groupers = ['(', ')', ',', '[', ']', '{', '}', ';', ':']
        group_starts = ['(', '[', ',', '{']
        # group_mids = [',', ';', ':']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        keywords = [
            'break', 'case', 'catch', 'classdef', 'continue', 'else', 'elseif',
            'end', 'for', 'function', 'global', 'if', 'otherwise', 'parfor',
            'persistent', 'return', 'spmd', 'switch', 'try', 'while'
        ]

        keywords_octave = ['endfor', 'endif', 'endwhile']

        if version == 'octave':
            keywords += keywords_octave

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        # The list is given to a case-sensitive builder, so each accepted
        # spelling is listed explicitly.  The previous list had 'Nan',
        # which is not a spelling the language defines.
        values = ['inf', 'Inf', 'nan', 'NaN']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, integer_tb,
            integer_exponent_tb, hex_integer_tb, binary_integer_tb, real_tb,
            real_exponent_tb, keyword_tb, values_tb, groupers_tb,
            known_operator_tb, identifier_tb, command_tb, metaclass_tb,
            string_tb, line_comment_m_tb, block_comment_m_tb
        ]

        tokenbuilders_2 = [line_comment_o_tb, block_comment_o_tb]

        if version == 'octave':
            tokenbuilders += tokenbuilders_2

        # The fallback builders always go last in the list.
        tokenbuilders_9 = [self.unknown_operator_tb, invalid_token_builder]

        tokenbuilders += tokenbuilders_9

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Merge runs of invalid operators first, then runs of invalid tokens.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        # The remaining checks work on the source tokens joined into lines.
        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        # self.calc_group_confidence(tokens, group_mids)

        # operand_types_2 = ['number']
        # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemple #16
0
  def __init__(self, code):
    """Tokenize ``code`` with the HTML token builders and score the result.

    The merged token list is stored on ``self.tokens``; the statistics and
    confidence values are produced by the ``calc_*`` methods called below.

    Args:
      code: source text handed to the tokenizer.
    """
    super().__init__()

    # Token types that count as operands.  Only the operand-confidence
    # checks would consume this list, and they are not called below.
    operand_types = ['number', 'string', 'attribute', 'character', 'identifier']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # numeric literals ('number' operands)
    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(False, False, None)
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)

    # quoted text ('string' operands); the typographic quote is accepted too
    str_builder = StuffedQuoteStringTokenBuilder(['"', "'", "’"], False)

    attr_builder = HTMLAttributeTokenBuilder()
    unicode_builder = HTMLUnicodeTokenBuilder()

    # tag delimiters are tokenized as groupers
    group_starts = ['<']
    group_ends = ['>', '/>']
    group_builder = CaseInsensitiveListTokenBuilder(
      ['<', '</', '>', '/>'], 'group', False
    )

    ident_builder = HTMLIdentifierTokenBuilder()

    operator_builder = CaseSensitiveListTokenBuilder(['='], 'operator', False)

    punctuation_marks = [
      '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '+',
      '`', '~', '[', ']', '{', '}', ';', ':', "'", '"',
      ',', '.', '?', '>', '/', '\\', '|',  # omit '<'
    ]
    punct_builder = CaseInsensitiveListTokenBuilder(
      punctuation_marks, 'punctuation', False
    )

    # element names recognized as keywords
    tag_names = [
      'a', 'abbr', 'address', 'area', 'article', 'aside', 'audio', 'b', 'base',
      'bdi', 'bdo', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption',
      'cite', 'code', 'col', 'colgroup', 'data', 'datalist', 'dd', 'del',
      'details', 'dfn', 'dialog', 'div', 'dl', 'dt', 'em', 'embed', 'fieldset',
      'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5',
      'h6', 'head', 'header', 'hgroup', 'hr', 'html', 'i', 'iframe', 'img',
      'input', 'ins', 'kbd', 'keygen', 'label', 'legend', 'li', 'link', 'main',
      'map', 'mark', 'math', 'menu', 'menuitem', 'meta', 'meter', 'nav',
      'noscript', 'object', 'ol', 'optgroup', 'option', 'output', 'p', 'param',
      'picture', 'pre', 'progress', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's',
      'samp', 'script', 'section', 'select', 'slot', 'small', 'source', 'span',
      'strong', 'style', 'sub', 'summary', 'sup', 'svg', 'table', 'tbody', 'td',
      'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr',
      'track', 'u', 'ul', 'var', 'video', 'wbr',
    ]
    tag_builder = HTMLListTokenBuilder(tag_names, 'keyword', False)

    # delimited regions: comments and <script> ... </script> sections
    comment_builder = BlockTokenBuilder('<!--', '-->', 'comment')
    script_builder = BlockTokenBuilder('<script', '</script>', 'script')

    fallback_builder = InvalidTokenBuilder()

    # The sequence of this list is kept exactly as it was; do not reorder.
    tokenbuilders = [
      nl_builder, ws_builder,
      int_builder, int_exp_builder, real_builder, real_exp_builder,
      str_builder, attr_builder,
      operator_builder, group_builder,
      tag_builder, ident_builder, punct_builder,
      comment_builder, script_builder,
      self.unknown_operator_tb, unicode_builder,
      fallback_builder,
    ]

    # Tokenize, then merge runs of 'invalid operator' and of 'invalid'
    # tokens before storing the result.
    raw_tokens = Tokenizer(tokenbuilders).tokenize(code)
    merged_operators = Examiner.combine_adjacent_identical_tokens(
      raw_tokens, 'invalid operator'
    )
    self.tokens = Examiner.combine_adjacent_identical_tokens(
      merged_operators, 'invalid'
    )

    self.calc_statistics()

    # The confidence checks below work on the joined source tokens.
    tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    # calc_token_2_confidence() is not applied here.

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
      self.calc_operator_confidence(operator_count)
      # one shared (empty) list is handed to all three checks
      allow_pairs = []
      self.calc_operator_2_confidence(tokens, operator_count, allow_pairs)
      self.calc_operator_3_confidence(
        tokens, operator_count, group_ends, allow_pairs
      )
      self.calc_operator_4_confidence(
        tokens, operator_count, group_starts, allow_pairs
      )

    # calc_group_confidence() and calc_operand_n_confidence() are not
    # applied here.

    self.calc_keyword_confidence()