def __escape_z__():
    """Invoke ``__escape_z__`` on each token builder class, then return a marker.

    The builder classes are visited in the order listed below and the
    constant string ``'Escape ?Z'`` is returned afterwards.
    NOTE(review): what the per-builder hook does is not visible from here.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        IdentifierTokenBuilder,
        EscapedStringTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        TripleQuoteStringTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        BlockTokenBuilder,
        BraceCommentTokenBuilder,
        SlashSlashCommentTokenBuilder,
        SlashStarCommentTokenBuilder,
        GenericNumberTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke ``__escape_z__`` on each token builder class, then return a marker.

    The builder classes are visited in the order listed below and the
    constant string ``'Escape ?Z'`` is returned afterwards.
    NOTE(review): what the per-builder hook does is not visible from here.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        SuffixedIntegerTokenBuilder,
        BlockTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke ``__escape_z__`` on each token builder class, then return a marker.

    The builder classes are visited in the order listed below and the
    constant string ``'Escape ?Z'`` is returned afterwards.
    NOTE(review): what the per-builder hook does is not visible from here.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StuffedQuoteStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        BlockTokenBuilder,
        HTMLIdentifierTokenBuilder,
        HTMLListTokenBuilder,
        HTMLAttributeTokenBuilder,
        HTMLUnicodeTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke ``__escape_z__`` on each token builder class, then return a marker.

    The builder classes are visited in the order listed below and the
    constant string ``'Escape ?Z'`` is returned afterwards.
    NOTE(review): what the per-builder hook does is not visible from here.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        BlockTokenBuilder,
        HaskellClassTokenBuilder,
        HaskellIdentifierTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke ``__escape_z__`` on each token builder class, then return a marker.

    The builder classes are visited in the order listed below and the
    constant string ``'Escape ?Z'`` is returned afterwards.
    NOTE(review): what the per-builder hook does is not visible from here.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        IdentifierTokenBuilder,
        PrefixedIdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        BlockTokenBuilder,
        KeywordTokenBuilder,
        MatlabStringTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke ``__escape_z__`` on each token builder class, then return a marker.

    The builder classes are visited in the order listed below and the
    constant string ``'Escape ?Z'`` is returned afterwards.
    NOTE(review): what the per-builder hook does is not visible from here.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        BlockTokenBuilder,
        TripleQuoteStringTokenBuilder,
        SlashSlashCommentTokenBuilder,
        TripleSlashCommentTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        NullTokenBuilder,
        ClassTypeTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke ``__escape_z__`` on each token builder class, then return a marker.

    The builder classes are visited in the order listed below and the
    constant string ``'Escape ?Z'`` is returned afterwards.
    NOTE(review): what the per-builder hook does is not visible from here.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StuffedQuoteStringTokenBuilder,
        PrefixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        BlockTokenBuilder,
        CobolIdentifierTokenBuilder,
        PictureTokenBuilder,
        CRPictureTokenBuilder,
        CobolPreprocessorTokenBuilder,
        AsteriskCommentTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __init__(self, code):
    """Tokenize Modula-2 source text and run the confidence checks on it.

    Args:
        code: Source text to examine.  It is passed through
            ``self.TrimCtrlZText`` before tokenizing.

    Side effects:
        Sets ``self.unary_operators``, ``self.postfix_operators`` and
        ``self.tokens``, then calls the ``calc_*`` methods in the order
        written below.
    """
    super().__init__()

    # Ctrl-Z (ASCII SUB, 0x1A) written as an escape so the character cannot
    # be lost by editors or text tooling.
    # NOTE(review): presumably TrimCtrlZText removes a CP/M-style
    # end-of-file marker -- confirm against its definition.
    ctrlz_char = '\x1a'
    code = self.TrimCtrlZText(code, ctrlz_char)

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    stmt_separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(True, True, None)
    real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
    # Suffix letters select the radix: H = hex, C = octal, B = binary digits.
    hex_constant_tb = SuffixedIntegerTokenBuilder(
        'H', True, '0123456789ABCDEFabcdef')
    octal_constant_tb = SuffixedIntegerTokenBuilder('C', True, '01234567')
    binary_constant_tb = SuffixedIntegerTokenBuilder('B', True, '01')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ["'", '"']
    string_tb = StringTokenBuilder(quotes, 0)
    operand_types.append('string')

    paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

    known_operators = [
        ':=', '=', '>', '>=', '<', '<=', '#', '<>',
        '+', '-', '*', '/', 'DIV', 'MOD',
        'AND', 'OR', 'NOT', '^', '.', '..', 'IN', '&'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', 'NOT', '@', '^', '.']
    self.postfix_operators = ['^']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '|']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ':', '|']
    group_ends = [')', ']', '}']

    groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

    # 'RETURN' and 'UNTIL' are Modula-2 reserved words; 'UNTIL' is also the
    # closing word of the REPEAT/UNTIL pair checked further down, so it has
    # to be present in the keyword list.
    keywords = [
        'BEGIN', 'BY', 'CASE', 'CONST', 'DEFINITION', 'DO', 'ELSE',
        'ELSIF', 'END', 'EXCEPT', 'EXIT', 'EXPORT', 'FINALLY', 'FOR',
        'FROM', 'IF', 'IMPLEMENTATION', 'IMPORT', 'LOOP', 'MODULE', 'OF',
        'PROCEDURE', 'QUALIFIED', 'REPEAT', 'RETURN', 'THEN', 'TO', 'TYPE',
        'UNTIL', 'VAR', 'WITH', 'WHILE'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'ARRAY', 'BOOLEAN', 'CARDINAL', 'CHAR', 'INTEGER', 'POINTER',
        'REAL', 'RECORD', 'SET'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['FALSE', 'NIL', 'TRUE']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        stmt_separator_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        hex_constant_tb,
        octal_constant_tb,
        binary_constant_tb,
        keyword_tb,
        types_tb,
        values_tb,
        known_operator_tb,
        groupers_tb,
        identifier_tb,
        string_tb,
        paren_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = tokens

    self.calc_statistics()

    # The remaining checks work on the result of source_tokens() passed
    # through join_all_lines(), not on self.tokens directly.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'identifier', 'variable']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(
        ['BEGIN', 'RECORD', 'CASE', 'DO', 'IF', 'WHILE'], ['END'])
    self.calc_paired_blockers_confidence(['REPEAT'], ['UNTIL'])

    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Haskell source text and run the confidence checks on it.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.

    Side effects:
        Sets ``self.postfix_operators`` and ``self.tokens``, then calls the
        ``calc_*`` methods in the order written below.

    NOTE: the second token check (``calc_token_2_confidence``), the
    operator-pair check (``calc_operator_2_confidence``) and the operand
    sequence checks (``calc_operand_n_confidence``) are not run for this
    language.
    """
    super().__init__()

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")

    identifier_tb = HaskellIdentifierTokenBuilder()
    class_tb = HaskellClassTokenBuilder()

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)

    line_comment_tb = LeadToEndOfLineTokenBuilder('--', False, 'comment')
    block_comment_tb = BlockTokenBuilder('{-', '-}', 'comment')

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '::']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    operators_tb = HaskellOperatorTokenBuilder('#$%&*+./<=>?@\\^|-~')

    known_operators = ["'", '..']

    known_operators_tb = CaseInsensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.postfix_operators = ['..', "'"]

    # The fixity declarations are 'infix', 'infixl' and 'infixr'; the middle
    # one ends in the letter L, not the digit one.
    keywords = [
        'case', 'class', 'data', 'deriving', 'do', 'else', 'if', 'import',
        'in', 'infix', 'infixl', 'infixr', 'instance', 'let', 'module',
        'newtype', 'of', 'then', 'type', 'where'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', True)

    values = ['True', 'False', 'Nothing', '_']

    value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        groupers_tb,
        operators_tb,
        known_operators_tb,
        identifier_tb,
        value_tb,
        class_tb,
        string_tb,
        line_comment_tb,
        block_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # The return value is not used; NOTE(review): presumably this rewrites
    # the token list in place -- confirm against the helper.
    HaskellExaminer.convert_keywords_to_identifiers(tokens)
    self.tokens = tokens

    self.calc_statistics()

    # The remaining checks work on the result of source_tokens() passed
    # through join_all_lines(), not on self.tokens directly.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])

    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, year, extension, tab_size, wide):
    """Tokenize fixed-format COBOL source and run the confidence checks.

    Args:
        code: Source text to examine.
        year: ``None`` or one of '68', '1968', '74', '1974', '85', '1985';
            selects which extra keyword and value lists are added.
        extension: Not used in this method.
        tab_size: Forwarded to ``self.tokenize_code``.
        wide: Forwarded to ``self.tokenize_code``; when true the
            line-length check is skipped.

    Raises:
        CodeStatException: If ``year`` is neither ``None`` nor a known year.
    """
    super().__init__()
    self.max_expected_line = 80

    years_68 = ('68', '1968')
    years_74 = ('74', '1974')
    years_85 = ('85', '1985')

    if year is not None and year not in years_68 + years_74 + years_85:
        raise CodeStatException('Unknown year for language')

    is_68 = year in years_68
    is_85 = year in years_85
    is_74_or_85 = year in years_74 + years_85

    # Only referenced by the operand checks, which are disabled for COBOL.
    operand_types = ['number', 'identifier', 'string', 'picture', 'value']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(False, True, None)
    real_exp_builder = RealExponentTokenBuilder(False, True, 'E', None)

    ident_builder = CobolIdentifierTokenBuilder()

    str_builder = StuffedQuoteStringTokenBuilder(['"', "'", "’"], True)

    pic_builder = PictureTokenBuilder()
    cr_pic_builder = CRPictureTokenBuilder()

    period_builder = SingleCharacterTokenBuilder(
        '.', 'statement terminator', False)

    operator_builder = CaseSensitiveListTokenBuilder(
        [
            'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE',
            '+', '-', '*', '/', '**',
            '=', '<>', '>', '>=', '<', '<=',
            'AND', 'OR', 'NOT',
            ':'
        ],
        'operator', False)

    self.unary_operators = ['+', '-']

    group_starts = ['(']
    group_mids = [',']

    group_builder = CaseInsensitiveListTokenBuilder(
        ['(', ')', ','], 'group', False)

    # Words common to every supported dialect.
    keywords = [
        'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
        'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
        'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALTER', 'ALTERNATE', 'AND',
        'APPLY', 'ARE', 'AREA', 'AREAS', 'ASCENDING', 'ASSIGN', 'AT',
        'AUTHOR',
        'BEFORE', 'BLOCK', 'BY',
        'CALL', 'CANCEL', 'CD', 'CF', 'CH', 'CHARACTER', 'CHARACTERS',
        'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'COLUMN', 'COMMA',
        'COMMUNICATION', 'COMP', 'COMPUTATIONAL', 'COMPUTE',
        'CONFIGURATION', 'CONTAINS', 'CONTROL', 'CONTROLS', 'COPY', 'CORR',
        'CORRESPONDING', 'COUNT', 'CURRENCY',
        'DATA', 'DATE', 'DATE-COMPILED', 'DATE-WRITTEN', 'DE',
        'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE', 'DEBUG-NAME',
        'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3', 'DECIMAL-POINT',
        'DECLARATIVES', 'DELIMITED', 'DELIMITER', 'DEPENDING', 'DESCENDING',
        'DESTINATION', 'DETAIL', 'DISABLE', 'DISPLAY', 'DIVIDE', 'DIVISION',
        'DOWN',
        'EGI', 'ELSE', 'EMI', 'ENABLE', 'END', 'ENTER', 'ENVIRONMENT',
        'EQUAL', 'ERROR', 'ESI', 'EVERY', 'EXIT', 'EXTEND',
        'FD', 'FILE', 'FILE-CONTROL', 'FILLER', 'FINAL', 'FIRST', 'FOOTING',
        'FOR', 'FROM',
        'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
        'HEADING', 'HIGH-VALUE', 'HIGH-VALUES',
        'I-O', 'I-O-CONTROL', 'IDENTIFICATION', 'IF', 'IN', 'INDEX',
        'INDEXED', 'INDICATE', 'INITIAL', 'INITIATE', 'INPUT',
        'INPUT-OUTPUT', 'INSTALLATION', 'INTO', 'INVALID', 'IS',
        'JUST', 'JUSTIFIED',
        'KEY',
        'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH', 'LESS', 'LIMIT',
        'LIMITS', 'LINE', 'LINE-COUNTER', 'LINES', 'LINKAGE', 'LOCK',
        'LOW-VALUE', 'LOW-VALUES',
        'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES', 'MOVE', 'MULTIPLE',
        'MULTIPLY',
        'NEGATIVE', 'NEXT', 'NO', 'NOT', 'NUMBER', 'NUMERIC',
        'NUMERIC-EDITED',
        'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED', 'OPEN', 'OPTIONAL',
        'OR', 'OUTPUT', 'OVERFLOW',
        'PAGE', 'PAGE-COUNTER', 'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE',
        'PLUS', 'POINTER', 'POSITION', 'POSITIVE', 'PROCEDURE', 'PROCEED',
        'PROGRAM', 'PROGRAM-ID',
        'QUEUE', 'QUOTE', 'QUOTES',
        'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS', 'REDEFINES',
        'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE', 'REMAINDER', 'RENAMES',
        'REPLACE', 'REPLACING', 'REPORT', 'REPORTING', 'REPORTS', 'RERUN',
        'RESERVE', 'RESET', 'RETURN', 'REVERSED', 'REWIND', 'REWRITE', 'RF',
        'RH', 'RIGHT', 'ROUNDED', 'RUN',
        'SAME', 'SD', 'SEARCH', 'SECTION', 'SECURITY', 'SEGMENT',
        'SEGMENT-LIMIT', 'SELECT', 'SEND', 'SENTENCE', 'SEQUENCE',
        'SEQUENTIAL', 'SET', 'SIGN', 'SIZE', 'SORT', 'SOURCE',
        'SOURCE-COMPUTER', 'SPECIAL-NAMES', 'STANDARD', 'STATUS', 'STOP',
        'STRING', 'SUB-QUEUE-1', 'SUB-QUEUE-2', 'SUB-QUEUE-3', 'SUBTRACT',
        'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC', 'SYNCHRONIZED',
        'TABLE', 'TALLY', 'TAPE', 'TERMINAL', 'TERMINATE', 'TEST', 'TEXT',
        'THAN', 'THEN', 'THROUGH', 'THRU', 'TIME', 'TIMES', 'TITLE', 'TO',
        'TYPE',
        'UNIT', 'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
        'VALUE', 'VALUES', 'VARYING',
        'WHEN', 'WITH', 'WORDS', 'WORKING-STORAGE', 'WRITE'
    ]

    # Year-specific additions are appended in the order 68, 74, 85.
    if is_68:
        keywords.extend([
            'ACTUAL', 'FILE-LIMITS', 'NOMINAL', 'PROCESSING', 'NOTE',
            'REMARKS', 'SEEK', 'TODAY'
        ])

    if is_74_or_85:
        keywords.extend([
            'ALSO', 'BOTTOM', 'CODE-SET', 'COLLATING', 'COMMON', 'DAY',
            'DELETE', 'DEBUGGING', 'DUPLICATES', 'DYNAMIC', 'END-OF-PAGE',
            'EOP', 'EXCEPTION', 'INSPECT', 'LINAGE', 'LINAGE-COUNTER',
            'NATIVE', 'ORGANIZATION', 'PACKED-DECIMAL', 'PADDING',
            'PRINTING', 'PROCEDURES', 'REFERENCES', 'REMOVAL', 'SEPARATE',
            'SORT-MERGE', 'STANDARD-1', 'STANDARD-2', 'START', 'TALLYING',
            'TOP', 'TRAILING'
        ])

    if is_85:
        keywords.extend([
            'ALPHABET', 'ANY', 'BINARY', 'CONTENT', 'CONTINUE',
            'CONVERTING', 'DAY-OF-WEEK', 'END-ADD', 'END-CALL',
            'END-COMPUTE', 'END-DELETE', 'END-DIVIDE', 'END-EVALUATE',
            'END-IF', 'END-MULTIPLY', 'END-PERFORM', 'END-READ',
            'END-RECEIVE', 'END-RETURN', 'END-REWRITE', 'END-SEARCH',
            'END-START', 'END-STRING', 'END-SUBTRACT', 'END-UNSTRING',
            'END-WRITE', 'EVALUATE', 'EXTERNAL', 'INITIALIZE', 'ORDER',
            'OTHER', 'PURGE'
        ])

    keyword_builder = CaseInsensitiveListTokenBuilder(
        keywords, 'keyword', False)

    values = [
        'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS',
        'NO', 'OFF', 'ON'
    ]

    if is_85:
        values.extend(['FALSE', 'TRUE'])

    value_builder = CaseInsensitiveListTokenBuilder(values, 'value', True)

    exec_builder = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

    invalid_builder = InvalidTokenBuilder()

    tokenizer = Tokenizer([
        nl_builder,
        ws_builder,
        period_builder,
        int_builder,
        int_exp_builder,
        real_builder,
        real_exp_builder,
        pic_builder,
        cr_pic_builder,
        keyword_builder,
        operator_builder,
        group_builder,
        value_builder,
        ident_builder,
        str_builder,
        exec_builder,
        self.unknown_operator_tb,
        invalid_builder
    ])

    raw_tokens = self.tokenize_code(code, tab_size, tokenizer, wide)
    # Merge runs of identical tokens, one token group at a time.
    for group in ('invalid operator', 'invalid', 'whitespace'):
        raw_tokens = Examiner.combine_adjacent_identical_tokens(
            raw_tokens, group)
    self.tokens = raw_tokens

    self.convert_numbers_to_pictures()
    self.convert_numbers_to_levels()

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allowed_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, allowed_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allowed_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_keyword_confidence()
    self.calc_picture_confidence()

    if not wide:
        self.calc_line_length_confidence(code, self.max_expected_line)

    self.confidences['expected_keywords'] = self.check_expected_keywords()
def __init__(self, code, comment):
    """Tokenize source of an unspecified language and run confidence checks.

    Args:
        code: Source text to examine.
        comment: Name of the comment style to recognise.  Values handled
            here are 'ada', 'hash', 'bang', 'cobol-inline', 'percent',
            'basic', 'c', 'cpp' and 'pascal'.  'cobol', 'fortran' and any
            other value add no comment token builders.

    Side effects:
        Sets ``self.tokens`` and calls the ``calc_*`` methods in the order
        written below.
    """
    super().__init__()

    operands = ['number', 'identifier', 'string']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    num_builder = GenericNumberTokenBuilder()

    ident_builder = IdentifierTokenBuilder('', '_')

    quote_chars = ['"', "'", "’", '`']
    str_builder = EscapedStringTokenBuilder(quote_chars, 0)
    triple_str_builder = TripleQuoteStringTokenBuilder(quote_chars)

    # Styles that are a single lead-to-end-of-line comment.
    single_lead_styles = (
        ('ada', '--'),
        ('hash', '#'),
        ('bang', '!'),
        ('cobol-inline', '*>'),
        ('percent', '%'),
    )
    comment_builders = [
        LeadToEndOfLineTokenBuilder(lead, True, 'comment')
        for style, lead in single_lead_styles
        if comment == style
    ]

    # Styles that need a different builder or more than one.
    if comment == 'basic':
        comment_builders = [
            LeadToEndOfLineTokenBuilder("REM", False, 'comment'),
            LeadToEndOfLineTokenBuilder("'", True, 'comment')
        ]
    elif comment == 'c':
        comment_builders = [SlashStarCommentTokenBuilder()]
    elif comment == 'cpp':
        comment_builders = [
            SlashSlashCommentTokenBuilder(),
            SlashStarCommentTokenBuilder()
        ]
    elif comment == 'pascal':
        comment_builders = [
            BraceCommentTokenBuilder(),
            BlockTokenBuilder('(*', '*)', 'comment')
        ]

    operator_texts = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>',
        '^',
        '.', '..', '...', ':',
        '++', '--', '->', '&&', '||',
        '?', '##',
        '\\', '_', '@', '#', '$', '`', '```'
    ]

    group_mids = [',', ';']

    group_builder = CaseInsensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']', '{', '}', ';'], 'group', False)

    operator_builder = CaseSensitiveListTokenBuilder(
        operator_texts, 'operator', False)

    invalid_builder = InvalidTokenBuilder()

    # Comment builders sit between the regular builders and the fallbacks.
    tokenizer = Tokenizer([
        ws_builder,
        nl_builder,
        num_builder,
        group_builder,
        operator_builder,
        ident_builder,
        str_builder,
        triple_str_builder,
        *comment_builders,
        self.unknown_operator_tb,
        invalid_builder
    ])

    found = tokenizer.tokenize(code)
    for group in ('invalid operator', 'invalid'):
        found = Examiner.combine_adjacent_identical_tokens(found, group)
    found = Examiner.combine_identifier_colon(
        found,
        ['newline', 'statement separator'],
        ['{'],
        ['whitespace', 'comment', 'line description'])
    self.tokens = found
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)

    # These checks receive the same list that was stored in self.tokens;
    # it is not passed through source_tokens() / join_all_lines() here.
    self.calc_group_confidence(found, group_mids)
    self.calc_operand_n_confidence(found, operands, 4)

    self.calc_paired_blockers_confidence(['{'], ['}'])

    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, variant):
    """Tokenize ML-family source text and run the confidence checks on it.

    Args:
        code: Source text to examine.
        variant: Dialect name.  When it is 'fsharp', additional comment
            builders, operators, groupers, keywords and types are used.

    Side effects:
        Sets ``self.unary_operators``, ``self.postfix_operators`` and
        ``self.tokens``, then calls the ``calc_*`` methods in the order
        written below.
    """
    super().__init__()

    fsharp = variant == 'fsharp'

    # Only referenced by the 4-operand check, which is disabled here.
    operand_types = [
        'number', 'identifier', 'class', 'string', 'type', 'value'
    ]

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(True, True, None)
    real_exp_builder = RealExponentTokenBuilder(True, True, 'E', None)

    ident_builder = IdentifierTokenBuilder('_', '_')

    class_builder = ClassTypeTokenBuilder()

    double_quote = ['"']
    str_builder = EscapedStringTokenBuilder(double_quote, 0)
    triple_str_builder = TripleQuoteStringTokenBuilder(double_quote)
    at_str_builder = PrefixedStringTokenBuilder('@', False, double_quote)
    char_builder = FsharpCharTokenBuilder(["'", "’"])

    # The slash comment forms are recognised for F# only; otherwise a
    # NullTokenBuilder takes their slot.
    if fsharp:
        slash2_builder = SlashSlashCommentTokenBuilder()
        slash3_builder = TripleSlashCommentTokenBuilder()
    else:
        slash2_builder = NullTokenBuilder()
        slash3_builder = NullTokenBuilder()
    paren_star_builder = BlockTokenBuilder('(*', '*)', 'comment')

    directive_builder = CaseSensitiveListTokenBuilder(
        [
            '#if', '#else', '#elif', '#endif',
            '#define', '#undef',
            '#line', '#region', '#endregion', '#pragma'
        ],
        'preprocessor', False)
    warning_builder = LeadToEndOfLineTokenBuilder(
        '#warning', True, 'preprocessor')
    error_builder = LeadToEndOfLineTokenBuilder(
        '#error', True, 'preprocessor')

    operators = [
        'and', 'as', 'in', 'mod', 'not', 'of', 'or', 'when',
        '::', '+', '-', '*', '/', '+.', '-.', '*.', '/.',
        '=', "'", '->', '>', '<', '>=', '<=', '==', '^',
        '||', '.', '#'
    ]

    if fsharp:
        operators.extend([
            'new', '!', '!=', '%', '%%', '%?', '&', '&&', '&&&',
            '(|', '|)', '*?', '**', '+?', '-?', '->', '..', '.. ..',
            '/?', ':', ':=', ':/',
            '<<', '<<<', '<-', '<>', '<>?', '<=?', '<|', '<||', '<|||',
            '<@', '@>', '<@@', '@@>',
            '=?', '==', '>?', '>>', '>>>', '>=?', '?', '|||', '^^^',
            '?>=', '?>', '?<=', '?<', '?=', '?<>', '?+', '?-', '?*', '?/',
            '>=?', '>?', '<=?', '<?', '=?', '<>?', '+?', '-?', '*?', '/?',
            '?>=?', '?>?', '?<=?', '?<?', '?=?', '?<>?',
            '?+?', '?-?', '?*?', '?/?',
            '@', '|>', '||>', '|||>', '~~', '~~~', '~-', '~+',
            ':>', ':?>', "'"
        ])

    self.unary_operators = ['new', 'not', "'", '-']
    self.postfix_operators = ["'"]

    operator_builder = CaseSensitiveListTokenBuilder(
        operators, 'operator', False)

    group_texts = [
        '(', ')', ',', '[', ']', '{', '}', 'begin', 'end', ';', '|'
    ]

    if fsharp:
        group_texts.extend(['[|', '|]', '[<', '>]', '^'])

    group_mids = [',', ';', '^', '|']
    group_ends = [')', ']', '}', '|]', '>]']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_texts, 'group', False)

    keywords = [
        'assert', 'class', 'def', 'do', 'done', 'downto', 'else',
        'exception', 'failwith', 'for', 'fun', 'function', 'if', 'inherit',
        'lazy', 'let', 'match', 'method', 'module', 'object', 'open',
        'raise', 'rec', 'sig', 'then', 'to', 'try', 'type',
        'val', 'virtual', 'while', 'with'
    ]

    if fsharp:
        keywords.extend([
            'abstract', 'break', 'default', 'delegate', 'downcast',
            'elif', 'extern', 'finally', 'fixed', 'global', 'inline',
            'interface', 'internal', 'let!', 'match!', 'member', 'mutable',
            'namespace', 'override', 'private', 'public',
            'return', 'return!', 'upcast', 'use', 'use!', 'yield', 'yield!'
        ])

    keyword_builder = CaseSensitiveListTokenBuilder(
        keywords, 'keyword', False)

    type_names = [
        'bool', 'byte', 'char', 'double', 'float', 'int', 'list', 'long',
        'number', 'object', 'range', 'string', 'struct', 'union', 'unit',
        'void'
    ]

    if fsharp:
        type_names.extend([
            'decimal', 'sbyte', 'short', 'uint', 'ulong', 'ushort', 'void'
        ])

    type_builder = CaseSensitiveListTokenBuilder(type_names, 'type', True)

    value_builder = CaseSensitiveListTokenBuilder(
        ['base', 'false', 'null', 'true', '_'], 'value', True)

    invalid_builder = InvalidTokenBuilder()

    tokenizer = Tokenizer([
        nl_builder,
        ws_builder,
        int_builder,
        int_exp_builder,
        real_builder,
        real_exp_builder,
        keyword_builder,
        type_builder,
        value_builder,
        operator_builder,
        group_builder,
        ident_builder,
        class_builder,
        str_builder,
        triple_str_builder,
        at_str_builder,
        char_builder,
        slash3_builder,
        slash2_builder,
        paren_star_builder,
        directive_builder,
        error_builder,
        warning_builder,
        self.unknown_operator_tb,
        invalid_builder
    ])

    found = tokenizer.tokenize(code)
    for group in ('invalid operator', 'invalid'):
        found = Examiner.combine_adjacent_identical_tokens(found, group)
    self.tokens = found

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allowed_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, allowed_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allowed_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(
        joined, ['number', 'string', 'symbol'], 2)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])

    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Pascal source text and run the confidence checks on it.

    Args:
        code: Source text handed to the tokenizer and to the line-length
            check.

    Side effects:
        Sets ``self.unary_operators``, ``self.postfix_operators`` and
        ``self.tokens``, then calls the ``calc_*`` methods in the order
        written below.
    """
    super().__init__()

    operands = ['number', 'identifier', 'string', 'type', 'value']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    semicolon_builder = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    real_builder = RealTokenBuilder(True, True, None)
    real_exp_builder = RealExponentTokenBuilder(True, True, 'E', None)
    # Prefix characters followed by the digits accepted after each one.
    hex_builder = PrefixedIntegerTokenBuilder(
        '$', True, '0123456789ABCDEFabcdef')
    octal_builder = PrefixedIntegerTokenBuilder('&', True, '01234567')
    binary_builder = PrefixedIntegerTokenBuilder('%', True, '01')
    char_code_builder = PrefixedIntegerTokenBuilder('#', True, '0123456789')

    ident_builder = IdentifierTokenBuilder('_', '_')

    str_builder = EscapedStringTokenBuilder(["'"], 0)

    brace_builder = BraceCommentTokenBuilder()
    paren_star_builder = BlockTokenBuilder('(*', '*)', 'comment')

    operator_builder = CaseInsensitiveListTokenBuilder(
        [
            '+', '-', '*', '/',
            '=', '<>', '>', '>=', '<', '<=',
            'and', 'or', 'not',
            '&', '|', '~', '<<', '>>',
            ':=', '^', '~', '@',
            '.', ':',
            '..',
            'div', 'mod', 'shl', 'shr', 'in'
        ],
        'operator', False)

    self.unary_operators = ['+', '-', 'not', '@', '^', '.']
    self.postfix_operators = ['^']

    group_starts = ['(', '[', ',']
    group_mids = [',']
    group_ends = [')', ']']

    group_builder = CaseSensitiveListTokenBuilder(
        ['(', ')', ',', '[', ']'], 'group', False)

    keyword_builder = CaseInsensitiveListTokenBuilder(
        [
            'begin', 'break', 'case', 'const', 'do', 'downto', 'else',
            'end', 'for', 'forward', 'function', 'goto', 'if', 'label',
            'of', 'otherwise', 'packed', 'procedure', 'program', 'repeat',
            'reset', 'then', 'to', 'type', 'until', 'uses', 'value', 'var',
            'while', 'with'
        ],
        'keyword', False)

    type_builder = CaseInsensitiveListTokenBuilder(
        [
            'array', 'boolean', 'char', 'file', 'integer', 'real',
            'record', 'set', 'string'
        ],
        'type', True)

    value_builder = CaseInsensitiveListTokenBuilder(
        ['false', 'nil', 'true'], 'value', True)

    invalid_builder = InvalidTokenBuilder()

    tokenizer = Tokenizer([
        nl_builder,
        ws_builder,
        semicolon_builder,
        int_builder,
        int_exp_builder,
        real_builder,
        real_exp_builder,
        hex_builder,
        octal_builder,
        binary_builder,
        char_code_builder,
        keyword_builder,
        type_builder,
        value_builder,
        operator_builder,
        group_builder,
        ident_builder,
        str_builder,
        brace_builder,
        paren_star_builder,
        self.unknown_operator_tb,
        invalid_builder
    ])

    found = tokenizer.tokenize(code)
    for group in ('invalid operator', 'invalid'):
        found = Examiner.combine_adjacent_identical_tokens(found, group)
    found = self.combine_identifier_colon(
        found,
        ['statement separator'],
        ['begin'],
        ['whitespace', 'comment', 'newline', 'line description'])
    self.tokens = found
    self.convert_identifiers_to_labels()
    self.convert_identifiers_to_labels_2()

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allowed_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, allowed_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allowed_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allowed_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(
        joined, ['number', 'string', 'identifier', 'variable'], 2)
    self.calc_operand_n_confidence(joined, operands, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(
        ['begin', 'record', 'case'], ['end'])

    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, year, extension):
    """Tokenize COBOL source and compute its language-confidence scores.

    Args:
        code: Source text to examine.
        year: Standard revision: ``None``, ``'2002'`` or ``'2014'``.
            ``'2002'`` adds the 2002 keywords and values; ``'2014'`` adds
            both the 2002 and the 2014 sets.
        extension: Vendor dialect, compared case-insensitively. ``'acu'``,
            ``'ibm'`` and ``'gnu'`` each add that vendor's keywords; any
            other value (including ``None`` or ``''``) adds nothing.

    Raises:
        CodeStatException: If ``year`` is not one of the accepted values.
    """
    super().__init__()

    if year is not None and year not in ['2002', '2014']:
        raise CodeStatException('Unknown year for language')

    # Normalise once; a missing extension means "no vendor dialect".
    dialect = (extension or '').lower()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, True, None)
    real_exponent_tb = RealExponentTokenBuilder(False, True, 'E', None)
    identifier_tb = CobolIdentifierTokenBuilder()

    quotes = ['"', "'", "’"]
    string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
    n_string_tb = PrefixedStringTokenBuilder('N', False, quotes)
    nx_string_tb = PrefixedStringTokenBuilder('NX', False, quotes)

    picture_tb = PictureTokenBuilder()
    cr_picture_tb = CRPictureTokenBuilder()

    inline_comment_tb = LeadToEndOfLineTokenBuilder('*>', True, 'comment')
    star_comment_tb = AsteriskCommentTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(
        '.', 'statement terminator', False)

    known_operators = [
        'ADD', 'SUBTRACT', 'MULTIPLY', 'DIVIDE',
        '+', '-', '*', '/', '**',
        '=', '<>', '>', '>=', '<', '<=',
        'AND', 'OR', 'NOT',
        'B-AND', 'B-NOT', 'B-OR', 'B-XOR',
        ':'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', 'NOT']

    groupers = ['(', ')', ',']
    group_starts = ['(']
    group_mids = [',']
    # group_ends = [')']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    keywords = [
        'ACCEPT', 'ACCESS', 'ADD', 'ADDRESS', 'ADVANCING', 'AFTER', 'ALL',
        'ALPHABET', 'ALPHABETIC', 'ALPHABETIC-LOWER', 'ALPHABETIC-UPPER',
        'ALPHANUMERIC', 'ALPHANUMERIC-EDITED', 'ALSO', 'ALTER', 'ALTERNATE',
        'AND', 'ANY', 'APPLY', 'ARE', 'AREA', 'AREAS', 'ASCENDING', 'ASSIGN',
        'AT', 'AUTHOR',
        'BEFORE', 'BEGINNING', 'BELL', 'BINARY', 'BLOCK', 'BOTTOM', 'BY',
        'BYTE-LENGTH',
        'CALL', 'CANCEL', 'CBL', 'CD', 'CF', 'CH', 'CHARACTER', 'CHARACTERS',
        'CLOCK-UNITS', 'CLOSE', 'COBOL', 'CODE', 'CODE-SET', 'COL',
        'COLLATING', 'COLS', 'COLUMN', 'COMMA', 'COMMON', 'COMMUNICATION',
        'COMP', 'COMPUTATIONAL', 'COMPUTE', 'CONFIGURATION', 'CONTAINS',
        'CONTENT', 'CONTINUE', 'CONTROL', 'CONTROLS', 'CONVERTING', 'COPY',
        'CORR', 'CORRESPONDING', 'COUNT', 'CURRENCY',
        'DATA', 'DATE', 'DATE-COMPILED', 'DATE-WRITTEN', 'DAY',
        'DAY-OF-WEEK', 'DE', 'DEBUG-CONTENTS', 'DEBUG-ITEM', 'DEBUG-LINE',
        'DEBUG-NAME', 'DEBUG-SUB-1', 'DEBUG-SUB-2', 'DEBUG-SUB-3',
        'DECIMAL-POINT', 'DECLARATIVES', 'DELETE', 'DELIMITED', 'DELIMITER',
        'DEPENDING', 'DESCENDING', 'DESTINATION', 'DISABLE', 'DIVIDE',
        'DIVISION', 'DOWN', 'DUPLICATES', 'DYNAMIC',
        'EGI', 'ELSE', 'EMI', 'ENABLE', 'END', 'END-ACCEPT', 'END-ADD',
        'END-CALL', 'END-COMPUTE', 'END-DELETE', 'END-DISPLAY', 'END-DIVIDE',
        'END-EVALUATE', 'END-EXEC', 'END-IF', 'END-MULTIPLY', 'END-OF-PAGE',
        'END-PERFORM', 'END-READ', 'END-RECEIVE', 'END-RETURN',
        'END-REWRITE', 'END-SEARCH', 'END-START', 'END-STRING',
        'END-SUBTRACT', 'END-UNSTRING', 'END-WRITE', 'ENTER', 'ENVIRONMENT',
        'EOL', 'EOP', 'EQUAL', 'ERROR', 'ESI', 'EVALUATE', 'EVERY',
        'EXCEPTION', 'EXEC', 'EXIT', 'EXTEND', 'EXTERNAL',
        'FD', 'FILE', 'FILE-CONTROL', 'FILLER', 'FINAL', 'FIRST', 'FOOTING',
        'FOR', 'FROM', 'FULL',
        'GENERATE', 'GIVING', 'GLOBAL', 'GO', 'GOBACK', 'GREATER', 'GROUP',
        'HEADING', 'HIGH-VALUE', 'HIGH-VALUES',
        'I-O', 'I-O-CONTROL', 'IDENTIFICATION', 'IF', 'IN', 'INDEX',
        'INDEXED', 'INDICATE', 'INITIAL', 'INITIALIZE', 'INITIATE', 'INPUT',
        'INPUT-OUTPUT', 'INSPECT', 'INSTALLATION', 'INTO', 'INVALID', 'IS',
        'JUST', 'JUSTIFIED',
        'KEY',
        'LABEL', 'LAST', 'LEADING', 'LEFT', 'LENGTH', 'LESS', 'LIMIT',
        'LIMITS', 'LINAGE', 'LINAGE-COUNTER', 'LINE', 'LINE-COUNTER',
        'LINES', 'LINKAGE', 'LOCK', 'LOW-VALUE', 'LOW-VALUES',
        'MEMORY', 'MERGE', 'MESSAGE', 'MODE', 'MODULES', 'MOVE', 'MULTIPLE',
        'MULTIPLY',
        'NATIVE', 'NEGATIVE', 'NEXT', 'NOT', 'NUMBER', 'NUMBERS', 'NUMERIC',
        'NUMERIC-EDITED',
        'OBJECT-COMPUTER', 'OCCURS', 'OF', 'OMITTED', 'OPEN', 'OPTIONAL',
        'OR', 'ORDER', 'ORGANIZATION', 'OTHER', 'OUTPUT', 'OVERFLOW',
        'PACKED-DECIMAL', 'PADDING', 'PAGE', 'PAGE-COUNTER', 'PARAGRAPH',
        'PERFORM', 'PF', 'PH', 'PIC', 'PICTURE', 'PLUS', 'POINTER',
        'POSITION', 'POSITIVE', 'PRINTING', 'PROCEDURE', 'PROCEDURES',
        'PROCEED', 'PROGRAM', 'PROGRAM-ID', 'PURGE',
        'QUEUE', 'QUOTE', 'QUOTES',
        'RANDOM', 'RD', 'READ', 'RECEIVE', 'RECORD', 'RECORDS', 'REDEFINES',
        'REEL', 'REFERENCE', 'RELATIVE', 'RELEASE', 'REMAINDER', 'REMOVAL',
        'RENAMES', 'REPLACE', 'REPLACING', 'REPORT', 'REPORTING', 'REPORTS',
        'RERUN', 'RESERVE', 'RESET', 'RESUME', 'RETRY', 'RETURN', 'REVERSED',
        'REWIND', 'REWRITE', 'RF', 'RH', 'RIGHT', 'ROUNDED', 'RUN',
        'SAME', 'SD', 'SEARCH', 'SECTION', 'SECURE', 'SECURITY', 'SEGMENT',
        'SEGMENT-LIMIT', 'SELECT', 'SEND', 'SENTENCE', 'SEPARATE',
        'SEQUENCE', 'SEQUENTIAL', 'SET', 'SIGN', 'SIZE', 'SORT',
        'SORT-MERGE', 'SOURCE', 'SOURCE-COMPUTER', 'SPECIAL-NAMES',
        'STANDARD', 'STANDARD-1', 'STANDARD-2', 'START', 'STATUS', 'STOP',
        'STRING', 'SUB-QUEUE-1', 'SUB-QUEUE-2', 'SUB-QUEUE-3', 'SUBTRACT',
        'SUM', 'SUPPRESS', 'SYMBOLIC', 'SYNC', 'SYNCHRONIZED',
        'TABLE', 'TALLY', 'TALLYING', 'TAPE', 'TERMINAL', 'TERMINATE',
        'TEST', 'TEXT', 'THAN', 'THEN', 'THROUGH', 'THRU', 'TIME', 'TIMES',
        'TITLE', 'TO', 'TOP', 'TRAILING', 'TYPE',
        'UNIT', 'UNSTRING', 'UNTIL', 'UP', 'UPON', 'USAGE', 'USE', 'USING',
        'VALUE', 'VALUES', 'VARYING',
        'WHEN', 'WITH', 'WORDS', 'WORKING-STORAGE', 'WRITE'
    ]

    keywords_2002 = [
        'ACTIVE-CLASS', 'ALIGNED', 'ALLOCATE', 'ANYCASE', 'ARITHMETIC',
        'AUTO', 'AUTOMATIC',
        'BACKGROUND-COLOR', 'BASED', 'BASIS', 'BINARY-CHAR', 'BINARY-DOUBLE',
        'BINARY-LONG', 'BINARY-SHORT', 'BIT', 'BLINK', 'BOOLEAN',
        'CENTER', 'CLASS', 'CLASS-ID', 'CLASSIFICATION', 'COLUMNS',
        'COM-REG', 'CONDITION', 'CONSTANT', 'CRT', 'CURSOR', 'CYCLE',
        'DATA-POINTER', 'DBCS', 'DEBUGGING', 'DETAIL', 'DISPLAY',
        'DISPLAY-1', 'DISPLAY-OF',
        'EC', 'EGCS', 'EJECT', 'END-INVOKE', 'ENDING', 'ENTRY-CONVENTION',
        'ENTRY-FIELD', 'EO', 'EOS', 'ERASE', 'EXCEPTION-OBJECT', 'EXCLUSIVE',
        'EXPANDS', 'EXTERN',
        'FACTORY', 'FLOAT-EXTENDED', 'FLOAT-LONG', 'FLOAT-SHORT',
        'FOREGROUND-COLOR', 'FOREVER', 'FORMAT', 'FREE', 'FUNCTION',
        'FUNCTION-ID',
        'GET', 'GROUP-USAGE',
        'HIGHLIGHT',
        'IGNORING', 'IMPLEMENTS', 'INHERITS', 'INITIALIZED', 'INSERT',
        'INTERFACE', 'INTERFACE-ID', 'INTRINSIC', 'INVOKE',
        'KANJI',
        'LC_ALL', 'LC_COLLATE', 'LC_CTYPE', 'LC_MESSAGES', 'LC_MONEY',
        'LC_NUMERIC', 'LC_TIME', 'LOCAL-STORAGE', 'LOCALE', 'LOWLIGHT',
        'MANUAL', 'METACLASS', 'METHOD', 'METHOD-ID', 'MINUS', 'MORE-LABELS',
        'NATIONAL', 'NATIONAL-EDITED', 'NATIONAL-OF', 'NATIVE_BINARY',
        'NESTED', 'NEW', 'NONE', 'NORMAL',
        'OBJECT', 'OBJECT-REFERENCE', 'ONLY', 'OPTIONS', 'OVERRIDE',
        'PHYSICAL', 'PRESENT', 'PREVIOUS', 'PROCEDURE-POINTER', 'PROCESSING',
        'PROGRAM-POINTER', 'PROPERTY', 'PROTOTYPE',
        'RAISE', 'RAISING', 'READY', 'RECURSIVE', 'REFERENCES', 'RELATION',
        'RELOAD', 'REPOSITORY', 'REQUIRED', 'RETURN-CODE', 'RETURNING',
        'ROUNDING',
        'SCREEN', 'SECONDS', 'SERVICE', 'SHARING', 'SHIFT-IN', 'SHIFT-OUT',
        'SIGNED', 'SKIP1', 'SKIP2', 'SKIP3', 'SORT-CONTROL',
        'SORT-CORE-SIZE', 'SORT-FILE-SIZE', 'SORT-MESSAGE', 'SORT-MODE-SIZE',
        'SORT-RETURN', 'SOURCES', 'STATEMENT', 'STEP', 'STRONG', 'SYMBOL',
        'SYSTEM-DEFAULT',
        'TRACE', 'TYPEDEF',
        'UCS-4', 'UNDERLINE', 'UNIVERSAL', 'UNLOCK', 'UNSIGNED',
        'USER-DEFAULT', 'UTF-16', 'UTF-8',
        'VAL-STATUS', 'VALID', 'VALIDATE', 'VALIDATE-STATUS',
        'WHEN-COMPILED', 'WRITE-ONLY',
        'YYYYDDD', 'YYYYMMDD',
    ]

    keywords_2014 = [
        'AWAY-FROM-ZERO', 'NEAREST-AWAY-FROM-ZERO', 'NEAREST-EVEN',
        'NEAREST-TOWARD-ZERO', 'TOWARD-GREATER', 'TOWARD-LESSER',
        'CAPACITY',
        'FLOAT-BINARY-128', 'FLOAT-BINARY-32', 'FLOAT-BINARY-64',
        'FLOAT-DECIMAL-16', 'FLOAT-DECIMAL-34', 'FLOAT-INFINITY',
        'FLOAT-NOT-A-NUMBER', 'FUNCTION-POINTER',
        'INTERMEDIATE',
        'PHYSICAL', 'PREFIXED', 'PROHIBITED',
        'SHORT', 'STANDARD-BINARY', 'STANDARD-DECIMAL',
        'TRUNCATION'
    ]

    keywords_ibm = ['ABSENT', 'ID', 'PASSWORD', 'UNBOUNDED']

    keywords_gnu = [
        'ARGUMENT-NUMBER', 'ARGUMENT-VALUE', 'ASCII',
        'BINARY-C-LONG', 'BINARY-SEQUENTIAL',
        'CARD-PUNCH', 'CARD-READER', 'CASSETTE', 'CHAIN', 'CHAINING',
        'COLOR', 'COMMAND-LINE', 'COMMIT', 'COMP-1', 'COMP-2', 'COMP-3',
        'COMP-4', 'COMP-5', 'COMP-6', 'COMP-X', 'COMPUTATIONAL-1',
        'COMPUTATIONAL-2', 'COMPUTATIONAL-3', 'COMPUTATIONAL-4',
        'COMPUTATIONAL-5', 'COMPUTATIONAL-6', 'COMPUTATIONAL-X',
        'CONVERSION', 'CRT-UNDER',
        'DISC', 'DISK',
        'EBCDIC', 'ECHO', 'END-CHAIN', 'ENTRY', 'ENVIRONMENT-NAME',
        'ENVIRONMENT-VALUE', 'ESCAPE',
        'F', 'FILE-ID', 'FIXED', 'FLOAT-DECIMAL-7',
        'ID', 'IGNORE',
        'KEPT', 'KEYBOARD',
        'LEFT-JUSTIFY', 'LEFTLINE', 'LINE-SEQUENTIAL', 'LOWER',
        'MAGNETIC-TAPE',
        'NAME', 'NO-ECHO', 'NOTHING',
        'OVERLINE',
        'PRINT', 'PRINTER', 'PRINTER-1', 'PROCEDURE-POINTER', 'PROCEDURES',
        'PROMPT', 'PROTECTED',
        'RECORDING', 'REVERSE', 'RIGHT-JUSTIFY', 'ROLLBACK',
        'S', 'SCROLL', 'SIGNED-INT', 'SIGNED-LONG', 'SIGNED-SHORT',
        'SPACE-FILL', 'STATIC', 'STDCALL', 'SYSTEM-OFFSET',
        'TAB', 'TIME-OUT', 'TRAILING-SIGN',
        'U', 'UNSIGNED-INT', 'UNSIGNED-LONG', 'UNSIGNED-SHORT', 'UPDATE',
        'UPPER', 'USER',
        'V', 'VARIABLE',
        'WAIT', 'WRAP',
        'ZERO-FILL'
    ]

    keywords_acu = [
        '3-D',
        'ACTION', 'ACTIVE-X', 'ADJUSTABLE-COLUMNS', 'ALIGNMENT',
        'AUTO-DECIMAL', 'AUTO-SPIN',
        'BACKGROUND-HIGH', 'BACKGROUND-LOW', 'BACKGROUND-STANDARD', 'BAR',
        'BITMAP', 'BITMAP-END', 'BITMAP-HANDLE', 'BITMAP-NUMBER',
        'BITMAP-START', 'BITMAP-TRAILING', 'BITMAP-TRANSPARENT-COLOR',
        'BITMAP-WIDTH', 'BOX', 'BOXED', 'BUSY', 'BUTTONS',
        'CALENDAR-FONT', 'CANCEL-BUTTON', 'CELL', 'CELL-COLOR', 'CELL-DATA',
        'CELL-FONT', 'CELL-PROTECTION', 'CENTERED-HEADING', 'CENTURY-DATE',
        'CHECK-BOX', 'CLEAR-SELECTION', 'CLINE', 'CLINES', 'COLORS',
        'COLUMN-COLOR', 'COLUMN-DIVIDERS', 'COLUMN-FONT', 'COLUMN-HEADINGS',
        'COLUMN-PROTECTION', 'COMBO-BOX', 'COPY-SELECTION', 'CSIZE',
        'CURSOR-COL', 'CURSOR-COLOR', 'CURSOR-FRAME-WIDTH', 'CURSOR-ROW',
        'CURSOR-X', 'CURSOR-Y', 'CUSTOM-PRINT-TEMPLATE',
        'DASHED', 'DATA-COLUMNS', 'DATA-TYPES', 'DATE-ENTRY',
        'DEFAULT-BUTTON', 'DEFAULT-FONT', 'DESTROY', 'DISPLAY-COLUMNS',
        'DISPLAY-FORMAT', 'DOTDASH', 'DOTTED', 'DOUBLE', 'DRAG-COLOR',
        'DROP-DOWN', 'DROP-LIST',
        'END-COLOR', 'END-MODIFY', 'ENGRAVED', 'ENSURE-VISIBLE',
        'ENTRY-FIELD', 'ENTRY-REASON', 'ESCAPE-BUTTON', 'EVENT',
        'EVENT-LIST', 'EXCEPTION-VALUE', 'EXPAND', 'EXTERNAL-FORM',
        'FILE-NAME', 'FILE-POS', 'FILL-COLOR', 'FILL-COLOR-2',
        'FILL-PERCENT', 'FINISH-REASON', 'FIXED-FONT', 'FIXED-WIDTH', 'FLAT',
        'FLAT-BUTTONS', 'FLOAT', 'FLOATING', 'FONT', 'FRAME', 'FRAMED',
        'FULL-HEIGHT',
        'GO-BACK', 'GO-FORWARD', 'GO-HOME', 'GO-SEARCH', 'GRAPHICAL', 'GRID',
        'GROUP-VALUE',
        'HANDLE', 'HAS-CHILDREN', 'HEADING-COLOR', 'HEADING-DIVIDER-COLOR',
        'HEADING-FONT', 'HEAVY', 'HEIGHT-IN-CELLS', 'HIDDEN-DATA',
        'HIGH-COLOR', 'HOT-TRACK', 'HSCROLL', 'HSCROLL-POS',
        'ICON', 'IDENTIFIED', 'INDEPENDENT', 'INQUIRE', 'INSERTION-INDEX',
        'INSERTION-ROWS', 'ITEM', 'ITEM-TEXT', 'ITEM-TO-ADD',
        'ITEM-TO-DELETE', 'ITEM-TO-EMPTY', 'ITEM-VALUE',
        'LABEL', 'LABEL-OFFSET', 'LARGE-FONT', 'LARGE-OFFSET', 'LAST-ROW',
        'LAYOUT-DATA', 'LAYOUT-MANAGER', 'LEADING-SHIFT', 'LEFT-TEXT',
        'LINES-AT-ROOT', 'LIST-BOX', 'LM-RESIZE', 'LONG-DATE', 'LOW-COLOR',
        'LOWERED',
        'MASS-UPDATE', 'MAX-LINES', 'MAX-PROGRESS', 'MAX-TEXT', 'MAX-VAL',
        'MEDIUM-FONT', 'MENU', 'MIN-VAL', 'MODIFY', 'MULTILINE',
        'NAVIGATE-URL', 'NEXT-ITEM', 'NO-AUTOSEL', 'NO-AUTO-DEFAULT',
        'NO-BOX', 'NO-DIVIDERS', 'NO-F4', 'NO-FOCUS', 'NO-GROUP-TAB',
        'NO-KEY-LETTER', 'NO-SEARCH', 'NO-UPDOWN', 'NOTAB', 'NOTIFY',
        'NOTIFY-CHANGE', 'NOTIFY-DBLCLICK', 'NOTIFY-SELCHANGE',
        'NUM-COL-HEADINGS', 'NUM-ROWS',
        'OK-BUTTON', 'OVERLAP-LEFT', 'OVERLAP-TOP',
        'PAGE-SETUP', 'PAGED', 'PARENT', 'PERMANENT', 'PIXEL', 'PLACEMENT',
        'POP-UP', 'POSITION-SHIFT', 'PRINT-NO-PROMPT', 'PRINT-PREVIEW',
        'PRIORITY', 'PROGRESS', 'PROPERTIES', 'PROPERTY', 'PUSH-BUTTON',
        'QUERY-INDEX',
        'RADIO-BUTTON', 'RAISED', 'READ-ONLY', 'RECORD-DATA',
        'RECORD-TO-ADD', 'RECORD-TO-DELETE', 'REFRESH', 'REGION-COLOR',
        'RESET-GRID', 'RESET-LIST', 'RESET-TABS', 'RIGHT-ALIGN', 'RIMMED',
        'ROW-COLOR', 'ROW-COLOR-PATTERN', 'ROW-DIVIDERS', 'ROW-FONT',
        'ROW-HEADINGS', 'ROW-PROTECTION',
        'SAVE-AS', 'SAVE-AS-NO-PROMPT', 'SCROLL-BAR', 'SEARCH-OPTIONS',
        'SEARCH-TEXT', 'SELECT-ALL', 'SELECTION-INDEX', 'SELECTION-TEXT',
        'SELF-ACT', 'SEPARATION', 'SHADING', 'SHADOW', 'SHORT-DATE',
        'SHOW-LINES', 'SHOW-NONE', 'SHOW-SEL-ALWAYS', 'SMALL-FONT',
        'SORT-ORDER', 'SPINNER', 'SQUARE', 'START-X', 'START-Y',
        'STATIC-LIST', 'STATUS-BAR', 'STATUS-TEXT', 'STYLE', 'SUBWINDOW',
        'TAB-TO-ADD', 'TAB-TO-DELETE', 'TEMPORARY', 'TERMINATION-VALUE',
        'THREAD', 'THREADS', 'THUMB-POSITION', 'TILED-HEADINGS', 'TITLE',
        'TITLE-POSITION', 'TRADITIONAL-FONT', 'TRAILING-SHIFT',
        'TRANSPARENT', 'TREE-VIEW',
        # 'USE-TAB' was previously spelled 'USE TAB' (with a space).
        'UNFRAMED', 'UNSORTED', 'USE-ALT', 'USE-RETURN', 'USE-TAB',
        'VALUE-FORMAT', 'VARIANT', 'VERTICAL', 'VERY-HEAVY',
        'VIRTUAL-WIDTH', 'VPADDING', 'VSCROLL', 'VSCROLL-BAR', 'VSCROLL-POS',
        'VTOP',
        'WEB-BROWSER', 'WIDTH', 'WIDTH-IN-CELLS', 'WINDOW',
        'X',
        'Y'
    ]

    if year in ['2002', '2014']:
        keywords += keywords_2002

    if year == '2014':
        keywords += keywords_2014

    # At most one vendor dialect applies; unknown dialects add nothing.
    vendor_keywords = {
        'acu': keywords_acu,
        'ibm': keywords_ibm,
        'gnu': keywords_gnu,
    }
    keywords += vendor_keywords.get(dialect, [])

    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    values = [
        'BLANK', 'SPACE', 'SPACES', 'ZERO', 'ZEROES', 'ZEROS',
        'FALSE', 'NO', 'OFF', 'ON', 'TRUE'
    ]

    values_2002 = ['NULL', 'NULLS', 'SELF', 'SUPER']

    if year in ['2002', '2014']:
        values += values_2002

    value_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    cobol_preprocessor_tb = CobolPreprocessorTokenBuilder()

    exec_tb = BlockTokenBuilder('EXEC', 'END-EXEC', 'exec block')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        picture_tb,
        cr_picture_tb,
        keyword_tb,
        star_comment_tb,  # before operator, to catch single star as comment
        known_operator_tb,
        groupers_tb,
        value_tb,
        identifier_tb,
        string_tb,
        n_string_tb,
        nx_string_tb,
        inline_comment_tb,
        cobol_preprocessor_tb,
        exec_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')

    self.convert_numbers_to_pictures()
    self.convert_numbers_to_levels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        # self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # self.calc_operand_n_confidence(tokens, operand_types, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_line_length_confidence(code, self.max_expected_line)
    self.calc_picture_confidence()

    expected_keyword_confidence = self.check_expected_keywords()
    self.confidences['expected_keywords'] = expected_keyword_confidence
def __init__(self, code, version):
    """Tokenize MATLAB/Octave source and compute language-confidence scores.

    Args:
        code: Source text to examine.
        version: Dialect selector. ``'octave'`` adds the Octave-only
            operators, keywords and ``#`` / ``#{ ... #}`` comment forms;
            any other value uses the base MATLAB sets only.
    """
    super().__init__()

    is_octave = version == 'octave'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    command_tb = PrefixedIdentifierTokenBuilder('!', 'command', False)
    metaclass_tb = PrefixedIdentifierTokenBuilder('?', 'metaclass', False)

    quotes = ['"', "'", "’"]
    string_tb = MatlabStringTokenBuilder(quotes, False)
    operand_types.append('string')

    line_comment_m_tb = LeadToEndOfLineTokenBuilder('%', False, 'comment')
    line_comment_o_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')
    block_comment_m_tb = BlockTokenBuilder('%{', '%}', 'comment')
    block_comment_o_tb = BlockTokenBuilder('#{', '#}', 'comment')

    line_continuation_tb = KeywordTokenBuilder('...', 'line continuation')

    known_operators = [
        '+', '-', '.*', '*', './', '/', '\\', '.^', '^', ".'", "'",
        '=', '==', '~=', '>', '>=', '<', '<=',
        '&', '|', '&&', '||', '~',
        '@',
        '.',
        '.?'
    ]

    operators_octave = [
        '++', '--', '+=', '-=', '*=', '/=', '^=', '!', '!=', '**'
    ]

    if is_octave:
        known_operators += operators_octave

    self.unary_operators = ['+', '-', '~', '@']
    self.postfix_operators = ["'"]

    groupers = ['(', ')', ',', '[', ']', '{', '}', ';', ':']
    group_starts = ['(', '[', ',', '{']
    # group_mids = [',', ';', ':']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'break', 'case', 'catch', 'classdef', 'continue',
        'else', 'elseif', 'end',
        'for', 'function', 'global', 'if', 'otherwise',
        'parfor', 'persistent',
        'return', 'spmd', 'switch', 'try', 'while'
    ]

    keywords_octave = ['endfor', 'endif', 'endwhile']

    if is_octave:
        keywords += keywords_octave

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    # Matching is case-sensitive, so list each accepted spelling.
    # The former 'Nan' entry is not a MATLAB spelling and never matched
    # real code; the documented forms are NaN/nan and Inf/inf.
    values = ['inf', 'Inf', 'nan', 'NaN']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        command_tb,
        metaclass_tb,
        string_tb,
        line_comment_m_tb,
        block_comment_m_tb
    ]

    # Octave-only comment forms go after the MATLAB ones.
    tokenbuilders_2 = [line_comment_o_tb, block_comment_o_tb]

    if is_octave:
        tokenbuilders += tokenbuilders_2

    # Fallback builders are always last.
    tokenbuilders_9 = [self.unknown_operator_tb, invalid_token_builder]

    tokenbuilders += tokenbuilders_9

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    # self.calc_group_confidence(tokens, group_mids)

    # operand_types_2 = ['number']
    # self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize HTML source and compute its language-confidence scores.

    Args:
        code: Source text to examine.
    """
    super().__init__()

    # Only referenced by the operand checks, which are not run here.
    operand_types = [
        'number', 'string', 'attribute', 'character', 'identifier'
    ]

    # Whitespace and line structure.
    newline_tb = NewlineTokenBuilder()
    whitespace_tb = WhitespaceTokenBuilder()

    # Numeric literals.
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)

    # Strings, attributes and character references.
    quote_chars = ['"', "'", "’"]
    string_tb = StuffedQuoteStringTokenBuilder(quote_chars, False)
    attribute_tb = HTMLAttributeTokenBuilder()
    unicode_tb = HTMLUnicodeTokenBuilder()

    # Tag delimiters act as the grouping tokens.
    tag_delimiters = ['<', '</', '>', '/>']
    groupers_tb = CaseInsensitiveListTokenBuilder(
        tag_delimiters, 'group', False)
    group_starts = ['<']
    group_ends = ['>', '/>']

    identifier_tb = HTMLIdentifierTokenBuilder()

    known_operator_tb = CaseSensitiveListTokenBuilder(
        ['='], 'operator', False)

    punctuation_marks = [
        '!', '@', '#', '$', '%', '^', '&', '*', '(', ')',
        '-', '_', '+', '`', '~', '[', ']', '{', '}',
        ';', ':', "'", '"', ',', '.', '?', '>', '/', '\\', '|'
        # omit '<'
    ]
    punctuation_tb = CaseInsensitiveListTokenBuilder(
        punctuation_marks, 'punctuation', False)

    tag_names = [
        'a', 'abbr', 'address', 'area', 'article', 'aside', 'audio',
        'b', 'base', 'bdi', 'bdo', 'blockquote', 'body', 'br', 'button',
        'canvas', 'caption', 'cite', 'code', 'col', 'colgroup',
        'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div',
        'dl', 'dt',
        'em', 'embed',
        'fieldset', 'figcaption', 'figure', 'footer', 'form',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup',
        'hr', 'html',
        'i', 'iframe', 'img', 'input', 'ins',
        'kbd', 'keygen',
        'label', 'legend', 'li', 'link',
        'main', 'map', 'mark', 'math', 'menu', 'menuitem', 'meta', 'meter',
        'nav', 'noscript',
        'object', 'ol', 'optgroup', 'option', 'output',
        'p', 'param', 'picture', 'pre', 'progress',
        'q',
        'rb', 'rp', 'rt', 'rtc', 'ruby',
        's', 'samp', 'script', 'section', 'select', 'slot', 'small',
        'source', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'svg',
        'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th',
        'thead', 'time', 'title', 'tr', 'track',
        'u', 'ul',
        'var', 'video',
        'wbr'
    ]
    keyword_tb = HTMLListTokenBuilder(tag_names, 'keyword', False)

    comment_tb = BlockTokenBuilder('<!--', '-->', 'comment')
    script_tb = BlockTokenBuilder('<script', '</script>', 'script')

    invalid_token_builder = InvalidTokenBuilder()

    # The builder order below is the order handed to the tokenizer.
    tokenizer = Tokenizer([
        newline_tb,
        whitespace_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        string_tb,
        attribute_tb,
        known_operator_tb,
        groupers_tb,
        keyword_tb,
        identifier_tb,
        punctuation_tb,
        comment_tb,
        script_tb,
        self.unknown_operator_tb,
        unicode_tb,
        invalid_token_builder
    ])

    raw_tokens = tokenizer.tokenize(code)
    merged_operators = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        merged_operators, 'invalid')

    self.calc_statistics()

    joined_tokens = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    # self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(
            joined_tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            joined_tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined_tokens, num_operators, group_starts, allow_pairs)

    # self.calc_group_confidence(tokens, group_mids)

    # self.calc_operand_n_confidence(tokens, operand_types, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()