def __escape_z__():
    """Invoke the __escape_z__ hook on every token-builder class this module uses."""
    token_builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        SuffixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        SuffixedRealTokenBuilder,
        IdentifierTokenBuilder,
        PrefixedIdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        SlashSlashCommentTokenBuilder,
        SlashStarCommentTokenBuilder,
        ClassTypeTokenBuilder,
        HexRealExponentTokenBuilder,
        NestedCommentTokenBuilder,
    )
    # Same calls, same order as before — just driven by a loop.
    for builder_class in token_builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke the __escape_z__ hook on every token-builder class this module uses."""
    token_builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        SuffixedRealTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        LeadToEndOfLineTokenBuilder,
    )
    # Same calls, same order as before — just driven by a loop.
    for builder_class in token_builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Invoke the __escape_z__ hook on every token-builder class this module uses."""
    token_builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StuffedQuoteStringTokenBuilder,
        IntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        SuffixedRealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        PrefixedIntegerTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        NullTokenBuilder,
        BasicVariableTokenBuilder,
        BasicLongVariableTokenBuilder,
        RemarkTokenBuilder,
        UserFunctionTokenBuilder,
        LongUserFunctionTokenBuilder,
        HardwareFunctionTokenBuilder,
    )
    # Same calls, same order as before — just driven by a loop.
    for builder_class in token_builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __init__(self, code, tab_size, wide):
    """Examine PL/1 source text.

    Tokenizes the code twice — once as free-format and once as fixed-format
    (column-sensitive, with tab expansion) — computes confidence scores for
    each interpretation, and keeps the tokens/statistics/confidences/errors
    of whichever interpretation scores higher.

    code: source text to examine
    tab_size: tab expansion width for the fixed-format pass
    wide: True when fixed-format lines may extend past the usual right margin
    """
    super().__init__()
    self.operand_types = []

    self.whitespace_tb = WhitespaceTokenBuilder()
    self.newline_tb = NewlineTokenBuilder()

    # numbers: plain integers, E-exponent integers, B-suffixed binary,
    # reals, E-exponent reals, and B-suffixed binary reals
    self.integer_tb = IntegerTokenBuilder(None)
    self.integer_exponent_tb = IntegerExponentTokenBuilder(None)
    self.binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False, None)
    self.real_tb = RealTokenBuilder(False, False, None)
    self.real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    self.binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'], False, None)
    self.operand_types.append('number')

    leads = '_'
    extras = '_'
    self.identifier_tb = IdentifierTokenBuilder(leads, extras)
    self.operand_types.append('identifier')

    # note: the third entry is a right single-quote character, kept deliberately
    quotes = ['"', "'", "’"]
    self.string_tb = EscapedStringTokenBuilder(quotes, 0)
    self.operand_types.append('string')

    self.label_tb = PL1LabelTokenBuilder()
    self.operand_types.append('label')

    self.slash_star_comment_tb = SlashStarCommentTokenBuilder()

    self.jcl_tb = JCLTokenBuilder()

    directives = [
        '%ACTIVATE', '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY',
        '%DO', '%ELSE', '%END', '%FATAL', '%GOTO', '%IF', '%INCLUDE',
        '%LIST', '%NOLIST', '%PAGE', '%PROCEDURE', '%PROC', '%REPLACE',
        '%RETURN', '%THEN'
    ]

    self.line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    self.preprocessor_tb = CaseInsensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    self.title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True, 'preprocessor')
    self.subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True, 'preprocessor')
    self.error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True, 'preprocessor')
    self.warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True, 'preprocessor')
    self.inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True, 'preprocessor')
    self.terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # both the EBCDIC not-sign (¬) and its ASCII stand-ins (^, ~) are accepted
    known_operators = [
        '+', '-', '*', '/', '**',
        '>', '<', '=', '>=', '<=',
        '¬>', '¬<', '¬=',
        '^>', '^<', '^=', '^',
        '~>', '~<', '~=', '~', '¬',
        '&', '&:', '|', '|:', '||', '!', '!:', '!!', ':'
    ]

    self.unary_operators = ['+', '-', '^', '~', '¬']
    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    self.group_starts = ['(', '[', ',', '{']
    self.group_mids = [',']
    self.group_ends = [')', ']', '}']

    self.groupers_tb = CaseInsensitiveListTokenBuilder(
        groupers, 'group', False)
    self.known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'ALLOCATE', 'ALLOC', 'BEGIN', 'CALL', 'CLOSE', 'DECLARE', 'DCL',
        'DO', 'ELSE', 'END', 'FORMAT', 'FREE', 'GET', 'GOTO', 'GO TO',
        'IF', 'LEAVE', 'ON', 'OPEN', 'OTHERWISE', 'OTHER', 'PROCEDURE',
        'PROC', 'PUT', 'READ', 'RETURN', 'REVERT', 'REWRITE', 'SELECT',
        'SIGNAL', 'STOP', 'THEN', 'WHEN', 'WRITE'
    ]

    self.keyword_tb = CaseInsensitiveListTokenBuilder(
        keywords, 'keyword', False)

    # FIX: the original had `'INT' 'KEYED'` (missing comma); Python's
    # implicit string concatenation silently made one bogus entry 'INTKEYED',
    # so neither INT nor KEYED was ever recognized as an attribute.
    attributes = [
        'ALIGNED', 'ANY', 'AREA', 'BASED', 'BUILTIN', 'CONDITION', 'COND',
        'CONTROLLED', 'CTL', 'DEFINED', 'DEF', 'DIRECT', 'ENTRY',
        'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT', 'FILE', 'GLOBALDEF',
        'GLOBALREF', 'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
        'KEYED', 'LABEL', 'LIKE', 'LIST', 'MEMBER', 'NONVARYING',
        'NONVAR', 'OPTIONAL', 'OPTIONS', 'OUTPUT', 'PARAMETER', 'PARM',
        'PICTURE', 'PIC', 'POSITION', 'POS', 'PRECISION', 'PREC', 'PRINT',
        'READONLY', 'RECORD', 'REFER', 'RETURNS', 'SEQUENTIAL', 'SEQL',
        'STATIC', 'STREAM', 'STRUCTURE', 'TRUNCATE', 'UNALIGNED', 'UNAL',
        'UNION', 'UPDATE', 'VARIABLE', 'VARYING', 'VAR'
    ]

    self.attributes_tb = CaseInsensitiveListTokenBuilder(
        attributes, 'attribute', False)

    functions = [
        'ABS', 'ACOS', 'ACTUALCOUNT', 'ADD', 'ADDR', 'ADDREL',
        'ALLOCATION', 'ALLOCN', 'ASIN', 'ATAN', 'ATAND', 'ATANH',
        'AUTOMATIC', 'AUTO', 'BINARY', 'BIN', 'BIT', 'BOOL', 'BYTE',
        'BYTESIZE', 'CEIL', 'CHARACTER', 'CHAR', 'COLLATE', 'COPY',
        'COS', 'COSD', 'COSH', 'DATE', 'DATETIME', 'DECIMAL', 'DEC',
        'DECODE', 'DESCRIPTOR', 'DESC', 'DIMENSION', 'DIM', 'DIVIDE',
        'EMPTY', 'ENCODE', 'ERROR', 'EVERY', 'EXP', 'FIXED', 'FLOAT',
        'FLOOR', 'HBOUND', 'HIGH', 'INDEX', 'INFORM', 'INT', 'LBOUND',
        'LENGTH', 'LINE', 'LINENO', 'LOG', 'LOG10', 'LOG2', 'LOW',
        'LTRIM', 'MAX', 'MAXLENGTH', 'MIN', 'MOD', 'MULTIPLY', 'NULL',
        'OFFSET', 'ONARGSLIST', 'ONCHAR', 'ONCODE', 'ONFILE', 'ONKEY',
        'ONSOURCE', 'PAGENO', 'POINTER', 'PTR', 'POSINT', 'PRESENT',
        'PROD', 'RANK', 'REFERENCE', 'REVERSE', 'ROUND', 'RTRIM',
        'SEARCH', 'SIGN', 'SIN', 'SIND', 'SINH', 'SIZE', 'SOME', 'SQRT',
        'STRING', 'SUBSTR', 'SUBTRACT', 'SUM', 'TAN', 'TAND', 'TANH',
        'TIME', 'TRANSLATE', 'TRIM', 'TRUNC', 'UNSPEC', 'VALID',
        'VALUE', 'VAL', 'VARIANT', 'VERIFY', 'WARN'
    ]

    self.function_tb = CaseInsensitiveListTokenBuilder(
        functions, 'function', True)

    format_items = [
        'A', 'B', 'B1', 'B2', 'B3', 'B4', 'COLUMN', 'COL', 'E', 'F',
        'P', 'R', 'TAB', 'X'
    ]

    # NOTE(review): format_item_tb is built but never added to any
    # tokenizer list below — confirm whether it should be included.
    self.format_item_tb = CaseSensitiveListTokenBuilder(
        format_items, 'format', True)
    self.operand_types.append('format')

    options = [
        'APPEND', 'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT',
        'BLOCK_IO', 'BLOCK_SIZE', 'BUCKET_SIZE', 'BY',
        'CANCEL_CONTROL_O', 'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS',
        'CONTIGUOUS_BEST_TRY', 'CREATION_DATE', 'CURRENT_POSITION',
        'DEFAULT_FILE_NAME', 'DEFERRED_WRITE', 'DELETE', 'EDIT',
        'EXPIRATION_DATE', 'EXTENSION_SIZE', 'FAST_DELETE', 'FILE_ID',
        'FILE_ID_TO', 'FILE_SIZE', 'FIXED_CONTROL_FROM',
        'FIXED_CONTROL_SIZE', 'FIXED_CONTROL_SIZE_TO',
        'FIXED_CONTROL_TO', 'FIXED_LENGTH_RECORDS', 'FROM',
        'GROUP_PROTECTION', 'IDENT', 'IGNORE_LINE_MARKS', 'IN',
        'INDEXED', 'INDEX_NUMBER', 'INITIAL_FILL', 'INTO', 'KEY',
        'KEYFROM', 'KEYTO', 'LINESIZE', 'LOCK_ON_READ', 'LOCK_ON_WRITE',
        'MAIN PROCEDURE', 'MANUAL_UNLOCKING', 'MATCH_GREATER',
        'MATCH_GREATER_EQUAL', 'MATCH_NEXT', 'MATCH_NEXT_EQUAL',
        'MAXIMUM_RECORD_NUMBER', 'MAXIMUM_RECORD_SIZE',
        'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT', 'NOLOCK',
        'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN', 'NO_ECHO',
        'NO_FILTER', 'NO_SHARE', 'OWNER_GROUP', 'OWNER_ID',
        'OWNER_MEMBER', 'OWNER_PROTECTION', 'PAGE', 'PAGESIZE',
        'PRINTER_FORMAT', 'PROMPT', 'PURGE_TYPE_AHEAD', 'READ_AHEAD',
        'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID',
        'RECORD_ID_ACCESS', 'RECORD_ID_TO', 'RECURSIVE', 'REPEAT',
        'RESCAN', 'RETRIEVAL_POINTERS', 'REVISION_DATE',
        'REWIND_ON_CLOSE', 'REWIND_ON_OPEN', 'SCALARVARYING',
        'SET READ', 'SHARED_READ', 'SHARED_WRITE', 'SKIP', 'SNAP',
        'SPOOL', 'STATEMENT', 'SUPERSEDE', 'SYSTEM',
        'SYSTEM_PROTECTION', 'TEMPORARY', 'TIMEOUT_PERIOD', 'TITLE',
        'TO', 'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
        'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
        'WRITE_CHECK'
    ]

    self.options_tb = CaseInsensitiveListTokenBuilder(
        options, 'option', False)

    conditions = [
        'ANYCONDITION', 'CONVERSION', 'CONV', 'ENDFILE', 'ENDPAGE',
        'FINISH', 'FIXEDOVERFLOW', 'FOFL', 'OVERFLOW', 'OFL', 'STORAGE',
        'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG',
        'UNDEFINEDFILE', 'UNDF', 'VAXCONDITION', 'ZERODIVIDE', 'ZDIV'
    ]

    self.conditions_tb = CaseInsensitiveListTokenBuilder(
        conditions, 'condition', False)

    subroutines = [
        'DISPLAY', 'EXTEND', 'FLUSH', 'NEXT_VOLUME', 'RELEASE',
        'RESIGNAL', 'REWIND', 'SPACEBLOCK'
    ]

    self.subroutines_tb = CaseInsensitiveListTokenBuilder(
        subroutines, 'subroutine', False)

    types = [
        'FIXED', 'BINARY', 'FLOAT', 'DECIMAL', 'BIT', 'CHARACTER',
        'PICTURE'
    ]

    self.types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
    self.operand_types.append('type')

    values = ['SYSIN', 'SYSPRINT']

    self.values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    self.operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # tokenize as free-format
    tokenbuilders_free = [
        self.newline_tb,
        self.whitespace_tb,
        self.line_continuation_tb,
        self.terminators_tb,
        self.integer_tb,
        self.integer_exponent_tb,
        self.binary_integer_tb,
        self.real_tb,
        self.real_exponent_tb,
        self.binary_real_tb,
        self.keyword_tb,
        self.function_tb,
        self.attributes_tb,
        self.options_tb,
        self.conditions_tb,
        self.subroutines_tb,
        self.types_tb,
        self.values_tb,
        self.groupers_tb,
        self.known_operator_tb,
        self.identifier_tb,
        self.string_tb,
        self.label_tb,
        self.slash_star_comment_tb,
        self.preprocessor_tb,
        self.title_tb,
        self.subtitle_tb,
        self.error_tb,
        self.warn_tb,
        self.inform_tb,
        self.jcl_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer_free = Tokenizer(tokenbuilders_free)
    tokens_free = tokenizer_free.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')
    self.tokens = tokens_free
    self.calc_statistics()
    statistics_free = self.statistics
    self.statistics = {}

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators,
                                        self.group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators,
                                        self.group_starts, allow_pairs)

    self.calc_group_confidence(tokens, self.group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, self.operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)

    # stash the free-format results and reset for the fixed-format pass
    confidences_free = self.confidences
    self.confidences = {}
    errors_free = self.errors
    self.errors = []

    # tokenize as fixed-format
    tokenbuilders_fixed = [
        self.newline_tb,
        self.whitespace_tb,
        self.line_continuation_tb,
        self.terminators_tb,
        self.integer_tb,
        self.integer_exponent_tb,
        self.binary_integer_tb,
        self.real_tb,
        self.real_exponent_tb,
        self.binary_real_tb,
        self.keyword_tb,
        self.function_tb,
        self.attributes_tb,
        self.options_tb,
        self.conditions_tb,
        self.subroutines_tb,
        self.types_tb,
        self.values_tb,
        self.groupers_tb,
        self.known_operator_tb,
        self.identifier_tb,
        self.string_tb,
        self.label_tb,
        self.slash_star_comment_tb,
        self.preprocessor_tb,
        self.title_tb,
        self.subtitle_tb,
        self.error_tb,
        self.warn_tb,
        self.inform_tb,
        self.jcl_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    # comment fragments that a line-oriented fixed-format pass can produce
    comment_start_tb = PL1CommentStartTokenBuilder()
    comment_middle_tb = PL1CommentMiddleTokenBuilder()
    comment_end_tb = PL1CommentEndTokenBuilder()

    type1_tokenbuilders = [comment_start_tb]
    tokenbuilders_fixed_1 = tokenbuilders_fixed + type1_tokenbuilders + [
        invalid_token_builder
    ]
    tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

    type2_tokenbuilders = [comment_start_tb, comment_middle_tb, comment_end_tb]
    tokenbuilders_fixed_2 = tokenbuilders_fixed + type2_tokenbuilders + [
        invalid_token_builder
    ]
    tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

    tokens_fixed = self.tokenize_code(code, tab_size, tokenizer_fixed_1,
                                      tokenizer_fixed_2, wide)
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid operator')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'whitespace')
    tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)
    self.tokens = tokens_fixed
    self.calc_statistics()
    statistics_fixed = self.statistics
    self.statistics = {}

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators,
                                        self.group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators,
                                        self.group_starts, allow_pairs)

    self.calc_group_confidence(tokens, self.group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, self.operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_fixed = self.confidences
    self.confidences = {}
    errors_fixed = self.errors
    self.errors = []

    # compute overall confidence for free-format and fixed-format:
    # the product of all per-check factors (0.0 when no checks ran)
    confidence_free = 1.0
    if len(confidences_free) == 0:
        confidence_free = 0.0
    else:
        for key in confidences_free:
            factor = confidences_free[key]
            confidence_free *= factor

    confidence_fixed = 1.0
    if len(confidences_fixed) == 0:
        confidence_fixed = 0.0
    else:
        for key in confidences_fixed:
            factor = confidences_fixed[key]
            confidence_fixed *= factor

    # keep whichever interpretation scored higher (ties go to free-format)
    if confidence_fixed > confidence_free:
        self.tokens = tokens_fixed
        self.statistics = statistics_fixed
        self.confidences = confidences_fixed
        self.errors = errors_fixed
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code, tab_size, wide):
    """Examine PL/M source text.

    Tokenizes the code twice — once as free-format and once as fixed-format
    (column-sensitive, with tab expansion) — computes confidence scores for
    each interpretation, and keeps the tokens/statistics/confidences/errors
    of whichever interpretation scores higher.

    code: source text to examine
    tab_size: tab expansion width for the fixed-format pass
    wide: True when fixed-format lines may extend past the usual right margin
    """
    super().__init__()
    self.operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # numbers: plain/exponent integers, H/O/D/B suffixed radix forms,
    # reals, E-exponent reals, and B-suffixed binary reals
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False, None)
    hex_integer_tb = SuffixedIntegerTokenBuilder(['H'], False, 'ABCDEF')
    octal_integer_tb = SuffixedIntegerTokenBuilder(['O'], False, None)
    decimal_integer_tb = SuffixedIntegerTokenBuilder(['D'], False, None)
    real_tb = RealTokenBuilder(True, False, None)
    real_exponent_tb = RealExponentTokenBuilder(True, False, 'E', None)
    binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'], False, None)
    self.operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    self.operand_types.append('identifier')

    # note: the third entry is a right single-quote character, kept deliberately
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    self.operand_types.append('string')

    label_tb = PL1LabelTokenBuilder()
    self.operand_types.append('label')

    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    directives = [
        '%ACTIVATE', '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY',
        '%DO', '%ELSE', '%END', '%FATAL', '%GOTO', '%IF', '%INCLUDE',
        '%LIST', '%NOLIST', '%PAGE', '%PROCEDURE', '%PROC', '%REPLACE',
        '%RETURN', '%THEN'
    ]

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    preprocessor_tb = CaseInsensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True, 'preprocessor')
    subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True, 'preprocessor')
    error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True, 'preprocessor')
    warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True, 'preprocessor')
    inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True, 'preprocessor')
    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '**',
        '>', '<', '=', '>=', '<=', '<>',
        '^>', '^<', '^=', '^',
        '~>', '~<', '~=', '~',
        '&', '&:', ':=', '|', '|:', '||', '!', '!:', '!!', ':', '@',
        'NOT', 'AND', 'OR', 'XOR', 'MINUS', 'PLUS', 'MOD'
    ]

    self.unary_operators = ['+', '-', '^', '~', '@', 'NOT']
    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    self.group_starts = ['(', '[', ',', '{']
    self.group_mids = [',']
    self.group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'ADDRESS', 'AT', 'BASED', 'BY', 'CALL', 'CASE', 'CLOSE', 'DATA',
        'DECLARE', 'DISABLE', 'DO', 'ELSE', 'ENABLE', 'END', 'EOF',
        'EXTERNAL', 'GO', 'GOTO', 'HALT', 'IF', 'INITIAL', 'INTERRUPT',
        'LABEL', 'LITERALLY', 'OFFSET', 'ON', 'OPEN', 'OTHERWISE',
        'OTHER', 'PROCEDURE', 'PROC', 'PUBLIC', 'READ', 'REENTRANT',
        'RETURN', 'SELECTOR', 'STRUCTURE', 'THEN', 'TO', 'WHILE', 'WRITE'
    ]

    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    # FIX: the original had `'INT' 'KEYED'` (missing comma); Python's
    # implicit string concatenation silently made one bogus entry 'INTKEYED',
    # so neither INT nor KEYED was ever recognized as an attribute.
    attributes = [
        'ALIGNED', 'ANY', 'AREA', 'BASED', 'BUILTIN', 'CONDITION', 'COND',
        'CONTROLLED', 'CTL', 'DEFINED', 'DEF', 'DIRECT', 'ENTRY',
        'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT', 'FILE', 'GLOBALDEF',
        'GLOBALREF', 'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
        'KEYED', 'LABEL', 'LIKE', 'LIST', 'MEMBER', 'NONVARYING',
        'NONVAR', 'OPTIONAL', 'OPTIONS', 'OUTPUT', 'PARAMETER', 'PARM',
        'PICTURE', 'PIC', 'POSITION', 'POS', 'PRECISION', 'PREC', 'PRINT',
        'READONLY', 'RECORD', 'REFER', 'RETURNS', 'SEQUENTIAL', 'SEQL',
        'STATIC', 'STREAM', 'STRUCTURE', 'TRUNCATE', 'UNALIGNED', 'UNAL',
        'UNION', 'UPDATE', 'VARIABLE', 'VARYING', 'VAR'
    ]

    attributes_tb = CaseInsensitiveListTokenBuilder(
        attributes, 'attribute', False)

    # FIX: three more missing commas repaired below — the original fused
    # 'INVALIDATETLBENTRY'+'INDWORD', 'MOVRBIT'+'MOVRD', and
    # 'SKIPRD'+'SKIPHW' into single bogus entries via implicit
    # string concatenation, hiding all six builtin names.
    # NOTE(review): 'INITREALMATHUNITSKIPRB' also looks like a fusion of
    # INITREALMATHUNIT and SKIPRB, but it is one literal in the original —
    # confirm against the PL/M builtin list before splitting it.
    functions = [
        'ABS', 'ADJUSTRPL', 'BLOCKINPUT', 'BLOCKINWORD', 'BLOCKINDWORD',
        'BLOCKOUTPUT', 'BLOCKOUTWORD', 'BLOCKOUTDWORD', 'BUILDPTR',
        'BYTESWAP', 'CMPD', 'CARRY', 'CAUSEINTERRUPT',
        'CLEARTASKSWITCHEDFLAG', 'CMPB', 'CMPW', 'CONTROLREGISTER',
        'DEC', 'DOUBLE', 'DEBUGREGISTER', 'FINDB', 'FINDD', 'FINDRD',
        'FINDHW', 'FINDRB', 'FINDRHW', 'FINDRW', 'FINDW', 'FIX',
        'FLAGS', 'FLOAT', 'GETACCESSRIGHTS', 'GETREALERROR',
        'GETSEGMENTLIMIT', 'HIGH', 'IABS', 'INHWORD',
        'INITREALMATHUNITSKIPRB', 'INPUT', 'INT SIZE', 'INWORD SIZE',
        'INVALIDATEDATACACHE', 'INVALIDATETLBENTRY', 'INDWORD', 'LAST',
        'LENGTH', 'LOCALTABLE', 'LOCKSET', 'LOW', 'MACHINESTATUS',
        'MOVB', 'MOVBIT', 'MOVD', 'MOVE', 'MOVHW', 'MOVRB', 'MOVRBIT',
        'MOVRD', 'MOVRHW', 'MOVRW', 'MOVW', 'NIL', 'OFFSETOF',
        'OUTDWORD', 'OUTHWORD', 'OUTPUT', 'OUTWORD', 'PARITY',
        'RESTOREGLOBALTABLE', 'RESTOREINTERRUPTABLE',
        'RESTOREREALSTATUS', 'ROL', 'ROR', 'SAL', 'SAR',
        'SAVEGLOBALTABLE', 'SAVEINTERRUPTTABLE', 'SAVEREALSTATUS',
        'SCANBIT', 'SCANRBIT', 'SCL', 'SCR', 'SEGMENTREADABLE',
        'SEGMENTWRITABLE', 'SELECTOROF', 'SETB', 'SETHW', 'SETREALMODE',
        'SETW', 'SHL', 'SHLD', 'SHR', 'SHRD', 'SETD', 'SIGN', 'SIGNED',
        'SKIPB', 'SKIPD', 'SKIPRD', 'SKIPHW', 'SKIPRHW', 'SKIPRW',
        'SKIPW', 'STACKBASE', 'STACKPTR', 'TASKREGISTER',
        'TESTREGISTER', 'TIME', 'UNSIGN', 'WAITFORINTERRUPT',
        'WBINVALIDATEDATACACHE', 'XLAT', 'ZERO'
    ]

    function_tb = CaseInsensitiveListTokenBuilder(functions, 'function', True)

    format_items = [
        'A', 'B', 'B1', 'B2', 'B3', 'B4', 'COLUMN', 'COL', 'E', 'F',
        'P', 'R', 'TAB', 'X'
    ]

    format_item_tb = CaseSensitiveListTokenBuilder(format_items, 'format', True)
    self.operand_types.append('format')

    options = [
        'APPEND', 'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT',
        'BLOCK_IO', 'BLOCK_SIZE', 'BUCKET_SIZE', 'BY',
        'CANCEL_CONTROL_O', 'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS',
        'CONTIGUOUS_BEST_TRY', 'CREATION_DATE', 'CURRENT_POSITION',
        'DEFAULT_FILE_NAME', 'DEFERRED_WRITE', 'DELETE', 'EDIT',
        'EXPIRATION_DATE', 'EXTENSION_SIZE', 'FAST_DELETE', 'FILE_ID',
        'FILE_ID_TO', 'FILE_SIZE', 'FIXED_CONTROL_FROM',
        'FIXED_CONTROL_SIZE', 'FIXED_CONTROL_SIZE_TO',
        'FIXED_CONTROL_TO', 'FIXED_LENGTH_RECORDS', 'FROM',
        'GROUP_PROTECTION', 'IDENT', 'IGNORE_LINE_MARKS', 'IN',
        'INDEXED', 'INDEX_NUMBER', 'INITIAL_FILL', 'INTO', 'KEY',
        'KEYFROM', 'KEYTO', 'LINESIZE', 'LOCK_ON_READ', 'LOCK_ON_WRITE',
        'MAIN PROCEDURE', 'MANUAL_UNLOCKING', 'MATCH_GREATER',
        'MATCH_GREATER_EQUAL', 'MATCH_NEXT', 'MATCH_NEXT_EQUAL',
        'MAXIMUM_RECORD_NUMBER', 'MAXIMUM_RECORD_SIZE',
        'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT', 'NOLOCK',
        'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN', 'NO_ECHO',
        'NO_FILTER', 'NO_SHARE', 'OWNER_GROUP', 'OWNER_ID',
        'OWNER_MEMBER', 'OWNER_PROTECTION', 'PAGE', 'PAGESIZE',
        'PRINTER_FORMAT', 'PROMPT', 'PURGE_TYPE_AHEAD', 'READ_AHEAD',
        'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID',
        'RECORD_ID_ACCESS', 'RECORD_ID_TO', 'RECURSIVE', 'REPEAT',
        'RESCAN', 'RETRIEVAL_POINTERS', 'REVISION_DATE',
        'REWIND_ON_CLOSE', 'REWIND_ON_OPEN', 'SCALARVARYING',
        'SET READ', 'SHARED_READ', 'SHARED_WRITE', 'SKIP', 'SNAP',
        'SPOOL', 'STATEMENT', 'SUPERSEDE', 'SYSTEM',
        'SYSTEM_PROTECTION', 'TEMPORARY', 'TIMEOUT_PERIOD', 'TITLE',
        'TO', 'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
        'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
        'WRITE_CHECK'
    ]

    options_tb = CaseInsensitiveListTokenBuilder(options, 'option', False)

    conditions = [
        'ANYCONDITION', 'CONVERSION', 'CONV', 'ENDFILE', 'ENDPAGE',
        'FINISH', 'FIXEDOVERFLOW', 'FOFL', 'OVERFLOW', 'OFL', 'STORAGE',
        'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG',
        'UNDEFINEDFILE', 'UNDF', 'VAXCONDITION', 'ZERODIVIDE', 'ZDIV'
    ]

    conditions_tb = CaseInsensitiveListTokenBuilder(
        conditions, 'condition', False)

    subroutines = [
        'DISPLAY', 'EXTEND', 'FLUSH', 'NEXT_VOLUME', 'RELEASE',
        'RESIGNAL', 'REWIND', 'SPACEBLOCK'
    ]

    subroutines_tb = CaseInsensitiveListTokenBuilder(
        subroutines, 'subroutine', False)

    types = [
        'ADDRESS', 'BYTE', 'CHARINT', 'DWORD', 'HWORD', 'INTEGER',
        'LONGINT', 'OFFSET', 'POINTER', 'REAL', 'SHORTINT', 'STRUCTURE',
        'QWORD', 'WORD'
    ]

    types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
    self.operand_types.append('type')

    values = ['SYSIN', 'SYSPRINT', 'TRUE', 'FALSE']

    values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    self.operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # tokenize as free-format
    tokenbuilders_free = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        binary_integer_tb,
        hex_integer_tb,
        octal_integer_tb,
        decimal_integer_tb,
        real_tb,
        real_exponent_tb,
        binary_real_tb,
        keyword_tb,
        format_item_tb,
        function_tb,
        attributes_tb,
        options_tb,
        conditions_tb,
        subroutines_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        string_tb,
        label_tb,
        slash_star_comment_tb,
        preprocessor_tb,
        title_tb,
        subtitle_tb,
        error_tb,
        warn_tb,
        inform_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer_free = Tokenizer(tokenbuilders_free)
    tokens_free = tokenizer_free.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')
    self.tokens = tokens_free
    self.calc_statistics()
    statistics_free = self.statistics
    self.statistics = {}

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators,
                                        self.group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators,
                                        self.group_starts, allow_pairs)

    self.calc_group_confidence(tokens, self.group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, self.operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)

    # stash the free-format results and reset for the fixed-format pass
    confidences_free = self.confidences
    self.confidences = {}
    errors_free = self.errors
    self.errors = []

    # tokenize as fixed-format
    # NOTE(review): unlike tokenbuilders_free, this list omits
    # format_item_tb — confirm whether that asymmetry is intentional.
    tokenbuilders_fixed = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        binary_integer_tb,
        hex_integer_tb,
        octal_integer_tb,
        decimal_integer_tb,
        real_tb,
        real_exponent_tb,
        binary_real_tb,
        keyword_tb,
        function_tb,
        attributes_tb,
        options_tb,
        conditions_tb,
        subroutines_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        string_tb,
        label_tb,
        slash_star_comment_tb,
        preprocessor_tb,
        title_tb,
        subtitle_tb,
        error_tb,
        warn_tb,
        inform_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    # comment fragments that a line-oriented fixed-format pass can produce
    comment_start_tb = PL1CommentStartTokenBuilder()
    comment_middle_tb = PL1CommentMiddleTokenBuilder()
    comment_end_tb = PL1CommentEndTokenBuilder()

    type1_tokenbuilders = [comment_start_tb]
    tokenbuilders_fixed_1 = tokenbuilders_fixed + type1_tokenbuilders + [
        invalid_token_builder
    ]
    tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

    type2_tokenbuilders = [comment_start_tb, comment_middle_tb, comment_end_tb]
    tokenbuilders_fixed_2 = tokenbuilders_fixed + type2_tokenbuilders + [
        invalid_token_builder
    ]
    tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

    tokens_fixed = self.tokenize_code(code, tab_size, tokenizer_fixed_1,
                                      tokenizer_fixed_2, wide)
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid operator')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'whitespace')
    tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)
    self.tokens = tokens_fixed
    self.calc_statistics()
    statistics_fixed = self.statistics
    self.statistics = {}

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators,
                                        self.group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators,
                                        self.group_starts, allow_pairs)

    self.calc_group_confidence(tokens, self.group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, self.operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_fixed = self.confidences
    self.confidences = {}
    errors_fixed = self.errors
    self.errors = []

    # compute overall confidence for free-format and fixed-format:
    # the product of all per-check factors (0.0 when no checks ran)
    confidence_free = 1.0
    if len(confidences_free) == 0:
        confidence_free = 0.0
    else:
        for key in confidences_free:
            factor = confidences_free[key]
            confidence_free *= factor

    confidence_fixed = 1.0
    if len(confidences_fixed) == 0:
        confidence_fixed = 0.0
    else:
        for key in confidences_fixed:
            factor = confidences_fixed[key]
            confidence_fixed *= factor

    # keep whichever interpretation scored higher (ties go to free-format)
    if confidence_fixed > confidence_free:
        self.tokens = tokens_fixed
        self.statistics = statistics_fixed
        self.confidences = confidences_fixed
        self.errors = errors_fixed
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code, block_comment_limit):
    """Tokenize D source text and compute language-confidence statistics.

    code: source text to examine.
    block_comment_limit: nesting limit passed to the '/+ +/' nested-comment
        token builder.
    """
    super().__init__()

    # Operand token categories recognized by this examiner; appended to as
    # the corresponding token builders are created below.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Numeric literals; "'" is accepted as a digit separator.
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['U', 'L', 'LU', 'UL'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l', 'i'], False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    hex_real_tb = HexRealExponentTokenBuilder()
    operand_types.append('number')

    # Identifiers may lead with and contain underscores.
    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # '@'-prefixed attributes (e.g. @safe).
    attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    operand_types.append('attribute')

    # string suffix: c,w,d
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    r_string_tb = PrefixedStringTokenBuilder('r', True, quotes)
    backtick_string_tb = EscapedStringTokenBuilder(['`'], 0)
    x_string_tb = PrefixedStringTokenBuilder('x', True, quotes)
    q_string_tb = PrefixedStringTokenBuilder('q', True, quotes)
    # q{} string
    cwd_string_tb = SuffixedStringTokenBuilder(quotes, 'cwd', False)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    # Comments: '//', '/* */', and D's nestable '/+ +/'.
    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    slash_plus_comment_tb = NestedCommentTokenBuilder('/+', '+/', block_comment_limit)

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
        '/', '/=', '.', '..', '...', '&', '&=', '&&',
        '|', '|=', '||', '-', '-=', '--', '+', '+=', '++',
        '<', '<=', '<<', '<<=', '>', '>=', '>>=', '>>>=',
        '>>', '>>>', '!', '!=', '?', ',', ':', '$',
        '=', '==', '*', '*=', '%', '%=', '^', '^=',
        '^^', '^^=', '~', '~=', '@', '=>', '#',
        'new', 'delete', 'typeof', 'is'
    ]

    self.unary_operators = [
        '+', '-', '*', '!', '&', '~', '++', '--', ':',
        'new', 'delete', 'typeof', 'is'
    ]

    self.postfix_operators = [
        '++', '--', '&', ':'
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
        'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
        'body', 'break', 'case', 'cast', 'catch', 'class', 'const',
        'continue', 'debug', 'default', 'delegate', 'deprecated', 'do',
        'else', 'enum', 'export', 'extern', 'final', 'finally', 'for',
        'foreach', 'foreach_reverse', 'function', 'goto', 'if',
        'immutable', 'import', 'in', 'inout', 'interface', 'invariant',
        'lazy', 'macro', 'mixin', 'module', 'nothrow', 'out', 'override',
        'package', 'pragma', 'private', 'protected', 'public', 'pure',
        'ref', 'return', 'scope', 'shared', 'static', 'struct', 'switch',
        'synchronized', 'template', 'throw', 'try', 'typeid', 'union',
        'unittest', 'version', 'while', 'with',
        '__gshared', '__traits', '__vector', '__parameters'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'bool', 'byte', 'cdouble', 'cent', 'cfloat', 'char', 'creal',
        'dchar', 'double', 'float', 'idouble', 'ifloat', 'int', 'ireal',
        'long', 'real', 'short', 'ubyte', 'ucent', 'uint', 'ulong',
        'ushort', 'void', 'wchar'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = [
        'false', 'null', 'super', 'this', 'true',
        '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
        '__FUNCTION__', '__PRETTY_FUNCTION__', '__DATE__', '__EOF__',
        '__TIME__', '__TIMESTAMP__', '__VENDOR__', '__VERSION__'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Full builder list handed to the tokenizer; list order is preserved
    # exactly as written.
    tokenbuilders = [
        newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
        integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
        suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
        hex_real_tb, keyword_tb, types_tb, values_tb, groupers_tb,
        known_operator_tb, identifier_tb, attribute_tb, class_type_tb,
        string_tb, r_string_tb, x_string_tb, backtick_string_tb,
        q_string_tb, cwd_string_tb, slash_slash_comment_tb,
        slash_star_comment_tb, slash_plus_comment_tb,
        self.unknown_operator_tb, invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    # Fold numeric/string suffixes that tokenized as separate identifiers
    # back into the preceding number/string token.
    # NOTE(review): the local 'tokens' is reused after
    # convert_identifiers_to_labels(); presumably that method mutates the
    # same list in place — confirm.
    number_suffixes = ['f', 'F', 'i', 'I', 'u', 'U', 'l', 'L',
                       'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU']
    tokens = self.combine_tokens_and_adjacent_types(tokens, 'number', 'identifier', number_suffixes)
    string_suffixes = ['c', 'w', 'd']
    self.tokens = self.combine_tokens_and_adjacent_types(tokens, 'string', 'identifier', string_suffixes)

    self.calc_statistics()

    # Confidence calculations operate on the joined token stream.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, block_comment_limit):
    """Tokenize Julia source text and compute language-confidence statistics.

    code: source text to examine.
    block_comment_limit: nesting limit passed to the '#= =#' nested-comment
        token builder.
    """
    super().__init__()

    # Newlines are significant except inside parentheses.
    self.newlines_important = 'parens'

    # Operand token categories recognized by this examiner.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Numeric literals (no digit separator configured here).
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF')
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    # 'im'/'cx' suffixed reals (imaginary/complex literals).
    imaginary_tb = SuffixedRealTokenBuilder(False, False, ['im', 'cx'], True, None)
    operand_types.append('number')

    # Identifiers may end with '!' (mutating-function convention).
    leads = '_'
    extras = '_'
    suffixes = '!'
    identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
    operand_types.append('identifier')

    # ':name' symbols and '@name' macros/attributes.
    symbol_tb = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
    operand_types.append('symbol')
    attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    operand_types.append('attribute')

    dollar_sign_tb = SingleCharacterTokenBuilder('$', 'identifier', True)

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    raw_string_tb = PrefixedRawStringTokenBuilder('raw', True, quotes)
    b_string_tb = PrefixedStringTokenBuilder('b', True, quotes)
    triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
    operand_types.append('string')

    # Comments: '#' to end of line and nestable '#= =#'.
    comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')
    nested_comment_tb = NestedCommentTokenBuilder('#=', '=#', block_comment_limit)

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
        'where', 'in', 'isa', '′', "'",
        '+', '-', '*', '/', '\\', '^', '%', '//',
        '<<', '>>', '<<<', '>>>', ':',
        '=', '==', '!=', '===', '!==',
        '+=', '-=', '*=', '/=', '^=', '%=',
        '<', '>', '<=', '>=',
        '~', '&', '|', '!', '&&', '||', '?', '.',
        '<:', '>:', '::', '->', '...', '..',
        '∀', '≤', '≥', '⊻', '⊽', '⊼'
    ]

    # 0x391 through 0x3a9 (capital)
    # 0x3b1 through 0x3c9 (small)
    # NOTE(review): only the lowercase range is listed despite the comment
    # above — confirm whether capitals were intended.
    greek_letters = [
        'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
        'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω'
    ]

    greek_letter_tb = CaseSensitiveListTokenBuilder(greek_letters, 'identifier', True)

    self.unary_operators = [
        'isa', '+', '-', '~', '!', '.', ':', '::', "'", '<:', '>:', 'in', '..'
    ]

    self.postfix_operators = ['...', '′']

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    # group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
        'baremodule', 'begin', 'break', 'catch', 'const', 'continue',
        'do', 'else', 'elseif', 'end', 'export', 'finally', 'for',
        'function', 'global', 'if', 'import', 'let', 'local', 'macro',
        'module', 'quote', 'return', 'struct', 'try', 'using', 'while',
        'abstract', 'mutable', 'primitive', 'type'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'Int8', 'UInt8', 'Int16', 'UInt16', 'Int32', 'UInt32',
        'Int64', 'UInt64', 'Int128', 'UInt128',
        'Float16', 'Float32', 'Float64', 'Bool', 'Char'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['false', 'true']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Full builder list handed to the tokenizer; order preserved as written.
    tokenbuilders = [
        newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
        integer_tb, integer_exponent_tb, hex_integer_tb,
        real_tb, real_exponent_tb, imaginary_tb,
        keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
        identifier_tb, symbol_tb, attribute_tb, dollar_sign_tb,
        greek_letter_tb, string_tb, raw_string_tb, b_string_tb,
        triple_quote_string_tb, comment_tb, nested_comment_tb,
        self.unknown_operator_tb, invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = JuliaExaminer.split_symbols_to_operators_identifiers(tokens, group_ends)
    self.tokens = tokens
    self.convert_keywords_to_identifiers()

    self.calc_statistics()

    # Confidence calculations operate on the joined token stream.
    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'identifier', 'symbol']
    self.calc_operand_confidence(tokens, operand_types_2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Scala source text and compute language-confidence statistics.

    code: source text to examine.
    """
    super().__init__()

    # Operand token categories recognized by this examiner.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Numeric literals; "'" accepted as a digit separator, 'L'/'f' suffixes.
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF_')
    long_integer_tb = SuffixedIntegerTokenBuilder('L', False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    float_real_tb = SuffixedRealTokenBuilder(False, False, ['f'], False, None)
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # 'name Scala symbols.
    symbol_tb = PrefixedIdentifierTokenBuilder("'", 'symbol', True)
    operand_types.append('symbol')

    quotes = ['"']
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    triple_string_tb = TripleQuoteStringTokenBuilder(quotes)
    operand_types.append('string')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    # NOTE(review): '=' appears twice in this list — presumably one entry
    # is redundant; confirm against the list token builder's handling of
    # duplicates before removing.
    known_operators = [
        '+', '-', '*', '/', '%',
        '&', '|', '^', '<<', '>>',
        '&&', '||',
        '=', '+=', '-=', '*=', '/=', '%=',
        '&=', '|=', '^=', '<<=', '>>=',
        '>:', '⇒', '=>', '=', '<%', '<:', '←', '<-', '#', '@',
        '==', '!=', '>', '<', '>=', '<=',
        '!', '~', '<<<', '>>>', '.', '++', '--',
        'new'
    ]

    self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']
    self.postfix_operators = ['++', '--']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ':']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
        'abstract', 'case', 'catch', 'class', 'def', 'do', 'else',
        'extends', 'final', 'finally', 'for', 'forSome', 'if',
        'implicit', 'import', 'lazy', 'match', 'object', 'override',
        'package', 'private', 'protected', 'return', 'sealed', 'then',
        'throw', 'trait', 'try', 'type', 'val', 'var', 'while', 'with',
        'yield'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    values = ['false', 'true', 'null', 'this', 'super']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Full builder list handed to the tokenizer; order preserved as written.
    tokenbuilders = [
        newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
        integer_tb, integer_exponent_tb, hex_integer_tb, long_integer_tb,
        real_tb, real_exponent_tb, float_real_tb,
        keyword_tb, values_tb, groupers_tb, known_operator_tb,
        identifier_tb, symbol_tb, string_tb, triple_string_tb,
        slash_slash_comment_tb, slash_star_comment_tb,
        self.unknown_operator_tb, invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')

    self.calc_statistics()

    # Confidence calculations operate on the joined token stream.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Groovy source text and compute language-confidence statistics.

    code: source text to examine.
    """
    super().__init__()

    # Operand token categories recognized by this examiner.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Numeric literals; note the separator character differs between the
    # plain-integer builder ("'") and the others ('_').
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("_")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '_0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder([
        'G', 'L', 'I',
    ], False, '_')
    real_tb = RealTokenBuilder(False, False, "_")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "_")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['G', 'D', 'F'], False, '_')
    operand_types.append('number')

    # Identifiers may lead with '@' or '_'.
    leads = '@_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
    regex_tb = RegexTokenBuilder()
    # dollar-slash slash-dollar strings (allow newline)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    shebang_tb = SheBangTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%', '**',
        '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=', '^=',
        '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '>>>', '^',
        '?.', '?:', '<>', '>>>=',
        '.', '.&', '.@', '::', '=~', '==~', '*.', '*:',
        '..', '..<', '<=>', '++', '--', '->', '&&', '||',
        '?', '##',
        'as', 'in', '!in', 'instanceof', '!instanceof', 'new',
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']
    self.postfix_operators = ['++', '--', '&', '*']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    # group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
        'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
        'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
        'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
        'package', 'return', 'super', 'switch', 'throw', 'throws',
        'trait', 'try', 'var', 'while'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'byte', 'char', 'double', 'float', 'int', 'long', 'short',
        'Java.lang.BigInteger'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['null', 'true', 'false', 'this']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Full builder list handed to the tokenizer; order preserved as written.
    tokenbuilders = [
        newline_tb, whitespace_tb, terminators_tb,
        integer_tb, integer_exponent_tb, hex_integer_tb,
        binary_integer_tb, suffixed_integer_tb,
        real_tb, real_exponent_tb, suffixed_real_tb,
        keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
        identifier_tb, class_type_tb, string_tb, triple_quote_string_tb,
        regex_tb, slash_slash_comment_tb, slash_star_comment_tb,
        shebang_tb, self.unknown_operator_tb, invalid_token_builder
    ]

    # shebang line at start
    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # Confidence calculations operate on the joined token stream.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize C++ source text and compute language-confidence statistics.

    code: source text to examine.
    """
    super().__init__()

    # Operand token categories recognized by this examiner.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Numeric literals; "'" is the C++14 digit separator.
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l'], False, None)
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    directives = [
        '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif',
        '#else', '#elif', '#line', '#include', '#pragma'
    ]

    line_continuation_tb = SingleCharacterTokenBuilder('\\', 'line continuation', False)
    c_preprocessor_tb = CaseSensitiveListTokenBuilder(directives, 'preprocessor', False)
    c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True, 'preprocessor')
    c_error_tb = LeadToEndOfLineTokenBuilder('#error', True, 'preprocessor')

    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '^',
        '.', '++', '--', '->', '&&', '||',
        '?', '##', '::', '<=>', '.*', '->*',
        'new', 'delete',
        'and', 'and_eq', 'bitand', 'bitor', 'compl',
        'not', 'not_eq', 'or', 'or_eq', 'xor', 'xor_eq'
    ]

    self.unary_operators = [
        '+', '-', '*', '!', '&', '~', '++', '--',
        'new', 'delete', 'compl', 'not'
    ]

    self.postfix_operators = ['++', '--', '&', '*']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    # FIX: the original assigned group_mids twice with the same value;
    # the duplicate assignment is removed.
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    keywords = [
        'alignas', 'alignof', 'asm', 'atomic_cancel', 'atomic_commit',
        'atomic_noexcept', 'audit', 'auto', 'axiom', 'break', 'case',
        'catch', 'class', 'concept', 'const', 'consteval', 'constexpr',
        'const_cast', 'continue', 'co_await', 'co_return', 'co_yield',
        'decltype', 'default', 'do', 'dynamic_cast', 'else', 'enum',
        'explicit', 'export', 'extern', 'final', 'for', 'friend', 'goto',
        'if', 'import', 'inline', 'module', 'mutable', 'namespace',
        'noexcept', 'nullptr', 'operator', 'override', 'private',
        'protected', 'public', 'private:', 'protected:', 'public:',
        'reflexpr', 'register', 'reinterpret_cast', 'requires', 'return',
        'signed', 'sizeof', 'static', 'static_assert', 'static_cast',
        'struct', 'switch', 'synchronized', 'template', 'thread_local',
        'throw', 'transaction_safe',
        # FIX: a missing comma here caused implicit string concatenation
        # ('transaction_safe_dynamictry'), silently dropping both the
        # 'transaction_safe_dynamic' and 'try' keywords.
        'transaction_safe_dynamic', 'try',
        'typedef', 'typeid', 'typename', 'union', 'unsigned', 'using',
        'virtual', 'volatile', 'while',
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'bool', 'char', 'char8_t', 'char16_t', 'char32_t', 'double',
        'float', 'int', 'long', 'short', 'void', 'wchar_t'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['false', 'this', 'true', 'cout', 'cin']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Full builder list handed to the tokenizer; order preserved as written.
    tokenbuilders = [
        newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
        integer_tb, integer_exponent_tb, hex_integer_tb,
        binary_integer_tb, suffixed_integer_tb,
        real_tb, real_exponent_tb, suffixed_real_tb,
        keyword_tb, types_tb, values_tb, known_operator_tb, groupers_tb,
        identifier_tb, class_type_tb, string_tb,
        slash_slash_comment_tb, slash_star_comment_tb,
        c_preprocessor_tb, c_error_tb, c_warning_tb,
        self.unknown_operator_tb, invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(tokens, ['statement terminator', 'newline'], ['{'], ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # Confidence calculations operate on the joined token stream.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, version):
    """Tokenize BASIC source text and compute language-confidence statistics.

    code: source text to examine.
    version: dialect selector; 'basic-80', 'basica', and 'gw-basic'
        enable the Microsoft-style extensions below.
        NOTE(review): an empty string ('') presumably denotes the plain
        dialect (see keywords_plain) — confirm with callers.
    """
    super().__init__()

    # Every newline is a statement boundary in line-numbered BASIC.
    self.newlines_important = 'always'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Default numeric builders; overridden below for the MS dialects.
    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder(False)
    real_tb = RealTokenBuilder(False, False, False)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
    double_exponent_tb = NullTokenBuilder()
    integer_suffix_tb = SuffixedIntegerTokenBuilder(
        ['%', '&', 'S', 'I', 'L', 'F', 'D', 'R', 'US', 'UI', 'UL'], True, '_')
    float_suffix_tb = SuffixedRealTokenBuilder(False, False, ['!', '#', 'F', 'D', 'R'], True, '_')
    if version in ['basic-80', 'basica', 'gw-basic']:
        # MS BASIC uses 'D' exponents for doubles and a smaller suffix set.
        double_exponent_tb = RealExponentTokenBuilder(False, False, 'D', '_')
        integer_suffix_tb = SuffixedIntegerTokenBuilder(['%'], False, '_')
        float_suffix_tb = SuffixedRealTokenBuilder(False, False, ['!', '#'], True, '_')
    hex_constant_tb = PrefixedIntegerTokenBuilder('&H', True, '0123456789ABCDEFabcdef_')
    octal_constant_tb = PrefixedIntegerTokenBuilder('&O', True, '01234567_')
    binary_constant_tb = PrefixedIntegerTokenBuilder('&B', True, '01_')
    operand_types.append('number')

    variable_tb = BasicVariableTokenBuilder('%#!$&')
    if version in ['basic-80', 'basica', 'gw-basic']:
        # MS dialects allow long variable names.
        variable_tb = BasicLongVariableTokenBuilder('%#!$&')
    operand_types.append('variable')

    quotes = ['"']
    string_tb = StuffedQuoteStringTokenBuilder(quotes, False)
    operand_types.append('string')

    remark_tb = RemarkTokenBuilder()
    comment_tb = LeadToEndOfLineTokenBuilder("'", False, 'comment')
    comment2_tb = LeadToEndOfLineTokenBuilder("’", False, 'comment')

    stmt_separator_tb = SingleCharacterTokenBuilder(':', 'statement separator', False)

    known_operators = [
        '+', '-', '*', '/', '^',
        '=', '>', '>=', '<', '<=', '<>', '#', '\\',
        'AND', 'OR', 'NOT'
    ]
    known_operators_ms = ['=>', '=<', 'IMP', 'EQV', 'XOR', 'MOD']
    if version in ['basic-80', 'basica', 'gw-basic']:
        known_operators += known_operators_ms

    known_operator_tb = CaseSensitiveListTokenBuilder(known_operators, 'operator', False)

    self.unary_operators = ['+', '-', '#', 'NOT']

    groupers = ['(', ')', ',', ';']
    group_starts = ['(']
    group_mids = [',', ';']
    group_ends = [')']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    keywords = [
        'CLOSE', 'CHAIN', 'DATA', 'DEF', 'DIM', 'ELSE', 'END', 'ERROR',
        'FILE', 'FOR', 'GOSUB', 'GOTO', 'IF', 'INPUT', 'LET', 'LINE',
        'MAT', 'NEXT', 'ON', 'ONERR', 'OPEN', 'OUTPUT', 'POKE', 'PRINT',
        'RANDOMIZE', 'READ', 'REM', 'REMARK', 'RESTORE', 'RETURN',
        'STEP', 'STOP', 'THEN', 'TO', 'USING'
    ]

    keywords_plain = ['AS', 'GO']

    keywords_ms = [
        # 'AS',  ## promoted from variable after FIELD
        # 'BASE',  ## promoted from variable after OPTION
        'CALL', 'CLEAR', 'CLS', 'COMMON', 'DEFDBL', 'DEFINT', 'DEFSNG',
        'DEFSTR', 'ELSE', 'END', 'ERASE', 'ERRLN', 'ERRNO', 'ERROR',
        'FIELD', 'FILES', 'GET', 'KILL', 'LOAD', 'LPRINT', 'LSET',
        'MERGE', 'NULL', 'ONERR', 'OPTION', 'OUT', 'PUT', 'RESET',
        'RESUME', 'RETURN', 'RSET', 'RUN', 'SET', 'SWAP', 'SYSTEM',
        'TRON', 'TROFF', 'WAIT', 'WHILE', 'WEND', 'WIDTH', 'WRITE'
    ]

    # NOTE(review): plus_keywords is defined but not used in this method.
    plus_keywords = ['CHANGE']

    if version in ['']:
        keywords += keywords_plain

    if version in ['basic-80', 'basica', 'gw-basic']:
        keywords += keywords_ms

    keywords_basica = [
        'COLOR', 'KEY', 'LOCATE', 'PAINT', 'PLAY', 'SCREEN', 'SOUND'
    ]

    if version in ['basica', 'gw-basic']:
        keywords += keywords_basica

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    values = ['OFF', 'ON']
    values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    functions = [
        'ABS', 'ASC', 'ATN', 'CHR', 'CHR$', 'CON', 'COS', 'DET', 'ERL',
        'ERR', 'EXP', 'IDN', 'INSTR', 'INT', 'INV', 'LEFT', 'LEFT$',
        'LEN', 'LOG', 'MID', 'MID$', 'POS', 'RIGHT', 'RIGHT$', 'RND',
        'SGN', 'SIN', 'SQR', 'STR$', 'TAB', 'TAN', 'TRN', 'VAL', 'ZER'
    ]

    functions_ms = [
        'CDBL', 'CINT', 'CSNG', 'CVI', 'CVD', 'CVS', 'DATE$', 'EOF',
        'FIX', 'FRE', 'HEX$', 'INKEY', 'INP', 'INPUT$', 'INSTR', 'LOC',
        'LOF', 'LPOS', 'MKI$', 'MKD$', 'MKS$', 'OCT$', 'PEEK', 'SPACE$',
        'SPC', 'STRING$', 'TIME$', 'USR', 'VARPTR'
    ]

    if version in ['basic-80', 'basica', 'gw-basic']:
        functions += functions_ms

    function_tb = CaseInsensitiveListTokenBuilder(functions, 'function', True)
    user_function_tb = UserFunctionTokenBuilder('%#!$&')
    hardware_function_tb = NullTokenBuilder()
    if version in ['basic-80', 'basica', 'gw-basic']:
        user_function_tb = LongUserFunctionTokenBuilder('%#!$&')
        hardware_function_tb = HardwareFunctionTokenBuilder()
    operand_types.append('function')

    invalid_token_builder = InvalidTokenBuilder()

    # Full builder list handed to the tokenizer; order preserved as written.
    tokenbuilders = [
        newline_tb, whitespace_tb, stmt_separator_tb,
        integer_tb, integer_exponent_tb, float_suffix_tb,
        integer_suffix_tb, real_tb, real_exponent_tb, double_exponent_tb,
        hex_constant_tb, octal_constant_tb, binary_constant_tb,
        keyword_tb, known_operator_tb, function_tb, user_function_tb,
        hardware_function_tb, values_tb, variable_tb, groupers_tb,
        string_tb, remark_tb, comment_tb, comment2_tb,
        self.unknown_operator_tb, invalid_token_builder
    ]

    # NOTE(review): this rebinding discards the list built incrementally
    # above — presumably intentional, but verify.
    operand_types = [
        'number', 'string', 'symbol', 'identifier', 'variable', 'function'
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = BasicExaminer.convert_numbers_to_line_numbers(tokens)

    if version in ['basic-80', 'basica', 'gw-basic']:
        # MS-specific token rewrites (e.g. AS/BASE promoted from variables).
        tokens = BasicExaminer.extract_keywords_from_identifiers(tokens, keywords, known_operators)
        tokens = BasicExaminer.convert_as_to_keyword(tokens)
        tokens = BasicExaminer.convert_base_to_keyword(tokens)
        tokens = BasicExaminer.convert_operators_to_values(tokens)

    self.tokens = tokens

    self.calc_statistics()
    tokens = self.source_tokens()
    # NOTE(review): calc_statistics() is invoked a second time here —
    # looks redundant; confirm before removing.
    self.calc_statistics()

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'variable', 'symbol']
    if version not in ['basic-80', 'basica', 'gw-basic']:
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, year):
    """Examine C source text and compute token/confidence statistics.

    code -- the source text to tokenize and analyze (presumably a str;
            it is passed to the tokenizer and to line-length checks)
    year -- C dialect selector; compared against '89' and '99' to enable
            dialect-specific types, values, and comment styles
    """
    super().__init__()

    # Operand token categories recognized by this examiner; extended
    # below as the builder for each category is created.
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Numeric literals; "'" is passed as the accepted digit-group separator.
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(
        ['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(False, False, ['f', 'l'], False, None)
    operand_types.append('number')

    # Identifiers may start with and contain underscores.
    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # The typographic right single quote is accepted as a string delimiter,
    # presumably so curly-quoted pasted text still tokenizes as a string
    # rather than as invalid characters — TODO confirm intent.
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    # Preprocessor directives recognized as single tokens.
    directives = [
        '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif', '#else',
        '#elif', '#line', '#include', '#pragma'
    ]

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    c_preprocessor_tb = CaseSensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    # '#warning' and '#error' take the remainder of the line as their text,
    # so they get lead-to-end-of-line builders instead of list entries.
    c_warning_tb = LeadToEndOfLineTokenBuilder('#warning', True, 'preprocessor')
    c_error_tb = LeadToEndOfLineTokenBuilder('#error', True, 'preprocessor')
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%', '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '^', '.', '++', '--', '->',
        '&&', '||', '?', '##'
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']
    self.postfix_operators = ['++', '--', '&', '*']

    # Grouping punctuation, split into start/end/mid roles for the
    # operator- and group-confidence calculations below.
    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # NOTE(review): C99's 'restrict' keyword is not listed in either the
    # base keywords or a year-gated addition — confirm this is intentional.
    keywords = [
        'auto', 'break', 'case', 'const', 'continue', 'default', 'do',
        'else', 'enum', 'extern', 'for', 'goto', 'if', 'inline', 'register',
        'return', 'signed', 'sizeof', 'static', 'struct', 'switch',
        'typedef', 'union', 'unsigned', 'volatile', 'while'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    # Built-in types; later dialects add to the base list.
    types = ['char', 'double', 'float', 'int', 'long', 'short']
    types_89 = ['void']
    types_99 = ['bool', 'complex']

    if year in ['89', '99']:
        types += types_89
    if year in ['99']:
        types += types_99

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    # Literal-like values; C99 adds '...', 'true', and 'false'.
    values = ['NULL']
    values_89 = []
    values_99 = ['...', 'true', 'false']

    if year in ['89', '99']:
        values += values_89
    if year in ['99']:
        values += values_99

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # Order matters: the tokenizer tries builders in this sequence, so
    # more specific builders precede more general ones, and the invalid
    # builder is the catch-all at the end.
    tokenbuilders = [
        newline_tb, whitespace_tb, line_continuation_tb, terminators_tb,
        integer_tb, integer_exponent_tb, hex_integer_tb, binary_integer_tb,
        suffixed_integer_tb, real_tb, real_exponent_tb, suffixed_real_tb,
        keyword_tb, types_tb, values_tb, groupers_tb, known_operator_tb,
        identifier_tb, class_type_tb, string_tb,
    ]

    # '//' comments are recognized only for C99.
    if year in ['99']:
        tokenbuilders += [
            slash_slash_comment_tb,
        ]

    tokenbuilders += [
        slash_star_comment_tb, c_preprocessor_tb, c_error_tb, c_warning_tb,
        self.unknown_operator_tb, invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # Collapse runs of identical invalid tokens into single tokens.
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # Merge identifier-colon pairs (label syntax) given the token types
    # that may precede, follow, and intervene.
    tokens = Examiner.combine_identifier_colon(
        tokens, ['statement terminator', 'newline'], ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # Re-fetch the (possibly transformed) token stream and join
    # continuation lines before computing confidences.
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    # Check operand adjacency at window sizes 2 (numbers only) and 4
    # (all operand categories collected above).
    operand_types_2 = ['number']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    # Braces must pair up; long lines lower confidence.
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)