def __init__(self, code, extension):
    """Tokenize Awk source text and compute confidence statistics.

    code: the Awk source text to examine.
    extension: dialect selector; 'gnu' adds gawk-specific built-in variables.
    """
    super().__init__()

    # operand token categories recognized by this examiner
    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # numeric literals; None means no digit-separator character
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    operand_types.append('number')

    # positional field variables: $0, $1, ...
    num_variable_tb = PrefixedIntegerTokenBuilder('$', False, '0123456789')
    operand_types.append('variable')

    # built-in variables defined by POSIX awk
    known_variables = [
        'ARGC', 'ARGV', 'ENVIRON', 'FILENAME', 'FS', 'NF', 'NR', 'FNR',
        'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP',
    ]

    # additional built-ins specific to GNU awk (gawk)
    known_variables_gnu = [
        'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'IGNORECASE', 'LINT',
        'PROCINFO', 'TEXTDOMAIN'
    ]

    if extension == 'gnu':
        known_variables += known_variables_gnu

    variable_tb = CaseSensitiveListTokenBuilder(known_variables, 'variable', True)

    regex_tb = RegexTokenBuilder()
    operand_types.append('regex')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # the right single quote is accepted as a (often mis-encoded) delimiter
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    hash_comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator',
                                                 False)

    known_operators = [
        '=', '+', '-', '*', '/', '%', '^', '++', '--',
        '==', '+=', '-=', '*=', '/=', '%=', '^=',
        '!=', '>', '>=', '<', '<=',
        '&&', '||', '|', '!', '?', ':', '~', '!~'
    ]

    self.unary_operators = ['+', '-', '!', '~', '++', '--']

    self.postfix_operators = [
        '++', '--',
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'BEGIN', 'END', 'if', 'else', 'while', 'do', 'for', 'break',
        'continue', 'delete', 'next', 'nextfile', 'function', 'func', 'exit'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    invalid_token_builder = InvalidTokenBuilder()

    # list order determines tokenizer preference among candidate builders
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        variable_tb,
        num_variable_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        known_operator_tb,
        groupers_tb,
        regex_tb,
        identifier_tb,
        string_tb,
        hash_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of identical invalid tokens into single tokens
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')
    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                        allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts,
                                        allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'variable', 'regex']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Ruby source text and compute confidence statistics.

    code: the Ruby source text to examine.
    """
    super().__init__()

    # newlines end statements except inside open parentheses
    self.newlines_important = 'parens'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    stmt_separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    # numeric literals; '_' is the digit-separator character
    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder('_')
    real_tb = RealTokenBuilder(True, True, '_')
    real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', '_')
    operand_types.append('number')

    identifier_tb = RubyIdentifierTokenBuilder()
    operand_types.append('identifier')

    # Ruby symbols such as :name
    symbol_tb = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
    operand_types.append('symbol')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 10)
    operand_types.append('string')

    regex_tb = RegexTokenBuilder()
    operand_types.append('regex')

    heredoc_tb = HereDocTokenBuilder('<<-')

    hash_comment_tb = LeadToEndOfLineTokenBuilder('#', False, 'comment')

    known_operators = [
        '!', '~', '**', '*', '/', '%', '+', '-', '<<', '>>',
        '&', '|', '^', '<', '<=', '>', '>=', '==', '===', '!=',
        '=~', '!~', '<=>', '&&', '||', '..', '...', '?', ':',
        '=', '**=', '*=', '/=', '%=', '+=', '-=', '<<=', '>>=',
        '&&=', '&=', '||=', '|=', '^=',
        'not', 'and', 'or', 'in',
        '.', '.:', '=>', '::', '<<-'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', '!', '~', '&', '*', '**', '<<-']
    # NOTE(review): Ruby has no ++/-- operators; presumably these feed the
    # operator-continued-line join heuristic below — confirm intent.
    self.postfix_operators = ['++', '--']

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    keywords = [
        'BEGIN', 'END', 'alias', 'begin', 'break', 'case', 'class', 'def',
        'defined?', 'do', 'else', 'elsif', 'end', 'ensure', 'for', 'if',
        'module', 'next', 'redo', 'rescue', 'retry', 'return', 'then',
        'undef', 'unless', 'until', 'when', 'while', 'yield'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    values = ['nil', 'self', 'true', 'false', 'super']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    # percent-literal array/string markers (%w, %q, ...)
    array_markers = ['%w', '%q', '%Q', '%i', '%s', '%x']

    array_marker_tb = CaseSensitiveListTokenBuilder(
        array_markers, 'identifier', True)

    invalid_token_builder = InvalidTokenBuilder()

    # list order determines tokenizer preference among candidate builders
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        stmt_separator_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        values_tb,
        symbol_tb,
        known_operator_tb,
        groupers_tb,
        regex_tb,
        identifier_tb,
        array_marker_tb,
        string_tb,
        heredoc_tb,
        hash_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of identical invalid tokens into single tokens
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')
    # Ruby-specific token adjustments (block-arg bars, keyword-vs-method names)
    self.convert_bars_to_groups()
    self.convert_keywords_to_identifiers(['.'])
    self.convert_operators_to_identifiers()
    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                        allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts,
                                        allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'symbol']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    # self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    # block structure is keyword-delimited, not brace-delimited
    openers = ['begin', 'def', 'do', 'class', 'module']
    closers = ['end']
    self.calc_paired_blockers_confidence(openers, closers)
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize R source text and compute confidence statistics.

    code: the R source text to examine.
    """
    super().__init__()

    # newlines end statements except inside open parentheses
    self.newlines_important = 'parens'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # numeric literals; '_' is the digit-separator character
    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder('_')
    real_tb = RealTokenBuilder(False, False, '_')
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    # backtick-quoted names are treated as strings here
    quotes = ['"', "'", "’", '`']
    string_tb = EscapedStringTokenBuilder(quotes, 10)
    raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
    operand_types.append('string')

    hash_comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')

    known_operators = [
        '+', '-', '*', '/', '**', '^',
        '%%', '%/%', '%*%', '%in%',
        '<', '<=', '>', '>=', '==', '!=',
        '!', '|', '&', '||', '&&',
        '.', ':', '::', '[[', ']]', '@', '$',
        '=', '<-', '<<-', '->', '->>'
    ]

    self.unary_operators = ['+', '-', '!', '@', '.']
    # FIX: set explicitly instead of relying on the base-class default;
    # R has no postfix operators, and join_operator_continued_lines below
    # reads this attribute. Sibling examiners set it explicitly.
    self.postfix_operators = []

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    stmt_separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    # user-defined %op% operators
    user_operator_tb = ROperatorTokenBuilder()

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    keywords = [
        'if', 'else', 'repeat', 'while', 'function', 'for', 'in',
        'next', 'break', 'library', 'print', 'lapply', 'rep', 'list',
        'matrix', 'colnames', 'rownames', 'cbind', 'dim'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    values = [
        'TRUE', 'FALSE', 'NULL', 'Inf', 'NaN', 'NA',
        'NA_integer_', 'NA_real_', 'NA_complex_', 'NA_character_', '...'
    ]

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # list order determines tokenizer preference among candidate builders
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        stmt_separator_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        values_tb,
        user_operator_tb,
        known_operator_tb,
        groupers_tb,
        identifier_tb,
        string_tb,
        raw_string_tb,
        hash_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of identical invalid tokens into single tokens
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')
    self.convert_keywords_to_identifiers(['<-', '.', '='])
    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                        allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts,
                                        allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = [
        'number', 'string', 'identifier', 'variable', 'symbol'
    ]
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Python source text and compute confidence statistics.

    code: the Python source text to examine.
    """
    super().__init__()

    # every newline is statement-significant in Python
    self.newlines_important = 'always'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    stmt_separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    # numeric literals; '_' is the digit-separator character
    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder('_')
    real_tb = RealTokenBuilder(False, False, '_')
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    # prefixed string forms: r'', b'', u'', f''
    raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
    byte_string_tb = PrefixedStringTokenBuilder('b', True, quotes)
    unicode_string_tb = PrefixedStringTokenBuilder('u', True, quotes)
    fast_string_tb = PrefixedStringTokenBuilder('f', True, quotes)
    operand_types.append('string')

    triple_quote_comment_tb = TripleQuoteStringTokenBuilder(quotes)
    raw_triple_quote_comment_tb = RawTripleQuoteCommentTokenBuilder()

    hash_comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')

    known_operators = [
        '+', '-', '*', '/', '%', '@',
        '=', ':=', '==', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=',
        '&=', '|=', '^=', '<<=', '>>=',
        '&', '|', '~', '<<', '>>', '**',
        '.', ':', '++', '--',
        'and', 'or', 'in', 'is', 'not'
    ]

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', 'not', '~', '++', '--', '.']
    self.postfix_operators = ['++', '--', ':']
    self.adjective_operators = ['not']
    self.keyword_postfix = [':']

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    continuation_chars = ['\\']
    line_continuation_tb = CaseInsensitiveListTokenBuilder(
        continuation_chars, 'line continuation', False)

    # FIX: the original list contained 'while' twice; the duplicate is
    # removed (membership behavior is unchanged).
    keywords = [
        'as', 'assert', 'break', 'case', 'class', 'continue', 'def', 'del',
        'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if',
        'import', 'lambda', 'match', 'nonlocal', 'pass', 'print', 'raise',
        'return', 'try', 'while', 'with', 'yield'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    values = ['False', 'None', 'True']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # list order determines tokenizer preference among candidate builders
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        stmt_separator_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        values_tb,
        known_operator_tb,
        groupers_tb,
        identifier_tb,
        decorator_tb,
        string_tb,
        raw_string_tb,
        byte_string_tb,
        unicode_string_tb,
        fast_string_tb,
        hash_comment_tb,
        triple_quote_comment_tb,
        raw_triple_quote_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of identical invalid tokens into single tokens
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')
    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        # 'not in' is a legitimate adjacent operator pair
        allow_pairs = [['not', 'in']]
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                        allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts,
                                        allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = [
        'number', 'string', 'identifier', 'variable', 'symbol'
    ]
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, block_comment_limit):
    """Tokenize Julia source text and compute confidence statistics.

    code: the Julia source text to examine.
    block_comment_limit: maximum nesting depth for #= ... =# comments.
    """
    super().__init__()

    # newlines end statements except inside open parentheses
    self.newlines_important = 'parens'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # numeric literals; None means no digit-separator character
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    hex_integer_tb = PrefixedIntegerTokenBuilder('0x', False,
                                                 '0123456789abcdefABCDEF')
    real_tb = RealTokenBuilder(False, False, None)
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', None)
    # imaginary/complex literal suffixes 'im' and 'cx'
    imaginary_tb = SuffixedRealTokenBuilder(False, False, ['im', 'cx'],
                                            True, None)
    operand_types.append('number')

    leads = '_'
    extras = '_'
    # Julia identifiers may end with '!' (mutating-function convention)
    suffixes = '!'
    identifier_tb = SuffixedIdentifierTokenBuilder(leads, extras, suffixes)
    operand_types.append('identifier')

    symbol_tb = PrefixedIdentifierTokenBuilder(':', 'symbol', True)
    operand_types.append('symbol')

    # macro invocations such as @inbounds
    attribute_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)
    operand_types.append('attribute')

    dollar_sign_tb = SingleCharacterTokenBuilder('$', 'identifier', True)

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    raw_string_tb = PrefixedRawStringTokenBuilder('raw', True, quotes)
    b_string_tb = PrefixedStringTokenBuilder('b', True, quotes)
    triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
    operand_types.append('string')

    comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')
    nested_comment_tb = NestedCommentTokenBuilder('#=', '=#',
                                                  block_comment_limit)

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminators_tb = SingleCharacterTokenBuilder(';', 'statement terminator',
                                                 False)

    known_operators = [
        'where', 'in', 'isa',
        '′', "'",
        '+', '-', '*', '/', '\\', '^', '%', '//',
        '<<', '>>', '<<<', '>>>',
        ':', '=', '==', '!=', '===', '!==',
        '+=', '-=', '*=', '/=', '^=', '%=',
        '<', '>', '<=', '>=',
        '~', '&', '|', '!', '&&', '||',
        '?', '.', '<:', '>:', '::', '->',
        '...', '..',
        '∀', '≤', '≥', '⊻', '⊽', '⊼'
    ]

    # Greek small letters U+03B1 through U+03C9, commonly used as
    # identifiers in Julia code (capitals are not listed here)
    greek_letters = [
        'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ',
        'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
        'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω'
    ]

    greek_letter_tb = CaseSensitiveListTokenBuilder(
        greek_letters, 'identifier', True)

    self.unary_operators = [
        'isa', '+', '-', '~', '!', '.', ':', '::', "'", '<:', '>:',
        'in', '..'
    ]

    # '′' (prime) marks the adjoint/derivative postfix form
    self.postfix_operators = ['...', '′']

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    # group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'baremodule', 'begin', 'break', 'catch', 'const', 'continue',
        'do', 'else', 'elseif', 'end', 'export', 'finally', 'for',
        'function', 'global', 'if', 'import', 'let', 'local', 'macro',
        'module', 'quote', 'return', 'struct', 'try', 'using', 'while',
        'abstract', 'mutable', 'primitive', 'type'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'Int8', 'UInt8', 'Int16', 'UInt16', 'Int32', 'UInt32',
        'Int64', 'UInt64', 'Int128', 'UInt128',
        'Float16', 'Float32', 'Float64',
        'Bool', 'Char'
    ]

    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['false', 'true']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # list order determines tokenizer preference among candidate builders
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        real_tb,
        real_exponent_tb,
        imaginary_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        symbol_tb,
        attribute_tb,
        dollar_sign_tb,
        greek_letter_tb,
        string_tb,
        raw_string_tb,
        b_string_tb,
        triple_quote_string_tb,
        comment_tb,
        nested_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of identical invalid tokens into single tokens
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    # re-split ':'-prefixed tokens that are really operator + identifier
    tokens = JuliaExaminer.split_symbols_to_operators_identifiers(
        tokens, group_ends)
    self.tokens = tokens
    self.convert_keywords_to_identifiers()
    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                        allow_pairs)
        # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'identifier', 'symbol']
    self.calc_operand_confidence(tokens, operand_types_2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Prolog source text and compute confidence statistics.

    code: the Prolog source text to examine.
    """
    super().__init__()

    # newlines end statements except inside open parentheses
    self.newlines_important = 'parens'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    stmt_separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)
    # the clause-ending full stop
    stmt_terminator_tb = SingleCharacterTokenBuilder(
        '.', 'statement terminator', False)

    # numeric literals; '_' is the digit-separator character
    integer_tb = IntegerTokenBuilder('_')
    integer_exponent_tb = IntegerExponentTokenBuilder('_')
    real_tb = RealTokenBuilder(False, False, '_')
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
    operand_types.append('number')

    # Prolog logic variables (leading capital or underscore)
    variable_tb = PrologVariableTokenBuilder()
    operand_types.append('variable')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    comment_tb = LeadToEndOfLineTokenBuilder('%', True, 'comment')

    # the cut is classified as an identifier, not an operator
    special_symbols = ['!']
    special_symbol_tb = CaseSensitiveListTokenBuilder(
        special_symbols, 'identifier', True)

    # FIX: the original list contained '+' and '-' twice (once from the
    # binary-operator rows of the operator table and once from the unary
    # rows); the duplicates are removed — membership is unchanged.
    known_operators = [
        '-->', ':-', '?-',
        '|', '->', '*->',
        ':=', '\\+',
        '<', '=', '=..', '=@=', '\\=@=', '=:=', '=<', '==', '=\\=',
        '>', '>=', '@<', '@=<', '@>', '@>=', '\\=', '\\==',
        'as', 'is', '>:<', ':<', ':',
        '+', '-', '/\\', '\\/', 'xor',
        '?', '*', '/', '//', 'div', 'rdiv',
        '<<', '>>', 'mod', 'rem',
        '**', '^', '\\', '$'
    ]

    self.unary_operators = ['+', '-', ':-', '\\', '\\+']
    # FIX: set explicitly instead of relying on the base-class default;
    # join_operator_continued_lines below reads this attribute, and
    # sibling examiners set it explicitly.
    self.postfix_operators = []

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # '|' is both an operator and the list tail separator
    groupers = ['(', ')', ',', '[', ']', '{', '}', '|']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', '|']
    group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    # directive names rather than reserved words
    keywords = [
        'dynamic', 'discontiguous', 'initialization', 'meta_predicate',
        'module_transparent', 'multifile', 'public', 'thread_local',
        'thread_initialization', 'volatile'
    ]

    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    values = ['(-)']

    value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # list order determines tokenizer preference among candidate builders
    tokenbuilders = [
        whitespace_tb,
        newline_tb,
        stmt_separator_tb,
        stmt_terminator_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        keyword_tb,
        known_operator_tb,
        special_symbol_tb,
        variable_tb,
        groupers_tb,
        identifier_tb,
        string_tb,
        value_tb,
        comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    # merge runs of identical invalid tokens into single tokens
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')
    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                        allow_pairs)
        self.calc_operator_4_confidence(tokens, num_operators, group_starts,
                                        allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = [
        'number', 'string', 'identifier', 'variable', 'symbol'
    ]
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    # self.calc_keyword_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize Lua source text and compute confidence statistics.

    code: the Lua source text to examine.
    """
    super().__init__()

    # newlines end statements except inside open parentheses
    self.newlines_important = 'parens'

    # operand token categories recognized by this examiner
    operand_types = ['number', 'identifier', 'string', 'value']

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()
    semicolon_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # numeric literals; the apostrophe is the digit-separator character
    int_builder = IntegerTokenBuilder("'")
    int_exp_builder = IntegerExponentTokenBuilder("'")
    hex_builder = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    bin_builder = PrefixedIntegerTokenBuilder('0b', False, '01')
    float_builder = RealTokenBuilder(False, False, "'")
    float_exp_builder = RealExponentTokenBuilder(False, False, 'E', "'")

    name_builder = IdentifierTokenBuilder('_', '_')

    quote_chars = ['"', "'", "’"]
    short_string_builder = EscapedStringTokenBuilder(quote_chars, 0)
    # long strings delimited by [[ ... ]]
    long_string_builder = DoubleBracketStringTokenBuilder()

    lua_operators = [
        '+', '-', '*', '/', '^',
        '<', '>', '<=', '>=', '==', '~=',
        '=', '..', '.', '#', ':',
        'and', 'not', 'or'
    ]
    operator_builder = CaseSensitiveListTokenBuilder(
        lua_operators, 'operator', False)

    self.unary_operators = ['+', '-', '#', 'not']
    self.postfix_operators = []

    grouper_chars = ['(', ')', ',', '[', ']', '{', '}']
    group_mids = [',']
    group_ends = [')', ']', '}']
    grouper_builder = CaseInsensitiveListTokenBuilder(
        grouper_chars, 'group', False)

    lua_keywords = [
        'break', 'do', 'else', 'elseif', 'end',
        'for', 'function', 'if', 'in', 'local',
        'repeat', 'return', 'then', 'until', 'while'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        lua_keywords, 'keyword', False)

    lua_values = ['false', 'true', 'nil', '...']
    value_builder = CaseSensitiveListTokenBuilder(lua_values, 'value', True)

    short_comment_builder = LeadToEndOfLineTokenBuilder('--', True, 'comment')
    long_comment_builder = LuaBlockCommentTokenBuilder()

    bad_token_builder = InvalidTokenBuilder()

    # list order determines tokenizer preference among candidate builders
    builders = [
        nl_builder,
        ws_builder,
        semicolon_builder,
        int_builder,
        int_exp_builder,
        hex_builder,
        bin_builder,
        float_builder,
        float_exp_builder,
        keyword_builder,
        value_builder,
        grouper_builder,
        operator_builder,
        name_builder,
        short_string_builder,
        long_string_builder,
        short_comment_builder,
        long_comment_builder,
        self.unknown_operator_tb,
        bad_token_builder
    ]

    tokens = Tokenizer(builders).tokenize(code)
    # merge runs of identical invalid tokens into single tokens
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid')
    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_parens_continued_lines(tokens)
    tokens = Examiner.join_operator_continued_lines(
        tokens, self.postfix_operators)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                        allow_pairs)
        # self.calc_operator_4_confidence(tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    self.calc_operand_n_confidence(tokens, ['number', 'identifier'], 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()
    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)