This metaclass registers CassandraType classes in the global
by-cassandra-typename and by-cql-typename registries, unless their class
name starts with an underscore. """

    def __new__(metacls, name, bases, dct):
        # Default the Cassandra-side name to the Python class name unless
        # the class body set 'cassname' explicitly.
        dct.setdefault('cassname', name)
        cls = type.__new__(metacls, name, bases, dct)
        # Leading-underscore classes are internal helpers: keep them out of
        # both registries.
        if not name.startswith('_'):
            _casstypes[name] = cls
            _cqltypes[cls.typename] = cls
        return cls


# Tokenizer for Cassandra type strings: parentheses and dotted/colon-ed
# identifiers are kept as tokens; whitespace and commas are dropped.
casstype_scanner = re.Scanner((
    (r'[()]', lambda s, t: t),
    (r'[a-zA-Z0-9_.:]+', lambda s, t: t),
    (r'[\s,]', None),
))


def lookup_casstype_simple(casstype):
    """
    Given a Cassandra type name (either fully distinguished or not), hand
    back the CassandraType class responsible for it. If a name is not
    recognized, a custom _UnrecognizedType subclass will be created for it.

    This function does not handle complex types (so no type parameters--
    nothing with parentheses). Use lookup_casstype() instead if you might
    need that.
    """
import operator

sym_tab= {}  # Symbol table: variable name -> value (used by assignment).
stack = []  # Stack to hold the values.

# Scanner object. Isolate each token and take
# appropriate action: push a numeric value,
# but perform operation on top two elements on
# stack if an operator is found.
# NOTE(review): the (r"\d+", ...) integer rule below is unreachable -- any
# run of digits is already matched by the earlier -?(\d*\.)?\d+ rule (which
# pushes a float), so integers are never pushed as int. Confirm intent.
scanner = re.Scanner([
    (r"[ \t\n]", lambda s, t: None),
    (r"-?(\d*\.)?\d+", lambda s, t: stack.append(float(t))),
    (r"[a-zA-Z_][a-zA-Z_0-9]*", lambda s, t: stack.append(t)),
    (r"\d+", lambda s, t: stack.append(int(t))),
    (r"[+]", lambda s, t: bin_op(operator.add)),
    (r"[-]", lambda s, t: bin_op(operator.sub)),
    (r"[*]", lambda s, t: bin_op(operator.mul)),
    (r"[/]", lambda s, t: bin_op(operator.truediv)),
    (r"[\^]", lambda s, t: bin_op(operator.pow)),
    (r"[=]", lambda s, t: assign_op()),
])

# Binary Operator function. pop top two elements
# from stack and push the result back on the stack.
def bin_op(action):
    ''' Binary Operation evaluator: If an operand is a variable name, look
    it up in the symbol table and replace with the corresponding value,
    before being evaluated.
    cur += " " + str(i[1])
    return cur

def handleOperator(cur, i):
    # convert operator into machine code
    cur += " " + str(i[1])
    return cur

def handleDigit(cur, i):
    # convert digit into machine code
    cur += " " + str(i[1])
    return cur

# Tokenizer for the toy assembly source. The action callbacks (comment,
# instruction, operand, operator, digit, end_stmnt) are defined earlier in
# this file (not visible here).
# NOTE(review): the digit rule is mostly shadowed -- an integer like "42"
# is consumed by the earlier [a-zA-Z0-9]+,?? operand rule first; only the
# fractional part of a float can reach the digit rule. Confirm intent.
scanner = re.Scanner([
    (r"//.*", comment),  # find comments in code
    (r"[A-Z]+", instruction),  # find instructions in code
    (r"[a-zA-Z0-9]+,??", operand),  # find operands in code
    (r"[+\-*/=]", operator),  # find operators in code
    (r"[0-9]+(\.[0-9]+)?", digit),  # find digits in code
    (r"\n", end_stmnt),  # find end of line
    (r"\s+", None)  # do nothing with the rest
])

assemblyTable = [['MOV', 'MOV'], ['JMP', 'JMP']]  # assembly table
operandTable = [['ax', 'ax'], ['bx', 'bx']]  # operand table

# NOTE(review): 'file' shadows the (py2) builtin and is never closed;
# a with-statement would be safer. Left as-is in this doc-only pass.
file = open(sys.argv[1])
tokens, remainder = scanner.scan(file.read())
#tokens = assemble(tokens)
print(tokens)
def _handle_key_value(s, t):
    # Split an unquoted key=value token into a (key, value) pair; split
    # only on the first '=' so values may contain '='.
    return t.split('=', 1)

def _handle_word(s, t):
    # Bare words: '.name' is a CSS class, '#name' is an id, anything else
    # maps to itself (key == value).
    if t.startswith('.'):
        return '.', t[1:]
    if t.startswith('#'):
        return 'id', t[1:]
    return t, t

# Attribute-list tokenizer. Quote-handling callbacks are defined earlier
# in this file (not visible here). Rule order matters: quoted values must
# be tried before the unquoted key=value and bare-word rules.
_scanner = re.Scanner([
    (r'[^ =]+=".*?"', _handle_double_quote),
    (r"[^ =]+='.*?'", _handle_single_quote),
    (r'[^ =]+=[^ =]+', _handle_key_value),
    (r'[^ =]+', _handle_word),
    (r' ', None)
])

def get_attrs(str):
    """ Parse attribute list and return a list of attribute tuples. """
    # NOTE(review): the parameter name shadows the builtin 'str'; renaming
    # would be a (minor) interface change, so it is only flagged here.
    return _scanner.scan(str)[0]

def isheader(elem):
    # True for the six HTML heading tags.
    return elem.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

class AttrListTreeprocessor(Treeprocessor):
S_BOOL = lambda x, token: ['bool', bool(token) ] S_EMPTY = lambda x, token: ['empty', ''] S_STRING = lambda x, token: ['string', token] S_TRAILING = lambda x, token: ['trailing', None] class ArgumentError(Exception): """Thrown when args encounters a command line format error.""" pass SCANNER = re.Scanner([ (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}", S_EMAIL_ADDR), (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]", S_IP_ADDRESS), (r"-+[a-zA-Z0-9]+", S_OPTION), (r"True", S_BOOL), (r"[0-9]+", S_INT), (r"--", S_TRAILING), (r"[a-z\-]+", S_WORD), (r"\s", S_EMPTY), (r".+", S_STRING), ]) def match(tokens, of_type = None): """ Responsible for taking a token off and processing it, ensuring it is of the correct type. If of_type is None (the default) then you are asking for anything. """ # check the type (first element) if of_type:
# No known examples of starting with a space yet. # self.name, raw = raw.strip().partition(' ')[0::2] self.name, raw = raw.lstrip().partition(' ')[0::2] self.buf.append(raw) def finalize(self): self.instructions = ''.join(self.buf) del self.buf WORD, FLAG = 0, 1 scanner = re.Scanner( [ (r'\\\S{1}', lambda s, t: (t, FLAG)), # A flag of the form \x (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)), # Quoted word ( r'[^\s\\"]\S*', lambda s, t: (t, WORD) ), # A non-quoted word, must not start with a backslash or a space or a quote (r'\s+', None), ], flags=re.DOTALL) null = object() def parser(name, field_map, default_field_name=None): field_map = dict((x.split(':') for x in field_map.split())) def parse(raw, log=None): ans = {} last_option = None
class TemplateFormatter(string.Formatter):
    '''
    Provides a format function that substitutes '' for any missing value
    '''

    # Sentinel used by callers to validate templates; treated as '0' when
    # a numeric format type is applied.
    _validation_string = 'This Is Some Text THAT SHOULD be LONG Enough.%^&*'

    # Dict to do recursion detection. It is up the the individual get_value
    # method to use it. It is cleared when starting to format a template
    composite_values = {}

    def __init__(self):
        string.Formatter.__init__(self)
        self.book = None
        self.kwargs = None
        self.strip_results = True
        self.locals = {}

    def _do_format(self, val, fmt):
        # Apply a standard format spec to val, coercing to int/float when
        # the spec's type character requires it.
        if not fmt or not val:
            return val
        if val == self._validation_string:
            val = '0'
        typ = fmt[-1]
        if typ == 's':
            pass
        elif 'bcdoxXn'.find(typ) >= 0:
            try:
                val = int(val)
            except:
                raise ValueError(
                    _('format: type {0} requires an integer value, got {1}').format(typ, val))
        elif 'eEfFgGn%'.find(typ) >= 0:
            try:
                val = float(val)
            except:
                raise ValueError(
                    _('format: type {0} requires a decimal (float) value, got {1}').format(typ, val))
        return unicode(('{0:'+fmt+'}').format(val))

    def _explode_format_string(self, fmt):
        # Split 'prefix|fmt|suffix' conditional-text templates into their
        # three parts; on no match (or error) the whole string is the fmt.
        try:
            matches = self.format_string_re.match(fmt)
            if matches is None or matches.lastindex != 3:
                return fmt, '', ''
            return matches.groups()
        except:
            if DEBUG:
                traceback.print_exc()
            return fmt, '', ''

    format_string_re = re.compile(r'^(.*)\|([^\|]*)\|(.*)$', re.DOTALL)
    compress_spaces = re.compile(r'\s+')
    backslash_comma_to_comma = re.compile(r'\\,')

    # Splits a function-call argument list on unescaped commas.
    arg_parser = re.Scanner([
        (r',', lambda x,t: ''),
        (r'.*?((?<!\\),)', lambda x,t: t[:-1]),
        (r'.*?\)', lambda x,t: t[:-1]),
    ])

    # ################# 'Functional' template language ######################

    # Lexer for the template program language: 1=operator/punctuation,
    # 2=identifier, 3=constant; comments and whitespace are dropped.
    lex_scanner = re.Scanner([
        (r'[(),=;]', lambda x,t: (1, t)),
        (r'-?[\d\.]+', lambda x,t: (3, t)),
        (r'\$', lambda x,t: (2, t)),
        (r'\w+', lambda x,t: (2, t)),
        (r'".*?((?<!\\)")', lambda x,t: (3, t[1:-1])),
        (r'\'.*?((?<!\\)\')', lambda x,t: (3, t[1:-1])),
        (r'\n#.*?(?:(?=\n)|$)', None),
        (r'\s', None)
    ], flags=re.DOTALL)

    def _eval_program(self, val, prog, column_name):
        # keep a cache of the lex'ed program under the theory that re-lexing
        # is much more expensive than the cache lookup. This is certainly true
        # for more than a few tokens, but it isn't clear for simple programs.
        if tweaks['compile_gpm_templates']:
            # Compiled path: cached entries are compiled functions.
            if column_name is not None and self.template_cache is not None:
                lprog = self.template_cache.get(column_name, None)
                if lprog:
                    return lprog.evaluate(self, self.kwargs, self.book, self.locals)
                lprog = self.lex_scanner.scan(prog)
                compile_text = ('__funcs__ = formatter_functions().get_functions()\n'
                                'def evaluate(self, formatter, kwargs, book, locals):\n'
                               )
            else:
                lprog = self.lex_scanner.scan(prog)
                compile_text = None
            parser = _CompileParser(val, lprog, self, compile_text)
            val = parser.program()
            if parser.compile_text:
                global compile_counter
                compile_counter += 1
                f = compile_user_function("__A" + str(compile_counter), 'doc', -1, parser.compile_text)
                self.template_cache[column_name] = f
        else:
            # Interpreted path: cached entries are token lists.
            if column_name is not None and self.template_cache is not None:
                lprog = self.template_cache.get(column_name, None)
                if not lprog:
                    lprog = self.lex_scanner.scan(prog)
                    self.template_cache[column_name] = lprog
            else:
                lprog = self.lex_scanner.scan(prog)
            parser = _Parser(val, lprog, self)
            val = parser.program()
        return val

    # ################# Override parent classes methods #####################

    def get_value(self, key, args, kwargs):
        raise Exception('get_value must be implemented in the subclass')

    def format_field(self, val, fmt):
        # ensure we are dealing with a string.
        if isinstance(val, (int, float)):
            if val:
                val = unicode(val)
            else:
                val = ''
        # Handle conditional text
        fmt, prefix, suffix = self._explode_format_string(fmt)
        # Handle functions
        # First see if we have a functional-style expression
        if fmt.startswith('\''):
            p = 0
        else:
            p = fmt.find(':\'')
            if p >= 0:
                p += 1
        if p >= 0 and fmt[-1] == '\'':
            # Functional-style: everything between the quotes is a program.
            val = self._eval_program(val, fmt[p+1:-1], None)
            colon = fmt[0:p].find(':')
            if colon < 0:
                dispfmt = ''
            else:
                dispfmt = fmt[0:colon]
        else:
            # check for old-style function references
            p = fmt.find('(')
            dispfmt = fmt
            if p >= 0 and fmt[-1] == ')':
                colon = fmt[0:p].find(':')
                if colon < 0:
                    dispfmt = ''
                    colon = 0
                else:
                    dispfmt = fmt[0:colon]
                    colon += 1
                funcs = formatter_functions().get_functions()
                fname = fmt[colon:p].strip()
                if fname in funcs:
                    func = funcs[fname]
                    if func.arg_count == 2:
                        # only one arg expected. Don't bother to scan. Avoids need
                        # for escaping characters
                        args = [fmt[p+1:-1]]
                    else:
                        args = self.arg_parser.scan(fmt[p+1:])[0]
                        args = [self.backslash_comma_to_comma.sub(',', a) for a in args]
                    if (func.arg_count == 1 and (len(args) != 1 or args[0])) or \
                            (func.arg_count > 1 and func.arg_count != len(args)+1):
                        raise ValueError('Incorrect number of arguments for function '+ fmt[0:p])
                    if func.arg_count == 1:
                        val = func.eval_(self, self.kwargs, self.book, self.locals, val)
                        if self.strip_results:
                            val = val.strip()
                    else:
                        val = func.eval_(self, self.kwargs, self.book, self.locals, val, *args)
                        if self.strip_results:
                            val = val.strip()
                else:
                    return _('%s: unknown function')%fname
        if val:
            val = self._do_format(val, dispfmt)
        if not val:
            return ''
        return prefix + val + suffix

    def evaluate(self, fmt, args, kwargs):
        # 'program:' templates bypass vformat and run the template language.
        if fmt.startswith('program:'):
            ans = self._eval_program(kwargs.get('$', None), fmt[8:], self.column_name)
        else:
            ans = self.vformat(fmt, args, kwargs)
        if self.strip_results:
            return self.compress_spaces.sub(' ', ans).strip()
        return ans

    # ######### a formatter that throws exceptions ############

    def unsafe_format(self, fmt, kwargs, book, strip_results=True):
        self.strip_results = strip_results
        self.column_name = self.template_cache = None
        self.kwargs = kwargs
        self.book = book
        self.composite_values = {}
        self.locals = {}
        return self.evaluate(fmt, [], kwargs)

    # ######### a formatter guaranteed not to throw an exception ############

    def safe_format(self, fmt, kwargs, error_value, book, column_name=None,
                    template_cache=None, strip_results=True):
        self.strip_results = strip_results
        self.column_name = column_name
        self.template_cache = template_cache
        self.kwargs = kwargs
        self.book = book
        self.composite_values = {}
        self.locals = {}
        try:
            ans = self.evaluate(fmt, [], kwargs)
        except Exception as e:
            if DEBUG:  # and getattr(e, 'is_locking_error', False):
                traceback.print_exc()
            if column_name:
                prints('Error evaluating column named:', column_name)
            # On any failure, return the caller-supplied error value plus
            # the exception message instead of raising.
            ans = error_value + ' ' + e.message
        return ans
# Python types treated as numbers by this driver (py2: includes long).
_number_types = frozenset((int, long, float))

# Cassandra encodes some names as hex strings; unhexlify decodes them.
_name_from_hex_string = unhexlify


def trim_if_startswith(s, prefix):
    """Return s with prefix removed if present, else s unchanged."""
    if s.startswith(prefix):
        return s[len(prefix):]
    return s


# Registries populated by the metaclass below: Cassandra type name -> class
# and CQL type name -> class.
_casstypes = {}
_cqltypes = {}


# Tokenizer for CQL type strings: keeps bare identifiers, drops the
# 'frozen' keyword, whitespace, commas and angle brackets.
cql_type_scanner = re.Scanner((
    ('frozen', None),
    (r'[a-zA-Z0-9_]+', lambda s, t: t),
    (r'[\s,<>]', None),
))


def cql_types_from_string(cql_type):
    """Return the flat list of type names appearing in a CQL type string."""
    return cql_type_scanner.scan(cql_type)[0]


class CassandraTypeType(type):
    """ The CassandraType objects in this module will normally be used
    directly, rather than through instances of those types. They can be
    instantiated, of course, but the type information is what this driver
    mainly needs. This metaclass registers CassandraType classes in the global
    PROFILE_TYPE_COLOURS, PROFILE_MALFORMED, SUPPORTED_PROFILE_TYPES, \
    HEADER_ATTRS, HEADER_COMMIT_COLOUR, HEADER_INFO_COLOUR, HEADER_SLASH_COLOUR, \
    DESC_COMMIT_ATTRS, DESC_COMMIT_COLOUR, PROFILE_DELIMITER, ID_TYPE_COLOUR
from perun.utils.log import cprint, cprintln
import perun.vcs as vcs

# Init colorama for multiplatform colours
colorama.init()

# Matches untracked profile files named <collector>-<timestamp>.perf
UNTRACKED_REGEX = \
    re.compile(r"([^\\]+)-([0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2}).perf")
# Regex for parsing the formating tag [<tag>:<size>f<fill_char>]
FMT_REGEX = re.compile("[[]([a-zA-Z]+)(:[0-9]+)?(f.)?[]]")
# Scanner for parsing formating strings, i.e. breaking it to parts:
# bracketed tags become ("fmt_string", token), everything between them
# becomes ("rest", token).
FMT_SCANNER = re.Scanner([
    (r"[[]([a-zA-Z]+)(:[0-9]+)?(f.)?[]]", lambda scanner, token: ("fmt_string", token)),
    (r"[^][]*", lambda scanner, token: ("rest", token)),
])


def lookup_minor_version(func):
    """If the minor_version is not given by the caller, it looks up the HEAD in the repo.

    If the @p func is called with minor_version parameter set to None,
    then this decorator performs a lookup of the minor_version corresponding
    to the head of the repository.

    Arguments:
        func(function): decorated function for which we will lookup the minor_version

    Returns:
# ASN.1 tokenizer. All tokenize_* callbacks are defined elsewhere in this
# file. NOTE(review): reconstructed from a collapsed line -- the lone '#'
# markers are read as section separators (they sit at the start of each
# logical rule group); confirm none was meant to disable the rule after it.
# NOTE(review): the empty-pattern placeholder rules near the end
# ((r'', tokenize_real) etc.) match zero-width at any position, which makes
# every later rule unreachable and stops re.Scanner's scan loop early.
ASN1Scanner = re.Scanner([
    #
    (r'(--).*(\n|(--))', tokenize_comment),
    (r'(/\*).*(\*/)', tokenize_comment),
    (r'".*(?<!")"(?!")', tokenize_charstr),
    #
    (r'::=', tokenize_definition),
    (r':', tokenize_colon),
    (r'=', tokenize_equal),
    (r',', tokenize_comma),
    (r'\(|\)', tokenize_parenth),
    (r'\[{2}|\]{2}', tokenize_dbracket),
    (r'\[|\]', tokenize_bracket),
    (r'\{|\}', tokenize_curlyb),
    (r'\.\.\.', tokenize_tdot),
    (r'\.\.', tokenize_ddot),
    (r'\.', tokenize_dot),
    #(r'\.\s{0,}&', tokenize_dotamper),
    (r'\||(?:UNION)', tokenize_union),
    (r'\^|(?:INTERSECTION)', tokenize_intersect),
    (r'EXCEPT', tokenize_complement),
    (r'<', tokenize_lthan),
    (r'>', tokenize_gthan),
    (r'@', tokenize_arrowb),
    (r'\!', tokenize_exclam),
    (r"'", tokenize_apost),
    (r'"', tokenize_quote),  # should expand to get the whole string
    # (r'{0}'.format(_RE_NATIVE_TYPES), tokenize_typenative),
    (r'[A-Z][A-Z0-9\-]{0,}', tokenize_classref),
    (r'[A-Z][a-zA-Z0-9\-]{0,}', tokenize_typeref),
    (r'&[a-zA-Z]{1}[a-zA-Z0-9\-]{0,}', tokenize_fieldref),
    (r'[a-z]{1}[a-zA-Z0-9\-]{0,}', tokenize_identifier),
    #
    (r'UNIVERSAL', tokenize_univers),
    (r'APPLICATION', tokenize_applic),
    (r'PRIVATE', tokenize_priv),
    #
    (r'ALL', tokenize_all),
    (r'MIN', tokenize_min),
    (r'MAX', tokenize_max),
    (r'MINUS-INFINITY', tokenize_mininf),
    (r'PLUS-INFINITY', tokenize_plusinf),
    (r'NULL', tokenize_null),
    (r'(?:FALSE)|(?:TRUE)', tokenize_bool),
    (r'(?:[+\-]{0,1}\s{0,})[0-9]{1,}', tokenize_integer),
    (r'', tokenize_real),
    (r'', tokenize_bstring),
    (r'', tokenize_hstring),
    #(r'', tokenize_),
    #(r'', tokenize_),
    #(r'', tokenize_),
    (r'\s{1,}', None),
])
def tokenize_noop(scanner, token): return token # translate strftime spec into mostly equivalent PostgreSQL spec _scanner = re.Scanner([ (py, tokenize_noop) for py in _strftime_to_postgresql_rules.keys() ] + [ # "%e" is in the C standard and Python actually generates this if your spec # contains "%c" but we don't officially support it as a specifier so we # need to special case it in the scanner (r'%e', tokenize_noop), # double quotes need to be escaped (r'"', lambda scanner, token: re.escape(token)), # spaces should be greedily consumed and kept (r'\s+', tokenize_noop), (r'[%s]' % re.escape(string.punctuation), tokenize_noop), # everything else except double quotes and spaces (r'[^%s\s]+' % re.escape(string.punctuation), tokenize_noop), ]) _lexicon_values = frozenset(_strftime_to_postgresql_rules.values()) _strftime_blacklist = frozenset(['%w', '%U', '%c', '%x', '%X', '%e'])
class Modifier(object):
    """
    An expression in a macro primitive.

    We interpret gerber expressions according to the following grammar,
    rooted at TOP:

        TOP -> number ESUB
        TOP -> minus number ESUB
        TOP -> variable TOPE
        TOPE ->
        TOPE -> equate E
        TOPE -> op E
        E -> variable ESUB
        E -> number ESUB
        E -> minus number ESUB
        ESUB ->
        ESUB -> op E

    All numerical operations are given the same precedence and are grouped
    right to left.
    """

    # Lexer: every lexeme becomes a Token(type, value). Note that integer
    # literals also become 'number' floats, and operators are lowercased
    # ('X' and 'x' both mean multiply).
    scanner = re.Scanner([
        (r'\d*\.\d+', lambda s, tok: Token('number', float(tok))),
        (r'\d+', lambda s, tok: Token('number', float(tok))),
        (r'\$\d+', lambda s, tok: Token('variable', int(tok[1:]))),
        (r'[\+xX/-]', lambda s, tok: Token('op', tok.lower())),
        (r'=', lambda s, tok: Token('equate', None)),
    ])

    def __init__(self, expr):
        # Tokenize eagerly; any unscannable remainder means the expression
        # is malformed.
        self.tokens, remainder = self.scanner.scan(expr)
        if remainder:
            raise InvalidExpression(expr)

    def evaluate(self, values):
        """
        Evaluate the expression given a values dict and return the
        resulting value. The values dict may be modified in the process
        (by an equate expression).
        """
        stack = []
        self._evaluate_top(list(self.tokens), stack, values)
        return self._pop_value(stack, values)

    def _evaluate_top(self, tokens, stack, values):
        """ Evaluate the TOP production. """
        token = tokens.pop(0)
        if token.type == 'number':
            stack.append(token)
            self._evaluate_sube(tokens, stack, values)
        elif token.type == 'variable':
            stack.append(token)
            self._evaluate_tope(tokens, stack, values)
        elif token == ('op', '-'):
            # Unary minus: must be immediately followed by a number.
            if not tokens:
                raise InvalidExpression(token, stack, values)
            num = tokens.pop(0)
            if num.type != 'number':
                raise InvalidExpression(token, num, stack, values)
            stack.append(Token('number', -num.value))
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_tope(self, tokens, stack, values):
        """ Evaluate the TOPE production. """
        if not tokens:
            return
        token = tokens.pop(0)
        if token.type == 'equate':
            # Assignment: store the RHS value under the variable on the
            # stack, and leave the assigned value as the result.
            var = stack.pop()
            self._evaluate_e(tokens, stack, values)
            values[var.value] = self._pop_value(stack, values)
            stack.append(Token('number', values[var.value]))
        elif token.type == 'op':
            self._evaluate_e(tokens, stack, values)
            self._evaluate_op(token, stack, values)
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_e(self, tokens, stack, values):
        """ Evaluate the E production. """
        token = tokens.pop(0)
        if token.type in ('number', 'variable'):
            stack.append(token)
            self._evaluate_sube(tokens, stack, values)
        elif token == ('op', '-'):
            # Unary minus, same constraint as in TOP.
            if not tokens:
                raise InvalidExpression(token, stack, values)
            num = tokens.pop(0)
            if num.type != 'number':
                raise InvalidExpression(token, num, stack, values)
            stack.append(Token('number', -num.value))
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_sube(self, tokens, stack, values):
        """ Evaluate the SUBE production. """
        if not tokens:
            return
        token = tokens.pop(0)
        if token.type == 'op':
            self._evaluate_e(tokens, stack, values)
            self._evaluate_op(token, stack, values)
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_op(self, operand, stack, values):
        """
        Evaluate the given operand and push the value onto the stack.
        """
        # Note operand order: val1 was pushed before val2.
        val2 = self._pop_value(stack, values)
        val1 = self._pop_value(stack, values)
        if operand.value == '+':
            val = val1 + val2
        elif operand.value == '-':
            val = val1 - val2
        elif operand.value == 'x':
            val = val1 * val2
        elif operand.value == '/':
            val = val1 / val2
        else:
            raise InvalidExpression(operand, stack)
        stack.append(Token('number', val))

    def _pop_value(self, stack, values):
        """
        Pop a value from the stack and return it, replacing variables with
        their values.
        """
        token = stack.pop()
        if token.type == 'number':
            return token.value
        elif token.type == 'variable':
            return values[token.value]
        else:
            raise InvalidExpression(token, stack, values)
class TemplateFormatter(string.Formatter):
    '''
    Provides a format function that substitutes '' for any missing value
    '''

    # Sentinel used by callers to validate templates; treated as '0' when
    # a numeric format type is applied.
    _validation_string = 'This Is Some Text THAT SHOULD be LONG Enough.%^&*'

    # Dict to do recursion detection. It is up to the individual get_value
    # method to use it. It is cleared when starting to format a template
    composite_values = {}

    def __init__(self):
        string.Formatter.__init__(self)
        self.book = None
        self.kwargs = None
        self.strip_results = True
        self.locals = {}
        self.funcs = formatter_functions().get_functions()
        self.gpm_parser = _Parser()
        self.gpm_interpreter = _Interpreter()

    def _do_format(self, val, fmt):
        # Apply a standard format spec to val, coercing to int/float when
        # the spec's type character requires it.
        if not fmt or not val:
            return val
        if val == self._validation_string:
            val = '0'
        typ = fmt[-1]
        if typ == 's':
            pass
        elif 'bcdoxXn'.find(typ) >= 0:
            try:
                val = int(val)
            except Exception:
                raise ValueError(
                    _('format: type {0} requires an integer value, got {1}').format(typ, val))
        elif 'eEfFgGn%'.find(typ) >= 0:
            try:
                val = float(val)
            except:
                raise ValueError(
                    _('format: type {0} requires a decimal (float) value, got {1}').format(typ, val))
        return unicode_type(('{0:' + fmt + '}').format(val))

    def _explode_format_string(self, fmt):
        # Split 'prefix|fmt|suffix' conditional-text templates into their
        # three parts; on no match (or error) the whole string is the fmt.
        try:
            matches = self.format_string_re.match(fmt)
            if matches is None or matches.lastindex != 3:
                return fmt, '', ''
            return matches.groups()
        except:
            if DEBUG:
                traceback.print_exc()
            return fmt, '', ''

    format_string_re = re.compile(r'^(.*)\|([^\|]*)\|(.*)$', re.DOTALL)
    compress_spaces = re.compile(r'\s+')
    backslash_comma_to_comma = re.compile(r'\\,')

    # Splits a function-call argument list on unescaped commas.
    arg_parser = re.Scanner([
        (r',', lambda x, t: ''),
        (r'.*?((?<!\\),)', lambda x, t: t[:-1]),
        (r'.*?\)', lambda x, t: t[:-1]),
    ])

    # ################# 'Functional' template language ######################

    # Lexer for the template program language; token tags come from the
    # _Parser LEX_* constants.
    lex_scanner = re.Scanner(
        [
            (r'(==#|!=#|<=#|<#|>=#|>#)', lambda x, t: (_Parser.LEX_NUMERIC_INFIX, t)),
            (r'(==|!=|<=|<|>=|>)', lambda x, t: (_Parser.LEX_STRING_INFIX, t)),  # noqa
            (r'(if|then|else|fi)\b', lambda x, t: (_Parser.LEX_KEYWORD, t)),  # noqa
            (r'[(),=;]', lambda x, t: (_Parser.LEX_OP, t)),  # noqa
            (r'-?[\d\.]+', lambda x, t: (_Parser.LEX_CONST, t)),  # noqa
            (r'\$', lambda x, t: (_Parser.LEX_ID, t)),  # noqa
            (r'\w+', lambda x, t: (_Parser.LEX_ID, t)),  # noqa
            (r'".*?((?<!\\)")', lambda x, t: (_Parser.LEX_CONST, t[1:-1])),  # noqa
            (r'\'.*?((?<!\\)\')', lambda x, t: (_Parser.LEX_CONST, t[1:-1])),  # noqa
            (r'\n#.*?(?:(?=\n)|$)', None),
            (r'\s', None),
        ], flags=re.DOTALL)

    def _eval_program(self, val, prog, column_name):
        # Parse trees are cached per column when a cache is available;
        # otherwise parse afresh and interpret directly.
        if column_name is not None and self.template_cache is not None:
            tree = self.template_cache.get(column_name, None)
            if not tree:
                tree = self.gpm_parser.program(self, self.funcs, self.lex_scanner.scan(prog))
                self.template_cache[column_name] = tree
        else:
            tree = self.gpm_parser.program(self, self.funcs, self.lex_scanner.scan(prog))
        return self.gpm_interpreter.program(self.funcs, self, tree, val)

    def _eval_sfm_call(self, template_name, args):
        # Call a stored (saved-template) function, caching its parse tree
        # on the function object itself.
        func = self.funcs[template_name]
        tree = func.cached_parse_tree
        if tree is None:
            tree = self.gpm_parser.program(
                self, self.funcs,
                self.lex_scanner.scan(func.program_text[len('program:'):]))
            func.cached_parse_tree = tree
        return self.gpm_interpreter.program(self.funcs, self, tree, None,
                                            is_call=True, args=args)

    # ################# Override parent classes methods #####################

    def get_value(self, key, args, kwargs):
        raise Exception('get_value must be implemented in the subclass')

    def format_field(self, val, fmt):
        # ensure we are dealing with a string.
        if isinstance(val, numbers.Number):
            if val:
                val = unicode_type(val)
            else:
                val = ''
        # Handle conditional text
        fmt, prefix, suffix = self._explode_format_string(fmt)
        # Handle functions
        # First see if we have a functional-style expression
        if fmt.startswith('\''):
            p = 0
        else:
            p = fmt.find(':\'')
            if p >= 0:
                p += 1
        if p >= 0 and fmt[-1] == '\'':
            # Functional-style: everything between the quotes is a program.
            val = self._eval_program(val, fmt[p + 1:-1], None)
            colon = fmt[0:p].find(':')
            if colon < 0:
                dispfmt = ''
            else:
                dispfmt = fmt[0:colon]
        else:
            # check for old-style function references
            p = fmt.find('(')
            dispfmt = fmt
            if p >= 0 and fmt[-1] == ')':
                colon = fmt[0:p].find(':')
                if colon < 0:
                    dispfmt = ''
                    colon = 0
                else:
                    dispfmt = fmt[0:colon]
                    colon += 1
                fname = fmt[colon:p].strip()
                if fname in self.funcs:
                    func = self.funcs[fname]
                    if func.arg_count == 2:
                        # only one arg expected. Don't bother to scan. Avoids need
                        # for escaping characters
                        args = [fmt[p + 1:-1]]
                    else:
                        args = self.arg_parser.scan(fmt[p + 1:])[0]
                        args = [
                            self.backslash_comma_to_comma.sub(',', a) for a in args
                        ]
                    if not func.is_python:
                        # Stored template: val is passed as the first arg.
                        args.insert(0, val)
                        val = self._eval_sfm_call(fname, args)
                    else:
                        if (func.arg_count == 1 and (len(args) != 1 or args[0])) or \
                                (func.arg_count > 1 and func.arg_count != len(args)+1):
                            raise ValueError(
                                _('Incorrect number of arguments for function {0}').format(fname))
                        if func.arg_count == 1:
                            val = func.eval_(self, self.kwargs, self.book, self.locals, val)
                            if self.strip_results:
                                val = val.strip()
                        else:
                            val = func.eval_(self, self.kwargs, self.book, self.locals, val, *args)
                            if self.strip_results:
                                val = val.strip()
                else:
                    return _('%s: unknown function') % fname
        if val:
            val = self._do_format(val, dispfmt)
        if not val:
            return ''
        return prefix + val + suffix

    def evaluate(self, fmt, args, kwargs):
        # 'program:' templates bypass vformat and run the template language.
        if fmt.startswith('program:'):
            ans = self._eval_program(kwargs.get('$', None), fmt[8:], self.column_name)
        else:
            ans = self.vformat(fmt, args, kwargs)
        if self.strip_results:
            return self.compress_spaces.sub(' ', ans).strip()
        return ans

    # ######### a formatter that throws exceptions ############

    def unsafe_format(self, fmt, kwargs, book, strip_results=True):
        self.strip_results = strip_results
        self.column_name = self.template_cache = None
        self.kwargs = kwargs
        self.book = book
        self.composite_values = {}
        self.locals = {}
        return self.evaluate(fmt, [], kwargs)

    # ######### a formatter guaranteed not to throw an exception ############

    def safe_format(self, fmt, kwargs, error_value, book, column_name=None,
                    template_cache=None, strip_results=True, template_functions=None):
        self.strip_results = strip_results
        self.column_name = column_name
        self.template_cache = template_cache
        self.kwargs = kwargs
        self.book = book
        # Caller may supply a custom function table (e.g. per-library).
        if template_functions:
            self.funcs = template_functions
        else:
            self.funcs = formatter_functions().get_functions()
        self.composite_values = {}
        self.locals = {}
        try:
            ans = self.evaluate(fmt, [], kwargs)
        except Exception as e:
            if DEBUG:  # and getattr(e, 'is_locking_error', False):
                traceback.print_exc()
            if column_name:
                prints('Error evaluating column named:', column_name)
            # On any failure, return the caller-supplied error value plus
            # the exception message instead of raising.
            ans = error_value + ' ' + error_message(e)
        return ans
class ChopGrammar:
    """
    Parses the ChopShop command-line grammar: semicolon-separated chains of
    module invocations joined by pipes, with parenthesized "tees" that fan a
    chain out to several sub-chains. Builds a graph of __ChopModule__ nodes.
    """
    #TODO Add support for escaped sequences?
    # Tokenizer for the grammar. Rule order matters: quoted strings first,
    # then punctuation, then the general STRING/OPTION rules.
    scanner=re.Scanner([
        (r'"((?:[^\t\n\r\f\v"])*)"', lambda scanner, token:("QUOTED", token)),
        (r"'((?:[^\t\n\r\f\v'])*)'", lambda scanner, token:("QUOTED", token)),
        (r"[ ]", lambda scanner, token:("SPACE", token)),
        (r"\;", lambda scanner, token:("SEMICOLON", token)),
        (r"\(", lambda scanner, token:("BTEE", token)),
        (r"\)", lambda scanner, token:("ETEE", token)),
        (r"\|", lambda scanner, token:("PIPE", token)),
        (r"\,", lambda scanner, token:("COMMA", token)),
        (r"[^\t\n\r\f\v'\";()|,-][^ \t\n\r\f\v'\";()|,]*", lambda scanner, token:("STRING", token)),
        (r"--[a-zA-Z0-9_-]+", lambda scanner, token:("OPTION", token)),
        (r"-[a-zA-Z0-9]+", lambda scanner, token:("OPTION", token)),
        (r"-", lambda scanner, token:("STRING", token)),
    ])

    def __init__(self):
        self.top_modules = []  # Roots of the module graph.
        self.all_modules = []  # Every module seen, in creation order.
        self.strbuff = None  # Scratch StringIO used by get_family.

    def parseGrammar(self, grammar_string):
        # Tokenize, drop SPACE tokens, then build the module graph.
        results, remainder = self.scanner.scan(grammar_string)
        if remainder:
            return (None, None)
        nresults = []
        for token in results:
            if token[0] != "SPACE":
                nresults.append(token)
        results = nresults
        self.verify_chains(results)
        return self.all_modules

    def find_tee_end(self, chain, left):
        # Return the index of the ETEE matching the BTEE at 'left',
        # tracking nesting with a stack.
        btee_stack = [True]
        #Assume left is the position of BTEE
        right = left + 1
        while right < len(chain):
            if chain[right][0] == "BTEE":
                btee_stack.append(True)
            elif chain[right][0] == "ETEE":
                if not len(btee_stack):
                    #there's no cooresponding BTEE
                    raise Exception("Unexpected End Tee token ')'")
                    #return left #error
                if len(btee_stack) == 1:
                    #this is the ETEE we're looking for
                    return right
                btee_stack.pop()
            right += 1
        raise Exception("Unable to find end of Tee")
        #return left #error

    def verify_chains(self, chains):
        # Split the token stream on SEMICOLON and verify each chain;
        # chain roots accumulate into top_modules.
        left = 0
        right= 0
        flows = []
        #get chain
        #pdb.set_trace()
        while right < len(chains):
            while right < len(chains) and chains[right][0] != "SEMICOLON":
                right += 1
            chain = chains[left:right]
            right += 1
            left = right
            (ancestors, children) = self.verify_chain(chain)
            flows.extend(ancestors)
        self.top_modules = flows
        return True

    def verify_chain(self, chain):
        # Walk one chain, splitting on PIPE and recursing into tees.
        # Returns (ancestors, parents): the chain's roots and its current
        # leaf set (for linking to whatever follows).
        left = 0
        right= 0
        ancestors = []
        parents = []
        #get chain
        while right < len(chain):
            while right < len(chain) and (chain[right][0] != "PIPE" and chain[right][0] != "BTEE"):
                right += 1
            if right >= len(chain) or chain[right][0] == "PIPE":
                #Assume Invocation
                invocation = chain[left:right]
                mod = self.verify_invocation(invocation)
                if len(parents) == 0:
                    parents.append(mod)
                    ancestors.append(mod)
                else:
                    for parent in parents:
                        parent.children.append(mod)
                        mod.parents.append(parent)
                    parents = [mod]
            elif chain[right][0] == "BTEE":
                #Must find end of TEE
                if left != right:
                    raise Exception("Unexpected Tee")
                #left = right
                right = self.find_tee_end(chain, left)
                tee = chain[left + 1: right]
                #Remove the TEE elements
                if (right + 1) < len(chain):
                    #There's more tokens after the end of the tee
                    if chain[right + 1][0] != "PIPE":
                        raise Exception('Unexpected token after TEE', chain[right + 1][0])
                    else:
                        right += 1
                (tparents, tchildren) = self.verify_tee(tee)
                if len(parents) == 0:
                    parents = tchildren
                    ancestors = tparents
                else:
                    for parent in parents:
                        for tparent in tparents:
                            parent.children.append(tparent)
                            tparent.parents.append(parent)
                    parents = tchildren
            right += 1
            left = right
        #return True
        return (ancestors,parents)

    def verify_tee(self, tee):
        # A tee is a comma-separated list of chains; collect every chain's
        # roots as parents and leaves as children. At least one COMMA is
        # required (a tee of one element is meaningless).
        left = 0
        right = 0
        comma = False
        parents = []
        children = []
        while right < len(tee):
            while right < len(tee) and (tee[right][0] != "COMMA" and tee[right][0] != "BTEE"):
                right += 1
            if right >= len(tee) or tee[right][0] == "COMMA":
                #Element of TEE, i.e., a chain
                if right < len(tee) and tee[right][0] == 'COMMA':
                    comma = True
                chain = tee[left:right]
                (cparents, cchildren) = self.verify_chain(chain)
                for cparent in cparents:
                    parents.append(cparent)
                for cchild in cchildren:
                    children.append(cchild)
            elif tee[right][0] == "BTEE":
                #TEE in the Chain, need to skip it to find the comma
                right = self.find_tee_end(tee,right)
                continue
            right += 1
            left = right
        if not comma:
            raise Exception('Usage of a Tee requires at least two elements')
        return (parents, children)

    def verify_invocation(self, invocation):
        # An invocation is STRING [OPTION [QUOTED|STRING]]* [QUOTED|STRING];
        # build the module and collect its argument list.
        right = 1
        if invocation[0][0] != "STRING":
            raise Exception("Invocation must begin with a 'STRING' token, not a %s token" % invocation[0][0])
        mymod = __ChopModule__(invocation[0][1].rstrip())
        while right < len(invocation):
            if invocation[right][0] == "OPTION":
                mymod.arguments.append(invocation[right][1].rstrip())
                if (right + 1) < len(invocation):
                    #Check if the next element is the argument to the option
                    if invocation[right + 1][0] == "QUOTED":
                        #Need to strip the quotes
                        mymod.arguments.append(invocation[right + 1][1].rstrip()[1:-1])
                        right += 1 #skip the parameter
                    elif invocation[right + 1][0] == "STRING":
                        mymod.arguments.append(invocation[right + 1][1].rstrip())
                        right += 1 #skip the parameter
                    #If not, just skip it and let it be parsed out
            elif (invocation[right][0] == "QUOTED"):
                if (right + 1) < len(invocation):
                    raise Exception("QUOTED token must be last element of invocation or following a OPTION token")
                #Need to remove the quotes from the quoted string
                mymod.arguments.append(invocation[right][1].rstrip()[1:-1])
            elif (invocation[right][0] == "STRING"):
                if (right + 1) < len(invocation):
                    raise Exception("STRING token must be last element of invocation or following a OPTION token")
                mymod.arguments.append(invocation[right][1].rstrip())
            else:
                raise Exception("Unexpected %s token %s" % (invocation[right][0], invocation[right][1]))
            right += 1
        self.all_modules.append(mymod)
        return mymod

    def get_family_(self, top, tabs = 0):
        # Recursive helper: write one indented node line per module into
        # self.strbuff.
        for i in range(0, tabs):
            self.strbuff.write("\t")
        self.strbuff.write("%s -->\n" % top.name)
        if len(top.children):
            for child in top.children:
                self.get_family_(child, tabs + 1)

    def get_family(self, top):
        # Render one module subtree to a string via a fresh StringIO.
        if self.strbuff is not None:
            self.strbuff.close()
        self.strbuff = StringIO()
        self.get_family_(top)
        output = self.strbuff.getvalue()
        self.strbuff.close()
        return output

    def get_tree(self):
        # Render every top-level module's subtree, newline-separated.
        output = ""
        for t in self.top_modules:
            output += self.get_family(t) + "\n"
        return output

    def print_family(self, top, tabs = 0):
        # Like get_family_, but printed directly to stdout (py2 prints).
        #print Self
        for i in range (0, tabs):
            print "\t",
        print top.name, "-->"
        if len(top.children):
            for child in top.children:
                self.print_family(child, tabs + 1)

    def print_tree(self):
        for t in self.top_modules:
            self.print_family(t)
def pop_bone_context(): global bone_context bone_context = bone_context[:-1] return bone_context[len(bone_context) - 1] reserved = ["HIERARCHY", "ROOT", "OFFSET", "CHANNELS", "MOTION"] channel_names = [ "Xposition", "Yposition", "Zposition", "Zrotation", "Xrotation", "Yrotation" ] scanner = re.Scanner([ (r"[a-zA-Z_]\w*", identifier), (r"-*[0-9]+(\.[0-9]+)?", digit), (r"}", close_brace), (r"{", open_brace), (r":", None), (r"\s+", None), ]) def read_offset(bvh, token_index): if (bvh[token_index] != ("IDENT", "OFFSET")): return None, None token_index = token_index + 1 offsets = [0.0] * 3 for i in range(0, 3): offsets[i] = float(bvh[token_index][1]) token_index = token_index + 1 return offsets, token_index
# ASN.1 lexer. Rules are tried in listed order, so longer / more specific
# patterns (comments, strings, multi-char punctuation, keywords) must come
# before the generic identifier and whitespace rules.
#
# FIXES in this revision:
#  * the REScannerReal / REScannerInt patterns had their token tags swapped
#    (real literals were tagged TOK_INT and integers TOK_REAL);
#  * the TOK_HID pattern was byte-identical to the TOK_ID pattern, which
#    made the TOK_ID rule unreachable; TOK_HID now matches all-uppercase
#    identifiers only, letting mixed-case uppercase-initial names reach
#    the TOK_ID rule.
REScannerASN1 = re.Scanner(
    [
    # comments and character strings
    (r'(--).*?([%s]|(--)|$)' % _NL, lambda s, t: (TOK_CMT, t)),
    (r'(/\*).*?(\*/)', lambda s, t: (TOK_CMT, t)),
    (r'".*?(?<!")"(?!")', lambda s, t: (TOK_CSTR, t)),
    # punctuation: multi-character sequences first
    (r'::=', lambda s, t: TOK_ASSI),
    (r':', lambda s, t: TOK_COL),
    (r';', lambda s, t: TOK_SCOL),
    (r'=', lambda s, t: TOK_EQU),
    (r',', lambda s, t: TOK_COM),
    (r'\(', lambda s, t: TOK_PARO),
    (r'\)', lambda s, t: TOK_PARC),
    (r'\[{2}', lambda s, t: TOK_DBRAO),
    (r'\]{2}', lambda s, t: TOK_DBRAC),
    (r'\[', lambda s, t: TOK_BRAO),
    (r'\]', lambda s, t: TOK_BRAC),
    (r'\{', lambda s, t: TOK_CBRAO),
    (r'\}', lambda s, t: TOK_CBRAC),
    (r'\.\.\.', lambda s, t: TOK_TDOT),
    (r'\.\.', lambda s, t: TOK_DDOT),
    (r'\.', lambda s, t: TOK_DOT),
    (r'\||(?:UNION%s)' % _EXC, lambda s, t: TOK_UNIO),
    (r'\^|(?:INTERSECTION%s)' % _EXC, lambda s, t: TOK_INTER),
    (r'<', lambda s, t: TOK_LTHAN),
    (r'>', lambda s, t: TOK_GTHAN),
    (r'@', lambda s, t: TOK_ARRO),
    (r'\!', lambda s, t: TOK_EXCL),
    # reserved words (each guarded by _EXC so e.g. COMPONENT does not
    # swallow the prefix of COMPONENTS)
    (r'ABSENT%s' % _EXC, lambda s, t: TOK_ABS),
    (r'ALL%s' % _EXC, lambda s, t: TOK_ALL),
    (r'APPLICATION%s' % _EXC, lambda s, t: TOK_TAPP),
    (r'AUTOMATIC%s' % _EXC, lambda s, t: TOK_AUTO),
    (r'BEGIN%s' % _EXC, lambda s, t: TOK_BEG),
    (r'BY%s' % _EXC, lambda s, t: TOK_BY),
    (r'COMPONENT%s' % _EXC, lambda s, t: TOK_COMP),
    (r'COMPONENTS%s' % _EXC, lambda s, t: TOK_COMPS),
    (r'CONSTRAINED%s' % _EXC, lambda s, t: TOK_CONST),
    (r'CONTAINING%s' % _EXC, lambda s, t: TOK_CONT),
    (r'DEFAULT%s' % _EXC, lambda s, t: TOK_DEF),
    (r'DEFINITIONS%s' % _EXC, lambda s, t: TOK_DEFI),
    (r'ENCODED%s' % _EXC, lambda s, t: TOK_ENC),
    (r'END%s' % _EXC, lambda s, t: TOK_END),
    (r'EXCEPT%s' % _EXC, lambda s, t: TOK_EXCE),
    (r'EXPLICIT%s' % _EXC, lambda s, t: TOK_TEXP),
    (r'EXPORTS%s' % _EXC, lambda s, t: TOK_EXP),
    (r'EXTENSIBILITY%sIMPLIED%s' % (REScannerSNL, _EXC), lambda s, t: TOK_EXTI),
    (r'FALSE%s' % _EXC, lambda s, t: TOK_FALS),
    (r'FROM%s' % _EXC, lambda s, t: TOK_FROM),
    (r'IMPLICIT%s' % _EXC, lambda s, t: TOK_TIMP),
    (r'IMPORTS%s' % _EXC, lambda s, t: TOK_IMP),
    (r'INCLUDES%s' % _EXC, lambda s, t: TOK_INCL),
    (r'MAX%s' % _EXC, lambda s, t: TOK_MAX),
    (r'MIN%s' % _EXC, lambda s, t: TOK_MIN),
    (r'MINUS-INFINITY%s' % _EXC, lambda s, t: TOK_MINF),
    (r'NOT-A-NUMBER%s' % _EXC, lambda s, t: TOK_NAN),
    (r'NULL%s' % _EXC, lambda s, t: TOK_NULL),
    (r'OF%s' % _EXC, lambda s, t: TOK_OF),
    (r'OPTIONAL%s' % _EXC, lambda s, t: TOK_OPT),
    (r'PATTERN%s' % _EXC, lambda s, t: TOK_PAT),
    (r'PLUS-INFINITY%s' % _EXC, lambda s, t: TOK_PINF),
    (r'PRESENT%s' % _EXC, lambda s, t: TOK_PRES),
    (r'PRIVATE%s' % _EXC, lambda s, t: TOK_TPRI),
    (r'SIZE%s' % _EXC, lambda s, t: TOK_SIZE),
    (r'TAGS%s' % _EXC, lambda s, t: TOK_TAGS),
    (r'TRUE%s' % _EXC, lambda s, t: TOK_TRUE),
    (r'UNIQUE%s' % _EXC, lambda s, t: TOK_UNIQ),
    (r'UNIVERSAL%s' % _EXC, lambda s, t: TOK_TUNI),
    (r'WITH%sSYNTAX%s' % (REScannerSNL, _EXC), lambda s, t: TOK_WSYN),
    # numeric and bit/hex-string literals; reals before integers so the
    # longer match wins
    (r'%s' % REScannerReal, lambda s, t: (TOK_REAL, t)),
    (r'%s' % REScannerInt, lambda s, t: (TOK_INT, t)),
    (r'%s' % REScannerBStr, lambda s, t: (TOK_BSTR, t)),
    (r'%s' % REScannerHStr, lambda s, t: (TOK_HSTR, t)),
    # identifiers: native types, &class-field, all-uppercase,
    # uppercase-initial, lowercase-initial
    (r'(%s)%s' % (REScannerNTypes, _EXC), lambda s, t: (TOK_NTYPE, t)),
    (r'&[a-zA-Z](?:\-{0,1}[a-zA-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_CLAID, t)),
    (r'[A-Z](?:\-{0,1}[A-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_HID, t)),
    (r'[A-Z](?:\-{0,1}[a-zA-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_ID, t)),
    (r'[a-z](?:\-{0,1}[a-zA-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_LID, t)),
    # whitespace / newlines are discarded
    (r'%s' % REScannerSNL, None)
    ],
    flags=re.DOTALL)
class Parser(object):
    """Recursive-descent parser for the boolean search mini-language.

    Grammar (loosely):
        or_expression       := and_expression ['or' or_expression]
        and_expression      := not_expression [['and'] and_expression]
        not_expression      := 'not' not_expression | location_expression
        location_expression := '(' or_expression ')' | base_token
    Tokens come from `lex_scanner`; parse() returns a nested-list AST of
    ['or'|'and'|'not'|'token', ...] nodes.
    """

    def __init__(self):
        self.current_token = 0
        self.tokens = None

    # Token-type constants.
    OPCODE = 1
    WORD = 2
    QUOTED_WORD = 3
    EOF = 4

    # Map each escaped special (\\, \", \(, \)) to a rare low codepoint so
    # the scanner does not treat it as syntax; reversed in tokenize().
    REPLACEMENTS = tuple(('\\' + x, codepoint_to_chr(i + 1)) for i, x in enumerate('\\"()'))

    # Had to translate named constants to numeric values
    lex_scanner = re.Scanner([(r'[()]', lambda x, t: (Parser.OPCODE, t)),
                              (r'@.+?:[^")\s]+', lambda x, t: (Parser.WORD, unicode_type(t))),
                              (r'[^"()\s]+', lambda x, t: (Parser.WORD, unicode_type(t))),
                              (r'".*?((?<!\\)")', lambda x, t: (Parser.QUOTED_WORD, t[1:-1])),
                              (r'\s+', None)], flags=re.DOTALL)

    def token(self, advance=False):
        # Current token's text, or None at end of input; optionally advance.
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return res

    def lcase_token(self, advance=False):
        # Same as token(), lower-cased with ICU rules.
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return icu_lower(res)

    def token_type(self):
        # Current token's type constant, or EOF when input is exhausted.
        if self.is_eof():
            return self.EOF
        return self.tokens[self.current_token][0]

    def is_eof(self):
        return self.current_token >= len(self.tokens)

    def advance(self):
        self.current_token += 1

    def tokenize(self, expr):
        # Strip out escaped backslashes, quotes and parens so that the
        # lex scanner doesn't get confused. We put them back later.
        for k, v in self.REPLACEMENTS:
            expr = expr.replace(k, v)
        tokens = self.lex_scanner.scan(expr)[0]

        def unescape(x):
            # Restore the original characters, minus the escaping backslash.
            for k, v in self.REPLACEMENTS:
                x = x.replace(v, k[1:])
            return x

        return [(tt, unescape(tv) if tt in (self.WORD, self.QUOTED_WORD) else tv)
                for tt, tv in tokens]

    def parse(self, expr, locations):
        """Parse `expr` against the valid lookup names in `locations` and
        return the AST; raises ParseException on trailing garbage."""
        self.locations = locations
        self.tokens = self.tokenize(expr)
        self.current_token = 0
        prog = self.or_expression()
        if not self.is_eof():
            raise ParseException(_('Extra characters at end of search'))
        # prints(self.tokens, '\n', prog)
        return prog

    def or_expression(self):
        lhs = self.and_expression()
        if self.lcase_token() == 'or':
            self.advance()
            return ['or', lhs, self.or_expression()]
        return lhs

    def and_expression(self):
        lhs = self.not_expression()
        if self.lcase_token() == 'and':
            self.advance()
            return ['and', lhs, self.and_expression()]

        # Account for the optional 'and'
        if ((self.token_type() in [self.WORD, self.QUOTED_WORD] or self.token() == '(') and self.lcase_token() != 'or'):
            return ['and', lhs, self.and_expression()]
        return lhs

    def not_expression(self):
        if self.lcase_token() == 'not':
            self.advance()
            return ['not', self.not_expression()]
        return self.location_expression()

    def location_expression(self):
        if self.token_type() == self.OPCODE and self.token() == '(':
            self.advance()
            res = self.or_expression()
            if self.token_type() != self.OPCODE or self.token(advance=True) != ')':
                raise ParseException(_('missing )'))
            return res
        if self.token_type() not in (self.WORD, self.QUOTED_WORD):
            raise ParseException(_('Invalid syntax. Expected a lookup name or a word'))
        return self.base_token()

    def base_token(self):
        # A quoted word is always an unqualified ('all') search term.
        if self.token_type() == self.QUOTED_WORD:
            return ['token', 'all', self.token(advance=True)]

        words = self.token(advance=True).split(':')
        # The complexity here comes from having colon-separated search
        # values. That forces us to check that the first "word" in a colon-
        # separated group is a valid location. If not, then the token must
        # be reconstructed. We also have the problem that locations can be
        # followed by quoted strings that appear as the next token. and that
        # tokens can be a sequence of colons.

        # We have a location if there is more than one word and the first
        # word is in locations. This check could produce a "wrong" answer if
        # the search string is something like 'author: "foo"' because it
        # will be interpreted as 'author:"foo"'. I am choosing to accept the
        # possible error. The expression should be written '"author:" foo'
        if len(words) > 1 and words[0].lower() in self.locations:
            loc = words[0].lower()
            words = words[1:]
            if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
                return ['token', loc, self.token(advance=True)]
            return ['token', icu_lower(loc), ':'.join(words)]

        return ['token', 'all', ':'.join(words)]
import re scanner=re.Scanner([ (r"[\w\.-]+@[\w\.-]+\.[\w\.-]+", lambda scanner,token:("Mail", token)), (r"([0-9]+[\-][0-9]+)", lambda scanner,token:("Endash num", token)), (r"([0-9])+", lambda scanner,token:("Int", token)), (r"[A-Za-z]+", lambda scanner,token:("Word", token)), (r"[!?;:,'-]+", lambda scanner,token:("Punct", token)), (r"[\(]", lambda scanner,token:("LParen", token)), (r"[\)]", lambda scanner,token:("RParen", token)), (r"[.]", lambda scanner,token:("Dot", token)), (r"\s\-\s", lambda scanner,token:("Em dash", token)), (r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", lambda scanner,token:("Emoji", token)), (r"\s+", None), ]) results, remainder = scanner.scan("The Book! (Tue) 55-88 Milani-lop'ed mss. (-) 123 - [email protected] 😋 😎") print(results)
warnings.warn( 'locale specific date formats (%%c, %%x, %%X) are not yet implemented ' 'for %s' % platform.system()) # translate strftime spec into mostly equivalent PostgreSQL spec _scanner = re.Scanner( # double quotes need to be escaped [('"', lambda scanner, token: r'\"')] + [( '|'.join( map( '(?:{})'.format, itertools.chain( _strftime_to_postgresql_rules.keys(), [ # "%e" is in the C standard and Python actually # generates this if your spec contains "%c" but we # don't officially support it as a specifier so we # need to special case it in the scanner '%e', r'\s+', r'[{}]'.format(re.escape(string.punctuation)), r'[^{}\s]+'.format(re.escape(string.punctuation)), ], ), )), lambda scanner, token: token, )]) _lexicon_values = frozenset(_strftime_to_postgresql_rules.values()) _strftime_blacklist = frozenset(['%w', '%U', '%c', '%x', '%X', '%e'])
def parse(string):
    """Compile a tag-search expression into a SQL WHERE fragment.

    Order of operations
        ()  Group
        =   Numeric Match
        -   Not
        " " And
        /   Or
    """
    def identity(scanner, token):
        return token

    def unknown(scanner, token):
        raise ValueError("Unknown token", token)

    # The final zero-width rule fires on anything the other rules cannot
    # consume, turning lexing errors into ValueError.
    scanner = re.Scanner([
        (REG_NUM, identity),
        (REG_TAG, identity),
        (REG_OP, identity),
        (REG_GROUP, identity),
        (r'\s+', None),
        (r'', unknown),
    ])
    tokens = scanner.scan(string)[0]
    if tokens.count('(') != tokens.count(')'):
        raise ValueError('Unbalanced parentheses')

    def query_matching_files(match):
        # Translate one "tag", "tag=amount" or "tag=lo..hi" token into a
        # subquery clause selecting the files carrying that tag.
        parts = match.split("=")
        if re.match(reg_full(REG_TAG_NAME), parts[0]):
            clause = 'name = "{}"'.format(parts[0])
        elif re.match(reg_full(REG_TAG_ID), parts[0]):
            clause = 'pk_id = {}'.format(parts[0])
        else:
            raise ValueError("Found invalid tag specifier", parts[0])
        if len(parts) > 2:
            raise ValueError("Found too many `='", match)
        elif len(parts) == 2:
            if '..' in parts[1]:
                [lower, upper] = parts[1].split('..')
                clause += ' AND amount BETWEEN {} AND {}'.format(lower, upper)
            else:
                clause += ' AND amount = {}'.format(parts[1])
        return ("file.pk_id IN ("
                "SELECT pk_fk_file_id FROM file_has_tag"
                " LEFT JOIN tag ON pk_fk_tag_id = pk_id"
                " WHERE " + clause + ")")

    def reg_full(reg):
        # Anchor a pattern fragment so re.match must consume the whole string.
        return '^' + reg + '$'

    def parse_numeric(tokens):
        # Pass 1: fold `tag = number` triples into single "tag=number"
        # tokens. NOTE: tokens.next() is the Python 2 iterator protocol.
        result = []
        try:
            while True:
                token = tokens.next()
                if token == '=':
                    if len(result) == 0:
                        raise ValueError("Expected tag before `='")
                    elif not re.match(reg_full(REG_TAG), result[-1]):
                        raise ValueError("Expected tag before `='", result[-1])
                    try:
                        number = tokens.next()
                    except StopIteration:
                        raise ValueError("Expected something after `='")
                    if not re.match(reg_full(REG_NUM), number):
                        raise ValueError("Expected number after `='", number)
                    result[-1] += '=' + number  # group the tokens
                elif '..' in token:
                    raise ValueError("An amount requires a tag")
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_tag(tokens):
        # Pass 2: replace every tag-match token with its SQL subquery.
        result = []
        try:
            while True:
                token = tokens.next()
                if re.match(reg_full(REG_TAG_MATCH), token):
                    result.append(query_matching_files(token))
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_not(tokens):
        # Pass 3: rewrite `- <tagquery>` as NOT (<tagquery>).
        result = []
        try:
            while True:
                token = tokens.next()
                if token == '-':
                    tag = tokens.next()
                    if not re.match(r'file\.pk_id', tag):
                        raise ValueError("Expected tag query after `-'", tag)
                    result.append('NOT (' + tag + ')')
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_group(tokens):
        # Pass 4: convert the flat stream with '(' ')' into nested lists.
        result = []
        try:
            while True:
                token = tokens.next()
                if token == '(':
                    result.append(parse_group(tokens))
                elif token == ')':
                    return result
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_or(tokens):
        # Final pass: implicit AND within runs, '/' joins runs with OR;
        # sublists (parenthesised groups) are rendered recursively.
        if tokens[0] == '/' or tokens[-1] == '/':
            raise ValueError("Found OR on edge")
        result = ""
        left = []
        for token in tokens:
            if token == '/':
                result += '(' + ' AND '.join(left) + ')'
                result += ' OR '
                left = []
            elif type(token) == list:
                left.append(parse_or(token))
            else:
                left.append(token)
        result += '(' + ' AND '.join(left) + ')'
        result = '(' + result + ')'
        return result

    tokens = parse_numeric(iter(tokens))  # group 3 num tokens to 1 token
    tokens = parse_tag(iter(tokens))      # convert tag to query
    tokens = parse_not(iter(tokens))      # group a not token to the tag query
    tokens = parse_group(iter(tokens))    # convert groups to sublists
    query = parse_or(tokens)
    return query
def expect(expected_type, toks): line_num, (typ, tok_contents) = next(toks) if typ != expected_type: msg = 'Expected a %s, but got "%s" at line %s' % ( expected_type, tok_contents, line_num) raise CMakeParseError(msg) # http://stackoverflow.com/questions/691148/pythonic-way-to-implement-a-tokenizer # TODO: Handle multiline strings. scanner = re.Scanner([ (r'#.*', lambda scanner, token: ("comment", token)), (r'"[^"]*"', lambda scanner, token: ("string", token)), (r"\(", lambda scanner, token: ("left paren", token)), (r"\)", lambda scanner, token: ("right paren", token)), (r'[^ \t\r\n()#"]+', lambda scanner, token: ("word", token)), (r'\n', lambda scanner, token: ("newline", token)), (r"\s+", None), # skip other whitespace ]) def tokenize(s): """ Yields pairs of the form (line_num, (token_type, token_contents)) given a string containing the contents of a CMakeLists file. """ toks, remainder = scanner.scan(s) line_num = 1 if remainder != '': msg = 'Unrecognized tokens at line %s: %s' % (line_num, remainder)
def tokenize(contents):
    """
    Scan a string and return a list of Token objects representing the
    contents of the cmake listfile.
    """
    # https://cmake.org/cmake/help/v3.0/manual/
    # cmake-language.7.html#grammar-token-unquoted_legacy
    legacy_pattern = "({})+".format("|".join([
        # Make-style variable like $(MAKE)
        r'\$\([^\$\(\)]+\)',
        # Quoted-substring
        r'"[^"\\]*(?:\\.[^"\\]*)*"',
        # Any element except whitespace or one of '()#"\'
        r'[^\s\(\)#"\\]',
        # Escape sequences
        # https://cmake.org/cmake/help/v3.0/manual/
        # cmake-language.7.html#grammar-token-escape_sequence
        r'\\[\(\)#" \\\$@\^\t\r\n;]'
    ]))

    # https://cmake.org/cmake/help/v3.0/manual/
    # cmake-language.7.html#unquoted-argument
    unquoted_pattern = "({})+".format("|".join([
        # Any element except whitespace or one of '()#"\'
        r'[^\s\(\)#"\\]',
        # Escape sequences
        # https://cmake.org/cmake/help/v3.0/manual/
        # cmake-language.7.html#grammar-token-escape_sequence
        r'\\[\(\)#" \\\$@\^\t\r\n;]'
    ]))

    # Regexes are in priority order. Changing the order may alter the
    # behavior of the lexer
    scanner = re.Scanner(
        [
            # double quoted string
            # NOTE(josh): regex borrowed from
            # https://stackoverflow.com/a/37379449/141023
            (r'(?<![^\s\(])"[^"\\]*(?:\\.[^"\\]*)*"(?![^\s\)])',
             lambda s, t: (TokenType.QUOTED_LITERAL, t)),
            # single quoted string
            (r"(?<![^\s\(])'[^'\\]*(?:\\.[^'\\]*)*'(?![^\s\)])",
             lambda s, t: (TokenType.QUOTED_LITERAL, t)),
            # bracket argument
            (r"(?<![^\s\(])\[(=*)\[.*\]\1\](?![^\s\)])",
             lambda s, t: (TokenType.BRACKET_ARGUMENT, t)),
            (r"(?<![^\s\(])-?[0-9]+(?![^\s\)\(])",
             lambda s, t: (TokenType.NUMBER, t)),
            # Either a valid function name or variable name.
            # NOTE(review): '[a-zA-z_]' spans A through z in ASCII, which
            # also admits '[', ']', '^' and backtick — looks like a typo
            # for '[a-zA-Z_]'; confirm intended behavior before changing,
            # since the catch-all UNQUOTED_LITERAL rule below would absorb
            # those inputs differently.
            (r"(?<![^\s\(])[a-zA-z_][a-zA-Z0-9_]*(?![^\s\)\(])",
             lambda s, t: (TokenType.WORD, t)),
            (r"(?<![^\s\(])\${[a-zA-z_][a-zA-Z0-9_]*}(?![^\s\)])",
             lambda s, t: (TokenType.DEREF, t)),
            # unquoted_legacy
            (legacy_pattern,
             lambda s, t: (TokenType.UNQUOTED_LITERAL, t)),
            # unquoted_element+
            (unquoted_pattern,
             lambda s, t: (TokenType.UNQUOTED_LITERAL, t)),
            (r"\(", lambda s, t: (TokenType.LEFT_PAREN, t)),
            (r"\)", lambda s, t: (TokenType.RIGHT_PAREN, t)),
            # NOTE(josh): bare carriage returns are very unlikely to be used
            # but just for the case of explicitnes, if we ever encounter any
            # we treat it as a newline
            (r"\r?\n", lambda s, t: (TokenType.NEWLINE, t)),
            (r"\r\n?", lambda s, t: (TokenType.NEWLINE, t)),
            # NOTE(josh): don't match '\s' here or we'll miss some newline
            # tokens
            # TODO(josh): should we match unicode whitespace too?
            (r"[ \t\f\v]+", lambda s, t: (TokenType.WHITESPACE, t)),
            (r"#\s*(cmake-format|cmf): off[^\n]*",
             lambda s, t: (TokenType.FORMAT_OFF, t)),
            (r"#\s*(cmake-format|cmf): on[^\n]*",
             lambda s, t: (TokenType.FORMAT_ON, t)),
            # bracket comment
            (r"#\[(=*)\[.*\]\1\]",
             lambda s, t: (TokenType.BRACKET_COMMENT, t)),
            # line comment
            (r"#[^\n]*", lambda s, t: (TokenType.COMMENT, t)),
            # Catch-all for literals which are compound statements.
            (r"([^\s\(\)]+|[^\s\(]*[^\)]|[^\(][^\s\)]*)",
             lambda s, t: (TokenType.UNQUOTED_LITERAL, t)),
        ],
        re.DOTALL)

    tokens_return = []
    # A leading BOM is preserved as a zero-width BYTEORDER_MARK token so
    # downstream consumers can reproduce the input exactly.
    if contents.startswith("\ufeff"):
        tokens_return = [
            Token(tok_type=TokenType.BYTEORDER_MARK,
                  spelling=contents[0],
                  index=-1,
                  begin=SourceLocation((0, 0, 0)),
                  end=SourceLocation((0, 0, 0)))
        ]
        contents = contents[1:]

    tokens, remainder = scanner.scan(contents)
    assert not remainder, "Unparsed tokens: {}".format(remainder)

    # Now add line, column, and serial number to token objects. We get
    # lineno by maintaining a running count of newline characters
    # encountered among tokens so far, and column count by splitting the
    # most recent token on it's right most newline. Note that line numbers
    # are 1-indexed to match up with editors but column numbers are zero
    # indexed because its fun to be inconsistent.
    lineno = 1
    col = 0
    offset = 0
    for tok_index, (tok_type, spelling) in enumerate(tokens):
        if sys.version_info[0] < 3:
            assert isinstance(spelling, unicode)
        begin = SourceLocation((lineno, col, offset))

        newlines = spelling.count('\n')
        lineno += newlines
        if newlines:
            col = len(spelling.rsplit('\n', 1)[1])
        else:
            col += len(spelling)

        # byte offset advances by the UTF-8 encoded length, not len(str)
        offset += len(bytearray(spelling, 'utf-8'))
        tokens_return.append(
            Token(tok_type=tok_type,
                  spelling=spelling,
                  index=tok_index,
                  begin=begin,
                  end=SourceLocation((lineno, col, offset))))

    return tokens_return
def scan(self, input_str): scanner = re.Scanner([(r"/", self.slash), (r"{\w*}", self.group), (r"\*", self.star), (r"(?:\\.|[^{\*/])*", self.literal),]) return scanner.scan(input_str)
import re scanner = re.Scanner([ (r"\[[^\]]*\]", lambda scanner, token: token), (r"\+", lambda scanner, token: "R_PLUS"), (r"\*", lambda scanner, token: "R_KLEENE"), (r"%", lambda scanner, token: "R_WILD"), (r"\^", lambda scanner, token: "R_START"), (r"\$", lambda scanner, token: "R_END"), (r"\?", lambda scanner, token: "R_QUESTION"), (r"[\.~``;_a-zA-Z0-9\s=:\{\}\-\\]+", lambda scanner, token: "R_FREE"), (r'.', lambda scanner, token: None), ]) def tokenizeRegex(s): results, remainder = scanner.scan(s) return results if __name__ == '__main__': print(tokenizeRegex("^discount[^(]*\\([0-9]+\\%\\)$")) print(tokenizeRegex("'helloworld'"))
def __init__(self, lexicon): self.scanner = re.Scanner(lexicon)
_junk = ur"[^א-ת%sa-zA-Z0-9!?.,:;\-()\[\]{}]+" % _NIKUD #%%&!?.,;:\-()\[\]{}\"'\/\\+]+" #% _NIKUD is_all_heb = re.compile(ur"^%s+$" % (_heb_letter), re.UNICODE).match is_a_number = re.compile(r"^%s$" % _numeric, re.UNICODE).match is_all_lat = re.compile(r"^[a-zA-Z]+$", re.UNICODE).match is_sep = re.compile(r"^\|+$").match is_punct = re.compile(r"^[.?!]+").match #### scanner scanner = re.Scanner([ (r"\s+", None), (_url, url), (_heb_word_plus, heb), (_eng_word, eng), (_numeric, num), (_opening_punc, punct), (_closing_punc, punct), (_eos_punct, punct), (_internal_punct, punct), (_junk, junk), ]) ##### tokenize def tokenize(sent): tok = sent parts, reminder = scanner.scan(tok) assert (not reminder) return parts