This metaclass registers CassandraType classes in the global
by-cassandra-typename and by-cql-typename registries, unless their class
name starts with an underscore. """

    def __new__(metacls, name, bases, dct):
        # Default the Cassandra-side name to the Python class name unless
        # the class body set 'cassname' explicitly.
        dct.setdefault('cassname', name)
        cls = type.__new__(metacls, name, bases, dct)
        # Leading-underscore classes are internal helpers: keep them out of
        # both registries.
        if not name.startswith('_'):
            _casstypes[name] = cls
            _cqltypes[cls.typename] = cls
        return cls


# Tokenizer for Cassandra type strings: parentheses and dotted/colon-ed
# identifiers are kept as tokens; whitespace and commas are dropped.
casstype_scanner = re.Scanner((
    (r'[()]', lambda s, t: t),
    (r'[a-zA-Z0-9_.:]+', lambda s, t: t),
    (r'[\s,]', None),
))


def lookup_casstype_simple(casstype):
    """
    Given a Cassandra type name (either fully distinguished or not), hand
    back the CassandraType class responsible for it. If a name is not
    recognized, a custom _UnrecognizedType subclass will be created for it.

    This function does not handle complex types (so no type parameters--
    nothing with parentheses). Use lookup_casstype() instead if you might
    need that.
    """
import operator

sym_tab= {}  # Symbol table: variable name -> value (used by assignment).
stack = []  # Stack to hold the values.

# Scanner object. Isolate each token and take
# appropriate action: push a numeric value,
# but perform operation on top two elements on
# stack if an operator is found.
# NOTE(review): the (r"\d+", ...) integer rule below is unreachable -- any
# run of digits is already matched by the earlier -?(\d*\.)?\d+ rule (which
# pushes a float), so integers are never pushed as int. Confirm intent.
scanner = re.Scanner([
    (r"[ \t\n]", lambda s, t: None),
    (r"-?(\d*\.)?\d+", lambda s, t: stack.append(float(t))),
    (r"[a-zA-Z_][a-zA-Z_0-9]*", lambda s, t: stack.append(t)),
    (r"\d+", lambda s, t: stack.append(int(t))),
    (r"[+]", lambda s, t: bin_op(operator.add)),
    (r"[-]", lambda s, t: bin_op(operator.sub)),
    (r"[*]", lambda s, t: bin_op(operator.mul)),
    (r"[/]", lambda s, t: bin_op(operator.truediv)),
    (r"[\^]", lambda s, t: bin_op(operator.pow)),
    (r"[=]", lambda s, t: assign_op()),
])

# Binary Operator function. pop top two elements
# from stack and push the result back on the stack.
def bin_op(action):
    ''' Binary Operation evaluator: If an operand is a variable name, look
    it up in the symbol table and replace with the corresponding value,
    before being evaluated.
    cur += " " + str(i[1])
    return cur

def handleOperator(cur, i):
    # convert operator into machine code
    cur += " " + str(i[1])
    return cur

def handleDigit(cur, i):
    # convert digit into machine code
    cur += " " + str(i[1])
    return cur

# Tokenizer for the toy assembly source. The action callbacks (comment,
# instruction, operand, operator, digit, end_stmnt) are defined earlier in
# this file (not visible here).
# NOTE(review): the digit rule is mostly shadowed -- an integer like "42"
# is consumed by the earlier [a-zA-Z0-9]+,?? operand rule first; only the
# fractional part of a float can reach the digit rule. Confirm intent.
scanner = re.Scanner([
    (r"//.*", comment),  # find comments in code
    (r"[A-Z]+", instruction),  # find instructions in code
    (r"[a-zA-Z0-9]+,??", operand),  # find operands in code
    (r"[+\-*/=]", operator),  # find operators in code
    (r"[0-9]+(\.[0-9]+)?", digit),  # find digits in code
    (r"\n", end_stmnt),  # find end of line
    (r"\s+", None)  # do nothing with the rest
])

assemblyTable = [['MOV', 'MOV'], ['JMP', 'JMP']]  # assembly table
operandTable = [['ax', 'ax'], ['bx', 'bx']]  # operand table

# NOTE(review): 'file' shadows the (py2) builtin and is never closed;
# a with-statement would be safer. Left as-is in this doc-only pass.
file = open(sys.argv[1])
tokens, remainder = scanner.scan(file.read())
#tokens = assemble(tokens)
print(tokens)
def _handle_key_value(s, t):
    # Split an unquoted key=value token into a (key, value) pair; split
    # only on the first '=' so values may contain '='.
    return t.split('=', 1)

def _handle_word(s, t):
    # Bare words: '.name' is a CSS class, '#name' is an id, anything else
    # maps to itself (key == value).
    if t.startswith('.'):
        return '.', t[1:]
    if t.startswith('#'):
        return 'id', t[1:]
    return t, t

# Attribute-list tokenizer. Quote-handling callbacks are defined earlier
# in this file (not visible here). Rule order matters: quoted values must
# be tried before the unquoted key=value and bare-word rules.
_scanner = re.Scanner([
    (r'[^ =]+=".*?"', _handle_double_quote),
    (r"[^ =]+='.*?'", _handle_single_quote),
    (r'[^ =]+=[^ =]+', _handle_key_value),
    (r'[^ =]+', _handle_word),
    (r' ', None)
])

def get_attrs(str):
    """ Parse attribute list and return a list of attribute tuples. """
    # NOTE(review): the parameter name shadows the builtin 'str'; renaming
    # would be a (minor) interface change, so it is only flagged here.
    return _scanner.scan(str)[0]

def isheader(elem):
    # True for the six HTML heading tags.
    return elem.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

class AttrListTreeprocessor(Treeprocessor):
S_BOOL = lambda x, token: ['bool', bool(token) ] S_EMPTY = lambda x, token: ['empty', ''] S_STRING = lambda x, token: ['string', token] S_TRAILING = lambda x, token: ['trailing', None] class ArgumentError(Exception): """Thrown when args encounters a command line format error.""" pass SCANNER = re.Scanner([ (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}", S_EMAIL_ADDR), (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]", S_IP_ADDRESS), (r"-+[a-zA-Z0-9]+", S_OPTION), (r"True", S_BOOL), (r"[0-9]+", S_INT), (r"--", S_TRAILING), (r"[a-z\-]+", S_WORD), (r"\s", S_EMPTY), (r".+", S_STRING), ]) def match(tokens, of_type = None): """ Responsible for taking a token off and processing it, ensuring it is of the correct type. If of_type is None (the default) then you are asking for anything. """ # check the type (first element) if of_type:
# No known examples of starting with a space yet. # self.name, raw = raw.strip().partition(' ')[0::2] self.name, raw = raw.lstrip().partition(' ')[0::2] self.buf.append(raw) def finalize(self): self.instructions = ''.join(self.buf) del self.buf WORD, FLAG = 0, 1 scanner = re.Scanner( [ (r'\\\S{1}', lambda s, t: (t, FLAG)), # A flag of the form \x (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)), # Quoted word ( r'[^\s\\"]\S*', lambda s, t: (t, WORD) ), # A non-quoted word, must not start with a backslash or a space or a quote (r'\s+', None), ], flags=re.DOTALL) null = object() def parser(name, field_map, default_field_name=None): field_map = dict((x.split(':') for x in field_map.split())) def parse(raw, log=None): ans = {} last_option = None
class TemplateFormatter(string.Formatter):
    '''
    Provides a format function that substitutes '' for any missing value
    '''

    # Sentinel used by callers to validate templates; treated as '0' when
    # a numeric format type is applied.
    _validation_string = 'This Is Some Text THAT SHOULD be LONG Enough.%^&*'

    # Dict to do recursion detection. It is up the the individual get_value
    # method to use it. It is cleared when starting to format a template
    composite_values = {}

    def __init__(self):
        string.Formatter.__init__(self)
        self.book = None
        self.kwargs = None
        self.strip_results = True
        self.locals = {}

    def _do_format(self, val, fmt):
        # Apply a standard format spec to val, coercing to int/float when
        # the spec's type character requires it.
        if not fmt or not val:
            return val
        if val == self._validation_string:
            val = '0'
        typ = fmt[-1]
        if typ == 's':
            pass
        elif 'bcdoxXn'.find(typ) >= 0:
            try:
                val = int(val)
            except:
                raise ValueError(
                    _('format: type {0} requires an integer value, got {1}').format(typ, val))
        elif 'eEfFgGn%'.find(typ) >= 0:
            try:
                val = float(val)
            except:
                raise ValueError(
                    _('format: type {0} requires a decimal (float) value, got {1}').format(typ, val))
        return unicode(('{0:'+fmt+'}').format(val))

    def _explode_format_string(self, fmt):
        # Split 'prefix|fmt|suffix' conditional-text templates into their
        # three parts; on no match (or error) the whole string is the fmt.
        try:
            matches = self.format_string_re.match(fmt)
            if matches is None or matches.lastindex != 3:
                return fmt, '', ''
            return matches.groups()
        except:
            if DEBUG:
                traceback.print_exc()
            return fmt, '', ''

    format_string_re = re.compile(r'^(.*)\|([^\|]*)\|(.*)$', re.DOTALL)
    compress_spaces = re.compile(r'\s+')
    backslash_comma_to_comma = re.compile(r'\\,')

    # Splits a function-call argument list on unescaped commas.
    arg_parser = re.Scanner([
        (r',', lambda x,t: ''),
        (r'.*?((?<!\\),)', lambda x,t: t[:-1]),
        (r'.*?\)', lambda x,t: t[:-1]),
    ])

    # ################# 'Functional' template language ######################

    # Lexer for the template program language: 1=operator/punctuation,
    # 2=identifier, 3=constant; comments and whitespace are dropped.
    lex_scanner = re.Scanner([
        (r'[(),=;]', lambda x,t: (1, t)),
        (r'-?[\d\.]+', lambda x,t: (3, t)),
        (r'\$', lambda x,t: (2, t)),
        (r'\w+', lambda x,t: (2, t)),
        (r'".*?((?<!\\)")', lambda x,t: (3, t[1:-1])),
        (r'\'.*?((?<!\\)\')', lambda x,t: (3, t[1:-1])),
        (r'\n#.*?(?:(?=\n)|$)', None),
        (r'\s', None)
    ], flags=re.DOTALL)

    def _eval_program(self, val, prog, column_name):
        # keep a cache of the lex'ed program under the theory that re-lexing
        # is much more expensive than the cache lookup. This is certainly true
        # for more than a few tokens, but it isn't clear for simple programs.
        if tweaks['compile_gpm_templates']:
            # Compiled path: cached entries are compiled functions.
            if column_name is not None and self.template_cache is not None:
                lprog = self.template_cache.get(column_name, None)
                if lprog:
                    return lprog.evaluate(self, self.kwargs, self.book, self.locals)
                lprog = self.lex_scanner.scan(prog)
                compile_text = ('__funcs__ = formatter_functions().get_functions()\n'
                                'def evaluate(self, formatter, kwargs, book, locals):\n'
                               )
            else:
                lprog = self.lex_scanner.scan(prog)
                compile_text = None
            parser = _CompileParser(val, lprog, self, compile_text)
            val = parser.program()
            if parser.compile_text:
                global compile_counter
                compile_counter += 1
                f = compile_user_function("__A" + str(compile_counter), 'doc', -1, parser.compile_text)
                self.template_cache[column_name] = f
        else:
            # Interpreted path: cached entries are token lists.
            if column_name is not None and self.template_cache is not None:
                lprog = self.template_cache.get(column_name, None)
                if not lprog:
                    lprog = self.lex_scanner.scan(prog)
                    self.template_cache[column_name] = lprog
            else:
                lprog = self.lex_scanner.scan(prog)
            parser = _Parser(val, lprog, self)
            val = parser.program()
        return val

    # ################# Override parent classes methods #####################

    def get_value(self, key, args, kwargs):
        raise Exception('get_value must be implemented in the subclass')

    def format_field(self, val, fmt):
        # ensure we are dealing with a string.
        if isinstance(val, (int, float)):
            if val:
                val = unicode(val)
            else:
                val = ''
        # Handle conditional text
        fmt, prefix, suffix = self._explode_format_string(fmt)
        # Handle functions
        # First see if we have a functional-style expression
        if fmt.startswith('\''):
            p = 0
        else:
            p = fmt.find(':\'')
            if p >= 0:
                p += 1
        if p >= 0 and fmt[-1] == '\'':
            # Functional-style: everything between the quotes is a program.
            val = self._eval_program(val, fmt[p+1:-1], None)
            colon = fmt[0:p].find(':')
            if colon < 0:
                dispfmt = ''
            else:
                dispfmt = fmt[0:colon]
        else:
            # check for old-style function references
            p = fmt.find('(')
            dispfmt = fmt
            if p >= 0 and fmt[-1] == ')':
                colon = fmt[0:p].find(':')
                if colon < 0:
                    dispfmt = ''
                    colon = 0
                else:
                    dispfmt = fmt[0:colon]
                    colon += 1
                funcs = formatter_functions().get_functions()
                fname = fmt[colon:p].strip()
                if fname in funcs:
                    func = funcs[fname]
                    if func.arg_count == 2:
                        # only one arg expected. Don't bother to scan. Avoids need
                        # for escaping characters
                        args = [fmt[p+1:-1]]
                    else:
                        args = self.arg_parser.scan(fmt[p+1:])[0]
                        args = [self.backslash_comma_to_comma.sub(',', a) for a in args]
                    if (func.arg_count == 1 and (len(args) != 1 or args[0])) or \
                            (func.arg_count > 1 and func.arg_count != len(args)+1):
                        raise ValueError('Incorrect number of arguments for function '+ fmt[0:p])
                    if func.arg_count == 1:
                        val = func.eval_(self, self.kwargs, self.book, self.locals, val)
                        if self.strip_results:
                            val = val.strip()
                    else:
                        val = func.eval_(self, self.kwargs, self.book, self.locals, val, *args)
                        if self.strip_results:
                            val = val.strip()
                else:
                    return _('%s: unknown function')%fname
        if val:
            val = self._do_format(val, dispfmt)
        if not val:
            return ''
        return prefix + val + suffix

    def evaluate(self, fmt, args, kwargs):
        # 'program:' templates bypass vformat and run the template language.
        if fmt.startswith('program:'):
            ans = self._eval_program(kwargs.get('$', None), fmt[8:], self.column_name)
        else:
            ans = self.vformat(fmt, args, kwargs)
        if self.strip_results:
            return self.compress_spaces.sub(' ', ans).strip()
        return ans

    # ######### a formatter that throws exceptions ############

    def unsafe_format(self, fmt, kwargs, book, strip_results=True):
        self.strip_results = strip_results
        self.column_name = self.template_cache = None
        self.kwargs = kwargs
        self.book = book
        self.composite_values = {}
        self.locals = {}
        return self.evaluate(fmt, [], kwargs)

    # ######### a formatter guaranteed not to throw an exception ############

    def safe_format(self, fmt, kwargs, error_value, book, column_name=None,
                    template_cache=None, strip_results=True):
        self.strip_results = strip_results
        self.column_name = column_name
        self.template_cache = template_cache
        self.kwargs = kwargs
        self.book = book
        self.composite_values = {}
        self.locals = {}
        try:
            ans = self.evaluate(fmt, [], kwargs)
        except Exception as e:
            if DEBUG:  # and getattr(e, 'is_locking_error', False):
                traceback.print_exc()
            if column_name:
                prints('Error evaluating column named:', column_name)
            # On any failure, return the caller-supplied error value plus
            # the exception message instead of raising.
            ans = error_value + ' ' + e.message
        return ans
# Python types treated as numbers by this driver (py2: includes long).
_number_types = frozenset((int, long, float))

# Cassandra encodes some names as hex strings; unhexlify decodes them.
_name_from_hex_string = unhexlify


def trim_if_startswith(s, prefix):
    """Return s with prefix removed if present, else s unchanged."""
    if s.startswith(prefix):
        return s[len(prefix):]
    return s


# Registries populated by the metaclass below: Cassandra type name -> class
# and CQL type name -> class.
_casstypes = {}
_cqltypes = {}


# Tokenizer for CQL type strings: keeps bare identifiers, drops the
# 'frozen' keyword, whitespace, commas and angle brackets.
cql_type_scanner = re.Scanner((
    ('frozen', None),
    (r'[a-zA-Z0-9_]+', lambda s, t: t),
    (r'[\s,<>]', None),
))


def cql_types_from_string(cql_type):
    """Return the flat list of type names appearing in a CQL type string."""
    return cql_type_scanner.scan(cql_type)[0]


class CassandraTypeType(type):
    """ The CassandraType objects in this module will normally be used
    directly, rather than through instances of those types. They can be
    instantiated, of course, but the type information is what this driver
    mainly needs. This metaclass registers CassandraType classes in the global
    PROFILE_TYPE_COLOURS, PROFILE_MALFORMED, SUPPORTED_PROFILE_TYPES, \
    HEADER_ATTRS, HEADER_COMMIT_COLOUR, HEADER_INFO_COLOUR, HEADER_SLASH_COLOUR, \
    DESC_COMMIT_ATTRS, DESC_COMMIT_COLOUR, PROFILE_DELIMITER, ID_TYPE_COLOUR
from perun.utils.log import cprint, cprintln
import perun.vcs as vcs

# Init colorama for multiplatform colours
colorama.init()

# Matches untracked profile files named <collector>-<timestamp>.perf
UNTRACKED_REGEX = \
    re.compile(r"([^\\]+)-([0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{2}).perf")
# Regex for parsing the formating tag [<tag>:<size>f<fill_char>]
FMT_REGEX = re.compile("[[]([a-zA-Z]+)(:[0-9]+)?(f.)?[]]")
# Scanner for parsing formating strings, i.e. breaking it to parts:
# bracketed tags become ("fmt_string", token), everything between them
# becomes ("rest", token).
FMT_SCANNER = re.Scanner([
    (r"[[]([a-zA-Z]+)(:[0-9]+)?(f.)?[]]", lambda scanner, token: ("fmt_string", token)),
    (r"[^][]*", lambda scanner, token: ("rest", token)),
])


def lookup_minor_version(func):
    """If the minor_version is not given by the caller, it looks up the HEAD in the repo.

    If the @p func is called with minor_version parameter set to None,
    then this decorator performs a lookup of the minor_version corresponding
    to the head of the repository.

    Arguments:
        func(function): decorated function for which we will lookup the minor_version

    Returns:
# ASN.1 tokenizer. All tokenize_* callbacks are defined elsewhere in this
# file. NOTE(review): reconstructed from a collapsed line -- the lone '#'
# markers are read as section separators (they sit at the start of each
# logical rule group); confirm none was meant to disable the rule after it.
# NOTE(review): the empty-pattern placeholder rules near the end
# ((r'', tokenize_real) etc.) match zero-width at any position, which makes
# every later rule unreachable and stops re.Scanner's scan loop early.
ASN1Scanner = re.Scanner([
    #
    (r'(--).*(\n|(--))', tokenize_comment),
    (r'(/\*).*(\*/)', tokenize_comment),
    (r'".*(?<!")"(?!")', tokenize_charstr),
    #
    (r'::=', tokenize_definition),
    (r':', tokenize_colon),
    (r'=', tokenize_equal),
    (r',', tokenize_comma),
    (r'\(|\)', tokenize_parenth),
    (r'\[{2}|\]{2}', tokenize_dbracket),
    (r'\[|\]', tokenize_bracket),
    (r'\{|\}', tokenize_curlyb),
    (r'\.\.\.', tokenize_tdot),
    (r'\.\.', tokenize_ddot),
    (r'\.', tokenize_dot),
    #(r'\.\s{0,}&', tokenize_dotamper),
    (r'\||(?:UNION)', tokenize_union),
    (r'\^|(?:INTERSECTION)', tokenize_intersect),
    (r'EXCEPT', tokenize_complement),
    (r'<', tokenize_lthan),
    (r'>', tokenize_gthan),
    (r'@', tokenize_arrowb),
    (r'\!', tokenize_exclam),
    (r"'", tokenize_apost),
    (r'"', tokenize_quote),  # should expand to get the whole string
    # (r'{0}'.format(_RE_NATIVE_TYPES), tokenize_typenative),
    (r'[A-Z][A-Z0-9\-]{0,}', tokenize_classref),
    (r'[A-Z][a-zA-Z0-9\-]{0,}', tokenize_typeref),
    (r'&[a-zA-Z]{1}[a-zA-Z0-9\-]{0,}', tokenize_fieldref),
    (r'[a-z]{1}[a-zA-Z0-9\-]{0,}', tokenize_identifier),
    #
    (r'UNIVERSAL', tokenize_univers),
    (r'APPLICATION', tokenize_applic),
    (r'PRIVATE', tokenize_priv),
    #
    (r'ALL', tokenize_all),
    (r'MIN', tokenize_min),
    (r'MAX', tokenize_max),
    (r'MINUS-INFINITY', tokenize_mininf),
    (r'PLUS-INFINITY', tokenize_plusinf),
    (r'NULL', tokenize_null),
    (r'(?:FALSE)|(?:TRUE)', tokenize_bool),
    (r'(?:[+\-]{0,1}\s{0,})[0-9]{1,}', tokenize_integer),
    (r'', tokenize_real),
    (r'', tokenize_bstring),
    (r'', tokenize_hstring),
    #(r'', tokenize_),
    #(r'', tokenize_),
    #(r'', tokenize_),
    (r'\s{1,}', None),
])
def tokenize_noop(scanner, token): return token # translate strftime spec into mostly equivalent PostgreSQL spec _scanner = re.Scanner([ (py, tokenize_noop) for py in _strftime_to_postgresql_rules.keys() ] + [ # "%e" is in the C standard and Python actually generates this if your spec # contains "%c" but we don't officially support it as a specifier so we # need to special case it in the scanner (r'%e', tokenize_noop), # double quotes need to be escaped (r'"', lambda scanner, token: re.escape(token)), # spaces should be greedily consumed and kept (r'\s+', tokenize_noop), (r'[%s]' % re.escape(string.punctuation), tokenize_noop), # everything else except double quotes and spaces (r'[^%s\s]+' % re.escape(string.punctuation), tokenize_noop), ]) _lexicon_values = frozenset(_strftime_to_postgresql_rules.values()) _strftime_blacklist = frozenset(['%w', '%U', '%c', '%x', '%X', '%e'])
class Modifier(object):
    """
    An expression in a macro primitive.

    We interpret gerber expressions according to the following grammar,
    rooted at TOP:

        TOP -> number ESUB
        TOP -> minus number ESUB
        TOP -> variable TOPE
        TOPE ->
        TOPE -> equate E
        TOPE -> op E
        E -> variable ESUB
        E -> number ESUB
        E -> minus number ESUB
        ESUB ->
        ESUB -> op E

    All numerical operations are given the same precedence and are grouped
    right to left.
    """

    # Lexer: every lexeme becomes a Token(type, value). Note that integer
    # literals also become 'number' floats, and operators are lowercased
    # ('X' and 'x' both mean multiply).
    scanner = re.Scanner([
        (r'\d*\.\d+', lambda s, tok: Token('number', float(tok))),
        (r'\d+', lambda s, tok: Token('number', float(tok))),
        (r'\$\d+', lambda s, tok: Token('variable', int(tok[1:]))),
        (r'[\+xX/-]', lambda s, tok: Token('op', tok.lower())),
        (r'=', lambda s, tok: Token('equate', None)),
    ])

    def __init__(self, expr):
        # Tokenize eagerly; any unscannable remainder means the expression
        # is malformed.
        self.tokens, remainder = self.scanner.scan(expr)
        if remainder:
            raise InvalidExpression(expr)

    def evaluate(self, values):
        """
        Evaluate the expression given a values dict and return the
        resulting value. The values dict may be modified in the process
        (by an equate expression).
        """
        stack = []
        self._evaluate_top(list(self.tokens), stack, values)
        return self._pop_value(stack, values)

    def _evaluate_top(self, tokens, stack, values):
        """ Evaluate the TOP production. """
        token = tokens.pop(0)
        if token.type == 'number':
            stack.append(token)
            self._evaluate_sube(tokens, stack, values)
        elif token.type == 'variable':
            stack.append(token)
            self._evaluate_tope(tokens, stack, values)
        elif token == ('op', '-'):
            # Unary minus: must be immediately followed by a number.
            if not tokens:
                raise InvalidExpression(token, stack, values)
            num = tokens.pop(0)
            if num.type != 'number':
                raise InvalidExpression(token, num, stack, values)
            stack.append(Token('number', -num.value))
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_tope(self, tokens, stack, values):
        """ Evaluate the TOPE production. """
        if not tokens:
            return
        token = tokens.pop(0)
        if token.type == 'equate':
            # Assignment: store the RHS value under the variable on the
            # stack, and leave the assigned value as the result.
            var = stack.pop()
            self._evaluate_e(tokens, stack, values)
            values[var.value] = self._pop_value(stack, values)
            stack.append(Token('number', values[var.value]))
        elif token.type == 'op':
            self._evaluate_e(tokens, stack, values)
            self._evaluate_op(token, stack, values)
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_e(self, tokens, stack, values):
        """ Evaluate the E production. """
        token = tokens.pop(0)
        if token.type in ('number', 'variable'):
            stack.append(token)
            self._evaluate_sube(tokens, stack, values)
        elif token == ('op', '-'):
            # Unary minus, same constraint as in TOP.
            if not tokens:
                raise InvalidExpression(token, stack, values)
            num = tokens.pop(0)
            if num.type != 'number':
                raise InvalidExpression(token, num, stack, values)
            stack.append(Token('number', -num.value))
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_sube(self, tokens, stack, values):
        """ Evaluate the SUBE production. """
        if not tokens:
            return
        token = tokens.pop(0)
        if token.type == 'op':
            self._evaluate_e(tokens, stack, values)
            self._evaluate_op(token, stack, values)
        else:
            raise InvalidExpression(token, tokens)

    def _evaluate_op(self, operand, stack, values):
        """
        Evaluate the given operand and push the value onto the stack.
        """
        # Note operand order: val1 was pushed before val2.
        val2 = self._pop_value(stack, values)
        val1 = self._pop_value(stack, values)
        if operand.value == '+':
            val = val1 + val2
        elif operand.value == '-':
            val = val1 - val2
        elif operand.value == 'x':
            val = val1 * val2
        elif operand.value == '/':
            val = val1 / val2
        else:
            raise InvalidExpression(operand, stack)
        stack.append(Token('number', val))

    def _pop_value(self, stack, values):
        """
        Pop a value from the stack and return it, replacing variables with
        their values.
        """
        token = stack.pop()
        if token.type == 'number':
            return token.value
        elif token.type == 'variable':
            return values[token.value]
        else:
            raise InvalidExpression(token, stack, values)
class TemplateFormatter(string.Formatter):
    '''
    Provides a format function that substitutes '' for any missing value
    '''

    # Sentinel used by callers to validate templates; treated as '0' when
    # a numeric format type is applied.
    _validation_string = 'This Is Some Text THAT SHOULD be LONG Enough.%^&*'

    # Dict to do recursion detection. It is up to the individual get_value
    # method to use it. It is cleared when starting to format a template
    composite_values = {}

    def __init__(self):
        string.Formatter.__init__(self)
        self.book = None
        self.kwargs = None
        self.strip_results = True
        self.locals = {}
        self.funcs = formatter_functions().get_functions()
        self.gpm_parser = _Parser()
        self.gpm_interpreter = _Interpreter()

    def _do_format(self, val, fmt):
        # Apply a standard format spec to val, coercing to int/float when
        # the spec's type character requires it.
        if not fmt or not val:
            return val
        if val == self._validation_string:
            val = '0'
        typ = fmt[-1]
        if typ == 's':
            pass
        elif 'bcdoxXn'.find(typ) >= 0:
            try:
                val = int(val)
            except Exception:
                raise ValueError(
                    _('format: type {0} requires an integer value, got {1}').format(typ, val))
        elif 'eEfFgGn%'.find(typ) >= 0:
            try:
                val = float(val)
            except:
                raise ValueError(
                    _('format: type {0} requires a decimal (float) value, got {1}').format(typ, val))
        return unicode_type(('{0:' + fmt + '}').format(val))

    def _explode_format_string(self, fmt):
        # Split 'prefix|fmt|suffix' conditional-text templates into their
        # three parts; on no match (or error) the whole string is the fmt.
        try:
            matches = self.format_string_re.match(fmt)
            if matches is None or matches.lastindex != 3:
                return fmt, '', ''
            return matches.groups()
        except:
            if DEBUG:
                traceback.print_exc()
            return fmt, '', ''

    format_string_re = re.compile(r'^(.*)\|([^\|]*)\|(.*)$', re.DOTALL)
    compress_spaces = re.compile(r'\s+')
    backslash_comma_to_comma = re.compile(r'\\,')

    # Splits a function-call argument list on unescaped commas.
    arg_parser = re.Scanner([
        (r',', lambda x, t: ''),
        (r'.*?((?<!\\),)', lambda x, t: t[:-1]),
        (r'.*?\)', lambda x, t: t[:-1]),
    ])

    # ################# 'Functional' template language ######################

    # Lexer for the template program language; token tags come from the
    # _Parser LEX_* constants.
    lex_scanner = re.Scanner(
        [
            (r'(==#|!=#|<=#|<#|>=#|>#)', lambda x, t: (_Parser.LEX_NUMERIC_INFIX, t)),
            (r'(==|!=|<=|<|>=|>)', lambda x, t: (_Parser.LEX_STRING_INFIX, t)),  # noqa
            (r'(if|then|else|fi)\b', lambda x, t: (_Parser.LEX_KEYWORD, t)),  # noqa
            (r'[(),=;]', lambda x, t: (_Parser.LEX_OP, t)),  # noqa
            (r'-?[\d\.]+', lambda x, t: (_Parser.LEX_CONST, t)),  # noqa
            (r'\$', lambda x, t: (_Parser.LEX_ID, t)),  # noqa
            (r'\w+', lambda x, t: (_Parser.LEX_ID, t)),  # noqa
            (r'".*?((?<!\\)")', lambda x, t: (_Parser.LEX_CONST, t[1:-1])),  # noqa
            (r'\'.*?((?<!\\)\')', lambda x, t: (_Parser.LEX_CONST, t[1:-1])),  # noqa
            (r'\n#.*?(?:(?=\n)|$)', None),
            (r'\s', None),
        ], flags=re.DOTALL)

    def _eval_program(self, val, prog, column_name):
        # Parse trees are cached per column when a cache is available;
        # otherwise parse afresh and interpret directly.
        if column_name is not None and self.template_cache is not None:
            tree = self.template_cache.get(column_name, None)
            if not tree:
                tree = self.gpm_parser.program(self, self.funcs, self.lex_scanner.scan(prog))
                self.template_cache[column_name] = tree
        else:
            tree = self.gpm_parser.program(self, self.funcs, self.lex_scanner.scan(prog))
        return self.gpm_interpreter.program(self.funcs, self, tree, val)

    def _eval_sfm_call(self, template_name, args):
        # Call a stored (saved-template) function, caching its parse tree
        # on the function object itself.
        func = self.funcs[template_name]
        tree = func.cached_parse_tree
        if tree is None:
            tree = self.gpm_parser.program(
                self, self.funcs,
                self.lex_scanner.scan(func.program_text[len('program:'):]))
            func.cached_parse_tree = tree
        return self.gpm_interpreter.program(self.funcs, self, tree, None,
                                            is_call=True, args=args)

    # ################# Override parent classes methods #####################

    def get_value(self, key, args, kwargs):
        raise Exception('get_value must be implemented in the subclass')

    def format_field(self, val, fmt):
        # ensure we are dealing with a string.
        if isinstance(val, numbers.Number):
            if val:
                val = unicode_type(val)
            else:
                val = ''
        # Handle conditional text
        fmt, prefix, suffix = self._explode_format_string(fmt)
        # Handle functions
        # First see if we have a functional-style expression
        if fmt.startswith('\''):
            p = 0
        else:
            p = fmt.find(':\'')
            if p >= 0:
                p += 1
        if p >= 0 and fmt[-1] == '\'':
            # Functional-style: everything between the quotes is a program.
            val = self._eval_program(val, fmt[p + 1:-1], None)
            colon = fmt[0:p].find(':')
            if colon < 0:
                dispfmt = ''
            else:
                dispfmt = fmt[0:colon]
        else:
            # check for old-style function references
            p = fmt.find('(')
            dispfmt = fmt
            if p >= 0 and fmt[-1] == ')':
                colon = fmt[0:p].find(':')
                if colon < 0:
                    dispfmt = ''
                    colon = 0
                else:
                    dispfmt = fmt[0:colon]
                    colon += 1
                fname = fmt[colon:p].strip()
                if fname in self.funcs:
                    func = self.funcs[fname]
                    if func.arg_count == 2:
                        # only one arg expected. Don't bother to scan. Avoids need
                        # for escaping characters
                        args = [fmt[p + 1:-1]]
                    else:
                        args = self.arg_parser.scan(fmt[p + 1:])[0]
                        args = [
                            self.backslash_comma_to_comma.sub(',', a) for a in args
                        ]
                    if not func.is_python:
                        # Stored template: val is passed as the first arg.
                        args.insert(0, val)
                        val = self._eval_sfm_call(fname, args)
                    else:
                        if (func.arg_count == 1 and (len(args) != 1 or args[0])) or \
                                (func.arg_count > 1 and func.arg_count != len(args)+1):
                            raise ValueError(
                                _('Incorrect number of arguments for function {0}').format(fname))
                        if func.arg_count == 1:
                            val = func.eval_(self, self.kwargs, self.book, self.locals, val)
                            if self.strip_results:
                                val = val.strip()
                        else:
                            val = func.eval_(self, self.kwargs, self.book, self.locals, val, *args)
                            if self.strip_results:
                                val = val.strip()
                else:
                    return _('%s: unknown function') % fname
        if val:
            val = self._do_format(val, dispfmt)
        if not val:
            return ''
        return prefix + val + suffix

    def evaluate(self, fmt, args, kwargs):
        # 'program:' templates bypass vformat and run the template language.
        if fmt.startswith('program:'):
            ans = self._eval_program(kwargs.get('$', None), fmt[8:], self.column_name)
        else:
            ans = self.vformat(fmt, args, kwargs)
        if self.strip_results:
            return self.compress_spaces.sub(' ', ans).strip()
        return ans

    # ######### a formatter that throws exceptions ############

    def unsafe_format(self, fmt, kwargs, book, strip_results=True):
        self.strip_results = strip_results
        self.column_name = self.template_cache = None
        self.kwargs = kwargs
        self.book = book
        self.composite_values = {}
        self.locals = {}
        return self.evaluate(fmt, [], kwargs)

    # ######### a formatter guaranteed not to throw an exception ############

    def safe_format(self, fmt, kwargs, error_value, book, column_name=None,
                    template_cache=None, strip_results=True, template_functions=None):
        self.strip_results = strip_results
        self.column_name = column_name
        self.template_cache = template_cache
        self.kwargs = kwargs
        self.book = book
        # Caller may supply a custom function table (e.g. per-library).
        if template_functions:
            self.funcs = template_functions
        else:
            self.funcs = formatter_functions().get_functions()
        self.composite_values = {}
        self.locals = {}
        try:
            ans = self.evaluate(fmt, [], kwargs)
        except Exception as e:
            if DEBUG:  # and getattr(e, 'is_locking_error', False):
                traceback.print_exc()
            if column_name:
                prints('Error evaluating column named:', column_name)
            # On any failure, return the caller-supplied error value plus
            # the exception message instead of raising.
            ans = error_value + ' ' + error_message(e)
        return ans
class ChopGrammar:
    """
    Parses the ChopShop command-line grammar: semicolon-separated chains of
    module invocations joined by pipes, with parenthesized "tees" that fan a
    chain out to several sub-chains. Builds a graph of __ChopModule__ nodes.
    """
    #TODO Add support for escaped sequences?
    # Tokenizer for the grammar. Rule order matters: quoted strings first,
    # then punctuation, then the general STRING/OPTION rules.
    scanner=re.Scanner([
        (r'"((?:[^\t\n\r\f\v"])*)"', lambda scanner, token:("QUOTED", token)),
        (r"'((?:[^\t\n\r\f\v'])*)'", lambda scanner, token:("QUOTED", token)),
        (r"[ ]", lambda scanner, token:("SPACE", token)),
        (r"\;", lambda scanner, token:("SEMICOLON", token)),
        (r"\(", lambda scanner, token:("BTEE", token)),
        (r"\)", lambda scanner, token:("ETEE", token)),
        (r"\|", lambda scanner, token:("PIPE", token)),
        (r"\,", lambda scanner, token:("COMMA", token)),
        (r"[^\t\n\r\f\v'\";()|,-][^ \t\n\r\f\v'\";()|,]*", lambda scanner, token:("STRING", token)),
        (r"--[a-zA-Z0-9_-]+", lambda scanner, token:("OPTION", token)),
        (r"-[a-zA-Z0-9]+", lambda scanner, token:("OPTION", token)),
        (r"-", lambda scanner, token:("STRING", token)),
    ])

    def __init__(self):
        self.top_modules = []  # Roots of the module graph.
        self.all_modules = []  # Every module seen, in creation order.
        self.strbuff = None  # Scratch StringIO used by get_family.

    def parseGrammar(self, grammar_string):
        # Tokenize, drop SPACE tokens, then build the module graph.
        results, remainder = self.scanner.scan(grammar_string)
        if remainder:
            return (None, None)
        nresults = []
        for token in results:
            if token[0] != "SPACE":
                nresults.append(token)
        results = nresults
        self.verify_chains(results)
        return self.all_modules

    def find_tee_end(self, chain, left):
        # Return the index of the ETEE matching the BTEE at 'left',
        # tracking nesting with a stack.
        btee_stack = [True]
        #Assume left is the position of BTEE
        right = left + 1
        while right < len(chain):
            if chain[right][0] == "BTEE":
                btee_stack.append(True)
            elif chain[right][0] == "ETEE":
                if not len(btee_stack):
                    #there's no cooresponding BTEE
                    raise Exception("Unexpected End Tee token ')'")
                    #return left #error
                if len(btee_stack) == 1:
                    #this is the ETEE we're looking for
                    return right
                btee_stack.pop()
            right += 1
        raise Exception("Unable to find end of Tee")
        #return left #error

    def verify_chains(self, chains):
        # Split the token stream on SEMICOLON and verify each chain;
        # chain roots accumulate into top_modules.
        left = 0
        right= 0
        flows = []
        #get chain
        #pdb.set_trace()
        while right < len(chains):
            while right < len(chains) and chains[right][0] != "SEMICOLON":
                right += 1
            chain = chains[left:right]
            right += 1
            left = right
            (ancestors, children) = self.verify_chain(chain)
            flows.extend(ancestors)
        self.top_modules = flows
        return True

    def verify_chain(self, chain):
        # Walk one chain, splitting on PIPE and recursing into tees.
        # Returns (ancestors, parents): the chain's roots and its current
        # leaf set (for linking to whatever follows).
        left = 0
        right= 0
        ancestors = []
        parents = []
        #get chain
        while right < len(chain):
            while right < len(chain) and (chain[right][0] != "PIPE" and chain[right][0] != "BTEE"):
                right += 1
            if right >= len(chain) or chain[right][0] == "PIPE":
                #Assume Invocation
                invocation = chain[left:right]
                mod = self.verify_invocation(invocation)
                if len(parents) == 0:
                    parents.append(mod)
                    ancestors.append(mod)
                else:
                    for parent in parents:
                        parent.children.append(mod)
                        mod.parents.append(parent)
                    parents = [mod]
            elif chain[right][0] == "BTEE":
                #Must find end of TEE
                if left != right:
                    raise Exception("Unexpected Tee")
                #left = right
                right = self.find_tee_end(chain, left)
                tee = chain[left + 1: right]
                #Remove the TEE elements
                if (right + 1) < len(chain):
                    #There's more tokens after the end of the tee
                    if chain[right + 1][0] != "PIPE":
                        raise Exception('Unexpected token after TEE', chain[right + 1][0])
                    else:
                        right += 1
                (tparents, tchildren) = self.verify_tee(tee)
                if len(parents) == 0:
                    parents = tchildren
                    ancestors = tparents
                else:
                    for parent in parents:
                        for tparent in tparents:
                            parent.children.append(tparent)
                            tparent.parents.append(parent)
                    parents = tchildren
            right += 1
            left = right
        #return True
        return (ancestors,parents)

    def verify_tee(self, tee):
        # A tee is a comma-separated list of chains; collect every chain's
        # roots as parents and leaves as children. At least one COMMA is
        # required (a tee of one element is meaningless).
        left = 0
        right = 0
        comma = False
        parents = []
        children = []
        while right < len(tee):
            while right < len(tee) and (tee[right][0] != "COMMA" and tee[right][0] != "BTEE"):
                right += 1
            if right >= len(tee) or tee[right][0] == "COMMA":
                #Element of TEE, i.e., a chain
                if right < len(tee) and tee[right][0] == 'COMMA':
                    comma = True
                chain = tee[left:right]
                (cparents, cchildren) = self.verify_chain(chain)
                for cparent in cparents:
                    parents.append(cparent)
                for cchild in cchildren:
                    children.append(cchild)
            elif tee[right][0] == "BTEE":
                #TEE in the Chain, need to skip it to find the comma
                right = self.find_tee_end(tee,right)
                continue
            right += 1
            left = right
        if not comma:
            raise Exception('Usage of a Tee requires at least two elements')
        return (parents, children)

    def verify_invocation(self, invocation):
        # An invocation is STRING [OPTION [QUOTED|STRING]]* [QUOTED|STRING];
        # build the module and collect its argument list.
        right = 1
        if invocation[0][0] != "STRING":
            raise Exception("Invocation must begin with a 'STRING' token, not a %s token" % invocation[0][0])
        mymod = __ChopModule__(invocation[0][1].rstrip())
        while right < len(invocation):
            if invocation[right][0] == "OPTION":
                mymod.arguments.append(invocation[right][1].rstrip())
                if (right + 1) < len(invocation):
                    #Check if the next element is the argument to the option
                    if invocation[right + 1][0] == "QUOTED":
                        #Need to strip the quotes
                        mymod.arguments.append(invocation[right + 1][1].rstrip()[1:-1])
                        right += 1 #skip the parameter
                    elif invocation[right + 1][0] == "STRING":
                        mymod.arguments.append(invocation[right + 1][1].rstrip())
                        right += 1 #skip the parameter
                    #If not, just skip it and let it be parsed out
            elif (invocation[right][0] == "QUOTED"):
                if (right + 1) < len(invocation):
                    raise Exception("QUOTED token must be last element of invocation or following a OPTION token")
                #Need to remove the quotes from the quoted string
                mymod.arguments.append(invocation[right][1].rstrip()[1:-1])
            elif (invocation[right][0] == "STRING"):
                if (right + 1) < len(invocation):
                    raise Exception("STRING token must be last element of invocation or following a OPTION token")
                mymod.arguments.append(invocation[right][1].rstrip())
            else:
                raise Exception("Unexpected %s token %s" % (invocation[right][0], invocation[right][1]))
            right += 1
        self.all_modules.append(mymod)
        return mymod

    def get_family_(self, top, tabs = 0):
        # Recursive helper: write one indented node line per module into
        # self.strbuff.
        for i in range(0, tabs):
            self.strbuff.write("\t")
        self.strbuff.write("%s -->\n" % top.name)
        if len(top.children):
            for child in top.children:
                self.get_family_(child, tabs + 1)

    def get_family(self, top):
        # Render one module subtree to a string via a fresh StringIO.
        if self.strbuff is not None:
            self.strbuff.close()
        self.strbuff = StringIO()
        self.get_family_(top)
        output = self.strbuff.getvalue()
        self.strbuff.close()
        return output

    def get_tree(self):
        # Render every top-level module's subtree, newline-separated.
        output = ""
        for t in self.top_modules:
            output += self.get_family(t) + "\n"
        return output

    def print_family(self, top, tabs = 0):
        # Like get_family_, but printed directly to stdout (py2 prints).
        #print Self
        for i in range (0, tabs):
            print "\t",
        print top.name, "-->"
        if len(top.children):
            for child in top.children:
                self.print_family(child, tabs + 1)

    def print_tree(self):
        for t in self.top_modules:
            self.print_family(t)
def pop_bone_context(): global bone_context bone_context = bone_context[:-1] return bone_context[len(bone_context) - 1] reserved = ["HIERARCHY", "ROOT", "OFFSET", "CHANNELS", "MOTION"] channel_names = [ "Xposition", "Yposition", "Zposition", "Zrotation", "Xrotation", "Yrotation" ] scanner = re.Scanner([ (r"[a-zA-Z_]\w*", identifier), (r"-*[0-9]+(\.[0-9]+)?", digit), (r"}", close_brace), (r"{", open_brace), (r":", None), (r"\s+", None), ]) def read_offset(bvh, token_index): if (bvh[token_index] != ("IDENT", "OFFSET")): return None, None token_index = token_index + 1 offsets = [0.0] * 3 for i in range(0, 3): offsets[i] = float(bvh[token_index][1]) token_index = token_index + 1 return offsets, token_index
# ASN.1 lexer. Rules are tried in listed order, so longer / more specific
# patterns (comments, strings, multi-char punctuation, keywords) must come
# before the generic identifier and whitespace rules.
#
# FIXES in this revision:
#  * the REScannerReal / REScannerInt patterns had their token tags swapped
#    (real literals were tagged TOK_INT and integers TOK_REAL);
#  * the TOK_HID pattern was byte-identical to the TOK_ID pattern, which
#    made the TOK_ID rule unreachable; TOK_HID now matches all-uppercase
#    identifiers only, letting mixed-case uppercase-initial names reach
#    the TOK_ID rule.
REScannerASN1 = re.Scanner(
    [
    # comments and character strings
    (r'(--).*?([%s]|(--)|$)' % _NL, lambda s, t: (TOK_CMT, t)),
    (r'(/\*).*?(\*/)', lambda s, t: (TOK_CMT, t)),
    (r'".*?(?<!")"(?!")', lambda s, t: (TOK_CSTR, t)),
    # punctuation: multi-character sequences first
    (r'::=', lambda s, t: TOK_ASSI),
    (r':', lambda s, t: TOK_COL),
    (r';', lambda s, t: TOK_SCOL),
    (r'=', lambda s, t: TOK_EQU),
    (r',', lambda s, t: TOK_COM),
    (r'\(', lambda s, t: TOK_PARO),
    (r'\)', lambda s, t: TOK_PARC),
    (r'\[{2}', lambda s, t: TOK_DBRAO),
    (r'\]{2}', lambda s, t: TOK_DBRAC),
    (r'\[', lambda s, t: TOK_BRAO),
    (r'\]', lambda s, t: TOK_BRAC),
    (r'\{', lambda s, t: TOK_CBRAO),
    (r'\}', lambda s, t: TOK_CBRAC),
    (r'\.\.\.', lambda s, t: TOK_TDOT),
    (r'\.\.', lambda s, t: TOK_DDOT),
    (r'\.', lambda s, t: TOK_DOT),
    (r'\||(?:UNION%s)' % _EXC, lambda s, t: TOK_UNIO),
    (r'\^|(?:INTERSECTION%s)' % _EXC, lambda s, t: TOK_INTER),
    (r'<', lambda s, t: TOK_LTHAN),
    (r'>', lambda s, t: TOK_GTHAN),
    (r'@', lambda s, t: TOK_ARRO),
    (r'\!', lambda s, t: TOK_EXCL),
    # reserved words (each guarded by _EXC so e.g. COMPONENT does not
    # swallow the prefix of COMPONENTS)
    (r'ABSENT%s' % _EXC, lambda s, t: TOK_ABS),
    (r'ALL%s' % _EXC, lambda s, t: TOK_ALL),
    (r'APPLICATION%s' % _EXC, lambda s, t: TOK_TAPP),
    (r'AUTOMATIC%s' % _EXC, lambda s, t: TOK_AUTO),
    (r'BEGIN%s' % _EXC, lambda s, t: TOK_BEG),
    (r'BY%s' % _EXC, lambda s, t: TOK_BY),
    (r'COMPONENT%s' % _EXC, lambda s, t: TOK_COMP),
    (r'COMPONENTS%s' % _EXC, lambda s, t: TOK_COMPS),
    (r'CONSTRAINED%s' % _EXC, lambda s, t: TOK_CONST),
    (r'CONTAINING%s' % _EXC, lambda s, t: TOK_CONT),
    (r'DEFAULT%s' % _EXC, lambda s, t: TOK_DEF),
    (r'DEFINITIONS%s' % _EXC, lambda s, t: TOK_DEFI),
    (r'ENCODED%s' % _EXC, lambda s, t: TOK_ENC),
    (r'END%s' % _EXC, lambda s, t: TOK_END),
    (r'EXCEPT%s' % _EXC, lambda s, t: TOK_EXCE),
    (r'EXPLICIT%s' % _EXC, lambda s, t: TOK_TEXP),
    (r'EXPORTS%s' % _EXC, lambda s, t: TOK_EXP),
    (r'EXTENSIBILITY%sIMPLIED%s' % (REScannerSNL, _EXC), lambda s, t: TOK_EXTI),
    (r'FALSE%s' % _EXC, lambda s, t: TOK_FALS),
    (r'FROM%s' % _EXC, lambda s, t: TOK_FROM),
    (r'IMPLICIT%s' % _EXC, lambda s, t: TOK_TIMP),
    (r'IMPORTS%s' % _EXC, lambda s, t: TOK_IMP),
    (r'INCLUDES%s' % _EXC, lambda s, t: TOK_INCL),
    (r'MAX%s' % _EXC, lambda s, t: TOK_MAX),
    (r'MIN%s' % _EXC, lambda s, t: TOK_MIN),
    (r'MINUS-INFINITY%s' % _EXC, lambda s, t: TOK_MINF),
    (r'NOT-A-NUMBER%s' % _EXC, lambda s, t: TOK_NAN),
    (r'NULL%s' % _EXC, lambda s, t: TOK_NULL),
    (r'OF%s' % _EXC, lambda s, t: TOK_OF),
    (r'OPTIONAL%s' % _EXC, lambda s, t: TOK_OPT),
    (r'PATTERN%s' % _EXC, lambda s, t: TOK_PAT),
    (r'PLUS-INFINITY%s' % _EXC, lambda s, t: TOK_PINF),
    (r'PRESENT%s' % _EXC, lambda s, t: TOK_PRES),
    (r'PRIVATE%s' % _EXC, lambda s, t: TOK_TPRI),
    (r'SIZE%s' % _EXC, lambda s, t: TOK_SIZE),
    (r'TAGS%s' % _EXC, lambda s, t: TOK_TAGS),
    (r'TRUE%s' % _EXC, lambda s, t: TOK_TRUE),
    (r'UNIQUE%s' % _EXC, lambda s, t: TOK_UNIQ),
    (r'UNIVERSAL%s' % _EXC, lambda s, t: TOK_TUNI),
    (r'WITH%sSYNTAX%s' % (REScannerSNL, _EXC), lambda s, t: TOK_WSYN),
    # numeric and bit/hex-string literals; reals before integers so the
    # longer match wins
    (r'%s' % REScannerReal, lambda s, t: (TOK_REAL, t)),
    (r'%s' % REScannerInt, lambda s, t: (TOK_INT, t)),
    (r'%s' % REScannerBStr, lambda s, t: (TOK_BSTR, t)),
    (r'%s' % REScannerHStr, lambda s, t: (TOK_HSTR, t)),
    # identifiers: native types, &class-field, all-uppercase,
    # uppercase-initial, lowercase-initial
    (r'(%s)%s' % (REScannerNTypes, _EXC), lambda s, t: (TOK_NTYPE, t)),
    (r'&[a-zA-Z](?:\-{0,1}[a-zA-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_CLAID, t)),
    (r'[A-Z](?:\-{0,1}[A-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_HID, t)),
    (r'[A-Z](?:\-{0,1}[a-zA-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_ID, t)),
    (r'[a-z](?:\-{0,1}[a-zA-Z0-9]{1,}){0,}%s' % _EXC, lambda s, t: (TOK_LID, t)),
    # whitespace / newlines are discarded
    (r'%s' % REScannerSNL, None)
    ],
    flags=re.DOTALL)
class Parser(object):
    """Recursive-descent parser for the boolean search mini-language.

    Grammar (loosely):
        or_expression       := and_expression ['or' or_expression]
        and_expression      := not_expression [['and'] and_expression]
        not_expression      := 'not' not_expression | location_expression
        location_expression := '(' or_expression ')' | base_token
    Tokens come from `lex_scanner`; parse() returns a nested-list AST of
    ['or'|'and'|'not'|'token', ...] nodes.
    """

    def __init__(self):
        self.current_token = 0
        self.tokens = None

    # Token-type constants.
    OPCODE = 1
    WORD = 2
    QUOTED_WORD = 3
    EOF = 4

    # Map each escaped special (\\, \", \(, \)) to a rare low codepoint so
    # the scanner does not treat it as syntax; reversed in tokenize().
    REPLACEMENTS = tuple(('\\' + x, codepoint_to_chr(i + 1)) for i, x in enumerate('\\"()'))

    # Had to translate named constants to numeric values
    lex_scanner = re.Scanner([(r'[()]', lambda x, t: (Parser.OPCODE, t)),
                              (r'@.+?:[^")\s]+', lambda x, t: (Parser.WORD, unicode_type(t))),
                              (r'[^"()\s]+', lambda x, t: (Parser.WORD, unicode_type(t))),
                              (r'".*?((?<!\\)")', lambda x, t: (Parser.QUOTED_WORD, t[1:-1])),
                              (r'\s+', None)], flags=re.DOTALL)

    def token(self, advance=False):
        # Current token's text, or None at end of input; optionally advance.
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return res

    def lcase_token(self, advance=False):
        # Same as token(), lower-cased with ICU rules.
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return icu_lower(res)

    def token_type(self):
        # Current token's type constant, or EOF when input is exhausted.
        if self.is_eof():
            return self.EOF
        return self.tokens[self.current_token][0]

    def is_eof(self):
        return self.current_token >= len(self.tokens)

    def advance(self):
        self.current_token += 1

    def tokenize(self, expr):
        # Strip out escaped backslashes, quotes and parens so that the
        # lex scanner doesn't get confused. We put them back later.
        for k, v in self.REPLACEMENTS:
            expr = expr.replace(k, v)
        tokens = self.lex_scanner.scan(expr)[0]

        def unescape(x):
            # Restore the original characters, minus the escaping backslash.
            for k, v in self.REPLACEMENTS:
                x = x.replace(v, k[1:])
            return x

        return [(tt, unescape(tv) if tt in (self.WORD, self.QUOTED_WORD) else tv)
                for tt, tv in tokens]

    def parse(self, expr, locations):
        """Parse `expr` against the valid lookup names in `locations` and
        return the AST; raises ParseException on trailing garbage."""
        self.locations = locations
        self.tokens = self.tokenize(expr)
        self.current_token = 0
        prog = self.or_expression()
        if not self.is_eof():
            raise ParseException(_('Extra characters at end of search'))
        # prints(self.tokens, '\n', prog)
        return prog

    def or_expression(self):
        lhs = self.and_expression()
        if self.lcase_token() == 'or':
            self.advance()
            return ['or', lhs, self.or_expression()]
        return lhs

    def and_expression(self):
        lhs = self.not_expression()
        if self.lcase_token() == 'and':
            self.advance()
            return ['and', lhs, self.and_expression()]

        # Account for the optional 'and'
        if ((self.token_type() in [self.WORD, self.QUOTED_WORD] or self.token() == '(') and self.lcase_token() != 'or'):
            return ['and', lhs, self.and_expression()]
        return lhs

    def not_expression(self):
        if self.lcase_token() == 'not':
            self.advance()
            return ['not', self.not_expression()]
        return self.location_expression()

    def location_expression(self):
        if self.token_type() == self.OPCODE and self.token() == '(':
            self.advance()
            res = self.or_expression()
            if self.token_type() != self.OPCODE or self.token(advance=True) != ')':
                raise ParseException(_('missing )'))
            return res
        if self.token_type() not in (self.WORD, self.QUOTED_WORD):
            raise ParseException(_('Invalid syntax. Expected a lookup name or a word'))
        return self.base_token()

    def base_token(self):
        # A quoted word is always an unqualified ('all') search term.
        if self.token_type() == self.QUOTED_WORD:
            return ['token', 'all', self.token(advance=True)]

        words = self.token(advance=True).split(':')
        # The complexity here comes from having colon-separated search
        # values. That forces us to check that the first "word" in a colon-
        # separated group is a valid location. If not, then the token must
        # be reconstructed. We also have the problem that locations can be
        # followed by quoted strings that appear as the next token. and that
        # tokens can be a sequence of colons.

        # We have a location if there is more than one word and the first
        # word is in locations. This check could produce a "wrong" answer if
        # the search string is something like 'author: "foo"' because it
        # will be interpreted as 'author:"foo"'. I am choosing to accept the
        # possible error. The expression should be written '"author:" foo'
        if len(words) > 1 and words[0].lower() in self.locations:
            loc = words[0].lower()
            words = words[1:]
            if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
                return ['token', loc, self.token(advance=True)]
            return ['token', icu_lower(loc), ':'.join(words)]

        return ['token', 'all', ':'.join(words)]
import re scanner=re.Scanner([ (r"[\w\.-]+@[\w\.-]+\.[\w\.-]+", lambda scanner,token:("Mail", token)), (r"([0-9]+[\-][0-9]+)", lambda scanner,token:("Endash num", token)), (r"([0-9])+", lambda scanner,token:("Int", token)), (r"[A-Za-z]+", lambda scanner,token:("Word", token)), (r"[!?;:,'-]+", lambda scanner,token:("Punct", token)), (r"[\(]", lambda scanner,token:("LParen", token)), (r"[\)]", lambda scanner,token:("RParen", token)), (r"[.]", lambda scanner,token:("Dot", token)), (r"\s\-\s", lambda scanner,token:("Em dash", token)), (r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", lambda scanner,token:("Emoji", token)), (r"\s+", None), ]) results, remainder = scanner.scan("The Book! (Tue) 55-88 Milani-lop'ed mss. (-) 123 - [email protected] 😋 😎") print(results)
warnings.warn( 'locale specific date formats (%%c, %%x, %%X) are not yet implemented ' 'for %s' % platform.system()) # translate strftime spec into mostly equivalent PostgreSQL spec _scanner = re.Scanner( # double quotes need to be escaped [('"', lambda scanner, token: r'\"')] + [( '|'.join( map( '(?:{})'.format, itertools.chain( _strftime_to_postgresql_rules.keys(), [ # "%e" is in the C standard and Python actually # generates this if your spec contains "%c" but we # don't officially support it as a specifier so we # need to special case it in the scanner '%e', r'\s+', r'[{}]'.format(re.escape(string.punctuation)), r'[^{}\s]+'.format(re.escape(string.punctuation)), ], ), )), lambda scanner, token: token, )]) _lexicon_values = frozenset(_strftime_to_postgresql_rules.values()) _strftime_blacklist = frozenset(['%w', '%U', '%c', '%x', '%X', '%e'])
def parse(string):
    """Compile a tag-search expression into a SQL WHERE fragment.

    Order of operations
        ()  Group
        =   Numeric Match
        -   Not
        " " And
        /   Or
    """
    def identity(scanner, token):
        return token

    def unknown(scanner, token):
        raise ValueError("Unknown token", token)

    # The final zero-width rule fires on anything the other rules cannot
    # consume, turning lexing errors into ValueError.
    scanner = re.Scanner([
        (REG_NUM, identity),
        (REG_TAG, identity),
        (REG_OP, identity),
        (REG_GROUP, identity),
        (r'\s+', None),
        (r'', unknown),
    ])
    tokens = scanner.scan(string)[0]
    if tokens.count('(') != tokens.count(')'):
        raise ValueError('Unbalanced parentheses')

    def query_matching_files(match):
        # Translate one "tag", "tag=amount" or "tag=lo..hi" token into a
        # subquery clause selecting the files carrying that tag.
        parts = match.split("=")
        if re.match(reg_full(REG_TAG_NAME), parts[0]):
            clause = 'name = "{}"'.format(parts[0])
        elif re.match(reg_full(REG_TAG_ID), parts[0]):
            clause = 'pk_id = {}'.format(parts[0])
        else:
            raise ValueError("Found invalid tag specifier", parts[0])
        if len(parts) > 2:
            raise ValueError("Found too many `='", match)
        elif len(parts) == 2:
            if '..' in parts[1]:
                [lower, upper] = parts[1].split('..')
                clause += ' AND amount BETWEEN {} AND {}'.format(lower, upper)
            else:
                clause += ' AND amount = {}'.format(parts[1])
        return ("file.pk_id IN ("
                "SELECT pk_fk_file_id FROM file_has_tag"
                " LEFT JOIN tag ON pk_fk_tag_id = pk_id"
                " WHERE " + clause + ")")

    def reg_full(reg):
        # Anchor a pattern fragment so re.match must consume the whole string.
        return '^' + reg + '$'

    def parse_numeric(tokens):
        # Pass 1: fold `tag = number` triples into single "tag=number"
        # tokens. NOTE: tokens.next() is the Python 2 iterator protocol.
        result = []
        try:
            while True:
                token = tokens.next()
                if token == '=':
                    if len(result) == 0:
                        raise ValueError("Expected tag before `='")
                    elif not re.match(reg_full(REG_TAG), result[-1]):
                        raise ValueError("Expected tag before `='", result[-1])
                    try:
                        number = tokens.next()
                    except StopIteration:
                        raise ValueError("Expected something after `='")
                    if not re.match(reg_full(REG_NUM), number):
                        raise ValueError("Expected number after `='", number)
                    result[-1] += '=' + number  # group the tokens
                elif '..' in token:
                    raise ValueError("An amount requires a tag")
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_tag(tokens):
        # Pass 2: replace every tag-match token with its SQL subquery.
        result = []
        try:
            while True:
                token = tokens.next()
                if re.match(reg_full(REG_TAG_MATCH), token):
                    result.append(query_matching_files(token))
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_not(tokens):
        # Pass 3: rewrite `- <tagquery>` as NOT (<tagquery>).
        result = []
        try:
            while True:
                token = tokens.next()
                if token == '-':
                    tag = tokens.next()
                    if not re.match(r'file\.pk_id', tag):
                        raise ValueError("Expected tag query after `-'", tag)
                    result.append('NOT (' + tag + ')')
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_group(tokens):
        # Pass 4: convert the flat stream with '(' ')' into nested lists.
        result = []
        try:
            while True:
                token = tokens.next()
                if token == '(':
                    result.append(parse_group(tokens))
                elif token == ')':
                    return result
                else:
                    result.append(token)
        except StopIteration:
            return result

    def parse_or(tokens):
        # Final pass: implicit AND within runs, '/' joins runs with OR;
        # sublists (parenthesised groups) are rendered recursively.
        if tokens[0] == '/' or tokens[-1] == '/':
            raise ValueError("Found OR on edge")
        result = ""
        left = []
        for token in tokens:
            if token == '/':
                result += '(' + ' AND '.join(left) + ')'
                result += ' OR '
                left = []
            elif type(token) == list:
                left.append(parse_or(token))
            else:
                left.append(token)
        result += '(' + ' AND '.join(left) + ')'
        result = '(' + result + ')'
        return result

    tokens = parse_numeric(iter(tokens))  # group 3 num tokens to 1 token
    tokens = parse_tag(iter(tokens))      # convert tag to query
    tokens = parse_not(iter(tokens))      # group a not token to the tag query
    tokens = parse_group(iter(tokens))    # convert groups to sublists
    query = parse_or(tokens)
    return query
def expect(expected_type, toks): line_num, (typ, tok_contents) = next(toks) if typ != expected_type: msg = 'Expected a %s, but got "%s" at line %s' % ( expected_type, tok_contents, line_num) raise CMakeParseError(msg) # http://stackoverflow.com/questions/691148/pythonic-way-to-implement-a-tokenizer # TODO: Handle multiline strings. scanner = re.Scanner([ (r'#.*', lambda scanner, token: ("comment", token)), (r'"[^"]*"', lambda scanner, token: ("string", token)), (r"\(", lambda scanner, token: ("left paren", token)), (r"\)", lambda scanner, token: ("right paren", token)), (r'[^ \t\r\n()#"]+', lambda scanner, token: ("word", token)), (r'\n', lambda scanner, token: ("newline", token)), (r"\s+", None), # skip other whitespace ]) def tokenize(s): """ Yields pairs of the form (line_num, (token_type, token_contents)) given a string containing the contents of a CMakeLists file. """ toks, remainder = scanner.scan(s) line_num = 1 if remainder != '': msg = 'Unrecognized tokens at line %s: %s' % (line_num, remainder)
def tokenize(contents):
    """
    Scan a string and return a list of Token objects representing the
    contents of the cmake listfile.
    """
    # https://cmake.org/cmake/help/v3.0/manual/
    # cmake-language.7.html#grammar-token-unquoted_legacy
    legacy_pattern = "({})+".format("|".join([
        # Make-style variable like $(MAKE)
        r'\$\([^\$\(\)]+\)',
        # Quoted-substring
        r'"[^"\\]*(?:\\.[^"\\]*)*"',
        # Any element except whitespace or one of '()#"\'
        r'[^\s\(\)#"\\]',
        # Escape sequences
        # https://cmake.org/cmake/help/v3.0/manual/
        # cmake-language.7.html#grammar-token-escape_sequence
        r'\\[\(\)#" \\\$@\^\t\r\n;]'
    ]))

    # https://cmake.org/cmake/help/v3.0/manual/
    # cmake-language.7.html#unquoted-argument
    unquoted_pattern = "({})+".format("|".join([
        # Any element except whitespace or one of '()#"\'
        r'[^\s\(\)#"\\]',
        # Escape sequences
        # https://cmake.org/cmake/help/v3.0/manual/
        # cmake-language.7.html#grammar-token-escape_sequence
        r'\\[\(\)#" \\\$@\^\t\r\n;]'
    ]))

    # Regexes are in priority order. Changing the order may alter the
    # behavior of the lexer
    scanner = re.Scanner(
        [
            # double quoted string
            # NOTE(josh): regex borrowed from
            # https://stackoverflow.com/a/37379449/141023
            (r'(?<![^\s\(])"[^"\\]*(?:\\.[^"\\]*)*"(?![^\s\)])',
             lambda s, t: (TokenType.QUOTED_LITERAL, t)),
            # single quoted string
            (r"(?<![^\s\(])'[^'\\]*(?:\\.[^'\\]*)*'(?![^\s\)])",
             lambda s, t: (TokenType.QUOTED_LITERAL, t)),
            # bracket argument
            (r"(?<![^\s\(])\[(=*)\[.*\]\1\](?![^\s\)])",
             lambda s, t: (TokenType.BRACKET_ARGUMENT, t)),
            (r"(?<![^\s\(])-?[0-9]+(?![^\s\)\(])",
             lambda s, t: (TokenType.NUMBER, t)),
            # Either a valid function name or variable name.
            # NOTE(review): '[a-zA-z_]' spans A through z in ASCII, which
            # also admits '[', ']', '^' and backtick — looks like a typo
            # for '[a-zA-Z_]'; confirm intended behavior before changing,
            # since the catch-all UNQUOTED_LITERAL rule below would absorb
            # those inputs differently.
            (r"(?<![^\s\(])[a-zA-z_][a-zA-Z0-9_]*(?![^\s\)\(])",
             lambda s, t: (TokenType.WORD, t)),
            (r"(?<![^\s\(])\${[a-zA-z_][a-zA-Z0-9_]*}(?![^\s\)])",
             lambda s, t: (TokenType.DEREF, t)),
            # unquoted_legacy
            (legacy_pattern,
             lambda s, t: (TokenType.UNQUOTED_LITERAL, t)),
            # unquoted_element+
            (unquoted_pattern,
             lambda s, t: (TokenType.UNQUOTED_LITERAL, t)),
            (r"\(", lambda s, t: (TokenType.LEFT_PAREN, t)),
            (r"\)", lambda s, t: (TokenType.RIGHT_PAREN, t)),
            # NOTE(josh): bare carriage returns are very unlikely to be used
            # but just for the case of explicitnes, if we ever encounter any
            # we treat it as a newline
            (r"\r?\n", lambda s, t: (TokenType.NEWLINE, t)),
            (r"\r\n?", lambda s, t: (TokenType.NEWLINE, t)),
            # NOTE(josh): don't match '\s' here or we'll miss some newline
            # tokens
            # TODO(josh): should we match unicode whitespace too?
            (r"[ \t\f\v]+", lambda s, t: (TokenType.WHITESPACE, t)),
            (r"#\s*(cmake-format|cmf): off[^\n]*",
             lambda s, t: (TokenType.FORMAT_OFF, t)),
            (r"#\s*(cmake-format|cmf): on[^\n]*",
             lambda s, t: (TokenType.FORMAT_ON, t)),
            # bracket comment
            (r"#\[(=*)\[.*\]\1\]",
             lambda s, t: (TokenType.BRACKET_COMMENT, t)),
            # line comment
            (r"#[^\n]*", lambda s, t: (TokenType.COMMENT, t)),
            # Catch-all for literals which are compound statements.
            (r"([^\s\(\)]+|[^\s\(]*[^\)]|[^\(][^\s\)]*)",
             lambda s, t: (TokenType.UNQUOTED_LITERAL, t)),
        ],
        re.DOTALL)

    tokens_return = []
    # A leading BOM is preserved as a zero-width BYTEORDER_MARK token so
    # downstream consumers can reproduce the input exactly.
    if contents.startswith("\ufeff"):
        tokens_return = [
            Token(tok_type=TokenType.BYTEORDER_MARK,
                  spelling=contents[0],
                  index=-1,
                  begin=SourceLocation((0, 0, 0)),
                  end=SourceLocation((0, 0, 0)))
        ]
        contents = contents[1:]

    tokens, remainder = scanner.scan(contents)
    assert not remainder, "Unparsed tokens: {}".format(remainder)

    # Now add line, column, and serial number to token objects. We get
    # lineno by maintaining a running count of newline characters
    # encountered among tokens so far, and column count by splitting the
    # most recent token on it's right most newline. Note that line numbers
    # are 1-indexed to match up with editors but column numbers are zero
    # indexed because its fun to be inconsistent.
    lineno = 1
    col = 0
    offset = 0
    for tok_index, (tok_type, spelling) in enumerate(tokens):
        if sys.version_info[0] < 3:
            assert isinstance(spelling, unicode)
        begin = SourceLocation((lineno, col, offset))

        newlines = spelling.count('\n')
        lineno += newlines
        if newlines:
            col = len(spelling.rsplit('\n', 1)[1])
        else:
            col += len(spelling)

        # byte offset advances by the UTF-8 encoded length, not len(str)
        offset += len(bytearray(spelling, 'utf-8'))
        tokens_return.append(
            Token(tok_type=tok_type,
                  spelling=spelling,
                  index=tok_index,
                  begin=begin,
                  end=SourceLocation((lineno, col, offset))))

    return tokens_return
def scan(self, input_str): scanner = re.Scanner([(r"/", self.slash), (r"{\w*}", self.group), (r"\*", self.star), (r"(?:\\.|[^{\*/])*", self.literal),]) return scanner.scan(input_str)
import re scanner = re.Scanner([ (r"\[[^\]]*\]", lambda scanner, token: token), (r"\+", lambda scanner, token: "R_PLUS"), (r"\*", lambda scanner, token: "R_KLEENE"), (r"%", lambda scanner, token: "R_WILD"), (r"\^", lambda scanner, token: "R_START"), (r"\$", lambda scanner, token: "R_END"), (r"\?", lambda scanner, token: "R_QUESTION"), (r"[\.~``;_a-zA-Z0-9\s=:\{\}\-\\]+", lambda scanner, token: "R_FREE"), (r'.', lambda scanner, token: None), ]) def tokenizeRegex(s): results, remainder = scanner.scan(s) return results if __name__ == '__main__': print(tokenizeRegex("^discount[^(]*\\([0-9]+\\%\\)$")) print(tokenizeRegex("'helloworld'"))
def __init__(self, lexicon): self.scanner = re.Scanner(lexicon)
_junk = ur"[^א-ת%sa-zA-Z0-9!?.,:;\-()\[\]{}]+" % _NIKUD #%%&!?.,;:\-()\[\]{}\"'\/\\+]+" #% _NIKUD is_all_heb = re.compile(ur"^%s+$" % (_heb_letter), re.UNICODE).match is_a_number = re.compile(r"^%s$" % _numeric, re.UNICODE).match is_all_lat = re.compile(r"^[a-zA-Z]+$", re.UNICODE).match is_sep = re.compile(r"^\|+$").match is_punct = re.compile(r"^[.?!]+").match #### scanner scanner = re.Scanner([ (r"\s+", None), (_url, url), (_heb_word_plus, heb), (_eng_word, eng), (_numeric, num), (_opening_punc, punct), (_closing_punc, punct), (_eos_punct, punct), (_internal_punct, punct), (_junk, junk), ]) ##### tokenize def tokenize(sent): tok = sent parts, reminder = scanner.scan(tok) assert (not reminder) return parts