def append_character(self, character):
    '''
    Handle the character that follows a `?`, `+` or `*` repeat marker.

    A following `?` marks the repeat as lazy and is consumed here; a
    following `+` or `*` is rejected as a compound repeat.  Any other
    character is passed on to the parent once the repeat has been built.
    '''
    state = self._parser_state
    char_str = state.alphabet.expression_to_str(character)
    if character is not None and char_str in '+*':
        raise RxpyError('Compound repeat: ' +
                        self._initial_char_str + char_str)
    lazy = char_str == '?'
    initial = self._initial_char_str
    if initial == '?':
        self.build_optional(self._parent, self._latest, lazy)
    elif initial == '+':
        self.build_plus(self._parent, self._latest, lazy, state)
    elif initial == '*':
        self.build_star(self._parent, self._latest, lazy, state)
    else:
        raise RxpyError('Bad initial character for RepeatBuilder')
    # the lazy marker is swallowed; anything else belongs to the parent
    return self._parent if lazy else self._parent.append_character(character)
def append_character(self, character, escaped=False):
    '''
    Add the next character of a `(?P...` construct.

    The first character selects the mode: `<` starts a new named group,
    `=` starts a reference to an existing one.  Later characters are
    accumulated as the name until the closing `>` (new group) or `)`
    (reference) is seen.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)

    if self._create is None:
        # still waiting for the qualifier that selects the mode
        if char_str == '<':
            self._create = True
        elif char_str == '=':
            self._create = False
        else:
            raise RxpyError('Unexpected qualifier after (?P - ' + char_str)
        return self

    if not escaped:
        if self._create and char_str == '>':
            if not self._name:
                raise RxpyError('Empty name for group')
            return GroupBuilder(self._parser_state, self._parent,
                                True, self._name)
        if not self._create and char_str == ')':
            index = self._parser_state.index_for_name_or_count(self._name)
            self._parent._sequence.append(GroupReference(index))
            return self._parent
        if char_str == '\\':
            # this is just for the name
            return SimpleEscapeBuilder(self._parser_state, self)

    if character:
        self._name += char_str
        return self
    raise RxpyError('Incomplete named group')
def append_character(self, character):
    '''
    Add the next hexadecimal digit of a character-code escape.

    Digits are buffered until the expected number have been read; the
    buffer is then decoded as base 16, converted via the alphabet's
    `unescape`, and passed to the parent as an escaped character.

    Raises `RxpyError` if the input ends early or the buffered text cannot
    be converted.  Errors raised by the parent while it handles the decoded
    character are no longer relabelled as a bad escape - they propagate
    unchanged.
    '''
    if character is None:
        raise RxpyError('Incomplete unicode escape')
    self.__buffer += character
    self.__remaining -= 1
    if self.__remaining:
        return self
    # only the conversion is guarded; a bare except here would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated parent errors
    try:
        decoded = self._parser_state.alphabet.unescape(
            int(self.__buffer, 16))
    except Exception:
        raise RxpyError('Bad unicode escape: ' + self.__buffer)
    return self.__parent.append_character(decoded, escaped=True)
def group_reference(self, next, number):
    '''
    Append the text matched by group `number` to the replacement.

    Returns False (loop internally til done).  Raises `RxpyError` when the
    group has no match, i.e. when `group(number)` gives None.  A group that
    matched the empty string is a valid match and contributes '' - testing
    truthiness here would wrongly reject it.
    '''
    match = self.__match.group(number)
    if match is None:
        raise RxpyError('No match for group ' + str(number))
    self.__replacement.append(match)
    return False  # loop internally til done
def append_character(self, character):
    '''
    Work out what kind of group is being opened.

    The first character is either `?` (a qualifier follows) or the start
    of an ordinary group.  After `?`, the second character selects the
    specialised builder; an unknown qualifier raises `RxpyError`.
    '''
    self._count += 1
    state, parent = self._parser_state, self._parent
    char_str = state.alphabet.expression_to_str(character)

    if self._count == 1:
        if char_str == '?':
            return self
        # a plain group - hand the character to a fresh builder
        return GroupBuilder(state, parent).append_character(character)

    if char_str == ':':
        return GroupBuilder(state, parent, binding=False)
    if char_str in ParserStateBuilder.INITIAL:
        return ParserStateBuilder(state, parent).append_character(character)
    if char_str == 'P':
        return NamedGroupBuilder(state, parent)
    if char_str == '#':
        return CommentGroupBuilder(state, parent)
    if char_str == '=':
        return LookaheadBuilder(state, parent, True, True)
    if char_str == '!':
        return LookaheadBuilder(state, parent, False, True)
    if char_str == '<':
        return LookbackBuilder(state, parent)
    if char_str == '(':
        return ConditionalBuilder(state, parent)
    raise RxpyError('Unexpected qualifier after (? - ' + char_str)
def append_character(self, character):
    '''
    Add the next character of a `{...}` count specification.

    Once the closing brace has been seen, a single following `?` marks the
    count as lazy; any other character triggers the build and is forwarded
    to the parent.  An immediately closed `{}` is not a count at all and is
    passed to the parent as two escaped characters.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)

    if self._closed:
        if char_str == '?' and not self._lazy:
            self._lazy = True
            return self
        self.__build()
        return self._parent.append_character(character)

    if char_str == '}':
        if not self._acc and self._begin is None:
            # oops - not a count at all
            self._parent.append_character(self._open, escaped=True)
            self._parent.append_character(character, escaped=True)
            return self._parent
        self.__store_value()
        self._closed = True
    elif char_str == ',':
        self.__store_value()
    elif character is None:
        raise RxpyError('Incomplete count specification')
    else:
        self._acc += character
    return self
def append_character(self, character):
    '''
    Add the character that follows a backslash.

    Non-zero digits start a group reference; `A`, `b`/`B`, `d`/`D`,
    `w`/`W`, `s`/`S` and `Z` append the matching node to the parent's
    sequence (the upper-case form of each pair passes True to the node).
    Everything else is delegated to the superclass.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)
    if character is None:
        raise RxpyError('Incomplete character escape')
    if char_str in digits and char_str != '0':
        return GroupReferenceBuilder(self._parser_state, self._parent,
                                     character)

    node = None
    if char_str == 'A':
        node = StartOfLine(False)
    elif char_str in 'bB':
        node = WordBoundary(char_str == 'B')
    elif char_str in 'dD':
        node = Digit(char_str == 'D')
    elif char_str in 'wW':
        node = Word(char_str == 'W')
    elif char_str in 'sS':
        node = Space(char_str == 'S')
    elif char_str == 'Z':
        node = EndOfLine(False)

    if node is None:
        return super(ComplexEscapeBuilder, self).append_character(character)
    self._parent._sequence.append(node)
    return self._parent
def index_for_name_or_count(self, name):
    '''
    Translate a group name or index (given as text) to the group index
    (an int).

    The text is first tried as an integer index; only if that conversion
    fails is it looked up as a name.  An unknown index or name raises
    `RxpyError`.
    '''
    try:
        index = int(name)
        if index in self.__index_to_name:
            return index
        raise RxpyError('Unknown index ' + str(name))
    except ValueError:
        if name in self.__name_to_index:
            return self.__name_to_index[name]
        raise RxpyError('Unknown name ' + str(name))
def append(character=character):
    '''
    Helper function to avoid repetition - queues the character, first
    flushing any previously queued character or completing a pending range.
    '''
    def as_pair(item):
        '''Give the alphabet's charset value for `item`, or (item, item).'''
        (is_charset, value) = \
            self._parser_state.alphabet.expression_to_charset(
                item, self._parser_state.flags)
        return value if is_charset else (item, item)

    if not self._range:
        if self._queue:
            # flush the previously queued character as single intervals
            (lo, hi) = as_pair(self._queue)
            self._charset.append_interval((lo, lo))
            self._charset.append_interval((hi, hi))
        self._queue = character
        return

    if self._queue is None:
        raise RxpyError('Incomplete range')
    (alo, ahi) = as_pair(self._queue)
    (blo, bhi) = as_pair(character)
    self._charset.append_interval((alo, blo))
    self._charset.append_interval((ahi, bhi))
    self._queue = None
    self._range = False
def parse(self, text):
    '''
    Feed `text` through the builders, one character at a time, followed by
    None to signal the end of input.

    Raises `RxpyError` if the final builder is not this one; otherwise
    returns the accumulated sequence joined to a `Match` node.
    '''
    current = self
    for character in text:
        current = current.append_character(character)
    final = current.append_character(None)
    if self != final:
        raise RxpyError('Incomplete expression')
    return self._sequence.join(Match(), self._parser_state)
def append_character(self, character):
    '''
    Add the character following a backslash in a replacement string.

    `g` starts a group reference; anything else is handled by the
    superclass.  End of input raises `RxpyError`.
    '''
    if not character:
        raise RxpyError('Incomplete character escape')
    if character == 'g':
        return ReplacementGroupReferenceBuilder(self._parser_state,
                                                self._parent)
    return super(ReplacementEscapeBuilder, self).append_character(character)
def post_process(graph, actions):
    '''
    Apply per-type actions to every node in the graph.

    `actions` is a sequence of (type, function) pairs; giving two actions
    for the same type raises `RxpyError`.  Each node visited by
    `node_iterator` is passed to the function registered for its exact
    type, if any.  The graph itself is returned.
    '''
    dispatch = {}
    for (type_, function) in actions:
        if type_ in dispatch:
            raise RxpyError('Conflicting actions for ' + str(type_))
        dispatch[type_] = function
    for node in node_iterator(graph):
        node_type = type(node)
        if node_type in dispatch:
            dispatch[node_type](node)
    return graph
def append_character(self, character, escaped=False):
    '''
    Add the next character of a conditional branch.

    An unescaped terminal character is reported to the conditional via its
    callback; everything else is handled by the superclass.  End of input
    raises `RxpyError`.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)
    if character is None:
        raise RxpyError('Incomplete conditional match')
    if escaped or char_str not in self.__terminals:
        return super(YesNoBuilder, self).append_character(character, escaped)
    return self.__conditional.callback(self, character)
def append_character(self, character):
    '''
    Add the next character of a `<...>` group escape.

    The opening `<` starts the buffer and a `>` (once the buffer holds
    more than one character) completes it.  In between, only characters
    consistent with a numeric or named reference are accepted.
    '''
    # the branching is this detailed because the tests distinguish between
    # many different error cases
    if not self.__buffer and character == '<':
        self.__buffer += character
        return self
    if len(self.__buffer) > 1 and character == '>':
        self.__parent._sequence.append(self.__decode())
        return self.__parent
    if not character:
        raise RxpyError('Incomplete group escape')
    acceptable = (self.__numeric and character in digits) or \
                 (self.__name and character in ALPHANUMERIC)
    if not acceptable:
        raise RxpyError('Unexpected character in group escape: ' + character)
    self.__buffer += character
    return self
def append_character(self, character):
    '''
    Add the character that follows a backslash.

    A non-zero digit starts a group reference; anything else is delegated
    to the superclass.  End of input raises `RxpyError`.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)
    if character is None:
        raise RxpyError('Incomplete character escape')
    is_reference = char_str in digits and char_str != '0'
    if not is_reference:
        return super(IntermediateEscapeBuilder, self).append_character(
            character)
    return GroupReferenceBuilder(self._parser_state, self._parent, character)
def expression_to_charset(self, char, flags):
    '''
    Convert an input character (of expression type) to either a charset or
    a letter from the alphabet.

    The result is (True, CharSet) or (False, letter); this implementation
    always gives the second form, and raises `RxpyError` if the IGNORECASE
    flag is set.
    '''
    from lepl.rxpy.parser.support import ParserState
    ignore_case = flags & ParserState.IGNORECASE
    if ignore_case:
        raise RxpyError('Default alphabet does not handle case')
    letter = self.expression_to_letter(char)
    return (False, self.join(letter))
def append_character(self, character, escaped=False):
    '''
    Add the next character of a comment group.

    The contents are discarded: an unescaped `)` returns control to the
    parent and an unescaped backslash starts an escape (whose result comes
    back here).  End of input raises `RxpyError`.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)
    if not escaped:
        if char_str == ')':
            return self._parent
        if char_str == '\\':
            return SimpleEscapeBuilder(self._parser_state, self)
    if character is None:
        raise RxpyError('Incomplete comment')
    return self
def parse_group(self, text):
    '''
    Parse a set of groups for `Scanner`.

    `text` is parsed as the contents of a new group; if this builder
    already holds a sequence, a new alternative is started first.  A
    closing `)` is supplied at the end, and `RxpyError` is raised unless
    that returns control to this builder.
    '''
    builder = GroupBuilder(self._parser_state, self)
    if self._sequence:
        self.__start_new_alternative()
    for character in text:
        builder = builder.append_character(character)
    try:
        builder = builder.append_character(')')
    except Exception:
        raise RxpyError('Incomplete group')
    # explicit check rather than `assert`, which is removed under -O and
    # would then let an incomplete group pass silently
    if not builder == self:
        raise RxpyError('Incomplete group')
def __store_value(self):
    '''
    Store the accumulated digits as the next limit of the repeat.

    The first value stored is the lower limit (which is required); the
    second is the optional upper limit and marks the count as a range.
    A third value, a non-integer, or an upper limit below the lower limit
    raises `RxpyError`.  The accumulator is cleared afterwards.
    '''
    if self._begin is None:
        if not self._acc:
            raise RxpyError('Missing lower limit for repeat')
        try:
            self._begin = int(self._acc)
        except ValueError:
            raise RxpyError('Bad lower limit for repeat: ' + self._acc)
    elif self._range:
        raise RxpyError('Too many values in repeat')
    else:
        self._range = True
        if self._acc:
            try:
                self._end = int(self._acc)
            except ValueError:
                raise RxpyError('Bad upper limit for repeat: ' + self._acc)
            if self._begin > self._end:
                raise RxpyError('Inconsistent repeat range')
    self._acc = ''
def append_character(self, character):
    '''
    Add the character that follows `(?<`.

    `=` and `!` both continue with a `LookaheadBuilder` whose final
    argument is False, differing in the third argument (True for `=`,
    False for `!`).  Any other character raises `RxpyError`.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)
    if char_str == '=':
        return LookaheadBuilder(self._parser_state, self._parent,
                                True, False)
    if char_str != '!':
        raise RxpyError('Unexpected qualifier after (?< - ' + char_str)
    # NOTE(review): the _LOOKBACK flag is requested only for the `!` form -
    # confirm that this asymmetry is intended
    self._parser_state.new_flag(ParserState._LOOKBACK)
    return LookaheadBuilder(self._parser_state, self._parent, False, False)
def append_character(self, character):
    '''
    Add the next flag character of a `(?...)` flags group.

    Single characters found in the flag table set the matching flag; `_`
    introduces a two-character flag whose second character must be one of
    `lceug`.  `L` is rejected explicitly and `)` returns control to the
    parent.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)

    if self.__escape:
        # second half of a two-character `_x` flag
        if char_str in 'lceug':
            self._parser_state.new_flag(self.__table['_' + char_str])
            self.__escape = False
            return self
        raise RxpyError('Unexpected characters after (? - _' + char_str)

    if char_str == '_':
        self.__escape = True
        return self
    if char_str == 'L':
        raise RxpyError('Locale based classes unsupported')
    if char_str in self.__table:
        self._parser_state.new_flag(self.__table[char_str])
        return self
    if char_str == ')':
        return self.__parent
    raise RxpyError('Unexpected character after (? - ' + char_str)
def clone(self):
    '''
    Create a duplicate of this node (needed, for example, when a numbered
    repeat is replaced by explicit, repeated instances).

    The copy is made by calling this node's class with the values from
    `_kargs()` as keyword arguments; a `TypeError` from that call is
    reported as an `RxpyError`.
    '''
    cls = self.__class__
    try:
        kargs = self._kargs()
        return cls(**kargs)
    except TypeError as e:
        message = 'Error cloning {0}: {1}'.format(cls.__name__, e)
        raise RxpyError(message)
def parse(self, text):
    '''
    Parse a regular expression.

    Characters are fed to the current builder in turn, followed by None to
    mark the end of input.  A `ParseError` raised on the way is updated
    with the text and the index of the last character read, then
    re-raised.  Raises `RxpyError` if parsing does not finish back at this
    builder.
    '''
    current = self
    index = None
    try:
        for (index, character) in enumerate(text):
            current = current.append_character(character)
        current = current.append_character(None)
    except ParseError as error:
        error.update(text, index)
        raise
    if self != current:
        raise RxpyError('Incomplete expression')
    sequence = self.to_sequence()
    return sequence.join(Match(), self._parser_state)
def append_character(self, character):
    '''
    Add the character that follows a backslash.

    `x`, `u` and `U` start a character-code escape, digits start an octal
    escape, and known standard escapes are replaced by their value.  Other
    non-letters are passed to the parent as an escaped literal; remaining
    ASCII letters are reported through `_unexpected_character`.
    '''
    char_str = self._parser_state.alphabet.expression_to_str(character)
    if character is None:
        raise RxpyError('Incomplete character escape')
    if char_str in 'xuU':
        return CharacterCodeBuilder(self._parser_state, self._parent,
                                    character)
    if char_str in digits:
        return OctalEscapeBuilder(self._parser_state, self._parent,
                                  character)
    if char_str in self.__std_escapes:
        replacement = self.__std_escapes[char_str]
        return self._parent.append_character(replacement, escaped=True)
    if char_str in ascii_letters:
        return self._unexpected_character(character)
    # matches re.escape
    return self._parent.append_character(character, escaped=True)
def __build(self):
    '''
    Replace the latest node in the parent's sequence with its counted
    repeat.

    When the `_LOOP_UNROLL` flag is set and the parser state agrees to
    unwind the relevant limit, the repeat is expanded into explicit clones
    (plus a star or optional clones for a range); otherwise `build_count`
    is used.  Raises `RxpyError` if the parent has nothing to repeat.
    '''
    parent, state = self._parent, self._parser_state
    if not parent._sequence:
        raise RxpyError('Nothing to repeat')
    latest = parent._sequence.pop()

    # the limit that decides unrolling: the upper bound when there is one
    limit = self._begin if self._end is None else self._end
    if (state.flags & ParserState._LOOP_UNROLL) and state.unwind(limit):
        for _i in range(self._begin):
            parent._sequence.append(latest.clone())
        if not self._range:
            return
        if self._end is None:
            RepeatBuilder.build_star(parent, latest.clone(),
                                     self._lazy, state)
        else:
            for _i in range(self._end - self._begin):
                RepeatBuilder.build_optional(parent, latest.clone(),
                                             self._lazy)
    else:
        upper = self._end if self._range else self._begin
        self.build_count(parent, latest, self._begin, upper,
                         self._lazy, state)
def parse_groups(texts, engine, flags=0, alphabet=None):
    '''
    Parse a set of expressions, used to define groups for `Scanner`.

    If parsing sets new flags the whole set is parsed a second time with
    those flags; flags that change again raise `RxpyError`.  An empty
    `texts` raises `ValueError`.  Returns (parser_state, graph).
    '''
    from lepl.rxpy.compat.support import default_alphabet
    if not texts:
        raise ValueError('Empty set of texts for scanner')

    def build(state):
        '''Parse every text as a group of a fresh `SequenceBuilder`.'''
        builder = SequenceBuilder(state)
        for text in texts:
            builder.parse_group(text)
        return builder

    first = texts[0]
    parser_state = ParserState(flags=flags,
                               alphabet=default_alphabet(alphabet, first),
                               refuse=engine.REFUSE,
                               require=engine.REQUIRE)
    sequence = build(parser_state)
    if parser_state.has_new_flags:
        parser_state = parser_state.clone_with_new_flags(first)
        sequence = build(parser_state)
    if parser_state.has_new_flags:
        raise RxpyError('Inconsistent flags')
    graph = sequence.to_sequence().join(Match(), parser_state)
    return parser_state, graph
def parse(text, parser_state, class_, mutable_flags=True):
    '''
    Parse the text using the given builder class.

    If the expression sets flags (and `mutable_flags` allows it) the text
    is parsed a second time with the new flags; an error from the first
    pass is ignored in that case.  Flags that change again on the second
    pass raise `RxpyError`.  Returns (parser_state, graph).
    '''
    graph = None
    try:
        graph = class_(parser_state).parse(text)
    except RxpyError:
        # suppress error if we will parse again
        if not mutable_flags or not parser_state.has_new_flags:
            raise
    second_pass = mutable_flags and parser_state.has_new_flags
    if second_pass:
        parser_state = parser_state.clone_with_new_flags(text)
        graph = class_(parser_state).parse(text)
    parser_state.alphabet.validate_expression(text, parser_state.flags)
    actions = resolve_group_names(parser_state)
    actions.append(set_lookahead_properties())
    processed = post_process(graph, actions)
    if parser_state.has_new_flags:
        raise RxpyError('Inconsistent flags')
    return parser_state, processed
def require_engine(engine):
    '''
    Check that an engine was supplied.

    Raises `RxpyError` when `engine` is false; otherwise does nothing.
    '''
    if engine:
        return
    message = ('Engine must be given for RXPY '
               '(use an engine-specific re module).')
    raise RxpyError(message)
def append_character(self, character, escaped=False):
    '''
    Add the next character of a character set.

    Characters are held in a one-element queue so that a following `-` can
    turn them into the start of a range.  A leading `^` marks the set as
    inverted, an unescaped backslash starts an escape (whose result comes
    back here with `escaped=True`), and an unescaped `]` closes the set,
    appends the simplified charset to the parent's sequence and returns the
    parent.  Anything that fits none of the cases raises `RxpyError`.
    '''

    def append(character=character):
        '''Helper function to avoid repetition below - adds character.'''

        def unpack(character):
            '''Generate a `CharSet` or a character pair.'''
            (is_charset, value) = \
                self._parser_state.alphabet.expression_to_charset(
                    character, self._parser_state.flags)
            if not is_charset:
                value = (character, character)
            return value

        if self._range:
            # a range is pending - the queued character is its start
            if self._queue is None:
                raise RxpyError('Incomplete range')
            else:
                (alo, ahi) = unpack(self._queue)
                (blo, bhi) = unpack(character)
                self._charset.append_interval((alo, blo))
                self._charset.append_interval((ahi, bhi))
                self._queue = None
                self._range = False
        else:
            # flush whatever was queued, then queue the new character
            if self._queue:
                (lo, hi) = unpack(self._queue)
                self._charset.append_interval((lo, lo))
                self._charset.append_interval((hi, hi))
            self._queue = character

    char_str = self._parser_state.alphabet.expression_to_str(character)
    if self._invert is None and char_str == '^':
        self._invert = True
    elif not escaped and char_str == '\\':
        return SimpleEscapeBuilder(self._parser_state, self)
    elif escaped and char_str in 'dD':
        self._charset.append_class(self._parser_state.alphabet.digit,
                                   character, char_str == 'D')
    elif escaped and char_str in 'wW':
        self._charset.append_class(self._parser_state.alphabet.word,
                                   character, char_str == 'W')
    elif escaped and char_str in 'sS':
        self._charset.append_class(self._parser_state.alphabet.space,
                                   character, char_str == 'S')
    # not charset allows first character to be unescaped - or ]
    elif character is not None and \
            ((not self._charset and not self._queue)
             or escaped or char_str not in "-]"):
        append()
    elif char_str == '-':
        if self._range:
            # repeated - is range to -?
            append()
        else:
            self._range = True
    elif char_str == ']':
        if self._queue:
            if self._range:
                self._range = False
                # convert open range to '-'
                append('-')
            # appending None flushes the last queued character
            append(None)
        if self._invert:
            self._charset.invert()
        self._parent._sequence.append(self._charset.simplify())
        return self._parent
    else:
        raise RxpyError('Syntax error in character set')
    # after first character this must be known
    if self._invert is None:
        self._invert = False
    return self
def decode(buffer, alphabet):
    '''
    Convert the octal sequence to a character.

    `buffer` is parsed as a base 8 integer and the result passed to
    `alphabet.unescape`.  Any ordinary failure is reported as an
    `RxpyError`; `KeyboardInterrupt` and `SystemExit` are deliberately not
    intercepted (a bare `except` would have masked them).
    '''
    try:
        return alphabet.unescape(int(buffer, 8))
    except Exception:
        raise RxpyError('Bad octal escape: ' + buffer)