def _char_set_for(self, codepoint: int) -> CharSet:
    """
    Build the CharSet that matches the character designated by
    ``codepoint``.

    When case insensitivity is enabled on ``self``, the returned set is built
    from the character itself plus its ``str.lower()`` and ``str.upper()``
    variants. Otherwise it is built from the character alone.

    :param codepoint: Codepoint of the character to match.
    """
    char = chr(codepoint)
    if not self.case_insensitive:
        return CharSet(char)
    return CharSet(char, char.lower(), char.upper())
def _parse_range(self,
                 stream: SequenceReader) -> RegexpCollection.Parser:
    """
    Parse a regular expression for a character range.

    The stream is expected to be positioned on the opening ``[``. The
    closing ``]`` is consumed before returning. Malformed ranges are
    reported through ``check_source_language``.

    :param stream: Input regexp stream.
    :return: A ``Range`` parser for the parsed character set, negated if
        the range starts with ``^``.
    """
    # Consume the bracket *outside* of the assert statement: assertions are
    # removed under ``python -O``, and the stream must still be advanced.
    opening = stream.read()
    assert opening == '['

    ranges: List[Tuple[int, int]] = []

    # First, determine if this range must be negated
    negate = False
    if stream.next_is('^'):
        negate = True
        stream.read()

    # Now, read ranges...
    #
    # TODO: handle '-' and ']' in first position.
    in_range = False
    while not stream.eof and not stream.next_is(']'):
        if stream.next_is('-'):
            # A dash needs a left operand and cannot directly follow
            # another dash.
            check_source_language(bool(ranges and not in_range),
                                  'dangling dash')
            in_range = True
            stream.read()
        else:
            codepoint = (self._read_escape(stream)
                         if stream.next_is('\\') else
                         ord(stream.read()))
            if in_range:
                low, high = ranges.pop()
                # The left operand of a dash must be a single character.
                # User input such as "[a-b-c]" reaches this point with a
                # full range on the left, so report it as a regular error
                # rather than through an assertion.
                check_source_language(
                    low == high, 'invalid range: dash after a range')
                ranges.append((low, codepoint))
            else:
                ranges.append((codepoint, codepoint))
            in_range = False

    check_source_language(not in_range, 'dangling dash')
    check_source_language(stream.next_is(']'),
                          'unbalanced square bracket')
    # Same as for the opening bracket: keep the read out of the assert.
    closing = stream.read()
    assert closing == ']'

    # In case insensitivity is enabled, make sure both lowercase and
    # uppercase variants of all characters in ranges are present.
    if self.case_insensitive:
        char_set = CharSet()
        for low, high in ranges:
            for codepoint in range(low, high + 1):
                char = chr(codepoint)
                for c in (char, char.lower(), char.upper()):
                    char_set.add(c)
    else:
        char_set = CharSet.from_int_ranges(*ranges)

    if negate:
        char_set = char_set.negation
    return self.Range(char_set)
def _parse_range(cls, stream):
    """
    Parse a regular expression for a character range.

    The stream is expected to be positioned on the opening ``[``. The
    closing ``]`` is consumed before returning. Malformed ranges are
    reported through ``check_source_language``.

    :param file stream: Input regexp stream.
    :rtype: RegexpCollection.Parser
    """
    # Consume the bracket *outside* of the assert statement: assertions are
    # removed under ``python -O``, and the stream must still be advanced.
    opening = stream.read()
    assert opening == '['

    ranges = []

    # First, determine if this range must be negated
    negate = False
    if stream.next_is('^'):
        negate = True
        stream.read()

    # Now, read ranges...
    #
    # TODO: handle '-' and ']' in first position.
    in_range = False
    while not stream.eof and not stream.next_is(']'):
        if stream.next_is('-'):
            # A dash needs a left operand and cannot directly follow
            # another dash.
            check_source_language(bool(ranges and not in_range),
                                  'dangling dash')
            in_range = True
            stream.read()
        else:
            # Both branches yield an integer codepoint (``ord`` for plain
            # characters, ``_read_escape`` for escape sequences).
            codepoint = (cls._read_escape(stream)
                         if stream.next_is('\\') else
                         ord(stream.read()))
            if in_range:
                low, high = ranges.pop()
                # The left operand of a dash must be a single character.
                # User input such as "[a-b-c]" reaches this point with a
                # full range on the left, so report it as a regular error
                # rather than through an assertion.
                check_source_language(
                    low == high, 'invalid range: dash after a range')
                ranges.append((low, codepoint))
            else:
                ranges.append((codepoint, codepoint))
            in_range = False

    check_source_language(not in_range, 'dangling dash')
    check_source_language(stream.next_is(']'),
                          'unbalanced square bracket')
    # Same as for the opening bracket: keep the read out of the assert.
    closing = stream.read()
    assert closing == ']'

    char_set = CharSet.from_int_ranges(*ranges)
    if negate:
        char_set = char_set.negation
    return cls.Range(char_set)
def add_transition(self, chars: CharSet, next_state: DFAState) -> None:
    """
    Register a new outgoing transition on this state.

    :param chars: Set of input characters that trigger the transition: one
        character from this set is required to move to ``next_state``.
    :param next_state: State reached when following the new transition.
    """
    assert isinstance(chars, CharSet)
    assert isinstance(next_state, DFAState)

    # Transitions leaving a given state must be unambiguous: make sure that
    # ``chars`` does not overlap with the character set of any transition
    # that is already registered.
    for registered_chars, _ in self.transitions:
        assert not chars.overlaps_with(registered_chars), (
            'Overlapping input char sets: {} and {}'.format(
                chars, registered_chars))

    new_transition = (chars, next_state)
    self.transitions.append(new_transition)
from langkit.lexer.char_set import CharSet


def check_ranges(label, cs):
    """
    Print a labelled dump of the ranges stored in ``cs``.

    The output is a ``== label ==`` header, then one line holding all
    ``low-high`` pairs from ``cs.ranges`` separated by spaces, then an empty
    line.
    """

    def format_char(char):
        # Codepoints strictly above the space character and up to '~' are
        # displayed as the character itself; all others use an escaped
        # hexadecimal form.
        if ord(' ') < char <= ord('~'):
            return chr(char)
        return '\\U+{:04X}'.format(char)

    print('== {} =='.format(label))
    rendered = ' '.join(
        '{}-{}'.format(format_char(low), format_char(high))
        for low, high in cs.ranges
    )
    print(rendered)
    print('')


# Each entry is a label plus the arguments to pass to the CharSet
# constructor. CharSet instances are created one at a time, right before
# they are displayed.
for label, items in [
    ('Single', ('a',)),
    ('Adjacent 2 singles', ('a', 'b')),
    ('Non-adjacent 2 singles', ('a', 'c')),
    ('Reverted non-adjacent 2 singles', ('c', 'a')),
    ('Adjacent 3 singles', ('a', 'c', 'b')),
    ('Empty range', (('a', 'c'), ('d', 'c'))),
    ('Redundant single', (('a', 'c'), 'b')),
    ('Non-adjacent ranges', (('i', 'o'), ('a', 'c'))),
]:
    check_ranges(label, CharSet(*items))

# Move the upper bound of the second range around the lower bound of the
# first one.
for c in ('h', 'i', 'j', 'k'):
    check_ranges('Adjacent ranges - {}'.format(c),
                 CharSet(('i', 'o'), ('a', c)))

for label, items in [
    ('Overlapping ranges (1)', (('i', 'o'), ('a', 'o'))),
    ('Nested range', (('i', 'o'), ('a', 'p'))),
]:
    check_ranges(label, CharSet(*items))
def _parse_sequence(cls, stream):
    """
    Parse a sequence of regexps.

    Parsing stops at the end of the stream, at the first unmatched closing
    parenthesis or at the first top-level pipe character.

    :param file stream: Input regexp stream.
    :rtype: RegexpCollection.Parser
    """

    def read_until_closing_brace():
        # Accumulate characters up to (but not including) the next '}', or
        # up to the end of the stream if there is none.
        text = ''
        while not stream.eof and not stream.next_is('}'):
            text += stream.read()
        return text

    items = []
    while not (stream.eof or stream.next_is('|', ')')):
        if stream.next_is('('):
            # Nested group: recursively parse alternatives
            stream.read()
            items.append(cls._parse_or(stream))
            check_source_language(stream.next_is(')'),
                                  'unbalanced parenthesis')
            stream.read()

        elif stream.next_is('['):
            # Range of characters
            items.append(cls._parse_range(stream))

        elif stream.next_is('{'):
            # Reference to a named pattern
            stream.read()
            name = read_until_closing_brace()
            check_source_language(stream.next_is('}'),
                                  'unbalanced bracket')
            stream.read()
            check_source_language(rule_name_re.match(name),
                                  'invalid rule name: {}'.format(name))
            items.append(cls.Defer(name))

        elif stream.next_is('*', '+', '?'):
            # Repetition operator: it applies to the previous item
            check_source_language(items, 'nothing to repeat')
            check_source_language(
                not isinstance(items[-1], cls.Repeat),
                'multiple repeat')
            operator = stream.read()
            operand = items[-1]
            if operator == '*':
                repeated = cls.Repeat(operand)
            elif operator == '+':
                repeated = cls.Sequence([operand, cls.Repeat(operand)])
            else:
                repeated = cls.Opt(operand)
            items[-1] = repeated

        elif stream.next_is('.'):
            # As in most regexp dialects, "." stands for any character
            # *except* newlines.
            stream.read()
            items.append(cls.Range(CharSet('\n').negation))

        elif stream.next_is('^', '$'):
            check_source_language(
                False, 'matching beginning or ending is unsupported')

        elif stream.next_is('\\'):
            # Escape sequence: either a Unicode character, a Unicode
            # property or a simple escape sequence.
            stream.read()

            if not stream.next_is('p', 'P'):
                # Regular escape: put the backslash back and let the
                # dedicated helper decode the whole sequence.
                stream.go_back()
                items.append(
                    cls.Range(CharSet.from_int(cls._read_escape(stream))))
                continue

            # \p and \P designate character sets from Unicode general
            # categories. The category name must appear between curly
            # brackets.
            action = stream.read()
            check_source_language(
                stream.next_is('{'),
                'incomplete Unicode category matcher')
            stream.read()
            category = read_until_closing_brace()
            check_source_language(
                stream.next_is('}'),
                'incomplete Unicode category matcher')
            stream.read()

            try:
                char_set = CharSet.for_category(category)
            except KeyError:
                check_source_language(
                    False,
                    'invalid Unicode category: {}'.format(category))
            if action == 'P':
                char_set = char_set.negation
            items.append(cls.Range(char_set))

        else:
            # Any other character matches itself
            items.append(cls.Range(CharSet(stream.read())))

    return cls.Sequence(items)
def _parse_sequence(self,
                    stream: SequenceReader) -> RegexpCollection.Parser:
    """
    Parse a sequence of regexps.

    Stop at the end of the stream, at the first unmatched parenthesis or at
    the first top-level pipe character. Malformed input is reported through
    ``check_source_language``.

    :param stream: Input regexp stream.
    :return: A ``Sequence`` parser wrapping all parsed items.
    """
    subparsers = []
    while True:
        if stream.eof or stream.next_is('|', ')'):
            break

        elif stream.next_is('('):
            # Nested group: recursively parse alternatives
            stream.read()
            subparsers.append(self._parse_or(stream))
            check_source_language(stream.next_is(')'),
                                  'unbalanced parenthesis')
            stream.read()

        elif stream.next_is('['):
            # Parse a range of characters
            subparsers.append(self._parse_range(stream))

        elif stream.next_is('{'):
            # Parse a reference to a named pattern
            stream.read()
            name = ''
            while not stream.eof and not stream.next_is('}'):
                name += stream.read()
            check_source_language(stream.next_is('}'),
                                  'unbalanced bracket')
            stream.read()
            check_source_language(
                rule_name_re.match(name) is not None,
                'invalid rule name: {}'.format(name))
            subparsers.append(self.Defer(name))

        elif stream.next_is('*', '+', '?'):
            # Repeat the previous sequence item
            check_source_language(bool(subparsers), 'nothing to repeat')
            check_source_language(
                not isinstance(subparsers[-1], self.Repeat),
                'multiple repeat')
            wrapper = {
                '*': lambda p: self.Repeat(p),
                '+': lambda p: self.Sequence([p, self.Repeat(p)]),
                '?': lambda p: self.Opt(p)
            }[stream.read()]
            subparsers[-1] = wrapper(subparsers[-1])

        elif stream.next_is('.'):
            # Generally, "." designates any character *except* newlines. Do
            # the same here.
            stream.read()
            subparsers.append(self.Range(CharSet('\n').negation))

        elif stream.next_is('^', '$'):
            check_source_language(
                False, 'matching beginning or ending is unsupported')

        elif stream.next_is('\\'):
            # Parse an escape sequence. It can be a Unicode character, a
            # Unicode property or a simple escape sequence.
            stream.read()

            # \p and \P refer to character sets from Unicode general
            # categories.
            if stream.next_is('p', 'P'):
                action = stream.read()

                # Read the category name, which must appear between curly
                # brackets.
                category = ''
                check_source_language(
                    stream.next_is('{'),
                    'incomplete Unicode category matcher')
                stream.read()
                while not stream.eof and not stream.next_is('}'):
                    category += stream.read()
                check_source_language(
                    stream.next_is('}'),
                    'incomplete Unicode category matcher')
                stream.read()

                # If case insensitivity is enabled, the presence of either
                # the Ll, Lu or Lt categories automatically enable the
                # presence of the others.
                #
                # This is because X.upper() can turn codepoints from Ll or
                # Lt into codepoints from Lu and X.lower() can turn
                # codepoints from Lu or Lt into codepoints from Ll.
                #
                # This merge must only happen for case insensitive lexers:
                # otherwise, "\p{Lu}" would also match lowercase letters.
                if (self.case_insensitive
                        and category in ("Ll", "Lu", "Lt")):
                    char_set = (CharSet.for_category("Ll") |
                                CharSet.for_category("Lu") |
                                CharSet.for_category("Lt"))
                else:
                    try:
                        char_set = CharSet.for_category(category)
                    except KeyError:
                        check_source_language(
                            False,
                            f'invalid Unicode category: {category}')

                if action == 'P':
                    char_set = char_set.negation
                subparsers.append(self.Range(char_set))

            else:
                # Regular escape: put the backslash back so that the
                # dedicated helper sees the whole escape sequence.
                stream.go_back()
                subparsers.append(self.Range(self._parse_escape(stream)))

        else:
            # Regular character: match it, taking case insensitivity into
            # account.
            char_set = self._char_set_for(ord(stream.read()))
            subparsers.append(self.Range(char_set))

    return self.Sequence(subparsers)