def _build_parse_rest_of_etc():
    """Build the parser for the remainder of a double-quoted string.

    The built function consumes scanner input up to and including either
    the closing double quote or the end of the surface line, returning the
    parsed sexps as a list.
    """

    def parse_rest_of_double_quoted_string_so_far(scn):
        # A list (not a tuple) so callers may pop trailing sexps (#here1)
        return list(do_parse(scn))

    def do_parse(scn):
        # Alternate between runs of ordinary content and escape sequences,
        # stopping when a chunk yields neither
        while True:
            plain = scn.scan(one_or_more_not_this_or_that)
            saw_backslash = scn.skip(a_backslash)
            if saw_backslash:
                esc_char = scn.scan_required(valid_backslash_ting)
            if not (plain or saw_backslash):
                break
            if not saw_backslash:
                esc_typ = None
            elif 'n' == esc_char:
                esc_typ = 'newline_escape_sequence'
            elif 't' == esc_char:
                esc_typ = 'tab_escape_sequence'
            else:
                assert '"' == esc_char
                esc_typ = 'double_quote_escape_sequence'
            yield 'unencoded_string_content', (plain or '')
            yield 'any_escape_sequence', esc_typ
        # We are now at either end-of-line or the closing double quote
        if scn.empty:
            yield ('end_of_surface_line', )
            return
        scn.skip_required(double_quote)
        yield ('end_of_double_quoted_string', )

    from text_lib.magnetics.string_scanner_via_string import \
        pattern_via_description_and_regex_string as o

    one_or_more_not_this_or_that = o(
        'one or more not double quote or backslash', r'[^"\\]+')
    a_backslash = o('a backslash', r'\\')
    valid_backslash_ting = o("'t' or 'n' or '\"'", '[tn"]')
    double_quote = o("double quote", '"')
    return parse_rest_of_double_quoted_string_so_far
def _spans_of_old(throwing_listener):
    """Build a generator function that yields the (begin, end) position
    spans of the comma-separated cells in one newline-terminated line.
    """

    from text_lib.magnetics.string_scanner_via_string import \
        StringScanner as string_scanner_via, \
        pattern_via_description_and_regex_string as o

    zero_or_more_not_special = o('zero or more not special', r'[^,"\\\n\r]*')
    one_or_more_spaces = o('one or more spaces', '[ ]+')

    def spans_via_line(line):
        scn = string_scanner_via(line, throwing_listener)
        span_begin = scn.pos
        while True:
            # Eat up to the next special character (always succeeds: *-regex)
            scn.skip_required(zero_or_more_not_special)
            if scn.empty:
                xx("line ended without a terminating newline")
            span_end = scn.pos
            yield span_begin, span_end
            ch = line[span_end]
            if ',' == ch:
                # Cell separator: step over it (and any padding spaces)
                scn.advance_by_one()
                scn.skip(one_or_more_spaces)
                if scn.empty:
                    xx()
                span_begin = scn.pos
                continue
            if '\n' == ch:
                scn.advance_by_one()
                assert scn.empty
                break
            if '"' == ch:
                xx("have fun parsing escaped quotes")
            xx(f"unexpected character {ch!r}")

    return spans_via_line
def _build_tokenized_sections_via():
    """Build the function that tokenizes each section key of a document,
    yielding one sexp per section. A parse failure ends the stream early.
    """

    def tokenized_sections_via(docu, body_of_text, listener):
        tl = build_throwing_listener(listener, stop)
        for sect_el in _section_elements(docu, body_of_text, listener):
            scn = StringScanner(sect_el.string_key(), tl)
            try:
                yield parse_section_key(scn, sect_el)
            except stop:
                break

    def parse_section_key(scn, sect_el):
        head = scn.scan_required(first_word)
        if 'document-meta' == head:
            scn.skip_required(eos)
            return 'document_meta', None, sect_el
        # Otherwise it's an entity section: "entity: <ID>: attributes"
        scn.skip_required(colon)
        eid = scn.scan_required(identifier)
        scn.skip_required(colon)
        scn.skip_required(attributes)
        scn.skip_required(eos)
        return 'entity_section', eid, sect_el

    from text_lib.magnetics.string_scanner_via_string import \
        StringScanner, pattern_via_description_and_regex_string as o, \
        build_throwing_listener

    first_word = o('entity|document-meta', r'(entity|document-meta)\b')
    eos = o('end of string', '$')
    colon = o("colon (':') and space", ': ')  # or make space optional
    identifier = o('identifier', '[A-Z0-9]{3}')
    attributes = o("'attributes' keyword", r'attributes\b')

    class stop(RuntimeError):
        # Our own exception class rather than a shared one: safer
        pass

    return tokenized_sections_via
def _build_name_gist_via_name():
    """Build the function that reduces an attribute name to its "gist":
    the underscore-joined pieces, lowercased and re-joined (separators
    removed). Returns None (after emitting to the listener) on bad input.
    """

    from text_lib.magnetics.string_scanner_via_string import \
        StringScanner, pattern_via_description_and_regex_string as o

    # Reflexive patterns for the pieces of an attribute name
    all_LC_or_UC = o(
        'all lowercase or all uppercase attribute name piece',
        r'[a-z0-9]+|[A-Z0-9]+')
    field_name_word_separator = o('underscore', '_')
    # at #history-A.3, changed from dash for [#873.21] (use underscores not..)

    def name_gist_via_name(attr_name_string, listener):
        # (before #history-A.1, this was how _AttributeName was built)
        scn = StringScanner(attr_name_string, listener)
        gist_pieces = []
        while True:
            piece = scn.scan_required(all_LC_or_UC)
            if piece is None:
                return  # (listener already saw the complaint)
            gist_pieces.append(piece.lower())
            if scn.empty:
                return ''.join(gist_pieces)
            if not scn.skip_required(field_name_word_separator):
                return

    return name_gist_via_name
def sexps_via_lines(lines, listener=None):
    """Parse GraphViz-ish lines, yielding node/edge sexps (a generator).

    NOTE this is a rough proof-of-concept. It will *not* parse all GraphViz
    documents, nor is it intended to [etc the usual disclaimer..] However,
    wherever it was a good fit, we tried to use names from the published
    grammar

    There's a CLI-exposed toolkit for developing this under `kst`
    """

    # == States [#008.2]
    #    (each state yields (test, action) pairs, tried in order)

    def from_beginning_state():
        yield if_open_digraph_line, move_to_inside_digraph

    def from_root_of_document():
        yield if_blank_line, ignore_for_now
        yield if_open_multiline_comment_line, enter_multiline_comment

    def from_inside_digraph():
        yield if_blank_line, ignore_for_now
        yield if_open_node_statement, handle_line_that_begins_node_statement
        yield if_edge_statement, handle_line_that_begins_edge_statement
        yield if_set_an_attribute_statement, ignore_for_now  # comment line one day
        yield if_close_clurly, close_digraph

    def from_inside_attribute_list():
        yield if_scan_attribute_assignment_left_hand_side, push_to_attr_value
        yield if_skip_a_comma, do_nothing
        yield if_scan_close_square_bracket, pop_out_of_attr_list_kinda_big_deal

    def from_before_attribute_value():
        yield if_scan_a_double_quote, BEGIN_TO_PARSE_DOUBLE_QUOTED_VALUE
        yield true, parse_a_not_double_quoted_value

    def from_inside_double_quoted_string():
        yield true, parse_rest_of_inside_quoted_string

    def from_inside_multiline_comment():
        yield if_line_that_closes_multiline_comment, pop_out_of_multiline_comme
        yield true, ignore_for_now

    # == Actions

    def move_to_inside_digraph():
        move_to(from_root_of_document)
        push_to(from_inside_digraph)

    # -- mess with line scanning

    def handle_line_that_begins_node_statement():
        store['element_start_lineno'] = lineno
        store['current_entity_type'] = 'node'
        md = store.pop('last_match')
        store['current_node_identifier'] = md['node_identifier']
        scn = build_scanner(line)
        scn.advance_to_position(md.span()[1])  # resume just past the "["
        store['current_string_scanner'] = scn
        push_to(from_inside_attribute_list)
        return parse_to_end_of_line()

    def handle_line_that_begins_edge_statement():
        store['element_start_lineno'] = lineno
        store['current_entity_type'] = 'edge'
        md = store.pop('last_match')
        store['left_node_identifier'], store['left_node_port'] = md.groups()
        scn = build_scanner(line)
        scn.advance_to_position(md.span()[1])  # resume just past the "->"
        store['current_string_scanner'] = scn
        store['right_node_identifier'] = scn.scan_required(identifier)
        rhs_port = None
        if ':' == scn.peek(1):
            scn.advance_by_one()
            rhs_port = scn.scan_required(identifier)
        store['right_node_port'] = rhs_port
        store['current_attribute_list'] = []
        # If the edge has attributes, parse them, else you're done with line
        if scn.skip(open_square_bracket):
            push_to(from_inside_attribute_list)
            return parse_to_end_of_line()
        skip_required_end_of_line(scn)
        store.pop('current_string_scanner')
        return finish_edge_or_node()  # (but we know it's edge)

    def push_to_attr_value():
        store['current_attribute_name']  # sanity check, catch it early
        push_to(from_before_attribute_value)

    def BEGIN_TO_PARSE_DOUBLE_QUOTED_VALUE():
        scn = self.scn
        sxs = _parse_rest_of_double_quoted_string_so_far(scn)
        typ, = sxs[-1]
        assert 'from_before_attribute_value' == stack[-1].__name__
        # NOTE(review): on the edge path 'current_attribute_list' was already
        # initialized to [] by the edge handler; confirm _NoClobberDict
        # tolerates re-assignment here (the node path relies on this line
        # for the initial assignment)
        store['current_attribute_list'] = []
        if 'end_of_surface_line' == typ:
            sxs.pop()  # #here1
            store['current_double_quoted_string_sexp'] = sxs
            push_to(from_inside_double_quoted_string)
            return
        assert 'end_of_double_quoted_string' == typ
        xx("easy no problem. when the label ends on the same line. EASY")

    def parse_rest_of_inside_quoted_string():
        # A continuation line of a quoted value that opened on an earlier line
        scn = build_scanner(line)
        sxs = _parse_rest_of_double_quoted_string_so_far(scn)
        typ, = sxs[-1]
        if 'end_of_surface_line' == typ:
            sxs.pop()  # #here1
            store['current_double_quoted_string_sexp'].extend(sxs)
            return  # stay
        assert 'end_of_double_quoted_string' == typ
        sxs.pop()  # #here1
        accum_sxs = store.pop('current_double_quoted_string_sexp')
        accum_sxs.extend(sxs)
        k = store.pop('current_attribute_name')
        val_sexp = 'double_quoted_string', accum_sxs
        store['current_attribute_list'].append((k, val_sexp))
        assert 'from_inside_double_quoted_string' == stack.pop().__name__
        assert 'from_before_attribute_value' == stack.pop().__name__  # 😢
        store['current_string_scanner'] = scn
        return parse_to_end_of_line()

    def parse_to_end_of_line():
        res = None
        while self.scn.more:
            action = find_transition()
            res = action()
            if res is None:
                continue
            # The only way you can produce something is at the end of the line
            assert self.scn.empty
            break
        store.pop('current_string_scanner')
        return res

    def parse_a_not_double_quoted_value():
        literal_value = self.scn.scan_required(identifier)
        k = store.pop('current_attribute_name')
        val_sexp = 'identifier_as_attribute_value', literal_value
        store['current_attribute_list'].append((k, val_sexp))
        assert 'from_before_attribute_value' == stack.pop().__name__

    # --

    def pop_out_of_attr_list_kinda_big_deal():
        assert 'from_inside_attribute_list' == stack[-1].__name__
        stack.pop()
        return finish_edge_or_node()

    def finish_edge_or_node():
        assert 'from_inside_digraph' == stack[-1].__name__  # or not
        typ = store.pop('current_entity_type')
        alist = store.pop('current_attribute_list')
        if 'edge' == typ:
            return finish_edge(alist)
        assert 'node' == typ
        return finish_node(alist)

    def finish_node(alist):
        iden = store.pop('current_node_identifier')
        this = _finish_alist(alist)  # ..
        use_lineno = store.pop('element_start_lineno')
        sx = _NodeSexp(iden, this, use_lineno)
        return 'yield_this', sx

    def finish_edge(alist):
        these = (store.pop('left_node_identifier'),
                 store.pop('left_node_port'),
                 store.pop('right_node_identifier'),
                 store.pop('right_node_port'))
        this = _finish_alist(alist)  # ..
        use_lineno = store.pop('element_start_lineno')
        sx = _EdgeSexp(*these, this, use_lineno)
        return 'yield_this', sx

    def close_digraph():
        assert 'from_inside_digraph' == stack[-1].__name__
        stack.pop()
        assert 'from_root_of_document' == stack[-1].__name__

    # --

    def enter_multiline_comment():
        push_to(from_inside_multiline_comment)

    def pop_out_of_multiline_comme():
        assert 'from_inside_multiline_comment' == stack[-1].__name__
        stack.pop()

    def ignore_for_now():
        pass

    do_nothing = ignore_for_now

    # == Tests

    def if_blank_line():
        return '\n' == line

    def if_open_multiline_comment_line():
        md = open_comment_simple_rx.match(line)
        if md is None:
            return
        pos = line.find('*/', 2)
        if -1 != pos:
            xx("ugh can we not have single-line comments please")
        return True

    def if_line_that_closes_multiline_comment():
        pos = line.find('*/')
        return -1 != pos

    def if_open_digraph_line():
        return re.match(f'digraph {iden_rsx}[ ]?\\{{$', line)

    def if_set_an_attribute_statement():
        return re.match(f'{iden_rsx}=', line)  # big meh

    def if_open_node_statement():
        md = looks_like_open_node_rx.match(line)
        if md is None:
            return
        store['last_match'] = md
        return True

    def if_edge_statement():
        md = huge_peek_for_edge_rx.match(line)
        if md is None:
            return
        store['last_match'] = md
        return True

    # -- tests that use scanner

    def if_scan_attribute_assignment_left_hand_side():
        scn = self.scn
        scn.skip(one_or_more_space_characters)  # #here2
        s = scn.scan(identifier)
        if s is None:
            return
        store['current_attribute_name'] = s
        scn.skip_required(equals)
        return True

    def if_scan_a_double_quote():
        return self.scn.skip(double_quote)

    def if_skip_a_comma():  # assume #here2
        return self.scn.skip(comma)

    def if_scan_close_square_bracket():
        scn = self.scn
        yes = scn.skip(close_square_bracket)
        if not yes:
            return
        skip_required_end_of_line(scn)
        return True

    def skip_required_end_of_line(scn):
        if scn.skip(newline):
            assert scn.empty
            return
        xx(f"Maybe this is an end-of-line comment which is allowed: {scn.rest()}")  # noqa: E501

    def if_close_clurly():
        return '}\n' == line  # meh

    def true():
        return True

    # == used below

    iden_rsx = '[a-zA-Z_][a-zA-Z0-9_]*'

    # ==

    def build_scanner(line):
        return StringScanner(line, tlistener, cstacker)

    def cstacker():
        return ({'line': line, 'lineno': lineno}, )

    from text_lib.magnetics.string_scanner_via_string import \
        StringScanner, \
        pattern_via_description_and_regex_string as o, \
        build_throwing_listener

    tlistener = build_throwing_listener(listener, _Stop)

    identifier = o('identifier', iden_rsx)
    open_square_bracket = o('open square bracket', r'\[')
    equals = o("equals sign", '=')
    double_quote = o("double quote", '"')
    comma = o('comma', ',')
    close_square_bracket = o('close square bracket', r'\]')
    one_or_more_space_characters = o('spaces', '[ ]+')
    newline = o('newline', r'\n')

    # ==

    import re

    # NOTE(review): these VERBOSE patterns ignore whitespace in the pattern,
    # so e.g. "a -> b" (with spaces) would not match — presumably the input
    # format is the space-free "a->b[...]" shape; confirm against `kst`
    looks_like_open_node_rx = re.compile(
        f"""
        (?P<node_identifier> {iden_rsx} )
        \\[
        """, re.VERBOSE)

    huge_peek_for_edge_rx = re.compile(
        f"""
        (?P<left_node_identifier> {iden_rsx} )
        (?: : (?P<left_node_port> {iden_rsx} ) )?
        ->
        """, re.VERBOSE)

    open_comment_simple_rx = re.compile(r'/\*')

    # == Interacting with FSA state:

    def move_to(state_function):
        stack[-1] = state_function

    def push_to(state_function):
        stack.append(state_function)

    # ==

    def find_transition():
        # Find the first transition whose test passes; otherwise complain
        # (with a caret diagram when mid-line in a scanner)
        for test, action in stack[-1]():
            yn = test()
            if yn:
                return action
        reason_head = f"No transition found {stack[-1].__name__}"
        scn = store.get('current_string_scanner')
        if scn:
            def build_lines():
                # (fixed: was `''.join(reason_head, '\n')` — TypeError,
                # str.join takes exactly one iterable argument)
                yield f"{reason_head}\n"
                # (fixed: was missing the 'f' prefix)
                yield f" {line}"
                yield ''.join((' ', '-' * scn.pos, '^\n'))
            reason = ''.join(build_lines())
        else:
            reason = f"{reason_head} for {line!r}"
        xx(reason)

    # ==

    class HeyGuysWhatsUp:
        @property
        def scn(self):
            return store['current_string_scanner']

    self = HeyGuysWhatsUp()
    store = _NoClobberDict()
    stack = [from_beginning_state]
    lineno = 0
    try:
        for line in lines:
            lineno += 1
            while True:  # (there may be a 'redo' directive in the future)
                action = find_transition()
                direc = action()
                if direc is None:
                    break  # parse next line, if any
                typ = direc[0]
                assert 'yield_this' == typ
                product, = direc[1:]
                yield product
                break  # parse next line, if any
    except _Stop:
        return
    if 1 != len(stack):
        # (fixed: was missing the 'f' prefix)
        xx(f"something didn't close, can't end {stack[-1].__name__}")
def _build_function_call_parser():
    """Build a parser for strings like "func(arg1, arg2)".

    The built function returns (function_name, arg_sexps) where each arg
    sexp is ('variable_name', s) or ('literal_value', s).
    """

    def egads(string, listener):
        scn = StringScanner(string, build_throwing_listener(listener))
        return parse_call(scn)

    def parse_call(scn):
        # (there is a much simpler version of something like this at [#882.T])
        func_name = scn.scan_required(func_name_symbol)
        scn.skip_required(open_paren)
        arg_sexps = []
        if not scn.skip(close_paren):  # #here2 (zero-argument call)
            while True:
                arg_sexps.append(parse_one_arg(scn))
                if scn.skip(close_paren):  # #here2
                    break
                scn.skip_required(comma)
        if scn.more:
            xx('cover extra characters after close')
        return func_name, tuple(arg_sexps)

    def parse_one_arg(scn):
        s = scn.scan(variable_name_symbol)
        if s:
            return 'variable_name', s
        s = scn.scan(hacky_mixed_value_sym)
        if s:
            return 'literal_value', s
        scn.whine_about_expecting(variable_name_symbol, hacky_mixed_value_sym)
        assert ()  # (unreachable: the whine raised through the listener)

    def build_throwing_listener(listener):
        def use_listener(sev, *rest):
            listener(sev, *rest)
            if 'error' == sev:
                raise stop()
        return use_listener

    from text_lib.magnetics.string_scanner_via_string import \
        StringScanner, \
        pattern_via_description_and_regex_string as o

    func_name_symbol = o('function_name', iden)
    open_paren = o('open_paren', r'\(')
    # (the lookahead assertion keeps a variable name from matching "foo.md")
    variable_name_symbol = o('variable_name', f"{iden}(?=[,)])")
    comma = o('comma', r',[ ]*')
    close_paren = o('close_paren', r'\)')
    hacky_mixed_value_sym = o('hacky_mixed_value', '[^,)]+')  # assume not var
    stop = _Stop
    return egads
content_s = scn.scan_required(_some_content) # (Case1403) ⏛ [#873.5] if content_s is None: return if False and content_s[0] in ('"', "'"): # allow literal quotes in values since #history-B.6 raise Exception( # #not-covered "Can we please just not bother with quotes ever? " "It seems they may neve be necessary for us in these files " f"({repr(content_s)}") return _Field(field_name, content_s, posov) _field_name = o('field name', r'[a-zA-Z][_a-zA-Z0-9]*') # (real recsel doesn't allow multbyte in first char, or dashes anywhere) _colon = o('colon', ':') _space = o('space', '[ ]+') _some_content = o('some content', r'[^\n]+') class _Field: # property names are derived from names used in /usr/local/include/rec.h # however, we have inflected the names further with local conventions def __init__(self, nn, vv, posov): self.field_name = nn
def _build_element_parser(tlistener, path=None):
    """Build a parser that turns a GraphViz AST element into a table/edge def.

    `tlistener` is a throwing listener for the string scanner; `path`, when
    given, is included in the context stack dicts for error reporting.
    The built function returns ('my_table_def', _MyTableDef) for node
    expressions, or ('my_edge_def', ast) for edge expressions.
    """

    def parse_element(ast):
        typ = ast[0]
        if 'node_expression' == typ:
            return parse_node(ast)
        assert 'edge_expression' == typ
        return 'my_edge_def', ast  # it's fine as-is as a forward def

    def parse_node(ast):
        # A node's 'label' attribute holds the table name then one column
        # definition per line, pipe-separated
        label = ast.attributes['label']
        cstacker = cstacker_via_AST(ast)
        scn = StringScanner(label, tlistener, cstacker)
        table_name = scn.scan_required(identifier)
        scn.skip_required(pipe_and_newline)
        cstacker.plus += 1  # advance the reported lineno past this line
        my_cols = []
        while True:
            my_col = parse_column_definition(scn)
            cstacker.plus += 1
            my_cols.append(my_col)
            if scn.empty:
                break
        return 'my_table_def', _MyTableDef(table_name, my_cols, ast)

    def parse_column_definition(scn):
        # Parse any port name (e.g. "<p1> ")
        port_name = None
        if scn.skip(less_than):
            port_name = scn.scan_required(identifier)
            scn.skip_required(greater_than)
            scn.skip_required(space)
        # Parse the column name and type (very strict for now)
        col_name = scn.scan_required(identifier)
        scn.skip_required(space)
        col_abs_typ = scn.scan_required(abstract_types)
        # Constraints: each may appear at most once, in any order
        kw = {'is_prim': False, 'null_OK': False, 'is_uniq': False}
        pool = {'is_prim': primary, 'is_uniq': unique, 'null_OK': null_ok}

        def find_first_one():
            # Return the key of the first pooled constraint that matches
            # from this point (or None)
            for k, v in pool.items():
                yn = scn.scan(v)
                if yn:
                    return k
        while pool:
            # Do you match any constraint in the pool from this point?
            k = find_first_one()
            # If you matched no constraints, forget the pool, you're done
            if k is None:
                break
            # (special handling for this one that's a two-token sequence meh:
            # "primary" must be followed by "key")
            if 'is_prim' == k:
                scn.skip_required(key_token)
            # While every attribute is false by default this is easier
            assert kw[k] is False
            kw[k] = True
            # Keep looking for more constraints as long as you have unused ones
            pool.pop(k)
        w = scn.skip(end_of_column_def)
        if w is None:
            oh_boy = (primary, unique, null_ok, end_of_column_def)
            scn.whine_about_expecting(*oh_boy)
        return _MyColDef(port_name, col_name, col_abs_typ, **kw)

    def cstacker_via_AST(ast):
        # A context-stack callback whose `plus` attribute is mutated by the
        # caller to track the current line offset within the label
        def cstacker():
            dct = {}
            dct['lineno'] = (ast.lineno + cstacker.plus)
            if path:
                dct['path'] = path
            return (dct, )
        cstacker.plus = 0
        return cstacker

    from text_lib.magnetics.string_scanner_via_string import \
        StringScanner, pattern_via_description_and_regex_string as o

    # == Patterns

    # Port
    less_than = o('less_than than', '<')
    greater_than = o('greater than', '>')

    # (The below follow the order of here just because:)
    # https://www.sqlite.org/lang_createtable.html

    # Type
    abstract_types = o("'int' or 'text'", '(?:int|text)')

    # Primary or "null OK" or Unique
    primary = o('primary', r'[ ]primary\b')
    null_ok = o('null_ok', r'[ ]null_ok\b')
    unique = o('unique', r'[ ]unique\b')
    key_token = o('key', r'[ ]key\b')

    # Common
    identifier = o('identifier', '[a-zA-Z][a-zA-Z0-9]*(?:_[a-zA-Z][a-zA-Z0-9]*)*')
    space = o('space', '[ ]')
    pipe_and_newline = o('pipe and newline', r'\|\n')  # redundant w/ next
    end_of_column_def = o('pipe and newline or end of string', r'(?:\|\n|$)')

    # ==

    return parse_element