def check_intrinsics():
    """Report %...% references that are unknown or that use an old name.

    Only '#LITERAL' (text) nodes are scanned. Scanning all of spec.text
    for %...% would also find occurrences in element IDs, which are
    lower-cased. (Note that this skips occurrences of
    "%<var>Foo</var>Prototype%".)

    A reference using an "old name" is tolerated when it lies strictly
    inside one of the spans in well_known_intrinsics_table_spans.
    """
    stderr("checking intrinsics...")
    header("checking intrinsics...")

    intrinsic_reo = re.compile(r'%\S+%')
    placeholders = ('%name%', '%name.a.b%')
    metavariable_interpolations = ('%_NativeError_%', '%_TypedArray_%')

    for text_node in spec.doc_node.each_descendant_named('#LITERAL'):
        matches = intrinsic_reo.finditer(
            spec.text, text_node.start_posn, text_node.end_posn)
        for match in matches:
            itext = match.group(0)
            if itext in placeholders or itext in metavariable_interpolations:
                continue
            itext_start = match.start(0)

            status = well_known_intrinsics.get(itext, "doesn't exist")
            if status == "doesn't exist":
                msg_at_posn(itext_start, f"Intrinsic doesn't exist: {itext}")
                continue
            if not status.startswith("old name"):
                continue

            # Old names are only reported outside the defining table(s).
            inside_a_table = any(
                span_start < itext_start < span_end
                for (span_start, span_end) in well_known_intrinsics_table_spans
            )
            if not inside_a_table:
                msg_at_posn(itext_start, f"Using {status}")
def check_for_extra_blank_lines():
    """Report runs of blank lines, and blank lines before </emu-clause>.

    A "blank" line here may contain spaces (but nothing else).
    """
    stderr("checking for extra blank lines...")

    # Each check is: (pattern, how to pick the reported position, message).
    checks = [
        (
            r'\n( *\n){2,}',
            lambda m: m.end() - 1,
            "2 or more adjacent blank lines",
        ),
        (
            r'\n( *\n *</emu-clause>)',
            lambda m: m.start(1),
            "blank line before end-clause tag",
        ),
    ]
    for (pattern, posn_of, message) in checks:
        for match in re.finditer(pattern, spec.text):
            msg_at_posn(posn_of(match), message)
def check_tag_indent(line_s, tag_s, element_name):
    """Check that a tag starts its line and sits at the expected indent.

    line_s is the position where the tag's line starts and tag_s is the
    position of the tag itself; the text between them must be empty or
    all whitespace. `expected_indent` is not a parameter: it is a free
    variable resolved from an enclosing scope.
    """
    leading_text = spec.text[line_s:tag_s]

    # An empty string and an all-whitespace string both strip to ''.
    if leading_text.strip():
        msg_at_posn(tag_s, f"{element_name} tag isn't the first non-blank thing on the line")
        return

    actual_indent = len(leading_text)
    if actual_indent != expected_indent:
        msg_at_posn(tag_s, f"expected indent={expected_indent}, got {actual_indent}")
def check_characters():
    """Report every character that is neither a newline nor in ' '..'~'.

    U+211D is skipped. When `ascii_replacement` has an entry for the
    character, the message includes it as a suggestion.
    """
    stderr("checking characters...")
    header("checking characters...")

    for match in re.finditer(r'[^\n -~]', spec.text):
        at = match.start()
        ch = spec.text[at]

        if ch == '\u211d':
            # PR 1135 introduced tons of these
            continue

        suggestion = (
            ": maybe change to %s" % ascii_replacement[ch]
            if ch in ascii_replacement
            else ''
        )
        msg_at_posn(at, "non-ASCII character U+%04x%s" % (ord(ch), suggestion))
def check_lines(lo, hi, emi):
    """Recursively check the indentation of lines lo (inclusive) to hi (exclusive).

    Each entry of `line_` is unpacked as an (indent, posn) pair, where
    posn is what gets handed to msg_at_posn for reporting.

    The first line of the range is expected to have indent `emi`.
    Later lines with the same indent as the first line are its
    siblings; the lines between consecutive siblings are checked
    recursively, expecting one more INDENT_UNIT of indentation.

    An empty range (lo == hi) is accepted silently.
    """
    if lo == hi:
        return
    assert lo < hi

    (top_indent, top_posn) = line_[lo]
    if top_indent != emi:
        msg_at_posn(top_posn, f"expected indent={emi}, got {top_indent}")

    siblings = []
    for i in range(lo + 1, hi):
        (indent, posn) = line_[i]
        if indent < top_indent:
            # This line is indented less than the line that opened the
            # range, so the expectation that failed is indent >= top_indent.
            # (The message formerly said "indent<", contradicting the test.)
            msg_at_posn(posn, f"expected indent>={top_indent}, got {indent}")
            # Treat it as a sibling anyway, so that the lines after it
            # are still checked relative to something. (I guess.)
            siblings.append(i)
        elif indent == top_indent:
            siblings.append(i)

    # Pair each group-opening line with the next one (or with hi),
    # and check the lines strictly between them one level deeper.
    for (opener, next_opener) in zip([lo] + siblings, siblings + [hi]):
        check_lines(opener + 1, next_opener, top_indent + INDENT_UNIT)
def check_characters():
    """Report every character that is neither a newline nor in ' '..'~'.

    Note that this will (among other things) find and complain about
    TAB characters. U+211D, U+2124 and U+1D53D are skipped. When
    `ascii_replacement` has an entry for the character, the message
    includes it as a suggestion.
    """
    stderr("checking characters...")

    tolerated = (
        '\u211d',  # PR 1135 introduced tons of these
        '\u2124',
        '\U0001d53d',
    )

    for match in re.finditer(r'[^\n -~]', spec.text):
        at = match.start()
        ch = spec.text[at]
        if ch in tolerated:
            continue

        suggestion = (
            ": maybe change to %s" % ascii_replacement[ch]
            if ch in ascii_replacement
            else ''
        )
        msg_at_posn(at, "non-ASCII character U+%04x%s" % (ord(ch), suggestion))
def _check_section_order(section):
    """Check that property subsections appear in "alphabetical order".

    Applies to sections whose kind is a group of properties, properties
    of an intrinsic object, or properties of instances. Titles are
    compared via a normalized key (lower-cased, 'int8' padded to
    'int08', leading 'get ' dropped, ' [ @@foo ]' rewritten), and each
    title must sort strictly after the previous one. Recurses into
    every child section.
    """
    ordered_kinds = (
        'group_of_properties1',
        'group_of_properties2',
        'properties_of_an_intrinsic_object',
        'properties_of_instances',
    )
    unordered_child_kinds = (
        'group_of_properties1',
        'group_of_properties2',
        'catchall',
        'anonymous_built_in_function',
    )

    if section.element_name == '#DOC':
        stderr("_check_section_order...")
    elif section.section_kind in ordered_kinds:
        prev_key = None
        prev_title = None
        for child in section.section_children:
            if child.section_kind in unordered_child_kinds:
                continue
            assert re.search(r'_property(_xref)?$', child.section_kind), child.section_kind

            key = child.section_title.lower().replace('int8', 'int08')
            key = re.sub(r'^get ', '', key)
            # Outside the RegExp prototype, the 'zz_' prefix pushes
            # symbol-keyed properties after the string-keyed ones.
            if section.section_title == 'Properties of the RegExp Prototype Object':
                symbol_replacement = r'.\1'
            else:
                symbol_replacement = r'.zz_\1'
            key = re.sub(r' \[ @@(\w+) \]', symbol_replacement, key)

            if prev_key is not None and key <= prev_key:
                msg_at_posn(
                    child.start_posn,
                    '"%s" should be before "%s"' % (child.section_title, prev_title))

            prev_key = key
            prev_title = child.section_title

    for child in section.section_children:
        _check_section_order(child)
def check_references_to_intrinsics():
    """Report %...% references whose base intrinsic is not well-known.

    Only '#LITERAL' (text) nodes are scanned. Scanning all of spec.text
    for %...% would also find occurrences in element IDs, which are
    lower-cased. (Note that this skips occurrences of
    "%<var>Foo</var>Prototype%".)

    The "base" of a reference is the reference with everything from the
    first '.' up to the closing '%' removed.
    """
    stderr("check_references_to_intrinsics...")

    reference_reo = re.compile(r'%\S+%')
    not_real_references = (
        # placeholders
        '%name%',
        '%name.a.b%',
        # metavariable interpolation
        '%_NativeError_%',
        '%_TypedArray_%',
    )

    for text_node in spec.doc_node.each_descendant_named('#LITERAL'):
        matches = reference_reo.finditer(
            spec.text, text_node.start_posn, text_node.end_posn)
        for match in matches:
            itext = match.group(0)
            if itext in not_real_references:
                continue

            base_intrinsic = re.sub(r'\.[^%]+', '', itext)
            if base_intrinsic in well_known_intrinsics:
                continue
            msg_at_posn(match.start(0), f"Intrinsic doesn't exist: {base_intrinsic}")
def close_open_child(end_tag_start_posn, end_tag_end_posn, element_name): nonlocal current_open_node if element_name != current_open_node.element_name: msg_at_posn( end_tag_start_posn, f"ERROR: The currently-open element is a {current_open_node.element_name!r}, but this is an end-tag for {element_name!r}.\nSkipping the end-tag, to see if that helps." ) # This old code might be useful to adapt: # if current_open_node.parent is None: # self._report("current_open_node.parent is None") # elif element_name == current_open_node.parent.element_name: # self._report("Assuming that </%s> is missing" % current_open_node.element_name) # # Pretend that we got the missing endtag: # self.handle_endtag(current_open_node.element_name) # # That will change current_open_node. # assert element_name == current_open_node.element_name # self.handle_endtag(current_open_node.element_name) return current_open_node.inner_start_posn = current_open_node.end_posn current_open_node.inner_end_posn = end_tag_start_posn current_open_node.end_posn = end_tag_end_posn current_open_node = current_open_node.parent
def check_section_title(h1, node):
    """Check a section's <h1> title against three formatting conventions.

    h1 is the title element (its inner source text is the title);
    node is the section that owns it.

    1. Capitalization: a word that follows a space and starts with a
       lower-case letter is questioned, unless the section's parent is
       'Terms and Definitions'.
    2. Well-known symbols: anything that looks like '[ @' must be
       written exactly as '[ @@name ]', delimited by spaces or the
       ends of the title.
    3. Parentheses: a parameter list must have a space before '(' and
       before ')'; an empty list is written '( )'.

    Raises AssertionError if the title contains more than one '(' or
    more than one ')'.
    """
    title = h1.inner_source_text()

    # Check capitalization.
    if node.parent.section_title != 'Terms and Definitions':
        # NOTE(review): the negative lookahead has no trailing \b, so it
        # also exempts any word that merely *starts* with one of these
        # (e.g. "internal", "only") -- confirm whether that is intended.
        mo = re.search(r' \b(?!(an|and|for|in|of|on|the|to|with))([a-z]\w+)', title)
        if mo:
            # +1 skips the leading space that the pattern matched.
            msg_at_posn(
                h1.inner_start_posn + mo.start() + 1,
                "title word '%s' should be capitalized?" % mo.group(2))

    # Check references to well-known symbols.
    # (Raw string: '\[' in a plain string literal is an invalid escape sequence.)
    mo1 = re.search(r'\[ *@', title)
    if mo1:
        mo2 = re.search(r'( |^)\[ @@\w+ \]( |$)', title)
        if not mo2:
            msg_at_posn(
                h1.inner_start_posn + mo1.start(),
                "Title's reference to well-known symbol does not conform to expected pattern"
            )

    # Check parentheses and spaces
    assert title.count('(') <= 1
    assert title.count(')') <= 1
    lpp = title.find('(')
    if lpp >= 0:
        if re.search(r' \(( .+)? \)( Concrete Method)?$', title):
            # space before and after '('
            # space before ")"
            # If param list is empty, just one space between parens.
            pass
        elif title == 'RegExp (Regular Expression) Objects':
            # Use of parens that isn't a parameter list.
            pass
        else:
            msg_at_posn(h1.inner_start_posn + lpp, "Something odd here wrt parens + spaces")
def check_tables():
    """Check every <emu-table>'s caption, id and header row, and collect intrinsics.

    For each emu-table:
    - exactly one of the 'caption' attribute and an <emu-caption> child
      must supply a (non-empty) caption, otherwise an assertion fails;
    - a missing 'id' attribute is reported;
    - depending on keywords in the caption, the caption and the header
      row (the <th> texts of the first <tr>, joined with '; ') are
      asserted to match the expected wording.

    For the tables of well-known intrinsic objects, this also records the
    table's span in well_known_intrinsics_table_spans and fills in
    well_known_intrinsics with "only name" / "old name; ..." / "new name"
    entries.

    Most violations are raised as AssertionError rather than reported.
    """
    stderr('check_tables...')
    header("checking tables...")
    for et in spec.doc_node.each_descendant_named('emu-table'):
        # The caption can come from the 'caption' attribute (a_caption)
        # or from an <emu-caption> child element (e_caption).
        a_caption = et.attrs.get('caption', None)
        caption_children = [c for c in et.each_child_named('emu-caption')]
        if len(caption_children) == 0:
            e_caption = None
        elif len(caption_children) == 1:
            [emu_caption] = caption_children
            e_caption = emu_caption.inner_source_text().strip()
        else:
            assert 0
        # ----
        if a_caption and not e_caption:
            caption = a_caption
        elif e_caption and not a_caption:
            caption = e_caption
        else:
            # Neither caption is present, or both are.
            assert 0, (a_caption, e_caption)

        if 'id' not in et.attrs:
            msg_at_posn(et.start_posn, f'no id attribute for table with caption "{caption}"')

        # The header row is taken to be the first <tr> in the table.
        header_tr = [tr for tr in et.each_descendant_named('tr')][0]
        header_line = '; '.join(
            th.inner_source_text().strip()
            for th in header_tr.each_descendant_named('th'))

        if 'Field' in caption:
            # print(header_line, ':', caption)
            if re.match(r'^(.+) Fields$', caption):
                pass
            elif re.match(r'^Additional Fields of (.+)$', caption):
                pass
            elif caption == 'Fields of the Private Name':
                # PR 1668
                pass
            else:
                assert 0, caption

        elif 'Slot' in caption:
            if re.match(r'^Internal Slots of (.+)$', caption):
                pass
            else:
                assert 0

        elif 'Method' in caption:
            if 'Internal Methods' in caption:
                assert caption in [
                    'Essential Internal Methods',
                    'Additional Essential Internal Methods of Function Objects'
                ]
                assert header_line == 'Internal Method; Signature; Description'
            elif 'Records' in caption:
                assert re.fullmatch(
                    r'(Additional )?(Abstract )?Methods of .+ Records', caption), caption
                assert header_line == 'Method; Purpose'
            elif caption == 'Proxy Handler Methods':
                assert header_line == 'Internal Method; Handler Method'
            else:
                assert 0

        elif 'Properties' in caption:
            assert re.fullmatch(
                r'<i>\w+</i> Interface( (Required|Optional))? Properties', caption)
            assert header_line == 'Property; Value; Requirements'

        elif 'Intrinsic Objects' in caption:
            assert caption in [
                'Well-Known Intrinsic Objects',
                'Additional Well-known Intrinsic Objects',
            ]
            well_known_intrinsics_table_spans.append(
                (et.start_posn, et.end_posn))
            # Maps each new-style name found in this table
            # to the start position of the row that mentions it.
            new_names = {}
            assert header_line == 'Intrinsic Name; Global Name; ECMAScript Language Association'
            for tr in et.each_descendant_named('tr'):
                if tr == header_tr:
                    continue
                # Unpacking requires exactly three <td> cells per row.
                [oname, global_name, assoc] = [
                    td.inner_source_text().strip()
                    for td in tr.each_descendant_named('td')
                ]
                assert re.fullmatch(r'%\w+%', oname)
                assert oname not in well_known_intrinsics
                # The global-name cell is either empty or a back-quoted dotted name.
                assert re.fullmatch(r"|`\w+(\.\w+)*`", global_name)
                if ';' in assoc or 'i.e.' in assoc:
                    # The association ends with "; i.e., %New.name%",
                    # which makes `oname` an old name for that intrinsic.
                    mo = re.search(r'; i.e., (%\w+(\.\w+)+%)$', assoc)
                    assert mo
                    new_name = mo.group(1)
                    assert new_name not in well_known_intrinsics
                    assert new_name not in new_names
                    new_names[new_name] = tr.start_posn
                    assert new_name != oname
                    well_known_intrinsics[
                        oname] = f"old name; 2950,$s/{oname}/{new_name}/gc"
                    well_known_intrinsics[new_name] = "new name"
                else:
                    well_known_intrinsics[oname] = "only name"

            # Have to do this after processing the table,
            # because of possible forward references.
            # (E.g., on the row for %AsyncGenerator%,
            # column 3 mentions %AsyncGeneratorFunction.prototype%,
            # which implies the existence of %AsyncGeneratorFunction%,
            # which is declared in column 1 of the *next* row.)
            for (new_name, tr_posn) in new_names.items():
                # E.g. '%Foo.prototype%' -> '%Foo%'
                base_of_new_name = re.sub(r'\..*', '%', new_name)
                if base_of_new_name not in well_known_intrinsics:
                    msg_at_posn(
                        tr_posn,
                        f"Implied intrinsic doesn't exist: {base_of_new_name}")

        else:
            # Any other kind of table: nothing to check.
            # print('>>>', header_line, '---', caption)
            pass
def check_ref_ids(refnode):
    """Recursively check the href of every <emu-xref> at or below refnode.

    Each referenced id is added to `refids`. For a reference to a known
    id (one in `node_with_id_`):
    - targets that are emu-clause, emu-annex or emu-table are fine;
    - a <dfn> target is only fine if the xref's text is non-empty and
      differs (case-insensitively) from the dfn's text, because
      otherwise auto-linking would do the job;
    - any other kind of target is reported.
    A reference to an unknown id is reported, unless it is one of a few
    ids known to be declared outside the source file.

    An <emu-xref> without an 'href' attribute aborts the program with a
    non-zero exit status. Raises AssertionError if an href does not
    start with '#', or if a referenced <dfn> has empty text.
    """
    # Those ids are declared in emu-imported files.
    ids_declared_in_imported_files = (
        'table-binary-unicode-properties',
        'table-nonbinary-unicode-properties',
        'table-unicode-general-category-values',
        'table-unicode-script-values',
    )
    # These don't exist in the source file,
    # but are generated during the rendering process?
    ids_generated_during_rendering = (
        'prod-annexB-LegacyOctalEscapeSequence',
        'prod-annexB-LegacyOctalIntegerLiteral',
        'prod-annexB-NonOctalDecimalIntegerLiteral',
    )

    if refnode.element_name == 'emu-xref':
        if 'href' not in refnode.attrs:
            stderr("At", shared.convert_posn_to_linecol(refnode.start_posn))
            stderr("emu-xref element doesn't have an 'href' attribute")
            stderr("aborting")
            # A bare sys.exit() would report success (status 0) to the caller.
            sys.exit(1)

        href = refnode.attrs['href']
        assert href.startswith('#')
        refid = href[1:]
        refids.add(refid)

        if refid in node_with_id_:
            defnode = node_with_id_[refid]
            if defnode.element_name in ['emu-clause', 'emu-annex', 'emu-table']:
                pass
            elif defnode.element_name == 'dfn':
                deftext = defnode.inner_source_text()
                reftext = refnode.inner_source_text()
                assert deftext != ''
                if reftext != '' and reftext.lower() != deftext.lower():
                    # Auto-linking would fail to make `reftext` into a link?
                    # So we have to use an emu-xref?
                    pass
                else:
                    msg_at_posn(
                        refnode.start_posn,
                        f"emu-xref used when auto-linking would work: '{refid}'"
                    )
            else:
                msg_at_posn(
                    defnode.start_posn,
                    f"unexpected defnode element-name <{defnode.element_name}>"
                )
        elif refid in ids_declared_in_imported_files:
            pass
        elif refid in ids_generated_during_rendering:
            pass
        else:
            msg_at_posn(refnode.start_posn, f"emu-xref refers to nonexistent id: {refid}")

    for child in refnode.children:
        check_ref_ids(child)
def gather_def_ids(node):
    """Recursively record and check the ids declared at or below node.

    Every 'id' attribute is recorded in `node_with_id_` (a duplicate is
    reported, and the later node wins). The id's prefix is checked
    against the element kind, and, when the element also has an 'aoid'
    attribute, the id is checked against the forms derivable from it.
    Every entry of a comma-separated 'oldids' attribute is added to
    `all_oldids`; a repeated oldid raises AssertionError.
    """
    attrs = node.attrs

    if 'id' in attrs:
        defid = attrs['id']

        # ----------
        # no duplicate ids, of course

        if defid in node_with_id_:
            msg_at_posn(node.start_posn, f"duplicate id: '{defid}'")
        node_with_id_[defid] = node

        # ----------
        # id should begin with "(sec|eqn|figure|table)-"
        # if and only if the node is of certain kinds.

        id_prefix_expectation = {
            'emu-intro': 'sec-',
            'emu-clause': 'sec-',
            'emu-annex': 'sec-',
            'emu-eqn': 'eqn-',
            'emu-figure': 'figure-',
            'emu-table': 'table-',
        }.get(node.element_name, None)

        if id_prefix_expectation:
            if not defid.startswith(id_prefix_expectation):
                msg_at_posn(
                    node.start_posn,
                    f'Expected the id to start with "{id_prefix_expectation}"'
                )
        elif defid.startswith(('sec-', 'eqn-', 'figure-', 'table-')):
            msg_at_posn(node.start_posn, 'Did not expect the id to start that way')

        # ----------
        # If an element defines an abstract operation,
        # its id should be ...

        if 'aoid' in attrs:
            aoid = attrs['aoid']
            assert node.element_name in ['emu-clause', 'emu-annex', 'emu-eqn', 'dfn']
            if id_prefix_expectation is None:
                id_prefix_expectation = 'sec-'  # for thisFooValue
            lowered = aoid.lower().replace(' ', '-').replace('::', '-')
            possibles = [
                id_prefix_expectation + lowered,
                id_prefix_expectation + aoid,
                id_prefix_expectation + kebab(aoid),
            ]
            if defid not in possibles:
                msg_at_posn(node.start_posn, f'Expected id="{possibles[0]}"')

    if 'oldids' in attrs:
        for oldid in attrs['oldids'].split(','):
            assert oldid not in all_oldids
            all_oldids.add(oldid)

    for child in node.children:
        gather_def_ids(child)
def check_ids():
    """Check the ids declared in the spec and the references to them.

    Works in three passes over the document tree:
    1. gather every declared id (and every oldid), checking for
       duplicates and for the expected id prefixes, and write a sorted
       list of all of them to the 'ids' output file;
    2. check every <emu-xref>, reporting references to nonexistent ids
       and xrefs that auto-linking would make unnecessary;
    3. report ids that are declared but never referenced, except for
       kinds of element where that is expected.

    An <emu-xref> without an 'href' attribute aborts the program with a
    non-zero exit status. Several invariants (no repeated oldids, no id
    that is both current and old, hrefs starting with '#') are enforced
    with assertions.
    """
    header("checking ids...")

    node_with_id_ = OrderedDict()
    all_oldids = set()

    def gather_def_ids(node):
        """Record and check the ids declared at or below node."""
        if 'id' in node.attrs:
            defid = node.attrs['id']

            # ----------
            # no duplicate ids, of course

            if defid in node_with_id_:
                msg_at_posn(node.start_posn, f"duplicate id: '{defid}'")
            node_with_id_[defid] = node

            # ----------
            # id should begin with "(sec|eqn|figure|table)-"
            # if and only if the node is of certain kinds.

            id_prefix_expectation = {
                'emu-intro': 'sec-',
                'emu-clause': 'sec-',
                'emu-annex': 'sec-',
                'emu-eqn': 'eqn-',
                'emu-figure': 'figure-',
                'emu-table': 'table-',
            }.get(node.element_name, None)
            if id_prefix_expectation:
                if not defid.startswith(id_prefix_expectation):
                    msg_at_posn(
                        node.start_posn,
                        f'Expected the id to start with "{id_prefix_expectation}"'
                    )
            elif defid.startswith(('sec-', 'eqn-', 'figure-', 'table-')):
                msg_at_posn(node.start_posn, 'Did not expect the id to start that way')

            # ----------
            # If an element defines an abstract operation,
            # its id should be ...

            if 'aoid' in node.attrs:
                aoid = node.attrs['aoid']
                assert node.element_name in [
                    'emu-clause', 'emu-annex', 'emu-eqn', 'dfn'
                ]
                if id_prefix_expectation is None:
                    id_prefix_expectation = 'sec-'  # for thisFooValue
                possibles = [
                    id_prefix_expectation + aoid.lower().replace(' ', '-').replace('::', '-'),
                    id_prefix_expectation + aoid,
                    id_prefix_expectation + kebab(aoid),
                ]
                if defid not in possibles:
                    msg_at_posn(node.start_posn, f'Expected id="{possibles[0]}"')

        if 'oldids' in node.attrs:
            for oldid in node.attrs['oldids'].split(','):
                assert oldid not in all_oldids
                all_oldids.add(oldid)

        for child in node.children:
            gather_def_ids(child)

    gather_def_ids(spec.doc_node)

    # An id can't be both an oldid and a current id.
    assert not all_oldids & set(node_with_id_.keys())

    # Print a sorted list of all ids
    # (so that we notice if any ever go away):
    ids_f = shared.open_for_output('ids')
    try:
        for known_id in sorted(all_oldids | set(node_with_id_.keys())):
            print(known_id, file=ids_f)
    finally:
        # Close the file even if writing fails part-way.
        ids_f.close()

    # -------------------------------------------------------------

    # Find "referenced but not declared" ids.

    refids = set()

    def check_ref_ids(refnode):
        """Check the href of every <emu-xref> at or below refnode."""
        if refnode.element_name == 'emu-xref':
            if 'href' not in refnode.attrs:
                stderr("At", shared.convert_posn_to_linecol(refnode.start_posn))
                stderr("emu-xref element doesn't have an 'href' attribute")
                stderr("aborting")
                # A bare sys.exit() would report success (status 0) to the caller.
                sys.exit(1)

            href = refnode.attrs['href']
            assert href.startswith('#')
            refid = href[1:]
            refids.add(refid)

            if refid in node_with_id_:
                defnode = node_with_id_[refid]
                if defnode.element_name in [
                    'emu-clause', 'emu-annex', 'emu-table'
                ]:
                    pass
                elif defnode.element_name == 'dfn':
                    deftext = defnode.inner_source_text()
                    reftext = refnode.inner_source_text()
                    assert deftext != ''
                    if reftext != '' and reftext.lower() != deftext.lower():
                        # Auto-linking would fail to make `reftext` into a link?
                        # So we have to use an emu-xref?
                        pass
                    else:
                        msg_at_posn(
                            refnode.start_posn,
                            f"emu-xref used when auto-linking would work: '{refid}'"
                        )
                else:
                    msg_at_posn(
                        defnode.start_posn,
                        f"unexpected defnode element-name <{defnode.element_name}>"
                    )

            else:
                if refid in [
                    'table-binary-unicode-properties',
                    'table-nonbinary-unicode-properties',
                    'table-unicode-general-category-values',
                    'table-unicode-script-values',
                ]:
                    # Those ids are declared in emu-imported files.
                    pass

                elif refid in [
                    'prod-annexB-LegacyOctalEscapeSequence',
                    'prod-annexB-LegacyOctalIntegerLiteral',
                    'prod-annexB-NonOctalDecimalIntegerLiteral',
                ]:
                    # These don't exist in the source file,
                    # but are generated during the rendering process?
                    pass

                else:
                    msg_at_posn(refnode.start_posn, f"emu-xref refers to nonexistent id: {refid}")

        for child in refnode.children:
            check_ref_ids(child)

    check_ref_ids(spec.doc_node)

    # -------------------------------------------------------------

    # Find "declared but not referenced" ids.

    for (defid, defnode) in node_with_id_.items():
        if defid in refids:
            continue

        # `defid` was not referenced.

        if defid in ['metadata-block', 'ecma-logo']:
            # Actually, it *is* referenced, but from the CSS.
            continue

        if defnode.element_name in ['emu-intro', 'emu-clause', 'emu-annex']:
            # It's okay if the id isn't referenced:
            # it's more there for the ToC and for inbound URLs.
            continue

        if defnode.element_name in ['emu-figure', 'emu-table']:
            # The text might refer to it as "the following figure/table",
            # so don't expect an explicit reference to the id.
            # So you could ask, why bother giving an id then?
            # I suppose for inbound URLs, and consistency?
            continue

        if defnode.element_name in ['dfn', 'emu-eqn']:
            # It's likely that the rendering process will create references
            # to this id.
            continue

        msg_at_posn(defnode.start_posn, f"id declared but not referenced: '{defid}'")
def _parse():
    """Parse shared.spec_text into a tree of HNode objects and return its root.

    The text is lexed left to right. At each position the registered
    patterns are tried in registration order (text, start-tag, end-tag,
    comment, doctype), and the first that matches handles the match and
    returns the position to resume from.

    Returns the '#DOC' node that spans the whole text.

    Exits the program with status 1 (via fatal_error) if no pattern
    matches at some position, if a start-tag is malformed or cut off by
    the end of the text, or if an element is still open at end of file.
    A mismatched end-tag is only reported, and then skipped. A repeated
    attribute name within one start-tag raises AssertionError.
    """
    stderr("parsing spec...")

    doc_node = HNode(0, len(shared.spec_text), '#DOC', {})
    doc_node.parent = None
    current_open_node = doc_node

    def add_child(child):
        """Attach child to the currently-open element, opening it if needed."""
        nonlocal current_open_node
        current_open_node.children.append(child)
        child.parent = current_open_node
        if child.element_name.startswith('#') or child.element_name in ['html', 'meta', 'link', 'img', 'br']:
            # This is a complete child
            pass
        else:
            # This is an incomplete ("open") element.
            # (It should be closed eventually by a corresponding end-tag.)
            current_open_node = child

    def close_open_child(end_tag_start_posn, end_tag_end_posn, element_name):
        """Handle an end-tag: close the currently-open element, if it matches."""
        nonlocal current_open_node
        if element_name != current_open_node.element_name:
            msg_at_posn(
                end_tag_start_posn,
                f"ERROR: The currently-open element is a {current_open_node.element_name!r}, but this is an end-tag for {element_name!r}.\nSkipping the end-tag, to see if that helps."
            )
            # This old code might be useful to adapt:
            # if current_open_node.parent is None:
            #     self._report("current_open_node.parent is None")
            # elif element_name == current_open_node.parent.element_name:
            #     self._report("Assuming that </%s> is missing" % current_open_node.element_name)
            #     # Pretend that we got the missing endtag:
            #     self.handle_endtag(current_open_node.element_name)
            #     # That will change current_open_node.
            #     assert element_name == current_open_node.element_name
            #     self.handle_endtag(current_open_node.element_name)
            return

        # Until now, end_posn marked the end of the start-tag,
        # so it must be copied before it is overwritten.
        current_open_node.inner_start_posn = current_open_node.end_posn
        current_open_node.inner_end_posn = end_tag_start_posn
        current_open_node.end_posn = end_tag_end_posn
        current_open_node = current_open_node.parent

    # ---------------------------------------------

    pattern_funcs = []

    def for_pattern(pattern):
        """Decorator: register the function as the handler for pattern."""
        reo = re.compile(pattern)

        def wrapper(f):
            pattern_funcs.append((reo, f))
            # The handlers are only reached via pattern_funcs,
            # so the name they were defined under is bound to None.
            return None

        return wrapper

    # One attribute within a start-tag: a space, a name, and optionally ="value".
    attr_reo = re.compile(r' ([a-z][-a-z0-9]*)(?:="([^"]*)")?')

    # ---------------------------------------------

    # non-markup text:
    @for_pattern(r'[^<]+')
    def _(start_posn, end_posn, _):
        add_child(HNode(start_posn, end_posn, '#LITERAL', {}))
        return end_posn

    # start-tag:
    @for_pattern(r'<([a-z][-a-z0-9]*)\b')
    def _(tag_start_posn, end_name_posn, groups):
        [element_name] = groups
        attrs = OrderedDict()
        posn = end_name_posn
        while True:
            # startswith() (rather than indexing) is False at end-of-text,
            # so a tag cut off by the end of the text falls through to
            # the "lexing error" below instead of raising IndexError.
            if shared.spec_text.startswith('>', posn):
                tag_end_posn = posn + 1
                break
            mo = attr_reo.match(shared.spec_text, posn)
            if mo:
                # attr_value is None for an attribute with no ="...".
                (attr_name, attr_value) = mo.groups()
                assert attr_name not in attrs
                attrs[attr_name] = attr_value
                posn = mo.end()
                continue
            fatal_error(posn, "lexing error")

        add_child(HNode(tag_start_posn, tag_end_posn, element_name, attrs))
        return tag_end_posn

    # end-tag:
    @for_pattern(r'</([a-z][-a-z0-9]*)>')
    def _(start_posn, end_posn, groups):
        [element_name] = groups
        close_open_child(start_posn, end_posn, element_name)
        return end_posn

    # comment:
    @for_pattern(r'(?s)<!--.*?-->')
    def _(start_posn, end_posn, _):
        add_child(HNode(start_posn, end_posn, '#COMMENT', {}))
        return end_posn

    # doctype-decl:
    @for_pattern(r'<!DOCTYPE html>')
    def _(start_posn, end_posn, _):
        add_child(HNode(start_posn, end_posn, '#DECL', {}))
        return end_posn

    # ---------------------------------------------

    def fatal_error(posn, msg):
        """Print msg with the location and upcoming text, then exit with status 1."""
        (line_num, col_num) = shared.convert_posn_to_linecol(posn)
        stderr()
        stderr("********************************************")
        stderr(f"line {line_num}, col {col_num}:")
        stderr(repr(shared.spec_text[posn:posn+30] + '...'))
        stderr(msg)
        stderr("********************************************")
        sys.exit(1)

    # ---------------------------------------------

    posn = 0
    while posn < len(shared.spec_text):
        for (reo, func) in pattern_funcs:
            mo = reo.match(shared.spec_text, posn)
            if mo:
                posn = func(mo.start(), mo.end(), mo.groups())
                break
        else:
            # None of the patterns matched at this position.
            fatal_error(posn, "lexing error")

    if current_open_node.element_name != '#DOC':
        msg_at_posn(
            current_open_node.start_posn,
            "ERROR: At end of file, this element is still open"
        )
        fatal_error(current_open_node.start_posn, "At end of file, this element is still open")

    return doc_node
def check_trailing_whitespace():
    """Report each run of spaces and/or tabs that ends a line."""
    stderr("checking trailing whitespace...")
    header("checking trailing whitespace...")

    trailing_runs = re.finditer(r'(?m)[ \t]+$', spec.text)
    for run in trailing_runs:
        # Report at the first character of the run.
        msg_at_posn(run.start(), "trailing whitespace")
def _validate(node):
    """Recursively validate the markup at and below node.

    For a '#LITERAL' node, only runs of multiple spaces are checked
    (other than indentation, and other than inside 'loose' table cells).

    For any other node, this checks, in order:
    - that its content doesn't start or end with a space;
    - that its attributes agree with required_attrs_ / optional_attrs_,
      and that each attribute's value matches the pattern in
      attribute_info;
    - that its children aren't a mix of block-level and inline-level
      items;
    - that the sequence of its children matches the regular expression
      in content_model_ for this element.
    It then recurses into each child.

    As a side effect, sets node.block_child_element_names and
    node.inline_child_element_names.
    """
    if node.element_name == '#DOC':
        stderr("validating markup...")

    def is_loose_about_spaces(x):
        # In sec-assignment-operators-runtime-semantics-evaluation
        # and sec-applystringornumericbinaryoperator.
        # we have <emu-alg> elements that contain 'lightweight' tables
        # where we format the source with extra spaces in the <th> and <td> elements
        # to make the source easier to read.
        return (
            x.element_name in ['th', 'td']
            and
            x.parent.parent.attrs.get('class', None) == 'lightweight-table'
            # x.parent is the <tr>
            # x.parent.parent is the <table>
        )
        # TODO: Base it on the presence of "<!-- emu-format ignore -->",
        # because not all lightweight-tables are loose about spaces.

    if node.element_name == '#LITERAL':
        # Check for runs of multiple space characters.
        for mo in re.compile(r' {2,}').finditer(shared.spec.text, node.start_posn, node.end_posn):
            s_posn = mo.start()
            n_spaces = mo.end() - mo.start()
            # NOTE(review): when s_posn is 0, index -1 reads the *last*
            # character of the text -- presumably never happens, since
            # the text would have to start with spaces; confirm.
            if shared.spec.text[s_posn-1] == '\n':
                # indentation
                continue
            if is_loose_about_spaces(node.parent):
                continue
            msg_at_posn( s_posn, f"{n_spaces} space characters" )
        # A text node has no attributes or children to check.
        return

    # ------------------------

    # Only nodes that have an 'inner_start_posn' attribute have inner text to check.
    if hasattr(node, 'inner_start_posn') and not is_loose_about_spaces(node):
        ist = node.inner_source_text()
        if ist.startswith(' '):
            msg_at_posn(node.inner_start_posn, f"<{node.element_name}> content starts with space")

        if re.search('\n +$', ist):
            # That's just an indented end-tag
            pass
        elif ist.endswith(' '):
            msg_at_posn(node.inner_end_posn, f"<{node.element_name}> content ends with space")

    # ------------------------

    required_attrs = required_attrs_[node.element_name]
    optional_attrs = optional_attrs_[node.element_name]
    # A keys-view, which supports the set comparisons and set operations below.
    attrs = node.attrs.keys()

    def stringify_set(s):
        return ' '.join(sorted(s))

    if not (attrs >= required_attrs):
        msg_at_posn(node.start_posn, f"required attribute(s) are missing: {stringify_set(required_attrs - attrs)}")
    if not (attrs <= required_attrs | optional_attrs):
        msg_at_posn(node.start_posn, f"unexpected attribute(s): {stringify_set(attrs - (required_attrs | optional_attrs))}")

    for (attr_name, attr_value) in node.attrs.items():
        assert attr_value is None or isinstance(attr_value, str)

        # An element-specific entry ("element.attr") takes precedence
        # over a generic entry ("attr").
        for key in [ f"{node.element_name}.{attr_name}", attr_name ]:
            if key in attribute_info:
                value_pattern = attribute_info[key]
                break
        else:
            # Neither key was found.
            msg_at_posn( node.start_posn, f"Unknown attribute {attr_name!r}" )
            continue

        # A value_pattern of None means that the attribute takes no value.
        if value_pattern is None and attr_value is None:
            pass
        elif value_pattern is None:
            msg_at_posn( node.start_posn, f"For attribute {attr_name!r}, expected no value, but got {attr_value!r}" )
        elif attr_value is None:
            msg_at_posn( node.start_posn, f"For attribute {attr_name!r}, expected a value matching {value_pattern!r}, but got nothing" )
        else:
            if not re.fullmatch(value_pattern, attr_value):
                msg_at_posn( node.start_posn, f"For attribute {attr_name!r}, expected a value matching {value_pattern!r}, but got {attr_value!r}" )

    # ------------------------

    # First do a pass to figure whether the content of this node
    # is block items or inline items or (anomalously) both.
    node.block_child_element_names = set()
    node.inline_child_element_names = set()
    for child in node.children:
        if child.element_name == '#COMMENT':
            continue
        elif child.element_name == '#LITERAL':
            # Whitespace-only text doesn't make the content inline.
            if not child.is_whitespace():
                node.inline_child_element_names.add(child.element_name)
        elif child.element_name in kind_:
            k = kind_[child.element_name]
            if k == 'B':
                node.block_child_element_names.add(child.element_name)
            elif k == 'I':
                node.inline_child_element_names.add(child.element_name)
        else:
            msg_at_posn(child.start_posn, "Is <%s> block or inline?" % child.element_name)

    if node.block_child_element_names and node.inline_child_element_names:
        msg_at_posn(node.start_posn,
            "%s content includes both block-level items (%s) and inline-level items (%s)" % (
                node.element_name,
                ', '.join(sorted(list(node.block_child_element_names))),
                ', '.join(sorted(list(node.inline_child_element_names)))
            )
        )

    # ------------------------

    # Summarize the children as a string of ';'-terminated names,
    # so that the content model can be expressed as a regular expression.
    children_names = []
    for child in node.children:
        if child.element_name == '#LITERAL':
            if node.inline_child_element_names:
                x = '#TEXT;'
            else:
                assert child.is_whitespace()
                x = '#WS;'
        else:
            x = child.element_name + ';'
        children_names.append(x)
    children_names = ''.join(children_names)

    # A comment surrounded by whitespace counts as just whitespace.
    children_names = re.sub('#WS;#COMMENT;#WS;', '#WS;', children_names)

    if node.element_name not in content_model_:
        msg_at_posn(node.start_posn, "No content model for <%s>" % node.element_name)
    else:
        content_model = content_model_[node.element_name]
        mo = re.match(content_model, children_names)
        if mo is None:
            msg_at_posn(node.start_posn, "%s has content %s, expected %s" % (node.element_name, children_names, content_model))

    #! if node.children:
    #!     node.inner_start_posn = node.children[0].start_posn
    #!     node.inner_end_posn   = node.children[-1].end_posn

    for child in node.children:
        _validate(child)