def _report_status_checks(self, processed_report_doc: etree._ElementTree, embedded: bool):
    """Run additional status checks on a processed report document.

    Warns (via display_msg) when the report is trivially small or has no
    text, and raises InvalidReportError for an effectively empty report.
    Embedded reports are exempt from these checks.
    """
    super()._report_status_checks(processed_report_doc, embedded)
    # check for any unsupported local features, e.g. DataTable
    # NOTE - we could eventually have different validators for local and uploaded reports
    if embedded:
        return
    # TODO - validate at least a single element
    block_count = processed_report_doc.xpath("count(/Report/Main//*)")
    if block_count < 3:
        raise InvalidReportError(
            "Empty report - must contain at least one asset/block")
    if block_count < 4:
        url = "https://docs.datapane.com/reports/blocks/layout-pages-and-selects"
        display_msg(
            text=f"Your report only contains a single element - did you know you can include additional plots, tables and text in a report? Check out {url} for more info",
            md=f"Your report only contains a single element - did you know you can include additional plots, tables and text in a report? Check out [the docs]({url}) for more info",
        )
    contains_text: bool = processed_report_doc.xpath(
        "boolean(/Report/Main/Page//Text)")
    if not contains_text:
        display_msg(
            "Your report doesn't contain any text - consider using TextReport to upload assets and add text to your report from your browser"
        )
def get_clusters(tree: etree._ElementTree) -> ty.Dict[str, ty.Set[str]]:
    """Extract coreference clusters (schemas) from a TEI document.

    Returns a dict mapping each schema id to the set of mention ids it
    contains. Mentions referenced by no schema are added as singleton
    clusters keyed by their own id.

    Raises if two clusters overlap: the schemas must form a partition.
    """
    # NOTE(review): assumes exactly one "schema" linkGrp exists —
    # IndexError if the document has none; confirm with callers.
    chains_grp = tree.xpath(
        './tei:standOff/tei:annotation[@tei:type="coreference"]/tei:linkGrp[@tei:type="schema"]',
        namespaces=NSMAP,
    )[0]
    mentions = tree.xpath(
        ('./tei:standOff/tei:annotation[@tei:type="coreference"]'
         '/tei:spanGrp[@tei:subtype="mention"]/tei:span'),
        namespaces=NSMAP,
    )
    res = dict()
    for c in chains_grp.iter(f"{TEI}link"):
        # @target is a space-separated list of "#id" references;
        # strip the leading "#" from each one
        target = c.attrib[f"{TEI}target"]
        res[c.attrib[f"{XML}id"]] = set((t[1:] for t in target.split()))
    # Every mention id already claimed by some schema
    non_sing = set().union(*res.values())
    # Unclaimed mentions become singleton clusters
    for m in mentions:
        i = m.attrib[f"{XML}id"]
        if i not in non_sing:
            res[i] = {i}
    # Find the first pair of distinct clusters with a non-empty
    # intersection, if any (None, None, None when disjoint)
    a_id, b_id, intersect = next(
        ((a_id, b_id, intersect)
         for a_id, a in res.items()
         for b_id, b in res.items()
         if b is not a
         for intersect in (a.intersection(b), )
         if intersect),
        (None, None, None),
    )
    if intersect is not None:
        raise Exception(
            f"Schemas {a_id} and {b_id} are not disjoints: {intersect}")
    return res
def save_xml(tree: et._ElementTree, fn: str):
    """Pretty-print *tree* to *fn*, then patch the serialized text.

    Adds the TEI namespace declaration to the root tag and renames the
    'verse=' attribute to 'xml:id='. Returns the file name.
    """
    et.indent(tree, ' ')
    tree.write(fn, pretty_print=True, encoding='utf-8')
    with open(fn, 'r', encoding='utf-8') as fh:
        content = fh.read()
    content = (
        content
        .replace('<TEI>', '<TEI xmlns="http://www.tei-c.org/ns/1.0">')
        .replace('verse=', 'xml:id=')
    )
    with open(fn, 'w', encoding='utf-8') as fh:
        fh.write(content)
    return fn
def _get_cib_version(
    cib: _ElementTree, attribute: str, regexp: Pattern
) -> Version:
    """Read and parse a version attribute from the CIB root element.

    :param cib: parsed CIB document
    :param attribute: name of the version attribute on the 'cib' element
    :param regexp: pattern with named groups 'major', 'minor' and 'rev'
    :raises LibraryError: when the attribute is missing or malformed
    """
    raw = cib.getroot().get(attribute)
    if raw is None:
        raise LibraryError(
            ReportItem.error(
                reports.messages.CibLoadErrorBadFormat(
                    f"the attribute '{attribute}' of the element 'cib' "
                    "is missing"
                )
            )
        )
    parsed = regexp.match(raw)
    if parsed is None:
        raise LibraryError(
            ReportItem.error(
                reports.messages.CibLoadErrorBadFormat(
                    f"the attribute '{attribute}' of the element 'cib' has "
                    f"an invalid value: '{raw}'"
                )
            )
        )
    # 'rev' is optional in the pattern; absent means None
    rev = parsed.group("rev")
    return Version(
        int(parsed.group("major")),
        int(parsed.group("minor")),
        int(rev) if rev else None,
    )
def generate_session_class(
    omc_interface_xml: etree._ElementTree,
) -> Code:
    """Build the OMCSession class body from the OMC interface XML.

    Supported OpenModelica.Scripting members become class attributes;
    unsupported ones are kept as commented-out stubs.
    """
    members = Code()
    session_code = Code(
        "class OMCSession(",
        CodeWithIndent(
            "OMCSessionBase,",
        ),
        "):",
        CodeWithIndent(
            members
        )
    )
    members.append("OpenModelica = OpenModelica")
    # Exactly one OpenModelica.Scripting element is expected
    scripting, = omc_interface_xml.xpath(
        '//*[@id="OpenModelica.Scripting"]'
    )
    for child in scripting.xpath('./classes/*'):
        if child.tag == "package":
            continue
        class_name = TypeName(child.attrib["id"])
        if is_supported_element(child):
            members.append(
                f"{class_name.last_identifier} = {class_name}"
            )
        else:
            # Keep unsupported elements visible but inert
            members.append(
                f"# {class_name.last_identifier} = {class_name}"
            )
    return session_code
def _collect_action_list(et: etree._ElementTree, action_list_name: str) \
        -> Tuple[Union[None, etree._Element], Union[None, etree._Element], List[Union[None, etree._Element]]]:
    """Find an actionList, its triggering condition and the state objects
    the condition refers to.

    :param et: parsed scenario document
    :param action_list_name: the @name of the actionList to collect
    :return: (actionList element, condition element, state objects);
        (None, None, [None]) when the actionList or its condition is absent.
    """
    al_elem = et.find(f"*actionList[@name='{action_list_name}']")
    # BUG FIX: the original indexed xpath(...)[0] unconditionally, raising
    # IndexError when no condition matched and making the `condition is
    # None` graceful-return branch below unreachable.
    condition_matches = et.xpath(
        f"*/condition/actionListName[text()='{action_list_name}']/..")
    condition = condition_matches[0] if condition_matches else None
    if al_elem is None or condition is None:
        return None, None, [None]
    # Collect affected state objects
    state_objects = []
    for state_object_name in condition.xpath("stateCondition/stateObjectName"):
        state_object = et.find(f"*stateObject[@name='{state_object_name.text}']")
        if state_object is not None:
            state_objects.append(state_object)
    return al_elem, condition, state_objects
def validate(tree: etree._ElementTree, raise_: bool = False) -> bool:
    """Validate an UBL document against its associated schema.

    Args:
        tree: The tree document to validate
        raise_: True to raise an exception if the validation fails

    Returns:
        True if the document is validated (and raise_ is False), False

    Raises:
        Validation failure - see https://lxml.de/validation.html#xmlschema
    """
    # Determine the declared UBL version, defaulting to 2.0
    version_nodes = ubl_version_finder_xp(tree)
    version = version_nodes[0].text.strip() if version_nodes else "2.0"
    if tuple(int(part) for part in version.split(".")) > (2, 1):
        warnings.warn(
            f"We cannot validate UBL {version} documents. Trying anyway")
    root = tree.getroot()
    schema = get_schema(root.tag)
    if schema is None:
        if raise_:
            raise KeyError(f"No schema available for root tree {root.tag}")
        return False
    if not raise_:
        return schema.validate(tree)
    schema.assertValid(tree)
    return True
def _parse_element_tree(element_tree: etree._ElementTree) -> List[Node]:
    """Validate *element_tree* against the interface schema and parse it.

    Raises errors.InvalidNoDLDocumentError on schema violations.
    """
    schema = interface_schema()
    try:
        schema.assertValid(element_tree)
    except etree.DocumentInvalid as invalid:
        raise errors.InvalidNoDLDocumentError(invalid)
    return _parse_interface(element_tree.getroot())
def targets_from_span(
    span: etree._ElementTree, getter: ty.Callable[[str], etree._Element]
) -> ty.List[etree._Element]:
    """Given a span and an {id: element} dict, return the list of the tokens in this span."""
    span_id = xmlid(span)
    # A span may carry an explicit @target list of token references…
    target = span.get(f"{TEI}target")
    if target is not None:
        try:
            return [getter(target_to_id(i)) for i in target.split()]
        except KeyError as e:
            raise ElementNotFoundError(
                f"Element targetted by span {span_id} not found", e.args[0]
            ) from e
    # …otherwise it is a contiguous range delimited by @from/@to token ids.
    start_id = target_to_id(span.attrib[f"{TEI}from"])
    end_id = target_to_id(span.attrib[f"{TEI}to"])
    try:
        start_node = getter(start_id)
    except KeyError as e:
        raise ElementNotFoundError(
            f"Span {span_id} start element not found", start_id
        ) from e
    targets = [start_node]
    if start_id != end_id:
        last_node = start_node
        # Walk forward through sibling token elements, accumulating them
        # until the element with the end id is reached.
        siblings = iter(start_node.itersiblings(*TOKEN_TAGS))
        try:
            while xmlid(last_node) != end_id:
                last_node = next(siblings)
                targets.append(last_node)
        except StopIteration:
            # Ran out of siblings before seeing end_id
            raise ElementNotFoundError(f"Span {span_id} end element not found", end_id)
    return targets
def gen_tasks(tree: _ElementTree) -> Iterator[Task]:
    """Yield one Task per top-level <li> of the exported HTML document."""
    for task_node in tree.findall('/body/ul/li'):
        task_dict = dict(node_to_dict(task_node))
        recurrence = None
        if task_dict.get('Recurrence info'):
            assert isinstance(task_dict, dict)
            recnode = cast(Dict[str, str], task_dict['Recurrence info'])
            # NOTE(review): timestamps appear to be epoch milliseconds
            # (parse_timestamp_ms) — confirm against the export format.
            recurrence = Recurrence(
                frequency=cast(Frequency, recnode['Frequency']),
                start=ensure(parse_timestamp_ms(recnode['Start'])),
                end=ensure(parse_timestamp_ms(recnode['End'])),
                hour=int(recnode['Hour of day to fire']),
                # 'Every' defaults to 1 when absent
                every=maybe(recnode.get('Every'), int) or 1,
                weekday_num=maybe(recnode.get('Weekday number'), int),
                day_of_month=maybe(recnode.get('Day number of month'), parse_day_num),
                day_of_week=maybe(recnode.get('Day of week'), lambda x: cast(Weekday, x)),
                month=maybe(recnode.get('Month of year'), lambda x: cast(Month, x)),
            )
        simple_fields = cast(Dict[str, str], task_dict)
        task = Task(title=simple_fields['Title'],
                    created=ensure(
                        parse_timestamp_ms(simple_fields['Created time'])),
                    state=cast(State, simple_fields['State']),
                    due=maybe(simple_fields.get('Due date'), lambda x: parse_timestamp_ms(x)),
                    recurrence=recurrence)
        # NOTE(review): debug print left in? Consider removing or
        # switching to logging — it writes every task to stdout.
        print(task)
        yield task
def possible_smx_tags(lang1: str, pos: str, tree: _ElementTree) -> Iterator[Tuple[str, List[str]]]:
    """Transfer sme semtags to smX lemmas found in an apertium bidix tree.

    Args:
        lang1: the language where the semtags should be fetched.
        pos: part of speech of the lemmas.
        tree: an etree containing the content of a apertium bidix file.

    Yields:
        Tuples of (smX lemma, sorted semtags of the matching lang1 lemma).
    """
    # TODO: Merge semtags
    # lemma -> semtags mapping extracted from the lang1 .lexc file
    lemma_to_tags = dict(lang_tags(lang1, pos))
    # Each <s n=pos> symbol sits inside <p><l>…<s/></l><r>…</r></p>
    for symbol in tree.xpath('.//p/l/s[@n="{}"]'.format(pos)):
        pair = symbol.getparent().getparent()
        tags = lemma_to_tags.get(pair.find('l').text)
        if tags and pair.find('r').text is not None:
            # Pair the smX lemma with the lang1 semtags
            yield (pair.find('r').text, sorted(tags))
def parse_bundle_relations(self, xml_tree: etree._ElementTree) -> dict:
    """Collect bundle relation edges keyed by source feature id.

    Note the annotation fix: this method builds and returns a dict
    ({source_id: [{"destination": id}, ...]}), not a list.

    The edge direction is taken from the inner mxCell: its "target"
    attribute is used as the relation source and "source" as the
    destination.
    """
    relation_xpath = """
        //rel_abstract_bundle |
        //rel_bundle_abstract |
        //rel_concrete_bundle |
        //rel_bundle_concrete
    """
    source_xpaht = ".//mxCell"
    relations = {}
    for relation in xml_tree.xpath(relation_xpath):
        relation_dict = dict(relation.items())
        relation_source_dict = dict(relation.find(source_xpaht).items())
        if relation_dict and relation_source_dict:
            source_id = int(relation_source_dict["target"])
            # Start a fresh edge list on first sight of this source
            if not relations.get(source_id):
                relations[source_id] = []
            relations[source_id].append(
                {
                    "destination": int(relation_source_dict["source"]),
                }
            )
    return relations
def find_urls(self, tree: etree._ElementTree) -> Iterator[str]:
    """Yield URLs found in the document C{tree}."""
    for element in tree.getroot().iter():
        attrs = element.attrib
        # Only look at attributes that can hold links for this tag
        for name in self.link_attrs_for_node(element.tag):
            if name in attrs:
                yield cast(str, attrs[name])
def xmlGetTextNodes(self, doc: etree._ElementTree, xpath: str, namespaces: dict):
    """Shorthand to retrieve serialized text nodes matching a specific xpath.

    :param lxml.etree._ElementTree doc: XML element to parse
    :param str xpath: Xpath to reach
    :param dict namespaces: XML namespaces like `lxml.etree.getroot().nsmap`
    :returns: the matched text nodes joined with ", "
    """
    matches = doc.xpath(xpath, namespaces=namespaces)
    return ", ".join(matches)
def get_mentions(tree: etree._ElementTree,) -> ty.Dict[ty.Tuple[str, str], Mention]:
    """Extract the mentions from an ANCOR-TEI document.

    :param tree: parsed ANCOR-TEI document
    :return: dict mapping (first-token-id, last-token-id) to the Mention
    :raises ValueError: if the tree has no mention spans or no text
    """
    mentions = tree.xpath(
        (
            './tei:standOff/tei:annotation[@tei:type="coreference"]'
            '/tei:spanGrp[@tei:subtype="mention"]/tei:span'
        ),
        namespaces=NSMAP,
    )
    if not mentions:
        raise ValueError("`tree` has no mention spans")
    features = get_fs(tree)
    texts_lst = tree.findall(f"{TEI}text")
    if not texts_lst:
        raise ValueError(
            "Attempting to extract mentions from a document without a text"
        )
    # id -> token element, for resolving span targets
    tokens_id_store = {
        xmlid(elt): elt for text in texts_lst for elt in text.iter(*TOKEN_TAGS)
    }
    res = dict()
    for m_elt in mentions:
        try:
            m = Mention.from_urs(m_elt, tokens_id_store.get, features.get)
        except ValueError as e:
            # BUG FIX: was `xmlid(m)` — `m` is unbound (or stale from a
            # previous iteration) when from_urs raises; log the span
            # element we were actually processing.
            logger.warning(f"Skipping span {xmlid(m_elt)}: {e}")
            continue
        if m.span_type not in MENTION_TYPES:
            if m.span_type in IGNORED_MENTION_TYPES:
                logger.debug(
                    f"Ignoring span {m.identifier!r} with mention type {m.span_type!r}"
                )
            else:
                logger.warning(
                    f"Span {m.identifier!r} has an invalid mention type ({m.span_type!r})"
                )
            continue
        res[(xmlid(m.targets[0]), xmlid(m.targets[-1]))] = m
    return res
def unpack_element(
    tree: etree._ElementTree,
    element: Union[etree._Element, etree._ElementUnicodeResult, Any]
) -> Tuple[str, str]:
    """Returns path in the tree and string representation for the given
    XPath query element.
    """
    if isinstance(element, etree._Element):
        return (
            tree.getpath(element),
            etree.tostring(element, encoding='unicode', pretty_print=True),
        )
    # Non-element results (e.g. smart strings) are stringified; their
    # path is that of the parent element when one is available.
    text = str(element)
    try:
        parent = element.getparent()
    except AttributeError:
        return ('', text)
    return (tree.getpath(parent), text)
def parse_additional_resources(etree: ET) -> Tuple[Tuple[str, str], ...]:
    """Parse tuple of additional resources.

    Each resource is (cleaned paragraph text, first contained link href).
    """
    paragraphs = etree.xpath("//*[@id='additional-resources']//p")
    return tuple(
        (clean(p.text_content()), first(p.xpath(".//a/@href")))
        for p in paragraphs
    )
def parse_root(self, xml_tree: etree._ElementTree) -> dict:
    """Return the id and name of the first attributed ``root`` element.

    :param xml_tree: parsed model document
    :return: {"id": int, "name": str} for the root feature
    :raises ValueError: when no root element with attributes exists
    """
    xpath = "//root"
    for root in xml_tree.xpath(xpath):
        root_dict = dict(root.items())
        if root_dict:
            return {
                "id": int(root_dict["id"]),
                "name": root_dict["label"],
            }
    # FIX: was a bare ValueError() with no diagnostic message
    raise ValueError("No root element with attributes found in the XML tree")
def parent_map(element_tree: etree._ElementTree) -> dict:
    """Build a child -> parent mapping for every element in the tree.

    lxml's _Element does not keep a pointer to its parent, so this
    returns {child: parent, child2: parent2, ...} where both keys and
    values are lxml.etree._Element objects.

    Args:
        element_tree (etree._ElementTree): XML tree produced by the lxml
            parser.

    Returns:
        dict: mapping from each element to its parent node.
    """
    mapping = {}
    for parent in element_tree.iter():
        for child in parent:
            mapping[child] = parent
    return mapping
def _parse_element_tree(element_tree: etree._ElementTree) -> List[Node]:
    """Validate a parsed XML tree against the interface schema and
    extract its NoDL nodes.

    :param element_tree: parsed xml tree to operate on
    :type element_tree: etree._ElementTree
    :raises InvalidNoDLDocumentError: if tree does not adhere to schema
    :return: List of NoDL nodes present in the xml tree.
    :rtype: List[Node]
    """
    schema = interface_schema()
    try:
        schema.assertValid(element_tree)
    except etree.DocumentInvalid as invalid:
        raise InvalidNoDLDocumentError(invalid)
    root = element_tree.getroot()
    return _parse_interface(root)
def get_fs(tree: etree._ElementTree) -> ty.Dict[str, FeatureStructure]:
    """Find and parse every TEI feature structure in `tree`.

    Return
    ------
    A dict mapping feature structure ids to their parsed contents.

    Raises ElementNotFoundError when the tree contains none.
    """
    all_fs = tree.xpath("//tei:fs", namespaces=NSMAP)
    if not all_fs:
        raise ElementNotFoundError(
            "There are no feature structure elements in this tree"
        )
    parsed = dict()
    for fs in all_fs:
        parsed[xmlid(fs)] = parse_fs(fs)
    return parsed
def generate_module_py(
    omc_interface_xml: etree._ElementTree,
) -> Code:
    """Assemble the full generated module: imports, the nested Modelica
    class hierarchy and the OMCSession class, separated by blank lines.
    """
    # Keep the same evaluation order as the Code(...) call had
    imports_code = generate_import_statements()
    classes_code = generate_nested_modelica_class(
        omc_interface_xml.xpath('//*[@id]')
    ).to_code()
    session_code = generate_session_class(omc_interface_xml)
    return Code(
        empty_line,
        imports_code,
        empty_line * 2,
        classes_code,
        empty_line * 2,
        session_code,
    )
def xmlGetTextTag(self, doc: etree._ElementTree, xpath: str, namespaces: dict, key: str):
    """Function to get information in tag when information isn't in nodes
    matching a specific xpath.

    :param lxml.etree._ElementTree doc: XML element to parse
    :param str xpath: Xpath to reach
    :param dict namespaces: XML namespaces like 'lxml.etree.getroot().nsmap'
    :param key: XML key to find like 'codeListValue'
    :returns: the attribute value of the first match, None if the match
        lacks the key, or the literal string "None" when nothing matches
    """
    matches = doc.xpath(xpath, namespaces=namespaces)
    if not matches:
        # NOTE: the string "None" (not the None object) is the historical
        # sentinel for "no match" here
        return "None"
    return matches[0].get(key, None)
def parse_features(self, xml_tree: etree._ElementTree) -> dict:
    """Collect concrete and abstract feature elements keyed by numeric id.

    Clone placeholders (ids containing FEATURE_CLON_SUFIX) and elements
    without attributes are skipped.
    """
    xpath = """
        //concrete |
        //abstract
    """
    features = {}
    for node in xml_tree.xpath(xpath):
        attrs = dict(node.items())
        if not attrs:
            continue
        if FEATURE_CLON_SUFIX in attrs["id"]:
            continue
        feature_id = int(attrs["id"])
        features[feature_id] = {
            "id": feature_id,
            "name": attrs["label"],
        }
    return features
def list_tag_attribute_usage(tree: etree._ElementTree):
    """Walk the whole document and yield one
    (parent_path, tag, attrs, value) tuple per element, describing how
    tags and attributes are used throughout the tree.

    parent_path is the tuple of ancestor tags from the root; value is the
    element's text only for non-blank leaf elements, else None.
    """
    def generate_tag_attr_paths(element, parent_path):
        # tag=etree.Element filters out comments/processing instructions
        children = list(element.iterchildren(tag=etree.Element))
        # Keep text only for leaves with non-blank content
        value = (element.text
                 if (len(children) == 0 and element.text and element.text.strip())
                 else None)
        attrs = {
            k: v
            for k, v in element.attrib.items()
            # Ignore namespaced attributes
            if not k.startswith('{')
        }
        yield parent_path, element.tag, attrs, value
        path = parent_path + (element.tag, )
        # Recurse depth-first into element children
        for child_el in children:
            yield from generate_tag_attr_paths(child_el, path)
    yield from generate_tag_attr_paths(tree.getroot(), ())
def get_chains(tree: etree._ElementTree) -> ty.Dict[str, ty.Set[str]]:
    """Extract the coreference schema links from a TEI document.

    Returns a dict mapping each link id to the set of element ids it
    targets. Only the first schema group is used; extra groups trigger
    a warning.
    """
    schema_groups = tree.xpath(
        './tei:standOff/tei:annotation[@tei:type="coreference"]/tei:linkGrp[@tei:type="schema"]',
        namespaces=NSMAP,
    )
    grp = schema_groups[0]
    if len(schema_groups) > 1:
        logger.warning(
            "There are more than one schema group in this document"
            f", only {xmlid(grp)!r} will be taken into account"
        )
    chains = dict()
    for link in grp.iter(f"{TEI}link"):
        link_id = xmlid(link)
        target = link.get(f"{TEI}target")
        if target is None:
            raise ValueError(f"Schema {link_id!r} has no target attribute")
        chains[link_id] = {target_to_id(t) for t in target.split()}
    return chains
def scan_fields(tree: _ElementTree) -> List[Dict[str, Set[str]]]:
    """Collect the distinct values seen for every task, location and
    recurrence field across the exported document (schema exploration)."""
    task_fields: Dict[str, Set[str]] = {}
    location_fields: Dict[str, Set[str]] = {}
    recurrence_fields: Dict[str, Set[str]] = {}
    for task_node in tree.findall('/body/ul/li'):
        task_dict = dict(node_to_dict(task_node))
        # Plain task fields, excluding the two nested sub-dicts
        for key, val in task_dict.items():
            if key not in ['Recurrence info', 'Location']:
                task_fields.setdefault(key, set()).add(cast(str, val))
        if task_dict.get('Location'):
            location_dict = cast(Dict[str, str], task_dict['Location'])
            for key, val in location_dict.items():
                location_fields.setdefault(key, set()).add(val)
        if task_dict.get('Recurrence info'):
            recurrence_dict = cast(Dict[str, str], task_dict['Recurrence info'])
            for key, val in recurrence_dict.items():
                recurrence_fields.setdefault(key, set()).add(val)
    # NOTE(review): chop presumably trims the collected value sets for
    # display — confirm its contract.
    task_fields = chop(task_fields)
    location_fields = chop(location_fields)
    recurrence_fields = chop(recurrence_fields)
    # NOTE: return order is task, recurrence, location — not build order
    return [task_fields, recurrence_fields, location_fields]
def generate_criteria(
        crit_def: _ElementTree
) -> Tuple[CriteriaFunction, CriteriaFunction, CriteriaFunction]:
    """
    Generates a tuple containing functions evaluating criteria.
    :param crit_def: The criteria root element.
    :return: A tuple representing functions for evaluating criteria. The
    first is for preconditions, the second for success criteria and the
    third for fail criteria.
    """
    from dbtypes.criteria import UnknownEvaluable
    from util.xml import xpath
    root: _Element = crit_def.getroot()

    def criterion_for(tag: str) -> CriteriaFunction:
        # Fall back to an always-unknown evaluable when the tag is absent
        nodes = xpath(root, tag)
        if nodes:
            return generate_criterion(nodes[0])
        return lambda _: UnknownEvaluable()

    precondition = criterion_for("db:precondition")
    success = criterion_for("db:success")
    failure = criterion_for("db:failure")
    return precondition, success, failure
def parse_feature_relations(self, xml_tree: etree._ElementTree) -> dict:
    """Build a {source_id: [edge, ...]} mapping from feature relation
    elements, where each edge carries the destination id and the
    constraint type.
    """
    relation_xpath = """
        //rel_concrete_root |
        //rel_abstract_root |
        //rel_concrete_abstract |
        //rel_concrete_concrete |
        //rel_abstract_concrete |
        //rel_abstract_abstract
    """
    cell_xpath = ".//mxCell"
    relations = {}
    for rel in xml_tree.xpath(relation_xpath):
        rel_attrs = dict(rel.items())
        cell_attrs = dict(rel.find(cell_xpath).items())
        if not (rel_attrs and cell_attrs):
            continue
        # Account for inverted direction in requires relations
        if rel_attrs["relType"] == "requires":
            src = int(cell_attrs["source"])
            dst = int(cell_attrs["target"])
        else:
            src = int(cell_attrs["target"])
            dst = int(cell_attrs["source"])
        relations.setdefault(src, []).append(
            {
                "destination": dst,
                "constraint_type": rel_attrs["relType"],
            }
        )
    return relations
def repair_tree(tree: etree._ElementTree, content_type: str, report: Report) -> bool:
    """Check the document tree for general errors that would prevent other
    checkers from doing their work and repair those if possible.
    @return: True iff the tree was modified.
    """
    # Only XHTML documents need the namespace repair below.
    if content_type != 'application/xhtml+xml':
        return False
    root = tree.getroot()
    if root.tag == '{http://www.w3.org/1999/xhtml}html':
        return False
    msg = 'The root element does not use the XHTML namespace.'
    html = concat(
        msg, xml.br,
        'expected: ',
        xml.code['<html xmlns="http://www.w3.org/1999/xhtml"'])
    report.error(msg, extra={'html': html})
    # lxml will auto-fix this for us when serializing, so there is
    # no need to actually modify the tree.
    return True