def get_clusters(tree: etree._ElementTree) -> ty.Dict[str, ty.Set[str]]:
    """Map coreference schema ids to the sets of mention ids they group.

    Mentions that belong to no schema become singleton clusters keyed by
    their own id.  Raises if two clusters share a mention.
    """
    schema_grp = tree.xpath(
        './tei:standOff/tei:annotation[@tei:type="coreference"]/tei:linkGrp[@tei:type="schema"]',
        namespaces=NSMAP,
    )[0]
    mention_elts = tree.xpath(
        (
            './tei:standOff/tei:annotation[@tei:type="coreference"]'
            '/tei:spanGrp[@tei:subtype="mention"]/tei:span'
        ),
        namespaces=NSMAP,
    )
    clusters = dict()
    for link in schema_grp.iter(f"{TEI}link"):
        targets = link.attrib[f"{TEI}target"].split()
        # Targets are "#id" references: drop the leading hash.
        clusters[link.attrib[f"{XML}id"]] = {t[1:] for t in targets}
    grouped = set().union(*clusters.values())
    for mention in mention_elts:
        mention_id = mention.attrib[f"{XML}id"]
        if mention_id not in grouped:
            clusters[mention_id] = {mention_id}
    # Sanity check: clusters must be pairwise disjoint; report the first
    # overlapping pair found.
    for a_id, a in clusters.items():
        for b_id, b in clusters.items():
            if b is a:
                continue
            intersect = a.intersection(b)
            if intersect:
                raise Exception(
                    f"Schemas {a_id} and {b_id} are not disjoints: {intersect}")
    return clusters
def _report_status_checks(self, processed_report_doc: etree._ElementTree, embedded: bool):
    """Run status checks over the processed report document.

    Extends the base-class checks with local-only validations; embedded
    reports are currently exempt.
    """
    super()._report_status_checks(processed_report_doc, embedded)
    # check for any unsupported local features, e.g. DataTable
    # NOTE - we could eventually have different validators for local and uploaded reports
    if embedded:
        return
    # TODO - validate at least a single element
    asset_blocks = processed_report_doc.xpath("count(/Report/Main//*)")
    if asset_blocks < 3:
        raise InvalidReportError(
            "Empty report - must contain at least one asset/block")
    # Reaching here implies asset_blocks >= 3, so a plain `if` suffices.
    if asset_blocks < 4:
        url = "https://docs.datapane.com/reports/blocks/layout-pages-and-selects"
        display_msg(
            text=f"Your report only contains a single element - did you know you can include additional plots, tables and text in a report? Check out {url} for more info",
            md=f"Your report only contains a single element - did you know you can include additional plots, tables and text in a report? Check out [the docs]({url}) for more info",
        )
    has_text: bool = processed_report_doc.xpath("boolean(/Report/Main/Page//Text)")
    if not has_text:
        display_msg(
            "Your report doesn't contain any text - consider using TextReport to upload assets and add text to your report from your browser"
        )
def possible_smx_tags(lang1: str, pos: str, tree: _ElementTree) -> Iterator[Tuple[str, List[str]]]:
    """Transfer sme semtags to smX lemma.

    Args:
        lang1: the language where the semtags should be fetched.
        pos: part of speech of the lemmas.
        tree: an etree containing the content of a apertium bidix file.

    Yields:
        A tuple containing a lemma of the other language and the semtags
        of the corresponding lang1 lemma.
    """
    # TODO: Merge semtags
    # lemma -> semtags mapping extracted from the lang1 .lexc file
    sem_tags_by_lemma = dict(lang_tags(lang1, pos))
    # Walk every bidix symbol element whose @n matches the requested pos
    for symbol_elt in tree.xpath('.//p/l/s[@n="{}"]'.format(pos)):
        # Climb from <s> to the enclosing bidix <p> pair element
        pair = symbol_elt.getparent().getparent()
        tags = sem_tags_by_lemma.get(pair.find('l').text)
        if tags and pair.find('r').text is not None:
            # The smX lemma inherits the (sorted) lang1 semtags
            yield (pair.find('r').text, sorted(tags))
def parse_bundle_relations(self, xml_tree: etree._ElementTree) -> list:
    """Collect bundle relations keyed by the relation arrow's target id.

    Returns a dict mapping target ids to lists of ``{"destination": ...}``
    entries (the arrow's source id).
    """
    relation_xpath = """
        //rel_abstract_bundle |
        //rel_bundle_abstract |
        //rel_concrete_bundle |
        //rel_bundle_concrete
    """
    cell_xpath = ".//mxCell"
    relations: dict = {}
    for relation in xml_tree.xpath(relation_xpath):
        attrs = dict(relation.items())
        cell_attrs = dict(relation.find(cell_xpath).items())
        if not (attrs and cell_attrs):
            continue
        # NOTE(review): the key is the arrow's *target* and the destination
        # its *source* - direction deliberately inverted, kept as-is.
        source_id = int(cell_attrs["target"])
        relations.setdefault(source_id, []).append(
            {
                "destination": int(cell_attrs["source"]),
            }
        )
    return relations
def generate_session_class(
    omc_interface_xml: etree._ElementTree,
) -> Code:
    """Build the Code object declaring the OMCSession class.

    Each non-package element of OpenModelica.Scripting becomes a
    class-level alias; unsupported elements are kept as commented lines.
    """
    body = Code()
    session_code = Code(
        "class OMCSession(",
        CodeWithIndent(
            "OMCSessionBase,",
        ),
        "):",
        CodeWithIndent(
            body
        )
    )
    body.append("OpenModelica = OpenModelica")
    scripting, = omc_interface_xml.xpath(
        '//*[@id="OpenModelica.Scripting"]'
    )
    for element in scripting.xpath('./classes/*'):
        if element.tag == "package":
            # Sub-packages are not exposed on the session object.
            continue
        className = TypeName(element.attrib["id"])
        alias = f"{className.last_identifier} = {className}"
        if is_supported_element(element):
            body.append(alias)
        else:
            body.append(f"# {alias}")
    return session_code
def xmlGetTextNodes(self, doc: etree._ElementTree, xpath: str, namespaces: dict):
    """Shorthand to retrieve serialized text nodes matching a specific xpath.

    :param lxml.etree._ElementTree doc: XML element to parse
    :param str xpath: Xpath to reach
    :param dict namespaces: XML namespaces like `lxml.etree.getroot().nsmap`
    """
    matches = doc.xpath(xpath, namespaces=namespaces)
    return ", ".join(matches)
def parse_additional_resources(etree: ET) -> Tuple[Tuple[str, str], ...]:
    """Parse tuple of additional resources."""
    resources = []
    for paragraph in etree.xpath("//*[@id='additional-resources']//p"):
        label = clean(paragraph.text_content())
        href = first(paragraph.xpath(".//a/@href"))
        resources.append((label, href))
    return tuple(resources)
def parse_root(self, xml_tree: etree._ElementTree) -> dict:
    """Return the id/name of the first ``root`` element of the model tree.

    :param xml_tree: parsed XML document to search.
    :returns: dict with the root's integer ``id`` and its ``name`` (label).
    :raises ValueError: if no ``root`` element with attributes exists.
    """
    xpath = "//root"
    for root in xml_tree.xpath(xpath):
        root_dict = dict(root.items())
        if root_dict:
            return {
                "id": int(root_dict["id"]),
                "name": root_dict["label"],
            }
    # Previously raised a bare ValueError(); give the caller a usable message.
    raise ValueError("no <root> element with attributes found in the XML tree")
def get_fs(tree: etree._ElementTree) -> ty.Dict[str, FeatureStructure]:
    """Find and parse all the feature structures in `tree`.

    Return
    ------
    A dict mapping feature structures ids to their parsed contents.
    """
    elements = tree.xpath("//tei:fs", namespaces=NSMAP)
    if not elements:
        raise ElementNotFoundError(
            "There are no feature structure elements in this tree"
        )
    parsed = dict()
    for fs in elements:
        parsed[xmlid(fs)] = parse_fs(fs)
    return parsed
def generate_module_py(
    omc_interface_xml: etree._ElementTree,
) -> Code:
    """Assemble the full module source: imports, class tree, session class."""
    class_tree = generate_nested_modelica_class(
        omc_interface_xml.xpath('//*[@id]')
    )
    return Code(
        empty_line,
        generate_import_statements(),
        empty_line * 2,
        class_tree.to_code(),
        empty_line * 2,
        generate_session_class(
            omc_interface_xml,
        ),
    )
def _collect_action_list(et: etree._ElementTree, action_list_name: str) \ -> Tuple[Union[None, etree._Element], Union[None, etree._Element], List[Union[None, etree._Element]]]: al_elem = et.find(f"*actionList[@name='{action_list_name}']") condition = et.xpath(f"*/condition/actionListName[text()='{action_list_name}']/..")[0] if al_elem is None or condition is None: return None, None, [None] # Collect affected state objects state_objects = list() for state_object_name in condition.xpath(f"stateCondition/stateObjectName"): state_object = et.find(f"*stateObject[@name='{state_object_name.text}']") if state_object is not None: state_objects.append(state_object) return al_elem, condition, state_objects
def xmlGetTextTag(self, doc: etree._ElementTree, xpath: str, namespaces: dict, key: str):
    """Get information from a tag attribute when it isn't in text nodes.

    :param lxml.etree._ElementTree doc: XML element to parse
    :param str xpath: Xpath to reach
    :param dict namespaces: XML namespaces like 'lxml.etree.getroot().nsmap'
    :param key : XML key to find like 'codeListValue'
    """
    matches = doc.xpath(xpath, namespaces=namespaces)
    if not matches:
        # Historical contract: the literal string "None", not the None object.
        return "None"
    return matches[0].get(key, None)
def parse_features(self, xml_tree: etree._ElementTree) -> dict:
    """Extract concrete/abstract feature nodes, skipping clone elements.

    :param xml_tree: parsed XML document.
    :returns: dict mapping integer feature ids to ``{"id", "name"}`` dicts.
    """
    xpath = """
        //concrete |
        //abstract
    """
    features = {}
    for feature in xml_tree.xpath(xpath):
        feature_dict = dict(feature.items())
        # Idiom fix: `X not in y` instead of `not X in y` (PEP 8 / E713).
        if feature_dict and FEATURE_CLON_SUFIX not in feature_dict["id"]:
            feature_id = int(feature_dict["id"])
            features[feature_id] = {
                "id": feature_id,
                "name": feature_dict["label"],
            }
    return features
def get_mentions(tree: etree._ElementTree,) -> ty.Dict[ty.Tuple[str, str], Mention]:
    """Extract the mentions from an ANCOR-TEI document."""
    mentions = tree.xpath(
        (
            './tei:standOff/tei:annotation[@tei:type="coreference"]'
            '/tei:spanGrp[@tei:subtype="mention"]/tei:span'
        ),
        namespaces=NSMAP,
    )
    if not mentions:
        raise ValueError("`tree` has no mention spans")
    features = get_fs(tree)
    texts_lst = tree.findall(f"{TEI}text")
    if not texts_lst:
        raise ValueError(
            "Attempting to extract mentions from a document without a text"
        )
    # Index every token element by its xml:id for fast target resolution.
    tokens_id_store = {
        xmlid(elt): elt for text in texts_lst for elt in text.iter(*TOKEN_TAGS)
    }
    res = dict()
    for m_elt in mentions:
        try:
            m = Mention.from_urs(m_elt, tokens_id_store.get, features.get)
        except ValueError as e:
            # BUG FIX: `m` is unbound when from_urs raises - log the source
            # element `m_elt` instead (was `xmlid(m)`, a NameError).
            logger.warning(f"Skipping span {xmlid(m_elt)}: {e}")
            continue
        if m.span_type not in MENTION_TYPES:
            if m.span_type in IGNORED_MENTION_TYPES:
                logger.debug(
                    f"Ignoring span {m.identifier!r} with mention type {m.span_type!r}"
                )
            else:
                logger.warning(
                    f"Span {m.identifier!r} has an invalid mention type ({m.span_type!r})"
                )
            continue
        res[(xmlid(m.targets[0]), xmlid(m.targets[-1]))] = m
    return res
def get_chains(tree: etree._ElementTree) -> ty.Dict[str, ty.Set[str]]:
    """Map schema link ids to the sets of mention ids they reference.

    Only the first schema linkGrp is used; extra groups are logged and
    ignored.  Raises ValueError for a link without a target attribute.
    """
    groups = tree.xpath(
        './tei:standOff/tei:annotation[@tei:type="coreference"]/tei:linkGrp[@tei:type="schema"]',
        namespaces=NSMAP,
    )
    schema_grp = groups[0]
    if len(groups) > 1:
        logger.warning(
            "There are more than one schema group in this document"
            f", only {xmlid(schema_grp)!r} will be taken into account"
        )
    chains = dict()
    for link in schema_grp.iter(f"{TEI}link"):
        link_id = xmlid(link)
        target = link.get(f"{TEI}target")
        if target is None:
            raise ValueError(f"Schema {link_id!r} has no target attribute")
        chains[link_id] = {target_to_id(t) for t in target.split()}
    return chains
def parse_feature_relations(self, xml_tree: etree._ElementTree) -> dict:
    """Collect feature-to-feature relations keyed by source feature id.

    "requires" arrows are read as drawn; all other relation types have
    source/target swapped (original convention preserved).
    """
    relation_xpath = """
        //rel_concrete_root |
        //rel_abstract_root |
        //rel_concrete_abstract |
        //rel_concrete_concrete |
        //rel_abstract_concrete |
        //rel_abstract_abstract
    """
    cell_xpath = ".//mxCell"
    relations: dict = {}
    for relation in xml_tree.xpath(relation_xpath):
        attrs = dict(relation.items())
        cell_attrs = dict(relation.find(cell_xpath).items())
        if not (attrs and cell_attrs):
            continue
        # Account for inverted direction in requires relations
        if attrs["relType"] == "requires":
            src, dst = cell_attrs["source"], cell_attrs["target"]
        else:
            src, dst = cell_attrs["target"], cell_attrs["source"]
        relations.setdefault(int(src), []).append(
            {
                "destination": int(dst),
                "constraint_type": attrs["relType"],
            }
        )
    return relations
def parse_bundles(self, xml_tree: etree._ElementTree) -> dict:
    """Extract bundle nodes into a dict keyed by integer bundle id.

    "range" bundles become "group_cardinality" entries with low/high
    thresholds; any other bundle type is stored lower-cased as-is.
    """
    bundles: dict = {}
    for bundle in xml_tree.xpath("//bundle"):
        attrs = dict(bundle.items())
        if not attrs:
            continue
        bundle_id = int(attrs["id"])
        bundle_type = attrs["bundleType"].lower()
        if bundle_type == "range":
            bundles[bundle_id] = {
                "id": bundle_id,
                "bundle_type": "group_cardinality",
                "low_threshold": attrs["lowRange"],
                "high_threshold": attrs["highRange"],
            }
        else:
            bundles[bundle_id] = {
                "id": bundle_id,
                "bundle_type": bundle_type,
            }
    return bundles
def parse_abstract(etree: ET) -> str:
    """Parse abstract."""
    # NOTE(review): xpath(...text()) yields a list of strings, not a str;
    # annotation kept as-is for interface compatibility.
    query = "//div[contains(@class, 'field-type-text-with-summary')]//p//text()"
    return etree.xpath(query)
def parse_authors(etree: ET) -> str:
    """Parse all authors."""
    author_query = "//div[@id='info']"
    return etree.xpath(author_query)
def parse_references(etree: ET) -> TS:
    """Parse tuple of reference."""
    reference_query = "//*[@id='bibliography']//p"
    return etree.xpath(reference_query)
def parse_canonical(etree: ET) -> str:
    """Parse canonical."""
    canonical_query = "//link[contains(@rel, 'canonical')]/@href"
    return etree.xpath(canonical_query)
def parse_keywords(etree: ET) -> TS:
    """Parse tuple of keywords."""
    keyword_query = "//*[@id='keywords']//li//text()"
    return etree.xpath(keyword_query)
def parse_learning_objectives(etree: ET) -> TS:
    """Parse tuple of learning objectives."""
    objective_query = (
        "//div[contains(@class, 'field-name-field-learning-objectives')]"
        "//li"
    )
    return etree.xpath(objective_query)
def parse_instructional_assessment_questions(etree: ET) -> TS:
    """Parse tuple of instructional assessment questions."""
    question_query = (
        "//div[contains(@class, 'field-name-field-learning-questions')]"
        "//div[contains(@class, 'even')]/ol/li"
    )
    return etree.xpath(question_query)
def parse_related_topics(etree: ET) -> TS:
    """Parse tuple of related topics."""
    topic_query = "//*[@id='related-topics']//a//@href"
    return etree.xpath(topic_query)
def get_crumb(cls, page: _ElementTree) -> PaliCrumb:
    """Classify the page by the href of its last breadcrumb anchor."""
    anchors = page.xpath("//CRUMBS/a")
    last_anchor: _Element = anchors[-1]
    return PaliCrumb(last_anchor.get("href"))
def get_paragraphs(cls, page: _ElementTree) -> List[_Element]:
    """Return every <p> element under the page body."""
    paragraph_query = "//body//p"
    return page.xpath(paragraph_query)
def validate_xpath(xmlnode: etree._ElementTree, xpath: str, ns: str, attr: str,
                   valrow: Dict, failcat: str = "ERROR") -> Dict:
    """Populate ``valrow`` with the validation result for an xpath lookup.

    Exactly one matching node is expected.  With ``attr == ""`` the node's
    text content is validated; otherwise the named attribute is.  On
    success ``VALIOUT`` is "PASSED" and ``VALUE`` holds the value; on any
    failure ``VALIOUT`` is ``failcat`` and ``VALIMSG`` explains why.

    Args:
        xmlnode: root or parent node to search from.
        xpath: xpath expression to evaluate.
        ns: namespace URI; when non-empty each path step is prefixed
            with ``ns:`` (same simple-path limitation as before).
        attr: attribute name, or "" to validate the node's text.
        valrow: validation row dict to populate in place.
        failcat: output category on failure (e.g. ERROR or WARNING).

    Returns:
        The populated ``valrow``.
    """
    valrow["XPATH"] = xpath
    if ns != "":
        nodes = xmlnode.xpath(xpath.replace("/", "/ns:"), namespaces={"ns": ns})
    else:
        nodes = xmlnode.xpath(xpath)

    # Guard clauses for the two failure cardinalities.
    if len(nodes) > 1:
        valrow["VALIOUT"] = failcat
        valrow["VALIMSG"] = "Multiple nodes in XML"
        return valrow
    if not nodes:
        valrow["VALIOUT"] = failcat
        valrow["VALIMSG"] = "Node not found"
        return valrow

    node = nodes[0]
    if attr == "":
        value = node.text
        missing_msg = ("Node found but value is missing or empty"
                       " string")
    else:
        value = node.get(attr)
        missing_msg = "Node found but attribute is missing"

    if value is None:
        valrow["VALIOUT"] = failcat
        valrow["VALIMSG"] = missing_msg
    else:
        valrow["VALIOUT"] = "PASSED"
        valrow["VALIMSG"] = ""
        valrow["VALUE"] = value
    return valrow
def Check(root: etree._ElementTree) -> int:
    """Validate key score elements; return 0 when OK, else an error code.

    Codes: 1 = not exactly one <sign>, 2 = more than one
    <clef-octave-change>, 3 = not exactly one <fifths>.
    """
    checks = (
        ('.//sign/text()', lambda n: n != 1, 1),
        ('.//clef-octave-change/text()', lambda n: n > 1, 2),
        ('.//fifths/text()', lambda n: n != 1, 3),
    )
    for query, is_bad, code in checks:
        if is_bad(len(root.xpath(query))):
            return code
    return 0
def parse_topic_description(etree: ET) -> TS:
    """Parse tuple of topic description."""
    toc_query = "//*[@id='toc']//ol//a//text()"
    return etree.xpath(toc_query)