def validate_xml(xml, xsl_fname='fgdc', as_dataframe=False):
    """
    Validate an XML record against an XSD schema and list the failures.

    Parameters
    ----------
    xml : lxml document or filename or string containing xml representation
    xsl_fname : str (optional)
        can be one of:
        'fgdc' - uses the standard fgdc schema
            ../resources/FGDC/fgdc-std-001-1998-annotated.xsd
        'bdp' - uses the Biological Data profile schema,
            ../resources/FGDC/BDPfgdc-std-001-1998-annotated.xsd
        full file path to another local schema.

        if not specified defaults to 'fgdc'
    as_dataframe : bool (optional)
        when True the errors are returned as a pandas DataFrame instead
        of a list of tuples

    Returns
    -------
    list of tuples (xpath, error message, line number)
    or
    pandas DataFrame with the columns 'xpath', 'message', 'line number'.

    A document that validates always yields an empty list, whatever the
    value of as_dataframe.
    """
    # The two keywords select a bundled schema; anything else is taken
    # verbatim as the path of a local schema file.
    if xsl_fname.lower() == 'fgdc':
        schema_path = utils.get_resource_path(
            'fgdc/fgdc-std-001-1998-annotated.xsd')
    elif xsl_fname.lower() == 'bdp':
        schema_path = utils.get_resource_path(
            'fgdc/BDPfgdc-std-001-1998-annotated.xsd')
    else:
        schema_path = xsl_fname

    xmlschema = etree.XMLSchema(etree.parse(schema_path))

    # Serialize whatever was passed in and parse it again before validating.
    loaded_doc = xml_utils.xml_document_loader(xml)
    serialized = xml_utils.node_to_string(loaded_doc)
    tree = etree.ElementTree(etree.fromstring(serialized))

    if xmlschema.validate(tree):
        return []

    # Map source line -> element path.  When several elements share a line
    # the last one visited wins; the root is written last of all.
    line_lookup = {}
    for element in tree.xpath('.//*'):
        line_lookup[element.sourceline] = tree.getpath(element)
    root = tree.getroot()
    line_lookup[root.sourceline] = tree.getpath(root)

    errors = []
    for error in xmlschema.error_log:
        element_path = line_lookup.get(error.line)
        if element_path is None:
            # No element was recorded for the reported line.
            location = 'Unknown'
        else:
            # Drop the first character of the stored path.
            location = element_path[1:]
        errors.append(
            (location, clean_error_message(error.message), error.line))

    if not as_dataframe:
        return errors

    column_names = ['xpath', 'message', 'line number']
    return pd.DataFrame.from_records(errors, columns=column_names)
def validate_xml(xml, xsl_fname="fgdc", as_dataframe=False):
    """
    Validate an XML record against a schema plus lineage cross-checks.

    In addition to the XSD validation, three checks are made on the
    lineage section:

    * every ``srcinfo/srccitea`` has text,
    * every ``procstep/srcprod`` has text,
    * every ``procstep/srcused`` text matches one of the ``srccitea`` or
      ``srcprod`` values found in the record.

    Failures of these checks are reported with line number 1.

    Parameters
    ----------
    xml : lxml document or filename or string containing xml representation
    xsl_fname : str (optional)
        can be one of:
        'fgdc' - uses the standard fgdc schema (resource FGDC_XSD_NAME)
        'bdp' - uses the Biological Data profile schema
            (resource BDP_XSD_NAME)
        full file path to another local schema.

        if not specified defaults to 'fgdc'
    as_dataframe : bool
        used to specify return format (list of tuples or dataframe)

    Returns
    -------
    list of tuples (xpath, error message, line number)
    or
    pandas dataframe with the columns 'xpath', 'message', 'line number'.

    When no problem is found an empty list is returned, whatever the
    value of as_dataframe.
    """
    schema_key = xsl_fname.lower()
    if schema_key == "fgdc":
        xsl_fname = utils.get_resource_path(FGDC_XSD_NAME)
    elif schema_key == "bdp":
        xsl_fname = utils.get_resource_path(BDP_XSD_NAME)
    # Any other value is used unchanged as the path of a local schema.

    xmlschema = xml_utils.load_schema(xsl_fname)

    # Serialize the input and parse it again before validating.
    xml_doc = xml_utils.xml_document_loader(xml)
    xml_str = xml_utils.node_to_string(xml_doc)
    tree_node = xml_utils.string_to_node(xml_str.encode("utf-8"))
    # NOTE(review): the result of this call is discarded; it looks like it
    # has no effect -- confirm before removing.
    lxml._etree._ElementTree(tree_node)

    errors = []
    # Every abbreviation defined by a source (srccitea) or produced by a
    # process step (srcprod); srcused values are checked against this list.
    srcciteas = []

    # --- source citation abbreviations must not be empty ---------------
    src_xpath = "dataqual/lineage/srcinfo/srccitea"
    src_nodes = tree_node.xpath(src_xpath)
    empty_srccitea_msg = "source citation abbreviation cannot be empty"
    for i, src in enumerate(src_nodes):
        srcciteas.append(src.text)
        if src.text is None:
            if len(src_nodes) == 1:
                error_xpath = "metadata/" + src_xpath
            else:
                xpath = "metadata/dataqual/lineage/srcinfo[{}]/srccitea"
                error_xpath = xpath.format(i + 1)
            errors.append((error_xpath, empty_srccitea_msg, 1))

    # --- source produced abbreviations must not be empty ---------------
    procstep_xpath = "dataqual/lineage/procstep"
    procstep_nodes = tree_node.xpath(procstep_xpath)
    empty_srcprod_msg = "source produced abbreviation cannot be empty"
    for proc_i, proc in enumerate(procstep_nodes):
        srcprod_nodes = proc.xpath("srcprod")
        for srcprod_i, srcprod in enumerate(srcprod_nodes):
            srcciteas.append(srcprod.text)
            if srcprod.text is None:
                # Positional predicates are only added where there is more
                # than one sibling to choose from.
                error_xpath = procstep_xpath
                if len(procstep_nodes) > 1:
                    error_xpath += "[{}]".format(proc_i + 1)
                error_xpath += "/srcprod"
                if len(srcprod_nodes) > 1:
                    # Index of the srcprod within its procstep (this used
                    # to repeat the procstep index by mistake).
                    error_xpath += "[{}]".format(srcprod_i + 1)
                errors.append(
                    ("metadata/" + error_xpath, empty_srcprod_msg, 1)
                )

    # --- every source used must refer to a known abbreviation ----------
    srcused_xpath = "dataqual/lineage/procstep/srcused"
    srcused_nodes = tree_node.xpath(srcused_xpath)
    for i, src in enumerate(srcused_nodes):
        if src.text not in srcciteas:
            message = (
                "Source Used Citation Abbreviation {} "
                "not found in Source inputs "
                "used".format(src.text)
            )
            if len(srcused_nodes) == 1:
                error_xpath = "metadata/" + srcused_xpath
            else:
                # NOTE(review): i counts srcused nodes across all process
                # steps, so the procstep index is only accurate when each
                # procstep holds a single srcused -- confirm intent.
                xpath = "metadata/dataqual/lineage/procstep[{}]/srcused"
                error_xpath = xpath.format(i + 1)
            errors.append((error_xpath, message, 1))

    # The schema is validated unconditionally so that error_log is filled
    # in even when the lineage checks above already found problems.
    if xmlschema.validate(tree_node) and not errors:
        return []

    # Map source line -> element path.  When several elements share a line
    # the last one visited wins; the root node is written last of all.
    root_tree = tree_node.getroottree()
    line_lookup = {
        e.sourceline: root_tree.getpath(e) for e in tree_node.xpath(".//*")
    }
    line_lookup[tree_node.sourceline] = root_tree.getpath(tree_node)

    fgdc_lookup = get_fgdc_lookup()
    for error in xmlschema.error_log:
        error_msg = clean_error_message(error.message, fgdc_lookup)
        try:
            # Drop the first character of the stored path.
            error_xpath = line_lookup[error.line][1:]
        except KeyError:
            # No element was recorded for the reported line.
            error_xpath = "Unknown"
        errors.append((error_xpath, error_msg, error.line))

    # Remove exact duplicates while keeping the first-seen order.
    errors = list(OrderedDict.fromkeys(errors))

    if as_dataframe:
        cols = ["xpath", "message", "line number"]
        return pd.DataFrame.from_records(errors, columns=cols)
    return errors