def validate_xml(xml, xsl_fname='fgdc', as_dataframe=False):
    """
    Validate an xml record against an XSD schema and report the errors.

    Parameters
    ----------
    xml : lxml document
                or
          filename
                or
          string containing xml representation

    xsl_fname : str (optional)
                can be one of (compared case-insensitively):
                'fgdc' - uses the standard fgdc schema
                        ../resources/FGDC/fgdc-std-001-1998-annotated.xsd
                'bdp' = use the Biological Data profile schema,
                        ../resources/FGDC/BDPfgdc-std-001-1998-annotated.xsd
                full file path to another local schema.

                if not specified defaults to 'fgdc'
    as_dataframe : bool (optional)
                if True, errors are returned as a pandas dataframe
                instead of a list of tuples.  A document with no errors
                always yields an empty list, regardless of this flag.

    Returns
    -------
        list of tuples
        (xpath, error message, line number)
        or
        pandas dataframe with the columns
        'xpath', 'message', 'line number'

        The xpath is 'Unknown' when no element starts on the line the
        schema validator reported.
    """
    schema_key = xsl_fname.lower()
    if schema_key == 'fgdc':
        xsl_fname = utils.get_resource_path('fgdc/fgdc-std-001-1998-annotated.xsd')
    elif schema_key == 'bdp':
        xsl_fname = utils.get_resource_path('fgdc/BDPfgdc-std-001-1998-annotated.xsd')
    # any other value is used unchanged as the path to a schema file

    xmlschema = etree.XMLSchema(etree.parse(xsl_fname))

    # The document is serialized and parsed again before validating;
    # presumably so element source lines match the lines reported in the
    # schema error log -- TODO confirm.
    xml_str = xml_utils.node_to_string(xml_utils.xml_document_loader(xml))
    tree = etree.ElementTree(etree.fromstring(xml_str))

    if xmlschema.validate(tree):
        return []

    # Map source line -> element path.  When several elements share a
    # line the last one in document order wins; the root is set last.
    line_lookup = {e.sourceline: tree.getpath(e) for e in tree.xpath('.//*')}
    root = tree.getroot()
    line_lookup[root.sourceline] = tree.getpath(root)

    errors = []
    for error in xmlschema.error_log:
        if error.line in line_lookup:
            # drop the leading '/' of the absolute path
            xpath = line_lookup[error.line][1:]
        else:
            xpath = 'Unknown'
        errors.append((xpath, clean_error_message(error.message), error.line))

    if as_dataframe:
        cols = ['xpath', 'message', 'line number']
        return pd.DataFrame.from_records(errors, columns=cols)
    return errors
Exemple #2
0
def validate_xml(xml, xsl_fname="fgdc", as_dataframe=False):
    """
    Validate an xml record against an XSD schema and against additional
    source-citation consistency rules, and report the errors found.

    Besides schema validation, the following lineage checks are made:
      * every dataqual/lineage/srcinfo/srccitea must have text
      * every dataqual/lineage/procstep/srcprod must have text
      * every dataqual/lineage/procstep/srcused must match the text of
        one of the srccitea or srcprod elements

    Parameters
    ----------
    xml : lxml document
                or
          filename
                or
          string containing xml representation

    xsl_fname : str (optional)
                can be one of (compared case-insensitively):
                'fgdc' - uses the standard fgdc schema
                        ../resources/FGDC/fgdc-std-001-1998-annotated.xsd
                'bdp' = use the Biological Data profile schema,
                        ../resources/FGDC/BDPfgdc-std-001-1998-annotated.xsd
                full file path to another local schema.

                if not specified defaults to 'fgdc'
    as_dataframe : bool
                used to specify return format (list of tuples or dataframe).
                A document with no errors always yields an empty list,
                regardless of this flag.

    Returns
    -------
        list of tuples
        (xpath, error message, line number)
        or
        pandas dataframe

        Errors from the lineage checks come first and always carry line
        number 1; schema errors follow.  Exact duplicates are removed.
    """
    schema_key = xsl_fname.lower()
    if schema_key == "fgdc":
        xsl_fname = utils.get_resource_path(FGDC_XSD_NAME)
    elif schema_key == "bdp":
        xsl_fname = utils.get_resource_path(BDP_XSD_NAME)
    # any other value is used unchanged as the path to a schema file

    xmlschema = xml_utils.load_schema(xsl_fname)
    xml_doc = xml_utils.xml_document_loader(xml)
    xml_str = xml_utils.node_to_string(xml_doc)

    tree_node = xml_utils.string_to_node(xml_str.encode("utf-8"))
    # NOTE(review): the result of this call is discarded.  It is kept
    # unchanged here; confirm whether it is needed and remove if not.
    lxml._etree._ElementTree(tree_node)

    errors = []
    # Text of every source abbreviation that a srcused may refer to.
    srcciteas = []

    src_xpath = "dataqual/lineage/srcinfo/srccitea"
    src_nodes = tree_node.xpath(src_xpath)
    for i, src in enumerate(src_nodes):
        srcciteas.append(src.text)
        if src.text is None:
            if len(src_nodes) == 1:
                error_xpath = "metadata/" + src_xpath
            else:
                error_xpath = (
                    "metadata/dataqual/lineage/srcinfo[{}]/srccitea"
                ).format(i + 1)
            errors.append((
                error_xpath,
                "source citation abbreviation cannot be empty",
                1,
            ))

    procstep_xpath = "dataqual/lineage/procstep"
    procstep_nodes = tree_node.xpath(procstep_xpath)
    for proc_i, proc in enumerate(procstep_nodes):
        srcprod_nodes = proc.xpath("srcprod")
        for srcprod_i, srcprod in enumerate(srcprod_nodes):
            srcciteas.append(srcprod.text)
            if srcprod.text is None:
                # Positional predicates are only added where siblings
                # make the plain path ambiguous.
                error_xpath = procstep_xpath
                if len(procstep_nodes) > 1:
                    error_xpath += "[{}]".format(proc_i + 1)
                error_xpath += "/srcprod"
                if len(srcprod_nodes) > 1:
                    # index of this srcprod within its own procstep
                    error_xpath += "[{}]".format(srcprod_i + 1)
                errors.append((
                    "metadata/" + error_xpath,
                    "source produced abbreviation cannot be empty",
                    1,
                ))

    srcused_xpath = "dataqual/lineage/procstep/srcused"
    srcused_nodes = tree_node.xpath(srcused_xpath)
    for i, src in enumerate(srcused_nodes):
        if src.text not in srcciteas:
            if len(srcused_nodes) == 1:
                error_xpath = "metadata/" + srcused_xpath
            else:
                # NOTE(review): ``i`` counts srcused nodes across all
                # process steps, so it only equals the procstep position
                # when every procstep has exactly one srcused -- confirm
                # the intended xpath before changing.
                error_xpath = (
                    "metadata/dataqual/lineage/procstep[{}]/srcused"
                ).format(i + 1)
            errors.append((
                error_xpath,
                "Source Used Citation Abbreviation {} "
                "not found in Source inputs "
                "used".format(src.text),
                1,
            ))

    if xmlschema.validate(tree_node) and not errors:
        return []

    # Map source line -> element path.  When several elements share a
    # line the last one in document order wins; the root is set last.
    root_tree = tree_node.getroottree()
    line_lookup = {e.sourceline: root_tree.getpath(e)
                   for e in tree_node.xpath(".//*")}
    line_lookup[tree_node.sourceline] = root_tree.getpath(tree_node)

    fgdc_lookup = get_fgdc_lookup()

    for error in xmlschema.error_log:
        error_msg = clean_error_message(error.message, fgdc_lookup)
        if error.line in line_lookup:
            # drop the leading '/' of the absolute path
            errors.append((line_lookup[error.line][1:], error_msg, error.line))
        else:
            errors.append(("Unknown", error_msg, error.line))

    # Remove exact duplicates while keeping first-seen order.
    errors = list(OrderedDict.fromkeys(errors))

    if as_dataframe:
        cols = ["xpath", "message", "line number"]
        return pd.DataFrame.from_records(errors, columns=cols)
    return errors