def test_valid_extra_schematron(self):
    # The document carries three ``Percent`` elements while the extra
    # Schematron allows fewer than three, so ``_validate_sch`` is expected
    # to fail with exactly one error holding the assertion text.
    #
    # ``<`` has to be written as ``&lt;`` inside an XML attribute value; a
    # raw ``<`` makes the schema ill-formed and ``etree.parse`` rejects it
    # before any validation can run.
    extra_sch = io.BytesIO(b'''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron">
  <pattern id="two_elements">
    <title>Max 2 elements allowed.</title>
    <rule context="Total">
      <assert test="count(//Percent) &lt; 3">Element 'Total': More than 2 elements.</assert>
    </rule>
  </pattern>
</schema>
''')

    fp = etree.parse(
        io.BytesIO(
            b'<Total><Percent>70</Percent><Percent>20</Percent><Percent>10</Percent></Total>'
        ))
    xml = domain.XMLValidator.parse(fp, no_doctype=True,
                                    sps_version='sps-1.1')

    # Replace both validators with locally built ones.
    xml.schematron = isoschematron.Schematron(etree.parse(sample_sch))
    xml.extra_schematron = isoschematron.Schematron(etree.parse(extra_sch))

    result, errors = xml._validate_sch()

    self.assertFalse(result)
    self.assertEqual(len(errors), 1)
    self.assertEqual(errors[0].message,
                     "Element 'Total': More than 2 elements.")
def xml_validate(parent, xmlns, xpath=None, rng=None, sch=None):
    """Check the nodes selected by ``xpath`` against RelaxNG/Schematron rules.

    Nothing happens when ``xpath`` is empty. Otherwise every node returned by
    ``parent.xpath(xpath, namespaces=xmlns)`` is validated first against
    ``rng`` (RelaxNG source text) and then against ``sch`` (Schematron source
    text), for whichever of the two was supplied.

    Raises:
        cfy_exc.NonRecoverableError: for the first node that fails a check.
    """
    if not xpath:
        return

    # Compile only the rule sets that were actually handed in.
    rng_validator = etree.RelaxNG(etree.XML(rng)) if rng else None
    sch_validator = isoschematron.Schematron(etree.XML(sch)) if sch else None

    for candidate in parent.xpath(xpath, namespaces=xmlns):
        if rng_validator and not rng_validator.validate(candidate):
            raise cfy_exc.NonRecoverableError("Not valid xml by rng")
        if sch_validator and not sch_validator.validate(candidate):
            raise cfy_exc.NonRecoverableError(
                "Not valid xml by Schematron")
def test_schematron_assertValid(self):
    # ``assertValid`` must raise ``DocumentInvalid`` for a document whose
    # ``number_of_entries`` disagrees with the actual number of entries.
    rules = self.parse('''\
<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
  <sch:pattern id="number_of_entries">
    <sch:title>mandatory number_of_entries tests</sch:title>
    <sch:rule context="number_of_entries">
      <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
    </sch:rule>
  </sch:pattern>
</sch:schema>
''')
    # Declares 0 entries and has none.
    good_doc = self.parse('''\
<message>
  <number_of_entries>0</number_of_entries>
  <entries>
  </entries>
</message>
''')
    # Declares 3 entries but only carries 2.
    bad_doc = self.parse('''\
<message>
  <number_of_entries>3</number_of_entries>
  <entries>
    <entry>Entry 1</entry>
    <entry>Entry 2</entry>
  </entries>
</message>
''')

    validator = isoschematron.Schematron(rules)

    accepted = validator(good_doc)
    self.assertTrue(accepted, validator.error_log)
    self.assertRaises(etree.DocumentInvalid,
                      validator.assertValid, bad_doc)
def test_extra_schematron_thru_parse(self):
    # An extra Schematron handed to ``XMLValidator.parse`` through
    # ``extra_sch_schemas`` must contribute its failed assertion to the
    # errors returned by ``validate_style``.
    #
    # ``<`` has to be written as ``&lt;`` inside an XML attribute value; a
    # raw ``<`` makes the schema ill-formed and ``etree.parse`` rejects it.
    extra_sch = io.BytesIO(b'''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron">
  <pattern id="two_elements">
    <title>Max 2 elements allowed.</title>
    <rule context="Total">
      <assert test="count(//Percent) &lt; 3">Element 'Total': More than 2 elements.</assert>
    </rule>
  </pattern>
</schema>
''')

    fp = etree.parse(
        io.BytesIO(
            b'<Total><Percent>70</Percent><Percent>20</Percent><Percent>10</Percent></Total>'
        ))
    extra_sch_obj = isoschematron.Schematron(etree.parse(extra_sch))
    xml = domain.XMLValidator.parse(fp, no_doctype=True,
                                    sps_version='sps-1.1',
                                    extra_sch_schemas=[extra_sch_obj])

    result, errors = xml.validate_style()

    self.assertFalse(result)
    # assertIn reports the actual messages when the expectation fails.
    self.assertIn("Element 'Total': More than 2 elements.",
                  [err.message for err in errors])
def validate_xml(self, dict_file, parsed_xml_tree, validator_type,
                 validator_name):
    """Validate ``parsed_xml_tree`` with a validator named in ``self.Config``.

    The validator file path is ``ROOTDIR`` plus the value stored under
    ``self.Config[validator_type][validator_name]``. A ``'schema'`` validator
    is compiled as RelaxNG, a ``'schematron'`` validator as ISO Schematron.

    Args:
        dict_file: Name of the XML file being validated; only used in
            messages.
        parsed_xml_tree: Parsed XML document to validate.
        validator_type: ``'schema'`` or ``'schematron'``.
        validator_name: Key of the validator inside the ``validator_type``
            section of the configuration.

    Raises:
        FprimeXmlException: if the validator is not configured, if
            ``validator_type`` is not a supported kind, or if a ``'schema'``
            validation fails. A failed ``'schematron'`` validation is only
            logged as a warning.
    """
    # Check that validator is valid
    if (validator_type not in self.Config
            or validator_name not in self.Config[validator_type]):
        msg = "XML Validator type " + validator_type + " not found in ConfigManager instance"
        raise FprimeXmlException(msg)

    # Pick the compiler up front so an unsupported type fails with a clear
    # error instead of an unbound local further down.
    if validator_type == 'schema':
        compile_validator = etree.RelaxNG
    elif validator_type == 'schematron':
        compile_validator = isoschematron.Schematron
    else:
        raise FprimeXmlException(
            "XML Validator type " + validator_type + " is not supported")

    validator_path = ROOTDIR + self.Config.get(validator_type, validator_name)

    # Create proper xml validator tool. The context manager closes the file
    # even when parsing raises; binary mode leaves encoding detection to the
    # XML parser.
    with open(validator_path, 'rb') as validator_file_handler:
        validator_parsed = etree.parse(validator_file_handler)
    validator_compiled = compile_validator(validator_parsed)

    # Validate XML file
    if validator_compiled.validate(parsed_xml_tree):
        return

    if validator_type == 'schema':
        msg = "XML file {} is not valid according to {} {}.".format(
            dict_file, validator_type, validator_path)
        raise FprimeXmlException(msg)

    # Schematron failures are advisory only.
    msg = "WARNING: XML file {} is not valid according to {} {}.".format(
        dict_file, validator_type, validator_path)
    PRINT.info(msg)
def get_xslt_tree(self, model_instance):
    """Return the root element of the validator XSLT, building it on first use.

    On the first call the Schematron tree obtained from
    ``self.get_schematron_tree(model_instance)`` is compiled with
    ``self.schematron_kwargs``; the compiled validator is kept on
    ``self._schematron`` and its XSLT root on ``self._schematron_xslt``.
    Later calls return the stored root unchanged.
    """
    if self._schematron_xslt is not None:
        return self._schematron_xslt

    sch_tree = self.get_schematron_tree(model_instance)
    self._schematron = isoschematron.Schematron(
        sch_tree, **self.schematron_kwargs)
    # NOTE(review): ``validator_xslt`` is presumably only populated when
    # ``schematron_kwargs`` enables ``store_xslt`` - confirm at the caller.
    self._schematron_xslt = self._schematron.validator_xslt.getroot()
    return self._schematron_xslt
def test_schematron_error_log(self):
    # A single failed assertion must leave exactly one entry in the
    # validator's error log.
    rules = self.parse('''\
<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
  <sch:pattern id="number_of_entries">
    <sch:title>mandatory number_of_entries tests</sch:title>
    <sch:rule context="number_of_entries">
      <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
    </sch:rule>
  </sch:pattern>
</sch:schema>
''')
    # Declares 0 entries and has none.
    good_doc = self.parse('''\
<message>
  <number_of_entries>0</number_of_entries>
  <entries>
  </entries>
</message>
''')
    # Declares 3 entries but only carries 2.
    bad_doc = self.parse('''\
<message>
  <number_of_entries>3</number_of_entries>
  <entries>
    <entry>Entry 1</entry>
    <entry>Entry 2</entry>
  </entries>
</message>
''')

    validator = isoschematron.Schematron(rules)

    accepted = validator(good_doc)
    self.assertTrue(accepted, validator.error_log)

    rejected = not validator(bad_doc)
    self.assertTrue(rejected)
    self.assertEqual(len(validator.error_log), 1,
                     'expected single error: %s (%s errors)' %
                     (validator.error_log, len(validator.error_log)))
def validate_schematron_pt_udf(pt):
    """Yield a failing ``Row`` for every record in ``pt`` that does not validate.

    Each row's ``document`` is parsed and checked against the Schematron at
    ``vs_filepath``. Valid records yield nothing. For an invalid record the
    payload lists the text of every ``svrl:failed-assert`` in the report. If
    anything raises while handling a record, a row with a single
    "Schematron validation exception" entry is yielded instead.
    """

    def _failed_row(record, payload):
        # Both failure paths emit the same row shape; only the payload differs.
        return Row(record_id=record._id,
                   record_identifier=record.record_id,
                   job_id=record.job_id,
                   validation_scenario_id=int(vs_id),
                   validation_scenario_name=vs_name,
                   valid=False,
                   results_payload=json.dumps(payload),
                   fail_count=payload['fail_count'])

    # parse schematron once for the whole partition
    validator = isoschematron.Schematron(etree.parse(vs_filepath),
                                         store_report=True)

    for row in pt:
        try:
            record_xml = etree.fromstring(row.document.encode('utf-8'))
            if validator.validate(record_xml):
                continue

            # collect the failed assertions from the stored SVRL report
            report_root = validator.validation_report.getroot()
            fails = report_root.findall('svrl:failed-assert',
                                        namespaces=report_root.nsmap)
            results_dict = {
                'fail_count': len(fails),
                'failed': [
                    fail.find('svrl:text', namespaces=fail.nsmap).text
                    for fail in fails
                ],
            }
            yield _failed_row(row, results_dict)

        except Exception as e:
            # Best effort: report the problem as a failed validation row
            # rather than aborting the whole partition.
            results_dict = {
                'fail_count': 1,
                'failed': ["Schematron validation exception: %s" % (str(e))],
            }
            yield _failed_row(row, results_dict)
def test_schematron(self):
    # ``validate`` accepts <AAA> holding only BBB and CCC and rejects it
    # once an extra DDD child is present (closed-model pattern).
    rules = self.parse('''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
  <pattern id="OpenModel">
    <title>Open Model</title>
    <rule context="AAA">
      <assert test="BBB"> BBB element is not present</assert>
      <assert test="CCC"> CCC element is not present</assert>
    </rule>
  </pattern>
  <pattern id="ClosedModel">
    <title>Closed model"</title>
    <rule context="AAA">
      <assert test="BBB"> BBB element is not present</assert>
      <assert test="CCC"> CCC element is not present</assert>
      <assert test="count(BBB|CCC) = count (*)">There is an extra element</assert>
    </rule>
  </pattern>
</schema>
''')
    good_doc = self.parse('<AAA><BBB/><CCC/></AAA>')
    bad_doc = self.parse('<AAA><BBB/><CCC/><DDD/></AAA>')

    validator = isoschematron.Schematron(rules)

    self.assertTrue(validator.validate(good_doc))
    rejected = not validator.validate(bad_doc)
    self.assertTrue(rejected)
def get_schematron(sct_path):
    """Build an lxml ``isoschematron.Schematron()`` from a schematron file.

    ``sct_path`` is first passed through ``_get_file_path``; the resulting
    file is parsed with blank text removed and compiled with
    ``store_report=True`` so the validation report is kept after each run.
    """
    resolved_path = _get_file_path(sct_path)
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
    schema_tree = etree.parse(resolved_path, parser=blank_stripping_parser)
    return isoschematron.Schematron(schema_tree, store_report=True)
def test_schematron_store_xslt(self):
    # ``validator_xslt`` is None by default and holds an element tree once
    # ``store_xslt=True`` is requested.
    rules = self.parse('''\
<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
  <sch:pattern id="number_of_entries">
    <sch:title>mandatory number_of_entries tests</sch:title>
    <sch:rule context="number_of_entries">
      <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
    </sch:rule>
  </sch:pattern>
</sch:schema>
''')

    without_xslt = isoschematron.Schematron(rules)
    self.assertTrue(without_xslt.validator_xslt is None)

    with_xslt = isoschematron.Schematron(rules, store_xslt=True)
    xslt_is_tree = isinstance(with_xslt.validator_xslt, etree._ElementTree)
    self.assertTrue(xslt_is_tree,
                    'expected validator xslt to be stored')
def get_schematron_from_buffer(buff, parser=NOIDS_XMLPARSER):
    """Parse ``buff`` with ``parser`` and compile it as an
    ``isoschematron.Schematron``.

    The default parser is ``NOIDS_XMLPARSER``, which is meant not to collect
    ids on a hash table (``collect_ids=False``).
    """
    return isoschematron.Schematron(etree.parse(buff, parser))
def setUp(self):
    """Load the SAND-MPD XML Schema and Schematron validators.

    Both schema files are addressed relative to the current working
    directory. The compiled validators are stored on
    ``self.sand_mpd_schema`` and ``self.sand_mpd_schematron``.
    """
    # Binary mode hands the raw bytes to the XML parser so it applies the
    # encoding declared by each document; text mode would first decode them
    # with the platform's default encoding.
    with open("../schemas/SAND-MPD.xsd", "rb") as schema:
        sand_mpd_schema_doc = etree.parse(schema)
    self.sand_mpd_schema = etree.XMLSchema(sand_mpd_schema_doc)

    with open("../schemas/SAND-MPD.sch", "rb") as schematron:
        sand_mpd_schematron_doc = etree.parse(schematron)
    self.sand_mpd_schematron = isoschematron.Schematron(
        sand_mpd_schematron_doc)
def main():
    """Validate a sample METS document against an inline Schematron schema.

    Prints whether the document is valid, the type of the stored validation
    report, and the report itself.
    """
    # Schematron rules: the METS root must carry @TYPE and
    # @csip:CONTENTTYPESPECIFICATION, and the latter must contain one of the
    # known content type names.
    schema_source = io.StringIO('''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
  <ns prefix="csip" uri="DILCIS"/>
  <ns prefix="ead" uri="urn:isbn:1-931666-22-9"/>
  <ns prefix="mets" uri="http://www.loc.gov/METS/"/>
  <pattern id="METS_root_element_validation">
    <title>Validate METS root element.</title>
    <rule id="CSIP3-TYPE-existence" context="mets:mets">
      <assert test="@TYPE">General content type attribute does not exist.</assert>
    </rule>
    <rule id="CSIP4-CONTENTTYPESPECIFICATION-existence" context="mets:mets">
      <assert test="@csip:CONTENTTYPESPECIFICATION">Content information type attribute does not exist.</assert>
    </rule>
  </pattern>
  <pattern id="METS_root_element_value_validation">
    <rule id="CSIP4-CONTENTTYPESPECIFICATION-value" context="mets:mets">
      <assert test="(contains(string(@csip:CONTENTTYPESPECIFICATION), 'SMURFERMS') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'SMURFSFSB') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'SIARD1') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'SIARD2') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'SIARDDK') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'GeoVectorGML') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'GeoRasterGeotiff') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'MIXED') or contains(string(@csip:CONTENTTYPESPECIFICATION), 'OTHER'))">Content information type attribute value is not known.
Known values are: SMURFERMS, SMURFSFSB, SIARD1, SIARD2, SIARDDK, GeoVectorGML, GeoRasterGeotiff, MIXED, OTHER.</assert>
    </rule>
  </pattern>
</schema>
''')

    # Document under test: its content type "SIARD3" is not a known value.
    invalid_source = io.StringIO('''\
<mets:mets xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:mets="http://www.loc.gov/METS/"
    xmlns:xlink="http://www.w3.org/1999/xlink"
    xmlns:csip="DILCIS"
    OBJID="uuid-4422c185-5407-4918-83b1-7abfa77de182"
    LABEL="Sample CS IP Information Package"
    TYPE="Database"
    csip:CONTENTTYPESPECIFICATION="SIARD3"
    PROFILE="http://www.eark-project.com/METS/IP.xml"
    xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd http://www.w3.org/1999/xlink http://www.loc.gov/standards/mets/xlink.xsd">
</mets:mets>
''')

    # Compile the schema; store_report keeps the report of the last run.
    validator = isoschematron.Schematron(etree.parse(schema_source),
                                         store_report=True)

    # Parse and validate the document.
    document = etree.parse(invalid_source)
    is_valid = validator.validate(document)

    # Validation report
    report = validator.validation_report

    print("is valid: " + str(is_valid))
    print(type(report))
    print(report)
def test_schematron_xmlschema_embedded(self):
    # Schematron rules embedded in an XML Schema annotation are picked up
    # by ``isoschematron.Schematron`` while the same tree still compiles
    # as a plain XML Schema.
    combined = self.parse('''\
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:sch="http://purl.oclc.org/dsdl/schematron">
  <xs:element name="message">
    <xs:complexType>
      <xs:sequence>
        <xs:element name="number_of_entries" type="xs:positiveInteger">
          <xs:annotation>
            <xs:appinfo>
              <sch:pattern id="number_of_entries">
                <sch:title>mandatory number_of_entries tests</sch:title>
                <sch:rule context="number_of_entries">
                  <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
                </sch:rule>
              </sch:pattern>
            </xs:appinfo>
          </xs:annotation>
        </xs:element>
        <xs:element name="entries">
          <xs:complexType>
            <xs:sequence>
              <xs:element name="entry" type="xs:string" minOccurs="0" maxOccurs="unbounded"/>
            </xs:sequence>
          </xs:complexType>
        </xs:element>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
</xs:schema>
''')
    # Declares 2 entries and has 2.
    good_doc = self.parse('''\
<message>
  <number_of_entries>2</number_of_entries>
  <entries>
    <entry>Entry 1</entry>
    <entry>Entry 2</entry>
  </entries>
</message>
''')
    # Declares 1 entry but carries 2.
    bad_doc = self.parse('''\
<message>
  <number_of_entries>1</number_of_entries>
  <entries>
    <entry>Entry 1</entry>
    <entry>Entry 2</entry>
  </entries>
</message>
''')

    xsd_validator = etree.XMLSchema(combined)
    sch_validator = isoschematron.Schematron(combined)

    # fwiw, this must also be XMLSchema-valid
    xsd_ok = xsd_validator(good_doc)
    self.assertTrue(xsd_ok, xsd_validator.error_log)
    self.assertTrue(sch_validator(good_doc))

    # still schema-valid
    xsd_ok = xsd_validator(bad_doc)
    self.assertTrue(xsd_ok, xsd_validator.error_log)
    sch_rejected = not sch_validator(bad_doc)
    self.assertTrue(sch_rejected)
def test_schematron_result_report(self):
    # ``validation_report`` holds a result tree when ``store_report`` is
    # on and stays None when it is off.
    rules = self.parse('''\
<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
  <sch:pattern id="number_of_entries">
    <sch:title>mandatory number_of_entries tests</sch:title>
    <sch:rule context="number_of_entries">
      <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
    </sch:rule>
  </sch:pattern>
</sch:schema>
''')
    # Declares 0 entries and has none.
    good_doc = self.parse('''\
<message>
  <number_of_entries>0</number_of_entries>
  <entries>
  </entries>
</message>
''')
    # Declares 3 entries but only carries 2.
    bad_doc = self.parse('''\
<message>
  <number_of_entries>3</number_of_entries>
  <entries>
    <entry>Entry 1</entry>
    <entry>Entry 2</entry>
  </entries>
</message>
''')

    # Reporting switched on.
    reporting = isoschematron.Schematron(rules, store_report=True)
    accepted = reporting(good_doc)
    self.assertTrue(accepted, reporting.error_log)
    rejected = not reporting(bad_doc)
    self.assertTrue(rejected)
    self.assertTrue(
        isinstance(reporting.validation_report, etree._ElementTree),
        'expected a validation report result tree, got: %s' %
        (reporting.validation_report))

    # Reporting switched off.
    silent = isoschematron.Schematron(rules, store_report=False)
    accepted = silent(good_doc)
    self.assertTrue(accepted, silent.error_log)
    rejected = not silent(bad_doc)
    self.assertTrue(rejected)
    self.assertTrue(
        silent.validation_report is None,
        'validation reporting switched off, still: %s' %
        (silent.validation_report))
def test_invalid_schematron(self):
    # The percentages (60 + 30) are expected to be rejected by the sample
    # Schematron: ``_validate_sch`` must report failure and give errors.
    import io  # local import keeps this block self-contained

    # The payload is bytes, so it needs a bytes buffer; ``io.StringIO``
    # only accepts text and raises TypeError for ``b'...'`` on Python 3.
    fp = etree.parse(
        io.BytesIO(
            b'<Total><Percent>60</Percent><Percent>30</Percent></Total>'))
    xml = stylechecker.XML(fp)
    xml.schematron = isoschematron.Schematron(etree.parse(sample_sch))

    result, errors = xml._validate_sch()

    self.assertFalse(result)
    self.assertTrue(errors)
def __init__(self):
    """Load the SAND message XML Schema and Schematron validators.

    The files named by ``self.sand_message_xsd`` and
    ``self.sand_message_sch`` are looked up next to this module. The compiled
    validators are stored on ``self.sand_xml_schema`` and
    ``self.sand_schematron``.
    """
    # Binary mode hands the raw bytes to the XML parser so it applies the
    # encoding declared by each document; text mode would first decode them
    # with the platform's default encoding.
    xsd_path = join(dirname(__file__), self.sand_message_xsd)
    with open(xsd_path, "rb") as f:
        sand_schema_doc = etree.parse(f)
    self.sand_xml_schema = etree.XMLSchema(sand_schema_doc)

    sch_path = join(dirname(__file__), self.sand_message_sch)
    with open(sch_path, "rb") as f:
        sand_schematron_doc = etree.parse(f)
    self.sand_schematron = isoschematron.Schematron(sand_schematron_doc)
def test_schematron_empty_pattern(self):
    # A schema whose only pattern has no rules must still compile into a
    # truthy validator object.
    rules = self.parse('''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
  <pattern id="OpenModel">
    <title>Open model</title>
  </pattern>
</schema>
''')
    validator = isoschematron.Schematron(rules)
    self.assertTrue(validator)
def test_schematron_relaxng_embedded(self):
    # Schematron rules embedded in a RelaxNG grammar are picked up by
    # ``isoschematron.Schematron`` while the same tree still compiles as a
    # plain RelaxNG schema.
    combined = self.parse('''\
<grammar xmlns="http://relaxng.org/ns/structure/1.0"
    xmlns:sch="http://purl.oclc.org/dsdl/schematron"
    datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
  <start>
    <ref name="message"/>
  </start>
  <define name="message">
    <element name="message">
      <element name="number_of_entries">
        <!-- RelaxNG can be mixed freely with stuff from other namespaces -->
        <sch:pattern id="number_of_entries">
          <sch:title>mandatory number_of_entries tests</sch:title>
          <sch:rule context="number_of_entries">
            <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
          </sch:rule>
        </sch:pattern>
        <data type="positiveInteger"/>
      </element>
      <element name="entries">
        <zeroOrMore>
          <element name="entry"><data type="string"/></element>
        </zeroOrMore>
      </element>
    </element>
  </define>
</grammar>
''')
    # Declares 2 entries and has 2.
    good_doc = self.parse('''\
<message>
  <number_of_entries>2</number_of_entries>
  <entries>
    <entry>Entry 1</entry>
    <entry>Entry 2</entry>
  </entries>
</message>
''')
    # Declares 1 entry but carries 2.
    bad_doc = self.parse('''\
<message>
  <number_of_entries>1</number_of_entries>
  <entries>
    <entry>Entry 1</entry>
    <entry>Entry 2</entry>
  </entries>
</message>
''')

    rng_validator = etree.RelaxNG(combined)
    sch_validator = isoschematron.Schematron(combined)

    # fwiw, this must also be RelaxNG-valid
    rng_ok = rng_validator(good_doc)
    self.assertTrue(rng_ok, rng_validator.error_log)
    self.assertTrue(sch_validator(good_doc))

    # still schema-valid
    rng_ok = rng_validator(bad_doc)
    self.assertTrue(rng_ok, rng_validator.error_log)
    sch_rejected = not sch_validator(bad_doc)
    self.assertTrue(sch_rejected)
def test_schematron_fail_on_report(self):
    # A fired <report> only counts as a validation failure when the
    # validator is built with the ASSERTS_AND_REPORTS error finder.
    rules = self.parse('''\
<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
  <pattern id="OpenModel">
    <title>Simple Report</title>
    <rule context="AAA">
      <report test="DDD"> DDD element must not be present</report>
    </rule>
  </pattern>
</schema>
''')
    without_ddd = self.parse('<AAA><BBB/><CCC/></AAA>')
    with_ddd = self.parse('<AAA><BBB/><CCC/><DDD/></AAA>')

    strict = isoschematron.Schematron(
        rules, error_finder=isoschematron.Schematron.ASSERTS_AND_REPORTS)
    lenient = isoschematron.Schematron(rules)

    # Reports are treated as errors.
    self.assertTrue(strict.validate(without_ddd))
    strict_rejected = not strict.validate(with_ddd)
    self.assertTrue(strict_rejected)

    # Reports are ignored.
    self.assertTrue(lenient.validate(without_ddd))
    self.assertTrue(lenient.validate(with_ddd))
def main(teipath, rngfile, schematronfile):
    """Validate TEI files against a RelaxNG schema and a Schematron schema.

    Prints a verdict (and the error log on failure) per file and schema, then
    the number of files that failed RelaxNG and Schematron validation.

    Arguments:
        teipath (str): glob pattern of the TEI files, e.g.
            /home/ulrike/Dokumente/Git/textbox/es/novela-espanola/tei/*.xml
        rngfile (str): path to the schema file, e.g.
            /home/ulrike/Schreibtisch/basisformat.rng
        schematronfile (str): path to the schematron file, e.g.
            /home/ulrike/Schreibtisch/keywords.sch

    Example:
        from toolbox.check_quality import validate_tei
        validate_tei.main("/home/ulrike/Git/novelashispanoamericanas/master/nh0001.xml",
                          "/home/ulrike/Git/novelashispanoamericanas/cligs_importance.rnc",
                          "/home/ulrike/Git/novelashispanoamericanas/keywords.sch")
    """
    parser = etree.XMLParser(recover=True)
    parser.resolvers.add(FileResolver())

    # The schemas do not depend on the file being checked, so compile them
    # once up front instead of once per TEI file.
    rngvalidator = etree.RelaxNG(etree.parse(rngfile))
    schematron = isoschematron.Schematron(
        etree.parse(schematronfile, parser))

    problematic_files_rng = 0
    problematic_files_sch = 0

    for teifile in glob.glob(teipath):
        idno = os.path.basename(teifile)
        teiparsed = etree.parse(teifile, parser)

        # RelaxNG validation
        validation_rng = rngvalidator.validate(teiparsed)
        log_rng = rngvalidator.error_log

        # Schematron validation
        validation_sch = schematron.validate(teiparsed)
        log_sch = schematron.error_log

        if validation_rng:
            print(idno, "valid with RNG!")
        else:
            print(idno, "sorry, not valid with RNG!")
            print(log_rng)
            problematic_files_rng += 1

        if validation_sch:
            print(idno, "valid with schematron!")
        else:
            problematic_files_sch += 1
            print(idno, "sorry, not valid with schematron!")
            print(log_sch)

    print(problematic_files_rng, problematic_files_sch)
def test_validation_schematron(self):
    # With the sample Schematron as the only style validator, the 70/30
    # document is expected to pass without any reported errors.
    sch_validator = domain.SchematronValidator(
        isoschematron.Schematron(etree.parse(sample_sch)))
    document = etree.parse(
        io.BytesIO(
            b'<Total><Percent>70</Percent><Percent>30</Percent></Total>'))

    validator = domain.XMLValidator(document,
                                    style_validators=[sch_validator])
    is_valid, errors = validator.validate_style()

    self.assertTrue(is_valid)
    self.assertEqual(len(errors), 0)
def test_simple_validation(file_path, expected):
    """Check that the document at ``file_path`` validates against the schematron.

    The schematron is loaded from ``schematron_path``.
    """
    # NOTE(review): ``expected`` is never used, so every case must validate
    # successfully whatever value it is given - confirm whether the
    # assertion was meant to compare against it.
    #
    # Binary mode hands the raw bytes to the XML parser so it applies the
    # encoding declared by each document; text mode would first decode them
    # with the platform's default encoding.
    with open(schematron_path, "rb") as f:
        schematron_doc = etree.parse(f)
    schematron = isoschematron.Schematron(schematron_doc)

    with open(file_path, "rb") as gml_f:
        gml_obj = etree.parse(gml_f)

    assert schematron.validate(gml_obj)
def test_invalid_schematron(self):
    # With the sample Schematron as the only style validator, the 60/30
    # document is expected to fail and to come back with errors.
    sch_validator = domain.SchematronValidator(
        isoschematron.Schematron(etree.parse(sample_sch)))
    document = etree.parse(
        io.BytesIO(
            b'<Total><Percent>60</Percent><Percent>30</Percent></Total>'))

    validator = domain.XMLValidator(document,
                                    style_validators=[sch_validator])
    result, errors = validator.validate_style()

    self.assertFalse(result)
    self.assertTrue(errors)
def TestPhase(phase_name, cache):
    """Factory of parsed Schematron phases.

    Returns the validator stored in ``cache`` under ``phase_name``; when
    there is none yet, one is compiled from ``SCH`` for that phase and
    stored first.

    :param phase_name: the phase name
    :param cache: mapping type
    """
    if phase_name in cache:
        return cache[phase_name]

    cache[phase_name] = isoschematron.Schematron(SCH, phase=phase_name)
    return cache[phase_name]
def _init_schematron(self, schematron): '''Returns an instance of lxml.isoschematron.Schematron''' if schematron is None: self.schematron = None return elif not (isinstance(schematron, etree._Element) or isinstance(schematron, etree._ElementTree)): tree = etree.parse(schematron) else: tree = schematron self.schematron = isoschematron.Schematron(tree, store_report=True, store_xslt=True, store_schematron=True)
def test_invalid_schematron(self):
    # After swapping in the sample Schematron, the 60/30 document is
    # expected to fail ``_validate_sch`` and to come back with errors.
    document = etree.parse(
        io.BytesIO(
            b'<Total><Percent>60</Percent><Percent>30</Percent></Total>'))
    validator = domain.XMLValidator.parse(document, no_doctype=True,
                                          sps_version='sps-1.1')
    validator.schematron = isoschematron.Schematron(
        etree.parse(sample_sch))

    result, errors = validator._validate_sch()

    self.assertFalse(result)
    self.assertTrue(errors)
def report_s3_schematron(**kwargs):
    """Run a Schematron report over records stored in S3 and upload it as CSV.

    The Schematron is fetched from the ``tulibraries/aggregator_mdx`` GitHub
    repository. Every child element of every S3 file under the source prefix
    is validated and written as one CSV row; the CSV is then stored in the
    same bucket as ``<destination_prefix>-report.csv``.

    Keyword Args:
        source_prefix: S3 prefix of the files to report on.
        destination_prefix: prefix used to name the report object.
        bucket: S3 bucket name.
        schematron_filename: path of the Schematron file in the repository.
        access_id: S3 access id.
        access_secret: S3 access secret.

    Returns:
        dict: ``{"transformed": <number of records reported on>}``.
    """
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")
    bucket = kwargs.get("bucket")
    source_prefix = kwargs.get("source_prefix")
    dest_prefix = kwargs.get("destination_prefix")
    schematron_file = kwargs.get("schematron_filename")

    # create reporting csv (kept in memory until upload)
    csv_in_mem = io.StringIO()
    report_csv = csv.DictWriter(
        csv_in_mem, fieldnames=["id", "report", "record", "source_file"])
    report_csv.writeheader()

    # get schematron doc & build the validator; store_report keeps the
    # report of the most recent validate() call
    schematron_doc = process.get_github_content(
        "tulibraries/aggregator_mdx", schematron_file)
    schematron = isoschematron.Schematron(
        etree.fromstring(schematron_doc), store_report=True)

    # Iterate through S3 Files, Validate, & Save Report to CSV
    total_transform_count = 0
    s3_keys = process.list_s3_content(bucket, access_id, access_secret,
                                      source_prefix)
    for s3_key in s3_keys:
        logging.info("Validating & Reporting On File: %s", s3_key)
        s3_content = process.get_s3_content(bucket, s3_key, access_id,
                                            access_secret)
        source_url = f"https://s3.console.aws.amazon.com/s3/object/{bucket}/{s3_key}"

        for record in etree.fromstring(s3_content).iterchildren():
            total_transform_count += 1
            record_id = record.get("airflow-record-id")
            logging.info("Ran report on record: %s", record_id)

            schematron.validate(record)
            report_text = schematron_failed_validation_text(
                schematron.validation_report)
            report_csv.writerow({
                "id": record_id,
                "report": report_text,
                "record": identifier_or_full_record(record),
                "source_file": source_url
            })

    report_filename = dest_prefix + "-report.csv"
    logging.info("Records report: https://%s.s3.amazonaws.com/%s", bucket,
                 report_filename)
    logging.info("Total Transform Count: %s", total_transform_count)
    process.generate_s3_object(csv_in_mem.getvalue(), bucket,
                               report_filename, access_id, access_secret)

    return {"transformed": total_transform_count}
def validate_xml(ctx, verbose=False):
    """Validate the XML files listed in each ``ctx.xml_validation`` section.

    A section provides ``files`` (a list of glob patterns) and either an
    ``xsd`` or a ``schematron`` schema path. Sections with no matching files
    or no recognized schema are reported and skipped. For every invalid file
    the schema's error log is printed.

    Args:
        ctx: object exposing the ``xml_validation`` mapping of sections.
        verbose: also print section headers and the files that are valid.

    Raises:
        SystemExit: with code ``-1`` when at least one file is invalid.
    """
    from lxml import etree

    errors = False
    for section in ctx.xml_validation.keys():
        if verbose:
            print('\nXML validation (%s)' % section)
        section_cfg = ctx.xml_validation[section]

        # gather files based on paths in the config
        files = []
        for path in section_cfg['files']:
            files.extend(glob.glob(path))
        if not files:
            print('No files to process for %s' % section)
            continue

        # load configured schema
        schema = None
        if 'xsd' in section_cfg:
            xmlschema_doc = etree.parse(section_cfg['xsd'])
            schema = etree.XMLSchema(xmlschema_doc)
        # TODO: support other schema types here (e.g. rnc via rnc2rng)
        elif 'schematron' in section_cfg:
            from lxml import isoschematron
            sct_doc = etree.parse(section_cfg['schematron'])
            schema = isoschematron.Schematron(sct_doc)

        if schema is None:
            print('No recognized schema format found for %s' % section)
            continue

        for xml_path in files:
            xmldoc = etree.parse(xml_path)
            if not schema.validate(xmldoc):
                print('Validation failed: %s' % xml_path)
                errors = True
                # errors are shown regardless of verbosity
                print(schema.error_log)
            elif verbose:
                print('%s is valid' % xml_path)

    # if any file was invalid, exit with an error code to indicate
    # the build failed. The ``exit`` builtin is injected by the ``site``
    # module and is not guaranteed to exist, so raise SystemExit directly.
    if errors:
        raise SystemExit(-1)