def validate_xml_doc(xml_schema: XmlSchema, xml_doc: XmlElement) -> None:
    """
    Check that ``xml_doc`` conforms to the XML schema ``xml_schema``.

    Returns nothing when the document is valid.

    :raises XmlSchemaDocValidationError:
        if ``xml_doc`` does not validate against ``xml_schema``; the
        original ``DocumentInvalid`` is chained as the cause.

    """
    # lxml offers several ways to validate a document against a schema.
    # What each call does when validation passes / fails:
    #   - xml_schema.assert_(xml_doc):     nothing / raises 'AssertionError'
    #   - xml_schema.assertValid(xml_doc): nothing / raises 'DocumentInvalid'
    #   - xml_schema.validate(xml_doc):    returns True / returns False
    try:
        xml_schema.assertValid(xml_doc)
    except lxml.etree.DocumentInvalid as exc:
        # note: 'exc.error_log' and 'xml_schema.error_log' are the same object
        #   (type 'lxml.etree._ListErrorLog').

        # TODO: advanced error details parsing, without leaking too much
        #   information (e.g. via 'exc.error_log.last_error' and its 'line').

        # TODO: does 'xml_schema.error_log' persist? is it necessary to clear
        #   it afterwards? (`xml_schema._clear_error_log()`)

        # The string form of the exception is the simplest and safest source
        # for the error message. Example:
        #   "Element 'DTE': No matching global declaration available for the validation root., line 2"  # noqa: E501
        raise XmlSchemaDocValidationError(str(exc)) from exc
def validate(self):
    """
    Parse the data file and, if a schema file is configured, check the data
    against that schema.

    On success the parsed document is stored in ``self.data``. Every failure
    is reported with a ``logging.warning`` summary, followed by the individual
    error-log entries at ``logging.info`` level.

    @return: A flag indicating if the data validates against the schema
        (or simply parses, when no schema file is set).
    """
    # Start from a clean slate so only errors from this run are reported.
    clear_error_log()

    compiled_schema = None
    if self.schema_file is not None:
        # Step 1: the schema file must be well-formed XML ...
        try:
            schema_document = parse(self.schema_file)
        except XMLSyntaxError as err:
            logging.warning('The schema XML file could not be parsed.')
            for entry in err.error_log:
                logging.info(entry)
            return False

        # Step 2: ... and must also be a valid XML Schema document.
        try:
            compiled_schema = XMLSchema(schema_document)
        except XMLSchemaParseError as err:
            logging.warning(
                'The schema XML file was parsed, but it does not appear to be a valid XML Schema document.'
            )
            for entry in err.error_log:
                logging.info(entry)
            return False

    # Step 3: the data file must be well-formed XML.
    try:
        document = parse(self.datafile)
    except XMLSyntaxError as err:
        logging.warning('The data XML file could not be parsed.')
        for entry in err.error_log:
            logging.info(entry)
        return False

    # Step 4: validate against the schema, when there is one.
    if compiled_schema is not None and not compiled_schema.validate(document):
        logging.warning(
            'The data does not conform to the provided schema.')
        for entry in compiled_schema.error_log:
            logging.info(entry)
        return False

    self.data = document
    return True
def validate(self, message: Element) -> None:
    """Validate the constructed request against its XSD.

    The schema file named by ``self.xmlschema_definition`` is looked up in
    the directory of the ``schema`` module and compiled on every call.

    :param message: the element to validate.
    :raises: whatever ``XMLSchema.assertValid`` raises when ``message`` does
        not conform to the schema.
    """
    schema_dir = os.path.dirname(schema.__file__)
    xsd_path = os.path.join(schema_dir, self.xmlschema_definition)
    with open(xsd_path) as xsd_file:
        compiled = XMLSchema(parse(xsd_file))
    compiled.assertValid(message)
def testOne(self):
    """Normalise a DIDL/MODS record and check the resulting MODS.

    Feeds ``data/didl_mods.xml`` through ``self.norm_mods``, checks the
    calls recorded by the observer, validates the produced ``mods:mods``
    element against ``mods-3-6.xsd`` and compares it with the expected
    output in ``data/mods_converted.xml``.
    """
    self.observer.methods['add'] = lambda *args, **kwargs: (x for x in [])

    # parse() reads the whole file, so the handle can be closed right away.
    with open("data/didl_mods.xml") as inputFile:
        inputNode = parse(inputFile)
    list(compose(self.norm_mods.all_unknown(
        'add', 'id', 'metadata', 'anotherone',
        lxmlNode=inputNode,
        identifier='oai:very:secret:09987')))

    self.assertEqual(3, len(self.observer.calledMethods))
    result = self.observer.calledMethods[2].kwargs.get('lxmlNode')
    self.assertEqual(2, len(self.observer.calledMethods[0].args))
    arguments = self.observer.calledMethods[2].args
    self.assertEqual("id", arguments[0])
    self.assertEqual("metadata", arguments[1])

    # Get MODS from record; there should be exactly one.
    mods = result.xpath("//mods:mods", namespaces=namespacesMap)
    self.assertEqual(1, len(mods))

    # Validate against schema.
    with open(join(self.schemasPath, 'mods-3-6.xsd')) as schemaFile:
        modsSchema = XMLSchema(lxmlParse(schemaFile))
    modsSchema.validate(mods[0])
    if modsSchema.error_log.last_error:
        self.fail(modsSchema.error_log.last_error)

    # Check against the expected result.
    with open("data/mods_converted.xml") as expectedFile:
        expectedResult = expectedFile.read()
    self.assertEqualsWithDiff(
        expectedResult,
        tostring(mods[0], pretty_print=True, encoding='utf-8'))
def validate_data(self, data, structural_schema, metamorphic_schema=None, validate_schemas=False):
    """
    Check that a data structure matches an XML schema.

    :param str data: data to check
    :param str structural_schema: unique_key of the structural schema
    :param str metamorphic_schema: unique_key of an optional transformation
    :param bool validate_schemas: when exactly ``True``, every provided
        schema is itself validated before use; otherwise the schemas are
        used as is. Skipping the check is faster - use at your own risk.

    :return: True if data is valid, False otherwise
    :rtype: bool
    """
    has_metamorphic = metamorphic_schema is not None

    if validate_schemas is True:
        self.assert_structural_schema(structural_schema)
        if has_metamorphic:
            self.assert_metamorphic_schema(metamorphic_schema)

    if has_metamorphic:
        # NOTE(review): the return value of transform() is discarded, so the
        # untransformed `data` is what gets validated below - confirm this
        # is intended.
        self.transform(data, metamorphic_schema)

    schema_document = self.get_cached_schema(structural_schema)
    compiled_schema = XMLSchema(schema_document)
    document = parse(StringIO(data))
    return compiled_schema.validate(document)
def assertValid(xmlString, schemaPath):
    """Assert that ``xmlString`` validates against the XSD at ``schemaPath``.

    The schema file is closed as soon as it has been compiled.

    Raises AssertionError, with a message built by ``formatException``,
    when the schema's error log is non-empty after validation.
    """
    with open(schemaPath) as schemaFile:
        schema = XMLSchema(parse(schemaFile))
    toValidate = parse(StringIO(xmlString))
    schema.validate(toValidate)
    if schema.error_log:
        raise AssertionError(formatException("assertValid", schema, toValidate))
def parse(self, filename):
    """Parse ``filename`` with the best available ElementTree implementation.

    With lxml available the document is also validated against the XSD
    given by ``XSDContents``; the parsed tree is returned when it is valid
    and ``None`` otherwise (the schema error log is written to stderr).
    Without lxml the standard-library ElementTree parser is used and no
    validation takes place.

    Raises ImportError if neither implementation can be imported.
    """
    self.parser = None
    # find parser
    try:
        from lxml.etree import parse, XMLSchema
        self.logger.info('using lxml.etree parser')

        # parse XML and validate it
        tree = parse(filename)

        # get XSD
        schemaDoc = parse(XSDContents)
        schema = XMLSchema(schemaDoc)
        xml_valid = schema.validate(tree)

        if xml_valid:
            self.logger.info('XML validated')
        else:
            self.logger.error('XML NOT validated: {}'.format(filename))
            # Plain write() instead of the Python 2-only `print >> stderr`,
            # which fails with a TypeError at runtime on Python 3.
            stderr.write('{}\n'.format(schema.error_log))

        return tree if xml_valid else None

    except ImportError:
        try:
            from xml.etree.ElementTree import parse
            self.logger.info('using xml.etree.ElementTree parser')
            return parse(filename)
        except ImportError:
            self.logger.critical(
                "Failed to import ElementTree from any known place")
            raise
def __init__(self, schemaPath):
    """Compile the XML Schema stored at ``schemaPath`` into ``self._schema``.

    If the file is not a valid schema, the last entry of the parser's error
    log is printed and the XMLSchemaParseError is re-raised.
    """
    Observable.__init__(self)
    try:
        with open(schemaPath) as schemaFile:
            schemaDocument = parse(schemaFile)
            self._schema = XMLSchema(schemaDocument)
    except XMLSchemaParseError as parseError:
        print(parseError.error_log.last_error)
        raise
def valid_xml(self, xml_string):
    """Tell whether `xml_string` conforms to the SEPA XML schema.

    The schema at ``self.xml_schema_path`` is parsed and compiled on every
    call. Returns the result of ``XMLSchema.validate``.

    Raises NotImplementedError when ``self.xml_schema_path`` is None.
    """
    schema_path = self.xml_schema_path
    if schema_path is None:
        raise NotImplementedError(
            "XSD validation failed: path to schema is not set.")

    document = parse(StringIO(xml_string))
    schema = XMLSchema(parse(schema_path))
    return schema.validate(document)
def validate_xml_file_against_xsd(xml_file, xsd_file):
    """Validate ``xml_file`` against the schema in ``xsd_file``.

    Logs "XSD Pass!" on success. Any exception (parse error, invalid schema,
    failed validation) is logged as "XSD Fail!" and handed to
    ``BuiltIn().fail``.
    """
    try:
        document = parse(xml_file)
        schema = XMLSchema(parse(xsd_file))
        schema.assert_(document)
        logger.info("XSD Pass!")
    except Exception as error:
        logger.info("XSD Fail!")
        BuiltIn().fail(error)
def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
    """Load a PMML model from an XML string, fileName, URI, or file-like
    object.

    A string starting with the Gzip magic bytes is decompressed on the fly;
    a string containing "<" is treated as XML text; any other value is
    handed to lxml's C{parse} unchanged.

    @type data: string or file-like object
    @param data: The data to load.
    @type validate: bool
    @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
    @type postValidate: bool
    @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
    @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}; they override the defaults set here.
    @rtype: PmmlBinding
    @return: In-memory PMML object.
    """
    if isinstance(data, basestring):
        looksGzipped = len(data) >= 2 and data[0:2] == "\x1f\x8b"
        if looksGzipped:
            data = gzip.GzipFile(fileobj=StringIO(data))
        elif "<" in data:
            data = StringIO(data)

    # The compiled schema is built lazily and reused across calls.
    schema = None
    if validate:
        if self.preparedSchema is None:
            self.preparedSchema = XMLSchema(self.schema)
        schema = self.preparedSchema

    # Caller-supplied options take precedence over these defaults.
    mergedOptions = {"schema": schema, "huge_tree": True}
    mergedOptions.update(parserOptions)
    parser = XMLParser(**mergedOptions)

    # Map every top-level xs:element of the schema to PmmlBinding, then let
    # the registered tagToClass entries override individual tags.
    lookup = ElementNamespaceClassLookup()
    namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
    topLevelElements = self.schema.xpath(
        "xs:element", namespaces={"xs": defs.XSD_NAMESPACE})
    for xsdElement in topLevelElements:
        namespace[xsdElement.attrib["name"]] = PmmlBinding
    namespace.update(self.tagToClass)
    parser.set_element_class_lookup(lookup)

    # ElementNamespaceClassLookup doesn't work with iterparse, so parse
    # everything at once and iterwalk afterwards.
    pmmlBinding = parse(data, parser).getroot()
    pmmlBinding.modelLoader = self

    if postValidate:
        pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
        for _, elem in iterwalk(pmmlBinding, events=("end",), tag=pmmlTags):
            if isinstance(elem, PmmlBinding):
                elem.postValidate()

    return pmmlBinding
class Validate(Observable):
    """Observable that validates every lxml ElementTree found in the
    arguments of a message against an XML Schema before passing the message
    on to its observers."""

    def __init__(self, schemaPath):
        """Compile the schema at ``schemaPath``; on a schema parse error the
        last error-log entry is printed and the error re-raised."""
        Observable.__init__(self)
        try:
            with open(schemaPath) as schemaFile:
                schemaDocument = parse(schemaFile)
                self._schema = XMLSchema(schemaDocument)
        except XMLSchemaParseError as parseError:
            print(parseError.error_log.last_error)
            raise

    def all_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then delegate to ``self.all``."""
        self._detectAndValidate(*args, **kwargs)
        yield self.all.unknown(message, *args, **kwargs)

    def do_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then delegate to ``self.do``."""
        self._detectAndValidate(*args, **kwargs)
        return self.do.unknown(message, *args, **kwargs)

    def any_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then delegate to ``self.any``.

        Raises DeclineMessage when none of the observers respond.
        """
        self._detectAndValidate(*args, **kwargs)
        try:
            response = yield self.any.unknown(message, *args, **kwargs)
        except NoneOfTheObserversRespond:
            raise DeclineMessage
        return response

    def call_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then delegate to ``self.call``.

        Raises DeclineMessage when none of the observers respond.
        """
        self._detectAndValidate(*args, **kwargs)
        try:
            return self.call.unknown(message, *args, **kwargs)
        except NoneOfTheObserversRespond:
            raise DeclineMessage

    def _detectAndValidate(self, *args, **kwargs):
        """Validate every positional or keyword argument whose type is
        exactly ``_ElementTree``."""
        for candidate in list(args) + list(kwargs.values()):
            # Exact type match: subclasses of _ElementTree are not validated.
            if type(candidate) == _ElementTree:
                self.validate(candidate)

    def validate(self, arg):
        """Validate ``arg``; on errors log and raise a ValidateException."""
        self._schema.validate(arg)
        if not self._schema.error_log:
            return
        exception = ValidateException(formatException(self._schema, arg))
        self.do.logException(exception)
        raise exception

    def assertValid(self, xmlOrString):
        """Raise AssertionError unless ``xmlOrString`` validates.

        A ``str`` is encoded as UTF-8 and parsed first; anything else is
        validated as is.
        """
        if isinstance(xmlOrString, str):
            toValidate = XML(xmlOrString.encode('utf-8'))
        else:
            toValidate = xmlOrString
        self._schema.validate(toValidate)
        if self._schema.error_log:
            raise AssertionError(formatException(self._schema, toValidate))
def setUp(self):
    """Create the two-page image document, the XML schema and the tracker
    and detector fixtures shared by the tests."""
    self.document = ImageFileDocument((
        FIRST_PAGE_IMAGE_PATHNAME,
        SECOND_PAGE_IMAGE_PATHNAME,
    ))

    # Pull the first two pages off the document, in order.
    pages = iter(self.document)
    self.first_page = next(pages)
    self.second_page = next(pages)

    self.xml_schema = XMLSchema(file=XML_SCHEMA_PATHNAME)
    self.quadrangle_tracker = RTreeDequeConvexQuadrangleTracker()
    self.screen_detector = ScreenEventDetectorScreenDetector()
    self.page_detector = ScreenEventDetectorPageDetector()
def testOne(self):
    """Normalise a DIDL/MODS record and check the resulting DIDL.

    Feeds ``data/didl_mods.xml`` through ``self.norm_didl``, checks the
    calls and log message recorded by the observer, validates the produced
    ``didl:DIDL`` element against ``didl.xsd`` and compares it with the
    expected output in ``data/didl_converted.xml``.
    """
    self.observer.methods['add'] = lambda *args, **kwargs: (x for x in [])

    # parse() reads the whole file, so the handle can be closed right away.
    with open("data/didl_mods.xml") as inputFile:
        inputNode = parse(inputFile)
    list(compose(self.norm_didl.all_unknown(
        'add', 'id', 'metadata', 'anotherone',
        lxmlNode=inputNode,
        identifier='oai:very:secret:09987')))

    self.assertEqual(4, len(self.observer.calledMethods))
    result = self.observer.calledMethods[3].kwargs.get('lxmlNode')
    self.assertEqual(2, len(self.observer.calledMethods[0].args))

    arguments = self.observer.calledMethods[1].args
    self.assertEqual("oai:very:secret:09987", arguments[0])
    # Test logMessage:
    self.assertEqual(
        "Found objectFile in depricated dip:ObjectType. This should have been: rdf:type/@rdf:resource",
        arguments[1])

    # Get DIDL from record; there should be exactly one.
    didl = result.xpath("//didl:DIDL", namespaces=namespacesMap)
    self.assertEqual(1, len(didl))

    # Validate against schema.
    with open(join(self.schemasPath, 'didl.xsd')) as schemaFile:
        didlSchema = XMLSchema(lxmlParse(schemaFile))
    didlSchema.validate(didl[0])
    if didlSchema.error_log.last_error:
        self.fail(didlSchema.error_log.last_error)

    # Check against the expected result.
    with open("data/didl_converted.xml") as expectedFile:
        expectedResult = expectedFile.read()
    self.assertEqualsWithDiff(
        expectedResult,
        tostring(didl[0], pretty_print=True, encoding='utf-8'))
def test_centre(self):
    """GET /restxml/1/ must answer 200 with an application/xml body that
    validates against the packaged CenterProfile.xsd."""
    response = Client().get('/restxml/1/', secure=True)
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response['Content-Type'], 'application/xml')

    xsd_source = resource_string(__name__, join('data', 'CenterProfile.xsd'))
    schema = XMLSchema(fromstring(xsd_source))

    try:
        schema.assertValid(fromstring(response.content))
    except (XMLSyntaxError, DocumentInvalid):
        # Show the lxml error before failing the test.
        print_exc()
        self.fail()
def validate_schema(xml_loc: str, schema: etree.XMLSchema):
    """
    Check the XML file at ``xml_loc`` against ``schema``.

    Returns:
        True if the file matches the schema, False otherwise (the schema's
        error log is then written to the logger as a warning).

    Raises:
        DoxygenMalformedXML: If the XML file cannot be parsed
    """
    assert xml_loc is not None
    assert schema is not None
    assert os.path.exists(xml_loc)

    # Read the whole file up front and parse from memory.
    with open(xml_loc, "rb") as handle:
        buffered = BytesIO(handle.read())

    try:
        tree = etree.parse(buffered)
    except etree.XMLSyntaxError as err:
        logger.error(err)
        logger.warning(f"Failed to parse: {xml_loc}")
        logger.warning("Skipping this XML file")
        raise DoxygenMalformedXML(xml_loc) from err

    logger.debug(f"Validating {xml_loc}")
    if not schema or schema.validate(tree):  # type: ignore
        return True

    logger.warning(f"Validating schema failed for: {xml_loc}")
    logger.warning(schema.error_log)  # type: ignore
    logger.warning("Skipping this XML file")
    return False
def validate_schema(self, schema):
    """
    Check that the syntax/grammar of the provided schema is correct.

    :param str schema: xml (schema itself) or unique_key

    :return: [True, <schema_type>] if schema is correct and [False, None]
      otherwise
    :rtype: list

    .. note:: <schema_type> can either be 'XMLSchema' or 'XSLT'
    """
    # A known unique_key is resolved from the cache; anything else is
    # treated as the schema text itself.
    if schema in self.get_existing_unique_keys():
        document = self.get_cached_schema(schema)
    else:
        try:
            document = parse(StringIO(schema))
        except Exception:
            return [False, None]

    # Try each supported schema kind in turn; the first one that compiles
    # determines the reported type.
    try:
        XMLSchema(document)
    except Exception:
        pass
    else:
        return [True, 'XMLSchema']

    try:
        XSLT(document)
    except Exception:
        pass
    else:
        return [True, 'XSLT']

    return [False, None]
def __init__(self, schemaPath):
    """Compile the XML Schema stored at ``schemaPath`` into ``self._schema``.

    The schema file is closed once it has been read. If the file is not a
    valid schema, the last entry of the parser's error log is printed and
    the XMLSchemaParseError is re-raised.
    """
    Observable.__init__(self)
    try:
        with open(schemaPath) as fp:
            self._schema = XMLSchema(parse(fp))
    except XMLSchemaParseError as e:
        # `except ... as` and print() with a single argument behave the same
        # on Python 2.6+ and Python 3.
        print(e.error_log.last_error)
        raise
def get_schemas():
    """Load and compile the two report schemas from the working directory.

    Returns a dict with the compiled 'detailed_report' and 'build_info'
    schemas. On IOError, XMLSyntaxError or AttributeError a message is
    printed and the process exits with status 1.
    """
    try:
        detailed_report_doc = lxml.parse('detailedreport.xsd')
        build_info_doc = lxml.parse('buildinfo.xsd')
        return {
            'detailed_report': XMLSchema(detailed_report_doc),
            'build_info': XMLSchema(build_info_doc),
        }
    except IOError:
        print('Invalid Schema File')
        sys.exit(1)
    except XMLSyntaxError:
        print('XML Syntax Error ')
        sys.exit(1)
    except AttributeError as error:
        print('Local XML file error: ' + str(error))
        sys.exit(1)
def getSchema():
    """Return the module-level compiled schema, building it from
    ``schemaXml`` on first use.

    On a schema parse error the last error-log entry is printed and the
    XMLSchemaParseError re-raised.
    """
    global schema
    if schema:
        return schema

    try:
        schema = XMLSchema(schemaXml)
    except XMLSchemaParseError as parseError:
        print(parseError.error_log.last_error)
        raise
    return schema
def setUp(self):
    """Set up credentials plus a schema-validating XML parser.

    The XSD is read from ``hipay/mapi.xsd`` (relative to the working
    directory); the file is closed once the schema has been compiled.
    """
    # We need a ticket and an account for test to pass before we use
    # selenium and friends

    # Hipay credentials
    self.login = '******'
    self.password = '******'

    with open(os.path.join('hipay', 'mapi.xsd'), 'rb') as xsd_file:
        self.schema = XMLSchema(file=xsd_file, attribute_defaults=True)
    self.parser = XMLParser(schema=self.schema, attribute_defaults=True)
def main():
    """Parse the command line and load the XML configuration.

    Exits through ``sys.exit`` with a message when stray positional
    arguments are given, when the configuration file option is missing or
    does not point at an existing file, or when the configuration fails to
    parse or validate against ``resources/mapcreator.xsd``.
    """
    ######## SETUP OPTION PARSER AND READ COMMAND LINE ARGS ##########
    usage = "usage: %prog -c CONFIGURATION_FILE [-d]"
    option_parser = OptionParser(usage, version='1.0')
    option_parser.add_option(
        "-c", "--configuration-file",
        dest="configuration_file",
        action='store',
        help="the path to the XML configuration file")
    option_parser.add_option(
        "-d", "--dry-run",
        dest="dry_run",
        action='store_true',
        default=False,
        help="only execute a dry run without calls to osmosis [default=false]")
    option_parser.add_option(
        "-l", "--logging-conf",
        dest="logging_config_file",
        action='store',
        default='logging.conf',
        help="path to the logging configuration [default=logging.conf]")

    (options, args) = option_parser.parse_args()

    # No positional arguments are accepted.
    if args:
        option_parser.print_help()
        sys.exit("incorrect number of arguments")

    if not options.configuration_file:
        option_parser.print_help()
        sys.exit("configuration file is missing")

    # check whether xml configuration exists
    if not (PATH.exists(options.configuration_file)
            and PATH.isfile(options.configuration_file)):
        sys.exit("the xml configuration file at '%s' could not be found" %
                 options.configuration_file)

    ######## READING SCHEMA AND XML CONFIG ###########
    try:
        # load XML schema file
        xmlschema_doc = ET.parse('resources/mapcreator.xsd')
        xmlschema = XMLSchema(xmlschema_doc)
        parser = ET.XMLParser(schema=xmlschema, remove_comments=True)

        # try to load xml configuration, validate with schema
        tree = ET.parse(options.configuration_file, parser=parser)
    except NameError:
        # NOTE(review): presumably hit when the schema-capable parser names
        # are not defined (no lxml); the configuration is then loaded
        # without validation - confirm against the file's imports.
        tree = ET.parse(options.configuration_file)
    except (XMLSyntaxError, XMLSchemaValidateError) as e:
        # `as` form: valid on Python 2.6+ and Python 3 alike.
        sys.exit("the xml configuration is not valid: '%s'" % e)
def test_centres_kml(self):
    """GET /api/KML/ must answer 200 with a KML body that validates against
    the KML 2.2 schema."""
    response = Client().get('/api/KML/', secure=True)
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response['Content-Type'],
                     'application/vnd.google-earth.kml+xml')

    # The KML XSD has to be loaded over HTTP: it imports two other XSDs by
    # relative name, which does not work well with Python package
    # resources, as those should not be tied to an absolute location.
    schema = XMLSchema(parse("http://www.opengis.net/kml/2.2"))

    try:
        schema.assertValid(fromstring(response.content))
    except (XMLSyntaxError, DocumentInvalid):
        # Show the lxml error before failing the test.
        print_exc()
        self.fail()
def __init__(self, host_url=None, version='1.2.0', dtd_validation=False, xsd_validation=True):
    """Create a new XML parser instance containing the geoLink XSD for validation.

    Args:
        host_url (str): URL of the OEREBlex host to resolve relative URLs. The complete URL until but
            without the */api* part has to be set, starting with *http://* or *https://*.
        version (str): The version of the geoLink schema to be used. Defaults to `1.2.0`.
        dtd_validation (bool): Enable/disable validation of document type definition (DTD).
            Optional, defaults to False.
        xsd_validation (bool): Enable/disable validation against XML schema (XSD).
            Optional, defaults to True.

    """
    self._host_url = host_url
    self._version = version
    self._dtd_validation = dtd_validation
    self._xsd_validation = xsd_validation
    xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version))
    # Read the schema as bytes: lxml refuses a text string that carries an
    # XML encoding declaration, and decoding is the parser's job anyway.
    with open(xsd, 'rb') as f:
        self._schema = XMLSchema(fromstring(f.read()))
def validate(self, xml=None):
    """Validate XML against the bundled copy of the xsd schema
    https://payment.hipay.com/schema/mapi.xsd

    ``xml`` is the serialized document to check; when it is empty or
    omitted, the serialization of ``self.asTree()`` is validated instead.

    Returns True when the document parses with the validating parser and
    False when parsing raises any exception.
    """
    with open(os.path.join(DIRNAME, 'mapi.xsd'), 'rb') as xsd_file:
        schema = XMLSchema(file=xsd_file, attribute_defaults=True)
    parser = XMLParser(schema=schema, attribute_defaults=True)

    if not xml:
        xml = ET.tostring(self.asTree().getroot())

    try:
        # Parsing with a schema-aware parser raises on invalid input; the
        # resulting tree itself is not needed.
        fromstring(xml, parser)
    except Exception:
        return False
    # Report success explicitly instead of falling through with None, which
    # callers could not tell apart from the failure result in a truth test.
    return True
def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
    """Load a PMML model from an XML string, fileName, URI, or file-like
    object.

    A string starting with the Gzip magic bytes is decompressed on the fly;
    a string containing "<" is treated as XML text; any other value is
    handed to lxml's C{parse} unchanged.

    @type data: string or file-like object
    @param data: The data to load.
    @type validate: bool
    @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
    @type postValidate: bool
    @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
    @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}; they override the defaults set here.
    @rtype: PmmlBinding
    @return: In-memory PMML object.
    """
    if isinstance(data, basestring):
        looksGzipped = len(data) >= 2 and data[0:2] == "\x1f\x8b"
        if looksGzipped:
            data = gzip.GzipFile(fileobj=StringIO(data))
        elif "<" in data:
            data = StringIO(data)

    # The compiled schema is built lazily and reused across calls.
    schema = None
    if validate:
        if self.preparedSchema is None:
            self.preparedSchema = XMLSchema(self.schema)
        schema = self.preparedSchema

    # Caller-supplied options take precedence over these defaults.
    mergedOptions = {"schema": schema, "huge_tree": True}
    mergedOptions.update(parserOptions)
    parser = XMLParser(**mergedOptions)

    # Map every top-level xs:element of the schema to PmmlBinding, then let
    # the registered tagToClass entries override individual tags.
    lookup = ElementNamespaceClassLookup()
    namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
    topLevelElements = self.schema.xpath(
        "xs:element", namespaces={"xs": defs.XSD_NAMESPACE})
    for xsdElement in topLevelElements:
        namespace[xsdElement.attrib["name"]] = PmmlBinding
    namespace.update(self.tagToClass)
    parser.set_element_class_lookup(lookup)

    # ElementNamespaceClassLookup doesn't work with iterparse, so parse
    # everything at once and iterwalk afterwards.
    pmmlBinding = parse(data, parser).getroot()
    pmmlBinding.modelLoader = self

    if postValidate:
        pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
        for _, elem in iterwalk(pmmlBinding, events=("end",), tag=pmmlTags):
            if isinstance(elem, PmmlBinding):
                elem.postValidate()

    return pmmlBinding
def __init__(self, xSDPathList=(), nsMap=None):
    """Compile the listed XML Schemas for later use.

    ``xSDPathList`` is an iterable of (name, xpath, schemaPath) triples;
    each schemaPath is resolved against the ``xsd`` directory next to this
    module. The compiled schemas are stored, together with their name and
    xpath, in ``self._xmlSchemas``. ``nsMap`` holds extra namespace
    mappings merged into ``namespaces``.

    A schema that fails to compile is reported on stdout and the
    XMLSchemaParseError re-raised.
    """
    Observable.__init__(self)
    self._namespacesMap = namespaces.copyUpdate(nsMap or {})
    self._xmlSchemas = []

    xsdDirectory = join(dirname(abspath(__file__)), 'xsd')

    ## Fill the schemas list for later use:
    for strName, strXPath, schemaPath in xSDPathList:
        # Single pre-formatted argument: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('schema init: %s %s %s' % (strName, strXPath, schemaPath))
        try:
            schema = XMLSchema(parse(join(xsdDirectory, schemaPath)))
        except XMLSchemaParseError as e:
            print('XMLSchemaParseError............. %s' % (e.error_log.last_error,))
            raise
        self._xmlSchemas.append((strName, strXPath, schema))
def __init__(self, fromKwarg, toKwarg=None, name=None, nsMap=None):
    """Set up the converter and compile the MODS edu-extension schemas.

    Every (schemaPath, xPad, s_loc) entry of ``mods_edu_extentions`` is
    compiled from the ``xsd`` directory next to this module and stored,
    together with its xPad and s_loc, in ``self._edu_extension_schemas``.

    A schema that fails to compile is reported on stdout and the
    XMLSchemaParseError re-raised.
    """
    UiaConverter.__init__(self, name=name, fromKwarg=fromKwarg, toKwarg=toKwarg)
    self._nsMap = namespaces.copyUpdate(nsMap or {})
    self._bln_success = False
    self._edu_extension_schemas = []

    moduleDirectory = dirname(abspath(__file__))

    ## Fill the schemas list for later use:
    for schemaPath, xPad, s_loc in mods_edu_extentions:
        # Single pre-formatted argument: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('schema init: %s %s %s' % (schemaPath, xPad, s_loc))
        try:
            schema = XMLSchema(parse(join(moduleDirectory, 'xsd/' + schemaPath)))
        except XMLSchemaParseError as e:
            print('XMLSchemaParseError. %s' % (e.error_log.last_error,))
            raise
        self._edu_extension_schemas.append((schema, xPad, s_loc))
def xml_to_xsd_validation(file_xml, file_xsd):
    """ Check that an XML file complies with an XSD.

    Arguments:
        1. file_xml: Input xml file
        2. file_xsd: xsd file the xml is validated against
    Return:
        True when the xml validates; False when either file has a syntax
        error or the validation assertion fails (the error is printed).
    """
    try:
        print_info("Validating:{0}".format(file_xml))
        print_info("xsd_file:{0}".format(file_xsd))
        document = parse(file_xml)
        schema = XMLSchema(parse(file_xsd))
        schema.assert_(document)
    except XMLSyntaxError as err:
        print_error("PARSING ERROR:{0}".format(err))
        return False
    except AssertionError as err:
        print_error("Incorrect XML schema: {0}".format(err))
        return False
    return True
def validate(self, pmmlBinding, postValidate=True):
    """Validate a PMML subtree on demand.

    By default, PMML is validated as or just after it is loaded; this
    method is meant for checking an in-memory PMML object after it has been
    changed or created by an algorithm. The compiled schema is created on
    first use and cached in C{self.preparedSchema}.

    @type pmmlBinding: PmmlBinding
    @param pmmlBinding: The in-memory PMML object to check.
    @type postValidate: bool
    @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
    """
    schema = self.preparedSchema
    if schema is None:
        schema = self.preparedSchema = XMLSchema(self.schema)

    schema.assertValid(pmmlBinding)

    if not postValidate:
        return

    pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
    for _, elem in iterwalk(pmmlBinding, events=("end",), tag=pmmlTags):
        if isinstance(elem, PmmlBinding):
            elem.postValidate()
def parse(self, filename):
    """Parse ``filename`` with the best available ElementTree implementation.

    With lxml available the document is also validated against the XSD
    given by ``XSDContents`` and the parsed tree returned. Without lxml the
    standard-library ElementTree parser is used and no validation takes
    place.

    Raises ValueError if the document does not validate (the schema error
    log is written to stderr first), and ImportError if neither
    implementation can be imported.
    """
    self.parser = None
    # find parser
    try:
        from lxml.etree import parse, XMLSchema
        self.logger.info('using lxml.etree parser')

        # parse XML and validate it
        tree = parse(filename)

        # get XSD
        schemaDoc = parse(XSDContents)
        schema = XMLSchema(schemaDoc)

        if schema.validate(tree):
            self.logger.info('XML validated')
            return tree

        # Plain write() instead of the Python 2-only `print >> stderr`,
        # which fails with a TypeError at runtime on Python 3.
        stderr.write('{}\n'.format(schema.error_log))
        raise ValueError('XML NOT validated: {}'.format(filename))

    except ImportError:
        try:
            from xml.etree.ElementTree import parse
            self.logger.info('using xml.etree.ElementTree parser')
            return parse(filename)
        except ImportError:
            self.logger.critical("Failed to import ElementTree from any known place")
            raise
def __init__(self, filename, mode='r+b', writebackOnExit=True):
    '''
    Set up a phonebook context manager.

    Only the filename is required; the opening mode and the write-back flag
    are optional. If write back on exit is True the mode must allow
    writing, and the mode must always allow reading!

    The contacts in the file are loaded into self.cache, keyed by
    "lastname, firstname" and mapping to the number.
    '''
    self.filename = filename
    self.mode = mode
    self.writebackOnExit = writebackOnExit
    #  TODO: Validate the document.
    parser = XMLParser(schema=XMLSchema(parse('contacts.xsd')))
    entries = {}
    for contact in parse(filename, parser).findall('contact'):
        lastname = contact.find('name/lastname').text
        firstname = contact.find('name/firstname').text
        entries[lastname + ', ' + firstname] = contact.find('number').text
    self.cache = entries
def __init__(self, sld_file=None):
    """
    Create a new SLD document. If an sld file is provided, this constructor
    will fetch the SLD schema from the internet and validate the file
    against that schema.

    The downloaded schema is kept in a temporary file whose name is cached
    on the class, so later instances read it from disk instead of
    downloading it again. A file that does not validate is only reported
    with a log warning; it is still loaded.

    @type  sld_file: string
    @param sld_file: The name of a pre-existing SLD file.
    """
    super(StyledLayerDescriptor, self).__init__(None)

    if StyledLayerDescriptor._cached_schema is None:
        logging.debug('Storing new schema into cache.')

        # delete=False: the file must outlive this handle so that it can
        # serve as the on-disk cache.
        localschema = NamedTemporaryFile(delete=False)
        schema_url = 'http://schemas.opengis.net/sld/1.0.0/StyledLayerDescriptor.xsd'
        try:
            resp = urllib2.urlopen(schema_url)
            try:
                localschema.write(resp.read())
            finally:
                resp.close()
            localschema.seek(0)
            theschema = parse(localschema)
        finally:
            # Close the handle even when the download or the parse fails.
            localschema.close()

        StyledLayerDescriptor._cached_schema = localschema.name
    else:
        logging.debug('Fetching schema from cache.')

        with open(StyledLayerDescriptor._cached_schema, 'r') as localschema:
            theschema = parse(localschema)

    self._schema = XMLSchema(theschema)

    if sld_file is not None:
        self._node = parse(sld_file)

        if not self._schema.validate(self._node):
            logging.warning('SLD File "%s" does not validate against the SLD schema.', sld_file)
    else:
        self._node = Element("{%s}StyledLayerDescriptor" % SLDNode._nsmap['sld'], version="1.0.0", nsmap=SLDNode._nsmap)

    setattr(self.__class__, 'NamedLayer', SLDNode.makeproperty('sld', cls=NamedLayer,
            docstring="The named layer of the SLD."))
def validate(self, pmmlBinding, postValidate=True):
    """Validate a PMML subtree on demand.

    By default, PMML is validated as or just after it is loaded; this
    method is meant for checking an in-memory PMML object after it has been
    changed or created by an algorithm. The compiled schema is created on
    first use and cached in C{self.preparedSchema}.

    @type pmmlBinding: PmmlBinding
    @param pmmlBinding: The in-memory PMML object to check.
    @type postValidate: bool
    @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
    """
    schema = self.preparedSchema
    if schema is None:
        schema = self.preparedSchema = XMLSchema(self.schema)

    schema.assertValid(pmmlBinding)

    if not postValidate:
        return

    pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
    for _, elem in iterwalk(pmmlBinding, events=("end",), tag=pmmlTags):
        if isinstance(elem, PmmlBinding):
            elem.postValidate()
def validate_xml(xml_string, xsd_string):
    """Validate XML file against XML schema.

    Both arguments are handed to ``parse`` unchanged.
    NOTE(review): despite the ``_string`` names they presumably have to be
    file names or file-like objects rather than XML text - confirm against
    the callers.

    Returns the result of ``XMLSchema.validate``.
    """
    schema_document = parse(xsd_string)
    compiled_schema = XMLSchema(schema_document)
    document = parse(xml_string)
    return compiled_schema.validate(document)
# Remove the log file, then set up a fresh rotating log for this run.
# NOTE(review): os.remove raises if LOG_FILENAME does not exist, unless a
# guard precedes this fragment - confirm.
os.remove(LOG_FILENAME)

MAX_LOGSIZE = 10485760  # 10 * 1024 * 1024 bytes
logger = logging.getLogger('ValidationLogger')
logger.setLevel(logging.INFO)
# Roll the log over at MAX_LOGSIZE bytes, keeping up to 14 backup files.
handler = logging.handlers.RotatingFileHandler(LOG_FILENAME, backupCount=14, maxBytes=MAX_LOGSIZE)
formatter = logging.Formatter("%(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S")
handler.setFormatter(formatter)
logger.addHandler(handler)

logger.info('### Start validation of person, organisation and research entities in normdoc files in %s' % STORAGE_DIR)
logger.info('NOTE: only failed validations are logged!')
logger.info('Start time: %s\n' % datetime.datetime.now())

# Compile one schema per entity type; `xmlschema_doc` is only a temporary
# that is reused for each of them.
xmlschema_doc = parse(PERSON_SCHEMA)
xmlschema_person = XMLSchema(xmlschema_doc)

xmlschema_doc = parse(ORGANISATION_SCHEMA)
xmlschema_organisation = XMLSchema(xmlschema_doc)

xmlschema_doc = parse(RESEARCH_SCHEMA)
xmlschema_research = XMLSchema(xmlschema_doc)

# Loop over api storage
for item in os.listdir(STORAGE_DIR):
    path = os.path.join(STORAGE_DIR, item)
    # Only directories whose name is listed in NOD_COLLECTIONS are processed.
    if os.path.isdir(path) and item in NOD_COLLECTIONS:
        logger.info('### Start validation of xml files in %s' % path)
        print('### Start validation of xml files in %s' % path)
        # Per-collection counters for person records, reset for each
        # directory; the code that updates them is not part of this fragment.
        count_person = 0
        count_person_invalid = 0
def assertValid(xmlString, schemaPath):
    """Assert that ``xmlString`` validates against the XSD at ``schemaPath``.

    The schema file is closed as soon as it has been compiled.

    Raises AssertionError, with a message built by ``formatException``,
    when the schema's error log is non-empty after validation.
    """
    with open(schemaPath) as schemaFile:
        schema = XMLSchema(parse(schemaFile))
    toValidate = parse(StringIO(xmlString))
    schema.validate(toValidate)
    if schema.error_log:
        raise AssertionError(formatException(schema, toValidate))
class StyledLayerDescriptor(SLDNode):
    """
    An object representation of an SLD document.

    @prop: NamedLayer

        The named layer that this styling applies to.

        I{Type}: L{NamedLayer}
    """

    # File name of the locally cached schema document, to prevent multiple
    # requests from occurring. None while nothing is cached.
    _cached_schema = None

    def __init__(self, sld_file=None):
        """
        Create a new SLD document. If an sld file is provided, this constructor
        will fetch the SLD schema from the internet and validate the file
        against that schema.

        The downloaded schema is kept in a temporary file whose name is
        cached on the class, so later instances read it from disk instead
        of downloading it again. A file that does not validate is only
        reported with a log warning; it is still loaded.

        @type  sld_file: string
        @param sld_file: The name of a pre-existing SLD file.
        """
        super(StyledLayerDescriptor, self).__init__(None)

        if StyledLayerDescriptor._cached_schema is None:
            logging.debug('Storing new schema into cache.')

            # delete=False: the file must outlive this handle so that it
            # can serve as the on-disk cache.
            localschema = NamedTemporaryFile(delete=False)
            schema_url = 'http://schemas.opengis.net/sld/1.0.0/StyledLayerDescriptor.xsd'
            try:
                resp = urllib2.urlopen(schema_url)
                try:
                    localschema.write(resp.read())
                finally:
                    resp.close()
                localschema.seek(0)
                theschema = parse(localschema)
            finally:
                # Close the handle even when download or parse fails.
                localschema.close()

            StyledLayerDescriptor._cached_schema = localschema.name
        else:
            logging.debug('Fetching schema from cache.')

            with open(StyledLayerDescriptor._cached_schema, 'r') as localschema:
                theschema = parse(localschema)

        self._schema = XMLSchema(theschema)

        if sld_file is not None:
            self._node = parse(sld_file)

            if not self._schema.validate(self._node):
                logging.warning('SLD File "%s" does not validate against the SLD schema.', sld_file)
        else:
            self._node = Element("{%s}StyledLayerDescriptor" % SLDNode._nsmap['sld'], version="1.0.0", nsmap=SLDNode._nsmap)

        setattr(self.__class__, 'NamedLayer', SLDNode.makeproperty('sld', cls=NamedLayer,
                docstring="The named layer of the SLD."))

    def __del__(self):
        """
        Destroy the StyledLayerDescriptor object, and clear its cache.

        The cached schema file is removed and the class-level cache entry
        reset, so the next instance created downloads the schema again.
        """
        if StyledLayerDescriptor._cached_schema is not None:
            logging.debug('Clearing cached schema.')

            os.remove(StyledLayerDescriptor._cached_schema)
            StyledLayerDescriptor._cached_schema = None

    def __deepcopy__(self, memo):
        """
        Perform a deep copy. Instead of copying references to the schema
        object, create a new SLD, and deepcopy the SLD node.
        """
        sld = StyledLayerDescriptor()
        sld._node = copy.deepcopy(self._node)
        return sld

    def _root_element(self):
        """
        Return the root element of this document.

        ``self._node`` is an element tree when the SLD was loaded from a
        file, but a bare element when the SLD was created empty; only the
        former provides ``getroot()``.
        """
        getroot = getattr(self._node, 'getroot', None)
        if getroot is None:
            return self._node
        return getroot()

    def normalize(self):
        """
        Normalize this node and all child nodes prior to validation. The SLD
        is modified in place.
        """
        if self.NamedLayer is not None:
            self.NamedLayer.normalize()

    def validate(self):
        """
        Validate the current file against the SLD schema. This first normalizes
        the SLD document, then validates it. Any schema validation error messages
        are logged at the INFO level.

        @rtype: boolean
        @return: A flag indicating if the SLD is valid.
        """
        self.normalize()

        if self._node is None or self._schema is None:
            logging.debug('The node or schema is empty, and cannot be validated.')
            return False

        is_valid = self._schema.validate(self._node)

        for msg in self._schema.error_log:
            logging.info('Line:%d, Column:%d -- %s', msg.line, msg.column, msg.message)

        return is_valid

    @property
    def version(self):
        """
        Get the SLD version.
        """
        return self._root_element().get('version')

    @property
    def xmlns(self):
        """
        Get the XML Namespace.
        """
        return self._root_element().nsmap[None]

    def create_namedlayer(self, name):
        """
        Create a L{NamedLayer} in this SLD.

        @type  name: string
        @param name: The name of the layer.
        @rtype: L{NamedLayer}
        @return: The named layer, attached to this SLD.
        """
        namedlayer = self.get_or_create_element('sld', 'NamedLayer')
        namedlayer.Name = name
        return namedlayer

    def as_sld(self, pretty_print=False):
        """
        Serialize this SLD model into a string.

        @rtype: string
        @returns: The content of the SLD.
        """
        return tostring(self._node, pretty_print=pretty_print)
#!/usr/bin/env python
"""Parse inst.xml without resolving entities and validate it against
schema.xsd. The validation result is not reported."""
from lxml.etree import XMLParser, fromstring, XMLSchema

# Read both documents as bytes and close the files right away: lxml refuses
# a text string that carries an XML encoding declaration.
with open('schema.xsd', 'rb') as schema_file:
    schema_doc = schema_file.read()
with open('inst.xml', 'rb') as inst_file:
    inst_doc = inst_file.read()

# Entity references in the instance document are left unexpanded.
parser = XMLParser(resolve_entities=False)
elt = fromstring(inst_doc, parser)
schema = XMLSchema(fromstring(schema_doc))
schema.validate(elt)
class ModelLoader(object):
    """ModelLoader is a tool for unserializing or creating PMML models.

    A ModelLoader loader instance can be modified to support strict
    PMML compliance, extended PMML features, or optimized
    implementations of PMML elements.  The user is encouraged to write
    new PmmlBinding subclasses and register them with a ModelLoader to
    modify the behavior of PMML or make certain functions more
    efficient for a given context.

    ModelLoader is the only supported way to make new PmmlBinding
    instances: any function that produces PMML must be given a
    ModelLoader.

    @type schema: lxml.etree.Element
    @param schema: Representation of the PMML schema used to interpret new models.
    @type tagToClass: dict
    @param tagToClass: Association of PMML tagnames with Python classes.
    """

    def __init__(self, baseXsdFileName="pmml-4-1.xsd", baseXsltFileName="pmml-4-1.xslt"):
        """Initialize a ModelLoader with a base XSD.

        By default, the XSD is the official 4.1 schema published by
        the U{Data Mining Group<http://www.dmg.org/v4-1/GeneralStructure.html>}.

        @type baseXsdFileName: string
        @param baseXsdFileName: XSD fileName, either absolute or relative to augustus-pmml-library/augustus/core
        @type baseXsltFileName: string
        @param baseXsltFileName: XSLT fileName; future placeholder for XSLT non-local validation.  Not currently used.
        """

        if not os.path.exists(baseXsdFileName):
            baseXsdFileName = os.path.join(os.path.split(__file__)[0], baseXsdFileName)

        # Close the file as soon as it is parsed; binary mode leaves the
        # document encoding to the XML parser.
        with open(baseXsdFileName, "rb") as xsdFile:
            self.schema = parse(xsdFile).getroot()

        # if not os.path.exists(baseXsltFileName):
        #     baseXsltFileName = os.path.join(os.path.split(__file__)[0], baseXsltFileName)
        # self.stylesheet = parse(open(baseXsltFileName)).getroot()

        # Compiled XMLSchema, built lazily and reset to None whenever the
        # schema tree is modified.
        self.preparedSchema = None
        self.tagToClass = {}

    def copy(self):
        """Return a deep copy of the ModelLoader for the sake of
        building multiple lines of PMML interpretation from the same
        base."""

        return copy.deepcopy(self)

    def __getstate__(self):
        """Used by Pickle to serialize the ModelLoader.

        This serialization includes the entire schema and tag-to-class
        mapping.  The compiled schema is not serialized.
        """

        serialization = self.__dict__.copy()

        buff = StringIO()
        ElementTree(serialization["schema"]).write(buff, compression=defs.PICKLE_XML_COMPRESSION)
        serialization["schema"] = buff.getvalue()

        # buff = StringIO()
        # ElementTree(serialization["stylesheet"]).write(buff, compression=defs.PICKLE_XML_COMPRESSION)
        # serialization["stylesheet"] = buff.getvalue()

        serialization["preparedSchema"] = None
        return serialization

    def __setstate__(self, serialization):
        """Used by Pickle to unserialize the ModelLoader.

        This serialization includes the entire schema and tag-to-class
        mapping.  Every registered class has its C{xsd} attribute
        re-attached from the restored schema.
        """

        serialization["schema"] = parse(gzip.GzipFile(fileobj=StringIO(serialization["schema"]))).getroot()

        # serialization["stylesheet"] = parse(gzip.GzipFile(fileobj=StringIO(serialization["stylesheet"]))).getroot()

        self.__dict__ = serialization

        for tag, cls in self.tagToClass.items():
            cls.xsd = self.xsdElement(tag)

    def _getPreparedSchema(self):
        """Return the compiled XMLSchema, compiling it on first use."""

        if self.preparedSchema is None:
            self.preparedSchema = XMLSchema(self.schema)
        return self.preparedSchema

    def _makeClassLookup(self):
        """Build the element-class lookup: every top-level xs:element maps to PmmlBinding unless overridden in C{tagToClass}."""

        lookup = ElementNamespaceClassLookup()
        namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        for xsdElement in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            namespace[xsdElement.attrib["name"]] = PmmlBinding
        namespace.update(self.tagToClass)
        return lookup

    def _runPostValidate(self, pmmlBinding):
        """Call C{postValidate} on every PmmlBinding in the PMML namespace under C{pmmlBinding}."""

        for event, elem in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
            if isinstance(elem, PmmlBinding):
                elem.postValidate()

    def xsdElement(self, elementName):
        """Return the XSD that defines a given xs:element.

        @type elementName: string
        @param elementName: The name of the element to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if C{elementName} is not defined in the schema.
        @raise LookupError: If C{elementName} is defined more than once in the schema, an error is raised.
        """

        results = self.schema.xpath("//xs:element[@name='%s']" % elementName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(results) == 0:
            return None
        elif len(results) == 1:
            return results[0]
        else:
            raise LookupError("Element \"%s\" is defined %d times in this modelLoader's schema" % (elementName, len(results)))

    def xsdGroup(self, groupName):
        """Return the XSD that defines a given xs:group.

        @type groupName: string
        @param groupName: The name of the group to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if C{groupName} is not defined in the schema.
        @raise LookupError: If C{groupName} is defined more than once in the schema, an error is raised.
        """

        results = self.schema.xpath("//xs:group[@name='%s']" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(results) == 0:
            return None
        elif len(results) == 1:
            return results[0]
        else:
            raise LookupError("Group \"%s\" is defined %d times in this modelLoader's schema" % (groupName, len(results)))

    def xsdRemove(self, oldName):
        """Remove an arbitrary object from the ModelLoader's XSD schema.

        Every node with a matching C{name} attribute is removed,
        whatever its tag.

        @type oldName: string
        @param oldName: Name of the object to be removed.
        """

        for result in self.schema.xpath("//*[@name='%s']" % oldName, namespaces={"xs": defs.XSD_NAMESPACE}):
            parent = result.getparent()
            index = parent.index(result)
            del parent[index]

        # The schema tree changed, so any compiled schema is out of date.
        self.preparedSchema = None

    def xsdAppend(self, newXsd):
        """Append an arbitrary object to the ModelLoader's XSD schema.

        @type newXsd: string or lxml.etree.Element
        @param newXsd: New XSD object to append.
        """

        if isinstance(newXsd, basestring):
            newXsd = fromstring(newXsd)

        self.schema.append(newXsd)
        self.preparedSchema = None

    def register(self, tag, cls):
        """Define (or redefine) the class that is instantiated for a given
        tagname.

        If the class has an C{xsd} and/or C{xsdAppend} string as a
        class attribute, this method will replace the ModelLoader's
        schema entry for C{tag} with the version defined by the class.

        If the class does not have an C{xsd} attribute, this method
        attach the ModelLoader's schema entry for C{tag} to the class.

        As a result, the class will always end up with a C{xsd} class
        attribute representing its XSD schema.  This schema fragment
        is expressed as a lxml.etree.Element for programmatic use.

        The currently-registered classes are in the ModelLoader's
        C{tagToClass} dictionary.

        @type tag: string
        @param tag: The tagname to define or redefine.
        @type cls: PmmlBinding subclass
        @param cls: The class to associate with C{tag}.
        @raise ValueError: If C{cls.xsd} does not contain exactly one definition of element C{tag}.
        """

        oldXsdElement = self.xsdElement(tag)

        if cls.xsd is not None:
            if isinstance(cls.xsd, basestring):
                clsxsd = fromstring(cls.xsd)
            else:
                clsxsd = cls.xsd

            newXsdElements = clsxsd.xpath("//xs:element[@name='%s']" % tag, namespaces={"xs": defs.XSD_NAMESPACE})
            if len(newXsdElements) != 1:
                raise ValueError("Class %s has an xsd member but %d definitions of element \"%s\"" % (cls.__name__, len(newXsdElements), tag))
            else:
                newXsdElement = newXsdElements[0]

            if oldXsdElement is None:
                self.xsdAppend(newXsdElement)
            else:
                # Swap the definition in place so its position is preserved.
                parent = oldXsdElement.getparent()
                index = parent.index(oldXsdElement)
                del parent[index]
                parent.insert(index, newXsdElement)

            cls.xsd = copy.deepcopy(newXsdElement)

        else:
            cls.xsd = copy.deepcopy(oldXsdElement)

        if cls.xsdRemove is not None:
            for name in cls.xsdRemove:
                self.xsdRemove(name)

        if cls.xsdAppend is not None:
            # Index the named top-level schema entries so that appended
            # definitions replace same-named ones instead of duplicating them.
            preexisting = {}
            for elem in self.schema:
                name = elem.get("name")
                if name is not None:
                    preexisting[name] = elem

            for newXsd in cls.xsdAppend:
                if isinstance(newXsd, basestring):
                    newXsd = fromstring(newXsd)

                name = newXsd.get("name")
                if name in preexisting:
                    parent = preexisting[name].getparent()
                    index = parent.index(preexisting[name])
                    del parent[index]

                self.xsdAppend(newXsd)

        self.preparedSchema = None
        self.tagToClass[tag] = cls

    def xsdAddToGroupChoice(self, groupName, newElementNames):
        """Add to an xs:group's xs:choice block.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newElementNames: list of strings or a single string
        @param newElementNames: References to the xs:elements to add to the xs:choice block.
        @raise LookupError: If the schema does not contain exactly one xs:choice block for C{groupName}.
        """

        results = self.schema.xpath("//xs:group[@name='%s']/xs:choice" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(results) != 1:
            raise LookupError("Group \"%s\" is defined with a choice block %d times in this modelLoader's schema" % (groupName, len(results)))

        E = ElementMaker(namespace=defs.XSD_NAMESPACE, nsmap={"xs": defs.XSD_NAMESPACE})

        if isinstance(newElementNames, basestring):
            results[0].append(E.element(ref=newElementNames))
        else:
            for newElementName in newElementNames:
                results[0].append(E.element(ref=newElementName))

        self.preparedSchema = None

    def xsdReplaceGroup(self, groupName, newXsd):
        """Replace an xs:group in this ModelLoader's schema.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newXsd: string or lxml.etree.Element
        @param newXsd: The new XSD represented as an XML string or an lxml.etree.Element; it must contain an xs:group named C{groupName}.
        @raise ValueError: If C{newXsd} does not contain exactly one definition of group C{groupName}.
        """

        oldXsdElement = self.xsdGroup(groupName)

        if isinstance(newXsd, basestring):
            newXsd = fromstring(newXsd)

        newXsdElements = newXsd.xpath("//xs:group[@name='%s']" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(newXsdElements) != 1:
            raise ValueError("newXsd has %d definitions of group \"%s\"" % (len(newXsdElements), groupName))
        else:
            newXsdElement = newXsdElements[0]

        if oldXsdElement is None:
            self.xsdAppend(newXsdElement)
        else:
            parent = oldXsdElement.getparent()
            index = parent.index(oldXsdElement)
            del parent[index]
            parent.insert(index, newXsdElement)

        self.preparedSchema = None

    def elementMaker(self, prefix=None, **parserOptions):
        """Obtain a factory for making in-memory PMML objects.

        This factory is an lxml ElementMaker, pre-loaded with the PMML
        namespace and this ModelLoader's current tag-to-class
        relationship.  See the lxml documentation for how to use an
        ElementMaker.

        @type prefix: string or None
        @param prefix: A prefix for the PMML namespace.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: ElementMaker
        @return: The ElementMaker factory.
        @see: The lxml U{ElementMaker documentation<http://lxml.de/api/lxml.builder.ElementMaker-class.html>}, which explains how to use an ElementMaker factory.
        """

        # Parser whose makeelement stamps this loader onto each PmmlBinding
        # it creates ("self" below is the ModelLoader, not the parser).
        class XmlParser(XMLParser):
            def makeelement(parserSelf, *args, **kwds):
                result = XMLParser.makeelement(parserSelf, *args, **kwds)
                if isinstance(result, PmmlBinding):
                    result.modelLoader = self
                return result

        parser = XmlParser(**parserOptions)
        parser.set_element_class_lookup(self._makeClassLookup())

        return ElementMaker(namespace=defs.PMML_NAMESPACE, nsmap={prefix: defs.PMML_NAMESPACE}, makeelement=parser.makeelement)

    def validate(self, pmmlBinding, postValidate=True):
        """Validate a PMML subtree on demand.

        Note that by default, PMML is validated as or just after it is
        loaded.  This command is intended to check an in-memory PMML
        object after it has been changed or created by an algorithm.

        @type pmmlBinding: PmmlBinding
        @param pmmlBinding: The in-memory PMML object to check.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        """

        self._getPreparedSchema().assertValid(pmmlBinding)

        if postValidate:
            self._runPostValidate(pmmlBinding)

    # def validateXslt(self, pmmlBinding):
    #     xslt = XSLT(self.stylesheet)
    #     return xslt(pmmlBinding)

    def look(self, tag=None, showXsd=True, showSource=False, stream=None):
        """An informative representation of the ModelLoader's current
        interpretation of PMML, intended for interactive use.

        @type tag: string or None
        @param tag: If a string, look up information about this tag; if None, display all tags in the tag-to-class dictionary.
        @type showXsd: bool
        @param showXsd: If True, show the XSD that defines a valid C{tag}.
        @type showSource: bool
        @param showSource: If True, show the Python source code that implements C{tag}.
        @type stream: file-like object or None
        @param stream: If None, print to C{sys.stdout}; otherwise, write to the specified stream.
        @rtype: None
        @return: None; human-readable output is written to the console or a specified stream.
        """

        if stream is None:
            stream = sys.stdout

        if tag is None:
            # Four names per row; registered tags are shown in brackets.
            names = sorted(self.schema.xpath("xs:element/@name", namespaces={"xs": defs.XSD_NAMESPACE}))
            for start in range(0, len(names), 4):
                for name in names[start:start + 4]:
                    if name in self.tagToClass:
                        word = "[%s]" % name
                    else:
                        word = name
                    stream.write("%-25s " % word)
                stream.write(os.linesep)

        else:
            xsd = None
            if showXsd:
                try:
                    xsd = self.xsdElement(tag)
                except LookupError:
                    xsd = None

                # xsdElement returns None (it does not raise) when nothing
                # matches, so the xs:group fallback must cover that case too.
                if xsd is None:
                    try:
                        xsd = self.xsdGroup(tag)
                    except LookupError:
                        pass

                if xsd is not None:
                    stream.write(tostring(xsd, pretty_print=True))

            if showSource:
                cls = self.tagToClass.get(tag)
                if cls is not None:
                    if xsd is not None:
                        stream.write(os.linesep)
                    stream.write(inspect.getsource(cls))

        stream.flush()

    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as an XML string, fileName,
        URI, or file-like object.

        Note that the XML file or string may be Gzip-compressed.

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        """

        if isinstance(data, basestring):
            # "\x1f\x8b" is the Gzip magic number; a string containing "<" is
            # taken to be XML text, anything else is left for parse() to open.
            if len(data) >= 2 and data[0:2] == "\x1f\x8b":
                data = gzip.GzipFile(fileobj=StringIO(data))
            elif data.find("<") != -1:
                data = StringIO(data)

        if validate:
            schema = self._getPreparedSchema()
        else:
            schema = None

        # Caller-supplied options override these defaults.
        newParserOptions = {"schema": schema, "huge_tree": True}
        newParserOptions.update(parserOptions)
        parserOptions = newParserOptions

        parser = XMLParser(**parserOptions)
        parser.set_element_class_lookup(self._makeClassLookup())

        # ElementNamespaceClassLookup don't work with iterparse, so we have to parse all at once and then iterwalk
        pmmlBinding = parse(data, parser).getroot()
        pmmlBinding.modelLoader = self

        if postValidate:
            self._runPostValidate(pmmlBinding)

        return pmmlBinding

    def _loadJsonItem(self, tag, data, parser, nsmap):
        """Helper function for C{loadJson}; not for public use.

        Builds the element for C{tag} from its JSON object C{data}
        ("@name" keys are attributes, "#text"/"#tail" are text, other
        keys are lists of children), recursing into the children and
        attaching them in ascending order of their "#" value.
        """

        if tag.find(":") == -1:
            prefix = None
        else:
            prefix, tag = tag.split(":")

        pretag = nsmap.get(prefix)
        if pretag is None:
            raise ValueError("This document contains a prefix (\"%s\") not found in the namespace (%r)" % (prefix, nsmap))

        attrib = dict((x[1:], data[x]) for x in data if x.startswith("@"))
        childMap = dict((x, data[x]) for x in data if not x.startswith("@") and not x.startswith("#"))

        item = parser.makeelement("{%s}%s" % (pretag, tag), attrib=attrib, nsmap=nsmap)

        children = {}
        for subtag, childList in childMap.items():
            for childItem in childList:
                number = childItem.get("#")
                if number is None:
                    raise ValueError("Subtag \"%s\" has no \"#\"" % subtag)
                children[number] = self._loadJsonItem(subtag, childItem, parser, nsmap)

        # Attach in ascending "#" order.  Sorting the keys (rather than
        # counting 0..n-1) keeps children whose ordinals have gaps or do not
        # start at zero, instead of silently dropping them.
        for number in sorted(children):
            item.append(children[number])

        text = data.get("#text")
        if text is not None:
            item.text = text

        tail = data.get("#tail")
        if tail is not None:
            item.tail = tail

        return item

    def loadJson(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as a JSON string, fileName,
        dict, or file-like object.

        There is no standard XML-to-JSON specification, so we define
        our own.  Our specification is very similar to U{this
        proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>},
        which collects subelements of different tagnames into
        different JSON lists, rather than having one long list and
        needing to specify the tag of each element in that list.  This
        has the following advantages, particularly useful for PMML:
          - Frequent tagnames (like <Segment>) are not repeated,
            wasting space.
          - Subelements with a given tagname can be quickly queried,
            without having to iterate over a list that contains
            non-matching tagnames.
        It has the following disadvantages:
          - The relative order of subelements with different tagnames
            is not preserved.
        We therefore additionally include a JSON attribute named "#"
        to specify the ordering of subelements in the XML
        representation.  Also, the specification referenced above
        represents single-child subelements as JSON objects and
        multiple children as JSON lists, but for consistency and ease
        of parsing, we always use lists.  The last difference is that
        we include "#tail" as well as "#text", so that text outside of
        an element is preserved (rarely relevant for PMML, but
        included for completeness).

        Note that C{data} may be an already-decoded JSON-like
        dictionary rather than a string.  A dictionary passed in is
        only read, never modified.

        @type data: string, dict, or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.
        """

        if not isinstance(data, dict):
            if hasattr(data, "read"):
                data = json.load(data)
            elif isinstance(data, basestring):
                if os.path.exists(data):
                    with open(data) as jsonFile:
                        data = json.load(jsonFile)
                else:
                    data = json.loads(data)

        if not isinstance(data, dict):
            raise ValueError("JSON object must be a mapping at the top level")

        # Check the document structure before doing any schema or parser work.
        try:
            nsmap = data["#nsmap"]
        except KeyError:
            raise ValueError("JSON object must have a \"#nsmap\" key at the top level")
        if not isinstance(nsmap, dict):
            raise ValueError("The \"#nsmap\" entry must be a mapping of prefixes to namespaces")

        # Work on a copy so the caller's dictionary is left untouched; JSON
        # spells the default-namespace prefix as "" where lxml expects None.
        nsmap = dict(nsmap)
        if "" in nsmap:
            nsmap[None] = nsmap.pop("")

        tags = [key for key in data if key != "#nsmap"]
        if len(tags) != 1:
            raise ValueError("JSON object must have exactly one PMML object at the top level")

        tag = tags[0]
        topLevel = data[tag]
        if not isinstance(topLevel, list) or len(topLevel) != 1:
            raise ValueError("Top-level PMML object must be a list with exactly one item")

        schema = self._getPreparedSchema() if validate else None

        parser = XMLParser(**parserOptions)
        parser.set_element_class_lookup(self._makeClassLookup())

        pmmlBinding = self._loadJsonItem(tag, topLevel[0], parser, nsmap)

        if validate:
            schema.assertValid(pmmlBinding)

        if postValidate:
            self._runPostValidate(pmmlBinding)

        return pmmlBinding
def loadJson(self, data, validate=True, postValidate=True, **parserOptions):
    """Load a PMML model represented as a JSON string, fileName,
    dict, or file-like object.

    There is no standard XML-to-JSON specification, so we define
    our own.  Our specification is very similar to U{this
    proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>},
    which collects subelements of different tagnames into
    different JSON lists, rather than having one long list and
    needing to specify the tag of each element in that list.  This
    has the following advantages, particularly useful for PMML:
      - Frequent tagnames (like <Segment>) are not repeated,
        wasting space.
      - Subelements with a given tagname can be quickly queried,
        without having to iterate over a list that contains
        non-matching tagnames.
    It has the following disadvantages:
      - The relative order of subelements with different tagnames
        is not preserved.
    We therefore additionally include a JSON attribute named "#"
    to specify the ordering of subelements in the XML
    representation.  Also, the specification referenced above
    represents single-child subelements as JSON objects and
    multiple children as JSON lists, but for consistency and ease
    of parsing, we always use lists.  The last difference is that
    we include "#tail" as well as "#text", so that text outside of
    an element is preserved (rarely relevant for PMML, but
    included for completeness).

    Note that C{data} may be an already-decoded JSON-like
    dictionary rather than a string.  A dictionary passed in is
    only read, never modified.

    @type data: string, dict, or file-like object
    @param data: The data to load.
    @type validate: bool
    @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
    @type postValidate: bool
    @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
    @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
    @rtype: PmmlBinding
    @return: In-memory PMML object.
    @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.
    """

    if not isinstance(data, dict):
        if hasattr(data, "read"):
            data = json.load(data)
        elif isinstance(data, basestring):
            if os.path.exists(data):
                with open(data) as jsonFile:
                    data = json.load(jsonFile)
            else:
                data = json.loads(data)

    if not isinstance(data, dict):
        raise ValueError("JSON object must be a mapping at the top level")

    # Check the document structure before doing any schema or parser work.
    try:
        nsmap = data["#nsmap"]
    except KeyError:
        raise ValueError("JSON object must have a \"#nsmap\" key at the top level")
    if not isinstance(nsmap, dict):
        raise ValueError("The \"#nsmap\" entry must be a mapping of prefixes to namespaces")

    # Work on a copy so the caller's dictionary is left untouched; JSON
    # spells the default-namespace prefix as "" where lxml expects None.
    nsmap = dict(nsmap)
    if "" in nsmap:
        nsmap[None] = nsmap.pop("")

    tags = [key for key in data if key != "#nsmap"]
    if len(tags) != 1:
        raise ValueError("JSON object must have exactly one PMML object at the top level")

    tag = tags[0]
    topLevel = data[tag]
    if not isinstance(topLevel, list) or len(topLevel) != 1:
        raise ValueError("Top-level PMML object must be a list with exactly one item")

    if validate:
        # Compile the schema lazily and keep it for later calls.
        if self.preparedSchema is None:
            self.preparedSchema = XMLSchema(self.schema)
        schema = self.preparedSchema
    else:
        schema = None

    # Every top-level xs:element maps to PmmlBinding unless a more specific
    # class has been registered in tagToClass.
    parser = XMLParser(**parserOptions)
    lookup = ElementNamespaceClassLookup()
    namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
    for xsdElement in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
        namespace[xsdElement.attrib["name"]] = PmmlBinding
    namespace.update(self.tagToClass)
    parser.set_element_class_lookup(lookup)

    pmmlBinding = self._loadJsonItem(tag, topLevel[0], parser, nsmap)

    if validate:
        schema.assertValid(pmmlBinding)

    if postValidate:
        for event, elem in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
            if isinstance(elem, PmmlBinding):
                elem.postValidate()

    return pmmlBinding