def validate_xml_doc(xml_schema: XmlSchema, xml_doc: XmlElement) -> None:
    """
    Check that ``xml_doc`` conforms to the XML schema ``xml_schema``.

    Nothing is returned when the document is valid.

    :raises XmlSchemaDocValidationError: if ``xml_doc`` does not validate
        against ``xml_schema``; its message is the text of the underlying
        ``lxml.etree.DocumentInvalid``, which is chained as the cause

    """
    # The schema object offers several validation calls that differ only in
    # how the outcome is reported (validation passes / validation fails):
    #   - xml_schema.assert_(xml_doc): nothing / raises 'AssertionError'
    #   - xml_schema.assertValid(xml_doc): nothing / raises 'DocumentInvalid'
    #   - xml_schema.validate(xml_doc): returns True / returns False

    try:
        xml_schema.assertValid(xml_doc)
    except lxml.etree.DocumentInvalid as invalid_doc_exc:
        # note: 'invalid_doc_exc.error_log' and 'xml_schema.error_log' are the
        #   same object (type 'lxml.etree._ListErrorLog').

        # TODO: advanced error details parsing, without leaking too much
        #   information, e.g. starting from
        #   'invalid_doc_exc.error_log.last_error' (an 'lxml.etree._LogEntry')
        #   and its 'line' attribute.

        # TODO: does 'xml_schema.error_log' persist? is it necessary to clear
        #   it afterwards? (`xml_schema._clear_error_log()`)

        # The exception text is the simplest and safest error message, e.g.:
        #   "Element 'DTE': No matching global declaration available for the validation root., line 2"  # noqa: E501
        raise XmlSchemaDocValidationError(str(invalid_doc_exc)) from invalid_doc_exc
Example #2
0
    def validate(self):
        """
        Parse the data file and, when a schema file is configured, check the
        data against that schema.

        On success the parsed document is stored in ``self.data``. Every
        failure (unparsable schema, invalid schema, unparsable data, or data
        that does not conform) is logged and reported as ``False``.

        @return: A flag indicating if the data validates against the schema.
        """

        # start from a clean slate: drop any previous xml errors
        clear_error_log()

        has_schema = self.schema_file is not None

        if has_schema:
            # Step 1: the schema file must be well-formed XML
            try:
                schema_doc = parse(self.schema_file)
            except XMLSyntaxError as err:
                logging.warning('The schema XML file could not be parsed.')
                for entry in err.error_log:
                    logging.info(entry)
                return False

            # Step 2: the XML document must actually be an XML Schema
            try:
                schema = XMLSchema(schema_doc)
            except XMLSchemaParseError as err:
                logging.warning(
                    'The schema XML file was parsed, but it does not appear to be a valid XML Schema document.'
                )
                for entry in err.error_log:
                    logging.info(entry)
                return False

        # Step 3: the data file must be well-formed XML
        try:
            data = parse(self.datafile)
        except XMLSyntaxError as err:
            logging.warning('The data XML file could not be parsed.')
            for entry in err.error_log:
                logging.info(entry)
            return False

        # Step 4: with a schema available, the data must conform to it
        if has_schema and not schema.validate(data):
            logging.warning(
                'The data does not conform to the provided schema.')
            for entry in schema.error_log:
                logging.info(entry)
            return False

        self.data = data
        return True
Example #3
0
    def validate(self, message: Element) -> None:
        """Check the constructed request against its XSD.

        The XSD named by ``self.xmlschema_definition`` is loaded from the
        directory that holds ``schema.__file__``; ``assertValid`` is then
        called on ``message``. Nothing is returned.
        """
        schema_dir = os.path.dirname(schema.__file__)
        xsd_path = os.path.join(schema_dir, self.xmlschema_definition)
        with open(xsd_path) as xsd_file:
            compiled_schema = XMLSchema(parse(xsd_file))

        compiled_schema.assertValid(message)
Example #4
0
 def testOne(self):
     """Run the sample DIDL/MODS record through ``norm_mods`` and check the
     observer calls, the schema validity of the resulting MODS element and
     its serialised form against the expected file."""
     # The observer's 'add' answers with an empty generator.
     self.observer.methods['add'] = lambda *args, **kwargs: (x for x in [])
     list(
         compose(
             self.norm_mods.all_unknown(
                 'add',
                 'id',
                 'metadata',
                 'anotherone',
                 lxmlNode=parse(open("data/didl_mods.xml")),
                 identifier='oai:very:secret:09987')))
     self.assertEquals(3, len(self.observer.calledMethods))

     # The third observed call carries the converted record.
     lastCall = self.observer.calledMethods[2]
     result = lastCall.kwargs.get('lxmlNode')

     self.assertEquals(2, len(self.observer.calledMethods[0].args))

     arguments = self.observer.calledMethods[2].args
     self.assertEquals("id", arguments[0])
     self.assertEquals("metadata", arguments[1])

     # Extract the MODS element from the record; exactly one is expected.
     mods = result.xpath("//mods:mods", namespaces=namespacesMap)
     self.assertTrue(len(mods)==1)
     modsElement = mods[0]

     # Validate against the MODS 3.6 schema.
     schemaFilename = join(self.schemasPath, 'mods-3-6.xsd')
     modsSchema = XMLSchema(lxmlParse(open(schemaFilename)))
     modsSchema.validate(modsElement)
     lastError = modsSchema.error_log.last_error
     if lastError:
         self.fail(lastError)

     # Compare the serialised MODS with the expected conversion result.
     expectedResult = open("data/mods_converted.xml").read()
     actualResult = tostring(mods[0], pretty_print = True, encoding='utf-8')
     self.assertEqualsWithDiff(expectedResult, actualResult)
Example #5
0
    def validate_data(self, data, structural_schema, metamorphic_schema=None, validate_schemas=False):
        """
        Check a data structure against a structural (XML) schema.

        :param str data: data to check
        :param str structural_schema: unique_key
        :param str metamorphic_schema: unique_key
        :param bool validate_schemas: When True, every provided schema is
          asserted to be valid before it is used; otherwise schemas are
          used as is, which is faster. Use it at your own risk.
        :return: True if data is valid, False otherwise
        :rtype: bool
        """

        has_metamorphic = metamorphic_schema is not None

        if validate_schemas is True:
            self.assert_structural_schema(structural_schema)
            if has_metamorphic:
                self.assert_metamorphic_schema(metamorphic_schema)

        if has_metamorphic:
            # NOTE(review): the value returned by transform() is discarded, so
            # the untransformed 'data' is what gets validated below -- confirm
            # this is intended.
            self.transform(data, metamorphic_schema)

        compiled_schema = XMLSchema(self.get_cached_schema(structural_schema))
        document = parse(StringIO(data))

        return compiled_schema.validate(document)
Example #6
0
    def validate_data(self,
                      data,
                      structural_schema,
                      metamorphic_schema=None,
                      validate_schemas=False):
        """
        Check a data structure against a structural (XML) schema.

        :param str data: data to check
        :param str structural_schema: unique_key
        :param str metamorphic_schema: unique_key
        :param bool validate_schemas: When True, every provided schema is
          asserted to be valid before it is used; otherwise schemas are
          used as is, which is faster. Use it at your own risk.
        :return: True if data is valid, False otherwise
        :rtype: bool
        """

        has_metamorphic = metamorphic_schema is not None

        if validate_schemas is True:
            self.assert_structural_schema(structural_schema)
            if has_metamorphic:
                self.assert_metamorphic_schema(metamorphic_schema)

        if has_metamorphic:
            # NOTE(review): the value returned by transform() is discarded, so
            # the untransformed 'data' is what gets validated below -- confirm
            # this is intended.
            self.transform(data, metamorphic_schema)

        compiled_schema = XMLSchema(self.get_cached_schema(structural_schema))
        document = parse(StringIO(data))

        return compiled_schema.validate(document)
def assertValid(xmlString, schemaPath):
    """Assert that an XML string validates against the XSD at ``schemaPath``.

    :param xmlString: text of the XML document to validate
    :param schemaPath: path of the XSD file
    :raises AssertionError: if the schema's error log is non-empty after
        validation; the message is built by ``formatException``
    """
    # Open the schema inside 'with' so the file handle is closed as soon as
    # the schema has been compiled, instead of being left to the garbage
    # collector.
    with open(schemaPath) as schemaFile:
        schema = XMLSchema(parse(schemaFile))
    toValidate = parse(StringIO(xmlString))
    schema.validate(toValidate)
    if schema.error_log:
        raise AssertionError(formatException("assertValid", schema,
                                             toValidate))
Example #8
0
 def parse(self, filename):
     self.parser = None
     # find parser
     try:
         from lxml.etree import parse, XMLSchema
         self.logger.info('using lxml.etree parser')
         # parse XML and validate it
         tree = parse(filename)
         # get XSD
         schemaDoc = parse(XSDContents)
         schema = XMLSchema(schemaDoc)
         xml_valid = schema.validate(tree)
         if xml_valid:
             self.logger.info('XML validated')
         else:
             self.logger.error('XML NOT validated: {}'.format(filename))
             print >> stderr, schema.error_log
         return tree if xml_valid else None
     except ImportError:
         try:
             from xml.etree.ElementTree import parse
             self.logger.info('using xml.etree.ElementTree parser')
             return parse(filename)
         except ImportError:
             self.logger.critical(
                 "Failed to import ElementTree from any known place")
             raise
Example #9
0
 def __init__(self, schemaPath):
     """Compile the XML schema stored at ``schemaPath`` into ``self._schema``.

     If the schema cannot be compiled, the last entry of the error log is
     printed and the ``XMLSchemaParseError`` is re-raised.
     """
     Observable.__init__(self)
     try:
         with open(schemaPath) as schemaFile:
             schemaDocument = parse(schemaFile)
             self._schema = XMLSchema(schemaDocument)
     except XMLSchemaParseError as parseError:
         print(parseError.error_log.last_error)
         raise
Example #10
0
 def valid_xml(self, xml_string):
     """Return True if `xml_string` conforms to SEPA XML schema.

     Raises NotImplementedError when ``self.xml_schema_path`` is None.
     """
     schema_path = self.xml_schema_path
     if schema_path is None:
         raise NotImplementedError(
             "XSD validation failed: path to schema is not set.")
     document = parse(StringIO(xml_string))
     schema = XMLSchema(parse(schema_path))
     return schema.validate(document)
def validate_xml_file_against_xsd(xml_file, xsd_file):
    """Validate ``xml_file`` against the schema in ``xsd_file``.

    Success is logged as "XSD Pass!". Any exception raised while parsing or
    validating is logged as "XSD Fail!" and handed to ``BuiltIn().fail``.
    """
    try:
        document = parse(xml_file)
        schema = XMLSchema(parse(xsd_file))
        schema.assert_(document)
        logger.info("XSD Pass!")
    except Exception as error:
        logger.info("XSD Fail!")
        BuiltIn().fail(error)
Example #12
0
    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Build an in-memory PMML model from an XML string, fileName,
        URI, or file-like object.

        The XML file or string may be Gzip-compressed.

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        """

        # A string starting with the two Gzip magic bytes is wrapped in a
        # decompressing stream; a string containing "<" is treated as literal
        # XML text; any other string is passed to the parser unchanged.
        if isinstance(data, basestring):
            if len(data) >= 2 and data[:2] == "\x1f\x8b":
                data = gzip.GzipFile(fileobj=StringIO(data))
            elif "<" in data:
                data = StringIO(data)

        # The compiled schema is built once and cached on the loader.
        schema = None
        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema

        # Caller-supplied parser options take precedence over the defaults.
        effectiveOptions = dict(schema=schema, huge_tree=True)
        effectiveOptions.update(parserOptions)
        parser = XMLParser(**effectiveOptions)

        # Map every top-level element declared in the XSD to PmmlBinding,
        # then let the more specific classes in tagToClass override that.
        lookup = ElementNamespaceClassLookup()
        pmmlNamespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        xsdPrefixes = {"xs": defs.XSD_NAMESPACE}
        for xsdElement in self.schema.xpath("xs:element", namespaces=xsdPrefixes):
            pmmlNamespace[xsdElement.attrib["name"]] = PmmlBinding
        pmmlNamespace.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        # ElementNamespaceClassLookup doesn't work with iterparse, so the
        # whole document is parsed at once and walked afterwards.
        pmmlBinding = parse(data, parser).getroot()
        pmmlBinding.modelLoader = self

        if postValidate:
            walker = iterwalk(pmmlBinding,
                              events=("end", ),
                              tag="{%s}*" % defs.PMML_NAMESPACE)
            for _event, element in walker:
                if isinstance(element, PmmlBinding):
                    element.postValidate()

        return pmmlBinding
Example #13
0
class Validate(Observable):
    """Observable that validates lxml trees against an XML schema before
    passing messages on to its observers."""

    def __init__(self, schemaPath):
        """Compile the schema at ``schemaPath``; on a schema parse error the
        last error-log entry is printed and the error is re-raised."""
        Observable.__init__(self)
        try:
            with open(schemaPath) as schemaFile:
                self._schema = XMLSchema(parse(schemaFile))
        except XMLSchemaParseError as parseError:
            print(parseError.error_log.last_error)
            raise

    def all_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then forward ``message`` via ``self.all``."""
        self._detectAndValidate(*args, **kwargs)
        yield self.all.unknown(message, *args, **kwargs)

    def do_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then forward ``message`` via ``self.do``."""
        self._detectAndValidate(*args, **kwargs)
        return self.do.unknown(message, *args, **kwargs)

    def any_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then forward ``message`` via ``self.any``.

        ``NoneOfTheObserversRespond`` is translated into ``DeclineMessage``.
        """
        self._detectAndValidate(*args, **kwargs)
        try:
            return (yield self.any.unknown(message, *args, **kwargs))
        except NoneOfTheObserversRespond:
            raise DeclineMessage

    def call_unknown(self, message, *args, **kwargs):
        """Validate the arguments, then forward ``message`` via ``self.call``.

        ``NoneOfTheObserversRespond`` is translated into ``DeclineMessage``.
        """
        self._detectAndValidate(*args, **kwargs)
        try:
            return self.call.unknown(message, *args, **kwargs)
        except NoneOfTheObserversRespond:
            raise DeclineMessage

    def _detectAndValidate(self, *args, **kwargs):
        """Validate every positional or keyword argument whose type is
        exactly ``_ElementTree``."""
        for candidate in list(args) + list(kwargs.values()):
            if type(candidate) == _ElementTree:
                self.validate(candidate)

    def validate(self, arg):
        """Validate ``arg``; if the schema error log is non-empty, log and
        raise a ``ValidateException``."""
        self._schema.validate(arg)
        if not self._schema.error_log:
            return
        exception = ValidateException(formatException(self._schema, arg))
        self.do.logException(exception)
        raise exception

    def assertValid(self, xmlOrString):
        """Raise ``AssertionError`` if ``xmlOrString`` (a ``str`` of XML or an
        already parsed object) leaves entries in the schema error log."""
        if isinstance(xmlOrString, str):
            toValidate = XML(xmlOrString.encode('utf-8'))
        else:
            toValidate = xmlOrString
        self._schema.validate(toValidate)
        if self._schema.error_log:
            raise AssertionError(formatException(self._schema, toValidate))
Example #14
0
    def setUp(self):
        """Create the two-page image document, the XML schema and the
        tracker/detector objects used by the tests."""
        self.document = ImageFileDocument((
            FIRST_PAGE_IMAGE_PATHNAME,
            SECOND_PAGE_IMAGE_PATHNAME,
        ))
        # The first two items yielded by the document are kept for the tests.
        pages = iter(self.document)
        self.first_page = next(pages)
        self.second_page = next(pages)

        self.xml_schema = XMLSchema(file=XML_SCHEMA_PATHNAME)
        self.quadrangle_tracker = RTreeDequeConvexQuadrangleTracker()
        self.screen_detector = ScreenEventDetectorScreenDetector()
        self.page_detector = ScreenEventDetectorPageDetector()
    def testOne(self):
        """Run the sample DIDL/MODS record through ``norm_didl`` and check
        the observer calls, the logged deprecation message, the schema
        validity of the resulting DIDL element and its serialised form."""
        # The observer's 'add' answers with an empty generator.
        self.observer.methods['add'] = lambda *args, **kwargs: (x for x in [])
        record = parse(open("data/didl_mods.xml"))
        normalisation = self.norm_didl.all_unknown(
            'add',
            'id',
            'metadata',
            'anotherone',
            lxmlNode=record,
            identifier='oai:very:secret:09987')
        list(compose(normalisation))
        self.assertEquals(4, len(self.observer.calledMethods))

        # The fourth observed call carries the converted record.
        result = self.observer.calledMethods[3].kwargs.get('lxmlNode')

        self.assertEquals(2, len(self.observer.calledMethods[0].args))

        # The second observed call is the log message for the identifier.
        arguments = self.observer.calledMethods[1].args

        self.assertEquals("oai:very:secret:09987", arguments[0])
        self.assertEquals(
            "Found objectFile in depricated dip:ObjectType. This should have been: rdf:type/@rdf:resource",
            arguments[1])

        # Extract the DIDL element from the record; exactly one is expected.
        didl = result.xpath("//didl:DIDL", namespaces=namespacesMap)
        self.assertTrue(len(didl) == 1)

        # Validate against the DIDL schema.
        schemaFilename = join(self.schemasPath, 'didl.xsd')
        didlSchema = XMLSchema(lxmlParse(open(schemaFilename)))
        didlSchema.validate(didl[0])
        lastError = didlSchema.error_log.last_error
        if lastError:
            self.fail(lastError)

        # Compare the serialised DIDL with the expected conversion result.
        expectedResult = open("data/didl_converted.xml").read()
        actualResult = tostring(didl[0], pretty_print=True, encoding='utf-8')
        self.assertEqualsWithDiff(expectedResult, actualResult)
Example #16
0
    def test_centre(self):
        """GET /restxml/1/ must answer 200 with XML that validates against
        the packaged CenterProfile.xsd."""
        response = Client().get('/restxml/1/', secure=True)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/xml')

        xsd_source = resource_string(
            __name__, join('data', 'CenterProfile.xsd'))
        schema = XMLSchema(fromstring(xsd_source))

        try:
            schema.assertValid(fromstring(response.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Show the underlying parser/validation error, then fail the test.
            print_exc()
            self.fail()
Example #17
0
    def test_centre(self):
        """GET /restxml/1/ must answer 200 with XML that validates against
        the packaged CenterProfile.xsd."""
        response = Client().get('/restxml/1/', secure=True)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/xml')

        xsd_source = resource_string(
            __name__, join('data', 'CenterProfile.xsd'))
        schema = XMLSchema(fromstring(xsd_source))

        try:
            schema.assertValid(fromstring(response.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Show the underlying parser/validation error, then fail the test.
            print_exc()
            self.fail()
Example #18
0
def validate_schema(xml_loc: str, schema: etree.XMLSchema):
    """
    Parse the XML file at ``xml_loc`` and validate it against ``schema``.

    Returns True if the XML file matches the schema; otherwise the schema
    error log is written to the logger and False is returned.
    Raises: DoxygenMalformedXML: If the XML file cannot be parsed
    """
    assert xml_loc is not None
    assert schema is not None
    assert os.path.exists(xml_loc)

    # Read the whole file up front so parsing works on an in-memory buffer.
    with open(xml_loc, "rb") as xml_file:
        xml_buffer = BytesIO(xml_file.read())

    try:
        document = etree.parse(xml_buffer)
    except etree.XMLSyntaxError as syntax_err:
        logger.error(syntax_err)
        logger.warning(f"Failed to parse: {xml_loc}")
        logger.warning("Skipping this XML file")
        raise DoxygenMalformedXML(xml_loc) from syntax_err

    logger.debug(f"Validating {xml_loc}")

    if not schema or schema.validate(document):  # type: ignore
        return True

    logger.warning(f"Validating schema failed for: {xml_loc}")
    logger.warning(schema.error_log)  # type: ignore
    logger.warning("Skipping this XML file")
    return False
Example #19
0
    def validate_schema(self, schema):
        """
        Check that the syntax/grammar of the provided schema is correct.

        The schema is first tried as an XML Schema and then as an XSLT
        stylesheet; the first one that compiles determines the result.

        :param str schema: xml (schema itself) or unique_key
        :return: [True, <schema_type>] if schema is correct and
          [False, None] otherwise
        :rtype: list

        .. note:: <schema_type> can either be 'XMLSchema' or 'XSLT'
        """

        if schema in self.get_existing_unique_keys():
            candidate = self.get_cached_schema(schema)
        else:
            # Not a known key: treat the string as the schema text itself.
            try:
                candidate = parse(StringIO(schema))
            except Exception:
                return [False, None]

        # Any failure to compile just means "not this kind of schema".
        try:
            XMLSchema(candidate)
        except Exception:
            pass
        else:
            return [True, 'XMLSchema']

        try:
            XSLT(candidate)
        except Exception:
            pass
        else:
            return [True, 'XSLT']

        return [False, None]
 def __init__(self, schemaPath):
     """Compile the XML schema stored at ``schemaPath`` into ``self._schema``.

     If the schema cannot be compiled, the last entry of the error log is
     printed and the ``XMLSchemaParseError`` is re-raised.
     """
     Observable.__init__(self)
     try:
         # 'with' closes the schema file even when parsing fails.
         with open(schemaPath) as fp:
             self._schema = XMLSchema(parse(fp))
     # 'except ... as' and the single-argument print() call behave the same
     # on Python 2.6+ and Python 3.
     except XMLSchemaParseError as e:
         print(e.error_log.last_error)
         raise
def get_schemas():
    """Load and compile the two local XSD files.

    Returns a dict mapping 'detailed_report' and 'build_info' to the schemas
    compiled from 'detailedreport.xsd' and 'buildinfo.xsd'. On IOError,
    XMLSyntaxError or AttributeError a message is printed and the process
    exits with status 1.
    """
    try:
        report_doc = lxml.parse('detailedreport.xsd')
        build_doc = lxml.parse('buildinfo.xsd')
        schemas = {
            'detailed_report': XMLSchema(report_doc),
            'build_info': XMLSchema(build_doc),
        }
    except IOError:
        print('Invalid Schema File')
        sys.exit(1)
    except XMLSyntaxError:
        print('XML Syntax Error ')
        sys.exit(1)
    except AttributeError as attr_err:
        print('Local XML file error: ' + str(attr_err))
        sys.exit(1)
    return schemas
Example #22
0
def getSchema():
    """Return the module-level ``schema``, compiling it from ``schemaXml``
    on first use.

    A schema parse error has its last error-log entry printed and is then
    re-raised.
    """
    global schema
    if schema:
        return schema
    try:
        schema = XMLSchema(schemaXml)
    except XMLSchemaParseError as parseError:
        print(parseError.error_log.last_error)
        raise
    return schema
Example #23
0
 def setUp(self):
     """Store the Hipay credentials and build a schema-validating parser
     from 'hipay/mapi.xsd'."""
     # We need a ticket and an account for test to pass before we use
     # selenium and friends
     # Hipay credentials
     self.login = '******'
     self.password = '******'
     xsd_path = os.path.join('hipay', 'mapi.xsd')
     self.schema = XMLSchema(file=open(xsd_path, 'rb'),
                             attribute_defaults=True)
     self.parser = XMLParser(schema=self.schema, attribute_defaults=True)
def main():
    """Read the command line, then load the XML configuration file.

    The configuration is parsed with a parser that validates against
    'resources/mapcreator.xsd'. The process exits with an error message when
    the arguments are wrong, the configuration file is missing, or the
    configuration is not valid.
    """

    ######## SETUP OPTION PARSER AND READ COMMAND LINE ARGS ##########

    usage = "usage: %prog -c CONFIGURATION_FILE [-d]"
    option_parser = OptionParser(usage, version='1.0')
    option_parser.add_option("-c",
                             "--configuration-file",
                             dest="configuration_file",
                             action='store',
                             help="the path to the XML configuration file")
    option_parser.add_option(
        "-d",
        "--dry-run",
        dest="dry_run",
        action='store_true',
        default=False,
        help="only execute a dry run without calls to osmosis [default=false]")
    option_parser.add_option(
        "-l",
        "--logging-conf",
        dest="logging_config_file",
        action='store',
        default='logging.conf',
        help="path to the logging configuration [default=logging.conf]")
    (options, args) = option_parser.parse_args()

    # no positional arguments are accepted
    if len(args) != 0:
        option_parser.print_help()
        sys.exit("incorrect number of arguments")

    if not options.configuration_file:
        option_parser.print_help()
        sys.exit("configuration file is missing")

    # check whether xml configuration exists
    if not PATH.exists(options.configuration_file) or not PATH.isfile(
            options.configuration_file):
        sys.exit("the xml configuration file at '%s' could not be found" %
                 options.configuration_file)

    ######## READING SCHEMA AND XML CONFIG ###########

    try:
        # load XML schema file
        xmlschema_doc = ET.parse('resources/mapcreator.xsd')
        xmlschema = XMLSchema(xmlschema_doc)
        parser = ET.XMLParser(schema=xmlschema, remove_comments=True)
        # try to load xml configuration, validate with schema
        tree = ET.parse(options.configuration_file, parser=parser)
    except NameError:
        # NOTE(review): presumably 'XMLSchema' is left undefined when lxml
        # could not be imported, in which case the configuration is loaded
        # without schema validation -- confirm against the file's imports.
        tree = ET.parse(options.configuration_file)
    # 'except ... as e' is valid on Python 2.6+ and Python 3; the older
    # 'except ..., e' form is a SyntaxError on Python 3.
    except (XMLSyntaxError, XMLSchemaValidateError) as e:
        sys.exit("the xml configuration is not valid: '%s'" % e)
Example #25
0
    def test_centres_kml(self):
        """GET /api/KML/ must answer 200 with KML that validates against the
        KML 2.2 schema."""
        response = Client().get('/api/KML/', secure=True)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'],
                         'application/vnd.google-earth.kml+xml')

        # The KML XSD is loaded over HTTP because it imports two other XSDs
        # by relative name; that does not work well with Python package
        # resources, which should not be tied to an absolute location.
        schema = XMLSchema(parse("http://www.opengis.net/kml/2.2"))

        try:
            schema.assertValid(fromstring(response.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Show the underlying parser/validation error, then fail the test.
            print_exc()
            self.fail()
Example #26
0
    def __init__(self, host_url=None, version='1.2.0', dtd_validation=False, xsd_validation=True):
        """Set up an XML parser instance that holds the geoLink XSD used for validation.

        Args:
            host_url (str): URL of the OEREBlex host to resolve relative URLs. The complete URL until but
                without the */api* part has to be set, starting with *http://* or *https://*.
            version (str): The version of the geoLink schema to be used. Defaults to `1.2.0`.
            dtd_validation (bool): Enable/disable validation of document type definition (DTD).
                Optional, defaults to False.
            xsd_validation (bool): Enable/disable validation against XML schema (XSD).
                Optional, defaults to True.

        """
        self._host_url = host_url
        self._version = version
        self._dtd_validation = dtd_validation
        self._xsd_validation = xsd_validation

        # The schema file for the requested version ships inside the
        # 'geolink_formatter' package.
        xsd_path = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version))
        with open(xsd_path) as xsd_file:
            xsd_source = xsd_file.read()
            self._schema = XMLSchema(fromstring(xsd_source))
Example #27
0
    def test_centres_kml(self):
        """GET /api/KML/ must answer 200 with KML that validates against the
        KML 2.2 schema."""
        response = Client().get('/api/KML/', secure=True)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'],
                         'application/vnd.google-earth.kml+xml')

        # The KML XSD is loaded over HTTP because it imports two other XSDs
        # by relative name; that does not work well with Python package
        # resources, which should not be tied to an absolute location.
        schema = XMLSchema(parse("http://www.opengis.net/kml/2.2"))

        try:
            schema.assertValid(fromstring(response.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Show the underlying parser/validation error, then fail the test.
            print_exc()
            self.fail()
Example #28
0
 def validate(self, xml=None):
     """Validate against xsd schema the provided schema
     https://payment.hipay.com/schema/mapi.xsd

     :param xml: XML to check; when empty or None, the serialised root of
         ``self.asTree()`` is checked instead
     :return: True if the XML parses with the schema-validating parser,
         False if parsing or validation raises
     """
     # 'with' releases the XSD file handle once the schema is compiled.
     with open(os.path.join(DIRNAME, 'mapi.xsd'), 'rb') as xsd_file:
         schema = XMLSchema(file=xsd_file, attribute_defaults=True)
     parser = XMLParser(schema=schema, attribute_defaults=True)
     if not xml:
         xml = ET.tostring(self.asTree().getroot())
     try:
         # Parsing with a schema-bound parser validates while it parses.
         fromstring(xml, parser)
     except Exception:
         # Deliberately broad: any parse or validation problem means
         # "not valid".
         return False
     # Explicit success value: falling off the end would return None, which
     # is just as falsy as the failure result.
     return True
Example #29
0
    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Build an in-memory PMML model from an XML string, fileName,
        URI, or file-like object.

        The XML file or string may be Gzip-compressed.

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        """

        # A string starting with the two Gzip magic bytes is wrapped in a
        # decompressing stream; a string containing "<" is treated as literal
        # XML text; any other string is passed to the parser unchanged.
        if isinstance(data, basestring):
            if len(data) >= 2 and data[:2] == "\x1f\x8b":
                data = gzip.GzipFile(fileobj=StringIO(data))
            elif "<" in data:
                data = StringIO(data)

        # The compiled schema is built once and cached on the loader.
        schema = None
        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema

        # Caller-supplied parser options take precedence over the defaults.
        effectiveOptions = dict(schema=schema, huge_tree=True)
        effectiveOptions.update(parserOptions)
        parser = XMLParser(**effectiveOptions)

        # Map every top-level element declared in the XSD to PmmlBinding,
        # then let the more specific classes in tagToClass override that.
        lookup = ElementNamespaceClassLookup()
        pmmlNamespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        xsdPrefixes = {"xs": defs.XSD_NAMESPACE}
        for xsdElement in self.schema.xpath("xs:element", namespaces=xsdPrefixes):
            pmmlNamespace[xsdElement.attrib["name"]] = PmmlBinding
        pmmlNamespace.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        # ElementNamespaceClassLookup doesn't work with iterparse, so the
        # whole document is parsed at once and walked afterwards.
        pmmlBinding = parse(data, parser).getroot()
        pmmlBinding.modelLoader = self

        if postValidate:
            walker = iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE)
            for _event, element in walker:
                if isinstance(element, PmmlBinding):
                    element.postValidate()

        return pmmlBinding
Example #30
0
 def __init__(self, xSDPathList=[], nsMap=None):
     """Compile every schema listed in ``xSDPathList``.

     :param xSDPathList: iterable of (name, xpath, schema file name)
         triples; schema files are looked up in the 'xsd' directory next to
         this module. The list is only iterated, never modified.
     :param nsMap: extra namespace mappings merged into ``namespaces``
     :raises XMLSchemaParseError: if a schema cannot be compiled; the last
         error-log entry is printed first
     """
     Observable.__init__(self)

     self._namespacesMap = namespaces.copyUpdate(nsMap or {})
     self._xmlSchemas = []

     ## Fill the schemas list for later use:
     for strName, strXPath, schemaPath in xSDPathList:
         # Single-argument print() with %-formatting gives the same output
         # on Python 2 and Python 3.
         print('schema init: %s %s %s' % (strName, strXPath, schemaPath))
         try:
             self._xmlSchemas.append((strName, strXPath, XMLSchema(parse(join(join(dirname(abspath(__file__)), 'xsd'), schemaPath) ) ) ))
         except XMLSchemaParseError as e:
             print('XMLSchemaParseError............. %s' % (e.error_log.last_error,))
             raise
Example #31
0
    def __init__(self, fromKwarg, toKwarg=None, name=None, nsMap=None):
        """Initialise the converter and compile the schemas listed in
        ``mods_edu_extentions``.

        :param fromKwarg: passed on to ``UiaConverter.__init__``
        :param toKwarg: passed on to ``UiaConverter.__init__``
        :param name: passed on to ``UiaConverter.__init__``
        :param nsMap: extra namespace mappings merged into ``namespaces``
        :raises XMLSchemaParseError: if a schema cannot be compiled; the last
            error-log entry is printed first
        """
        UiaConverter.__init__(self, name=name, fromKwarg=fromKwarg, toKwarg=toKwarg)
        self._nsMap = namespaces.copyUpdate(nsMap or {})
        self._bln_success = False
        self._edu_extension_schemas = []

        ## Fill the schemas list for later use:
        for schemaPath, xPad, s_loc in mods_edu_extentions:
            # Single-argument print() with %-formatting gives the same output
            # on Python 2 and Python 3.
            print('schema init: %s %s %s' % (schemaPath, xPad, s_loc))
            try:
                self._edu_extension_schemas.append((XMLSchema(parse(join(dirname(abspath(__file__)), 'xsd/'+ schemaPath) ) ), xPad, s_loc ))
            except XMLSchemaParseError as e:
                print('XMLSchemaParseError. %s' % (e.error_log.last_error,))
                raise
    def xml_to_xsd_validation(file_xml, file_xsd):
        """ Check that an XML file complies with an XSD
        Arguments:
            1. file_xml: Input xml file
            2. file_xsd: xsd file the xml is validated against
        Return:
            True if the xml validates; False (after printing the error) on
            an XML syntax error or a failed validation
        """
        try:
            print_info("Validating:{0}".format(file_xml))
            print_info("xsd_file:{0}".format(file_xsd))
            document = parse(file_xml)
            schema = XMLSchema(parse(file_xsd))
            schema.assert_(document)
        except XMLSyntaxError as err:
            print_error("PARSING ERROR:{0}".format(err))
            return False
        except AssertionError as err:
            # assert_ raises AssertionError when validation fails
            print_error("Incorrect XML schema: {0}".format(err))
            return False
        else:
            return True
Example #33
0
    def validate(self, pmmlBinding, postValidate=True):
        """Check an in-memory PMML subtree on demand.

        By default, PMML is validated as or just after it is loaded.
        This command is meant for checking an in-memory PMML object
        after it has been changed or created by an algorithm.

        @type pmmlBinding: PmmlBinding
        @param pmmlBinding: The in-memory PMML object to check.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        """

        # The compiled schema is built once and cached on the loader.
        schema = self.preparedSchema
        if schema is None:
            schema = self.preparedSchema = XMLSchema(self.schema)

        schema.assertValid(pmmlBinding)

        if not postValidate:
            return

        walker = iterwalk(pmmlBinding,
                          events=("end", ),
                          tag="{%s}*" % defs.PMML_NAMESPACE)
        for _event, element in walker:
            if isinstance(element, PmmlBinding):
                element.postValidate()
Example #34
0
 def parse(self, filename):
     self.parser = None
     # find parser
     try:
         from lxml.etree import parse, XMLSchema
         self.logger.info('using lxml.etree parser')
         # parse XML and validate it
         tree = parse(filename)
         # get XSD
         schemaDoc = parse(XSDContents)
         schema = XMLSchema(schemaDoc)
         if schema.validate(tree):
             self.logger.info('XML validated')
             return tree
         print >> stderr,  schema.error_log
         raise ValueError('XML NOT validated: {}'.format(filename))
     except ImportError:
         try:
             from xml.etree.ElementTree import parse
             self.logger.info('using xml.etree.ElementTree parser')
             return parse(filename)
         except ImportError:
             self.logger.critical("Failed to import ElementTree from any known place")
             raise
 def __init__(self, filename, mode='r+b', writebackOnExit=True):
     '''
     Build a phonebook context manager for the given file.

     Only the filename is required; the opening mode and the write-back
     flag are optional.  When write back on exit is True the mode must
     allow writing, and the mode must always allow reading.

     The file is parsed immediately with a parser bound to the schema in
     'contacts.xsd', and every contact is cached as
     "lastname, firstname" -> number.
     '''
     self.filename = filename
     self.mode = mode
     self.writebackOnExit = writebackOnExit
     # TODO: Validate the document.
     validatingParser = XMLParser(schema=XMLSchema(parse('contacts.xsd')))
     entries = {}
     for contact in parse(filename, validatingParser).findall('contact'):
         fullName = (contact.find('name/lastname').text + ', ' +
                     contact.find('name/firstname').text)
         entries[fullName] = contact.find('number').text
     self.cache = entries
Example #36
0
    def __init__(self, sld_file=None):
        """
        Create a new SLD document. If an sld file is provided, this constructor
        will fetch the SLD schema from the internet and validate the file 
        against that schema.

        The schema is downloaded once and stored in a temporary file whose
        path is kept in C{StyledLayerDescriptor._cached_schema}; later
        instances read that file instead of downloading again. A file that
        fails validation is only reported through a logged warning.

        @type  sld_file: string
        @param sld_file: The name of a pre-existing SLD file.
        """
        super(StyledLayerDescriptor, self).__init__(None)

        if StyledLayerDescriptor._cached_schema is None:
            logging.debug('Storing new schema into cache.')

            schema_url = 'http://schemas.opengis.net/sld/1.0.0/StyledLayerDescriptor.xsd'
            # Download before creating the temporary file, so a network
            # failure does not leave an orphaned file behind.
            resp = urllib2.urlopen(schema_url)
            try:
                schema_bytes = resp.read()
            finally:
                resp.close()

            localschema = NamedTemporaryFile(delete=False)
            try:
                localschema.write(schema_bytes)
                localschema.seek(0)
                theschema = parse(localschema)
            finally:
                localschema.close()

            StyledLayerDescriptor._cached_schema = localschema.name
        else:
            logging.debug('Fetching schema from cache.')

            with open(StyledLayerDescriptor._cached_schema, 'r') as localschema:
                theschema = parse(localschema)

        self._schema = XMLSchema(theschema)

        if sld_file is not None:
            self._node = parse(sld_file)

            if not self._schema.validate(self._node):
                # logging.warn is a deprecated alias of logging.warning.
                logging.warning('SLD File "%s" does not validate against the SLD schema.', sld_file)
        else:
            self._node = Element("{%s}StyledLayerDescriptor" % SLDNode._nsmap['sld'], version="1.0.0", nsmap=SLDNode._nsmap)

        setattr(self.__class__, 'NamedLayer', SLDNode.makeproperty('sld', cls=NamedLayer,
            docstring="The named layer of the SLD."))
Example #37
0
    def validate(self, pmmlBinding, postValidate=True):
        """Validate a PMML subtree on request.

        Loading already validates PMML by default; this method is meant
        for in-memory PMML objects that an algorithm has since created or
        changed.  The compiled schema is built on first use and cached in
        C{self.preparedSchema}.

        @type pmmlBinding: PmmlBinding
        @param pmmlBinding: The in-memory PMML object to check.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        """

        compiled = self.preparedSchema
        if compiled is None:
            compiled = XMLSchema(self.schema)
            self.preparedSchema = compiled

        compiled.assertValid(pmmlBinding)

        if not postValidate:
            return

        walker = iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE)
        for _event, element in walker:
            if isinstance(element, PmmlBinding):
                element.postValidate()
Example #38
0
def validate_xml(xml_string, xsd_string):
    """Validate an XML document against an XML schema.

    Both arguments are passed straight to ``parse``.  Returns the result
    of ``XMLSchema.validate`` for the parsed document.
    """
    schema_tree = parse(xsd_string)
    validator = XMLSchema(schema_tree)
    document = parse(xml_string)
    return validator.validate(document)
    os.remove(LOG_FILENAME)

# Size limit, in bytes, of one log file before it is rotated (10 * 1024 * 1024).
MAX_LOGSIZE = 10485760
# Dedicated logger writing to LOG_FILENAME; up to 14 rotated backups are kept.
logger = logging.getLogger('ValidationLogger')
logger.setLevel(logging.INFO)
handler = logging.handlers.RotatingFileHandler(LOG_FILENAME, backupCount=14, maxBytes=MAX_LOGSIZE)
# Each record is written as "<timestamp> - <message>".
formatter = logging.Formatter("%(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Log a header describing this validation run.
logger.info('### Start validation of person, organisation and research entities in normdoc files in %s' % STORAGE_DIR)
logger.info('NOTE: only failed validations are logged!')
logger.info('Start time: %s\n' % datetime.datetime.now())

# Compile the three schemas once; 'xmlschema_doc' is only a scratch variable
# that is re-bound for each schema file.
xmlschema_doc = parse(PERSON_SCHEMA)
xmlschema_person = XMLSchema(xmlschema_doc)
xmlschema_doc = parse(ORGANISATION_SCHEMA)
xmlschema_organisation = XMLSchema(xmlschema_doc)
xmlschema_doc = parse(RESEARCH_SCHEMA)
xmlschema_research = XMLSchema(xmlschema_doc)

# Loop over api storage
for item in os.listdir(STORAGE_DIR):
    path = os.path.join(STORAGE_DIR, item)
    # Only directories whose name is listed in NOD_COLLECTIONS are processed.
    if os.path.isdir(path) and item in NOD_COLLECTIONS:

        logger.info('### Start validation of xml files in %s' % path)
        print('### Start validation of xml files in %s' % path)

        # Per-directory counters for person records, reset for each directory.
        count_person = 0
        count_person_invalid = 0
Example #40
0
def assertValid(xmlString, schemaPath):
    """Assert that ``xmlString`` validates against the schema at ``schemaPath``.

    The schema file is read and closed before validation starts.  If the
    schema's error log is non-empty after validating, an AssertionError is
    raised whose message is built by ``formatException``.
    """
    # Close the schema file deterministically instead of leaking the handle.
    with open(schemaPath) as schemaFile:
        schema = XMLSchema(parse(schemaFile))
    toValidate = parse(StringIO(xmlString))
    schema.validate(toValidate)
    if schema.error_log:
        raise AssertionError(formatException(schema, toValidate))
Example #41
0
class StyledLayerDescriptor(SLDNode):
    """
    An object representation of an SLD document.

    @prop: NamedLayer
    
        The named layer that this styling applies to.

        I{Type}: L{NamedLayer}
    """

    _cached_schema = None
    """Path of the locally cached schema file, to prevent multiple requests from occurring."""

    def __init__(self, sld_file=None):
        """
        Create a new SLD document. If an sld file is provided, this constructor
        will fetch the SLD schema from the internet and validate the file 
        against that schema.

        The schema is downloaded once and stored in a temporary file whose
        path is kept in C{_cached_schema}; later instances read that file
        instead of downloading again. A file that fails validation is only
        reported through a logged warning.

        @type  sld_file: string
        @param sld_file: The name of a pre-existing SLD file.
        """
        super(StyledLayerDescriptor, self).__init__(None)

        if StyledLayerDescriptor._cached_schema is None:
            logging.debug('Storing new schema into cache.')

            schema_url = 'http://schemas.opengis.net/sld/1.0.0/StyledLayerDescriptor.xsd'
            # Download before creating the temporary file, so a network
            # failure does not leave an orphaned file behind.
            resp = urllib2.urlopen(schema_url)
            try:
                schema_bytes = resp.read()
            finally:
                resp.close()

            localschema = NamedTemporaryFile(delete=False)
            try:
                localschema.write(schema_bytes)
                localschema.seek(0)
                theschema = parse(localschema)
            finally:
                localschema.close()

            StyledLayerDescriptor._cached_schema = localschema.name
        else:
            logging.debug('Fetching schema from cache.')

            with open(StyledLayerDescriptor._cached_schema, 'r') as localschema:
                theschema = parse(localschema)

        self._schema = XMLSchema(theschema)

        if sld_file is not None:
            self._node = parse(sld_file)

            if not self._schema.validate(self._node):
                # logging.warn is a deprecated alias of logging.warning.
                logging.warning('SLD File "%s" does not validate against the SLD schema.', sld_file)
        else:
            self._node = Element("{%s}StyledLayerDescriptor" % SLDNode._nsmap['sld'], version="1.0.0", nsmap=SLDNode._nsmap)

        setattr(self.__class__, 'NamedLayer', SLDNode.makeproperty('sld', cls=NamedLayer,
            docstring="The named layer of the SLD."))

    def __del__(self):
        """
        Destroy the StyledLayerDescriptor object, and clear its cache.
        """
        cached = StyledLayerDescriptor._cached_schema
        if cached is not None:
            logging.debug('Clearing cached schema.')

            # Forget the path first: if the removal fails, the next instance
            # downloads the schema again instead of opening a dead path.
            StyledLayerDescriptor._cached_schema = None
            try:
                os.remove(cached)
            except OSError:
                # The file is already gone or cannot be removed; a finalizer
                # has nobody to report the error to.
                logging.debug('Cached schema "%s" could not be removed.', cached)

    def __deepcopy__(self, memo):
        """
        Perform a deep copy. Instead of copying references to the schema
        object, create a new SLD, and deepcopy the SLD node.
        """
        sld = StyledLayerDescriptor()
        sld._node = copy.deepcopy(self._node)
        return sld

    def _root_element(self):
        """
        Return the root element of this document.

        C{_node} is the result of C{parse()} when the SLD was read from a
        file, but a bare C{Element} when the document was created empty.
        Only the former offers C{getroot()}, so fall back to the node itself.
        """
        getroot = getattr(self._node, 'getroot', None)
        if getroot is None:
            return self._node
        return getroot()

    def normalize(self):
        """
        Normalize this node and all child nodes prior to validation. The SLD
        is modified in place.
        """
        if self.NamedLayer is not None:
            self.NamedLayer.normalize()


    def validate(self):
        """
        Validate the current file against the SLD schema. This first normalizes
        the SLD document, then validates it. Any schema validation error messages
        are logged at the INFO level.

        @rtype: boolean
        @return: A flag indicating if the SLD is valid.
        """
        self.normalize()

        if self._node is None or self._schema is None:
            logging.debug('The node or schema is empty, and cannot be validated.')
            return False

        is_valid = self._schema.validate(self._node)

        for msg in self._schema.error_log:
            logging.info('Line:%d, Column:%d -- %s', msg.line, msg.column, msg.message)

        return is_valid


    @property
    def version(self):
        """
        Get the SLD version.
        """
        return self._root_element().get('version')

    @property
    def xmlns(self):
        """
        Get the XML Namespace.
        """
        return self._root_element().nsmap[None]

    def create_namedlayer(self, name):
        """
        Create a L{NamedLayer} in this SLD.
        
        @type  name: string
        @param name: The name of the layer.
        @rtype: L{NamedLayer}
        @return: The named layer, attached to this SLD.
        """
        namedlayer = self.get_or_create_element('sld', 'NamedLayer')
        namedlayer.Name = name
        return namedlayer

    def as_sld(self, pretty_print=False):
        """
        Serialize this SLD model into a string.

        @type  pretty_print: boolean
        @param pretty_print: Passed through to C{tostring}.
        @rtype: string
        @returns: The content of the SLD.
        """
        return tostring(self._node, pretty_print=pretty_print)
Example #42
0
#!/usr/bin/env python

from lxml.etree import XMLParser, fromstring, XMLSchema

# Read both documents as bytes and close the files right away.  Bytes let
# lxml honour an XML encoding declaration; fromstring() rejects unicode
# strings that carry one.
with open('schema.xsd', 'rb') as schema_file:
    schema_doc = schema_file.read()
with open('inst.xml', 'rb') as inst_file:
    inst_doc = inst_file.read()

# The instance document is parsed with entity resolution switched off.
parser = XMLParser(resolve_entities=False)
elt = fromstring(inst_doc, parser)
schema = XMLSchema(fromstring(schema_doc))
schema.validate(elt)
Example #43
0
class ModelLoader(object):
    """ModelLoader is a tool for unserializing or creating PMML
    models.

    A ModelLoader loader instance can be modified to support strict
    PMML compliance, extended PMML features, or optimized
    implementations of PMML elements.

    The user is encouraged to write new PmmlBinding subclasses and
    register them with a ModelLoader to modify the behavior of PMML or
    make certain functions more efficient for a given context.

    ModelLoader is the only supported way to make new PmmlBinding
    instances: any function that produces PMML must be given a
    ModelLoader.

    @type schema: lxml.etree.Element
    @param schema: Representation of the PMML schema used to interpret new models.
    @type tagToClass: dict
    @param tagToClass: Association of PMML tagnames with Python classes.
    """

    def __init__(self, baseXsdFileName="pmml-4-1.xsd", baseXsltFileName="pmml-4-1.xslt"):
        """Initialize a ModelLoader with a base XSD.

        By default, the XSD is the official 4.1 schema published by the U{Data Mining Group<http://www.dmg.org/v4-1/GeneralStructure.html>}.

        If C{baseXsdFileName} does not exist as given, it is looked up in the directory that contains this module.

        @type baseXsdFileName: string
        @param baseXsdFileName: XSD fileName, either absolute or relative to augustus-pmml-library/augustus/core
        @type baseXsltFileName: string
        @param baseXsltFileName: XSLT fileName; future placeholder for XSLT non-local validation.  Not currently used.
        """

        if not os.path.exists(baseXsdFileName):
            baseXsdFileName = os.path.join(os.path.split(__file__)[0], baseXsdFileName)
        # Close the XSD file as soon as it is parsed instead of leaking the handle.
        with open(baseXsdFileName) as xsdFile:
            self.schema = parse(xsdFile).getroot()

        # if not os.path.exists(baseXsltFileName):
        #     baseXsltFileName = os.path.join(os.path.split(__file__)[0], baseXsltFileName)
        # self.stylesheet = parse(open(baseXsltFileName)).getroot()

        # The compiled schema is built lazily, on first validation.
        self.preparedSchema = None
        self.tagToClass = {}

    def copy(self):
        """Return an independent deep copy of this ModelLoader, so that
        several lines of PMML interpretation can be built from the same
        base."""

        duplicate = copy.deepcopy(self)
        return duplicate

    def __getstate__(self):
        """Build the picklable state of the ModelLoader.

        The state is a copy of the instance dictionary in which the
        schema is replaced by its serialized form (written with
        C{defs.PICKLE_XML_COMPRESSION}) and the compiled schema is
        dropped.  It therefore carries the entire schema and the
        tag-to-class mapping.
        """

        state = dict(self.__dict__)
        serializedSchema = StringIO()
        ElementTree(state["schema"]).write(serializedSchema, compression=defs.PICKLE_XML_COMPRESSION)
        state["schema"] = serializedSchema.getvalue()
        # serializedStylesheet = StringIO()
        # ElementTree(state["stylesheet"]).write(serializedStylesheet, compression=defs.PICKLE_XML_COMPRESSION)
        # state["stylesheet"] = serializedStylesheet.getvalue()
        # The compiled schema is not stored; it is rebuilt on demand.
        state["preparedSchema"] = None
        return state

    def __setstate__(self, serialization):
        """Restore a ModelLoader from its pickled state.

        The gzip-compressed schema stored in the state is parsed back
        into an element tree, the state becomes the instance dictionary,
        and every registered class gets its C{xsd} attribute refreshed
        from the restored schema.
        """

        compressedSchema = StringIO(serialization["schema"])
        schemaTree = parse(gzip.GzipFile(fileobj=compressedSchema))
        serialization["schema"] = schemaTree.getroot()
        # serialization["stylesheet"] = parse(gzip.GzipFile(fileobj=StringIO(serialization["stylesheet"]))).getroot()
        self.__dict__ = serialization

        for tagName, boundClass in self.tagToClass.items():
            boundClass.xsd = self.xsdElement(tagName)

    def xsdElement(self, elementName):
        """Look up the xs:element declaration for a given name.

        @type elementName: string
        @param elementName: The name of the element to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if the schema has no xs:element with that name.
        @raise LookupError: If C{elementName} is defined more than once in the schema, an error is raised.
        """

        matches = self.schema.xpath("//xs:element[@name='%s']" % elementName, namespaces={"xs": defs.XSD_NAMESPACE})
        howMany = len(matches)
        if howMany > 1:
            raise LookupError("Element \"%s\" is defined %d times in this modelLoader's schema" % (elementName, howMany))
        if howMany == 1:
            return matches[0]
        return None

    def xsdGroup(self, groupName):
        """Look up the xs:group definition for a given name.

        @type groupName: string
        @param groupName: The name of the group to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if the schema has no xs:group with that name.
        @raise LookupError: If C{groupName} is defined more than once in the schema, an error is raised.
        """

        found = self.schema.xpath("//xs:group[@name='%s']" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        count = len(found)
        if count > 1:
            raise LookupError("Group \"%s\" is defined %d times in this modelLoader's schema" % (groupName, count))
        if count == 1:
            return found[0]
        return None

    def xsdRemove(self, oldName):
        """Remove an arbitrary object from the ModelLoader's XSD schema.

        Every schema node whose C{name} attribute equals C{oldName} is
        detached from its parent.  The compiled schema is discarded so
        that the next validation is made against the modified schema.

        @type oldName: string
        @param oldName: Name of the object to be removed.
        """

        for result in self.schema.xpath("//*[@name='%s']" % oldName, namespaces={"xs": defs.XSD_NAMESPACE}):
            parent = result.getparent()
            index = parent.index(result)
            del parent[index]

        # Like the other schema mutators, drop the compiled schema; otherwise
        # validation would keep using the schema as it was before the removal.
        self.preparedSchema = None

    def xsdAppend(self, newXsd):
        """Add an arbitrary object at the end of the ModelLoader's XSD schema.

        A string is parsed into an element first.  The compiled schema is
        discarded so that it is rebuilt from the extended schema.

        @type newXsd: string or lxml.etree.Element
        @param newXsd: New XSD object to append.
        """

        addition = fromstring(newXsd) if isinstance(newXsd, basestring) else newXsd
        self.schema.append(addition)
        self.preparedSchema = None

    def register(self, tag, cls):
        """Define (or redefine) the class that is instantiated for a
        given tagname.

        If the class has an C{xsd} and/or C{xsdAppend} string as a
        class attribute, this method will replace the ModelLoader's
        schema entry for C{tag} with the version defined by the class.

        If the class does not have an C{xsd} attribute, this method
        attaches the ModelLoader's schema entry for C{tag} to the class.

        As a result, the class will always end up with a C{xsd} class
        attribute representing its XSD schema.  This schema fragment is
        expressed as a lxml.etree.Element for programmatic use.

        If the class has an C{xsdRemove} attribute, every name listed
        in it is removed from the ModelLoader's schema.

        The currently-registered classes are in the ModelLoader's
        C{tagToClass} dictionary.

        @type tag: string
        @param tag: The tagname to define or redefine.
        @type cls: PmmlBinding subclass
        @param cls: The class to associate with C{tag}.
        @raise ValueError: If C{cls.xsd} does not contain exactly one xs:element named C{tag}.
        """

        # Current schema entry for this tag (None if the schema has none).
        oldXsdElement = self.xsdElement(tag)

        if cls.xsd is not None:
            # The class brings its own XSD, as an XML string or as an element.
            if isinstance(cls.xsd, basestring):
                clsxsd = fromstring(cls.xsd)
            else:
                clsxsd = cls.xsd

            newXsdElements = clsxsd.xpath("//xs:element[@name='%s']" % tag, namespaces={"xs": defs.XSD_NAMESPACE})
            if len(newXsdElements) != 1:
                raise ValueError("Class %s has an xsd member but %d definitions of element \"%s\"" % (cls.__name__, len(newXsdElements), tag))
            else:
                newXsdElement = newXsdElements[0]

            if oldXsdElement is None:
                self.xsdAppend(newXsdElement)

            else:
                # Swap the old entry for the new one at the same position.
                parent = oldXsdElement.getparent()
                index = parent.index(oldXsdElement)
                del parent[index]
                parent.insert(index, newXsdElement)

            # The class keeps its own copy of the element placed in the schema.
            cls.xsd = copy.deepcopy(newXsdElement)

        else:
            # No XSD on the class: give it a copy of the schema's entry.
            cls.xsd = copy.deepcopy(oldXsdElement)

        if cls.xsdRemove is not None:
            for name in cls.xsdRemove:
                self.xsdRemove(name)

        if cls.xsdAppend is not None:
            # Index the named direct children of the schema root, so that a
            # same-named addition replaces the existing definition.
            preexisting = {}
            for elem in self.schema:
                name = elem.get("name")
                if name is not None:
                    preexisting[name] = elem

            for newXsd in cls.xsdAppend:
                if isinstance(newXsd, basestring):
                    newXsd = fromstring(newXsd)

                name = newXsd.get("name")
                if name in preexisting:
                    parent = preexisting[name].getparent()
                    index = parent.index(preexisting[name])
                    del parent[index]
                    
                self.xsdAppend(newXsd)

        # The schema may have changed: force recompilation on next use.
        self.preparedSchema = None
        self.tagToClass[tag] = cls

    def xsdAddToGroupChoice(self, groupName, newElementNames):
        """Extend the xs:choice block of an xs:group with element references.

        The compiled schema is discarded afterwards so that it is rebuilt
        from the modified schema.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newElementNames: list of strings or a single string
        @param newElementNames: References to the xs:elements to add to the xs:choice block.
        @raise LookupError: If the schema does not contain exactly one xs:choice block for C{groupName}.
        """

        choices = self.schema.xpath("//xs:group[@name='%s']/xs:choice" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(choices) != 1:
            raise LookupError("Group \"%s\" is defined with a choice block %d times in this modelLoader's schema" % (groupName, len(choices)))
        choiceBlock = choices[0]

        E = ElementMaker(namespace=defs.XSD_NAMESPACE, nsmap={"xs": defs.XSD_NAMESPACE})

        # Treat a single name like a one-item list.
        if isinstance(newElementNames, basestring):
            references = [newElementNames]
        else:
            references = newElementNames

        for reference in references:
            choiceBlock.append(E.element(ref=reference))

        self.preparedSchema = None

    def xsdReplaceGroup(self, groupName, newXsd):
        """Swap an xs:group of this ModelLoader's schema for a new definition.

        If the schema has no group of that name yet, the new definition
        is appended instead.  The compiled schema is discarded afterwards.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newXsd: string or lxml.etree.Element
        @param newXsd: The new XSD represented as an XML string or an lxml.etree.Element; it must contain an xs:group named C{groupName}.
        @raise ValueError: If C{newXsd} does not contain exactly one xs:group named C{groupName}.
        """

        existingGroup = self.xsdGroup(groupName)

        replacementSource = fromstring(newXsd) if isinstance(newXsd, basestring) else newXsd

        candidates = replacementSource.xpath("//xs:group[@name='%s']" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(candidates) != 1:
            raise ValueError("newXsd has %d definitions of group \"%s\"" % (len(candidates), groupName))
        newGroup = candidates[0]

        if existingGroup is None:
            self.xsdAppend(newGroup)
        else:
            # Put the new group where the old one used to be.
            container = existingGroup.getparent()
            position = container.index(existingGroup)
            del container[position]
            container.insert(position, newGroup)

        self.preparedSchema = None

    def elementMaker(self, prefix=None, **parserOptions):
        """Build a factory for creating in-memory PMML objects.

        The factory is an lxml ElementMaker bound to the PMML namespace
        and to this ModelLoader's current tag-to-class relationship.
        Every PmmlBinding it creates gets its C{modelLoader} attribute
        set to this ModelLoader.

        @type prefix: string or None
        @param prefix: A prefix for the PMML namespace.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: ElementMaker
        @return: The ElementMaker factory.
        @see: The lxml U{ElementMaker documentation<http://lxml.de/api/lxml.builder.ElementMaker-class.html>}, which explains how to use an ElementMaker factory.
        """

        loader = self

        class XmlParser(XMLParser):
            def makeelement(parserSelf, *args, **kwds):
                # Tag each new PmmlBinding with the loader that produced it.
                made = XMLParser.makeelement(parserSelf, *args, **kwds)
                if isinstance(made, PmmlBinding):
                    made.modelLoader = loader
                return made

        parser = XmlParser(**parserOptions)
        classLookup = ElementNamespaceClassLookup()
        pmmlClasses = classLookup.get_namespace(defs.PMML_NAMESPACE)
        # Top-level schema elements default to PmmlBinding; registered classes override.
        for declaration in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            pmmlClasses[declaration.attrib["name"]] = PmmlBinding
        pmmlClasses.update(self.tagToClass)
        parser.set_element_class_lookup(classLookup)

        return ElementMaker(namespace=defs.PMML_NAMESPACE, nsmap={prefix: defs.PMML_NAMESPACE}, makeelement=parser.makeelement)

    def validate(self, pmmlBinding, postValidate=True):
        """Validate an in-memory PMML subtree on demand.

        PMML is validated by default as (or just after) it is loaded, so
        this method is intended for objects that an algorithm has changed
        or created afterwards.  The schema is compiled the first time it
        is needed and cached in C{self.preparedSchema}.

        @type pmmlBinding: PmmlBinding
        @param pmmlBinding: The in-memory PMML object to check.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        """

        prepared = self.preparedSchema
        if prepared is None:
            prepared = XMLSchema(self.schema)
            self.preparedSchema = prepared

        prepared.assertValid(pmmlBinding)

        if not postValidate:
            return

        # Post-validate each PMML-namespace element once it has been fully walked.
        for _event, candidate in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
            if isinstance(candidate, PmmlBinding):
                candidate.postValidate()

    # def validateXslt(self, pmmlBinding):
    #     xslt = XSLT(self.stylesheet)
    #     return xslt(pmmlBinding)

    def look(self, tag=None, showXsd=True, showSource=False, stream=None):
        """Print a human-readable view of how this ModelLoader currently
        interprets PMML; meant for interactive use.

        Without a C{tag}, the names of the schema's top-level elements
        are listed four per line, with names that have a registered
        class shown in square brackets.

        @type tag: string or None
        @param tag: If a string, look up information about this tag; if None, display all tags in the tag-to-class dictionary.
        @type showXsd: bool
        @param showXsd: If True, show the XSD that defines a valid C{tag}.
        @type showSource: bool
        @param showSource: If True, show the Python source code that implements C{tag}.
        @type stream: file-like object or None
        @param stream: If None, print to C{sys.stdout}; otherwise, write to the specified stream.
        @rtype: None
        @return: None; human-readable output is written to the console or a specified stream.
        """

        if stream is None:
            stream = sys.stdout

        if tag is None:
            names = sorted(self.schema.xpath("xs:element/@name", namespaces={"xs": defs.XSD_NAMESPACE}))
            # One output line per group of up to four names.
            for rowStart in xrange(0, len(names), 4):
                for name in names[rowStart:rowStart + 4]:
                    if name in self.tagToClass:
                        cell = "[%s]" % name
                    else:
                        cell = name
                    stream.write("%-25s " % cell)

                stream.write(os.linesep)

        else:
            xsd = None
            if showXsd:
                try:
                    xsd = self.xsdElement(tag)
                except LookupError:
                    # Fall back to a group lookup; give up quietly if that fails too.
                    try:
                        xsd = self.xsdGroup(tag)
                    except LookupError:
                        pass

                if xsd is not None:
                    stream.write(tostring(xsd, pretty_print=True))

            if showSource:
                implementation = self.tagToClass.get(tag)
                if implementation is not None:
                    # Separate the source listing from the XSD printed above.
                    if xsd is not None:
                        stream.write(os.linesep)
                    stream.write(inspect.getsource(implementation))

        stream.flush()

    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model from an XML string, fileName, URI, or
        file-like object.

        A string that starts with the gzip magic bytes is decompressed,
        and a string containing "<" is treated as XML text; anything else
        is handed to the parser unchanged.

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        """

        if isinstance(data, basestring):
            looksGzipped = len(data) >= 2 and data[0:2] == "\x1f\x8b"
            if looksGzipped:
                data = gzip.GzipFile(fileobj=StringIO(data))
            elif "<" in data:
                data = StringIO(data)

        schema = None
        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema

        # Caller-supplied options take precedence over these defaults.
        effectiveOptions = {"schema": schema, "huge_tree": True}
        effectiveOptions.update(parserOptions)

        parser = XMLParser(**effectiveOptions)
        classLookup = ElementNamespaceClassLookup()
        pmmlClasses = classLookup.get_namespace(defs.PMML_NAMESPACE)
        for declaration in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            pmmlClasses[declaration.attrib["name"]] = PmmlBinding
        pmmlClasses.update(self.tagToClass)
        parser.set_element_class_lookup(classLookup)

        # ElementNamespaceClassLookup don't work with iterparse, so we have to parse all at once and then iterwalk
        pmmlBinding = parse(data, parser).getroot()
        pmmlBinding.modelLoader = self

        if postValidate:
            walker = iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE)
            for _event, node in walker:
                if isinstance(node, PmmlBinding):
                    node.postValidate()

        return pmmlBinding
    
    def _loadJsonItem(self, tag, data, parser, nsmap):
        """Helper function for C{loadJson}; not for public use."""

        if tag.find(":") == -1:
            prefix = None
        else:
            prefix, tag = tag.split(":")

        pretag = nsmap.get(prefix)
        if pretag is None:
            raise ValueError("This document contains a prefix (\"%s\") not found in the namespace (%r)" % (prefix, nsmap))

        attrib = dict((x[1:], data[x]) for x in data if x.startswith("@"))
        childMap = dict((x, data[x]) for x in data if not x.startswith("@") and not x.startswith("#"))

        item = parser.makeelement("{%s}%s" % (pretag, tag), attrib=attrib, nsmap=nsmap)

        children = {}
        for subtag, childList in childMap.items():
            for childItem in childList:
                number = childItem.get("#")
                if number is None:
                    raise ValueError("Subtag \"%s\" has no \"#\"" % subtag)

                children[number] = self._loadJsonItem(subtag, childItem, parser, nsmap)

        for number in xrange(len(children)):
            child = children.get(number)
            if child is not None:
                item.append(child)

        text = data.get("#text")
        if text is not None:
            item.text = text

        tail = data.get("#tail")
        if tail is not None:
            item.tail = tail

        return item

    def loadJson(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as a JSON string, fileName,
        dict, or file-like object.

        There is no standard XML-to-JSON specification, so we define
        our own.  Our specification is very similar to U{this
        proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>},
        which collects subelements of different tagnames into
        different JSON lists, rather than having one long list and
        needing to specify the tag of each element in that list.  This
        has the following advantages, particularly useful for PMML:
          - Frequent tagnames (like <Segment>) are not repeated,
            wasting space.
          - Subelements with a given tagname can be quickly queried,
            without having to iterate over a list that contains
            non-matching tagnames.
        It has the following disadvantages:
          - The relative order of subelements with different tagnames
            is not preserved.
        We therefore additionally include a JSON attribute named "#"
        to specify the ordering of subelements in the XML
        representation.  Also, the specification referenced above
        represents single-child subelements as JSON objects and
        multiple children as JSON lists, but for consistency and ease
        of parsing, we always use lists.  The last difference is that
        we include "#tail" as well as "#text", so that text outside of
        an element is preserved (rarely relevant for PMML, but
        included for completeness).

        The expected top-level layout is a mapping with exactly two
        keys: "#nsmap" (prefix-to-namespace mapping, where the empty
        string stands for the default namespace) and the tag of the
        root element, whose value is a one-item list.

        How C{data} is interpreted:
          - An object with a C{read} attribute is parsed with
            C{json.load}.
          - A string naming an existing path is opened and parsed as a
            JSON file; any other string is parsed as JSON text.
          - Anything else is used as-is and must already be a
            JSON-like dictionary.  A dictionary passed by the caller
            is not modified.

        @type data: string, dict, or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.  If C{validate} is True, whatever the schema's C{assertValid} raises for an invalid document propagates unchanged.
        """

        if hasattr(data, "read"):
            data = json.load(data)
        elif isinstance(data, basestring):
            if os.path.exists(data):
                # Close the file deterministically rather than leaking the handle.
                with open(data) as jsonFile:
                    data = json.load(jsonFile)
            else:
                data = json.loads(data)

        if not isinstance(data, dict):
            raise ValueError("JSON object must be a mapping at the top level")

        # Compile the XSD lazily and cache it on the loader for later calls.
        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema
        else:
            schema = None

        # Build a parser whose elements in the PMML namespace are created as
        # PmmlBinding, or as the more specific classes in self.tagToClass.
        parser = XMLParser(**parserOptions)
        lookup = ElementNamespaceClassLookup()
        namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        for xsdElement in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            namespace[xsdElement.attrib["name"]] = PmmlBinding
        namespace.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        try:
            nsmap = data["#nsmap"]
        except KeyError:
            raise ValueError("JSON object must have a \"#nsmap\" key at the top level")

        if not isinstance(nsmap, dict):
            raise ValueError("The \"#nsmap\" value must be a mapping of prefixes to namespaces")

        # Work on shallow copies so that a dictionary supplied by the caller
        # is left untouched and can be loaded again.
        nsmap = dict(nsmap)
        if "" in nsmap:
            # JSON keys cannot be None; "" stands for the default namespace.
            nsmap[None] = nsmap.pop("")
        data = dict((key, value) for key, value in data.items() if key != "#nsmap")

        if len(data) != 1:
            raise ValueError("JSON object must have exactly one PMML object at the top level")

        tag = next(iter(data))
        rootItems = data[tag]
        if not isinstance(rootItems, list) or len(rootItems) != 1:
            raise ValueError("Top-level PMML object must be a list with exactly one item")

        pmmlBinding = self._loadJsonItem(tag, rootItems[0], parser, nsmap)

        if validate:
            schema.assertValid(pmmlBinding)

        if postValidate:
            for event, elem in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
                if isinstance(elem, PmmlBinding):
                    elem.postValidate()

        return pmmlBinding
Example #44
0
    def loadJson(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as a JSON string, fileName,
        dict, or file-like object.

        There is no standard XML-to-JSON specification, so we define
        our own.  Our specification is very similar to U{this
        proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>},
        which collects subelements of different tagnames into
        different JSON lists, rather than having one long list and
        needing to specify the tag of each element in that list.  This
        has the following advantages, particularly useful for PMML:
          - Frequent tagnames (like <Segment>) are not repeated,
            wasting space.
          - Subelements with a given tagname can be quickly queried,
            without having to iterate over a list that contains
            non-matching tagnames.
        It has the following disadvantages:
          - The relative order of subelements with different tagnames
            is not preserved.
        We therefore additionally include a JSON attribute named "#"
        to specify the ordering of subelements in the XML
        representation.  Also, the specification referenced above
        represents single-child subelements as JSON objects and
        multiple children as JSON lists, but for consistency and ease
        of parsing, we always use lists.  The last difference is that
        we include "#tail" as well as "#text", so that text outside of
        an element is preserved (rarely relevant for PMML, but
        included for completeness).

        The expected top-level layout is a mapping with exactly two
        keys: "#nsmap" (prefix-to-namespace mapping, where the empty
        string stands for the default namespace) and the tag of the
        root element, whose value is a one-item list.

        How C{data} is interpreted:
          - An object with a C{read} attribute is parsed with
            C{json.load}.
          - A string naming an existing path is opened and parsed as a
            JSON file; any other string is parsed as JSON text.
          - Anything else is used as-is and must already be a
            JSON-like dictionary.  A dictionary passed by the caller
            is not modified.

        @type data: string, dict, or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.  If C{validate} is True, whatever the schema's C{assertValid} raises for an invalid document propagates unchanged.
        """

        if hasattr(data, "read"):
            data = json.load(data)
        elif isinstance(data, basestring):
            if os.path.exists(data):
                # Close the file deterministically rather than leaking the handle.
                with open(data) as jsonFile:
                    data = json.load(jsonFile)
            else:
                data = json.loads(data)

        if not isinstance(data, dict):
            raise ValueError("JSON object must be a mapping at the top level")

        # Compile the XSD lazily and cache it on the loader for later calls.
        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema
        else:
            schema = None

        # Build a parser whose elements in the PMML namespace are created as
        # PmmlBinding, or as the more specific classes in self.tagToClass.
        parser = XMLParser(**parserOptions)
        lookup = ElementNamespaceClassLookup()
        namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        for xsdElement in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            namespace[xsdElement.attrib["name"]] = PmmlBinding
        namespace.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        try:
            nsmap = data["#nsmap"]
        except KeyError:
            raise ValueError("JSON object must have a \"#nsmap\" key at the top level")

        if not isinstance(nsmap, dict):
            raise ValueError("The \"#nsmap\" value must be a mapping of prefixes to namespaces")

        # Work on shallow copies so that a dictionary supplied by the caller
        # is left untouched and can be loaded again.
        nsmap = dict(nsmap)
        if "" in nsmap:
            # JSON keys cannot be None; "" stands for the default namespace.
            nsmap[None] = nsmap.pop("")
        data = dict((key, value) for key, value in data.items() if key != "#nsmap")

        if len(data) != 1:
            raise ValueError("JSON object must have exactly one PMML object at the top level")

        tag = next(iter(data))
        rootItems = data[tag]
        if not isinstance(rootItems, list) or len(rootItems) != 1:
            raise ValueError("Top-level PMML object must be a list with exactly one item")

        pmmlBinding = self._loadJsonItem(tag, rootItems[0], parser, nsmap)

        if validate:
            schema.assertValid(pmmlBinding)

        if postValidate:
            for event, elem in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
                if isinstance(elem, PmmlBinding):
                    elem.postValidate()

        return pmmlBinding