Example #1
0
    def validate(self, message: Element) -> None:
        """Check ``message`` against this object's XSD.

        The schema file named by ``self.xmlschema_definition`` is resolved
        relative to the directory that contains the ``schema`` module.
        ``assertValid`` is what reports a non-conforming message
        (presumably by raising lxml's ``DocumentInvalid`` -- confirm that
        ``XMLSchema`` here is lxml's).
        """
        schema_dir = os.path.dirname(schema.__file__)
        xsd_path = os.path.join(schema_dir, self.xmlschema_definition)
        with open(xsd_path) as xsd_file:
            # Compile the schema while the file is still open.
            xsd_tree = parse(xsd_file)
            validator = XMLSchema(xsd_tree)

        validator.assertValid(message)
def validate_xml_doc(xml_schema: XmlSchema, xml_doc: XmlElement) -> None:
    """
    Validate ``xml_doc`` against XML schema ``xml_schema``.

    :raises XmlSchemaDocValidationError: if ``xml_doc`` does not validate
        against ``xml_schema``

    """
    # An 'xml_schema' offers several validation entry points. They differ in
    #   how the outcome is reported (validation passes / validation fails):
    #   - xml_schema.assert_(xml_doc): nothing / raises 'AssertionError'
    #   - xml_schema.assertValid(xml_doc): nothing / raises 'DocumentInvalid'
    #   - xml_schema.validate(xml_doc): returns True / returns False

    try:
        xml_schema.assertValid(xml_doc)
    except lxml.etree.DocumentInvalid as invalid_doc_exc:
        # note: 'invalid_doc_exc.error_log' and 'xml_schema.error_log' are the
        #   same object (type 'lxml.etree._ListErrorLog').

        # TODO: advanced error details parsing, without leaking too much information.
        # xml_error_log = invalid_doc_exc.error_log  # type: lxml.etree._ListErrorLog
        # last_xml_error = invalid_doc_exc.error_log.last_error  # type: lxml.etree._LogEntry
        # last_xml_error_xml_doc_line = last_xml_error.line

        # TODO: does 'xml_schema.error_log' persist? is it necessary to clear it afterwards?
        #   `xml_schema._clear_error_log()`

        # The exception's string form is the simplest and safest source for
        # the error message. Example:
        #   "Element 'DTE': No matching global declaration available for the validation root., line 2"  # noqa: E501
        raise XmlSchemaDocValidationError(str(invalid_doc_exc)) from invalid_doc_exc
Example #3
0
    def test_centre(self):
        """The centre REST endpoint answers 200 with XML valid per CenterProfile.xsd."""
        reply = Client().get('/restxml/1/', secure=True)
        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply['Content-Type'], 'application/xml')

        # The XSD is loaded through resource_string, relative to this module.
        xsd_bytes = resource_string(__name__, join('data', 'CenterProfile.xsd'))
        validator = XMLSchema(fromstring(xsd_bytes))

        try:
            validator.assertValid(fromstring(reply.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Print the traceback so the parse/validation problem is visible,
            # then report a plain test failure.
            print_exc()
            self.fail()
Example #4
0
    def test_centre(self):
        """The centre REST endpoint answers 200 with XML valid per CenterProfile.xsd."""
        reply = Client().get('/restxml/1/', secure=True)
        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply['Content-Type'], 'application/xml')

        # The XSD is loaded through resource_string, relative to this module.
        xsd_bytes = resource_string(__name__, join('data', 'CenterProfile.xsd'))
        validator = XMLSchema(fromstring(xsd_bytes))

        try:
            validator.assertValid(fromstring(reply.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Print the traceback so the parse/validation problem is visible,
            # then report a plain test failure.
            print_exc()
            self.fail()
Example #5
0
def validate():
    """Check every schema in ``SCHEMAS`` against its example documents.

    For each schema name ``s`` the XSD at ``<s>/<s>.xsd`` is compiled, then:

    * every "valid" example must pass validation; the first one that does
      not is re-checked with ``assertValid`` so that the validation error
      propagates to the caller;
    * every "invalid" example must fail validation; the first one that
      passes raises ``Exception``.

    The ``question`` schema keeps both kinds of example in ``examples/``
    (``*_new.question.xml`` are the valid ones, ``*_current.question.xml``
    the invalid ones); every other schema uses ``examples/`` and
    ``invalid_examples/``.

    :raises Exception: if an example that should be invalid validates.
    """
    for s in SCHEMAS:
        schema = XMLSchema(parse("{}/{}.xsd".format(s, s)))

        if s == "question":
            valid_pattern = "{}/examples/*_new.question.xml"
            invalid_pattern = "{}/examples/*_current.question.xml"
        else:
            valid_pattern = "{}/examples/*.xml"
            invalid_pattern = "{}/invalid_examples/*.xml"

        logging.info(
            "Checking that all valid examples pass when validated against the XSD file."
        )

        for fp in glob.glob(valid_pattern.format(s)):
            xml_document = parse(fp)

            if schema.validate(xml_document):
                logging.info(" - %s passes.", fp)
            else:
                logging.info(" - %s does not pass.", fp)

                # validate() only gives a boolean; assertValid() raises with
                # the validation details attached.
                schema.assertValid(xml_document)

        logging.info(
            "Checking that all invalid examples fail when validated against the XSD file."
        )

        for fp in glob.glob(invalid_pattern.format(s)):
            xml_document = parse(fp)

            if not schema.validate(xml_document):
                logging.info(" - %s fails.", fp)
            else:
                logging.info(" - %s does not fail.", fp)

                raise Exception(
                    "{} should fail validation, but doesn't.".format(fp))
Example #6
0
    def test_centres_kml(self):
        """The KML endpoint answers 200 with a document valid per the KML 2.2 schema."""
        reply = Client().get('/api/KML/', secure=True)
        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply['Content-Type'],
                         'application/vnd.google-earth.kml+xml')

        # The KML XSD has to be fetched over HTTP: it imports two other XSDs
        # by relative name, which does not work well with Python package
        # resources, since those should not be tied to an absolute location.
        validator = XMLSchema(parse("http://www.opengis.net/kml/2.2"))

        try:
            validator.assertValid(fromstring(reply.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Print the traceback so the parse/validation problem is visible,
            # then report a plain test failure.
            print_exc()
            self.fail()
Example #7
0
    def test_centres_kml(self):
        """The KML endpoint answers 200 with a document valid per the KML 2.2 schema."""
        reply = Client().get('/api/KML/', secure=True)
        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply['Content-Type'],
                         'application/vnd.google-earth.kml+xml')

        # The KML XSD has to be fetched over HTTP: it imports two other XSDs
        # by relative name, which does not work well with Python package
        # resources, since those should not be tied to an absolute location.
        validator = XMLSchema(parse("http://www.opengis.net/kml/2.2"))

        try:
            validator.assertValid(fromstring(reply.content))
        except (XMLSyntaxError, DocumentInvalid):
            # Print the traceback so the parse/validation problem is visible,
            # then report a plain test failure.
            print_exc()
            self.fail()
Example #8
0
class TestScreenEventDetector(unittest.TestCase):
    """Tests the ability of the ScreenEventDetector class to produce events, and valid XML output.

    """
    def setUp(self):
        """Load the two fixture pages, the XML schema, and fresh detector collaborators."""
        image_pathnames = (
            FIRST_PAGE_IMAGE_PATHNAME,
            SECOND_PAGE_IMAGE_PATHNAME,
        )
        self.document = ImageFileDocument(image_pathnames)
        page_iterator = iter(self.document)
        self.first_page = next(page_iterator)
        self.second_page = next(page_iterator)

        self.xml_schema = XMLSchema(file=XML_SCHEMA_PATHNAME)
        self.quadrangle_tracker = RTreeDequeConvexQuadrangleTracker()
        self.screen_detector = ScreenEventDetectorScreenDetector()
        self.page_detector = ScreenEventDetectorPageDetector()

    def _make_detector(self, quadrangles, pages):
        """Return a ScreenEventDetector over a video built from the given quadrangles and pages."""
        video = ScreenEventDetectorVideo(
            fps=VIDEO_FPS,
            width=VIDEO_WIDTH,
            height=VIDEO_HEIGHT,
            datetime=VIDEO_DATETIME,
            quadrangles=quadrangles,
            pages=pages,
        )
        return ScreenEventDetector(
            video,
            self.quadrangle_tracker,
            self.screen_detector,
            self.page_detector,
        )

    def _assert_valid_xml_output(self, detector):
        """Serialize the detector with write_xml and validate the result against the XML schema."""
        f = BytesIO()
        with xmlfile(f) as xf:
            detector.write_xml(xf)
        # Rewind so the parser reads what was just written.
        f.seek(0)
        xml_document = etree.parse(f)
        self.xml_schema.assertValid(xml_document)

    def test_empty(self):
        """A video without quadrangles or pages yields no events but still valid XML."""
        detector = self._make_detector(quadrangles=(), pages=())
        screen_events = list(detector)
        self.assertEqual(0, len(screen_events))

        self._assert_valid_xml_output(detector)

    def test_nonempty(self):
        """A four-quadrangle video yields the expected five events and valid XML."""
        detector = self._make_detector(
            quadrangles=(
                FIRST_COORDINATES,
                FIRST_COORDINATES,
                SECOND_COORDINATES,
                SECOND_COORDINATES,
            ),
            pages=(
                self.first_page,
                self.second_page,
                self.second_page,
                self.first_page,
            ),
        )
        screen_events = list(detector)
        self.assertEqual(5, len(screen_events))
        screen_event_iterator = iter(screen_events)

        # Event 1: the screen appears showing the first page.
        screen_event = next(screen_event_iterator)
        self.assertIsInstance(screen_event, ScreenAppearedEvent)
        # All later events must refer to this same screen.
        screen_id = screen_event.screen_id
        self.assertEqual(1, screen_event.frame.number)
        self.assertEqual(FIRST_COORDINATES, screen_event.screen.coordinates)
        self.assertEqual(self.first_page, screen_event.page)

        # Event 2: the content changes to the second page.
        screen_event = next(screen_event_iterator)
        self.assertIsInstance(screen_event, ScreenChangedContentEvent)
        self.assertEqual(screen_id, screen_event.screen_id)
        self.assertEqual(2, screen_event.frame.number)
        self.assertEqual(FIRST_COORDINATES, screen_event.screen.coordinates)
        self.assertEqual(self.second_page, screen_event.page)

        # Event 3: the screen moves to the second coordinates.
        screen_event = next(screen_event_iterator)
        self.assertIsInstance(screen_event, ScreenMovedEvent)
        self.assertEqual(screen_id, screen_event.screen_id)
        self.assertEqual(3, screen_event.frame.number)
        self.assertEqual(SECOND_COORDINATES, screen_event.screen.coordinates)

        # Event 4: the content changes back to the first page.
        screen_event = next(screen_event_iterator)
        self.assertIsInstance(screen_event, ScreenChangedContentEvent)
        self.assertEqual(screen_id, screen_event.screen_id)
        self.assertEqual(4, screen_event.frame.number)
        self.assertEqual(SECOND_COORDINATES, screen_event.screen.coordinates)
        self.assertEqual(self.first_page, screen_event.page)

        # Event 5: the screen disappears.
        screen_event = next(screen_event_iterator)
        self.assertIsInstance(screen_event, ScreenDisappearedEvent)
        self.assertEqual(screen_id, screen_event.screen_id)
        self.assertEqual(5, screen_event.frame.number)
        self.assertEqual(SECOND_COORDINATES, screen_event.screen.coordinates)

        self._assert_valid_xml_output(detector)
Example #9
0
class ModelLoader(object):
    """ModelLoader is a tool for unserializing or creating PMML
    models.

    A ModelLoader loader instance can be modified to support strict
    PMML compliance, extended PMML features, or optimized
    implementations of PMML elements.

    The user is encouraged to write new PmmlBinding subclasses and
    register them with a ModelLoader to modify the behavior of PMML or
    make certain functions more efficient for a given context.

    ModelLoader is the only supported way to make new PmmlBinding
    instances: any function that produces PMML must be given a
    ModelLoader.

    @type schema: lxml.etree.Element
    @param schema: Representation of the PMML schema used to interpret new models.
    @type tagToClass: dict
    @param tagToClass: Association of PMML tagnames with Python classes.
    """
    def __init__(self,
                 baseXsdFileName="pmml-4-1.xsd",
                 baseXsltFileName="pmml-4-1.xslt"):
        """Initialize a ModelLoader with a base XSD.

        By default, the XSD is the official 4.1 schema published by the U{Data Mining Group<http://www.dmg.org/v4-1/GeneralStructure.html>}.

        The parsed schema root is stored in C{self.schema}; the compiled
        schema cache (C{self.preparedSchema}) starts empty and the
        tag-to-class mapping (C{self.tagToClass}) starts as an empty dict.

        @type baseXsdFileName: string
        @param baseXsdFileName: XSD fileName, either absolute or relative to augustus-pmml-library/augustus/core
        @type baseXsltFileName: string
        @param baseXsltFileName: XSLT fileName; future placeholder for XSLT non-local validation.  Not currently used.
        """

        if not os.path.exists(baseXsdFileName):
            # Not found as given: look next to this module instead.
            baseXsdFileName = os.path.join(
                os.path.split(__file__)[0], baseXsdFileName)

        # Parse inside a "with" block so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(baseXsdFileName) as xsdFile:
            self.schema = parse(xsdFile).getroot()

        # if not os.path.exists(baseXsltFileName):
        #     baseXsltFileName = os.path.join(os.path.split(__file__)[0], baseXsltFileName)
        # self.stylesheet = parse(open(baseXsltFileName)).getroot()

        self.preparedSchema = None
        self.tagToClass = {}

    def copy(self):
        """Return an independent deep copy of this ModelLoader.

        This lets several differently customized lines of PMML
        interpretation be derived from one common base loader.
        """

        duplicate = copy.deepcopy(self)
        return duplicate

    def __getstate__(self):
        """Build the picklable state of the ModelLoader.

        The state is a shallow copy of the instance dictionary in which
        the schema tree is replaced by its serialized form (written with
        C{defs.PICKLE_XML_COMPRESSION}) and the prepared schema is reset
        to None.  The tag-to-class mapping is carried along unchanged.
        """

        state = dict(self.__dict__)

        xmlBuffer = StringIO()
        schemaTree = ElementTree(state["schema"])
        schemaTree.write(xmlBuffer, compression=defs.PICKLE_XML_COMPRESSION)
        state["schema"] = xmlBuffer.getvalue()

        # buff = StringIO()
        # ElementTree(serialization["stylesheet"]).write(buff, compression=defs.PICKLE_XML_COMPRESSION)
        # serialization["stylesheet"] = buff.getvalue()

        # The prepared schema is never stored; validate/loadXml rebuild it
        # on demand when it is None.
        state["preparedSchema"] = None
        return state

    def __setstate__(self, serialization):
        """Restore a ModelLoader from its pickled state.

        The serialized schema is parsed back into an element tree, the
        state dictionary becomes the instance dictionary, and every
        registered class gets its C{xsd} attribute re-attached from the
        restored schema.

        NOTE(review): the schema bytes are always read through
        C{gzip.GzipFile}, so this presumably relies on
        C{defs.PICKLE_XML_COMPRESSION} producing gzip output -- confirm.
        """

        compressedSchema = StringIO(serialization["schema"])
        schemaTree = parse(gzip.GzipFile(fileobj=compressedSchema))
        serialization["schema"] = schemaTree.getroot()
        # serialization["stylesheet"] = parse(gzip.GzipFile(fileobj=StringIO(serialization["stylesheet"]))).getroot()
        self.__dict__ = serialization

        for tagName, boundClass in self.tagToClass.items():
            boundClass.xsd = self.xsdElement(tagName)

    def xsdElement(self, elementName):
        """Look up the xs:element definition with a given name.

        @type elementName: string
        @param elementName: The name of the element to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if the schema defines no xs:element named C{elementName}.
        @raise LookupError: If C{elementName} is defined more than once in the schema, an error is raised.
        """

        query = "//xs:element[@name='%s']" % elementName
        matches = self.schema.xpath(query,
                                    namespaces={"xs": defs.XSD_NAMESPACE})
        count = len(matches)

        if count > 1:
            raise LookupError(
                "Element \"%s\" is defined %d times in this modelLoader's schema"
                % (elementName, count))
        if count == 1:
            return matches[0]
        return None

    def xsdGroup(self, groupName):
        """Look up the xs:group definition with a given name.

        @type groupName: string
        @param groupName: The name of the group to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if the schema defines no xs:group named C{groupName}.
        @raise LookupError: If C{groupName} is defined more than once in the schema, an error is raised.
        """

        query = "//xs:group[@name='%s']" % groupName
        matches = self.schema.xpath(query,
                                    namespaces={"xs": defs.XSD_NAMESPACE})
        count = len(matches)

        if count > 1:
            raise LookupError(
                "Group \"%s\" is defined %d times in this modelLoader's schema"
                % (groupName, count))
        if count == 1:
            return matches[0]
        return None

    def xsdRemove(self, oldName):
        """Remove an arbitrary object from the ModelLoader's XSD schema.

        Every node anywhere in the schema whose C{name} attribute equals
        C{oldName} is deleted from its parent.  The cached prepared schema
        is discarded afterwards so that the next validation is compiled
        from the modified schema.

        @type oldName: string
        @param oldName: Name of the object to be removed.
        """

        for result in self.schema.xpath("//*[@name='%s']" % oldName,
                                        namespaces={"xs": defs.XSD_NAMESPACE}):
            parent = result.getparent()
            index = parent.index(result)
            del parent[index]

        # The schema tree changed, so any compiled schema is now stale.
        self.preparedSchema = None

    def xsdAppend(self, newXsd):
        """Add an XSD object as the last child of the schema root.

        The cached prepared schema is discarded because the schema changed.

        @type newXsd: string or lxml.etree.Element
        @param newXsd: New XSD object to append; a string is parsed into an element first.
        """

        node = fromstring(newXsd) if isinstance(newXsd, basestring) else newXsd
        self.schema.append(node)
        self.preparedSchema = None

    def register(self, tag, cls):
        """Define (or redefine) the class that is instantiated for a
        given tagname.

        If the class has an C{xsd} class attribute (string or element),
        this method replaces the ModelLoader's schema entry for C{tag}
        with the version defined by the class, or appends it if the
        schema has no such entry.

        If the class's C{xsd} attribute is None, this method attaches a
        copy of the ModelLoader's schema entry for C{tag} to the class.

        As a result, the class ends up with a C{xsd} class attribute
        holding its XSD schema fragment as a lxml.etree.Element for
        programmatic use (or None, if neither the class nor the schema
        defines C{tag}).

        If the class's C{xsdRemove} attribute is not None, each name in
        it is removed from the schema; if its C{xsdAppend} attribute is
        not None, each entry in it (string or element) is appended to the
        schema, replacing a same-named top-level schema entry.

        The currently-registered classes are in the ModelLoader's
        C{tagToClass} dictionary.

        @type tag: string
        @param tag: The tagname to define or redefine.
        @type cls: PmmlBinding subclass
        @param cls: The class to associate with C{tag}.
        @raise ValueError: If C{cls.xsd} is given but does not contain exactly one xs:element named C{tag}.
        @raise LookupError: If the schema already defines C{tag} more than once (raised by C{xsdElement}).
        """

        # None when the schema currently has no xs:element named `tag`.
        oldXsdElement = self.xsdElement(tag)

        if cls.xsd is not None:
            # The class supplies its own XSD: parse it if it is a string.
            if isinstance(cls.xsd, basestring):
                clsxsd = fromstring(cls.xsd)
            else:
                clsxsd = cls.xsd

            # The class's XSD must define the element for `tag` exactly once.
            newXsdElements = clsxsd.xpath(
                "//xs:element[@name='%s']" % tag,
                namespaces={"xs": defs.XSD_NAMESPACE})
            if len(newXsdElements) != 1:
                raise ValueError(
                    "Class %s has an xsd member but %d definitions of element \"%s\""
                    % (cls.__name__, len(newXsdElements), tag))
            else:
                newXsdElement = newXsdElements[0]

            if oldXsdElement is None:
                # Nothing to replace: add the definition to the schema root.
                self.xsdAppend(newXsdElement)

            else:
                # Swap the old definition for the new one at the same
                # position under the same parent.
                parent = oldXsdElement.getparent()
                index = parent.index(oldXsdElement)
                del parent[index]
                parent.insert(index, newXsdElement)

            # The class keeps a copy, separate from the element that now
            # sits in the schema.
            cls.xsd = copy.deepcopy(newXsdElement)

        else:
            # No class-level XSD: hand the class a copy of the schema's
            # definition (None if the schema has none either).
            cls.xsd = copy.deepcopy(oldXsdElement)

        if cls.xsdRemove is not None:
            for name in cls.xsdRemove:
                self.xsdRemove(name)

        if cls.xsdAppend is not None:
            # Index the named direct children of the schema root.  This
            # snapshot is taken once, before any of cls.xsdAppend is added.
            preexisting = {}
            for elem in self.schema:
                name = elem.get("name")
                if name is not None:
                    preexisting[name] = elem

            for newXsd in cls.xsdAppend:
                if isinstance(newXsd, basestring):
                    newXsd = fromstring(newXsd)

                # Drop a same-named top-level entry before appending the
                # replacement.
                name = newXsd.get("name")
                if name in preexisting:
                    parent = preexisting[name].getparent()
                    index = parent.index(preexisting[name])
                    del parent[index]

                self.xsdAppend(newXsd)

        # The schema may have changed, so discard the cached prepared schema.
        self.preparedSchema = None
        self.tagToClass[tag] = cls

    def xsdAddToGroupChoice(self, groupName, newElementNames):
        """Extend the xs:choice block of an xs:group with element references.

        One C{xs:element ref="..."} node is appended to the choice block
        per name, and the cached prepared schema is discarded.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newElementNames: list of strings or a single string
        @param newElementNames: References to the xs:elements to add to the xs:choice block.
        @raise LookupError: If the schema does not contain exactly one xs:choice directly under an xs:group named C{groupName}.
        """

        choiceBlocks = self.schema.xpath(
            "//xs:group[@name='%s']/xs:choice" % groupName,
            namespaces={"xs": defs.XSD_NAMESPACE})
        if len(choiceBlocks) != 1:
            raise LookupError(
                "Group \"%s\" is defined with a choice block %d times in this modelLoader's schema"
                % (groupName, len(choiceBlocks)))
        choice = choiceBlocks[0]

        E = ElementMaker(namespace=defs.XSD_NAMESPACE,
                         nsmap={"xs": defs.XSD_NAMESPACE})

        # Treat a single name as a one-item list.
        if isinstance(newElementNames, basestring):
            references = [newElementNames]
        else:
            references = newElementNames

        for reference in references:
            choice.append(E.element(ref=reference))

        self.preparedSchema = None

    def xsdReplaceGroup(self, groupName, newXsd):
        """Swap an xs:group of this ModelLoader's schema for a new definition.

        If the schema has no group named C{groupName}, the new definition
        is appended instead.  The cached prepared schema is discarded.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newXsd: string or lxml.etree.Element
        @param newXsd: The new XSD represented as an XML string or an lxml.etree.Element; it must contain an xs:group named C{groupName}.
        @raise LookupError: If the schema already defines C{groupName} more than once (raised by C{xsdGroup}).
        @raise ValueError: If C{newXsd} does not contain exactly one xs:group named C{groupName}.
        """

        existingGroup = self.xsdGroup(groupName)

        if isinstance(newXsd, basestring):
            newXsd = fromstring(newXsd)

        candidates = newXsd.xpath("//xs:group[@name='%s']" % groupName,
                                  namespaces={"xs": defs.XSD_NAMESPACE})
        if len(candidates) != 1:
            raise ValueError("newXsd has %d definitions of group \"%s\"" %
                             (len(candidates), groupName))
        replacement = candidates[0]

        if existingGroup is None:
            self.xsdAppend(replacement)
        else:
            # Put the replacement at the position the old group occupied.
            container = existingGroup.getparent()
            position = container.index(existingGroup)
            del container[position]
            container.insert(position, replacement)

        self.preparedSchema = None

    def elementMaker(self, prefix=None, **parserOptions):
        """Build an lxml ElementMaker for creating in-memory PMML objects.

        The factory is bound to the PMML namespace.  Every top-level
        xs:element of the schema is mapped to PmmlBinding, then overridden
        by this ModelLoader's current tag-to-class relationship.  Elements
        it creates that are PmmlBinding instances get their C{modelLoader}
        attribute set to this ModelLoader.

        @type prefix: string or None
        @param prefix: A prefix for the PMML namespace.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: ElementMaker
        @return: The ElementMaker factory.
        @see: The lxml U{ElementMaker documentation<http://lxml.de/api/lxml.builder.ElementMaker-class.html>}, which explains how to use an ElementMaker factory.
        """
        loader = self

        class XmlParser(XMLParser):
            # Tags each new PmmlBinding with the ModelLoader that made it.
            def makeelement(parserSelf, *args, **kwds):
                element = XMLParser.makeelement(parserSelf, *args, **kwds)
                if isinstance(element, PmmlBinding):
                    element.modelLoader = loader
                return element

        xmlParser = XmlParser(**parserOptions)

        classLookup = ElementNamespaceClassLookup()
        pmmlClasses = classLookup.get_namespace(defs.PMML_NAMESPACE)
        topLevelElements = self.schema.xpath(
            "xs:element", namespaces={"xs": defs.XSD_NAMESPACE})
        for definition in topLevelElements:
            pmmlClasses[definition.attrib["name"]] = PmmlBinding
        # Registered classes take precedence over the generic binding.
        pmmlClasses.update(self.tagToClass)
        xmlParser.set_element_class_lookup(classLookup)

        return ElementMaker(namespace=defs.PMML_NAMESPACE,
                            nsmap={prefix: defs.PMML_NAMESPACE},
                            makeelement=xmlParser.makeelement)

    def validate(self, pmmlBinding, postValidate=True):
        """Check an in-memory PMML subtree against the schema on demand.

        Note that by default, PMML is validated as or just after it is
        loaded.  This command is intended to check an in-memory PMML
        object after it has been changed or created by an algorithm.

        The prepared schema is built from C{self.schema} on first use and
        cached in C{self.preparedSchema}.

        @type pmmlBinding: PmmlBinding
        @param pmmlBinding: The in-memory PMML object to check.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        """

        if self.preparedSchema is None:
            self.preparedSchema = XMLSchema(self.schema)
        self.preparedSchema.assertValid(pmmlBinding)

        if not postValidate:
            return

        # Visit every PMML-namespace element as its "end" event is emitted.
        pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
        for event, elem in iterwalk(pmmlBinding, events=("end", ),
                                    tag=pmmlTags):
            if isinstance(elem, PmmlBinding):
                elem.postValidate()

    # def validateXslt(self, pmmlBinding):
    #     xslt = XSLT(self.stylesheet)
    #     return xslt(pmmlBinding)

    def look(self, tag=None, showXsd=True, showSource=False, stream=None):
        """An informative representation of the ModelLoader's current
        interpretation of PMML, intended for interactive use.

        With C{tag=None}, the names of the schema's top-level xs:elements
        are written in sorted order, four per line; names that have a
        registered class are shown in square brackets.

        With a C{tag}, its XSD is looked up first as an xs:element and,
        if no single element definition is found, as an xs:group.

        @type tag: string or None
        @param tag: If a string, look up information about this tag; if None, display all tags in the tag-to-class dictionary.
        @type showXsd: bool
        @param showXsd: If True, show the XSD that defines a valid C{tag}.
        @type showSource: bool
        @param showSource: If True, show the Python source code that implements C{tag}.
        @type stream: file-like object or None
        @param stream: If None, print to C{sys.stdout}; otherwise, write to the specified stream.
        @rtype: None
        @return: None; human-readable output is written to the console or a specified stream.
        """

        if stream is None:
            stream = sys.stdout

        if tag is None:
            names = sorted(
                self.schema.xpath("xs:element/@name",
                                  namespaces={"xs": defs.XSD_NAMESPACE}))
            # Four fixed-width columns per output line.
            for start in range(0, len(names), 4):
                for name in names[start:start + 4]:
                    if name in self.tagToClass:
                        word = "[%s]" % name
                    else:
                        word = name
                    stream.write("%-25s " % word)

                stream.write(os.linesep)

        else:
            xsd = None
            if showXsd:
                try:
                    xsd = self.xsdElement(tag)
                except LookupError:
                    xsd = None

                # xsdElement returns None for an unknown name rather than
                # raising, so the group lookup must also run in that case.
                if xsd is None:
                    try:
                        xsd = self.xsdGroup(tag)
                    except LookupError:
                        pass

                if xsd is not None:
                    stream.write(tostring(xsd, pretty_print=True))

            if showSource:
                cls = self.tagToClass.get(tag)
                if cls is not None:
                    # Separate the source from the XSD printed above it.
                    if xsd is not None:
                        stream.write(os.linesep)
                    stream.write(inspect.getsource(cls))

        stream.flush()

    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as an XML string, fileName,
        URI, or file-like object.

        Note that the XML file or string may be Gzip-compressed.

        A string starting with the Gzip magic bytes is decompressed, a
        string containing "<" is parsed as XML text, and any other value
        is handed to the parser unchanged.

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        """

        if isinstance(data, basestring):
            if data.startswith("\x1f\x8b"):
                # Gzip magic number: wrap the string in a decompressing reader.
                data = gzip.GzipFile(fileobj=StringIO(data))
            elif "<" in data:
                data = StringIO(data)

        schema = None
        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema

        # Caller-supplied options override these two defaults.
        effectiveOptions = {"schema": schema, "huge_tree": True}
        effectiveOptions.update(parserOptions)

        xmlParser = XMLParser(**effectiveOptions)
        classLookup = ElementNamespaceClassLookup()
        pmmlClasses = classLookup.get_namespace(defs.PMML_NAMESPACE)
        topLevelElements = self.schema.xpath(
            "xs:element", namespaces={"xs": defs.XSD_NAMESPACE})
        for definition in topLevelElements:
            pmmlClasses[definition.attrib["name"]] = PmmlBinding
        pmmlClasses.update(self.tagToClass)
        xmlParser.set_element_class_lookup(classLookup)

        # ElementNamespaceClassLookup doesn't work with iterparse, so we have to parse all at once and then iterwalk
        pmmlBinding = parse(data, xmlParser).getroot()
        pmmlBinding.modelLoader = self

        if postValidate:
            pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
            for event, elem in iterwalk(pmmlBinding, events=("end", ),
                                        tag=pmmlTags):
                if isinstance(elem, PmmlBinding):
                    elem.postValidate()

        return pmmlBinding

    def _loadJsonItem(self, tag, data, parser, nsmap):
        """Helper function for C{loadJson}; not for public use.

        Builds one element from its JSON representation and recurses into
        its children.  In C{data}, keys starting with "@" are attributes,
        "#text" and "#tail" are the element's text and tail, and every
        other key not starting with "#" is a subtag mapped to a list of
        child objects.  Each child object carries a "#" entry giving its
        position among its siblings.

        Children are appended in ascending order of their "#" values, so
        numbering that has gaps or does not start at zero still keeps
        every child.  If two children share the same "#", only the one
        processed last is kept.

        @type tag: string
        @param tag: The tagname, optionally written as "prefix:name".
        @type data: dict
        @param data: The JSON object describing the element.
        @param parser: Object whose C{makeelement} method creates the elements.
        @type nsmap: dict
        @param nsmap: Map from prefix (None for no prefix) to namespace URI.
        @return: The newly created element.
        @raise ValueError: If the tag's prefix is not in C{nsmap}, or if a child has no "#" entry.
        """

        if tag.find(":") == -1:
            prefix = None
        else:
            prefix, tag = tag.split(":")

        pretag = nsmap.get(prefix)
        if pretag is None:
            raise ValueError(
                "This document contains a prefix (\"%s\") not found in the namespace (%r)"
                % (prefix, nsmap))

        attrib = dict((x[1:], data[x]) for x in data if x.startswith("@"))
        childMap = dict((x, data[x]) for x in data
                        if not x.startswith("@") and not x.startswith("#"))

        item = parser.makeelement("{%s}%s" % (pretag, tag),
                                  attrib=attrib,
                                  nsmap=nsmap)

        # Collect the children of every subtag, keyed by sibling position.
        children = {}
        for subtag, childList in childMap.items():
            for childItem in childList:
                number = childItem.get("#")
                if number is None:
                    raise ValueError("Subtag \"%s\" has no \"#\"" % subtag)

                children[number] = self._loadJsonItem(subtag, childItem,
                                                      parser, nsmap)

        # Walk the recorded positions themselves, not 0..len-1, so that no
        # child is dropped when the numbering is not contiguous from zero.
        for number in sorted(children):
            item.append(children[number])

        text = data.get("#text")
        if text is not None:
            item.text = text

        tail = data.get("#tail")
        if tail is not None:
            item.tail = tail

        return item

    def loadJson(self,
                 data,
                 validate=True,
                 postValidate=True,
                 **parserOptions):
        """Load a PMML model represented as a JSON string, fileName,
        dict, or file-like object.

        There is no standard XML-to-JSON specification, so we define
        our own.  Our specification is very similar to U{this
        proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>},
        which collects subelements of different tagnames into
        different JSON lists, rather than having one long list and
        needing to specify the tag of each element in that list.  This
        has the following advantages, particularly useful for PMML:
          - Frequent tagnames (like <Segment>) are not repeated,
            wasting space.
          - Subelements with a given tagname can be quickly queried,
            without having to iterate over a list that contains
            non-matching tagnames.
        It has the following disadvantages:
          - The relative order of subelements with different tagnames
            is not preserved.
        We therefore additionally include a JSON attribute named "#"
        to specify the ordering of subelements in the XML
        representation.  Also, the specification referenced above
        represents single-child subelements as JSON objects and
        multiple children as JSON lists, but for consistency and ease
        of parsing, we always use lists.  The last difference is that
        we include "#tail" as well as "#text", so that text outside of
        an element is preserved (rarely relevant for PMML, but
        included for completeness).

        If C{data} is an already-decoded dictionary, it is not
        modified: a shallow copy is processed and the "#nsmap" mapping
        is copied before its default-namespace key is normalized.

        The structure of the JSON document is checked before the XSD
        schema is compiled, so malformed input fails fast.

        @type data: string, dict, or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.
        """

        if hasattr(data, "read"):
            data = json.load(data)
        elif isinstance(data, dict):
            # Work on a shallow copy so the caller's dictionary is left
            # intact and can be loaded again.
            data = dict(data)
        elif isinstance(data, basestring):
            if os.path.exists(data):
                # Close the file deterministically instead of leaking it.
                with open(data) as jsonFile:
                    data = json.load(jsonFile)
            else:
                data = json.loads(data)

        if not isinstance(data, dict):
            raise ValueError("JSON object must be a mapping at the top level")

        # "data" is private to this call from here on (a copy or freshly
        # decoded), so removing the bookkeeping key is safe.
        try:
            nsmap = data.pop("#nsmap")
        except KeyError:
            raise ValueError(
                "JSON object must have a \"#nsmap\" key at the top level")

        if not isinstance(nsmap, dict):
            raise ValueError(
                "\"#nsmap\" must be a mapping of prefixes to namespace URIs")

        # The namespace map may still belong to the caller; copy it before
        # renaming the default-namespace key ("" in JSON, None for lxml).
        nsmap = dict(nsmap)
        if "" in nsmap:
            nsmap[None] = nsmap.pop("")

        if len(data) != 1:
            raise ValueError(
                "JSON object must have exactly one PMML object at the top level"
            )

        tag = next(iter(data))
        data = data[tag]
        if not isinstance(data, list) or len(data) != 1:
            raise ValueError(
                "Top-level PMML object must be a list with exactly one item")
        data = data[0]

        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema
        else:
            schema = None

        # Elements created through this parser are instantiated as
        # PmmlBinding or as the classes registered in tagToClass.
        parser = XMLParser(**parserOptions)
        lookup = ElementNamespaceClassLookup()
        namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        for xsdElement in self.schema.xpath(
                "xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            namespace[xsdElement.attrib["name"]] = PmmlBinding
        namespace.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        pmmlBinding = self._loadJsonItem(tag, data, parser, nsmap)

        if validate:
            schema.assertValid(pmmlBinding)

        if postValidate:
            for event, elem in iterwalk(pmmlBinding,
                                        events=("end", ),
                                        tag="{%s}*" % defs.PMML_NAMESPACE):
                if isinstance(elem, PmmlBinding):
                    elem.postValidate()

        return pmmlBinding
 def test_intensity_factors(
     self, xml_api: XMLClient, factors_schema: XMLSchema
 ) -> None:
     """Check the national intensity-factors document against ``factors_schema``."""
     document = xml_api.national.get_intensity_factors().document
     factors_schema.assertValid(document)
 def test_today_intensity(
     self, xml_api: XMLClient, measurement_schema: XMLSchema
 ) -> None:
     """Check today's national intensity document against ``measurement_schema``."""
     document = xml_api.national.get_today_intensity().document
     measurement_schema.assertValid(document)
Example #12
0
class ModelLoader(object):
    """ModelLoader is a tool for unserializing or creating PMML
    models.

    A ModelLoader loader instance can be modified to support strict
    PMML compliance, extended PMML features, or optimized
    implementations of PMML elements.

    The user is encouraged to write new PmmlBinding subclasses and
    register them with a ModelLoader to modify the behavior of PMML or
    make certain functions more efficient for a given context.

    ModelLoader is the only supported way to make new PmmlBinding
    instances: any function that produces PMML must be given a
    ModelLoader.

    @type schema: lxml.etree.Element
    @param schema: Representation of the PMML schema used to interpret new models.
    @type tagToClass: dict
    @param tagToClass: Association of PMML tagnames with Python classes.
    """

    def __init__(self, baseXsdFileName="pmml-4-1.xsd", baseXsltFileName="pmml-4-1.xslt"):
        """Initialize a ModelLoader with a base XSD.

        By default, the XSD is the official 4.1 schema published by the U{Data Mining Group<http://www.dmg.org/v4-1/GeneralStructure.html>}.

        The schema is parsed into C{self.schema}; the compiled schema cache (C{self.preparedSchema}) starts empty and the tag-to-class mapping (C{self.tagToClass}) starts as an empty dictionary.

        @type baseXsdFileName: string
        @param baseXsdFileName: XSD fileName, either absolute or relative to augustus-pmml-library/augustus/core
        @type baseXsltFileName: string
        @param baseXsltFileName: XSLT fileName; future placeholder for XSLT non-local validation.  Not currently used.
        """

        # Fall back to the directory of this module when the name does not
        # resolve as given.
        if not os.path.exists(baseXsdFileName):
            baseXsdFileName = os.path.join(os.path.split(__file__)[0], baseXsdFileName)

        # Close the file deterministically rather than leaving the handle
        # to the garbage collector.
        with open(baseXsdFileName) as xsdFile:
            self.schema = parse(xsdFile).getroot()

        # if not os.path.exists(baseXsltFileName):
        #     baseXsltFileName = os.path.join(os.path.split(__file__)[0], baseXsltFileName)
        # self.stylesheet = parse(open(baseXsltFileName)).getroot()

        self.preparedSchema = None
        self.tagToClass = {}

    def copy(self):
        """Produce an independent deep copy of this ModelLoader, so
        that several lines of PMML interpretation can be developed
        from one common base."""

        duplicate = copy.deepcopy(self)
        return duplicate

    def __getstate__(self):
        """Build the picklable state of the ModelLoader.

        The state is a copy of the instance dictionary in which the
        schema element is replaced by its serialized XML (written with
        C{defs.PICKLE_XML_COMPRESSION}) and the compiled schema is
        replaced by None.  The tag-to-class mapping is included as is.
        """

        state = dict(self.__dict__)

        xmlBuffer = StringIO()
        schemaTree = ElementTree(state["schema"])
        schemaTree.write(xmlBuffer, compression=defs.PICKLE_XML_COMPRESSION)

        # buff = StringIO()
        # ElementTree(serialization["stylesheet"]).write(buff, compression=defs.PICKLE_XML_COMPRESSION)
        # serialization["stylesheet"] = buff.getvalue()

        state.update(schema=xmlBuffer.getvalue(), preparedSchema=None)
        return state

    def __setstate__(self, serialization):
        """Restore a ModelLoader from the state built by C{__getstate__}.

        The serialized schema is decompressed and parsed back into an
        element, the state becomes the instance dictionary, and every
        registered class gets its C{xsd} attribute re-attached from the
        restored schema.
        """

        compressedSchema = StringIO(serialization["schema"])
        serialization["schema"] = parse(gzip.GzipFile(fileobj=compressedSchema)).getroot()
        # serialization["stylesheet"] = parse(gzip.GzipFile(fileobj=StringIO(serialization["stylesheet"]))).getroot()
        self.__dict__ = serialization

        for tagName, boundClass in self.tagToClass.items():
            boundClass.xsd = self.xsdElement(tagName)

    def xsdElement(self, elementName):
        """Return the XSD that defines a given xs:element.

        @type elementName: string
        @param elementName: The name of the element to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if no xs:element with that name exists in the schema.
        @raise LookupError: If C{elementName} is defined more than once in the schema, an error is raised.
        """

        # The name is passed as an XPath variable rather than interpolated
        # into the expression, so a name containing a quote character
        # cannot produce a malformed query.
        results = self.schema.xpath("//xs:element[@name=$name]", name=elementName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(results) > 1:
            raise LookupError("Element \"%s\" is defined %d times in this modelLoader's schema" % (elementName, len(results)))
        if results:
            return results[0]
        return None

    def xsdGroup(self, groupName):
        """Return the XSD that defines a given xs:group.

        @type groupName: string
        @param groupName: The name of the group to retrieve.
        @rtype: lxml.etree.Element or None
        @return: The XSD object, or None if no xs:group with that name exists in the schema.
        @raise LookupError: If C{groupName} is defined more than once in the schema, an error is raised.
        """

        # The name is passed as an XPath variable rather than interpolated
        # into the expression, so a name containing a quote character
        # cannot produce a malformed query.
        results = self.schema.xpath("//xs:group[@name=$name]", name=groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(results) > 1:
            raise LookupError("Group \"%s\" is defined %d times in this modelLoader's schema" % (groupName, len(results)))
        if results:
            return results[0]
        return None

    def xsdRemove(self, oldName):
        """Remove an arbitrary object from the ModelLoader's XSD schema.

        @type oldName: string
        @param oldName: Name of the object to be removed.
        """

        for result in self.schema.xpath("//*[@name='%s']" % oldName, namespaces={"xs": defs.XSD_NAMESPACE}):
            parent = result.getparent()
            index = parent.index(result)
            del parent[index]

    def xsdAppend(self, newXsd):
        """Add an arbitrary object at the end of the ModelLoader's XSD
        schema and discard the cached compiled schema.

        @type newXsd: string or lxml.etree.Element
        @param newXsd: New XSD object to append; a string is parsed into an element first.
        """

        element = fromstring(newXsd) if isinstance(newXsd, basestring) else newXsd
        self.schema.append(element)
        self.preparedSchema = None

    def register(self, tag, cls):
        """Define (or redefine) the class that is instantiated for a
        given tagname.

        If the class carries an C{xsd} class attribute (string or
        element), the ModelLoader's schema entry for C{tag} is replaced
        by (or, if absent, extended with) the definition found there.

        If the class's C{xsd} is None, the ModelLoader's current schema
        entry for C{tag} is attached to the class instead.

        Either way the class ends up with a C{xsd} class attribute
        holding a deep copy of its schema fragment as an
        lxml.etree.Element.

        Afterwards the names listed in the class's C{xsdRemove} are
        removed from the schema and the fragments listed in its
        C{xsdAppend} are appended, replacing same-named top-level
        schema entries.

        The currently-registered classes are in the ModelLoader's
        C{tagToClass} dictionary.

        @type tag: string
        @param tag: The tagname to define or redefine.
        @type cls: PmmlBinding subclass
        @param cls: The class to associate with C{tag}.
        @raise ValueError: If the class's C{xsd} does not contain exactly one definition of element C{tag}.
        """

        currentDefinition = self.xsdElement(tag)

        if cls.xsd is None:
            # Nothing supplied by the class: hand it the schema's version.
            cls.xsd = copy.deepcopy(currentDefinition)

        else:
            classSchema = cls.xsd
            if isinstance(classSchema, basestring):
                classSchema = fromstring(classSchema)

            candidates = classSchema.xpath("//xs:element[@name='%s']" % tag, namespaces={"xs": defs.XSD_NAMESPACE})
            if len(candidates) != 1:
                raise ValueError("Class %s has an xsd member but %d definitions of element \"%s\"" % (cls.__name__, len(candidates), tag))
            replacement = candidates[0]

            if currentDefinition is not None:
                # Swap the definition in place, keeping its position.
                container = currentDefinition.getparent()
                position = container.index(currentDefinition)
                del container[position]
                container.insert(position, replacement)
            else:
                self.xsdAppend(replacement)

            cls.xsd = copy.deepcopy(replacement)

        if cls.xsdRemove is not None:
            for obsoleteName in cls.xsdRemove:
                self.xsdRemove(obsoleteName)

        if cls.xsdAppend is not None:
            # Index the named top-level schema entries once, before appending.
            preexisting = {}
            for topLevel in self.schema:
                topLevelName = topLevel.get("name")
                if topLevelName is not None:
                    preexisting[topLevelName] = topLevel

            for addition in cls.xsdAppend:
                if isinstance(addition, basestring):
                    addition = fromstring(addition)

                additionName = addition.get("name")
                if additionName in preexisting:
                    stale = preexisting[additionName]
                    owner = stale.getparent()
                    del owner[owner.index(stale)]

                self.xsdAppend(addition)

        self.preparedSchema = None
        self.tagToClass[tag] = cls

    def xsdAddToGroupChoice(self, groupName, newElementNames):
        """Extend the xs:choice block of an xs:group with references to
        further xs:elements, then discard the cached compiled schema.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newElementNames: list of strings or a single string
        @param newElementNames: References to the xs:elements to add to the xs:choice block.
        @raise LookupError: If the group does not have exactly one xs:choice block in the schema.
        """

        choices = self.schema.xpath("//xs:group[@name='%s']/xs:choice" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(choices) != 1:
            raise LookupError("Group \"%s\" is defined with a choice block %d times in this modelLoader's schema" % (groupName, len(choices)))
        choiceBlock = choices[0]

        E = ElementMaker(namespace=defs.XSD_NAMESPACE, nsmap={"xs": defs.XSD_NAMESPACE})

        # Treat a single name as a one-item list so that one loop serves both cases.
        if isinstance(newElementNames, basestring):
            newElementNames = [newElementNames]
        for reference in newElementNames:
            choiceBlock.append(E.element(ref=reference))

        self.preparedSchema = None

    def xsdReplaceGroup(self, groupName, newXsd):
        """Substitute an xs:group of this ModelLoader's schema with a
        new definition, or append the definition if the group does not
        exist yet.  The cached compiled schema is discarded.

        @type groupName: string
        @param groupName: The name of the xs:group.
        @type newXsd: string or lxml.etree.Element
        @param newXsd: The new XSD represented as an XML string or an lxml.etree.Element; it must contain an xs:group named C{groupName}.
        @raise ValueError: If C{newXsd} does not contain exactly one xs:group named C{groupName}.
        """

        currentGroup = self.xsdGroup(groupName)

        if isinstance(newXsd, basestring):
            newXsd = fromstring(newXsd)

        matches = newXsd.xpath("//xs:group[@name='%s']" % groupName, namespaces={"xs": defs.XSD_NAMESPACE})
        if len(matches) != 1:
            raise ValueError("newXsd has %d definitions of group \"%s\"" % (len(matches), groupName))
        replacement = matches[0]

        if currentGroup is not None:
            # Swap the group in place, keeping its position in the parent.
            container = currentGroup.getparent()
            slot = container.index(currentGroup)
            del container[slot]
            container.insert(slot, replacement)
        else:
            self.xsdAppend(replacement)

        self.preparedSchema = None

    def elementMaker(self, prefix=None, **parserOptions):
        """Build a factory for creating in-memory PMML objects.

        The factory is an lxml ElementMaker bound to the PMML namespace.
        Elements it creates are instantiated through this ModelLoader's
        current tag-to-class relationship, and every PmmlBinding it
        creates gets this ModelLoader as its C{modelLoader} attribute.

        @type prefix: string or None
        @param prefix: A prefix for the PMML namespace.
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: ElementMaker
        @return: The ElementMaker factory.
        @see: The lxml U{ElementMaker documentation<http://lxml.de/api/lxml.builder.ElementMaker-class.html>}, which explains how to use an ElementMaker factory.
        """

        loader = self

        class XmlParser(XMLParser):
            # Parser whose makeelement stamps the owning ModelLoader on each PmmlBinding.
            def makeelement(parserSelf, *args, **kwds):
                element = XMLParser.makeelement(parserSelf, *args, **kwds)
                if isinstance(element, PmmlBinding):
                    element.modelLoader = loader
                return element

        parser = XmlParser(**parserOptions)

        # Top-level schema elements default to PmmlBinding; registered classes override them.
        lookup = ElementNamespaceClassLookup()
        pmmlClasses = lookup.get_namespace(defs.PMML_NAMESPACE)
        topLevelElements = self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE})
        for xsdElement in topLevelElements:
            pmmlClasses[xsdElement.attrib["name"]] = PmmlBinding
        pmmlClasses.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        return ElementMaker(
            namespace=defs.PMML_NAMESPACE,
            nsmap={prefix: defs.PMML_NAMESPACE},
            makeelement=parser.makeelement)

    def validate(self, pmmlBinding, postValidate=True):
        """Validate a PMML subtree on demand.

        Note that by default, PMML is validated as or just after it is
        loaded.  This command is intended to check an in-memory PMML
        object after it has been changed or created by an algorithm.

        @type pmmlBinding: PmmlBinding
        @param pmmlBinding: The in-memory PMML object to check.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        """

        if self.preparedSchema is None:
            self.preparedSchema = XMLSchema(self.schema)

        self.preparedSchema.assertValid(pmmlBinding)

        if postValidate:
            for event, elem in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
                if isinstance(elem, PmmlBinding):
                    elem.postValidate()

    # def validateXslt(self, pmmlBinding):
    #     xslt = XSLT(self.stylesheet)
    #     return xslt(pmmlBinding)

    def look(self, tag=None, showXsd=True, showSource=False, stream=None):
        """An informative representation of the ModelLoader's current
        interpretation of PMML, intended for interactive use.

        @type tag: string or None
        @param tag: If a string, look up information about this tag; if None, display all tags in the tag-to-class dictionary.
        @type showXsd: bool
        @param showXsd: If True, show the XSD that defines a valid C{tag}.
        @type showSource: bool
        @param showSource: If True, show the Python source code that implements C{tag}.
        @type stream: file-like object or None
        @param stream: If None, print to C{sys.stdout}; otherwise, write to the specified stream.
        @rtype: None
        @return: None; human-readable output is written to the console or a specified stream.
        """

        if stream is None:
            stream = sys.stdout

        if tag is None:
            names = sorted(self.schema.xpath("xs:element/@name", namespaces={"xs": defs.XSD_NAMESPACE}))
            index = 0
            while index < len(names):
                for i in xrange(4):
                    if index + i < len(names):
                        if names[index + i] in self.tagToClass:
                            word = "[%s]" % names[index + i]
                        else:
                            word = names[index + i]
                        stream.write("%-25s " % word)
                    else:
                        break

                stream.write(os.linesep)
                index += 4

        else:
            xsd = None
            if showXsd:
                try:
                    xsd = self.xsdElement(tag)

                except LookupError:
                    try:
                        xsd = self.xsdGroup(tag)
                    except LookupError:
                        pass

                if xsd is not None:
                    stream.write(tostring(xsd, pretty_print=True))        

            if showSource:
                cls = self.tagToClass.get(tag)
                if cls is not None:
                    if xsd is not None:
                        stream.write(os.linesep)
                    stream.write(inspect.getsource(cls))

        stream.flush()

    def loadXml(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as an XML string, fileName,
        URI, or file-like object.

        A string that starts with the Gzip magic bytes is decompressed,
        a string containing "<" is treated as XML text, and anything
        else is handed to the lxml parser unchanged.

        @type data: string or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema while loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}; they take precedence over the defaults set here (C{schema}, C{huge_tree}).
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        """

        if isinstance(data, basestring):
            isGzipped = len(data) >= 2 and data[0:2] == "\x1f\x8b"
            if isGzipped:
                data = gzip.GzipFile(fileobj=StringIO(data))
            elif "<" in data:
                data = StringIO(data)

        schema = None
        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema

        # Defaults first, so that caller-supplied options override them.
        effectiveOptions = dict(schema=schema, huge_tree=True)
        effectiveOptions.update(parserOptions)
        parser = XMLParser(**effectiveOptions)

        lookup = ElementNamespaceClassLookup()
        pmmlClasses = lookup.get_namespace(defs.PMML_NAMESPACE)
        topLevelElements = self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE})
        for xsdElement in topLevelElements:
            pmmlClasses[xsdElement.attrib["name"]] = PmmlBinding
        pmmlClasses.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        # ElementNamespaceClassLookup don't work with iterparse, so we have to parse all at once and then iterwalk
        root = parse(data, parser).getroot()
        root.modelLoader = self

        if postValidate:
            pmmlTags = "{%s}*" % defs.PMML_NAMESPACE
            for _event, node in iterwalk(root, events=("end",), tag=pmmlTags):
                if isinstance(node, PmmlBinding):
                    node.postValidate()

        return root
    
    def _loadJsonItem(self, tag, data, parser, nsmap):
        """Helper function for C{loadJson}; not for public use."""

        if tag.find(":") == -1:
            prefix = None
        else:
            prefix, tag = tag.split(":")

        pretag = nsmap.get(prefix)
        if pretag is None:
            raise ValueError("This document contains a prefix (\"%s\") not found in the namespace (%r)" % (prefix, nsmap))

        attrib = dict((x[1:], data[x]) for x in data if x.startswith("@"))
        childMap = dict((x, data[x]) for x in data if not x.startswith("@") and not x.startswith("#"))

        item = parser.makeelement("{%s}%s" % (pretag, tag), attrib=attrib, nsmap=nsmap)

        children = {}
        for subtag, childList in childMap.items():
            for childItem in childList:
                number = childItem.get("#")
                if number is None:
                    raise ValueError("Subtag \"%s\" has no \"#\"" % subtag)

                children[number] = self._loadJsonItem(subtag, childItem, parser, nsmap)

        for number in xrange(len(children)):
            child = children.get(number)
            if child is not None:
                item.append(child)

        text = data.get("#text")
        if text is not None:
            item.text = text

        tail = data.get("#tail")
        if tail is not None:
            item.tail = tail

        return item

    def loadJson(self, data, validate=True, postValidate=True, **parserOptions):
        """Load a PMML model represented as a JSON string, fileName,
        dict, or file-like object.

        There is no standard XML-to-JSON specification, so we define
        our own.  Our specification is very similar to U{this
        proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>},
        which collects subelements of different tagnames into
        different JSON lists, rather than having one long list and
        needing to specify the tag of each element in that list.  This
        has the following advantages, particularly useful for PMML:
          - Frequent tagnames (like <Segment>) are not repeated,
            wasting space.
          - Subelements with a given tagname can be quickly queried,
            without having to iterate over a list that contains
            non-matching tagnames.
        It has the following disadvantages:
          - The relative order of subelements with different tagnames
            is not preserved.
        We therefore additionally include a JSON attribute named "#"
        to specify the ordering of subelements in the XML
        representation.  Also, the specification referenced above
        represents single-child subelements as JSON objects and
        multiple children as JSON lists, but for consistency and ease
        of parsing, we always use lists.  The last difference is that
        we include "#tail" as well as "#text", so that text outside of
        an element is preserved (rarely relevant for PMML, but
        included for completeness).

        If C{data} is an already-decoded dictionary, it is not
        modified: a shallow copy is processed and the "#nsmap" mapping
        is copied before its default-namespace key is normalized.

        The structure of the JSON document is checked before the XSD
        schema is compiled, so malformed input fails fast.

        @type data: string, dict, or file-like object
        @param data: The data to load.
        @type validate: bool
        @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
        @type postValidate: bool
        @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
        @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
        @rtype: PmmlBinding
        @return: In-memory PMML object.
        @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.
        """

        if hasattr(data, "read"):
            data = json.load(data)
        elif isinstance(data, dict):
            # Work on a shallow copy so the caller's dictionary is left
            # intact and can be loaded again.
            data = dict(data)
        elif isinstance(data, basestring):
            if os.path.exists(data):
                # Close the file deterministically instead of leaking it.
                with open(data) as jsonFile:
                    data = json.load(jsonFile)
            else:
                data = json.loads(data)

        if not isinstance(data, dict):
            raise ValueError("JSON object must be a mapping at the top level")

        # "data" is private to this call from here on (a copy or freshly
        # decoded), so removing the bookkeeping key is safe.
        try:
            nsmap = data.pop("#nsmap")
        except KeyError:
            raise ValueError("JSON object must have a \"#nsmap\" key at the top level")

        if not isinstance(nsmap, dict):
            raise ValueError("\"#nsmap\" must be a mapping of prefixes to namespace URIs")

        # The namespace map may still belong to the caller; copy it before
        # renaming the default-namespace key ("" in JSON, None for lxml).
        nsmap = dict(nsmap)
        if "" in nsmap:
            nsmap[None] = nsmap.pop("")

        if len(data) != 1:
            raise ValueError("JSON object must have exactly one PMML object at the top level")

        tag = next(iter(data))
        data = data[tag]
        if not isinstance(data, list) or len(data) != 1:
            raise ValueError("Top-level PMML object must be a list with exactly one item")
        data = data[0]

        if validate:
            if self.preparedSchema is None:
                self.preparedSchema = XMLSchema(self.schema)
            schema = self.preparedSchema
        else:
            schema = None

        # Elements created through this parser are instantiated as
        # PmmlBinding or as the classes registered in tagToClass.
        parser = XMLParser(**parserOptions)
        lookup = ElementNamespaceClassLookup()
        namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
        for xsdElement in self.schema.xpath("xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
            namespace[xsdElement.attrib["name"]] = PmmlBinding
        namespace.update(self.tagToClass)
        parser.set_element_class_lookup(lookup)

        pmmlBinding = self._loadJsonItem(tag, data, parser, nsmap)

        if validate:
            schema.assertValid(pmmlBinding)

        if postValidate:
            for event, elem in iterwalk(pmmlBinding, events=("end",), tag="{%s}*" % defs.PMML_NAMESPACE):
                if isinstance(elem, PmmlBinding):
                    elem.postValidate()

        return pmmlBinding
Example #13
0
class XML(object):
    """Parser for geoLink XML that produces ``Document`` and ``File`` entities.

    The geoLink XSD for the requested version is loaded once on construction and is used
    to validate every parsed XML when XSD validation is enabled.
    """

    _date_format = '%Y-%m-%d'
    """str: Format of date values in XML."""

    def __init__(self, host_url=None, version='1.2.0', dtd_validation=False, xsd_validation=True):
        """Create a new XML parser instance containing the geoLink XSD for validation.

        Args:
            host_url (str): URL of the OEREBlex host to resolve relative URLs. The complete URL until but
                without the */api* part has to be set, starting with *http://* or *https://*.
            version (str): The version of the geoLink schema to be used. Defaults to `1.2.0`.
            dtd_validation (bool): Enable/disable validation of document type definition (DTD).
                Optional, defaults to False.
            xsd_validation (bool): Enable/disable validation against XML schema (XSD).
                Optional, defaults to True.

        """
        self._host_url = host_url
        self._version = version
        self._dtd_validation = dtd_validation
        self._xsd_validation = xsd_validation
        xsd = pkg_resources.resource_filename('geolink_formatter', 'schema/v{0}.xsd'.format(version))
        # Read the schema as bytes: the XML parser then detects the encoding itself instead of
        # relying on the platform's default text encoding.
        with open(xsd, 'rb') as f:
            self._schema = XMLSchema(fromstring(f.read()))

    @property
    def host_url(self):
        """str: The OEREBlex host URL to resolve relative URLs."""
        return self._host_url

    def _parse_date(self, value):
        """Converts a date attribute value into a date object.

        Args:
            value (str or None): The attribute value formatted according to ``_date_format``.

        Returns:
            datetime.date: The parsed date. Missing or empty values are returned unchanged.

        Raises:
            ValueError: Raised if the value does not match ``_date_format``.

        """
        if not value:
            return value
        return datetime.datetime.strptime(value, self._date_format).date()

    def _parse_xml(self, xml):
        """Parses the specified XML string and validates it against the geoLink XSD.

        XSD validation is only done if it has been enabled on construction. If DTD validation is
        enabled, the content is additionally validated against the DTD embedded in the document.

        Args:
            xml (str or bytes): The XML to be parsed.

        Returns:
            lxml.etree._Element: The root element of the parsed geoLink XML.

        Raises:
            lxml.etree.XMLSyntaxError: Raised if the XML is not well-formed.
            lxml.etree.DocumentInvalid: Raised on failed XSD or DTD validation, or if DTD
                validation is enabled and the document contains no internal DTD.

        """
        if isinstance(xml, bytes):
            content = fromstring(xml)
        else:
            # Text input is handed to the parser as UTF-16BE encoded bytes.
            content = fromstring(xml.encode('utf-16be'))
        if self._xsd_validation:
            self._schema.assertValid(content)
        if self._dtd_validation:
            dtd = content.getroottree().docinfo.internalDTD
            if isinstance(dtd, DTD):
                dtd.assertValid(content)
            else:
                raise DocumentInvalid('Missing DTD in parsed content')
        return content

    def from_string(self, xml):
        """Parses XML into internal structure.

        The specified XML string gets validated against the geoLink XSD on parsing.
        Documents without an ID and documents whose ID has already been collected are skipped.

        Args:
            xml (str or bytes): The XML to be parsed.

        Returns:
            list[geolink_formatter.entity.Document]: A list containing the parsed document elements.

        Raises:
            lxml.etree.XMLSyntaxError: Raised if the XML is not well-formed.
            lxml.etree.DocumentInvalid: Raised on failed validation.
        """
        root = self._parse_xml(xml)
        documents = list()
        # IDs of the documents collected so far, for constant-time duplicate detection.
        seen_ids = set()

        for document_el in root.iter('document'):
            attrs = document_el.attrib
            doc_id = attrs.get('id')
            doctype = attrs.get('doctype')

            # Mangle doc_id for notices. While IDs are unique between decrees
            # and edicts, this is not the case when adding notices to the mix.
            # A missing ID is left alone so the document is skipped below instead
            # of failing on the string concatenation.
            if doctype == 'notice' and doc_id is not None:
                doc_id += doctype

            if not doc_id or doc_id in seen_ids:
                continue
            seen_ids.add(doc_id)

            files = list()
            for file_el in document_el.iter('file'):
                href = file_el.attrib.get('href')
                # Only relative references are prefixed with the host URL; a missing
                # href is passed on unchanged.
                if href and self.host_url and not href.startswith((u'http://', u'https://')):
                    href = u'{host}{href}'.format(host=self.host_url, href=href)
                files.append(File(
                    title=file_el.attrib.get('title'),
                    description=file_el.attrib.get('description'),
                    href=href,
                    category=file_el.attrib.get('category')
                ))

            documents.append(Document(
                files=files,
                id=doc_id,
                category=attrs.get('category'),
                doctype=doctype,
                federal_level=attrs.get('federal_level'),
                authority=attrs.get('authority'),
                authority_url=attrs.get('authority_url'),
                title=attrs.get('title'),
                number=attrs.get('number'),
                abbreviation=attrs.get('abbreviation'),
                instance=attrs.get('instance'),
                type=attrs.get('type'),
                subtype=attrs.get('subtype'),
                decree_date=self._parse_date(attrs.get('decree_date')),
                enactment_date=self._parse_date(attrs.get('enactment_date')),
                abrogation_date=self._parse_date(attrs.get('abrogation_date')),
                cycle=attrs.get('cycle')
            ))

        return documents

    def from_url(self, url, params=None, **kwargs):
        """Loads the geoLink of the specified URL and parses it into the internal structure.

        Args:
            url (str): The URL of the geoLink to be parsed.
            params (dict): Dictionary or bytes to be sent in the query string for the
                :class:`requests.models.Request`.
            **kwargs: Optional arguments that ``requests.api.request`` takes.

        Returns:
            list[geolink_formatter.entity.Document]: A list containing the parsed document elements.
            Only a response with status 200 is parsed; any other status that is not an HTTP error
            yields None.

        Raises:
            lxml.etree.XMLSyntaxError: Raised if the response content is not well-formed XML.
            lxml.etree.DocumentInvalid: Raised on failed validation.
            requests.HTTPError: Raised on failed HTTP request.

        """
        response = requests.get(url, params=params, **kwargs)
        if response.status_code == 200:
            return self.from_string(response.content)
        # Raises for 4xx/5xx responses only.
        response.raise_for_status()
 def test_without_period(
     self, xml_api: XMLClient, measurement_schema: XMLSchema
 ) -> None:
     """Intensity by date, requested without a period, validates against the schema."""
     document = xml_api.national.get_intensity_by_date(date=yesterday).document
     measurement_schema.assertValid(document)
 def test_intensity_between(self, xml_api: XMLClient,
                            measurement_schema: XMLSchema) -> None:
     """Intensity between ``yesterday`` and ``today`` validates against the schema."""
     national = xml_api.national
     document = national.get_intensity_between(from_=yesterday, to=today).document
     measurement_schema.assertValid(document)
 def test_intensity_before(self, xml_api: XMLClient,
                           measurement_schema: XMLSchema) -> None:
     """Intensity for one day before ``today`` validates against the schema."""
     national = xml_api.national
     document = national.get_intensity_before(from_=today, days=1).document
     measurement_schema.assertValid(document)
 def test_with_valid_period(
     self, xml_api: XMLClient, measurement_schema: XMLSchema
 ) -> None:
     """Intensity by date with a randomly chosen period validates against the schema."""
     # randrange excludes the upper bound, so MAX_PERIOD itself is never drawn.
     chosen_period = random.randrange(MIN_PERIOD, MAX_PERIOD)
     national = xml_api.national
     document = national.get_intensity_by_date(
         date=yesterday, period=chosen_period
     ).document
     measurement_schema.assertValid(document)