Example #1
0
    def __init__(self, file):
        """Wrap an XML document that is going to be validated.

        The document may be given as a filesystem path, a URL,
        a file-object or an already parsed etree instance.

        Validation targets the JATS Publishing tag set and the SPS Style.

        :param file: Path to the XML file, URL, file-object or etree.
        """
        # An already parsed tree is kept as-is; anything else is handed
        # to `etree.parse`.
        already_parsed = isinstance(file, etree._ElementTree)
        self.lxml = file if already_parsed else etree.parse(file)

        # XSD schema, Schematron rules and the style-checking pipeline.
        self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')
        self.schematron = XMLSchematron('sps.sch')
        self.ppl = StyleCheckingPipeline()
Example #2
0
    def __init__(self, file):
        """Build the validation wrapper around an XML document.

        Accepted sources are a filesystem path, a URL, a file-object
        or an etree instance that has already been parsed.

        The document is checked against the JATS Publishing tag set
        and the SPS Style.

        :param file: Path to the XML file, URL, file-object or etree.
        """
        tree = file
        if not isinstance(tree, etree._ElementTree):
            # Not a parsed tree yet: let `etree.parse` load it.
            tree = etree.parse(file)
        self.lxml = tree

        # Validation helpers: XSD schema, Schematron and style pipeline.
        self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')
        self.schematron = XMLSchematron('sps.sch')
        self.ppl = StyleCheckingPipeline()
Example #3
0
    def __init__(self, file, no_network=True, dtd=None):
        """Wrap an SPS article XML.

        The document may be given as a filesystem path, a URL,
        a file-object or an already parsed etree instance.

        Validation targets the JATS Publishing tag set and the SPS Style.

        :param file: Path to the XML file, URL, file-object or etree.
        :param no_network: (optional) prevent network access for external DTD.
        :param dtd: (optional) etree.DTD instance. When not provided, the
            external DTD reported by the parsed document is used.
        """
        if isinstance(file, etree._ElementTree):
            tree = file
        else:
            # Parse with DTD loading enabled so that `docinfo.externalDTD`
            # can be consulted below.
            xml_parser = etree.XMLParser(
                remove_blank_text=True,
                load_dtd=True,
                no_network=no_network,
            )
            tree = etree.parse(file, xml_parser)
        self.lxml = tree

        # An explicitly given DTD takes precedence over the document's own.
        self.dtd = dtd or tree.docinfo.externalDTD
        self.schematron = XMLSchematron('scielo-style.sch')
        self.ppl = StyleCheckingPipeline()
Example #4
0
class XML(object):
    """An XML document together with its validation and annotation helpers."""

    def __init__(self, file):
        """Represents an XML under validation.

        The XML can be retrieved given its filesystem path,
        a URL, a file-object or an etree instance.

        The XML is validated against the JATS Publishing tag set
        and the SPS Style.

        :param file: Path to the XML file, URL, file-object or etree.
        """
        if isinstance(file, etree._ElementTree):
            self.lxml = file
        else:
            self.lxml = etree.parse(file)

        self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')
        self.schematron = XMLSchematron('sps.sch')
        self.ppl = StyleCheckingPipeline()

    def find_element(self, tagname, lineno=None, fallback=True):
        """Find an element given a tagname and a line number.

        When `lineno` is None the first element named `tagname` is returned;
        otherwise the first one whose source line equals `lineno`.

        When nothing matches, the root element is returned if `fallback`
        is true or if the root itself is named `tagname`.

        :param tagname: string of the tag name.
        :param lineno: int of the line it appears on the original source file.
        :param fallback: fallback to root element when `element` is not found.
        :raises ValueError: if nothing matches, `fallback` is false and the
            root element is not named `tagname`.
        """
        for elem in self.lxml.findall('//' + tagname):
            if lineno is None:
                return elem

            if elem.sourceline == lineno:
                logger.debug('method *find*: hit a regular element: %s.',
                             tagname)
                return elem

        # No descendant matched: decide what to do with the root element.
        root = self.lxml.getroot()
        if fallback:
            return root

        if root.tag == tagname:
            logger.debug('method *find*: hit a root element.')
            return root

        raise ValueError("Could not find element '%s'." % tagname)

    def validate(self):
        """Validate the source XML against the JATS Publishing Schema.

        Returns a tuple comprising the validation status and the errors list.
        """
        # NOTE(review): `setdefault` is defined elsewhere; it appears to
        # memoize the produced value on the instance under the given
        # attribute name -- confirm.
        result = setdefault(self, '__validation_result',
                            lambda: self.xmlschema.validate(self.lxml))
        # The error log is read only after the validation above has run.
        errors = setdefault(self, '__validation_errors',
                            lambda: self.xmlschema.error_log)
        return result, errors

    def _validate_sch(self):
        """Validate the source XML against the SPS Schematron.

        Returns a tuple comprising the validation status and the errors list.
        The errors are `StyleError` instances built from the schematron
        error log.
        """
        def make_error_log():
            err_log = self.schematron.error_log
            return [StyleError.from_schematron_errlog(err) for err in err_log]

        result = setdefault(self, '__sch_validation_result',
                            lambda: self.schematron.validate(self.lxml))
        errors = setdefault(self, '__sch_validation_errors', make_error_log)
        return result, errors

    def validate_style(self):
        """Validate the source XML against the SPS Tagging guidelines.

        Returns a tuple comprising the validation status and the errors list.
        The status is True only when the errors list is empty.
        """
        def make_error_log():
            # First item produced by the pipeline, extended with the
            # schematron errors.
            errors = next(self.ppl.run(self.lxml, rewrap=True))
            errors += self._validate_sch()[1]
            return errors

        # Each value is stored under the attribute name that describes it.
        errors = setdefault(self, '__style_validation_errors', make_error_log)
        result = setdefault(self, '__style_validation_result',
                            lambda: not bool(errors))
        return result, errors

    def _annotate_error(self, element, error):
        """Add an annotation prior to `element`, with `error` as the content.

        The annotation is a <SPS-ERROR> element added prior to `element`.
        If adding the element raises TypeError (expected for the root
        element), the error is annotated as a comment instead.

        :param element: etree instance to be annotated.
        :param error: string of the error.
        """
        notice_element = etree.Element('SPS-ERROR')
        notice_element.text = error
        try:
            element.addprevious(notice_element)
        except TypeError:
            # In case of a root element, a comment is added.
            element.addprevious(etree.Comment('SPS-ERROR: %s' % error))

    def annotate_errors(self):
        """Add notes on all elements that have errors.

        The errors list is generated as a result of calling both
        :meth:`validate` and :meth:`validate_style` methods. The tree held
        by the instance is modified in place; nothing is annotated when both
        validations succeed.

        Errors whose message does not reveal an element name are logged
        and skipped.
        """
        v_result, v_errors = self.validate()
        s_result, s_errors = self.validate_style()

        if v_result and s_result:
            return None

        for error in itertools.chain(v_errors, s_errors):
            try:
                element_name = search_element_name(error.message)
            except ValueError:
                # could not find the element name
                logger.info('Could not locate the element name in: %s',
                            error.message)
                continue

            if error.line is None:
                err_element = self.find_element(element_name)
            else:
                err_element = self.find_element(element_name,
                                                lineno=error.line)

            self._annotate_error(err_element, error.message)

    def __str__(self):
        """Serialize the tree as pretty-printed, utf-8 encoded XML."""
        return etree.tostring(self.lxml,
                              pretty_print=True,
                              encoding='utf-8',
                              xml_declaration=True)

    def __unicode__(self):
        """Return the serialized XML decoded from utf-8.

        Relies on `str(self)` being a byte string (Python 2 semantics).
        """
        return str(self).decode('utf-8')

    def __repr__(self):
        """Return a debugging representation.

        Note that this calls :meth:`validate` to report the status.
        """
        return '<packtools.stylechecker.XML xml=%s valid=%s>' % (
            self.lxml, self.validate()[0])

    def read(self):
        """
        Read the XML contents as text.
        """
        return unicode(self)
Example #5
0
class XML(object):
    """An XML document together with its validation and annotation helpers."""

    def __init__(self, file):
        """Represents an XML under validation.

        The XML can be retrieved given its filesystem path,
        a URL, a file-object or an etree instance.

        The XML is validated against the JATS Publishing tag set
        and the SPS Style.

        :param file: Path to the XML file, URL, file-object or etree.
        """
        if isinstance(file, etree._ElementTree):
            self.lxml = file
        else:
            self.lxml = etree.parse(file)

        self.xmlschema = XMLSchema('SciELO-journalpublishing1.xsd')
        self.schematron = XMLSchematron('sps.sch')
        self.ppl = StyleCheckingPipeline()

    def find_element(self, tagname, lineno=None, fallback=True):
        """Find an element given a tagname and a line number.

        When `lineno` is None the first element named `tagname` is returned;
        otherwise the first one whose source line equals `lineno`.

        When nothing matches, the root element is returned if `fallback`
        is true or if the root itself is named `tagname`.

        :param tagname: string of the tag name.
        :param lineno: int of the line it appears on the original source file.
        :param fallback: fallback to root element when `element` is not found.
        :raises ValueError: if nothing matches, `fallback` is false and the
            root element is not named `tagname`.
        """
        for elem in self.lxml.findall('//' + tagname):
            if lineno is None:
                return elem

            if elem.sourceline == lineno:
                logger.debug('method *find*: hit a regular element: %s.', tagname)
                return elem

        # No descendant matched: decide what to do with the root element.
        root = self.lxml.getroot()
        if fallback:
            return root

        if root.tag == tagname:
            logger.debug('method *find*: hit a root element.')
            return root

        raise ValueError("Could not find element '%s'." % tagname)

    def validate(self):
        """Validate the source XML against the JATS Publishing Schema.

        Returns a tuple comprising the validation status and the errors list.
        """
        # NOTE(review): `setdefault` is defined elsewhere; it appears to
        # memoize the produced value on the instance under the given
        # attribute name -- confirm.
        result = setdefault(self, '__validation_result', lambda: self.xmlschema.validate(self.lxml))
        # The error log is read only after the validation above has run.
        errors = setdefault(self, '__validation_errors', lambda: self.xmlschema.error_log)
        return result, errors

    def _validate_sch(self):
        """Validate the source XML against the SPS Schematron.

        Returns a tuple comprising the validation status and the errors list.
        The errors are `StyleError` instances built from the schematron
        error log.
        """
        def make_error_log():
            err_log = self.schematron.error_log
            return [StyleError.from_schematron_errlog(err) for err in err_log]

        result = setdefault(self, '__sch_validation_result', lambda: self.schematron.validate(self.lxml))
        errors = setdefault(self, '__sch_validation_errors', make_error_log)
        return result, errors

    def validate_style(self):
        """Validate the source XML against the SPS Tagging guidelines.

        Returns a tuple comprising the validation status and the errors list.
        The status is True only when the errors list is empty.
        """
        def make_error_log():
            # First item produced by the pipeline, extended with the
            # schematron errors.
            errors = next(self.ppl.run(self.lxml, rewrap=True))
            errors += self._validate_sch()[1]
            return errors

        # Each value is stored under the attribute name that describes it.
        errors = setdefault(self, '__style_validation_errors', make_error_log)
        result = setdefault(self, '__style_validation_result', lambda: not bool(errors))
        return result, errors

    def _annotate_error(self, element, error):
        """Add an annotation prior to `element`, with `error` as the content.

        The annotation is a <SPS-ERROR> element added prior to `element`.
        If adding the element raises TypeError (expected for the root
        element), the error is annotated as a comment instead.

        :param element: etree instance to be annotated.
        :param error: string of the error.
        """
        notice_element = etree.Element('SPS-ERROR')
        notice_element.text = error
        try:
            element.addprevious(notice_element)
        except TypeError:
            # In case of a root element, a comment is added.
            element.addprevious(etree.Comment('SPS-ERROR: %s' % error))

    def annotate_errors(self):
        """Add notes on all elements that have errors.

        The errors list is generated as a result of calling both
        :meth:`validate` and :meth:`validate_style` methods. The tree held
        by the instance is modified in place; nothing is annotated when both
        validations succeed.

        Errors whose message does not reveal an element name are logged
        and skipped.
        """
        v_result, v_errors = self.validate()
        s_result, s_errors = self.validate_style()

        if v_result and s_result:
            return None

        for error in itertools.chain(v_errors, s_errors):
            try:
                element_name = search_element_name(error.message)
            except ValueError:
                # could not find the element name
                logger.info('Could not locate the element name in: %s', error.message)
                continue

            if error.line is None:
                err_element = self.find_element(element_name)
            else:
                err_element = self.find_element(element_name, lineno=error.line)

            self._annotate_error(err_element, error.message)

    def __str__(self):
        """Serialize the tree as pretty-printed, utf-8 encoded XML."""
        return etree.tostring(self.lxml, pretty_print=True,
                              encoding='utf-8', xml_declaration=True)

    def __unicode__(self):
        """Return the serialized XML decoded from utf-8.

        Relies on `str(self)` being a byte string (Python 2 semantics).
        """
        return str(self).decode('utf-8')

    def __repr__(self):
        """Return a debugging representation.

        Note that this calls :meth:`validate` to report the status.
        """
        return '<packtools.stylechecker.XML xml=%s valid=%s>' % (self.lxml, self.validate()[0])

    def read(self):
        """
        Read the XML contents as text.
        """
        return unicode(self)
Example #6
0
class XML(object):
    """An SPS article XML together with its validation and annotation helpers."""

    def __init__(self, file, no_network=True, dtd=None):
        """Represents an SPS article XML.

        The XML can be retrieved given its filesystem path,
        a URL, a file-object or an etree instance.

        The XML is validated against the JATS Publishing tag set
        and the SPS Style.

        :param file: Path to the XML file, URL, file-object or etree.
        :param no_network: (optional) prevent network access for external DTD.
        :param dtd: (optional) etree.DTD instance. When not provided, the
            external DTD reported by the parsed document is used.
        """
        if isinstance(file, etree._ElementTree):
            self.lxml = file
        else:
            # DTD loading is enabled so `docinfo.externalDTD` can be
            # consulted below.
            parser = etree.XMLParser(remove_blank_text=True,
                                     load_dtd=True, no_network=no_network)
            self.lxml = etree.parse(file, parser)

        # An explicitly given DTD takes precedence over the document's own.
        self.dtd = dtd or self.lxml.docinfo.externalDTD
        self.schematron = XMLSchematron('scielo-style.sch')
        self.ppl = StyleCheckingPipeline()

    # NOTE(review): `cachedmethod` is defined elsewhere; presumably it
    # memoizes the return value per instance -- confirm.
    @cachedmethod
    def validate(self):
        """Validate the source XML against the JATS Publishing Schema.

        The validation is performed with the DTD held in `self.dtd`.

        Returns a tuple comprising the validation status and the errors list.
        The errors are `SchemaStyleError` instances built from the DTD
        error log.

        :raises TypeError: if no DTD is available (`self.dtd` is None).
        """
        if self.dtd is None:
            raise TypeError('The DTD/XSD could not be loaded.')

        # The error log is read only after the validation has run.
        result = self.dtd.validate(self.lxml)
        errors = [SchemaStyleError(err) for err in self.dtd.error_log]
        return result, errors

    @cachedmethod
    def _validate_sch(self):
        """Validate the source XML against the SPS Schematron.

        Returns a tuple comprising the validation status and the errors list.
        The errors are `SchematronStyleError` instances built from the
        schematron error log.
        """
        result = self.schematron.validate(self.lxml)
        errors = [SchematronStyleError(err) for err in self.schematron.error_log]
        return result, errors

    @cachedmethod
    def validate_style(self):
        """Validate the source XML against the SPS Tagging guidelines.

        Returns a tuple comprising the validation status and the errors list.
        The status is True only when the errors list is empty.
        """
        # First item produced by the pipeline, extended with the
        # schematron errors.
        errors = next(self.ppl.run(self.lxml, rewrap=True))
        errors += self._validate_sch()[1]
        result = not bool(errors)
        return result, errors

    def _annotate_error(self, element, error):
        """Add a comment prior to `element`, with `error` as the content.

        The annotation is an XML comment whose text is `error` prefixed
        with ``SPS-ERROR: ``.

        :param element: etree instance to be annotated.
        :param error: string of the error.
        """
        element.addprevious(etree.Comment('SPS-ERROR: %s' % error))

    def annotate_errors(self, fail_fast=False):
        """Add notes on all elements that have errors.

        The errors list is generated as a result of calling both
        :meth:`validate` and :meth:`validate_style` methods. The tree held
        by the instance is modified in place; nothing is annotated when both
        validations succeed.

        Errors whose element cannot be located are annotated on the root
        element.

        :param fail_fast: (optional) raise TypeError if the dtd have not been loaded.
        """
        try:
            v_result, v_errors = self.validate()

        except TypeError:
            if fail_fast:
                raise
            # Without a DTD the schema validation is skipped and treated
            # as successful.
            v_result = True
            v_errors = []

        s_result, s_errors = self.validate_style()

        if v_result and s_result:
            return None

        for error in itertools.chain(v_errors, s_errors):
            try:
                err_element = error.get_apparent_element(self.lxml)
            except ValueError:
                logger.info('Could not locate the element name in: %s', error.message)
                err_element = self.lxml.getroot()

            self._annotate_error(err_element, error.message)

    def __str__(self):
        """Serialize the tree as pretty-printed, utf-8 encoded XML."""
        return etree.tostring(self.lxml, pretty_print=True,
                              encoding='utf-8', xml_declaration=True)

    def __unicode__(self):
        """Return the serialized XML decoded from utf-8.

        Relies on `str(self)` being a byte string (Python 2 semantics).
        """
        return str(self).decode('utf-8')

    def __repr__(self):
        """Return a debugging representation.

        The validation status is reported as None when :meth:`validate`
        raises TypeError (no DTD available), so that `repr()` itself does
        not fail for that reason.
        """
        try:
            is_valid = self.validate()[0]
        except TypeError:
            is_valid = None

        return '<packtools.stylechecker.XML xml=%s valid=%s>' % (self.lxml, is_valid)

    def read(self):
        """
        Read the XML contents as text.
        """
        return unicode(self)