Ejemplo n.º 1
0
    def exec_xmllint(self, dtd_check=False, schema=None):
        """
        Run the xmllint command-line tool on ``self.filename``.

        The command always carries ``--huge`` and ``--noout``. ``--nonet``
        and ``--catalogs`` are added according to ``self._no_network`` and
        ``self._catalogs``.

        :dtd_check: True to add ``--valid`` (check against DTD), False
                    otherwise
        :schema: Schema file given with ``--schema``; omitted when falsy
        :returns: tuple including: returncode, stdout, stderr
        """
        command = ['xmllint']
        if dtd_check:
            command.append('--valid')
        command.extend(['--huge', '--noout'])
        if self._no_network:
            command.append('--nonet')
        if self._catalogs:
            command.append('--catalogs')
        if schema:
            command.extend(['--schema', schema])
        command.append(self.filename)

        # A catalog environment is handed over only when a catalog path has
        # been configured; otherwise Shell receives env=None.
        environment = None
        if self._catalog_path is not None:
            environment = {'SGML_CATALOG_FILES': self._catalog_path}

        result = Shell(command, env=environment)
        return (result.returncode, result.stdout, result.stderr)
Ejemplo n.º 2
0
    def scrape_file(self):
        """
        Check an SPSS Portable file by header inspection and conversion.

        The first line of the file must contain SPSS_PORTABLE_HEADER,
        otherwise an error is recorded. The file is then converted with
        the tool at PSPP_PATH into a temporary directory; the file counts
        as converted only if the output file exists afterwards.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        # Header check: only the first line is read.
        with open(self.filename, 'rb') as handle:
            header = handle.readline()
        if SPSS_PORTABLE_HEADER not in header:
            self.errors("File is not SPSS Portable format.")

        # The conversion output goes to a throw-away directory. Existence
        # of the converted file is the success criterion.
        workdir = tempfile.mkdtemp()
        converted = os.path.join(workdir, 'converted.por')

        try:
            result = Shell([PSPP_PATH, self.filename, converted])
            self.errors(ensure_str(result.stderr))
            self.messages(ensure_str(result.stdout))
            if not os.path.isfile(converted):
                self.errors('File conversion failed.')
            else:
                self.messages('File conversion was succesful.')
        finally:
            # Runs even if the conversion step raised.
            shutil.rmtree(workdir)
            self._check_supported()
            self._collect_elements()
Ejemplo n.º 3
0
    def scrape_file(self):
        """
        Run JHove command and store the parsed XML output to self._report.

        An error is recorded when JHove exits with a non-zero return code
        and when the reported status does not contain
        'Well-Formed and valid'. The status is read with
        ``self.report_field("status")`` and also stored as a message.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        exec_cmd = [
            'jhove', '-h', 'XML', '-m', self._jhove_module, self.filename
        ]
        self._shell = Shell(exec_cmd)

        if self._shell.returncode != 0:
            # stderr goes through ensure_str, as in the validator error
            # below, so the message does not embed a bytes repr.
            self.errors("JHove returned error: %s\n%s" %
                        (self._shell.returncode,
                         ensure_str(self._shell.stderr)))

        self._report = lxml.etree.fromstring(self._shell.stdout)

        status = self.report_field("status")
        self.messages(status)
        if 'Well-Formed and valid' not in status:
            self.errors("Validator returned error: %s\n%s" % (ensure_str(
                self._shell.stdout), ensure_str(self._shell.stderr)))
        self._check_supported()
        self._collect_elements()
Ejemplo n.º 4
0
    def scrape_file(self):
        """
        Scrape file with the tool at VERAPDF_PATH and parse its XML report.

        When the batch summary reports no parse failures, the version is
        derived from the validation profile name, and an error is recorded
        if the report says the file is not compliant. Otherwise the whole
        stdout is recorded as an error.

        :raises: VeraPDFError if the command exits with a non-zero
                 return code
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        shell = Shell([VERAPDF_PATH, self.filename])
        if shell.returncode != 0:
            raise VeraPDFError(ensure_str(shell.stderr))
        self.messages(ensure_str(shell.stdout))

        try:
            report = ET.fromstring(shell.stdout)
            summary = report.xpath('//batchSummary')[0]
            if summary.get('failedToParse') != '0':
                self.errors(ensure_str(shell.stdout))
            else:
                validation = report.xpath('//validationReport')[0]
                if validation.get('isCompliant') == 'false':
                    self.errors(ensure_str(shell.stdout))
                # Version is the text between "PDF/A" and
                # " validation profile", lowercased and prefixed with 'A'.
                profile = validation.get('profileName')
                after_prefix = profile.split("PDF/A")[1]
                flavour = after_prefix.split(" validation profile")[0]
                self.version = 'A' + flavour.lower()
        except ET.XMLSyntaxError:
            self.errors(ensure_str(shell.stderr))
        finally:
            self._check_supported()
            self._collect_elements()
Ejemplo n.º 5
0
    def scrape_file(self):
        """
        Scrape ARC file by running the ``arc2warc`` command on it.

        The command output is directed to a temporary file. An empty
        input file is recorded as an error. The mimetype is always set to
        'application/x-internet-archive' unless scraping is skipped.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        size = os.path.getsize(self.filename)
        if size == 0:
            self.errors('Empty file.')

        with tempfile.NamedTemporaryFile(
                prefix="scraper-warctools.") as warcfile:
            shell = Shell(command=['arc2warc', self.filename],
                          output_file=warcfile)
            failed = shell.returncode != 0

            if failed:
                self.errors("Failed: returncode %s" % shell.returncode)
                # Decode leniently, strip non-printable characters, then
                # re-encode to UTF-8 before recording the error.
                decoded = shell.stderr.decode('utf8', errors='replace')
                cleaned = sanitize_string(decoded)
                self.errors(cleaned.encode('utf-8'))
            elif size > 0:
                self.messages('File was analyzed successfully.')
            self.messages(ensure_str(shell.stdout))

        self.mimetype = 'application/x-internet-archive'
        self._check_supported()
        self._collect_elements()
Ejemplo n.º 6
0
    def _compile_phase(self, stylesheet, inputfile, allowed_codes,
                       outputfile=None, outputfilter=False):
        """
        Run one xsltproc phase.

        :stylesheet: XSLT file name, resolved against
                     ``self._schematron_dirname``
        :inputfile: Input document filename
        :allowed_codes: Collection of return codes accepted as success
        :outputfile: Filename of the resulted document, stdout if None
        :outputfilter: Pass the ``outputfilter`` string parameter with
                       value ``only_messages`` (ignored when
                       ``self._verbose`` is set)
        :return: Shell instance
        :raises: SchematronValidatorError if the return code is not in
                 ``allowed_codes``
        """
        cmd = ['xsltproc']
        if outputfile:
            cmd.extend(['-o', outputfile])
        if outputfilter and not self._verbose:
            cmd.extend(['--stringparam', 'outputfilter', 'only_messages'])
        cmd.append(os.path.join(self._schematron_dirname, stylesheet))
        cmd.append(inputfile)

        shell = Shell(cmd)
        if shell.returncode in allowed_codes:
            return shell

        raise SchematronValidatorError(
            "Error %s\nstdout:\n%s\nstderr:\n%s" % (
                shell.returncode, ensure_str(shell.stdout),
                ensure_str(shell.stderr)))
Ejemplo n.º 7
0
 def scrape_file(self):
     """
     Scrape file by running the jar at VNU_PATH with ``--verbose``.

     The command's stderr is recorded as errors and its stdout as
     messages.
     """
     if not self._check_wellformed and self._only_wellformed:
         self.messages('Skipping scraper: Well-formed check not used.')
         self._collect_elements()
         return

     command = ['java', '-jar', VNU_PATH, '--verbose', self.filename]
     result = Shell(command)
     self.errors(ensure_str(result.stderr))
     self.messages(ensure_str(result.stdout))
     self._check_supported()
     self._collect_elements()
Ejemplo n.º 8
0
    def _file_mimetype(self):
        """
        Detect mimetype by running the file command with ``-e soft``.

        The command at FILECMD_PATH is run with
        ``-be soft --mime-type`` under the ENV environment. Its stderr is
        passed to ``self.errors`` as is.

        :returns: file mimetype, i.e. the command's stdout as a stripped
                  string
        """
        command = [FILECMD_PATH, '-be', 'soft', '--mime-type', self.filename]
        result = Shell(command, env=ENV)

        self.errors(result.stderr)
        return ensure_str(result.stdout).strip()
Ejemplo n.º 9
0
    def scrape_file(self):
        """
        Scrape file by running ``pngcheck`` on it.

        A non-zero return code is recorded as an error together with the
        command's stderr. The command's stdout is always recorded as a
        message.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        result = Shell(['pngcheck', self.filename])
        exit_code = result.returncode

        if exit_code != 0:
            self.errors("Failed: returncode %s" % exit_code)
            self.errors(ensure_str(result.stderr))

        self.messages(ensure_str(result.stdout))
        self._check_supported()
        self._collect_elements()
Ejemplo n.º 10
0
    def scrape_file(self):
        """
        Scrape A/V files by running them through ``ffmpeg``.

        ffmpeg is run with ``-v error`` and ``-f null -``. The command's
        stderr is recorded as errors and its stdout as messages; a zero
        return code adds a success message first.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        command = [
            'ffmpeg', '-v', 'error', '-i', self.filename, '-f', 'null', '-'
        ]
        result = Shell(command)

        if result.returncode == 0:
            self.messages('The file was analyzed successfully.')

        self.errors(ensure_str(result.stderr))
        self.messages(ensure_str(result.stdout))
        self._check_supported()
        self._collect_elements()
Ejemplo n.º 11
0
    def scrape_file(self):
        """
        Scrape WARC file.

        Runs ``warcvalid`` on the file and records a non-zero return code
        and the filtered stderr as errors. The first line of the archive
        is then read, first as gzip and then as a plain file, to pick the
        WARC version from the ``WARC/<version>`` marker. An empty file is
        recorded as an error.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return
        size = os.path.getsize(self.filename)
        if size == 0:
            self.errors('Empty file.')
        shell = Shell(['warcvalid', self.filename])

        if shell.returncode != 0:
            self.errors("Failed: returncode %s" % shell.returncode)
            # Filter some trash printed by warcvalid.
            filtered_errors = \
                b"\n".join([line for line in shell.stderr.split(b'\n')
                            if b'ignored line' not in line])
            self.errors(filtered_errors)

        self.messages(ensure_str(shell.stdout))

        try:
            # First assume archive is compressed. The context manager
            # closes the gzip handle on every path, including errors.
            with gzip.open(self.filename) as warc_fd:
                line = warc_fd.readline()
        except IOError:
            # Not compressed archive
            with open(self.filename, 'rb') as warc_fd:
                line = warc_fd.readline()
        except Exception as exception:  # pylint: disable=broad-except
            # Compressed but corrupted gzip file
            self.errors(str(exception))
            self._check_supported()
            self._collect_elements()
            return

        self.mimetype = 'application/warc'
        # The version is the token following "WARC/" on the first line.
        parts = line.split(b"WARC/", 1)
        if len(parts) > 1:
            self.version = ensure_str(parts[1].split(b" ")[0].strip())
        if size > 0:
            self.messages('File was analyzed successfully.')
        self._check_supported()
        self._collect_elements()
Ejemplo n.º 12
0
    def scrape_file(self):
        """
        Scrape file by rendering it with Ghostscript to a null device.

        Any stderr output is recorded as an error even when the return
        code is zero; a non-zero return code with empty stderr is recorded
        as an error as well. Output is decoded as ISO-8859-1 and
        re-encoded to UTF-8 before it is recorded.
        """
        if not self._check_wellformed and self._only_wellformed:
            self.messages('Skipping scraper: Well-formed check not used.')
            self._collect_elements()
            return

        command = ['gs', '-o', '/dev/null', '-sDEVICE=nullpage',
                   self.filename]
        result = Shell(command)

        # Ghostscript will result 0 if it can repair errors.
        # However, stderr is not then empty.
        # This case should be handled as well-formed failure.
        stderr = result.stderr
        if stderr:
            self.errors(stderr.decode('iso-8859-1').encode('utf8'))
        elif result.returncode != 0:
            self.errors("Ghostscript returned return code: %s" %
                        result.returncode)

        stdout_text = result.stdout.decode('iso-8859-1')
        self.messages(stdout_text.encode('utf8'))
        self._check_supported()
        self._collect_elements()