def scrape_file(self):
        """
        Check that the file is a well-formed SPSS Portable file.

        The first line of the file has to contain the SPSS Portable header
        exactly once, and pspp-convert has to produce a converted file.
        Metadata models are collected and support is checked in the
        ``finally`` clause, i.e. also when the conversion step raises.
        """
        with io_open(self.filename, "rb") as por_file:
            header_line = por_file.readline()
        header_count = header_line.count(SPSS_PORTABLE_HEADER)
        if header_count != 1:
            self._errors.append("File is not SPSS Portable format.")

        # The converted.por file appears only if the conversion succeeds,
        # so its existence is used as the well-formedness test.
        work_dir = tempfile.mkdtemp()
        converted_path = os.path.join(work_dir, "converted.por")

        try:
            conversion = Shell([PSPP_PATH, self.filename, converted_path])
            if conversion.stderr:
                self._errors.append(conversion.stderr)
            self._messages.append(conversion.stdout)
            if not os.path.isfile(converted_path):
                self._errors.append("File conversion failed.")
            else:
                self._messages.append("File conversion was succesful.")
        finally:
            shutil.rmtree(work_dir)
            self.streams = list(
                self.iterate_models(well_formed=self.well_formed))
            self._check_supported(allow_unav_mime=True,
                                  allow_unav_version=True)
Esempio n. 2
0
    def scrape_file(self):
        """
        Check that the file is a well-formed SPSS Portable file.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. Otherwise the first line has to contain the SPSS Portable
        header and pspp-convert has to produce a converted file. Metadata
        models are added and support is checked in the ``finally`` clause.
        """
        skip = not self._check_wellformed and self._only_wellformed
        if skip:
            self._messages.append(
                "Skipping scraper: Well-formed check not used.")
            return

        with io_open(self.filename, "rb") as por_file:
            header_line = por_file.readline()
        if SPSS_PORTABLE_HEADER not in header_line:
            self._errors.append("File is not SPSS Portable format.")

        # The converted.por file appears only if the conversion succeeds,
        # so its existence is used as the well-formedness test.
        work_dir = tempfile.mkdtemp()
        converted_path = os.path.join(work_dir, "converted.por")

        try:
            conversion = Shell([PSPP_PATH, self.filename, converted_path])
            if conversion.stderr:
                self._errors.append(conversion.stderr)
            self._messages.append(conversion.stdout)
            if not os.path.isfile(converted_path):
                self._errors.append("File conversion failed.")
            else:
                self._messages.append("File conversion was succesful.")
        finally:
            shutil.rmtree(work_dir)
            for model_class in self._supported_metadata:
                model = model_class(self._given_mimetype,
                                    self._given_version)
                self.streams.append(model)
            self._check_supported(allow_unav_mime=True,
                                  allow_unav_version=True)
Esempio n. 3
0
    def exec_xmllint(self, dtd_check=False, schema=None):
        """
        Run xmllint on the file.

        :dtd_check: True to validate against the DTD (``--valid``),
                    False otherwise
        :schema: Schema file given to xmllint with ``--schema``, if any
        :returns: Tuple of (returncode, stdout, stderr)
        """
        arguments = ["xmllint"]
        if dtd_check:
            arguments.append("--valid")
        arguments.extend(["--huge", "--noout"])
        if self._no_network:
            arguments.append("--nonet")
        if self._catalogs:
            arguments.append("--catalogs")
        if schema:
            arguments.extend(["--schema", schema])
        arguments.append(encode_path(self.filename))

        # A custom environment is passed only when a catalog path is set.
        environment = None
        if self._catalog_path is not None:
            environment = {"SGML_CATALOG_FILES": self._catalog_path}

        xmllint = Shell(arguments, env=environment)
        return (xmllint.returncode, xmllint.stdout, xmllint.stderr)
Esempio n. 4
0
    def _compile_phase(self,
                       stylesheet,
                       inputfile,
                       allowed_codes,
                       outputfile=None,
                       outputfilter=False):
        """
        Run one xsltproc conversion phase.

        :stylesheet: XSLT file name, resolved under SCHEMATRON_DIRNAME
        :inputfile: Input document filename
        :allowed_codes: Return codes of xsltproc that are accepted
        :outputfile: Filename of the resulting document, stdout if None
        :outputfilter: Pass the outputfilter parameter with the value
                       only_messages (ignored when self._verbose is set)
        :return: Shell instance
        :raises: SchematronValidatorError if the return code is not one of
                 the allowed codes
        """
        command = ["xsltproc"]
        if outputfile:
            command.extend(["-o", outputfile])
        if outputfilter and not self._verbose:
            command.extend(["--stringparam", "outputfilter", "only_messages"])
        command.append(os.path.join(SCHEMATRON_DIRNAME, stylesheet))
        command.append(encode_path(inputfile))

        xsltproc = Shell(command)
        if xsltproc.returncode in allowed_codes:
            return xsltproc

        raise SchematronValidatorError(
            "Error {}\nstdout:\n{}\nstderr:\n{}".format(
                xsltproc.returncode, xsltproc.stdout, xsltproc.stderr))
Esempio n. 5
0
 def scrape_file(self):
     """
     Check an office document by converting it to PDF with soffice.

     The scraper is skipped with a message when the well-formedness check
     is not requested and the scraper is only meant for that check.
     Otherwise soffice is run with a temporary directory as both HOME and
     output directory, and anything it writes to stderr is recorded as an
     error. Metadata models are added and support is checked in the
     ``finally`` clause, i.e. also when running the command fails.
     """
     if not self._check_wellformed and self._only_wellformed:
         # The trailing space keeps the two literals from fusing into
         # "notused.".
         self._messages.append("Skipping scraper: Well-formed check not "
                               "used.")
         return
     temp_dir = tempfile.mkdtemp()
     try:
         env = {"HOME": temp_dir}
         shell = Shell(
             ["soffice", "--convert-to", "pdf", "--outdir", temp_dir,
              encode_path(self.filename)],
             env=env)
         if shell.stderr:
             self._errors.append(shell.stderr)
         self._messages.append(shell.stdout)
     except OSError as error:
         self._errors.append("Error handling file: {}".format(error))
     finally:
         shutil.rmtree(temp_dir)
         for md_class in self._supported_metadata:
             self.streams.append(
                 md_class(self._given_mimetype, self._given_version))
         self._check_supported(allow_unav_mime=True,
                               allow_unav_version=True)
Esempio n. 6
0
    def scrape_file(self):
        """
        Check a DPX file with dpxv and collect metadata models.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. Lines written by dpxv to stderr become errors and lines
        written to stdout become messages.

        :raises: DPXvError if dpxv exits with a non-zero return code
        """
        skip = not self._check_wellformed and self._only_wellformed
        if skip:
            self._messages.append(
                "Skipping scraper: Well-formed check not used.")
            return

        dpxv = Shell(["dpxv", encode_path(self.filename)])
        if dpxv.returncode != 0:
            raise DPXvError(dpxv.stderr)

        if dpxv.stderr:
            self._errors += dpxv.stderr.splitlines()
        if dpxv.stdout:
            self._messages += dpxv.stdout.splitlines()

        for model_class in self._supported_metadata:
            model = model_class(mimetype=self._given_mimetype,
                                version=self._given_version,
                                info=self.info(),
                                filename=self.filename)
            self.streams.append(model)

        self._check_supported()
Esempio n. 7
0
    def scrape_file(self):
        """
        Check an ARC file by converting it to WARC with arc2warc.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. An empty file is reported as an error without running the
        converter. The converted output goes to a temporary file that is
        removed when the ``with`` block ends; only the return code and the
        command output are used.
        """
        skip = not self._check_wellformed and self._only_wellformed
        if skip:
            self._messages.append(
                "Skipping scraper: Well-formed check not used.")
            return
        if os.path.getsize(self.filename) == 0:
            self._errors.append("Empty file.")
            return

        with tempfile.NamedTemporaryFile(
                prefix="scraper-warctools.") as converted:
            conversion = Shell(
                command=["arc2warc", encode_path(self.filename)],
                stdout=converted)
            if conversion.returncode != 0:
                self._errors.append(
                    "Failed: returncode %s" % conversion.returncode)
                self._errors.append(
                    sanitize_bytestring(conversion.stderr_raw))
                return
            self._messages.append("File was analyzed successfully.")
            if conversion.stdout:
                self._messages.append(conversion.stdout)

        for model_class in self._supported_metadata:
            model = model_class(self._given_mimetype, self._given_version)
            self.streams.append(model)
        self._check_supported(allow_unav_version=True)
Esempio n. 8
0
    def scrape_file(self):
        """
        Check the file by rendering it with Ghostscript to a null device.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. Otherwise metadata models are added, the output of
        Ghostscript is recorded, and any stderr output or a non-zero return
        code is treated as an error.
        """
        if not self._check_wellformed and self._only_wellformed:
            # The trailing space keeps the two literals from fusing into
            # "notused.".
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return
        shell = Shell([
            "gs", "-o", "/dev/null", "-sDEVICE=nullpage",
            encode_path(self.filename)])

        for model in self._supported_metadata:
            self.streams.append(model(self._given_mimetype,
                                      self._given_version))

        # Ghostscript may print characters which cannot be converted to UTF-8
        stdout_message = ensure_text(shell.stdout_raw, errors='replace')
        stderr_message = ensure_text(shell.stderr_raw, errors='replace')
        self._messages.append(stdout_message)

        # Ghostscript will result 0 if it can repair errors.
        # However, in those cases an error is logged to either _errors or
        # _messages. This case should be handled as well-formed failure.
        if stderr_message:
            self._errors.append(stderr_message)
        elif shell.returncode != 0:
            self._errors.append("Ghostscript returned return code: %s"
                                % shell.returncode)

        # If no errors have been logged, the file is valid.
        else:
            self._messages.append("Well-Formed and valid")

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
Esempio n. 9
0
    def scrape_file(self):
        """
        Validate the file with veraPDF and collect metadata models.

        The XML report printed by veraPDF is recorded as a message when the
        file was parsed and is compliant, and as an error otherwise. If the
        report is not valid XML, stderr is recorded as an error instead.

        :raises: VeraPDFError if the return code is not in OK_CODES
        """
        verapdf = Shell([VERAPDF_PATH, encode_path(self.filename)])
        if verapdf.returncode not in OK_CODES:
            raise VeraPDFError(verapdf.stderr)

        # Stays None unless a validation report with a profile is found.
        profile = None
        try:
            report = ET.fromstring(verapdf.stdout_raw)
            summary = report.xpath("//batchSummary")[0]
            if summary.get("failedToParse") != "0":
                self._errors.append(verapdf.stdout)
            else:
                validation = report.xpath("//validationReport")[0]
                if validation.get("isCompliant") == "false":
                    self._errors.append(verapdf.stdout)
                else:
                    self._messages.append(verapdf.stdout)
                profile = validation.get("profileName")
        except ET.XMLSyntaxError:
            self._errors.append(verapdf.stderr)

        self.streams = list(
            self.iterate_models(well_formed=self.well_formed, profile=profile))

        self._check_supported()
Esempio n. 10
0
    def scrape_file(self):
        """
        Scrape A/V files.

        The file is first probed with ffmpeg.probe to get its "format" and
        "streams" entries and then decoded with the ffmpeg command to find
        errors. If decoding prints errors accepted by self._filter_stderr,
        or if probing failed, the errors are recorded and the method returns
        without creating metadata models. Otherwise PCM audio streams are
        checked to be linear PCM, metadata models are created and support
        is checked.
        """
        # Bind defaults so that a failed probe cannot leave these names
        # undefined for the code further down.
        probe_results = None
        streams = []
        try:
            probe_results = ffmpeg.probe(encode_path(self.filename))
            streams = [probe_results["format"]] + probe_results["streams"]
            for stream in streams:
                # Entries without an index get 0, all existing indices are
                # shifted up by one.
                if "index" not in stream:
                    stream["index"] = 0
                else:
                    stream["index"] = stream["index"] + 1
        except ffmpeg.Error as err:
            self._errors.append("Error in analyzing file.")
            self._errors.append(ensure_text(err.stderr))

        shell = Shell([
            "ffmpeg", "-v", "error", "-i",
            encode_path(self.filename), "-f", "null", "-"
        ])

        if shell.returncode == 0:
            self._messages.append("The file was analyzed successfully.")

        if self._filter_stderr(shell.stderr):
            self._errors.append(shell.stderr)
            return

        # Probing failed: the errors are already recorded and there is
        # nothing to build metadata models from.
        if probe_results is None:
            return

        # We deny e.g. A-law PCM, mu-law PCM, DPCM and ADPCM and allow
        # only signed/unsigned linear PCM. Note that we need this check
        # only if PCM audio is present. This should not be given e.g.
        # for video streams nor audio streams of another type (such as
        # MPEG).
        for stream in streams:
            codec_name = stream.get("codec_long_name", UNAV)
            if "PCM" in codec_name and not \
                    codec_name.startswith(("PCM signed", "PCM unsigned")):
                self._errors.append("%s does not seem to be LPCM format." %
                                    codec_name)

        container = False
        stream_count = len(streams)
        for index in range(stream_count):
            # FFMpeg has separate "format" (relevant for containers) and
            # "streams" (relevant for all files) elements in its output.
            # We know whether we'll have streams + container or just
            # streams only after scraping the first stream, so there's a
            # risk of trying to add one too many streams. This check
            # prevents constructing more metadata models than there are
            # streams.
            if not container and index == stream_count - 1:
                break

            self.streams += list(
                self.iterate_models(probe_results=probe_results, index=index))

            for model in self.streams:
                if model.hascontainer():
                    container = True

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def test_shell_with_env():
    """Check that Shell passes a custom environment on to the command."""
    # Start from the current environment and add one extra variable.
    environment = dict(os.environ, TEST_VARIABLE="testing")
    result = Shell(["printenv", "TEST_VARIABLE"], env=environment)

    assert result.returncode == 0
    assert result.stdout == "testing\n"
    assert not result.stderr
Esempio n. 12
0
    def scrape_file(self):
        """
        Scrape A/V files.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. Otherwise the file is probed with ffmpeg.probe and decoded
        with the ffmpeg command. If decoding prints errors accepted by
        self._filter_stderr, or if probing failed, the errors are recorded
        and the method returns without creating metadata models.
        """
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return

        # Bind defaults so that a failed probe cannot leave these names
        # undefined for the code further down.
        probe_results = None
        streams = []
        try:
            probe_results = ffmpeg.probe(encode_path(self.filename))
            streams = [probe_results["format"]] + probe_results["streams"]
            for stream in streams:
                # Entries without an index get 0, all existing indices are
                # shifted up by one.
                if "index" not in stream:
                    stream["index"] = 0
                else:
                    stream["index"] = stream["index"] + 1
        except ffmpeg.Error as err:
            self._errors.append("Error in analyzing file.")
            self._errors.append(ensure_text(err.stderr))

        shell = Shell([
            "ffmpeg", "-v", "error", "-i",
            encode_path(self.filename), "-f", "null", "-"
        ])

        if shell.returncode == 0:
            self._messages.append("The file was analyzed successfully.")

        if self._filter_stderr(shell.stderr):
            self._errors.append(shell.stderr)
            return

        # Probing failed: the errors are already recorded and there is
        # nothing to build metadata models from.
        if probe_results is None:
            return

        container = False
        stream_count = len(streams)
        for index in range(stream_count):
            # FFMpeg has separate "format" (relevant for containers) and
            # "streams" (relevant for all files) elements in its output. We
            # know whether we'll have streams + container or just streams only
            # after scraping the first stream, so there's a risk of trying to
            # add one too many streams. This check prevents constructing more
            # metadata models than there are streams.
            if not container and index == stream_count - 1:
                break

            for md_class in self._supported_metadata:
                if md_class.is_supported(self._mimetype_guess):
                    stream = md_class(probe_results, index,
                                      self._given_mimetype,
                                      self._given_version)
                    self.streams.append(stream)
                    if stream.hascontainer():
                        container = True

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def test_shell_output_to_file():
    """Check that stdout of a command can be redirected to a file object."""
    with TemporaryFile("w+") as target:
        result = Shell(["seq", "5"], stdout=target)

        assert result.returncode == 0
        # The output went to the file, so nothing is captured by Shell.
        assert not result.stdout
        assert not result.stderr

        target.seek(0)
        for number, line in enumerate(target, start=1):
            assert line == six.text_type(number) + "\n"
Esempio n. 14
0
def test_shell(command, expected_returncode, expected_stdout,
               expected_stderr):
    """Run a command through Shell and verify output types and values."""
    result = Shell(command)

    # Decoded output has to be text, raw output has to be bytes.
    for decoded in (result.stdout, result.stderr):
        assert isinstance(decoded, six.text_type)
    for raw in (result.stdout_raw, result.stderr_raw):
        assert isinstance(raw, six.binary_type)

    assert result.returncode == expected_returncode
    assert result.stderr == expected_stderr
    assert result.stdout == expected_stdout
    def scrape_file(self):
        """
        Check the file with pngcheck and collect metadata models.

        A non-zero return code is recorded as an error together with
        stderr; stdout is always recorded as a message.
        """
        pngcheck = Shell(["pngcheck", encode_path(self.filename)])

        failed = pngcheck.returncode != 0
        if failed:
            self._errors.append(
                "Failed: returncode %s" % pngcheck.returncode)
            self._errors.append(pngcheck.stderr)

        self._messages.append(pngcheck.stdout)

        # The MIME type is not known to this scraper, so missing MIME type
        # and version are allowed in the support check below.
        self.streams = list(self.iterate_models())

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def file_command(filename, parameters=None):
    """Run the file command for the given file.

    FILECMD_PATH is used as the executable, with LD_LIBRARY_PATH set in the
    environment, when both that file and that directory exist. Otherwise
    plain ``file`` is run with an empty environment mapping.

    :filename: Filename for the file command.
    :parameters: Parameter list for the file command.
    :returns: Shell class
    """
    if os.path.isfile(FILECMD_PATH) and os.path.isdir(LD_LIBRARY_PATH):
        executable = FILECMD_PATH
        environment = {"LD_LIBRARY_PATH": LD_LIBRARY_PATH}
    else:
        executable = "file"
        environment = {}

    arguments = [] if parameters is None else parameters
    command = [executable] + arguments + [encode_path(filename)]
    return Shell(command, env=environment)
    def scrape_file(self):
        """
        Validate the file with vnu.jar.

        The filter file vnu_filters.txt is looked up in the directory of
        this module. Metadata models are created and support is checked
        only if the file is well-formed.
        """
        module_dir = os.path.dirname(os.path.realpath(__file__))
        filterfile = os.path.join(module_dir, 'vnu_filters.txt')
        command = ["java", "-jar", VNU_PATH, "--verbose",
                   "--filterfile", filterfile, self.filename]
        vnu = Shell(command)

        if vnu.stderr:
            self._errors.append(vnu.stderr)
        self._messages.append(vnu.stdout)

        if not self.well_formed:
            return

        self.streams = list(
            self.iterate_models(well_formed=self.well_formed))
        self._check_supported()
Esempio n. 18
0
    def scrape_file(self):
        """
        Validate the file with vnu.jar.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. Metadata models are added and support is checked only if
        the file is well-formed.
        """
        skip = not self._check_wellformed and self._only_wellformed
        if skip:
            self._messages.append(
                "Skipping scraper: Well-formed check not used.")
            return

        command = ["java", "-jar", VNU_PATH, "--verbose", self.filename]
        vnu = Shell(command)

        if vnu.stderr:
            self._errors.append(vnu.stderr)
        self._messages.append(vnu.stdout)

        if not self.well_formed:
            return

        for model_class in self._supported_metadata:
            model = model_class(self._given_mimetype, self._given_version)
            self.streams.append(model)
        self._check_supported()
Esempio n. 19
0
    def scrape_file(self):
        """
        Run JHove command and store XML output to self._report.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. Otherwise the status field of the JHove report is recorded
        as a message, and the whole output is recorded as an error unless
        the status contains "Well-Formed and valid". Metadata models are
        added for the classes that support the MIME type, or for all
        classes if self._force_metadata_use is set.
        """
        if not self._check_wellformed and self._only_wellformed:
            # The trailing space keeps the two literals from fusing into
            # "notused.".
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return

        exec_cmd = [
            "jhove", "-h", "XML", "-m", self._jhove_module, self.filename
        ]
        shell = Shell(exec_cmd)

        if shell.returncode != 0:
            self._errors.append("JHove returned error: %s\n%s" %
                                (shell.returncode, shell.stderr))

        self._report = lxml.etree.fromstring(shell.stdout_raw)

        status = get_field(self._report, "status")
        self._messages.append(status)
        if "Well-Formed and valid" not in status:
            self._errors.append("Validator returned error.")
            self._errors.append(shell.stdout)
            self._errors.append(shell.stderr)

        # If the MIME type is forced, use that, otherwise scrape the MIME type
        if self._given_mimetype:
            mimetype = self._given_mimetype
        else:
            mimetype = get_field(self._report, "mimeType")

        if mimetype == "text/xml":  # XML MIME type has to be set manually
            mimetype = "application/xhtml+xml"
        elif mimetype is not None and "audio/vnd.wave" in mimetype:  # wav also
            mimetype = "audio/x-wav"

        for md_class in self._supported_metadata:
            if md_class.is_supported(mimetype) or self._force_metadata_use:
                self.streams.append(
                    md_class(self._report, self._errors, self._given_mimetype,
                             self._given_version))

        self._check_supported(allow_unav_version=True)
    def scrape_file(self):
        """
        Check a WARC file with warcvalid.

        An empty file and a non-zero return code of warcvalid are recorded
        as errors and end the scraping. Otherwise the scraping continues in
        the parent class after the output has been recorded as a message.
        """
        if os.path.getsize(self.filename) == 0:
            self._errors.append("Empty file.")
            return

        warcvalid = Shell(["warcvalid", self.filename])
        if warcvalid.returncode != 0:
            self._errors.append(
                "Failed: returncode %s" % warcvalid.returncode)
            # Lines about ignored lines are dropped from the error output.
            kept_lines = [row for row in warcvalid.stderr.split("\n")
                          if u"ignored line" not in row]
            self._errors.append("\n".join(kept_lines))
            return

        self._messages.append(warcvalid.stdout)

        super(WarctoolsFullScraper, self).scrape_file()
 def scrape_file(self):
     """
     Check an office document by converting it to PDF with soffice.

     The command is run with a temporary directory as both HOME and output
     directory, and anything it writes to stderr is recorded as an error.
     Metadata models are collected and support is checked in the
     ``finally`` clause, i.e. also when running the command fails.
     """
     work_dir = tempfile.mkdtemp()
     try:
         environment = {"HOME": work_dir}
         command = [SOFFICE_PATH, "--convert-to", "pdf",
                    "--outdir", work_dir, encode_path(self.filename)]
         soffice = Shell(command, env=environment)
         if soffice.stderr:
             self._errors.append(soffice.stderr)
         self._messages.append(soffice.stdout)
     except OSError as error:
         self._errors.append("Error handling file: {}".format(error))
     finally:
         shutil.rmtree(work_dir)
         self.streams = list(self.iterate_models())
         self._check_supported(allow_unav_mime=True,
                               allow_unav_version=True)
Esempio n. 22
0
    def detect(self):
        """
        Run veraPDF to find out if the file is PDF/A and possibly its version.

        self._set_info_not_pdf_a is called and detection ends if veraPDF
        fails, its report cannot be parsed, or the file is not compliant.
        Otherwise the MIME type is set to application/pdf and the version
        is derived from the name of the validation profile.
        """
        verapdf = Shell([VERAPDF_PATH, encode_path(self.filename)])

        # A non-zero return code means the file is not handled as PDF/A.
        if verapdf.returncode != 0:
            self._set_info_not_pdf_a(verapdf)
            return
        try:
            report = ET.fromstring(verapdf.stdout_raw)
            summary = report.xpath("//batchSummary")[0]
            if summary.get("failedToParse") != "0":
                self._set_info_not_pdf_a(verapdf)
                return
            validation = report.xpath("//validationReport")[0]
            if validation.get("isCompliant") == "false":
                self._set_info_not_pdf_a()
                return
            profile = validation.get("profileName")
        except ET.XMLSyntaxError:
            self._set_info_not_pdf_a(verapdf)
            return

        # No problems were found: take the text between "PDF/A" and
        # " validation profile" in the profile name as the version.
        after_prefix = profile.split("PDF/A")[1]
        version = after_prefix.split(" validation profile")[0]
        self.version = "A{}".format(version.lower())
        self.mimetype = "application/pdf"
        self.info = {
            "class": self.__class__.__name__,
            "messages": ["PDF/A version detected by veraPDF."],
            "errors": [],
            "tools": []
        }
Esempio n. 23
0
    def scrape_file(self):
        """
        Check the file with pngcheck and add metadata models.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. A non-zero return code is recorded as an error together
        with stderr; stdout is always recorded as a message.
        """
        skip = not self._check_wellformed and self._only_wellformed
        if skip:
            self._messages.append(
                "Skipping scraper: Well-formed check not used.")
            return

        pngcheck = Shell(["pngcheck", encode_path(self.filename)])
        if pngcheck.returncode != 0:
            self._errors.append(
                "Failed: returncode %s" % pngcheck.returncode)
            self._errors.append(pngcheck.stderr)

        self._messages.append(pngcheck.stdout)

        # The MIME type is not known to this scraper, so missing MIME type
        # and version are allowed in the support check below.
        for model_class in self._supported_metadata:
            model = model_class(self._given_mimetype, self._given_version)
            self.streams.append(model)

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
Esempio n. 24
0
    def scrape_file(self):
        """
        Check a WARC file with warcvalid and add metadata models.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. An empty file, a non-zero return code of warcvalid and an
        unreadable gzip file are recorded as errors and end the scraping.
        The first line of the archive is given to the metadata models.
        """
        skip = not self._check_wellformed and self._only_wellformed
        if skip:
            self._messages.append(
                "Skipping scraper: Well-formed check not used.")
            return
        if os.path.getsize(self.filename) == 0:
            self._errors.append("Empty file.")
            return

        warcvalid = Shell(["warcvalid", self.filename])
        if warcvalid.returncode != 0:
            self._errors.append(
                "Failed: returncode %s" % warcvalid.returncode)
            # Lines about ignored lines are dropped from the error output.
            kept_lines = [row for row in warcvalid.stderr.split("\n")
                          if u"ignored line" not in row]
            self._errors.append("\n".join(kept_lines))
            return

        self._messages.append(warcvalid.stdout)

        try:
            # Try reading the archive as gzip-compressed first.
            with gzip.open(self.filename) as compressed:
                first_line = compressed.readline()
        except IOError:
            # Fall back to reading the archive as an uncompressed file.
            with io_open(self.filename, "rb") as uncompressed:
                first_line = uncompressed.readline()
        except Exception as error:  # pylint: disable=broad-except
            # Any other failure while reading is recorded as an error.
            self._errors.append(six.text_type(error))
            return

        self._messages.append("File was analyzed successfully.")
        for model_class in self._supported_metadata:
            model = model_class(first_line, self._given_mimetype,
                                self._given_version)
            self.streams.append(model)
        self._check_supported()
Esempio n. 25
0
    def scrape_file(self):
        """
        Scrape file.

        The scraper is skipped with a message when the well-formedness
        check is not requested and the scraper is only meant for that
        check. Otherwise the XML report of veraPDF is recorded as a message
        and, if the file could not be parsed or is not compliant, also as
        an error. Metadata models are added and support is checked only if
        the file is well-formed.

        :raises: VeraPDFError if veraPDF exits with a non-zero return code
        """
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return
        cmd = [VERAPDF_PATH, encode_path(self.filename)]

        shell = Shell(cmd)
        if shell.returncode != 0:
            raise VeraPDFError(shell.stderr)
        self._messages.append(shell.stdout)

        # Bound up front so that the name exists even when no validation
        # report could be read.
        profile = None
        try:
            report = ET.fromstring(shell.stdout_raw)
            if report.xpath("//batchSummary")[0].get("failedToParse") == "0":
                validation = report.xpath("//validationReport")[0]
                if validation.get("isCompliant") == "false":
                    self._errors.append(shell.stdout)
                profile = validation.get("profileName")
            else:
                self._errors.append(shell.stdout)
        except ET.XMLSyntaxError:
            self._errors.append(shell.stderr)

        if self.well_formed:
            for md_class in self._supported_metadata:
                self.streams.append(
                    md_class(profile, self._given_mimetype,
                             self._given_version))
            # Support is checked once, after all models have been added,
            # instead of once per model.
            self._check_supported()
Esempio n. 26
0
    def scrape_file(self):
        """
        Run JHove on the file and keep its XML report in self._report.

        The status field of the report is recorded as a message. Unless it
        contains "Well-Formed and valid", the output of JHove is recorded
        as an error. Metadata models are then collected from the report.
        """
        command = ["jhove", "-h", "XML", "-m"]
        command += [self._jhove_module, self.filename]
        jhove = Shell(command)

        if jhove.returncode != 0:
            failure = "JHove returned error: %s\n%s" % (
                jhove.returncode, jhove.stderr)
            self._errors.append(failure)

        self._report = lxml.etree.fromstring(jhove.stdout_raw)

        status = get_field(self._report, "status")
        self._messages.append(status)
        if "Well-Formed and valid" not in status:
            self._errors.append("Validator returned error.")
            for output in (jhove.stdout, jhove.stderr):
                self._errors.append(output)

        models = self.iterate_models(
            well_formed=self.well_formed, report=self._report)
        self.streams = list(models)

        self._check_supported(allow_unav_version=True,
                              allow_unap_version=True)