Example #1
    def scrape_file(self):
        """Scrape file."""
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not"
                                  "used.")
            return
        shell = Shell([
            "gs", "-o", "/dev/null", "-sDEVICE=nullpage",
            encode_path(self.filename)])

        for model in self._supported_metadata:
            self.streams.append(model(self._given_mimetype,
                                      self._given_version))

        # Ghostscript may print characters which cannot be converted to UTF-8
        stdout_message = ensure_text(shell.stdout_raw, errors='replace')
        stderr_message = ensure_text(shell.stderr_raw, errors='replace')
        self._messages.append(stdout_message)

        # Ghostscript returns 0 if it is able to repair errors. However,
        # in those cases an error is still logged to either _errors or
        # _messages, and the file must be treated as not well-formed.
        if stderr_message:
            self._errors.append(stderr_message)
        elif shell.returncode != 0:
            self._errors.append("Ghostscript returned return code: %s"
                                % shell.returncode)

        # If no errors have been logged, the file is valid.
        else:
            self._messages.append("Well-Formed and valid")

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
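The Shell helper above wraps an external Ghostscript call that renders the file to the null device, so the whole document is parsed without producing any output. As a rough standalone illustration of the same check (using subprocess directly, which is an assumption and not the Shell class itself):

import subprocess

def ghostscript_well_formed(path):
    # Render to the null device: the whole file is parsed, nothing is written.
    proc = subprocess.run(
        ["gs", "-o", "/dev/null", "-sDEVICE=nullpage", path],
        capture_output=True)
    # Ghostscript may print bytes that cannot be decoded as UTF-8.
    stderr_text = proc.stderr.decode("utf-8", errors="replace")
    # Any stderr output or a non-zero exit counts as a well-formedness failure.
    return proc.returncode == 0 and not stderr_text.strip()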
Example #2
def scrape_file(ctx, filename, check_wellformed, tool_info, mimetype, version):
    """
    Identify file type, collect metadata, and optionally check well-formedness.

    In addition to the given options, the user can provide any extra options
    that are passed on to the scraper. These options must be given in the
    long form, e.g. "--charset=UTF-8" or "--charset UTF-8".
    \f

    :ctx: Context object
    :filename: Path to the file that should be scraped
    :check_wellformed: Flag for whether the scraper checks well-formedness
    :tool_info: Flag for whether the output includes messages from the
                individual third-party tools
    :mimetype: Specified mimetype for the scraped file
    :version: Specified version for the scraped file
    """
    scraper = Scraper(filename,
                      mimetype=mimetype,
                      version=version,
                      **_extra_options_to_dict(ctx.args))
    scraper.scrape(check_wellformed=check_wellformed)

    results = {
        "path": ensure_text(scraper.filename),
        "MIME type": ensure_text(scraper.mimetype),
        "version": ensure_text(scraper.version),
        "metadata": scraper.streams,
        "grade": scraper.grade()
    }
    if check_wellformed:
        results["well-formed"] = scraper.well_formed
    if tool_info:
        results["tool_info"] = scraper.info

    errors = {}

    for item in scraper.info.values():
        if "ScraperNotFound" in item["class"]:
            raise click.ClickException("Proper scraper was not found. The "
                                       "file was not analyzed.")

        if item["errors"]:
            errors[item["class"]] = item["errors"]

    if errors:
        results["errors"] = errors

    click.echo(json.dumps(results, indent=4))
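The command relies on a helper, _extra_options_to_dict, to turn the leftover long-form options in ctx.args into keyword arguments for the Scraper. Its real implementation is not shown here; the sketch below only illustrates the behaviour described in the docstring, where "--charset=UTF-8" and "--charset UTF-8" both become charset="UTF-8":

def _extra_options_to_dict(args):
    # Illustrative sketch only; the actual helper may differ.
    options = {}
    pending_key = None
    for arg in args:
        if arg.startswith("--"):
            if "=" in arg:
                # "--charset=UTF-8" form
                name, _, value = arg[2:].partition("=")
                options[name] = value
                pending_key = None
            else:
                # "--charset UTF-8" form: the value follows as the next argument
                pending_key = arg[2:]
        elif pending_key is not None:
            options[pending_key] = arg
            pending_key = None
    return options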
Example #3
    def scrape_file(self):
        """Do the Schematron check."""
        if self._schematron_file is None:
            self._errors.append("Schematron file missing from parameters.")
            return

        xslt_filename = self._compile_schematron()

        shell = self._compile_phase(stylesheet=xslt_filename,
                                    inputfile=self.filename,
                                    allowed_codes=[0, 6])

        self._returncode = shell.returncode
        if shell.stderr:
            self._errors.append(shell.stderr)

        if not self._verbose and shell.returncode == 0:
            self._messages.append(
                ensure_text(self._filter_duplicate_elements(shell.stdout_raw)))
        else:
            self._messages.append(shell.stdout)

        self.streams = list(self.iterate_models(well_formed=self.well_formed))

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
Example #4
    def scrape_file(self):
        """Do the Schematron check."""
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return

        if self._schematron_file is None:
            self._errors.append("Schematron file missing from parameters.")
            return

        xslt_filename = self._compile_schematron()

        shell = self._compile_phase(stylesheet=xslt_filename,
                                    inputfile=self.filename,
                                    allowed_codes=[0, 6])

        self._returncode = shell.returncode
        if shell.stderr:
            self._errors.append(shell.stderr)

        if not self._verbose and shell.returncode == 0:
            self._messages.append(
                ensure_text(self._filter_duplicate_elements(shell.stdout_raw)))
        else:
            self._messages.append(shell.stdout)

        for md_class in self._supported_metadata:
            self.streams.append(
                md_class(self._given_mimetype, self._given_version))

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
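Both variants above compile the Schematron file to an XSLT stylesheet and apply it with an external processor through _compile_phase. For comparison, lxml ships in-process ISO Schematron support; the sketch below is only an alternative illustration of the same kind of check, not the scraper's own implementation:

from lxml import etree
from lxml.isoschematron import Schematron

def schematron_report(schematron_path, xml_path):
    # Compile the Schematron rules and validate the document,
    # keeping the SVRL report for later inspection.
    schema = Schematron(etree.parse(schematron_path), store_report=True)
    is_valid = schema.validate(etree.parse(xml_path))
    return is_valid, schema.validation_report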
Example #5
    def scrape_file(self):
        """Scrape A/V files."""
        try:
            probe_results = ffmpeg.probe(encode_path(self.filename))
            streams = [probe_results["format"]] + probe_results["streams"]
            for stream in streams:
                if "index" not in stream:
                    stream["index"] = 0
                else:
                    stream["index"] = stream["index"] + 1
        except ffmpeg.Error as err:
            self._errors.append("Error in analyzing file.")
            self._errors.append(ensure_text(err.stderr))
            # Probing failed, so there are no streams left to analyze.
            return

        shell = Shell([
            "ffmpeg", "-v", "error", "-i",
            encode_path(self.filename), "-f", "null", "-"
        ])

        if shell.returncode == 0:
            self._messages.append("The file was analyzed successfully.")

        if self._filter_stderr(shell.stderr):
            self._errors.append(shell.stderr)
            return

        # We reject e.g. A-law PCM, mu-law PCM, DPCM and ADPCM and allow
        # only signed/unsigned linear PCM. Note that the check is needed
        # only if PCM audio is present; it does not apply to video
        # streams or to audio streams of another type (such as MPEG).
        for stream in streams:
            if "PCM" in stream.get("codec_long_name", UNAV) and not \
                    any(stream.get("codec_long_name", UNAV).startswith(x)
                        for x in ["PCM signed", "PCM unsigned"]):
                self._errors.append("%s does not seem to be LPCM format." %
                                    stream["codec_long_name"])

        container = False
        for index in range(len(streams)):
            # FFMpeg has separate "format" (relevant for containers) and
            # "streams" (relevant for all files) elements in its output.
            # We know whether we'll have streams + container or just
            # streams only after scraping the first stream, so there's a
            # risk of trying to add one too many streams. This check
            # prevents constructing more metadata models than there are
            # streams.
            if not container and index == len(streams) - 1:
                break

            self.streams += list(
                self.iterate_models(probe_results=probe_results, index=index))

            for stream in self.streams:
                if stream.hascontainer():
                    container = True

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
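The PCM check above accepts only linear PCM streams. The same string test in isolation, with typical ffprobe codec_long_name values used as examples:

def is_allowed_pcm(codec_long_name):
    if "PCM" not in codec_long_name:
        # Not PCM at all, so the restriction does not apply.
        return True
    # Reject A-law, mu-law, DPCM, ADPCM and so on.
    return codec_long_name.startswith(("PCM signed", "PCM unsigned"))

assert is_allowed_pcm("PCM signed 16-bit little-endian")
assert not is_allowed_pcm("PCM A-law / G.711 A-law")
assert is_allowed_pcm("MP3 (MPEG audio layer 3)")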
Example #6
    def version(self):
        """Return the version."""
        if self._given_mimetype and self._given_version:
            return self._given_version

        if len(self._line.split(b"WARC/", 1)) > 1:
            return ensure_text(
                self._line.split(b"WARC/", 1)[1].split(b" ")[0].strip())
        return "(:unav)"
Example #7
    def stdout(self):
        """
        Command standard output.

        :returns: Stdout as unicode string
        """
        if self.stdout_raw is None:
            return None
        return ensure_text(self.stdout_raw)
Example #8
    def stderr(self):
        """
        Standard error output from the command.

        :returns: Stderr as unicode string
        """
        if self.stderr_raw is None:
            return None
        return ensure_text(self.stderr_raw)
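Both properties decode raw byte output captured from an external command. A minimal stand-in showing how stdout_raw and stderr_raw might be populated (the real Shell class is not shown in these examples, and the subprocess usage here is an assumption):

import subprocess

class MiniShell(object):
    """Tiny illustrative stand-in for the Shell helper."""

    def __init__(self, command):
        proc = subprocess.run(command, capture_output=True)
        self.returncode = proc.returncode
        self.stdout_raw = proc.stdout   # bytes, possibly not valid UTF-8
        self.stderr_raw = proc.stderr

    @property
    def stdout(self):
        return self.stdout_raw.decode("utf-8", errors="replace")

    @property
    def stderr(self):
        return self.stderr_raw.decode("utf-8", errors="replace")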
Example #9
    def scrape_file(self):
        """Scrape A/V files."""
        if not self._check_wellformed and self._only_wellformed:
            self._messages.append("Skipping scraper: Well-formed check not "
                                  "used.")
            return

        try:
            probe_results = ffmpeg.probe(encode_path(self.filename))
            streams = [probe_results["format"]] + probe_results["streams"]
            for stream in streams:
                if "index" not in stream:
                    stream["index"] = 0
                else:
                    stream["index"] = stream["index"] + 1
        except ffmpeg.Error as err:
            self._errors.append("Error in analyzing file.")
            self._errors.append(ensure_text(err.stderr))
            # Probing failed, so there are no streams left to analyze.
            return

        shell = Shell([
            "ffmpeg", "-v", "error", "-i",
            encode_path(self.filename), "-f", "null", "-"
        ])

        if shell.returncode == 0:
            self._messages.append("The file was analyzed successfully.")
        # if "truncated" in self.filename:
        #     __import__('pdb').set_trace()

        if self._filter_stderr(shell.stderr):
            self._errors.append(shell.stderr)
            return

        container = False
        for index in range(len(streams)):
            # FFMpeg has separate "format" (relevant for containers) and
            # "streams" (relevant for all files) elements in its output. We
            # know whether we'll have streams + container or just streams only
            # after scraping the first stream, so there's a risk of trying to
            # add one too many streams. This check prevents constructing more
            # metadata models than there are streams.
            if not container and index == len(streams) - 1:
                break

            for md_class in self._supported_metadata:
                if md_class.is_supported(self._mimetype_guess):
                    stream = md_class(probe_results, index,
                                      self._given_mimetype,
                                      self._given_version)
                    self.streams.append(stream)
                    if stream.hascontainer():
                        container = True

        self._check_supported(allow_unav_mime=True, allow_unav_version=True)
Example #10
    def version(self):
        """Return version."""
        if self._given_mimetype and self._given_version:
            return self._given_version

        for supported_version in self._supported["image/x-dpx"]:

            version_string = "File {}: Validated as V{}".format(
                ensure_text(self._filename), supported_version)

            if version_string in self._messages:
                return supported_version

        return '(:unav)'
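The DPX version is recovered by checking whether the validator logged a specific message for one of the supported versions. The exact string being searched for looks like this (the filename and version below are placeholders):

filename = "sample.dpx"
supported_version = "2.0"
version_string = "File {}: Validated as V{}".format(filename, supported_version)
assert version_string == "File sample.dpx: Validated as V2.0"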
Example #11
    def _evaluate_xsd_location(self, location):
        """Determine whether or not the XSD schema is a
        local file in relation to the assigned XML file.

        If local file is found, absolute path will be returned for
        xsd-construction's import purpose. Otherwise return the location as-is.

        Absolute path is required for construct_xsd-function as the temporary
        file's location will differ a lot in related to the current
        self.filename.

        :param location: Given schema location in string.
        :return: String of the XSD location. If it's local, absolute path.
        """
        # schemaLocation or noNamespaceSchemaLocation is always either
        # direct path or relative path to the XML in question.
        local_location = os.path.join(
            os.path.dirname(encode_path(self.filename)),
            encode_path(location)
        )
        if os.path.isfile(local_location):
            return os.path.abspath(ensure_text(local_location))
        return ensure_text(location)
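In practice this means a relative schemaLocation is resolved against the directory of the XML file being scraped. An illustration with hypothetical paths:

import os

xml_file = "/data/records/metadata.xml"   # hypothetical XML being scraped
location = "schemas/record.xsd"           # relative schemaLocation value
candidate = os.path.join(os.path.dirname(xml_file), location)
# If /data/records/schemas/record.xsd exists, its absolute path is used;
# otherwise the location is returned unchanged (for example an URL).
print(candidate)   # /data/records/schemas/record.xsd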
Example #12
    def __init__(self, filename, **kwargs):
        """Initialize scraper.

        :filename: File path
        :kwargs: Extra arguments for certain scrapers
        """
        if filename is not None:
            filename = ensure_text(filename)
        self.filename = filename
        self.mimetype = None
        self.version = None
        self.streams = None
        self.well_formed = None
        self.info = None
        self._important = {}
        self._params = kwargs
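Example #2 above shows how this constructor is used from the command-line tool. Condensed to its essentials, a typical call looks like this (the file path is a placeholder and the mimetype argument is optional):

scraper = Scraper("/path/to/file.pdf", mimetype="application/pdf")
scraper.scrape(check_wellformed=True)
print(scraper.mimetype, scraper.version, scraper.well_formed)
print(scraper.streams)   # per-stream metadata collected by the scrapers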
Example #13
    def errors(self):
        """
        Return errors, filtering out known unnecessary ones.

        See KDKPAS-1190.

        :returns: Filtered error messages
        """
        errors_to_remove = []
        errors_to_add = []
        for error in self._errors:
            line = ensure_text(error)
            if "this namespace was already imported" in line:
                errors_to_remove.append(error)
            if "I/O error : Attempt to load network entity" in line:
                errors_to_add.append(
                    "Schema definition probably missing from XML catalog")
                errors_to_remove.append(error)
        for error in errors_to_remove:
            self._errors.remove(error)
        for error in errors_to_add:
            self._errors.append(error)

        return super(XmllintScraper, self).errors()
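The effect of the filtering can be shown with a hand-made error list; the strings below only contain the substrings the method looks for and are not complete libxml2 messages:

errors = [
    "element import: this namespace was already imported",
    "I/O error : Attempt to load network entity http://example.com/x.xsd",
    "some other validation error",
]
filtered = [error for error in errors
            if "this namespace was already imported" not in error
            and "I/O error : Attempt to load network entity" not in error]
filtered.append("Schema definition probably missing from XML catalog")
# filtered == ["some other validation error",
#              "Schema definition probably missing from XML catalog"]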