def scrape_file(self):
    """Scrape an SPSS Portable file and populate streams."""
    # The first line of a valid SPSS Portable file contains the header
    # marker exactly once; anything else is flagged as an error.
    with io_open(self.filename, "rb") as infile:
        header_line = infile.readline()
    if header_line.count(SPSS_PORTABLE_HEADER) != 1:
        self._errors.append("File is not SPSS Portable format.")

    # Attempt a conversion with pspp-convert: if the converted.por file
    # appears, the original file is considered well-formed.
    workdir = tempfile.mkdtemp()
    converted = os.path.join(workdir, "converted.por")
    try:
        shell = Shell([PSPP_PATH, self.filename, converted])
        if shell.stderr:
            self._errors.append(shell.stderr)
        self._messages.append(shell.stdout)
        if os.path.isfile(converted):
            self._messages.append("File conversion was succesful.")
        else:
            self._errors.append("File conversion failed.")
    finally:
        # Always clean up the scratch directory, even on failure.
        shutil.rmtree(workdir)

    self.streams = list(
        self.iterate_models(well_formed=self.well_formed))
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def scrape_file(self):
    """Scrape an SPSS Portable file and populate streams."""
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append(
            "Skipping scraper: Well-formed check not used.")
        return

    # The first line must contain the SPSS Portable header marker.
    with io_open(self.filename, "rb") as infile:
        header_line = infile.readline()
    if SPSS_PORTABLE_HEADER not in header_line:
        self._errors.append("File is not SPSS Portable format.")

    # Attempt a conversion with pspp-convert: if the converted.por file
    # appears, the original file is considered well-formed.
    workdir = tempfile.mkdtemp()
    converted = os.path.join(workdir, "converted.por")
    try:
        shell = Shell([PSPP_PATH, self.filename, converted])
        if shell.stderr:
            self._errors.append(shell.stderr)
        self._messages.append(shell.stdout)
        if os.path.isfile(converted):
            self._messages.append("File conversion was succesful.")
        else:
            self._errors.append("File conversion failed.")
    finally:
        # Always clean up the scratch directory, even on failure.
        shutil.rmtree(workdir)

    self.streams.extend(
        md_class(self._given_mimetype, self._given_version)
        for md_class in self._supported_metadata)
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def exec_xmllint(self, dtd_check=False, schema=None):
    """
    Execute xmllint.

    :dtd_check: True, if check against DTD, false otherwise
    :schema: Schema file
    :returns: tuple including: returncode, stdout, strderr
    """
    # Build the argument list conditionally, preserving option order.
    command = ["xmllint"]
    if dtd_check:
        command.append("--valid")
    command.append("--huge")
    command.append("--noout")
    if self._no_network:
        command.append("--nonet")
    if self._catalogs:
        command.append("--catalogs")
    if schema:
        command.extend(["--schema", schema])
    command.append(encode_path(self.filename))

    # Point xmllint at the catalog file(s) only when one is configured.
    environment = None
    if self._catalog_path is not None:
        environment = {"SGML_CATALOG_FILES": self._catalog_path}

    shell = Shell(command, env=environment)
    return (shell.returncode, shell.stdout, shell.stderr)
def _compile_phase(self, stylesheet, inputfile, allowed_codes,
                   outputfile=None, outputfilter=False):
    """
    Compile one phase.

    :stylesheet: XSLT file to used in the conversion
    :inputfile: Input document filename
    :outputfile: Filename of the resulted document, stdout if None
    :outputfilter: Use outputfilter parameter with value only_messages
    :return: Shell instance
    """
    cmd = ["xsltproc"]
    if outputfile:
        cmd.extend(["-o", outputfile])
    # The output filter is suppressed in verbose mode so that all
    # messages are kept.
    if outputfilter and not self._verbose:
        cmd.extend(["--stringparam", "outputfilter", "only_messages"])
    cmd.extend([os.path.join(SCHEMATRON_DIRNAME, stylesheet),
                encode_path(inputfile)])

    shell = Shell(cmd)
    # Any return code outside the caller-supplied whitelist is fatal.
    if shell.returncode not in allowed_codes:
        raise SchematronValidatorError(
            "Error {}\nstdout:\n{}\nstderr:\n{}".format(
                shell.returncode, shell.stdout, shell.stderr))
    return shell
def scrape_file(self):
    """Scrape file by test-converting it to PDF with LibreOffice.

    Conversion errors on stderr are recorded as scraper errors;
    metadata models are added regardless, and MIME/version checks are
    relaxed because this scraper cannot determine them itself.
    """
    if not self._check_wellformed and self._only_wellformed:
        # BUG FIX: the implicit string concatenation was missing a
        # space ("...check notused."); now matches the other scrapers.
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    # soffice writes state under $HOME; use a throwaway directory so
    # parallel runs do not collide.
    temp_dir = tempfile.mkdtemp()
    try:
        env = {"HOME": temp_dir}
        shell = Shell([
            "soffice", "--convert-to", "pdf", "--outdir", temp_dir,
            encode_path(self.filename)
        ], env=env)
        if shell.stderr:
            self._errors.append(shell.stderr)
        self._messages.append(shell.stdout)
    except OSError as error:
        # E.g. soffice binary not found.
        self._errors.append("Error handling file: {}".format(error))
    finally:
        shutil.rmtree(temp_dir)

    for md_class in self._supported_metadata:
        self.streams.append(
            md_class(self._given_mimetype, self._given_version))
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def scrape_file(self):
    """Scrape DPX."""
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append("Skipping scraper: "
                              "Well-formed check not used.")
        return

    shell = Shell(["dpxv", encode_path(self.filename)])
    # A non-zero return code means the validator itself failed.
    if shell.returncode != 0:
        raise DPXvError(shell.stderr)

    # Record validator output line by line.
    if shell.stderr:
        self._errors.extend(shell.stderr.splitlines())
    if shell.stdout:
        self._messages.extend(shell.stdout.splitlines())

    self.streams.extend(
        md_class(mimetype=self._given_mimetype,
                 version=self._given_version,
                 info=self.info(),
                 filename=self.filename)
        for md_class in self._supported_metadata)
    self._check_supported()
def scrape_file(self):
    """
    Scrape ARC file by converting to WARC.

    This is done using Warctools" arc2warc converter.
    """
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append(
            "Skipping scraper: Well-formed check not used.")
        return
    # An empty file cannot be a valid ARC.
    if os.path.getsize(self.filename) == 0:
        self._errors.append("Empty file.")
        return

    # Direct the converter output into a throwaway temp file; only the
    # conversion's success matters, not the resulting WARC.
    with tempfile.NamedTemporaryFile(prefix="scraper-warctools.") \
            as warcfile:
        shell = Shell(command=["arc2warc", encode_path(self.filename)],
                      stdout=warcfile)
        if shell.returncode != 0:
            self._errors.append(
                "Failed: returncode %s" % shell.returncode)
            # stderr may not be valid UTF-8, sanitize before storing.
            self._errors.append(sanitize_bytestring(shell.stderr_raw))
            return
        self._messages.append("File was analyzed successfully.")
        if shell.stdout:
            self._messages.append(shell.stdout)

    self.streams.extend(
        md_class(self._given_mimetype, self._given_version)
        for md_class in self._supported_metadata)
    self._check_supported(allow_unav_version=True)
def scrape_file(self):
    """Scrape file with Ghostscript and record validity.

    The file is rendered to the null device; any stderr output or a
    non-zero return code marks the file as not well-formed.
    """
    if not self._check_wellformed and self._only_wellformed:
        # BUG FIX: the implicit string concatenation was missing a
        # space ("...check notused."); now matches the other scrapers.
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    shell = Shell([
        "gs", "-o", "/dev/null", "-sDEVICE=nullpage",
        encode_path(self.filename)])

    for model in self._supported_metadata:
        self.streams.append(model(self._given_mimetype,
                                  self._given_version))

    # Ghostscript may print characters which cannot be converted to
    # UTF-8, so decode with replacement.
    stdout_message = ensure_text(shell.stdout_raw, errors='replace')
    stderr_message = ensure_text(shell.stderr_raw, errors='replace')

    self._messages.append(stdout_message)
    # Ghostscript will result 0 if it can repair errors.
    # However, in those cases an error is logged to either _errors or
    # _messages. This case should be handled as well-formed failure.
    if stderr_message:
        self._errors.append(stderr_message)
    elif shell.returncode != 0:
        self._errors.append("Ghostscript returned return code: %s"
                            % shell.returncode)
    # If no errors have been logged, the file is valid.
    else:
        self._messages.append("Well-Formed and valid")
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def scrape_file(self):
    """
    Scrape file.

    Runs veraPDF on the file, records compliance output in messages or
    errors, and extracts the validation profile name for the metadata
    models.

    :raises: VeraPDFError
    """
    cmd = [VERAPDF_PATH, encode_path(self.filename)]
    shell = Shell(cmd)
    # Return codes outside OK_CODES indicate the tool itself failed,
    # not merely that the file was non-compliant.
    if shell.returncode not in OK_CODES:
        raise VeraPDFError(shell.stderr)
    # profile stays None if the report cannot be parsed or the batch
    # failed to parse the file.
    profile = None
    try:
        report = ET.fromstring(shell.stdout_raw)
        # failedToParse == "0" means veraPDF could read the PDF at all.
        if report.xpath("//batchSummary")[0].get("failedToParse") == "0":
            compliant = report.xpath("//validationReport")[0].get(
                "isCompliant")
            # Non-compliant files are errors; compliant ones messages.
            if compliant == "false":
                self._errors.append(shell.stdout)
            else:
                self._messages.append(shell.stdout)
            profile = \
                report.xpath("//validationReport")[0].get("profileName")
        else:
            self._errors.append(shell.stdout)
    except ET.XMLSyntaxError:
        # veraPDF produced unparseable XML; keep its stderr as the error.
        self._errors.append(shell.stderr)

    self.streams = list(
        self.iterate_models(well_formed=self.well_formed,
                            profile=profile))
    self._check_supported()
def scrape_file(self):
    """Scrape A/V files.

    Probes the file with ffprobe, decodes it with ffmpeg to detect
    stream errors, rejects non-linear PCM audio, and builds one
    metadata model per stream.
    """
    try:
        probe_results = ffmpeg.probe(encode_path(self.filename))
        # Treat the container ("format") as stream 0 and shift the
        # real stream indices up by one.
        streams = [probe_results["format"]] + probe_results["streams"]
        for stream in streams:
            if "index" not in stream:
                stream["index"] = 0
            else:
                stream["index"] = stream["index"] + 1
    except ffmpeg.Error as err:
        # NOTE(review): if probing fails, `streams` is never assigned
        # and the loops below would raise NameError unless the decode
        # step below also fails and returns first — confirm intended.
        self._errors.append("Error in analyzing file.")
        self._errors.append(ensure_text(err.stderr))

    # Decode the whole file to the null muxer to surface decoder errors.
    shell = Shell([
        "ffmpeg", "-v", "error", "-i", encode_path(self.filename),
        "-f", "null", "-"
    ])
    if shell.returncode == 0:
        self._messages.append("The file was analyzed successfully.")
    if self._filter_stderr(shell.stderr):
        self._errors.append(shell.stderr)
        return

    # We deny e.g. A-law PCM, mu-law PCM, DPCM and ADPCM and allow
    # only signed/unsigned linear PCM. Note that we need this check
    # only if PCM audio is present. This should not be given e.g.
    # for video streams nor audio streams of another type (such as
    # MPEG).
    for stream in streams:
        if "PCM" in stream.get("codec_long_name", UNAV) and not \
                any(stream.get("codec_long_name", UNAV).startswith(x)
                    for x in ["PCM signed", "PCM unsigned"]):
            self._errors.append("%s does not seem to be LPCM format."
                                % stream["codec_long_name"])

    container = False
    for index in range(len(streams)):
        # FFMpeg has separate "format" (relevant for containers) and
        # "streams" (relevant for all files) elements in its output.
        # We know whether we'll have streams + container or just
        # streams only after scraping the first stream, so there's a
        # risk of trying to add one too many streams. This check
        # prevents constructing more metadata models than there are
        # streams.
        if not container and index == len(streams) - 1:
            break
        self.streams += list(
            self.iterate_models(probe_results=probe_results,
                                index=index))
        for stream in self.streams:
            if stream.hascontainer():
                container = True
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def test_shell_with_env():
    """Test running commands using custom environment variables."""
    # Extend the inherited environment with one extra variable.
    env = dict(os.environ, TEST_VARIABLE="testing")
    shell = Shell(["printenv", "TEST_VARIABLE"], env=env)
    assert shell.returncode == 0
    assert shell.stdout == "testing\n"
    assert not shell.stderr
def scrape_file(self):
    """Scrape A/V files.

    Probes the file with ffprobe, decodes it with ffmpeg to detect
    stream errors, and builds one metadata model per stream.
    """
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    try:
        probe_results = ffmpeg.probe(encode_path(self.filename))
        # Treat the container ("format") as stream 0 and shift the
        # real stream indices up by one.
        streams = [probe_results["format"]] + probe_results["streams"]
        for stream in streams:
            if "index" not in stream:
                stream["index"] = 0
            else:
                stream["index"] = stream["index"] + 1
    except ffmpeg.Error as err:
        # NOTE(review): if probing fails, `streams` is never assigned
        # and the loop below would raise NameError unless the decode
        # step below also fails and returns first — confirm intended.
        self._errors.append("Error in analyzing file.")
        self._errors.append(ensure_text(err.stderr))

    # Decode the whole file to the null muxer to surface decoder errors.
    shell = Shell([
        "ffmpeg", "-v", "error", "-i", encode_path(self.filename),
        "-f", "null", "-"
    ])
    if shell.returncode == 0:
        self._messages.append("The file was analyzed successfully.")
    if self._filter_stderr(shell.stderr):
        self._errors.append(shell.stderr)
        return

    container = False
    for index in range(len(streams)):
        # FFMpeg has separate "format" (relevant for containers) and
        # "streams" (relevant for all files) elements in its output. We
        # know whether we'll have streams + container or just streams only
        # after scraping the first stream, so there's a risk of trying to
        # add one too many streams. This check prevents constructing more
        # metadata models than there are streams.
        if not container and index == len(streams) - 1:
            break
        for md_class in self._supported_metadata:
            if md_class.is_supported(self._mimetype_guess):
                stream = md_class(probe_results, index,
                                  self._given_mimetype,
                                  self._given_version)
                self.streams.append(stream)
                if stream.hascontainer():
                    container = True

    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def test_shell_output_to_file():
    """Test having output of a shell command directed to a file"""
    with TemporaryFile("w+") as outfile:
        shell = Shell(["seq", "5"], stdout=outfile)
        assert shell.returncode == 0
        # Output went to the file, so the captured streams are empty.
        assert not shell.stdout
        assert not shell.stderr
        outfile.seek(0)
        # The file should contain the numbers 1..5, one per line.
        for expected, line in enumerate(outfile, start=1):
            assert line == six.text_type(expected) + "\n"
def test_shell(command, expected_returncode, expected_stdout,
               expected_stderr):
    """Test running commands normally."""
    shell = Shell(command)
    # Decoded attributes are text, raw attributes are bytes.
    for decoded in (shell.stdout, shell.stderr):
        assert isinstance(decoded, six.text_type)
    for raw in (shell.stdout_raw, shell.stderr_raw):
        assert isinstance(raw, six.binary_type)
    assert shell.returncode == expected_returncode
    assert shell.stderr == expected_stderr
    assert shell.stdout == expected_stdout
def scrape_file(self):
    """Scrape file."""
    shell = Shell(["pngcheck", encode_path(self.filename)])
    # A non-zero return code means pngcheck rejected the file.
    if shell.returncode != 0:
        self._errors.append("Failed: returncode %s" % shell.returncode)
        self._errors.append(shell.stderr)
    self._messages.append(shell.stdout)

    # This scraper does not know anything about the MIME type, so checking
    # is not useful. Just add metadata models.
    self.streams = list(self.iterate_models())
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def file_command(filename, parameters=None):
    """Use file command in shell.

    :filename: Filename for the file command.
    :parameters: Parameter list for the file command.
    :returns: Shell class
    """
    # Prefer the bundled file binary when both it and its library
    # directory exist; otherwise fall back to the system "file".
    cmd = "file"
    env = {}
    if os.path.isfile(FILECMD_PATH) and os.path.isdir(LD_LIBRARY_PATH):
        cmd = FILECMD_PATH
        env = {"LD_LIBRARY_PATH": LD_LIBRARY_PATH}
    extra_args = [] if parameters is None else parameters
    return Shell([cmd] + extra_args + [encode_path(filename)], env=env)
def scrape_file(self):
    """Scrape file using vnu.jar."""
    # The filter file lives next to this module.
    filterfile = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'vnu_filters.txt')
    shell = Shell([
        "java", "-jar", VNU_PATH, "--verbose",
        "--filterfile", filterfile, self.filename])
    if shell.stderr:
        self._errors.append(shell.stderr)
    self._messages.append(shell.stdout)
    # Metadata models are only built for well-formed files.
    if self.well_formed:
        self.streams = list(
            self.iterate_models(well_formed=self.well_formed))
    self._check_supported()
def scrape_file(self):
    """Scrape file using vnu.jar."""
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    shell = Shell(
        ["java", "-jar", VNU_PATH, "--verbose", self.filename])
    if shell.stderr:
        self._errors.append(shell.stderr)
    self._messages.append(shell.stdout)
    # Metadata models are only built for well-formed files.
    if self.well_formed:
        self.streams.extend(
            md_class(self._given_mimetype, self._given_version)
            for md_class in self._supported_metadata)
    self._check_supported()
def scrape_file(self):
    """Run JHove command and store XML output to self.report.

    Records the JHove status in messages, logs errors when the file is
    not "Well-Formed and valid", resolves the MIME type, and builds the
    matching metadata models.
    """
    if not self._check_wellformed and self._only_wellformed:
        # BUG FIX: the implicit string concatenation was missing a
        # space ("...check notused."); now matches the other scrapers.
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    exec_cmd = ["jhove", "-h", "XML", "-m", self._jhove_module,
                self.filename]
    shell = Shell(exec_cmd)

    if shell.returncode != 0:
        self._errors.append("JHove returned error: %s\n%s" % (
            shell.returncode, shell.stderr))

    self._report = lxml.etree.fromstring(shell.stdout_raw)
    status = get_field(self._report, "status")
    self._messages.append(status)
    if "Well-Formed and valid" not in status:
        self._errors.append("Validator returned error.")
        self._errors.append(shell.stdout)
        self._errors.append(shell.stderr)

    # If the MIME type is forced, use that, otherwise scrape the MIME type
    if self._given_mimetype:
        mimetype = self._given_mimetype
    else:
        mimetype = get_field(self._report, "mimeType")
    if mimetype == "text/xml":  # XML MIME type has to be set manually
        mimetype = "application/xhtml+xml"
    elif mimetype is not None and "audio/vnd.wave" in mimetype:  # wav also
        mimetype = "audio/x-wav"

    for md_class in self._supported_metadata:
        if md_class.is_supported(mimetype) or self._force_metadata_use:
            self.streams.append(
                md_class(self._report, self._errors,
                         self._given_mimetype, self._given_version))
    self._check_supported(allow_unav_version=True)
def scrape_file(self):
    """Scrape WARC file."""
    # An empty file cannot be a valid WARC.
    if os.path.getsize(self.filename) == 0:
        self._errors.append("Empty file.")
        return

    shell = Shell(["warcvalid", self.filename])
    if shell.returncode != 0:
        self._errors.append("Failed: returncode %s" % shell.returncode)
        # Filter some trash printed by warcvalid.
        kept_lines = [line for line in shell.stderr.split("\n")
                      if u"ignored line" not in line]
        self._errors.append("\n".join(kept_lines))
        return

    self._messages.append(shell.stdout)
    super(WarctoolsFullScraper, self).scrape_file()
def scrape_file(self):
    """Scrape file."""
    # soffice writes state under $HOME; use a throwaway directory so
    # parallel runs do not collide.
    outdir = tempfile.mkdtemp()
    try:
        shell = Shell(
            [SOFFICE_PATH, "--convert-to", "pdf", "--outdir", outdir,
             encode_path(self.filename)],
            env={"HOME": outdir})
        if shell.stderr:
            self._errors.append(shell.stderr)
        self._messages.append(shell.stdout)
    except OSError as error:
        # E.g. soffice binary not found.
        self._errors.append("Error handling file: {}".format(error))
    finally:
        shutil.rmtree(outdir)

    self.streams = list(self.iterate_models())
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def detect(self):
    """
    Run veraPDF to find out if the file is PDF/A and possibly its
    version.

    If the file is not a PDF/A, the MIME type and version are left as
    None.
    """
    cmd = [VERAPDF_PATH, encode_path(self.filename)]
    shell = Shell(cmd)

    # Test if the file is a PDF/A
    if shell.returncode != 0:
        self._set_info_not_pdf_a(shell)
        return
    try:
        report = ET.fromstring(shell.stdout_raw)
        # failedToParse == "0" means veraPDF could read the PDF at all.
        if report.xpath("//batchSummary")[0].get("failedToParse") == "0":
            compliant = report.xpath("//validationReport")[0].get(
                "isCompliant")
            if compliant == "false":
                self._set_info_not_pdf_a()
                return
            profile = \
                report.xpath("//validationReport")[0].get("profileName")
        else:
            self._set_info_not_pdf_a(shell)
            return
    except ET.XMLSyntaxError:
        # veraPDF produced unparseable XML output.
        self._set_info_not_pdf_a(shell)
        return

    # If we have not encountered problems, the file is PDF/A and its
    # version can be read from the profile. The profile name looks like
    # "PDF/A-1B validation profile", from which e.g. "A1b" is derived.
    version = profile.split("PDF/A")[1].split(" validation profile")[0]
    self.version = "A{}".format(version.lower())
    self.mimetype = "application/pdf"
    self.info = {
        "class": self.__class__.__name__,
        "messages": ["PDF/A version detected by veraPDF."],
        "errors": [],
        "tools": []
    }
def scrape_file(self):
    """Scrape file."""
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append(
            "Skipping scraper: Well-formed check not used.")
        return
    shell = Shell(["pngcheck", encode_path(self.filename)])
    # A non-zero return code means pngcheck rejected the file.
    if shell.returncode != 0:
        self._errors.append("Failed: returncode %s" % shell.returncode)
        self._errors.append(shell.stderr)
    self._messages.append(shell.stdout)

    # This scraper does not know anything about the MIME type, so checking
    # is not useful. Just add metadata models.
    self.streams.extend(
        md_class(self._given_mimetype, self._given_version)
        for md_class in self._supported_metadata)
    self._check_supported(allow_unav_mime=True, allow_unav_version=True)
def scrape_file(self):
    """Scrape WARC file.

    Validates the file with warcvalid, then reads its first line
    (transparently handling gzip compression) to feed the metadata
    models.
    """
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    # An empty file cannot be a valid WARC.
    size = os.path.getsize(self.filename)
    if size == 0:
        self._errors.append("Empty file.")
        return
    shell = Shell(["warcvalid", self.filename])

    if shell.returncode != 0:
        self._errors.append("Failed: returncode %s" % shell.returncode)
        # Filter some trash printed by warcvalid.
        filtered_errors = [line for line in shell.stderr.split("\n")
                           if u"ignored line" not in line]
        self._errors.append("\n".join(filtered_errors))
        return

    self._messages.append(shell.stdout)

    try:
        # First assume archive is compressed
        with gzip.open(self.filename) as warc_fd:
            line = warc_fd.readline()
    except IOError:
        # Not compressed archive
        with io_open(self.filename, "rb") as warc_fd:
            line = warc_fd.readline()
    except Exception as exception:  # pylint: disable=broad-except
        # Compressed but corrupted gzip file
        self._errors.append(six.text_type(exception))
        return

    self._messages.append("File was analyzed successfully.")
    for md_class in self._supported_metadata:
        self.streams.append(md_class(line, self._given_mimetype,
                                     self._given_version))
    self._check_supported()
def scrape_file(self):
    """
    Scrape file.

    Runs veraPDF on the file, records compliance output, and builds
    metadata models with the detected validation profile.

    :raises: VeraPDFError
    """
    if not self._check_wellformed and self._only_wellformed:
        self._messages.append("Skipping scraper: Well-formed check not "
                              "used.")
        return
    cmd = [VERAPDF_PATH, encode_path(self.filename)]
    shell = Shell(cmd)

    if shell.returncode != 0:
        raise VeraPDFError(shell.stderr)
    self._messages.append(shell.stdout)

    # BUG FIX: initialize profile so the metadata loop below cannot
    # raise UnboundLocalError when the report fails to parse or the
    # file fails validation (same pattern as the sibling veraPDF
    # scraper that presets profile to None).
    profile = None
    try:
        report = ET.fromstring(shell.stdout_raw)
        # failedToParse == "0" means veraPDF could read the PDF at all.
        if report.xpath("//batchSummary")[0].get("failedToParse") == "0":
            compliant = report.xpath("//validationReport")[0].get(
                "isCompliant")
            if compliant == "false":
                self._errors.append(shell.stdout)
            profile = \
                report.xpath("//validationReport")[0].get("profileName")
        else:
            self._errors.append(shell.stdout)
    except ET.XMLSyntaxError:
        # veraPDF produced unparseable XML; keep its stderr as the error.
        self._errors.append(shell.stderr)

    if self.well_formed:
        for md_class in self._supported_metadata:
            self.streams.append(
                md_class(profile, self._given_mimetype,
                         self._given_version))
    self._check_supported()
def scrape_file(self):
    """Run JHove command and store XML output to self.report."""
    shell = Shell(["jhove", "-h", "XML", "-m", self._jhove_module,
                   self.filename])
    if shell.returncode != 0:
        self._errors.append("JHove returned error: %s\n%s" % (
            shell.returncode, shell.stderr))

    # Parse the XML report and record the validation status.
    self._report = lxml.etree.fromstring(shell.stdout_raw)
    status = get_field(self._report, "status")
    self._messages.append(status)
    if "Well-Formed and valid" not in status:
        self._errors.append("Validator returned error.")
        self._errors.append(shell.stdout)
        self._errors.append(shell.stderr)

    self.streams = list(self.iterate_models(
        well_formed=self.well_formed, report=self._report))
    self._check_supported(allow_unav_version=True,
                          allow_unap_version=True)