def exec_xmllint(self, dtd_check=False, schema=None):
    """
    Execute xmllint against self.filename.

    :dtd_check: True, if check against DTD, false otherwise
    :schema: Schema file
    :returns: tuple of (returncode, stdout, stderr)
    """
    command = ['xmllint']
    if dtd_check:
        command.append('--valid')
    command.append('--huge')
    command.append('--noout')
    if self._no_network:
        command.append('--nonet')
    if self._catalogs:
        command.append('--catalogs')
    if schema:
        command.extend(['--schema', schema])
    command.append(self.filename)

    # Point libxml2 at the configured catalog, if any.
    environment = None
    if self._catalog_path is not None:
        environment = {'SGML_CATALOG_FILES': self._catalog_path}

    shell = Shell(command, env=environment)
    return (shell.returncode, shell.stdout, shell.stderr)
def scrape_file(self):
    """
    Scrape file.

    Verifies the SPSS Portable magic bytes and then checks
    well-formedness by attempting a pspp-convert conversion.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    # Check file header for the SPSS Portable magic bytes.
    with open(self.filename, 'rb') as infile:
        header = infile.readline()
    if SPSS_PORTABLE_HEADER not in header:
        self.errors("File is not SPSS Portable format.")

    # pspp-convert writes converted.por only when the original file is
    # well-formed, so a produced output file proves well-formedness.
    workdir = tempfile.mkdtemp()
    target = os.path.join(workdir, 'converted.por')
    try:
        process = Shell([PSPP_PATH, self.filename, target])
        self.errors(ensure_str(process.stderr))
        self.messages(ensure_str(process.stdout))
        if os.path.isfile(target):
            self.messages('File conversion was succesful.')
        else:
            self.errors('File conversion failed.')
    finally:
        # Always remove the scratch directory, even on failure.
        shutil.rmtree(workdir)
    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """
    Run JHove command and store XML output to self.report.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    command = ['jhove', '-h', 'XML', '-m', self._jhove_module,
               self.filename]
    self._shell = Shell(command)

    returncode = self._shell.returncode
    if returncode != 0:
        self.errors("JHove returned error: %s\n%s" % (
            returncode, self._shell.stderr))

    # Parse the XML report produced with '-h XML'.
    self._report = lxml.etree.fromstring(self._shell.stdout)
    status = self.report_field("status")
    self.messages(status)

    if 'Well-Formed and valid' not in status:
        self.errors("Validator returned error: %s\n%s" % (
            ensure_str(self._shell.stdout),
            ensure_str(self._shell.stderr)))
    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """
    Scrape file.

    :raises: VeraPDFError
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    shell = Shell([VERAPDF_PATH, self.filename])
    if shell.returncode != 0:
        raise VeraPDFError(ensure_str(shell.stderr))
    self.messages(ensure_str(shell.stdout))

    try:
        report = ET.fromstring(shell.stdout)
        summary = report.xpath('//batchSummary')[0]
        if summary.get('failedToParse') != '0':
            self.errors(ensure_str(shell.stdout))
        else:
            validation = report.xpath('//validationReport')[0]
            if validation.get('isCompliant') == 'false':
                self.errors(ensure_str(shell.stdout))
            # Extract e.g. 'A-1b' from the profile name, such as
            # "PDF/A-1B validation profile".
            profile = validation.get('profileName')
            self.version = 'A' + profile.split("PDF/A")[1].split(
                " validation profile")[0].lower()
    except ET.XMLSyntaxError:
        # veraPDF produced unparseable output; report stderr instead.
        self.errors(ensure_str(shell.stderr))
    finally:
        self._check_supported()
        self._collect_elements()
def scrape_file(self):
    """
    Scrape ARC file by converting to WARC.

    This is done using Warctools' arc2warc converter.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    size = os.path.getsize(self.filename)
    if size == 0:
        self.errors('Empty file.')

    with tempfile.NamedTemporaryFile(prefix="scraper-warctools.") \
            as warcfile:
        process = Shell(command=['arc2warc', self.filename],
                        output_file=warcfile)
        if process.returncode != 0:
            self.errors("Failed: returncode %s" % process.returncode)
            # Sanitize stderr for reporting: decode while replacing
            # non-utf8 bytes, strip non-printable characters, and
            # re-encode to utf8 before adding to errors.
            decoded = process.stderr.decode('utf8', errors='replace')
            printable = sanitize_string(decoded)
            self.errors(printable.encode('utf-8'))
        elif size > 0:
            self.messages('File was analyzed successfully.')
        self.messages(ensure_str(process.stdout))

    self.mimetype = 'application/x-internet-archive'
    self._check_supported()
    self._collect_elements()
def _compile_phase(self, stylesheet, inputfile, allowed_codes,
                   outputfile=None, outputfilter=False):
    """
    Compile one phase.

    :stylesheet: XSLT file used in the conversion
    :inputfile: Input document filename
    :allowed_codes: Iterable of xsltproc return codes treated as success
    :outputfile: Filename of the resulted document, stdout if None
    :outputfilter: Use outputfilter parameter with value only_messages
    :return: Shell instance
    :raises: SchematronValidatorError on a disallowed return code
    """
    command = ['xsltproc']
    if outputfile:
        command += ['-o', outputfile]
    if outputfilter and not self._verbose:
        command += ['--stringparam', 'outputfilter', 'only_messages']
    command += [os.path.join(self._schematron_dirname, stylesheet),
                inputfile]

    process = Shell(command)
    if process.returncode not in allowed_codes:
        raise SchematronValidatorError(
            "Error %s\nstdout:\n%s\nstderr:\n%s" % (
                process.returncode,
                ensure_str(process.stdout),
                ensure_str(process.stderr)))
    return process
def scrape_file(self):
    """Scrape file using vnu.jar."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    command = ['java', '-jar', VNU_PATH, '--verbose', self.filename]
    validator = Shell(command)
    # vnu.jar writes findings to stderr and routine output to stdout.
    self.errors(ensure_str(validator.stderr))
    self.messages(ensure_str(validator.stdout))
    self._check_supported()
    self._collect_elements()
def _file_mimetype(self):
    """
    Detect mimetype with the soft option that excludes libmagick.

    :returns: file mimetype as a string
    """
    shell = Shell(
        [FILECMD_PATH, '-be', 'soft', '--mime-type', self.filename],
        env=ENV)
    # Decode stderr before reporting so self.errors() receives text,
    # consistent with the other scrapers (which call ensure_str first).
    self.errors(ensure_str(shell.stderr))
    mimetype = ensure_str(shell.stdout).strip()
    return mimetype
def scrape_file(self):
    """Scrape file with the pngcheck tool."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    checker = Shell(['pngcheck', self.filename])
    if checker.returncode != 0:
        self.errors("Failed: returncode %s" % checker.returncode)
    # Record tool output regardless of the outcome.
    self.errors(ensure_str(checker.stderr))
    self.messages(ensure_str(checker.stdout))
    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """Scrape A/V files."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    # Decode to a null muxer; '-v error' keeps stderr to real errors.
    decoder = Shell(['ffmpeg', '-v', 'error', '-i', self.filename,
                     '-f', 'null', '-'])
    if decoder.returncode == 0:
        self.messages('The file was analyzed successfully.')
    self.errors(ensure_str(decoder.stderr))
    self.messages(ensure_str(decoder.stdout))
    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """Scrape WARC file.

    Validates the file with the external ``warcvalid`` tool, then reads
    the first line of the (possibly gzip-compressed) archive to pick up
    the WARC version number from the ``WARC/x.y`` record header.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    size = os.path.getsize(self.filename)
    if size == 0:
        self.errors('Empty file.')
    shell = Shell(['warcvalid', self.filename])
    if shell.returncode != 0:
        self.errors("Failed: returncode %s" % shell.returncode)
        # Filter some trash printed by warcvalid.
        filtered_errors = \
            b"\n".join([line for line in shell.stderr.split(b'\n')
                        if b'ignored line' not in line])
        self.errors(filtered_errors)
    self.messages(ensure_str(shell.stdout))
    warc_fd = gzip.open(self.filename)
    try:
        # First assume archive is compressed
        line = warc_fd.readline()
    except IOError:
        # Not compressed archive; fall back to reading the raw file.
        # NOTE(review): warc_fd is deliberately rebound here — the 'with'
        # open replaces the gzip handle after it has been closed.
        warc_fd.close()
        with open(self.filename, 'rb') as warc_fd:
            line = warc_fd.readline()
    except Exception as exception:  # pylint: disable=broad-except
        # Compressed but corrupted gzip file: record the error and
        # finish early — version detection is impossible.
        self.errors(str(exception))
        self._check_supported()
        self._collect_elements()
        return

    self.mimetype = 'application/warc'
    # A header such as b'WARC/1.0 ...' yields version '1.0'.
    if len(line.split(b"WARC/", 1)) > 1:
        self.version = ensure_str(
            line.split(b"WARC/", 1)[1].split(b" ")[0].strip())
    if size > 0:
        self.messages('File was analyzed successfully.')
    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """Scrape file with Ghostscript."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    gs_shell = Shell(
        ['gs', '-o', '/dev/null', '-sDEVICE=nullpage', self.filename])

    # Ghostscript exits with 0 even when it managed to repair errors,
    # but in that case stderr is non-empty. A non-empty stderr is
    # therefore treated as a well-formedness failure before the return
    # code is consulted.
    if gs_shell.stderr:
        self.errors(gs_shell.stderr.decode('iso-8859-1').encode('utf8'))
    elif gs_shell.returncode != 0:
        self.errors("Ghostscript returned return code: %s"
                    % gs_shell.returncode)
    self.messages(gs_shell.stdout.decode('iso-8859-1').encode('utf8'))
    self._check_supported()
    self._collect_elements()