def scrape_file(self):
    """
    Scrape file by validating it with the veraPDF tool.

    Runs veraPDF on self.filename, records its XML report, and derives
    the PDF/A version from the validation profile name on success.

    :raises: VeraPDFError if veraPDF itself exits with a non-zero code
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    cmd = [VERAPDF_PATH, self.filename]
    shell = Shell(cmd)
    # Non-zero return code means the tool failed to run, not that the
    # file is invalid; that is reported as a scraper error.
    if shell.returncode != 0:
        raise VeraPDFError(ensure_str(shell.stderr))
    self.messages(ensure_str(shell.stdout))
    try:
        # Parse veraPDF's XML report from stdout.
        report = ET.fromstring(shell.stdout)
        # failedToParse == '0' means veraPDF could read the PDF at all.
        if report.xpath('//batchSummary')[0].get('failedToParse') == '0':
            compliant = report.xpath('//validationReport')[0].get(
                'isCompliant')
            # Non-compliant files keep the full report as the error text.
            if compliant == 'false':
                self.errors(ensure_str(shell.stdout))
            # Profile name looks like "PDF/A-1b validation profile";
            # extract e.g. "A-1b" -> version 'a-1b' form ('A' + '-1b').
            profile = \
                report.xpath('//validationReport')[0].get('profileName')
            self.version = 'A' + profile.split("PDF/A")[1].split(
                " validation profile")[0].lower()
        else:
            # veraPDF could not parse the PDF: report output as error.
            self.errors(ensure_str(shell.stdout))
    except ET.XMLSyntaxError:
        # Report was not valid XML; fall back to stderr contents.
        self.errors(ensure_str(shell.stderr))
    finally:
        self._check_supported()
        self._collect_elements()
def scrape_file(self):
    """
    Scrape SPSS Portable file.

    Checks the file header for the SPSS Portable magic bytes, then
    verifies well-formedness by converting the file with pspp-convert:
    if a converted.por file is produced, the original is well-formed.

    Bug fix: user-facing message typo "succesful" -> "successful".
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    # Check file header
    with open(self.filename, 'rb') as input_file:
        first_line = input_file.readline()
    if SPSS_PORTABLE_HEADER not in first_line:
        self.errors("File is not SPSS Portable format.")

    # Try to convert file with pspp-convert. If conversion is successful
    # (converted.por file is produced), the original file is well-formed.
    temp_dir = tempfile.mkdtemp()
    temp_file = os.path.join(temp_dir, 'converted.por')

    try:
        shell = Shell([PSPP_PATH, self.filename, temp_file])
        self.errors(ensure_str(shell.stderr))
        self.messages(ensure_str(shell.stdout))
        if os.path.isfile(temp_file):
            self.messages('File conversion was successful.')
        else:
            self.errors('File conversion failed.')
    finally:
        # Always clean up the temporary directory, even on failure.
        shutil.rmtree(temp_dir)

    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """
    Run JHove command and store XML output to self.report.

    Bug fix: decode stderr with ensure_str before interpolating into
    the error message; raw bytes would render as "b'...'" on Python 3.
    This also matches how stdout/stderr are handled below.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    exec_cmd = ['jhove', '-h', 'XML', '-m',
                self._jhove_module, self.filename]
    self._shell = Shell(exec_cmd)

    if self._shell.returncode != 0:
        self.errors("JHove returned error: %s\n%s" % (
            self._shell.returncode, ensure_str(self._shell.stderr)))

    # Parse JHove's XML report and extract the validation status.
    self._report = lxml.etree.fromstring(self._shell.stdout)

    status = self.report_field("status")
    self.messages(status)
    if 'Well-Formed and valid' not in status:
        self.errors("Validator returned error: %s\n%s" % (ensure_str(
            self._shell.stdout), ensure_str(self._shell.stderr)))
    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """Do the Schematron check."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    if self._schematron_file is None:
        self.errors('Schematron file missing from parameters.')
        self._collect_elements()
        return

    # Compile the schematron into an XSLT stylesheet and run it
    # against the input file. Exit codes 0 and 6 are acceptable.
    xslt_filename = self._compile_schematron()
    validation_shell = self._compile_phase(
        stylesheet=xslt_filename,
        inputfile=self.filename,
        allowed_codes=[0, 6])

    self._returncode = validation_shell.returncode
    self.errors(ensure_str(validation_shell.stderr))

    # In quiet mode a successful run gets de-duplicated output;
    # otherwise the raw stdout is reported as-is.
    if self._verbose or validation_shell.returncode != 0:
        output = validation_shell.stdout
    else:
        output = self._filter_duplicate_elements(validation_shell.stdout)
    self.messages(ensure_str(output))

    self._check_supported()
    self._collect_elements()
def _compile_phase(self, stylesheet, inputfile, allowed_codes,
                   outputfile=None, outputfilter=False):
    """
    Compile one phase.

    :stylesheet: XSLT file to used in the conversion
    :inputfile: Input document filename
    :outputfile: Filename of the resulted document, stdout if None
    :outputfilter: Use outputfilter parameter with value only_messages
    :return: Shell instance
    """
    # Build the xsltproc command incrementally.
    cmd = ['xsltproc']
    if outputfile:
        cmd.extend(['-o', outputfile])
    if outputfilter and not self._verbose:
        cmd.extend(['--stringparam', 'outputfilter', 'only_messages'])
    cmd.append(os.path.join(self._schematron_dirname, stylesheet))
    cmd.append(inputfile)

    shell = Shell(cmd)
    # Any return code outside the caller's whitelist is fatal.
    if shell.returncode not in allowed_codes:
        raise SchematronValidatorError(
            "Error %s\nstdout:\n%s\nstderr:\n%s" % (
                shell.returncode,
                ensure_str(shell.stdout),
                ensure_str(shell.stderr)))
    return shell
def scrape_file(self):
    """Scrape file using vnu.jar."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    # Run the validator; vnu.jar writes findings to stderr.
    vnu_shell = Shell(
        ['java', '-jar', VNU_PATH, '--verbose', self.filename])
    self.errors(ensure_str(vnu_shell.stderr))
    self.messages(ensure_str(vnu_shell.stdout))

    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """Scrape file."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    check_shell = Shell(['pngcheck', self.filename])
    # A non-zero return code means pngcheck rejected the file.
    if check_shell.returncode != 0:
        self.errors("Failed: returncode %s" % check_shell.returncode)
        self.errors(ensure_str(check_shell.stderr))
    self.messages(ensure_str(check_shell.stdout))

    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """Scrape A/V files."""
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return

    # Decode to null output; '-v error' keeps stderr to real errors only.
    ffmpeg_shell = Shell(
        ['ffmpeg', '-v', 'error', '-i', self.filename, '-f', 'null', '-'])
    if ffmpeg_shell.returncode == 0:
        self.messages('The file was analyzed successfully.')
    self.errors(ensure_str(ffmpeg_shell.stderr))
    self.messages(ensure_str(ffmpeg_shell.stdout))

    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """
    Scrape ARC file by converting to WARC.

    This is done using Warctools' arc2warc converter. A successful
    conversion into a temporary file means the ARC is well-formed.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    size = os.path.getsize(self.filename)
    if size == 0:
        self.errors('Empty file.')
    # Convert into a throwaway temporary file; only the exit status
    # and tool output matter, not the converted content.
    with tempfile.NamedTemporaryFile(prefix="scraper-warctools.") \
            as warcfile:
        shell = Shell(command=['arc2warc', self.filename],
                      output_file=warcfile)
        if shell.returncode != 0:
            self.errors("Failed: returncode %s" % shell.returncode)
            # replace non-utf8 characters
            utf8string = shell.stderr.decode('utf8', errors='replace')
            # remove non-printable characters
            sanitized_string = sanitize_string(utf8string)
            # encode string to utf8 before adding to errors
            self.errors(sanitized_string.encode('utf-8'))
        elif size > 0:
            self.messages('File was analyzed successfully.')
        self.messages(ensure_str(shell.stdout))
    self.mimetype = 'application/x-internet-archive'
    self._check_supported()
    self._collect_elements()
def scrape_file(self):
    """
    Scrape WARC file.

    Validates with the external warcvalid tool, then reads the first
    record line (handling both gzip-compressed and plain archives) to
    extract the WARC version.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    size = os.path.getsize(self.filename)
    if size == 0:
        self.errors('Empty file.')
    shell = Shell(['warcvalid', self.filename])

    if shell.returncode != 0:
        self.errors("Failed: returncode %s" % shell.returncode)
        # Filter some trash printed by warcvalid.
        filtered_errors = \
            b"\n".join([line for line in shell.stderr.split(b'\n')
                        if b'ignored line' not in line])
        self.errors(filtered_errors)

    self.messages(ensure_str(shell.stdout))

    warc_fd = gzip.open(self.filename)
    try:
        # First assume archive is compressed
        line = warc_fd.readline()
    except IOError:
        # Not compressed archive
        warc_fd.close()
        with open(self.filename, 'rb') as warc_fd:
            line = warc_fd.readline()
    except Exception as exception:  # pylint: disable=broad-except
        # Compressed but corrupted gzip file
        self.errors(str(exception))
        self._check_supported()
        self._collect_elements()
        return

    self.mimetype = 'application/warc'
    # First line of a WARC record is e.g. b"WARC/1.0"; take the part
    # after "WARC/" up to the first space as the version.
    if len(line.split(b"WARC/", 1)) > 1:
        self.version = ensure_str(
            line.split(b"WARC/", 1)[1].split(b" ")[0].strip())
    if size > 0:
        self.messages('File was analyzed successfully.')
    self._check_supported()
    self._collect_elements()
def errors(self, error=None):
    """
    Return error messages, optionally adding a new one first.

    Bug fix: append the decoded string ``err_msg`` instead of the raw
    ``error`` argument. Previously the decoded value was computed and
    then discarded, so bytes objects could end up in ``self._errors``
    and break the 'ERROR: ' concatenation on Python 3.

    :error: New error to add to the errors
    :returns: Collected errors concatenated with an 'ERROR: ' prefix
    """
    err_msg = ensure_str(error) if error is not None else None
    # Ignore None and empty strings so the collected list stays clean.
    if err_msg is not None and err_msg != "":
        self._errors.append(err_msg)
    return concat(self._errors, 'ERROR: ')
def _file_mimetype(self):
    """
    Detect mimetype with the soft option that excludes libmagick.

    Consistency fix: decode stderr with ensure_str before reporting,
    as every other scraper in this file does for Shell output.

    :returns: file mimetype
    """
    shell = Shell(
        [FILECMD_PATH, '-be', 'soft', '--mime-type', self.filename],
        env=ENV)
    self.errors(ensure_str(shell.stderr))
    # The file command prints the mimetype on stdout.
    mimetype = ensure_str(shell.stdout).strip()

    return mimetype
def scrape_file(self):
    """
    Scrape data from file with ffmpeg.probe.

    Bug fix: catch ``ffmpeg.Error`` (the module's exception class)
    instead of ``self._ffmpeg.Error``. When ``ffmpeg.probe`` raises,
    ``self._ffmpeg`` has not been assigned (or holds the plain dict
    returned by a previous probe), so evaluating ``self._ffmpeg.Error``
    in the except clause raised AttributeError and masked the real
    error.
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    try:
        self._ffmpeg = ffmpeg.probe(self.filename)
        # Give the format entry index 0 and shift stream indexes by one
        # so format and streams share a single index space.
        for stream in [self._ffmpeg['format']] + self._ffmpeg['streams']:
            if 'index' not in stream:
                stream['index'] = 0
            else:
                stream['index'] = stream['index'] + 1
        self.set_tool_stream(0)
    except ffmpeg.Error as err:
        self.errors('Error in analyzing file.')
        self.errors(ensure_str(err.stderr))
    else:
        self.messages('The file was analyzed successfully.')
    finally:
        self._check_supported()
        self._collect_elements()
def errors(self, error=None):
    """
    Remove the warning which we do not need to see from self.stderr.

    See KDKPAS-1190.

    Bug fix: the appended catalog hint was built from two adjacent
    string literals with no separating space, producing
    "missing" + "from" = "missingfrom"; also fixes the "propably" typo.

    :error: Error messages
    :returns: Filtered error messages
    """
    if error:
        filtered_errors = []
        for line in error.splitlines():
            line = ensure_str(line)
            # Drop the duplicate-namespace-import warning entirely.
            if 'this namespace was already imported' in line:
                continue
            filtered_errors.append(line)
            # Network loads are disabled, so this error usually means
            # the schema is absent from the local XML catalog.
            if 'I/O error : Attempt to load network entity' in line:
                filtered_errors.append(
                    'ERROR: Schema definition probably missing '
                    'from XML catalog')
        error = "\n".join(filtered_errors)

    return super(Xmllint, self).errors(error)
def scrape_file(self):
    """
    Check XML file with Xmllint and return a tuple of results.

    Strategy for XML file check is
        1) Try to check syntax by opening file.
        2) If there's DTD specified in file check against that.
        3) If there's no DTD and we have external XSD check against
           that.
        4) If there's no external XSD read schemas used in file and do
           check against them with schema catalog.

    Bug fix: the parse-check file handle was opened manually and only
    closed on the success path, leaking the descriptor whenever
    ``etree.parse`` raised; a ``with`` block now guarantees closing.
    IOError from ``open`` is still caught by the same handler.

    :returns: Tuple (status, report, errors) where
        status -- 0 is success, anything else failure
        report -- generated report
        errors -- errors if encountered, else None

    .. seealso:: https://wiki.csc.fi/wiki/KDK/XMLTiedostomuotojenSkeemat
    """
    if not self._check_wellformed and self._only_wellformed:
        self.messages('Skipping scraper: Well-formed check not used.')
        self._collect_elements()
        return
    # Try to check syntax by opening file in XML parser
    try:
        with open(self.filename, 'rb') as file_:
            parser = etree.XMLParser(dtd_validation=False,
                                     no_network=True)
            tree = etree.parse(file_, parser=parser)
            self.version = tree.docinfo.xml_version
    except etree.XMLSyntaxError as exception:
        self.errors("Failed: document is not well-formed.")
        self.errors(str(exception))
        self._collect_elements()
        return
    except IOError as exception:
        self.errors("Failed: missing file.")
        self.errors(str(exception))
        self._collect_elements()
        return

    # Try check against DTD
    if tree.docinfo.doctype:
        (exitcode, stdout, stderr) = self.exec_xmllint(dtd_check=True)

    # Try check against XSD
    else:
        if not self._schema:
            self._schema = self.construct_xsd(tree)
            if not self._schema:
                # No given schema and didn't find included schemas but
                # XML was well formed.
                self.messages("Success: Document is "
                              "well-formed but does not contain schema.")
                self._collect_elements()
                return
        (exitcode, stdout, stderr) = self.exec_xmllint(
            schema=self._schema)

    if exitcode == 0:
        self.messages(
            "%s Success\n%s" % (self.filename, ensure_str(stdout)))
    else:
        self.errors(ensure_str(stderr))

    # Clean up constructed schemas
    if self._has_constructed_schema:
        os.remove(self._schema)

    self._check_supported()
    self._collect_elements()