def scrape_file(self): """Check if file exists.""" if not self.filename: self._errors.append("No filename given.") elif os.path.isfile(self.filename): self._messages.append("File {} was found.".format( decode_path(self.filename))) else: self._errors.append("File {} does not exist.".format( decode_path(self.filename))) self.streams.append(DummyMeta())
def construct_xsd(self, document_tree):
    """
    Construct one schema file for the given document tree.

    :returns: Path to the constructed XSD schema, or an empty list if no
              schema locations were found in the document
    """
    xsd_exists = False

    parser = etree.XMLParser(dtd_validation=False, no_network=True)
    schema_tree = etree.XML(SCHEMA_TEMPLATE, parser)

    schema_locations = set(
        document_tree.xpath("//*/@xsi:schemaLocation",
                            namespaces={"xsi": XSI}))
    for schema_location in schema_locations:
        xsd_exists = True

        namespaces_locations = schema_location.strip().split()
        # Import all found namespace/schema location pairs
        for namespace, location in zip(*[iter(namespaces_locations)] * 2):
            xs_import = etree.Element(XS + "import")
            xs_import.attrib["namespace"] = namespace
            xs_import.attrib["schemaLocation"] = location
            schema_tree.append(xs_import)

    schema_locations = set(
        document_tree.xpath("//*/@xsi:noNamespaceSchemaLocation",
                            namespaces={"xsi": XSI}))
    for schema_location in schema_locations:
        xsd_exists = True

        # Check if the XSD file is included in the SIP
        local_schema_location = os.path.join(
            os.path.dirname(self.filename),
            encode_path(schema_location))
        if os.path.isfile(local_schema_location):
            schema_location = local_schema_location

        xs_import = etree.Element(XS + "import")
        xs_import.attrib["schemaLocation"] = decode_path(schema_location)
        schema_tree.append(xs_import)

    if xsd_exists:
        # Construct the schema
        _, schema = tempfile.mkstemp(
            prefix="file-scraper-", suffix=".tmp")
        elem_tree = etree.ElementTree(schema_tree)
        elem_tree.write(schema)
        self._has_constructed_schema = True

        return schema

    return []
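# Standalone sketch (not part of the scraper class): roughly how the wrapper
# schema above is assembled with lxml. SCHEMA_TEMPLATE is assumed to be a
# minimal <xs:schema> document; the template actually used by the scraper is
# not shown here, and the namespace/location values below are illustrative.
# Each schemaLocation found in the XML document becomes one xs:import.
from lxml import etree

XS = "{http://www.w3.org/2001/XMLSchema}"
SCHEMA_TEMPLATE = b'<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"/>'

schema_tree = etree.XML(SCHEMA_TEMPLATE)
xs_import = etree.Element(XS + "import")
xs_import.attrib["namespace"] = "http://www.loc.gov/METS/"
xs_import.attrib["schemaLocation"] = \
    "http://www.loc.gov/standards/mets/mets.xsd"
schema_tree.append(xs_import)

# The resulting wrapper schema can be written to a temporary file and handed
# to xmllint (or etree.XMLSchema) for validation.
print(etree.tostring(schema_tree, pretty_print=True).decode())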
def identify(self):
    """Identify file format using the PRONOM registry."""
    versions = get_local_pronom_versions()

    defaults["xml_pronomSignature"] = versions.pronom_signature
    defaults["containersignature_file"] = \
        versions.pronom_container_signature
    defaults["xml_fidoExtensionSignature"] = \
        versions.fido_extension_signature
    defaults["format_files"] = [defaults["xml_pronomSignature"]]
    defaults["format_files"].append(
        defaults["xml_fidoExtensionSignature"])

    self.identify_file(
        # Python's zipfile module used internally by FIDO doesn't support
        # paths that are provided as byte strings
        filename=decode_path(self.filename),
        extension=False)
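# Illustrative sketch (not file-scraper code): the reason for decode_path()
# above is that some libraries only accept text paths. On Python 3 the same
# bytes-to-text conversion can be done with the standard library's
# os.fsdecode(), which uses the filesystem encoding. The path below is a
# hypothetical example.
import os

byte_path = b"/tmp/example.xml"      # path given as bytes
text_path = os.fsdecode(byte_path)   # -> "/tmp/example.xml" as str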
def scrape_file(self): """Populate streams with supported metadata objects.""" if not self._check_wellformed and self._only_wellformed: self._messages.append("Skipping scraper: Well-formed check not" "used.") return if "mimetype_guess" not in self._params: raise AttributeError("MediainfoScraper was not given a parameter " "dict containing key 'mimetype_guess'.") try: mediainfo = MediaInfo.parse(decode_path(self.filename)) except Exception as e: # pylint: disable=invalid-name, broad-except self._errors.append("Error in analyzing file.") self._errors.append(six.text_type(e)) self._check_supported() return if not self._tracks_ok(mediainfo): return else: self._messages.append("The file was analyzed successfully.") mime_guess = self._choose_mimetype_guess() for index in range(len(mediainfo.tracks)): for md_class in self._supported_metadata: if md_class.is_supported(mime_guess): md_object = md_class(mediainfo.tracks, index, mime_guess, self._given_mimetype, self._given_version) if not md_object.hascontainer() and index == 0: continue self.streams.append(md_object) # Files scraped with SimpleMediainfoMeta will have (:unav) MIME type, # but for other scrapes the tests need to be performed without allowing # unavs MIME types. if self.streams and isinstance(self.streams[0], SimpleMediainfoMeta): self._check_supported(allow_unav_mime=True, allow_unav_version=True) return self._check_supported(allow_unav_version=True, allow_unap_version=True)
def scrape_file(self): """Populate streams with supported metadata objects.""" try: mediainfo = MediaInfo.parse(decode_path(self.filename)) except Exception as e: # pylint: disable=invalid-name, broad-except self._errors.append("Error in analyzing file.") self._errors.append(six.text_type(e)) self._check_supported() return if not self._tracks_ok(mediainfo): return self._messages.append("The file was analyzed successfully.") for index, track in enumerate(mediainfo.tracks): # Use predefined mimetype/version for first stream, and # detected mimetype for other streams if len(self.streams) == 0: mimetype = self._predefined_mimetype version = self._predefined_version # WAV is a special container format. For WAV files, # no distinction between container and soundtrack needs to # be made, as both are treated as one in the DPS. elif (self._predefined_mimetype == 'audio/x-wav' or file_scraper.mediainfo.track_mimetype( mediainfo.tracks[0]) == 'audio/x-wav'): mimetype = 'audio/x-wav' version = None else: mimetype = file_scraper.mediainfo.track_mimetype(track) version = None # Add track as stream self.streams += list( self.iterate_models(mimetype=mimetype, version=version, tracks=mediainfo.tracks, index=index)) self._check_supported(allow_unav_version=True, allow_unap_version=True)
def scrape_file(self): """ Check XML file with Xmllint and return a tuple of results. Strategy for XML file check is 1) Try to check syntax by opening file. 2) If there's DTD specified in file check against that. 3) If there's no DTD and we have external XSD check againtst that. 4) If there's no external XSD read schemas used in file and do check againts them with schema catalog. :returns: Tuple (status, report, errors) where status -- 0 is success, anything else failure report -- generated report errors -- errors if encountered, else None .. seealso:: https://wiki.csc.fi/wiki/KDK/XMLTiedostomuotojenSkeemat """ if not self._check_wellformed and self._only_wellformed: self._messages.append("Skipping scraper: Well-formed check not " "used.") return # Try to check syntax by opening file in XML parser try: file_ = io_open(self.filename, "rb") parser = etree.XMLParser(dtd_validation=False, no_network=True) tree = etree.parse(file_, parser=parser) file_.close() except etree.XMLSyntaxError as exception: self._errors.append("Failed: document is not well-formed.") self._errors.append(six.text_type(exception)) return except IOError as exception: self._errors.append("Failed: missing file.") self._errors.append(six.text_type(exception)) return # Try check against DTD if tree.docinfo.doctype: (exitcode, stdout, stderr) = self.exec_xmllint(dtd_check=True) # Try check againts XSD else: if not self._schema: self._schema = self.construct_xsd(tree) if not self._schema: # No given schema and didn"t find included schemas but XML # was well formed. self._messages.append("Success: Document is well-formed " "but does not contain schema.") self._add_streams(tree) self._check_supported() return (exitcode, stdout, stderr) = self.exec_xmllint(schema=self._schema) if exitcode == 0: self._messages.append("%s Success\n%s" % (decode_path(self.filename), stdout)) else: self._errors += stderr.splitlines() return # Clean up constructed schemas if self._has_constructed_schema: os.remove(self._schema) self._add_streams(tree) self._check_supported()