class IdentifyFileFormat(DBTask):
    """Task that identifies the format of a file using the fido library."""

    queue = 'file_operation'

    def handle_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Store the format name taken from the last reported match.

        Installed by ``run`` as the ``handle_matches`` callback of the
        Fido instance.

        Args:
            fullname: Name of the file; only used in the error message
            matches: Sequence of (format element, signature name) pairs
            delta_t: Not used by this handler
            matchtype: Not used by this handler

        Raises:
            ValueError: If ``matches`` is empty
        """
        if len(matches) == 0:
            raise ValueError("No matches for %s" % fullname)

        # Only the final entry of the match list is considered.
        format_element, _signature_name = matches[-1]
        name_node = format_element.find('name')
        self.lastFmt = name_node.text

    def run(self, filename=None):
        """
        Identifies the format of the file using the fido library

        Args:
            filename: The filename to identify

        Returns:
            The format of the file
        """
        identifier = Fido()
        # Route fido's results through this task so the format gets recorded.
        identifier.handle_matches = self.handle_matches
        self.fid = identifier

        identifier.identify_file(filename)
        self.set_progress(100, total=100)

        return self.lastFmt

    def undo(self, filename=None):
        """Do nothing; this method has no effect."""
        return None

    def event_outcome_success(self, filename=None, block_size=65536,
                              algorithm='SHA-256'):
        """
        Build the message describing a successful identification.

        Args:
            filename: Name of the file that was identified
            block_size: Not used by this method
            algorithm: Not used by this method

        Returns:
            The outcome message containing the filename
        """
        message_template = "Identified format of %s"
        return message_template % filename
def __init__(self, filename):
    """
    Set up the reader for one file.

    Fido is written as an old-style python class that does not inherit
    from object, so its initializer is called directly because super()
    is not available.

    :filename: File path
    """
    # File path
    self.filename = filename

    # Identification results; all start out as None.
    self.puid = None      # Identified pronom code
    self.mimetype = None  # Identified mime type
    self.version = None   # Identified file format version

    signature_files = ["formats-v94.xml", "format_extensions.xml"]
    Fido.__init__(self, quiet=True, format_files=signature_files)
def test_fido_cache_halting_file(fido_cache_halting_file):
    """Check that identifying the file with a raw Fido object and with
    FidoDetector takes roughly the same amount of time."""
    plain_fido = Fido(
        quiet=True,
        format_files=["formats-v95.xml", "format_extensions.xml"])

    # Time the raw Fido identification.
    started = time.time()
    plain_fido.identify_file(fido_cache_halting_file)
    plain_elapsed = time.time() - started

    # Time construction plus detection through FidoDetector.
    started = time.time()
    detector = FidoDetector(fido_cache_halting_file)
    detector.detect()
    detector_elapsed = time.time() - started

    # 2 second difference is acceptable with the given test file.
    difference = abs(plain_elapsed - detector_elapsed)
    assert difference < 2
def load_fido_xml(self, file):
    """Load the format definitions, reusing cached data when enabled.

    Overrides the default load_fido_xml. When
    ``_FidoCachedFormats._use_cached`` is set, the cached formats and
    both cached puid maps are assigned to this instance and the file is
    not loaded; otherwise the call is passed on to
    ``Fido.load_fido_xml``.

    :param file: File that will be loaded.
    :returns: ``self.formats``
    """
    if not _FidoCachedFormats._use_cached:
        # No cache available: fall back to the regular loader.
        Fido.load_fido_xml(self, file=file)
        return self.formats

    cache = _FidoCachedFormats
    self.formats = cache._cached_formats
    self.puid_format_map = cache._cached_puid_format_map
    self.puid_has_priority_over_map = cache._cached_puid_has_priority_over_map
    return self.formats
def run(self, filename=None):
    """
    Identifies the format of the file using the fido library

    Args:
        filename: The filename to identify

    Returns:
        The format of the file, as left in ``self.lastFmt``
    """
    identifier = Fido()
    # Have fido report its matches to this object's handler.
    identifier.handle_matches = self.handle_matches
    self.fid = identifier

    identifier.identify_file(filename)
    self.set_progress(100, total=100)

    return self.lastFmt
class FormatIdentification():
    """
    File Format Identification
    """

    def __init__(self):
        self.fid = Fido()
        # Fido reports its results through this callback.
        self.fid.handle_matches = self.print_matches
        self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is
        handed over.

        Returns the PUID recorded by ``print_matches`` for ``entry``, or
        None when no match was reported for it.
        """
        # Reset first so a file without matches does not inherit the
        # result of the previously identified file.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback installed on the Fido instance: store the PUID of the
        reported matches. When several matches are reported, the last
        one wins.
        """
        for (f, _sig) in matches:
            self.lastFmt = self.fid.get_puid(f)
class FormatIdentification():
    """
    File Format Identification
    """

    def __init__(self):
        self.fid = Fido()
        # Fido reports its results through this callback.
        self.fid.handle_matches = self.print_matches
        self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is
        handed over.

        Returns the PUID recorded by ``print_matches`` for ``entry``, or
        None when no match was reported for it.
        """
        # Reset first so a file without matches does not inherit the
        # result of the previously identified file.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def get_mime_for_puid(self, puid):
        """
        Get mime type for a given puid

        The puid is looked up in the Fido instance's ``puid_format_map``;
        a puid missing from that map raises KeyError.

        @type puid: string
        @param puid: PRONOM Persistent Unique Identifier
        @rtype: string
        @return: mime type string (default: application/octet-stream)
        """
        mime_tag = "mime"
        fmtres = self.fid.puid_format_map[puid]
        childs = [child for child in fmtres if child.tag.endswith(mime_tag)]
        # Use the mime element only when it is unambiguous and has text;
        # an element without text previously raised AttributeError.
        if len(childs) == 1 and childs[0].text is not None:
            return childs[0].text.strip()
        return "application/octet-stream"

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback installed on the Fido instance: store the PUID of the
        reported matches. When several matches are reported, the last
        one wins.
        """
        for (f, _sig) in matches:
            self.lastFmt = self.fid.get_puid(f)
class FormatIdentification():
    """
    File Format Identification
    """

    def __init__(self):
        # self.fid is only created when the fido module is usable.
        if not fido_disabled:
            self.fid = Fido()
            # Fido reports its results through this callback.
            self.fid.handle_matches = self.print_matches
        self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is
        handed over.

        Returns the PUID recorded by ``print_matches`` for ``entry``, or
        None when no match was reported for it.
        """
        assert not fido_disabled, "Fido module is not available!"
        # Reset first so a file without matches does not inherit the
        # result of the previously identified file.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def get_mime_for_puid(self, puid):
        """
        Get mime type for a given puid. The puid is looked up in the Fido
        instance's ``puid_format_map``; a puid missing from that map
        raises KeyError.

        :param puid: PRONOM Persistent Unique Identifier
        :return: mime type string (default: application/octet-stream)
        """
        assert not fido_disabled, "Fido module is not available!"
        mime_tag = "mime"
        fmtres = self.fid.puid_format_map[puid]
        childs = [child for child in fmtres if child.tag.endswith(mime_tag)]
        # Use the mime element only when it is unambiguous and has text;
        # an element without text previously raised AttributeError.
        if len(childs) == 1 and childs[0].text is not None:
            return childs[0].text.strip()
        return "application/octet-stream"

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback installed on the Fido instance: store the PUID of the
        reported matches. When several matches are reported, the last
        one wins.
        """
        assert not fido_disabled, "Fido module is not available!"
        for (f, _sig) in matches:
            self.lastFmt = self.fid.get_puid(f)
def run(self, filename=None, fid=Fido()):
    """
    Identifies the format of the file using the fido library

    Args:
        filename: The filename to identify
        fid: The Fido instance to use. NOTE: the default instance is
            created once, when the function is defined, so every call
            that omits ``fid`` shares it and rebinds its
            ``handle_matches`` callback.

    Returns:
        A tuple with the format name, version and registry key
    """
    # Have fido report its matches to this object's handler.
    fid.handle_matches = self.handle_matches
    self.fid = fid

    fid.identify_file(filename)

    identified = (
        self.format_name,
        self.format_version,
        self.format_registry_key,
    )
    return identified
def test_fido_format_caching():
    """Check that a cached _FidoReader ends up with the same amount of
    format data as a plain Fido object, and that creating many readers
    stays fast."""
    reference = Fido(
        quiet=True,
        format_files=["formats-v95.xml", "format_extensions.xml"])

    began = time.time()
    for _ in range(200):
        reader = _FidoReader('non_existing_file.xml')
    elapsed = time.time() - began

    # If caching works, the time spent to initialize the _FidoReader should
    # not take long so 30 seconds would be the absolute max.
    assert elapsed < 30

    # We're constraining to len for assert, because these three attributes
    # contains large amount of lxml element-objects and thus would
    # make comparison very slow.
    compared = ("puid_format_map", "formats", "puid_has_priority_over_map")
    for attribute in compared:
        assert len(getattr(reader, attribute)) == len(
            getattr(reference, attribute))
def fido(self):
    """Return the Fido instance held in ``self._fido``.

    The instance is created on first use, with this object's
    ``handle_matches`` as its match callback; later calls return the
    stored instance.
    """
    if self._fido is not None:
        return self._fido

    logger.debug('Initiating fido')
    self._fido = Fido(handle_matches=self.handle_matches)
    logger.info('Initiated fido')
    return self._fido
def __init__(self):
    """Create the Fido instance (unless fido is disabled) and clear the
    last identified format."""
    if not fido_disabled:
        detector = Fido()
        # Fido reports its results through this callback.
        detector.handle_matches = self.print_matches
        self.fid = detector

    self.lastFmt = None
def event_outcome_success(self, filename=None, fid=Fido()):
    """
    Build the message describing a successful identification.

    Args:
        filename: Name of the file that was identified
        fid: Not used by this method

    Returns:
        The outcome message containing the filename
    """
    message_template = "Identified format of %s"
    return message_template % filename
def undo(self, filename=None, fid=Fido()):
    """
    Do nothing; this method has no effect.

    Args:
        filename: Not used by this method
        fid: Not used by this method
    """
    return None
def __init__(self):
    """Create the Fido instance, hook up the match callback and clear
    the last identified format."""
    detector = Fido()
    # Fido reports its results through this callback.
    detector.handle_matches = self.print_matches
    self.fid = detector

    self.lastFmt = None
def fido(self):
    """Return the Fido instance held in ``self._fido``.

    The instance is created on first use, with this object's
    ``handle_matches`` as its match callback; later calls return the
    stored instance.
    """
    if self._fido is not None:
        return self._fido

    self._fido = Fido(handle_matches=self.handle_matches)
    return self._fido
def fido(self):
    """Return the Fido instance held in ``self._fido``.

    The instance is created on first use, with this object's
    ``handle_matches`` as its match callback and ``FORMAT_FILES`` as
    its format files; later calls return the stored instance.
    """
    if self._fido is not None:
        return self._fido

    logger.debug('Initiating fido')
    self._fido = Fido(
        handle_matches=self.handle_matches,
        format_files=FORMAT_FILES,
    )
    logger.info('Initiated fido')
    return self._fido