def test_custom_list(self, mock_mimetypes_init, isfile):
    """A stored mime.types definition path is passed on as ``files=[...]``.

    ``mock_mimetypes_init`` and ``isfile`` are test doubles injected from
    outside this block (presumably by patch decorators -- TODO confirm).
    """
    definition = Path.objects.create(
        entity="path_mimetypes_definitionfile",
        value='path/to/mime.types',
    )

    identifier = FormatIdentifier(allow_unknown_file_types=True)
    identifier._init_mimetypes()

    expected_files = [definition.value]
    mock_mimetypes_init.assert_called_once_with(files=expected_files)
def test_handle_matches_when_no_matches(self, mock_mimetypes_init):
    """With no matches and unknown types allowed, the format is 'Unknown File Format'."""
    identifier = FormatIdentifier(allow_unknown_file_types=True)
    no_matches = []

    identifier.handle_matches('fullname', no_matches, mock.ANY)

    # Only the name is populated; version and registry key stay unset.
    self.assertEqual(identifier.format_name, 'Unknown File Format')
    self.assertEqual(identifier.format_version, None)
    self.assertEqual(identifier.format_registry_key, None)
def test_handle_matches_when_no_match_on_name_or_version_or_reg_key(
        self, mock_mimetypes_init):
    """Matches carrying no name, version or registry key leave all three fields ``None``."""
    identifier = FormatIdentifier(allow_unknown_file_types=True)
    unrelated_matches = [('a', 'b'), ('c', 'd')]

    identifier.handle_matches('fullname', unrelated_matches, mock.ANY)

    self.assertEqual(identifier.format_name, None)
    self.assertEqual(identifier.format_version, None)
    self.assertEqual(identifier.format_registry_key, None)
def setUp(self):
    """Write a small ``.txt`` fixture to disk and record its identified format."""
    payload = b'test file'
    self.content = payload

    # delete=False keeps the file on disk after close(); removal is handed
    # to the test's cleanup hook instead.
    handle = tempfile.NamedTemporaryFile(suffix='.txt', delete=False)
    self.test_file = handle
    self.addCleanup(os.remove, handle.name)

    handle.write(payload)
    handle.seek(0)
    handle.close()

    self.expected = FormatIdentifier().identify_file_format(handle.name)
class FormatValidator(BaseValidator):
    """
    Validates the format of a file against the given ``context``.
    """

    def __init__(self, *args, **kwargs):
        """Set up the validator and its ``FormatIdentifier``.

        The ``allow_unknown_file_types`` option (default ``False``) is read
        from ``self.options`` and forwarded to the identifier.
        """
        super(FormatValidator, self).__init__(*args, **kwargs)
        allow_unknown = self.options.get('allow_unknown_file_types', False)
        self.fid = FormatIdentifier(allow_unknown_file_types=allow_unknown)

    def validate(self, filepath, expected=None):
        """Check the identified format of ``filepath`` against ``expected``.

        Args:
            filepath: Path of the file to identify.
            expected: ``(name, version, registry_key)`` triple. Falsy members
                are not compared; at least one member must be non-``None``.

        Raises:
            ValueError: If ``expected`` is omitted or every member is ``None``.
            ValidationError: If an expected member differs from the
                identified value.

        A ``Validation`` record is created before identification and is
        always saved afterwards with the completion time, outcome and message.
        """
        logger.debug('Validating format of %s', filepath)

        # The signature defaults ``expected`` to None; treat that like an
        # all-None triple so the caller gets the ValueError below instead of
        # a TypeError from tuple unpacking.
        if expected is None:
            expected = (None, None, None)

        name, version, reg_key = expected
        if not any(f is not None for f in (name, version, reg_key)):
            raise ValueError(
                'At least one of name, version and registry key is required')

        val_obj = Validation.objects.create(
            filename=filepath,
            time_started=timezone.now(),
            validator=self.__class__.__name__,
            required=self.required,
            task=self.task,
            information_package=self.ip,
            responsible=self.responsible,
            specification={
                'context': self.context,
                'options': self.options,
            },
        )

        passed = False
        try:
            actual_name, actual_version, actual_reg_key = self.fid.identify_file_format(
                filepath)

            if name and name != actual_name:
                raise ValidationError(
                    "format name for {} is not valid, ({} !={})".format(
                        filepath, name, actual_name))

            if version and version != actual_version:
                raise ValidationError(
                    "format version for {} is not valid, ({} != {})".format(
                        filepath, version, actual_version))

            if reg_key and reg_key != actual_reg_key:
                raise ValidationError(
                    "format registry key for {} is not valid, ({} != {})".format(
                        filepath, reg_key, actual_reg_key))

            passed = True
        except ValidationError:
            # Keep the full traceback on the record, then let the error propagate.
            val_obj.message = traceback.format_exc()
            raise
        else:
            # This validator checks formats; the message previously said
            # "checksum", which was misleading.
            message = 'Successfully validated format of %s' % filepath
            val_obj.message = message
            logger.info(message)
        finally:
            val_obj.time_done = timezone.now()
            val_obj.passed = passed
            val_obj.save(update_fields=['time_done', 'passed', 'message'])
def __init__(self, filepath=None):
    """Create the XML parser and format identifier, optionally parsing ``filepath``.

    ``self.tree`` holds the parsed document, or ``None`` when no path is given.
    """
    self.parser = etree.XMLParser(remove_blank_text=True)
    self.fid = FormatIdentifier(allow_unknown_file_types=False)
    self.tree = (
        None if filepath is None
        else etree.parse(filepath, parser=self.parser)
    )
def index_document(tag_version, filepath):
    """Index the file at ``filepath`` as the document for ``tag_version``.

    File metadata is stored in ``tag_version.custom_fields``. The file content
    is base64-encoded and saved through the ``ingest_attachment`` pipeline
    unless the file's format registry key is listed in
    ``settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT``.

    Returns:
        ``(doc, tag_version)``.

    Raises:
        ElasticsearchException: Logged and re-raised when saving fails.
    """
    excluded_keys = settings.EXCLUDE_FILE_FORMAT_FROM_INDEXING_CONTENT

    identifier = FormatIdentifier()
    format_name, format_version, format_registry_key = identifier.identify_file_format(filepath)
    skip_content = format_registry_key in excluded_keys

    ip = tag_version.tag.information_package
    _, dotted_extension = os.path.splitext(tag_version.name)
    extension = dotted_extension[1:]  # drop the leading dot
    dirname = os.path.dirname(filepath)

    href = normalize_path(os.path.relpath(dirname, ip.object_path))
    if href == '.':
        href = ''

    size, _ = get_tree_size_and_count(filepath)
    modified = timestamp_to_datetime(os.stat(filepath).st_mtime)

    tag_version.custom_fields = {
        'extension': extension,
        'dirname': dirname,
        'href': href,
        'filename': tag_version.name,
        'size': size,
        'modified': modified,
        'formatname': format_name,
        'formatversion': format_version,
        'formatkey': format_registry_key,
    }

    doc = File.from_obj(tag_version)

    try:
        if skip_content:
            logger.debug('Skip to index file content for {}'.format(filepath))
            doc.save()
        else:
            with open(filepath, 'rb') as f:
                raw = f.read()
                doc.data = base64.b64encode(raw).decode("ascii")
                doc.save(pipeline='ingest_attachment')
    except ElasticsearchException:
        logger.exception('Failed to index {}'.format(filepath))
        raise

    return doc, tag_version
def validate_file_format(filename, format_name, format_registry_key, format_version):
    """
    Validates the format of the given file

    Each expected value that is truthy is compared with the value reported
    by ``FormatIdentifier.identify_file_format``; falsy expectations are
    skipped.

    Args:
        filename: Path of the file to identify.
        format_name: Expected format name, or a falsy value to skip.
        format_registry_key: Expected registry key, or a falsy value to skip.
        format_version: Expected format version, or a falsy value to skip.

    Returns:
        The string ``"Success"`` when every requested comparison matches.

    Raises:
        AssertionError: If any requested comparison fails.
    """
    fid = FormatIdentifier()
    actual_format_name, actual_format_version, actual_format_registry_key = fid.identify_file_format(
        filename)

    # Explicit raises instead of ``assert`` so the checks still run under
    # ``python -O``; AssertionError is kept so existing callers are unaffected.
    if format_name and actual_format_name != format_name:
        # All three values must be inside the tuple: previously ``%`` bound
        # only to ``filename`` and a mismatch raised TypeError instead.
        raise AssertionError(
            "format name for %s is not valid, (%s != %s)" % (
                filename, format_name, actual_format_name))

    if format_version and actual_format_version != format_version:
        raise AssertionError("format version for %s is not valid" % filename)

    if format_registry_key and actual_format_registry_key != format_registry_key:
        raise AssertionError("format registry key for %s is not valid" % filename)

    return "Success"
def get_path_response(self, path, request, force_download=False, paginator=None):
    """Return a file response for ``path``, or a listing when it is not a plain file.

    The path is first checked with ``self.validate_path``. If it can be
    opened as a file, a file response with the detected mimetype is
    returned. Otherwise the entries from ``self.list_files(path)`` are
    returned, paginated when ``paginator`` is given.

    Raises:
        exceptions.NotFound: If opening the path fails with ``ENOENT``.
        OSError / IOError: Re-raised for any errno other than the
            "is a directory" cases handled below.
    """
    self.validate_path(path)

    try:
        if not path:
            # An empty path is handled as a directory: raise EISDIR so the
            # handler below falls through to the listing.
            raise OSError(errno.EISDIR, os.strerror(errno.EISDIR), path)

        if os.path.isfile(self.object_path):
            # object_path is a single file: compare the first component of
            # ``path`` (joined onto object_path's directory) with it.
            container_path = os.path.join(
                os.path.dirname(self.object_path), path.split('/', 1)[0])
            container_path = normalize_path(container_path)
            if container_path == self.object_path:
                # Strip the leading component. Raises IndexError when
                # ``path`` contains no '/', which is handled below.
                path = path.split('/', 1)[1]

        fid = FormatIdentifier(allow_unknown_file_types=True)
        content_type = fid.get_mimetype(path)
        return generate_file_response(self.open_file(path, 'rb'), content_type,
                                      force_download=force_download, name=path)
    except (IOError, OSError) as e:
        if e.errno == errno.ENOENT:
            raise exceptions.NotFound

        # Windows raises PermissionDenied (errno.EACCES) when trying to use
        # open() on a directory
        if os.name == 'nt':
            if e.errno not in (errno.EACCES, errno.EISDIR):
                raise
        elif e.errno != errno.EISDIR:
            raise
        # Directory case: fall through to the listing below.
    except IndexError:
        # ``path`` had no '/' after its first component (see the split above).
        if force_download:
            fid = FormatIdentifier(allow_unknown_file_types=True)
            content_type = fid.get_mimetype(path)
            return generate_file_response(self.open_file(
                self.object_path, 'rb'), content_type,
                force_download=force_download, name=path)
        # Without force_download, fall through to the listing below.

    entries = self.list_files(path)
    if paginator is not None:
        paginated = paginator.paginate_queryset(entries, request)
        return paginator.get_paginated_response(paginated)
    return Response(entries)
def test_default_list(self, mock_mimetypes_init):
    """Without a configured definition file the initializer is called with no arguments."""
    identifier = FormatIdentifier(allow_unknown_file_types=True)

    identifier._init_mimetypes()

    mock_mimetypes_init.assert_called_once_with()
def test_gzipped_file(self, mock_mimetypes_init):
    """A ``.tar.gz`` name is reported as ``application/gzip``."""
    identifier = FormatIdentifier(allow_unknown_file_types=True)

    detected = identifier.get_mimetype('foo.tar.gz')

    self.assertEqual(detected, 'application/gzip')
def __init__(self, *args, **kwargs):
    """Initialise the base class, then build the ``FormatIdentifier``.

    The ``allow_unknown_file_types`` option (default ``False``) is read from
    ``self.options`` and forwarded to the identifier.
    """
    super().__init__(*args, **kwargs)
    self.fid = FormatIdentifier(
        allow_unknown_file_types=self.options.get('allow_unknown_file_types', False),
    )
def list_files(path, force_download=False, request=None, paginator=None):
    """List a path's entries, or return the file the path points at.

    Handled cases, in order:

    * ``path`` is already a list of entries: returned as-is (paginated if
      ``paginator`` is given).
    * ``path`` is a tar file, or a zip file with a ``.zip`` extension: its
      file members are listed.
    * ``path`` is any other regular file: a file response is returned.
    * ``path`` is a directory: its entries are listed.
    * ``path`` contains exactly one ``.tar/`` or ``.zip/``: the part after
      it is read from that archive and returned as a file response.

    Args:
        path: Filesystem path, or a ready-made list of entries.
        force_download: Forwarded to ``generate_file_response``.
        request: Request object used for pagination.
        paginator: Optional paginator for listings.

    Raises:
        NotFound: If nothing matches ``path`` or the archive member is
            missing or cannot be read as a file.
    """
    if isinstance(path, list):
        if paginator is not None:
            paginated = paginator.paginate_queryset(path, request)
            return paginator.get_paginated_response(paginated)
        return Response(path)

    fid = FormatIdentifier(allow_unknown_file_types=True)
    path = path.rstrip('/ ')

    if os.path.isfile(path):
        if tarfile.is_tarfile(path):
            with tarfile.open(path) as tar:
                entries = [
                    {
                        "name": member.name,
                        "type": 'file',
                        "size": member.size,
                        "modified": timestamp_to_datetime(member.mtime),
                    }
                    for member in tar.getmembers()
                    if member.isfile()
                ]
            if paginator is not None:
                paginated = paginator.paginate_queryset(entries, request)
                return paginator.get_paginated_response(paginated)
            return Response(entries)

        elif zipfile.is_zipfile(path) and os.path.splitext(path)[1] == '.zip':
            with zipfile.ZipFile(path) as zipf:
                entries = [
                    {
                        "name": member.filename,
                        "type": 'file',
                        "size": member.file_size,
                        "modified": datetime(*member.date_time),
                    }
                    for member in zipf.filelist
                    # Names ending in '/' are directory entries.
                    if not member.filename.endswith('/')
                ]
            if paginator is not None:
                paginated = paginator.paginate_queryset(entries, request)
                return paginator.get_paginated_response(paginated)
            return Response(entries)

        content_type = fid.get_mimetype(path)
        return generate_file_response(open(path, 'rb'), content_type, force_download)

    if os.path.isdir(path):
        entries = []
        for entry in sorted(get_files_and_dirs(path), key=lambda x: x.name):
            entry_type = "dir" if entry.is_dir() else "file"
            size, _ = get_tree_size_and_count(entry.path)
            entries.append(
                {
                    "name": os.path.basename(entry.path),
                    "type": entry_type,
                    "size": size,
                    "modified": timestamp_to_datetime(entry.stat().st_mtime),
                }
            )

        if paginator is not None and request is not None:
            paginated = paginator.paginate_queryset(entries, request)
            return paginator.get_paginated_response(paginated)
        # Without this return an unpaginated directory listing fell through
        # to the archive checks below and ended in NotFound.
        return Response(entries)

    if len(path.split('.tar/')) == 2:
        tar_path, tar_subpath = path.split('.tar/')
        tar_path += '.tar'

        with tarfile.open(tar_path) as tar:
            try:
                extracted = tar.extractfile(tar_subpath)
                # extractfile() returns None for members that are not
                # regular files or links (e.g. directories).
                if extracted is None:
                    raise NotFound
                f = io.BytesIO(extracted.read())
                content_type = fid.get_mimetype(tar_subpath)
                return generate_file_response(f, content_type, force_download, name=tar_subpath)
            except KeyError:
                raise NotFound

    if len(path.split('.zip/')) == 2:
        zip_path, zip_subpath = path.split('.zip/')
        zip_path += '.zip'

        with zipfile.ZipFile(zip_path) as zipf:
            try:
                f = io.BytesIO(zipf.read(zip_subpath))
                content_type = fid.get_mimetype(zip_subpath)
                return generate_file_response(f, content_type, force_download, name=zip_subpath)
            except KeyError:
                raise NotFound

    raise NotFound
def cli(path):
    """Identify the format of ``path`` and echo the result."""
    click.echo(FormatIdentifier().identify_file_format(path))
def test_handle_matches_when_no_matches_and_unknown_types_not_allowed(
        self, mock_mimetypes_init):
    """With unknown types disallowed, an empty match list raises ``ValueError``."""
    strict_identifier = FormatIdentifier(allow_unknown_file_types=False)

    self.assertRaises(
        ValueError,
        strict_identifier.handle_matches, 'fullname', [], mock.ANY,
    )
def test_unknown_content_type_when_not_allowed_should_raise_exception(
        self, mock_mimetypes_init):
    """With unknown types disallowed, an unrecognised name raises ``FileFormatNotAllowed``."""
    strict_identifier = FormatIdentifier(allow_unknown_file_types=False)

    self.assertRaises(
        FileFormatNotAllowed,
        strict_identifier.get_mimetype, 'some_random_file',
    )
def test_unknown_content_type(self, mock_mimetypes_init):
    """With unknown types allowed, an unrecognised name maps to ``DEFAULT_MIMETYPE``."""
    lenient_identifier = FormatIdentifier(allow_unknown_file_types=True)

    detected = lenient_identifier.get_mimetype('some_random_file')

    self.assertEqual(detected, DEFAULT_MIMETYPE)
def fid(self):
    """Return this object's ``FormatIdentifier``, creating it on first use.

    The instance is cached on ``self._fid`` so later calls reuse it.
    """
    if self._fid is None:
        self._fid = FormatIdentifier()
    # Return on the first call too; as written, the creation path stored
    # the new identifier but returned None.
    return self._fid