def test_update_object_and_get_through_store_should_get_identical_bytestream(self):
    """
    Overwriting a bytestream must make the object return the new content.

    The bytestream 'dummy.txt' is first written from the test file, then written again with the
    payload b'baz'. The MD5 digest of the retrieved bytestream must equal the digest of b'baz'.
    """
    # Create the object and store the initial content.
    factory = PairtreeStorageFactory()
    pairtree_store = factory.get_store(store_dir=self.data_dir, uri_base="http://dummy")
    identifier = u'owërdœ.file'
    stored_object = pairtree_store.create_object(identifier)
    with open(self.test_file_path, 'rb') as initial_content:
        stored_object.add_bytestream('dummy.txt', initial_content)

    # Replace the content; a BytesIO built from bytes starts at position 0.
    replacement = BytesIO(b'baz')
    stored_object.add_bytestream('dummy.txt', replacement)
    replacement.close()

    # Build the reference payload and compare digests.
    reference = BytesIO()
    reference.write(b'baz')
    expected_digest = hashlib.md5(reference.getvalue()).hexdigest()
    retrieved = stored_object.get_bytestream('dummy.txt')
    actual_digest = hashlib.md5(retrieved).hexdigest()
    self.assertEqual(expected_digest, actual_digest)
def test_create_object_and_file_should_exist_according_to_store(self):
    """
    A bytestream added to a new object must be reported as a file by the store.

    Uses an identifier containing non-ASCII characters and the bytestream name 'dummy'.
    """
    identifier = u'owërdœ.file'
    pairtree_store = PairtreeStorageFactory().get_store(
        store_dir=self.data_dir, uri_base="http://dummy")
    stored_object = pairtree_store.create_object(identifier)
    with open(self.test_file_path, 'rb') as content:
        stored_object.add_bytestream('dummy', content)
    bytestream_is_file = pairtree_store.isfile(identifier, 'dummy')
    self.assertTrue(bytestream_is_file)
def test_delete_object_should_remove_file_from_system(self):
    """
    Deleting an object must remove its location from the file system.

    The test file is added to the object 'test'; the test expects it to be present as
    'dummy_image.jpg' in the object's location before deletion, and expects the location
    to be gone afterwards.
    """
    pairtree_store = PairtreeStorageFactory().get_store(
        store_dir=self.data_dir, uri_base="http://dummy")
    stored_object = pairtree_store.create_object('test')
    stored_object.add_file(self.test_file_path)

    # The file must be present before the object is deleted.
    stored_file = os.path.join(stored_object.location, 'dummy_image.jpg')
    self.assertTrue(os.path.isfile(stored_file))

    pairtree_store.delete_object('test')
    location_still_there = os.path.exists(stored_object.location)
    self.assertFalse(location_still_there)
def test_store_file_should_create_identical_file(self):
    """
    A file added as bytestream must be stored byte-identical on disk.

    Adds the test file to the new object 'test' under the name 'dummy.jpg' and compares the
    MD5 digest of the source file with the digest of 'dummy.jpg' in the object's location.
    """
    storage_factory = PairtreeStorageFactory()
    store = storage_factory.get_store(store_dir=self.data_dir, uri_base="http://dummy")
    stored_object = store.create_object('test')
    with open(self.test_file_path, 'rb') as test_file:
        stored_object.add_bytestream('dummy.jpg', test_file)

    # Read both files inside context managers so the handles are closed
    # deterministically instead of being left to the garbage collector.
    with open(self.test_file_path, 'rb') as source_file:
        orig_hash = hashlib.md5(source_file.read()).hexdigest()
    with open(os.path.join(stored_object.location, 'dummy.jpg'), 'rb') as created_file:
        created_hash = hashlib.md5(created_file.read()).hexdigest()
    self.assertEqual(orig_hash, created_hash)
def __init__(self, repository_storage_dir):
    """
    Set up the pairtree repository client for the given storage directory.

    @type repository_storage_dir: string
    @param repository_storage_dir: repository storage directory
    """
    factory = PairtreeStorageFactory()
    self.storage_factory = factory
    self.repository_storage_dir = repository_storage_dir
    # The store is requested with the fixed URI base "http://".
    self.repo_storage_client = factory.get_store(
        store_dir=repository_storage_dir, uri_base="http://")
def __init__(self, repository_storage_dir, representations_directory="representations"):
    """
    Constructor initialises pairtree repository

    :param repository_storage_dir: repository storage directory
    :param representations_directory: name of the directory holding the representations
        (defaults to "representations")
    """
    super().__init__(repository_storage_dir)
    # The factory and the store client are set explicitly here, after the parent
    # constructor has run, exactly as before.
    self.storage_factory = PairtreeStorageFactory()
    self.repository_storage_dir = repository_storage_dir
    self.repo_storage_client = self.storage_factory.get_store(store_dir=repository_storage_dir, uri_base="http://")
    # Assigned once; the original assigned this attribute twice with the same value.
    self.representations_directory = representations_directory
def test_create_object_and_file_store_should_be_able_to_retreive_identical_file(self):
    """
    A bytestream read back from an object must equal the file it was created from.

    Adds the test file as bytestream 'dummy' to an object whose identifier contains non-ASCII
    characters, retrieves the bytestream again and compares MD5 digests.
    """
    storage_factory = PairtreeStorageFactory()
    store = storage_factory.get_store(store_dir=self.data_dir, uri_base="http://dummy")
    identifier = u'owërdœ.file'
    stored_object = store.create_object(identifier)
    with open(self.test_file_path, 'rb') as test_file:
        stored_object.add_bytestream('dummy', test_file)
    retreived_bytestream = stored_object.get_bytestream('dummy')

    # Read the reference file inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(self.test_file_path, 'rb') as source_file:
        orig_hash = hashlib.md5(source_file.read()).hexdigest()
    created_hash = hashlib.md5(retreived_bytestream).hexdigest()
    self.assertEqual(orig_hash, created_hash)
class DirectoryPairtreeStorage(PairtreeStorage):
    """
    Pairtree storage class allowing to build a filesystem hierarchy to store objects that are located by mapping
    identifier strings to object directory (or folder) paths.
    """
    storage_factory = None
    repository_storage_dir = None

    def __init__(self, repository_storage_dir, representations_directory="representations"):
        """
        Constructor initialises pairtree repository

        :param repository_storage_dir: repository storage directory
        :param representations_directory: name of the directory holding the representations
        """
        super().__init__(repository_storage_dir)
        self.storage_factory = PairtreeStorageFactory()
        self.repository_storage_dir = repository_storage_dir
        self.repo_storage_client = self.storage_factory.get_store(store_dir=repository_storage_dir, uri_base="http://")
        # Assigned once; the original assigned this attribute twice with the same value.
        self.representations_directory = representations_directory

    # noinspection PyProtectedMember
    def get_dir_path_from_id(self, identifier):
        """
        Get directory path from id

        :param identifier: identifier
        :return: directory path as returned by the store client's ``_id_to_dirpath``
        """
        return self.repo_storage_client._id_to_dirpath(identifier)

    def _tar_file_path(self, identifier, representation_label=None):
        """
        Build the expected path of a package tar file (existence is not checked).

        :param identifier: package identifier
        :param representation_label: label of the representation; if empty, the single package
            tar named after the safe filename of the identifier is addressed
        :return: path of the tar file below the object path
        """
        object_path = self.get_object_path(identifier)
        if representation_label:
            return os.path.join(object_path, self.representations_directory, "%s.tar" % representation_label)
        return os.path.join(object_path, "%s.tar" % to_safe_filename(identifier))

    def get_tar_file_path(self, identifier, representation_label=None):
        """
        Get the path of an existing package tar file

        :param identifier: package identifier
        :param representation_label: label of the representation; if empty, the single package tar is addressed
        :return: path of the tar file
        :raises ObjectNotFoundException: if the tar file does not exist
        """
        tar_file_path = self._tar_file_path(identifier, representation_label)
        if os.path.exists(tar_file_path):
            logger.debug("Package file found at: %s" % tar_file_path)
            return tar_file_path
        raise ObjectNotFoundException("Package file not found")

    def get_object_item_stream(self, identifier, representation_label, entry, tar_file=None):
        """
        Get stream of a representation tar file entry

        :param identifier: package identifier
        :param representation_label: label of the representation (used in directory and file names), can be empty,
            tar assumed to be single package in that case
        :param entry: entry of the tar file
        :param tar_file: optional tar file object that is used instead of opening the tar file from its path
        :return: chunks iterator of the tar file
        :raises ObjectNotFoundException: if the tar file does not exist or a KeyError signals a missing entry
        """
        # Use the same path rule as get_tar_file_path. The original expression dropped the
        # ".tar" suffix when no representation label was given and always looked inside the
        # representations directory.
        tar_file_path = self._tar_file_path(identifier, representation_label)
        if os.path.exists(tar_file_path):
            logger.debug("Packaged representation file found at: %s" % tar_file_path)
            opened_here = not tar_file
            t = tarfile.open(tar_file_path, 'r') if opened_here else tar_file
            logger.debug("Accessing access entry %s" % entry)
            try:
                inst = ChunkedTarEntryReader(t)
                return inst.chunks(entry)
            except KeyError:
                # Only close a tar file that was opened by this method; a tar file passed in
                # by the caller stays under the caller's control.
                if opened_here:
                    t.close()
                logger.error('ERROR: Did not find %s in tar archive' % entry)
        raise ObjectNotFoundException("Entry not found in repository object")

    def trigger_new_version(self, uuid, identifier, config_path_work, storage_directory):
        """
        Trigger new version depending on changed files in working directory compared to the data set in storage.

        For every item listed in the working representations directory, the package "<item>.tar" in the working
        directory is compared with the package of the same name in the current stored version.

        :param uuid: UUID of working directory
        :param identifier: Data asset identifier
        :param config_path_work: directory containing the working directory named by the UUID
        :param storage_directory: storage directory used to build the path of the stored data
        :return: True, if new version is triggered, False otherwise
        """
        working_dir = os.path.join(config_path_work, uuid)
        if self.identifier_object_exists(identifier):
            version = self.curr_version(identifier)
            data_asset_last_version_path = os.path.join(
                make_storage_data_directory_path(identifier, storage_directory),
                version, to_safe_filename(identifier))
            working_distributions_dir = os.path.join(working_dir, self.representations_directory)
            if not os.path.exists(working_distributions_dir):
                logger.debug("New version is not triggered because working catalogue directory does not exist.")
                return False
            stored_distributions_dir = os.path.join(data_asset_last_version_path, self.representations_directory)
            for dataset_dir in list_files_in_dir(working_distributions_dir):
                dataset_package_file = os.path.join(working_distributions_dir, "%s.tar" % dataset_dir)
                dataset_package_stored_file = os.path.join(stored_distributions_dir, "%s.tar" % dataset_dir)
                if not files_identical(dataset_package_file, dataset_package_stored_file):
                    logger.debug("New version triggered because hash of dataset packages is not identical")
                    return True
        logger.debug("New version not triggered.")
        return False

    def store(self, identifier, source_directory, progress_reporter=default_reporter, single_package=True):
        """
        Store a source directory; its last path component is used as UUID, the rest as working directory.

        :param identifier: Object identifier
        :param source_directory: source directory (trailing slashes are ignored)
        :param progress_reporter: progress reporter (accepted for interface compatibility, not invoked here)
        :param single_package: store as single package or as representation packages
        :return: version of the stored object
        """
        # os.path.split also handles paths without a parent ("uuid") and with a root parent
        # ("/uuid"), which the former manual slicing around rfind('/') split incorrectly.
        working_dir, uuid = os.path.split(source_directory.rstrip('/'))
        return self.store_working_directory(uuid, identifier, working_dir, single_package=single_package)

    def store_working_directory(self, uuid, identifier, working_directory, single_package=True):
        """
        Store working directory either as single package or as representation packages

        :param uuid: UUID of working directory
        :param identifier: Object identifier
        :param working_directory: working directory
        :param single_package: store as single package or as representation packages
        :return: version of the stored object
        """
        if single_package:
            version = super().store(identifier, working_directory, copy_dir=True)
        else:
            version = self.store_working_directory_as_representation_packages(uuid, identifier, working_directory)
        return version

    def store_working_directory_as_representation_packages(self, uuid, identifier, working_directory):
        """
        Store working directory as representation packages

        Files below the representations directory are only copied if they match "*.tar"; all other files
        are copied as they are. Files identical to their stored counterpart are skipped.

        :param uuid: UUID of working directory
        :param identifier: Object identifier
        :param working_directory: directory containing the working directory named by the UUID
        :return: version
        """
        working_dir = os.path.join(working_directory, uuid)
        version = self._next_version(identifier) \
            if self.trigger_new_version(uuid, identifier, working_directory, self.repository_storage_dir) \
            else self.curr_version(identifier)
        target_dir = os.path.join(make_storage_data_directory_path(identifier, self.repository_storage_dir),
                                  version, to_safe_filename(identifier))
        changed = False
        for path, _, files in os.walk(os.path.abspath(working_dir)):
            sub_path = path.replace(working_dir, "").lstrip("/")
            for file in files:
                # copy only packaged datasets, not the directories
                if not path.startswith(os.path.join(working_dir, self.representations_directory)) \
                        or fnmatch.fnmatch(file, "*.tar"):
                    source = os.path.join(working_dir, sub_path, file)
                    target = os.path.join(target_dir, sub_path, file)
                    # copy files only if they are not identical
                    if not files_identical(source, target):
                        copy_file_with_base_directory(working_dir, target_dir, sub_path, file)
                        changed = True
        # update state in storage and working directory if any files have been changed
        if changed:
            storage_state_file = os.path.join(target_dir, "state.xml")
            working_state_file = os.path.join(working_dir, "state.xml")
            update_state(working_state_file, identifier, version)
            shutil.copy2(working_state_file, storage_state_file)
        return version
class PairtreeStorage(object):
    """
    Pairtree storage class allowing to build a filesystem hierarchy for holding objects that are located by mapping
    identifier strings to object directory (or folder) paths with two characters at a time.
    """
    storage_factory = None
    repository_storage_dir = None

    def __init__(self, repository_storage_dir):
        """
        Constructor initialises pairtree repository

        @type repository_storage_dir: string
        @param repository_storage_dir: repository storage directory
        """
        self.storage_factory = PairtreeStorageFactory()
        self.repository_storage_dir = repository_storage_dir
        self.repo_storage_client = self.storage_factory.get_store(
            store_dir=self.repository_storage_dir, uri_base="http://")

    def store(self, identifier, source_file, progress_reporter=default_reporter):
        """
        Storing an object in the pairtree path according to the given identifier. If a version of the object exists,
        a new version is created.

        @type identifier: string
        @param identifier: Identifier
        @type source_file: string
        @param source_file: Source file path to be stored in the repository
        @type progress_reporter: function
        @param progress_reporter: progress reporter function (called with 50 after the copy and with 100 at the end)
        @raise IOError: If the checksum of the copied file is incorrect
        """
        repo_object = self.repo_storage_client.get_object(identifier, True)
        basename = ntpath.basename(source_file)
        next_version = self._next_version(identifier)
        with open(source_file, 'rb') as stream:
            repo_object.add_bytestream(basename, stream, path="data/%s" % next_version)
        progress_reporter(50)
        # Compare the SHA256 checksum of the source with the one of the file now found in the repository.
        checksum_source_file = ChecksumFile(source_file).get(ChecksumAlgorithm.SHA256)
        checksum_target_file = ChecksumFile(self.get_object_path(identifier)).get(ChecksumAlgorithm.SHA256)
        if checksum_source_file != checksum_target_file:
            raise IOError("Storage of repository object for identifier '%s' failed!" % identifier)
        progress_reporter(100)

    def identifier_object_exists(self, identifier):
        """
        Verify if an object of the given identifier exists in the repository

        @type identifier: string
        @param identifier: Identifier
        @rtype: boolean
        @return: True if the object exists, false otherwise
        """
        logger.debug("Looking for object at path: %s/data" % self.repo_storage_client._id_to_dirpath(identifier))
        return self.repo_storage_client.exists(identifier, "data")

    def identifier_version_object_exists(self, identifier, version_num):
        """
        Verify if the given version of the object exists in the repository

        @type identifier: string
        @param identifier: Identifier
        @type version_num: int
        @param version_num: version number
        @rtype: boolean
        @return: True if the object exists, false otherwise
        """
        version = '%05d' % version_num
        return self.repo_storage_client.exists(identifier, "data/%s" % version)

    def _get_version_parts(self, identifier):
        """
        Get version directories

        @type identifier: string
        @param identifier: Identifier
        @rtype: list
        @return: List of directories of the versions
        """
        return self.repo_storage_client.list_parts(identifier, "data")

    def _next_version(self, identifier):
        """
        Get next formatted version directory name

        @type identifier: string
        @param identifier: Identifier
        @rtype: string
        @return: Formatted version string (constant VersionDirFormat)
        """
        if not self.identifier_object_exists(identifier):
            return VersionDirFormat % 1
        # The next version is the first version number that does not exist yet.
        version_num = 1
        while self.identifier_version_object_exists(identifier, version_num):
            version_num += 1
        return VersionDirFormat % version_num

    def curr_version(self, identifier):
        """
        Get current formatted version directory name

        @type identifier: string
        @param identifier: Identifier
        @rtype: string
        @return: Formatted version string (constant VersionDirFormat)
        @raise ValueError: If no repository object exists for the identifier
        """
        return VersionDirFormat % self.curr_version_num(identifier)

    def curr_version_num(self, identifier):
        """
        Get current version number

        @type identifier: string
        @param identifier: Identifier
        @rtype: int
        @return: Current version number
        @raise ValueError: If no repository object exists for the identifier
        """
        if not self.identifier_object_exists(identifier):
            raise ValueError("No repository object for id '%s'. Unable to get current version number." % identifier)
        # Count upwards until a version is missing; the one before it is the current version.
        version_num = 1
        while self.identifier_version_object_exists(identifier, version_num):
            version_num += 1
        version_num -= 1
        return version_num

    def get_object_path(self, identifier, version_num=0):
        """
        Get absolute file path of the stored object. If the version number is omitted, the path of the highest
        version number is returned.

        @type identifier: string
        @param identifier: Identifier
        @type version_num: int
        @param version_num: version number (0 selects the current version)
        @rtype: string
        @return: Path of the first regular file found in the version directory
        @raise ValueError: If the object or the requested version does not exist
        @raise ObjectNotFoundException: If the version directory contains no file
        """
        if not self.identifier_object_exists(identifier):
            raise ValueError(
                "No repository object for id '%s'. Unable to get requested version object path." % identifier)
        if version_num == 0:
            version_num = self.curr_version_num(identifier)
        if not self.identifier_version_object_exists(identifier, version_num):
            raise ValueError("Repository object '%s' has no version %d." % (identifier, version_num))
        version = '%05d' % version_num
        repo_obj = self.repo_storage_client.get_object(identifier, False)
        repo_obj_path = uri_to_safe_filename(os.path.join(repo_obj.id_to_dirpath(), "data/%s" % version))
        try:
            return next(
                os.path.join(repo_obj_path, f) for f in os.listdir(repo_obj_path)
                if os.path.isfile(os.path.join(repo_obj_path, f)))
        except StopIteration:
            raise ObjectNotFoundException("The file object does not exist in the repository")

    def get_object_item_stream(self, identifier, entry):
        """
        Get stream of tar file entry.

        @type identifier: string
        @param identifier: Identifier
        @type entry: string
        @param entry: tar file entry (path within tar file)
        @return: Result of ChunkedTarEntryReader.chunks for the entry
        @raise ObjectNotFoundException: If the tar entry does not exist in the stored package
        """
        object_path = self.get_object_path(identifier)
        t = tarfile.open(object_path, 'r')
        logger.debug("Trying to access entry %s" % entry)
        try:
            # getmember raises KeyError for an unknown entry, so the error is reported here.
            t.getmember(entry)
            inst = ChunkedTarEntryReader(t)
            return inst.chunks(entry)
        except KeyError:
            # Nothing is handed out to the caller, so release the archive handle before raising.
            t.close()
            logger.error('ERROR: Did not find %s in tar archive' % entry)
            raise ObjectNotFoundException("Entry not found in repository object")

    def latest_version_ip_list(self):
        """
        Get a list of latest version packages from repository storage.

        Only ".tar" files whose directory starts with "pairtree_root" are considered; the last five
        characters of the directory are taken as the version.

        @return: List of dictionary items (keys "id", "version", "path") of IPs available in repository storage.
        """
        files = Dir(config_path_storage, exclude_file='').files()
        tuples = []
        for repofile in files:
            if repofile.endswith(".tar"):
                directory, _ = os.path.split(repofile)
                if directory.startswith("pairtree_root"):
                    version = directory[-5:] if directory[-5:] != '' else '00001'
                    tuples.append((repofile, version))
        # Highest version first; the sort is stable, so files of equal version keep their order.
        tuples.sort(key=lambda repoitem: repoitem[1], reverse=True)
        lastversionfiles = []
        # Set of identifiers already reported; replaces a linear scan of the result list per file.
        seen_ids = set()
        for item, version in tuples:
            p, _ = os.path.split(item)
            p2 = os.path.join(self.repository_storage_dir, p[:p.find("/data/")])
            obj_id = self.repo_storage_client._get_id_from_dirpath(p2)
            if obj_id not in seen_ids:
                seen_ids.add(obj_id)
                lastversionfiles.append({"id": obj_id, "version": version, "path": item})
        return lastversionfiles
def __init__(self, store_dir, uri_base='urn:x-vioe:'):
    """
    Obtain a pairtree store for the given directory and keep it in ``self.store``.

    :param store_dir: directory passed to the factory as ``store_dir``
    :param uri_base: URI base passed to the factory (defaults to 'urn:x-vioe:')
    """
    factory = PairtreeStorageFactory()
    pairtree_store = factory.get_store(store_dir=store_dir, uri_base=uri_base)
    self.store = pairtree_store
class PairtreeStorage(object):
    """
    Pairtree storage class allowing to build a filesystem hierarchy for holding objects that are located by mapping
    identifier strings to object directory (or folder) paths with two characters at a time.
    """
    storage_factory = None
    repository_storage_dir = None

    def __init__(self, repository_storage_dir):
        """
        Constructor initialises pairtree repository

        :param repository_storage_dir: repository storage directory
        """
        self.storage_factory = PairtreeStorageFactory()
        self.repository_storage_dir = repository_storage_dir
        self.repo_storage_client = self.storage_factory.get_store(
            store_dir=self.repository_storage_dir, uri_base="http://")

    # noinspection PyProtectedMember
    def store(self, identifier, source_directory, progress_reporter=default_reporter):
        """
        Storing a directory in the pairtree path according to the given identifier. If a version of the object exists,
        a new version is created.

        The source directory is copied to "<object dir>/data/<next version>/<safe identifier>".

        :param identifier: identifier
        :param source_directory: source directory
        :param progress_reporter: progress reporter (called with 100 after the copy)
        :return: formatted version directory name of the stored version
        """
        dirpath = self.repo_storage_client._id_to_dirpath(identifier)
        next_version = self._next_version(identifier)
        target_data_directory = os.path.join(dirpath, "data")
        pathlib.Path(target_data_directory).mkdir(parents=True, exist_ok=True)
        target_data_version_directory = os.path.join(target_data_directory, next_version)
        target_data_version_asset_directory = os.path.join(
            target_data_version_directory, uri_to_safe_filename(identifier))
        shutil.copytree(source_directory, target_data_version_asset_directory)
        progress_reporter(100)
        return next_version

    def identifier_object_exists(self, identifier):
        """
        Verify if an object of the given identifier exists in the repository

        :param identifier: identifier
        :return: True if the object exists, false otherwise
        """
        logger.debug("Looking for object at path: %s/data" % self.repo_storage_client._id_to_dirpath(identifier))
        return self.repo_storage_client.exists(identifier, "data")

    def identifier_version_object_exists(self, identifier, version_num):
        """
        Verify if the given version of the object exists in the repository

        :param identifier: identifier
        :param version_num: version number
        :return: True if the version exists, false otherwise
        """
        version = '%05d' % version_num
        return self.repo_storage_client.exists(identifier, "data/%s" % version)

    def _get_version_parts(self, identifier):
        """
        Get version directories

        :param identifier: identifier
        :return: version parts
        """
        return self.repo_storage_client.list_parts(identifier, "data")

    def _next_version(self, identifier):
        """
        Get next formatted version directory name

        :param identifier: identifier
        :return: next formatted version directory name
        """
        if not self.identifier_object_exists(identifier):
            return VersionDirFormat % 1
        # The next version is the first version number that does not exist yet.
        version_num = 1
        while self.identifier_version_object_exists(identifier, version_num):
            version_num += 1
        return VersionDirFormat % version_num

    def curr_version(self, identifier):
        """
        Get current formatted version directory name

        :param identifier: identifier
        :return: current formatted version directory name
        """
        return VersionDirFormat % self.curr_version_num(identifier)

    def curr_version_num(self, identifier):
        """
        Get current version number

        :param identifier: identifier
        :return: current version number; 1 if no object exists for the identifier
        """
        if not self.identifier_object_exists(identifier):
            return 1
        # Count upwards until a version is missing; the one before it is the current version.
        version_num = 1
        while self.identifier_version_object_exists(identifier, version_num):
            version_num += 1
        version_num -= 1
        return version_num

    def get_object_path(self, identifier, version_num=0):
        """
        Get absolute file path of the stored object. If the version number is omitted, the path of the highest
        version number is returned.

        :param identifier: identifier
        :param version_num: version number (0 selects the current version)
        :return: path of the first directory found in the version directory
        :raises ValueError: if the object or the requested version does not exist
        :raises ObjectNotFoundException: if the version directory contains no directory
        """
        if not self.identifier_object_exists(identifier):
            raise ValueError("No repository object for id '%s'. "
                             "Unable to get requested version object path." % identifier)
        if version_num == 0:
            version_num = self.curr_version_num(identifier)
        if not self.identifier_version_object_exists(identifier, version_num):
            raise ValueError("Repository object '%s' has no version %d." % (identifier, version_num))
        version = '%05d' % version_num
        repo_obj = self.repo_storage_client.get_object(identifier, False)
        repo_obj_path = uri_to_safe_filename(os.path.join(repo_obj.id_to_dirpath(), "data/%s" % version))
        try:
            return next(
                os.path.join(repo_obj_path, f) for f in os.listdir(repo_obj_path)
                if os.path.isdir(os.path.join(repo_obj_path, f)))
        except StopIteration:
            raise ObjectNotFoundException("The file object does not exist in the repository")

    def get_chunked_tar_entry_reader(self, identifier: str) -> ChunkedTarEntryReader:
        """
        Open the package tar file of the object and wrap it in a chunked entry reader.

        The tar file is expected at "<object path>/<safe identifier>.tar". The opened tar file is
        handed to the reader and is not closed by this method.

        :param identifier: identifier
        :return: chunked tar entry reader for the opened tar file
        """
        tar_file_path = os.path.join(
            self.get_object_path(identifier), "%s.tar" % uri_to_safe_filename(identifier))
        tar_file = tarfile.open(tar_file_path, 'r')
        return ChunkedTarEntryReader(tar_file)

    # noinspection PyProtectedMember
    def latest_version_ip_list(self) -> list:
        """
        Get a list of latest version packages from repository storage.

        Only ".tar" files whose directory starts with "pairtree_root" are considered; the last five
        characters of the directory are taken as the version.

        :return: list of dictionaries (keys "id", "version", "path"), one per object identifier
        """
        files = rec_find_files(self.repository_storage_dir)
        tuples = []
        for repofile in files:
            if repofile.endswith(".tar"):
                directory, _ = os.path.split(repofile)
                if directory.startswith("pairtree_root"):
                    version = directory[-5:] if directory[-5:] != '' else '00001'
                    tuples.append((repofile, version))
        # Highest version first; the sort is stable, so files of equal version keep their order.
        tuples.sort(key=lambda repoitem: repoitem[1], reverse=True)
        lastversionfiles = []
        # Set of identifiers already reported; replaces a linear scan of the result list per file.
        seen_ids = set()
        for item, version in tuples:
            p, _ = os.path.split(item)
            p2 = os.path.join(self.repository_storage_dir, p[:p.find("/data/")])
            obj_id = self.repo_storage_client._get_id_from_dirpath(p2)
            if obj_id not in seen_ids:
                seen_ids.add(obj_id)
                lastversionfiles.append({"id": obj_id, "version": version, "path": item})
        return lastversionfiles
class PairtreeStorage(object):
    """
    Pairtree storage class allowing to build a filesystem hierarchy for holding objects that are located by mapping
    identifier strings to object directory (or folder) paths with two characters at a time.
    """
    storage_factory = None
    repository_storage_dir = None

    def __init__(self, repository_storage_dir):
        """
        Set up the pairtree repository client for the given storage directory.

        @type repository_storage_dir: string
        @param repository_storage_dir: repository storage directory
        """
        factory = PairtreeStorageFactory()
        self.storage_factory = factory
        self.repository_storage_dir = repository_storage_dir
        self.repo_storage_client = factory.get_store(
            store_dir=repository_storage_dir, uri_base="http://")

    def identifier_object_exists(self, identifier):
        """
        Check whether the repository holds a "data" part for the identifier.

        @type identifier: string
        @param identifier: Identifier
        @rtype: boolean
        @return: True if the object exists, false otherwise
        """
        store_client = self.repo_storage_client
        return store_client.exists(identifier, "data")

    def identifier_version_object_exists(self, identifier, version_num):
        """
        Check whether the repository holds the given version of the object.

        @type identifier: string
        @param identifier: Identifier
        @type version_num: int
        @param version_num: version number (formatted as five zero-padded digits)
        @rtype: boolean
        @return: True if the version exists, false otherwise
        """
        version_part = "data/%05d" % version_num
        return self.repo_storage_client.exists(identifier, version_part)

    def curr_version_num(self, identifier):
        """
        Determine the highest consecutive version number present for the object.

        @type identifier: string
        @param identifier: Identifier
        @rtype: int
        @return: Current version number
        @raise ValueError: if no repository object exists for the identifier
        """
        if not self.identifier_object_exists(identifier):
            raise ValueError(
                "No repository object for id '%s'. Unable to get current version number." % identifier)
        # Probe versions 1, 2, 3, ... and stop at the first one that is missing.
        latest = 0
        while self.identifier_version_object_exists(identifier, latest + 1):
            latest += 1
        return latest

    def get_object_path(self, identifier, version_num=0):
        """
        Return the path of the stored object file; version 0 selects the current version.

        @type identifier: string
        @param identifier: Identifier
        @type version_num: int
        @param version_num: version number
        @rtype: string
        @return: Path of the first regular file found in the version directory
        @raise ValueError: if the object or the requested version does not exist
        @raise ObjectNotFoundException: if the version directory contains no file
        """
        if not self.identifier_object_exists(identifier):
            raise ValueError(
                "No repository object for id '%s'. Unable to get requested version object path." % identifier)
        requested = self.curr_version_num(identifier) if version_num == 0 else version_num
        if not self.identifier_version_object_exists(identifier, requested):
            raise ValueError("Repository object '%s' has no version %d." % (identifier, requested))
        repo_obj = self.repo_storage_client.get_object(identifier, False)
        version_dir = uri_to_safe_filename(
            os.path.join(repo_obj.id_to_dirpath(), "data/%s" % ('%05d' % requested)))
        # Return the first directory entry that is a regular file.
        for name in os.listdir(version_dir):
            candidate = os.path.join(version_dir, name)
            if os.path.isfile(candidate):
                return candidate
        raise ObjectNotFoundException("The file object does not exist in the repository")