def store_factory(config, store_string):
    stores = {
        StoreType.S3.value: S3Store,
        StoreType.S3H.value: S3MultihashStore,
        StoreType.AZUREBLOBH.value: AzureMultihashStore,
        StoreType.GDRIVEH.value: GoogleDriveMultihashStore,
        StoreType.GDRIVE.value: GoogleDriveStore
    }
    sp = store_string.split('/')
    config_bucket_name, bucket_name = None, None
    try:
        store_type = sp[0][:-1]
        bucket_name = sp[2]
        config_bucket_name = []
        log.debug('Store [%s] ; bucket [%s]' % (store_type, bucket_name), class_name=STORE_FACTORY_CLASS_NAME)
        for k in config['store'][store_type]:
            config_bucket_name.append(k)
        if bucket_name not in config_bucket_name:
            log.warn('Exception creating store -- Configuration not found for bucket [%s]. '
                     'The available buckets in config file for store type [%s] are: %s'
                     % (bucket_name, store_type, config_bucket_name), class_name=STORE_FACTORY_CLASS_NAME)
            return None
        bucket = config['store'][store_type][bucket_name]
        return stores[store_type](bucket_name, bucket)
    except ProfileNotFound as pfn:
        log.error(pfn, class_name=STORE_FACTORY_CLASS_NAME)
        return None
def _submit_fn(self, userfn, *args, **kwds):
    ctx = self._get_ctx()
    result = False
    retry_cnt = 0
    while True:
        try:
            if ctx is not None:
                result = userfn(ctx, *args, **kwds)
            else:
                result = userfn(*args, **kwds)
        except Exception as e:
            if retry_cnt < self._retry:
                retry_cnt += 1
                log.warn(output_messages['WARN_WORKER_EXCEPTION'] % (e, retry_cnt), class_name=POOL_CLASS_NAME)
                self._retry_wait(retry_cnt)
                continue
            else:
                log.error(output_messages['ERROR_WORKER_FAILURE'] % (e, retry_cnt), class_name=POOL_CLASS_NAME)
                self._release_ctx(ctx)
                raise e
        break
    log.debug(output_messages['DEBUG_WORKER_SUCESS'] % (retry_cnt + 1), class_name=POOL_CLASS_NAME)
    self._release_ctx(ctx)
    self._progress()
    return result
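# A minimal standalone sketch of the retry pattern used by _submit_fn above, assuming a
# fixed retry budget and exponential backoff; the names 'call_with_retry' and 'flaky_fetch'
# are hypothetical and not part of ml-git.
import time


def call_with_retry(fn, *args, retries=3, base_wait=1.0, **kwds):
    attempt = 0
    while True:
        try:
            # Return on the first successful call.
            return fn(*args, **kwds)
        except Exception:
            attempt += 1
            if attempt > retries:
                # Retry budget exhausted: surface the last error to the caller.
                raise
            # Back off exponentially before trying again.
            time.sleep(base_wait * (2 ** (attempt - 1)))


# Usage sketch: call_with_retry(flaky_fetch, 'chunk-key', retries=3)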
def put(self, key_path, file_path):
    if not self.drive_path_id:
        log.error('Drive path [%s] not found.' % self._drive_path, class_name=GDRIVE_STORE)
        return False
    if self.key_exists(key_path):
        log.debug('Key path [%s] already exists in drive path [%s].' % (key_path, self._drive_path),
                  class_name=GDRIVE_STORE)
        return True
    if not os.path.exists(file_path):
        log.error('[%s] not found.' % file_path, class_name=GDRIVE_STORE)
        return False
    file_metadata = {'name': key_path, 'parents': [self.drive_path_id]}
    try:
        media = MediaFileUpload(file_path, chunksize=-1, resumable=True)
        self._store.files().create(body=file_metadata, media_body=media).execute()
    except Exception:
        raise RuntimeError('The file could not be uploaded: [%s]' % file_path)
    return True
def _execute(self, command, change_dir=True):
    cwd = None
    if change_dir:
        cwd = self._path
    proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          universal_newlines=True, shell=True, cwd=cwd)
    input_known_errors = [
        'Permission denied (publickey)',
        '/dev/tty: No such device or address',
        'Host key verification failed.'
    ]
    if any(error in proc.stdout for error in input_known_errors):
        proc = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True, cwd=cwd)
    log.debug(output_messages['DEBUG_EXECUTING_COMMAND'] % command, class_name=GIT_CLIENT_CLASS_NAME)
    self._check_output(proc)
    return proc
def put(self, key_path, file_path):
    self._storage.put(file_path, self._bucket + '/' + key_path)
    version = None
    log.debug(output_messages['INFO_FILE_STORED_IN_BUCKET'] % (file_path, self._bucket, key_path, version),
              class_name=SFTPSTORE_NAME)
    return key_path
def __commit_metadata(self, full_metadata_path, index_path, metadata, specs, ws_path):
    idx_path = os.path.join(index_path, 'metadata', self._spec)
    log.debug(output_messages['DEBUG_COMMIT_SPEC'] % self._spec, class_name=METADATA_CLASS_NAME)
    # saves README.md if any
    readme = 'README.md'
    src_readme = os.path.join(idx_path, readme)
    self._copy_to_metadata_path(src_readme, full_metadata_path, 'README.md')
    src_ignore_path = os.path.join(idx_path, MLGIT_IGNORE_FILE_NAME)
    self._copy_to_metadata_path(src_ignore_path, full_metadata_path, MLGIT_IGNORE_FILE_NAME)
    amount, workspace_size = self._get_amount_and_size_of_workspace_files(full_metadata_path, ws_path)
    # saves metadata and commit
    entity_spec_key = get_spec_key(self.__repo_type)
    metadata[entity_spec_key]['manifest']['files'] = MANIFEST_FILE
    metadata[entity_spec_key]['manifest']['size'] = humanize.naturalsize(workspace_size)
    metadata[entity_spec_key]['manifest']['amount'] = amount
    storage = metadata[entity_spec_key]['manifest'][STORAGE_SPEC_KEY]
    manifest = metadata[entity_spec_key]['manifest']
    PluginCaller(manifest).call(ADD_METADATA, ws_path, manifest)
    # Add metadata specific to labels ML entity type
    self._add_associate_entity_metadata(metadata, specs)
    self.__commit_spec(full_metadata_path, metadata)
    return storage
def storage_factory(config, storage_string):
    storages = {
        StorageType.S3.value: S3Storage,
        StorageType.S3H.value: S3MultihashStorage,
        StorageType.AZUREBLOBH.value: AzureMultihashStorage,
        StorageType.GDRIVEH.value: GoogleDriveMultihashStorage,
        StorageType.GDRIVE.value: GoogleDriveStorage,
        StorageType.SFTPH.value: SFtpStorage
    }
    sp = storage_string.split('/')
    config_bucket_name, bucket_name = None, None
    try:
        storage_type = sp[0][:-1]
        bucket_name = sp[2]
        config_bucket_name = []
        log.debug(output_messages['DEBUG_STORAGE_AND_BUCKET'] % (storage_type, bucket_name),
                  class_name=STORAGE_FACTORY_CLASS_NAME)
        for k in config[STORAGE_CONFIG_KEY][storage_type]:
            config_bucket_name.append(k)
        if bucket_name not in config_bucket_name:
            log.warn(output_messages['WARN_EXCPETION_CREATING_STORAGE'] % (bucket_name, storage_type, config_bucket_name),
                     class_name=STORAGE_FACTORY_CLASS_NAME)
            return None
        bucket = config[STORAGE_CONFIG_KEY][storage_type][bucket_name]
        return storages[storage_type](bucket_name, bucket)
    except ProfileNotFound as pfn:
        log.error(pfn, class_name=STORAGE_FACTORY_CLASS_NAME)
        return None
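# Hedged illustration of how storage_factory above interprets a storage string: the scheme
# (minus its trailing ':') selects the storage class and the third '/'-separated segment is
# the bucket looked up under the storage config. The string below is a made-up example.
storage_string = 's3h://mlgit-datasets/some/key'
sp = storage_string.split('/')
storage_type = sp[0][:-1]   # 's3h:' -> 's3h'
bucket_name = sp[2]         # 'mlgit-datasets'
print(storage_type, bucket_name)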
def __commit_metadata(self, full_metadata_path, index_path, metadata, specs, ws_path):
    idx_path = os.path.join(index_path, 'metadata', self._spec)
    log.debug(output_messages['DEBUG_COMMIT_SPEC'] % self._spec, class_name=METADATA_CLASS_NAME)
    # saves README.md if any
    readme = 'README.md'
    src_readme = os.path.join(idx_path, readme)
    if os.path.exists(src_readme):
        dst_readme = os.path.join(full_metadata_path, readme)
        try:
            shutil.copy2(src_readme, dst_readme)
        except Exception as e:
            log.error(output_messages['ERROR_COULD_NOT_FIND_README'], class_name=METADATA_CLASS_NAME)
            raise e
    amount, workspace_size = self._get_amount_and_size_of_workspace_files(full_metadata_path, ws_path)
    # saves metadata and commit
    entity_spec_key = get_spec_key(self.__repo_type)
    metadata[entity_spec_key]['manifest']['files'] = MANIFEST_FILE
    metadata[entity_spec_key]['manifest']['size'] = humanize.naturalsize(workspace_size)
    metadata[entity_spec_key]['manifest']['amount'] = amount
    storage = metadata[entity_spec_key]['manifest'][STORAGE_SPEC_KEY]
    manifest = metadata[entity_spec_key]['manifest']
    PluginCaller(manifest).call(ADD_METADATA, ws_path, manifest)
    # Add metadata specific to labels ML entity type
    self._add_associate_entity_metadata(metadata, specs)
    self.__commit_spec(full_metadata_path, metadata)
    return storage
def get_entities(self):
    """Get a list of entities found in config.yaml.

    Returns:
        list of class Entity.
    """
    entities = []
    metadata_repository = namedtuple('Repository', ['private', 'full_name', 'ssh_url', 'html_url', 'owner'])
    metadata_owner = namedtuple('Owner', ['email', 'name'])
    try:
        for type_entity in EntityType:
            self.__init_manager(type_entity.value)
            if not self._manager:
                continue
            repository = metadata_repository(False, '', '', '', metadata_owner('', ''))
            for obj in Repo(self._manager.path).head.commit.tree.traverse():
                if SPEC_EXTENSION in obj.name:
                    entity_spec = yaml_load_str(io.BytesIO(obj.data_stream.read()))
                    entity = Entity(repository, entity_spec)
                    if entity.type in type_entity.value and entity not in entities:
                        entities.append(entity)
    except Exception as error:
        log.debug(output_messages['DEBUG_ENTITIES_RELATIONSHIP'].format(error),
                  class_name=LocalEntityManager.__name__)
    return entities
def __commit_metadata(self, full_metadata_path, index_path, metadata, specs, ws_path):
    idx_path = os.path.join(index_path, 'metadata', self._spec)
    log.debug('Commit spec [%s] to ml-git metadata' % self._spec, class_name=METADATA_CLASS_NAME)
    # saves README.md if any
    readme = 'README.md'
    src_readme = os.path.join(idx_path, readme)
    if os.path.exists(src_readme):
        dst_readme = os.path.join(full_metadata_path, readme)
        try:
            shutil.copy2(src_readme, dst_readme)
        except Exception as e:
            log.error('Could not find file README.md. Entity repository must have README.md file',
                      class_name=METADATA_CLASS_NAME)
            raise e
    amount, workspace_size = self._get_amount_and_size_of_workspace_files(full_metadata_path, ws_path)
    # saves metadata and commit
    metadata[self.__repo_type]['manifest']['files'] = MANIFEST_FILE
    metadata[self.__repo_type]['manifest']['size'] = humanize.naturalsize(workspace_size)
    metadata[self.__repo_type]['manifest']['amount'] = amount
    store = metadata[self.__repo_type]['manifest']['store']
    manifest = metadata[self.__repo_type]['manifest']
    PluginCaller(manifest).call(ADD_METADATA, ws_path, manifest)
    # Add metadata specific to labels ML entity type
    self._add_associate_entity_metadata(metadata, specs)
    self.__commit_spec(full_metadata_path, metadata)
    return store
def _log(self, objkey, links=[], log_file=None):
    log.debug('Update log for key [%s]' % objkey, class_name=HASH_FS_CLASS_NAME)
    log_file.write("%s\n" % objkey)
    for link in links:
        h = link['Hash']
        log_file.write("%s\n" % h)
def _get(self, file, key_path):
    bucket = self._bucket
    s3_resource = self._store
    res = s3_resource.Object(bucket, key_path).get()
    c = res['Body']
    log.debug('Get - downloading [%s] from bucket [%s] into file [%s]' % (key_path, bucket, file),
              class_name=S3_MULTI_HASH_STORE_NAME)
    with open(file, 'wb') as f:
        m = hashlib.sha256()
        while True:
            chunk = c.read(self._blk_size)
            if not chunk:
                break
            m.update(chunk)
            f.write(chunk)
        h = m.hexdigest()
        mh = multihash.encode(bytes.fromhex(h), 'sha2-256')
        cid = CIDv1('dag-pb', mh)
        ncid = str(cid)
        if self.check_integrity(key_path, ncid) is False:
            return False
    c.close()
    return True
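# A small sketch of how the content id checked by _get above is rebuilt from the downloaded
# bytes, assuming the same 'multihash' and 'cid' (CIDv1) libraries used by the surrounding
# code; the payload below is arbitrary example data, not a real chunk.
import hashlib

import multihash
from cid import CIDv1

payload = b'example chunk bytes'
digest = hashlib.sha256(payload).hexdigest()
mh = multihash.encode(bytes.fromhex(digest), 'sha2-256')
ncid = str(CIDv1('dag-pb', mh))
print(ncid)  # string compared against the expected key via check_integrity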
def fetch_scid(self, key, log_file=None):
    log.debug(output_messages['DEBUG_BUILDING_STORAGE_LOG'], class_name=HASH_FS_CLASS_NAME)
    if self._exists(key):
        links = self.load(key)
        self._log(key, links['Links'], log_file)
    else:
        log.debug(output_messages['DEBUG_BLOB_ALREADY_COMMITED'] % key, class_name=HASH_FS_CLASS_NAME)
def _check_integrity(self, cid, data):
    cid0 = self._digest(data)
    if cid == cid0:
        log.debug(output_messages['DEBUG_CHECKSUM_VERIFIED'] % cid, class_name=HASH_FS_CLASS_NAME)
        return True
    log.debug(output_messages['DEBUG_CORRUPTION_DETECTED'] % (cid, cid0), class_name=HASH_FS_CLASS_NAME)
    return False
def get(self, object_key, dst_file_path):
    size = 0
    descriptor = json_load(self._get_hashpath(object_key))
    json_objects = json.dumps(descriptor).encode()
    is_corrupted = not self._check_integrity(object_key, json_objects)
    if is_corrupted:
        return size
    successfully_wrote = True
    # concat all chunks to dstfile
    try:
        with open(dst_file_path, 'wb') as dst_file:
            for chunk in descriptor['Links']:
                chunk_hash = chunk['Hash']
                blob_size = chunk['Size']
                log.debug(output_messages['DEBUG_GET_CHUNK'] % (chunk_hash, blob_size), class_name=HASH_FS_CLASS_NAME)
                size += int(blob_size)
                successfully_wrote = self._write_chunk_in_file(chunk_hash, dst_file)
                if not successfully_wrote:
                    break
    except Exception as e:
        if os.path.exists(dst_file_path):
            os.remove(dst_file_path)
        raise e
    if not successfully_wrote:
        size = 0
        os.unlink(dst_file_path)
    return size
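# Hedged sketch of the descriptor shape that get() above walks: a dict whose 'Links' list
# holds one entry per chunk with its 'Hash' and 'Size'. The hash values are placeholders.
descriptor = {
    'Links': [
        {'Hash': 'zdj7W...chunk1', 'Size': 1048576},
        {'Hash': 'zdj7W...chunk2', 'Size': 524288},
    ]
}
total_size = sum(int(chunk['Size']) for chunk in descriptor['Links'])
print(total_size)  # 1572864 bytes expected for the reassembled file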
def _update_file_status(self, cache, filepath, fullpath, scid, st, value):
    status = Status.a.name
    prev_hash = value['hash']
    scid_ret = scid
    is_flexible = self._mutability == MutabilityType.FLEXIBLE.value
    is_strict = self._mutability == MutabilityType.STRICT.value
    not_unlocked = value['mtime'] != st.st_mtime and 'untime' not in value
    bare_mode = os.path.exists(os.path.join(self._path, 'metadata', self._spec, 'bare'))
    if (is_flexible and not_unlocked) or is_strict:
        if value['status'] == Status.c.name and 'previous_hash' in value:
            prev_hash = value['previous_hash']
            if scid == prev_hash:
                prev_hash = None
                status = Status.u.name
                log.debug(output_messages['DEBUG_RESTORED_FILE'].format(posix_path(filepath)),
                          class_name=MULTI_HASH_CLASS_NAME)
        else:
            status = Status.c.name
            scid_ret = None
            file_path = Cache(cache).get_keypath(value['hash'])
            if os.path.exists(file_path):
                os.unlink(file_path)
    elif bare_mode and self._mutability == MutabilityType.MUTABLE.value:
        print('\n')
        log.warn(output_messages['WARN_FILE_EXISTS_IN_REPOSITORY'] % filepath, class_name=MULTI_HASH_CLASS_NAME)
    self.update_full_index(posix_path(filepath), fullpath, status, scid, prev_hash)
    return scid_ret
def update_index_unlock(self, filename):
    findex = self.get_index()
    try:
        findex[filename]['untime'] = time.time()
    except Exception:
        log.debug('The file [{}] isn\'t in index'.format(filename), class_name=MULTI_HASH_CLASS_NAME)
    self._fidx.save()
def _submit_fn(self, userfn, *args, **kwds):
    ctx = self._get_ctx()
    result = False
    retry_cnt = 0
    while True:
        try:
            if ctx is not None:
                result = userfn(ctx, *args, **kwds)
            else:
                result = userfn(*args, **kwds)
        except Exception as e:
            if retry_cnt < self._retry:
                retry_cnt += 1
                log.warn('Worker exception - [%s] -- retry [%d]' % (e, retry_cnt), class_name=POOL_CLASS_NAME)
                self._retry_wait(retry_cnt)
                continue
            else:
                log.error('Worker failure - [%s] -- [%d] attempts' % (e, retry_cnt), class_name=POOL_CLASS_NAME)
                self._release_ctx(ctx)
                raise e
        break
    log.debug('Worker success at attempt [%d]' % (retry_cnt + 1), class_name=POOL_CLASS_NAME)
    self._release_ctx(ctx)
    self._progress()
    return result
def _delete(self, key_path):
    bucket = self._bucket
    s3_resource = self._store
    log.debug('Delete - deleting [%s] from bucket [%s]' % (key_path, bucket), class_name=S3_MULTI_HASH_STORE_NAME)
    return s3_resource.Object(bucket, key_path).delete()
def search_spec_file(repotype, spec, categories_path):
    root_path = get_root_path()
    dir_with_cat_path = os.path.join(root_path, repotype, categories_path, spec)
    dir_without_cat_path = os.path.join(root_path, repotype, spec)
    files = None
    dir_files = None
    try:
        files = os.listdir(dir_with_cat_path)
        dir_files = dir_with_cat_path
    except Exception:
        try:
            files = os.listdir(dir_without_cat_path)
            dir_files = dir_without_cat_path
        except Exception:
            # TODO: search '.' path as well
            # if both candidate directories remain unreadable, 'files' stays None, the system couldn't find the
            # directory, which means that the entity name passed is wrong
            if files is None:
                raise SearchSpecException('The entity name passed is wrong. Please check again')
    if len(files) > 0:
        for file in files:
            if spec in file:
                log.debug('search spec file: found [%s]-[%s]' % (dir_files, file), class_name=ML_GIT_PROJECT_NAME)
                return dir_files, file
    raise SearchSpecException('The entity name passed is wrong. Please check again')
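# Hedged illustration of the two candidate spec locations probed by search_spec_file above,
# built with hypothetical names: the categorized layout is tried first, then the flat one.
import os

root_path = os.path.join('tmp', 'project')  # hypothetical ml-git root
repotype, categories_path, spec = 'dataset', 'images/cats', 'cats-dataset'
dir_with_cat_path = os.path.join(root_path, repotype, categories_path, spec)
dir_without_cat_path = os.path.join(root_path, repotype, spec)
print(dir_with_cat_path)      # tmp/project/dataset/images/cats/cats-dataset (POSIX)
print(dir_without_cat_path)   # tmp/project/dataset/cats-dataset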
def _delete(self, key_path, version=None):
    bucket = self._bucket
    s3_resource = self._storage
    log.debug(output_messages['DEBUG_DELETING_FROM_BUCKET'] % (key_path, bucket), class_name=S3STORAGE_NAME)
    if version is not None:
        s3_resource.Object(bucket, key_path).delete(VersionId=version)
    else:
        return s3_resource.Object(bucket, key_path).delete()
def update_log(self, files_to_keep):
    log.debug(output_messages['DEBUG_UPDATE_LOG_LIST_FILES'], class_name=HASH_FS_CLASS_NAME)
    fullpath = os.path.join(self._logpath, STORAGE_LOG)
    if not os.path.exists(fullpath):
        return None
    with open(fullpath, 'w') as log_file:
        for file in files_to_keep:
            log_file.write("%s\n" % file)
def update_index_unlock(self, filename):
    findex = self.get_index()
    try:
        findex[filename]['untime'] = time.time()
    except Exception:
        log.debug(output_messages['DEBUG_FILE_NOT_INDEX'].format(filename), class_name=MULTI_HASH_CLASS_NAME)
    self._fidx.save()
def connect(self):
    log.debug('Connect - Storage [%s] ;' % self._store_type, class_name=AZURE_STORE_NAME)
    try:
        self._store = BlobServiceClient.from_connection_string(self._account, connection_timeout=300)
    except Exception:
        raise RuntimeError('Unable to connect to the Azure storage.')
def check_integrity(self, cid, ncid):
    if cid == ncid:
        log.debug(output_messages['DEBUG_CHECKSUM_VERIFIED'] % cid, class_name=MULTI_HASH_STORAGE_NAME)
        return True
    log.debug(output_messages['DEBUG_CORRUPTION_DETECTED'] % (cid, ncid), class_name=MULTI_HASH_STORAGE_NAME)
    return False
def bucket_exists(self):
    container = ContainerClient.from_connection_string(self._account, self._bucket, connection_timeout=300)
    try:
        container.get_container_properties()
        log.debug(output_messages['DEBUG_CONTAINER_ALREADY_EXISTS'] % self._bucket, class_name=AZURE_STORAGE_NAME)
        return True
    except Exception:
        return False
def check_integrity(self, cid, ncid):
    if cid == ncid:
        log.debug('Checksum verified for chunk [%s]' % cid, class_name=MULTI_HASH_STORE_NAME)
        return True
    log.error('Corruption detected for chunk [%s] - got [%s]' % (cid, ncid), class_name=MULTI_HASH_STORE_NAME)
    return False
def check_exists(self):
    log.debug('Metadata check existence [%s] @ [%s]' % (self.__git, self.__path),
              class_name=METADATA_MANAGER_CLASS_NAME)
    try:
        Repo(self.__path)
    except Exception:
        return False
    return True
def update_log(self, files_to_keep):
    log.debug('Update hashfs log with a list of files to keep', class_name=HASH_FS_CLASS_NAME)
    fullpath = os.path.join(self._logpath, 'store.log')
    if not os.path.exists(fullpath):
        return None
    with open(fullpath, 'w') as log_file:
        for file in files_to_keep:
            log_file.write("%s\n" % file)
def fetch_scid(self, key, log_file=None):
    log.debug('Building the store.log with these added files', class_name=HASH_FS_CLASS_NAME)
    if self._exists(key):
        links = self.load(key)
        self._log(key, links['Links'], log_file)
    else:
        log.debug('Blob %s already commited' % key, class_name=HASH_FS_CLASS_NAME)