def parse_input_path_for_lithops(sm_config, input_path):
    if input_path.startswith('s3://') or input_path.startswith('s3a://'):
        backend = 'aws_s3'
        bucket, prefix = split_s3_path(input_path)
    else:
        backend = 'ibm_cos'
        bucket, prefix = split_cos_path(input_path)

    storage = Storage(sm_config['lithops'], backend)
    if backend == 'aws_s3' and sm_config['lithops']['aws_s3']['endpoint'].startswith('http://'):
        # WORKAROUND for local Minio access
        # Lithops forces the url to HTTPS, so overwrite the S3 client with a fixed client
        # https://github.com/lithops-cloud/lithops/issues/708
        storage.storage_handler.s3_client = get_s3_client()

    keys_in_path = storage.list_keys(bucket, prefix)
    imzml_keys = [key for key in keys_in_path if key.lower().endswith('.imzml')]
    ibd_keys = [key for key in keys_in_path if key.lower().endswith('.ibd')]

    debug_info = f'Path {input_path} had keys: {keys_in_path}'
    assert len(imzml_keys) == 1, f'Couldn\'t determine imzML file. {debug_info}'
    assert len(ibd_keys) == 1, f'Couldn\'t determine ibd file. {debug_info}'

    imzml_cobject = CloudObject(storage.backend, bucket, imzml_keys[0])
    ibd_cobject = CloudObject(storage.backend, bucket, ibd_keys[0])

    return storage, imzml_cobject, ibd_cobject
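A minimal sketch of the same list-keys-then-wrap pattern, assuming a default Lithops configuration; the bucket and prefix names are illustrative, not taken from the code above.

from lithops import Storage
from lithops.storage.utils import CloudObject

storage = Storage()  # assumes a configured Lithops storage backend
keys = storage.list_keys('my-input-bucket', 'datasets/example_ds/')  # hypothetical bucket/prefix
imzml_keys = [k for k in keys if k.lower().endswith('.imzml')]
if imzml_keys:
    # Wrap the discovered key so it can be passed to Lithops functions by reference
    imzml_cobject = CloudObject(storage.backend, 'my-input-bucket', imzml_keys[0])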
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from bucket/s
    """
    partitions = []
    parts_per_object = []

    if chunk_number:
        logger.debug('Chunk number set to {}'.format(chunk_number))
    elif chunk_size:
        logger.debug('Chunk size set to {}'.format(chunk_size))
    else:
        logger.debug('Chunk size and chunk number not set')

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                ci = obj_size
                cz = obj_chunk_size
                parts = ci // cz + (ci % cz > 0)
                logger.debug('Creating {} partitions from object {} ({})'.format(
                    parts, key, sizeof_fmt(obj_size)))

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from bucket/s
    """
    logger.debug('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if chunk_size or chunk_number:
            logger.debug('Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.debug('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                logger.debug('Creating partitions from object {} size {}'.format(key, obj_size))

                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of object keys
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')

    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except Exception:
            raise Exception('Object key "{}" does not exist in "{}" bucket'.format(key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            chunk_size = obj_size // chunk_number + chunk_rest

        if chunk_size and chunk_size < CHUNK_SIZE_MIN:
            chunk_size = None

        total_partitions = 0

        if chunk_size is not None and obj_size > chunk_size:
            size = 0
            while size < obj_size:
                brange = (size, size + chunk_size + CHUNK_THRESHOLD)
                size += chunk_size

                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions = total_partitions + 1
        else:
            partition = entry
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of object keys
    """
    if chunk_size or chunk_number:
        logger.debug('Creating chunks from object keys')

    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except Exception:
            raise Exception('Object key "{}" does not exist in "{}" bucket'.format(key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
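A standalone worked example of the chunk-size arithmetic shared by the partitioner variants above; the 10-byte object and chunk_number of 3 are made-up values for illustration.

# Hypothetical inputs: a 10-byte object split into 3 chunks
obj_size = 10
chunk_number = 3

chunk_rest = obj_size % chunk_number  # 1 byte left over
obj_chunk_size = (obj_size // chunk_number) + round((chunk_rest / chunk_number) + 0.5)  # 3 + 1 = 4

# Number of partitions the while-loop above would emit (ceiling division)
parts = obj_size // obj_chunk_size + (obj_size % obj_chunk_size > 0)
print(obj_chunk_size, parts)  # 4 3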
def _upload_if_needed(src_path, storage, sm_storage, storage_type, s3_client=None,
                      use_db_mutex=True):
    """
    Uploads the object from `src_path` if it doesn't already exist in its translated COS path.
    Returns a CloudObject for the COS object
    """
    bucket, key = _choose_cos_location(src_path, sm_storage, storage_type)

    with ExitStack() as stack:
        if use_db_mutex:
            # Lock during upload to prevent parallel jobs uploading the same file simultaneously
            stack.enter_context(DBMutex().lock(bucket + key, timeout=1200))

        try:
            storage.head_object(bucket, key)
            logger.debug(f'{src_path} already uploaded')
            return CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {src_path}...')
            if src_path.startswith('s3a://'):
                assert s3_client, 'S3 client must be supplied to support s3a:// paths'
                src_bucket, src_key = split_s3_path(src_path)

                obj = s3_client.get_object(Bucket=src_bucket, Key=src_key)
                if hasattr(storage.get_client(), 'upload_fileobj'):
                    # Try streaming upload to IBM COS
                    transfer_config = TransferConfig(multipart_chunksize=20 * MB,
                                                     max_concurrency=20,
                                                     io_chunksize=1 * MB)
                    storage.get_client().upload_fileobj(Fileobj=obj['Body'], Bucket=bucket,
                                                        Key=key, Config=transfer_config)
                    cobject = CloudObject(storage.backend, bucket, key)
                else:
                    # Fall back to buffering the entire object in memory for other backends
                    cobject = storage.put_cloudobject(obj['Body'].read(), bucket, key)
            else:
                cobject = storage.put_cloudobject(open(src_path, 'rb'), bucket, key)
            logger.info(f'Uploading {src_path}...Done')
            return cobject
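An illustrative boto3-only sketch of the same streaming copy used in the s3a:// branch above; the bucket and key names are assumptions and error handling is omitted.

import boto3
from boto3.s3.transfer import TransferConfig

MB = 1024 ** 2
s3 = boto3.client('s3')

# Stream a GetObject body straight into upload_fileobj so the file is never fully buffered
obj = s3.get_object(Bucket='source-bucket', Key='datasets/example.ibd')  # hypothetical source
transfer_config = TransferConfig(multipart_chunksize=20 * MB, max_concurrency=20, io_chunksize=1 * MB)
s3.upload_fileobj(Fileobj=obj['Body'], Bucket='dest-bucket', Key='uploads/example.ibd',
                  Config=transfer_config)  # hypothetical destination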
def _upload_moldbs_from_db(moldb_ids, storage, sm_storage):
    moldb_defs = []
    bucket, prefix = sm_storage['moldb']
    # Sort the moldbs because the centroids cache key is affected by their order
    for moldb_id in sorted(moldb_ids):
        key = f'{prefix}/{moldb_id}'
        try:
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            mols_query = DB().select(
                'SELECT DISTINCT formula FROM molecule WHERE moldb_id = %s', (moldb_id,)
            )
            mols = [mol for mol, in mols_query]
            cobject = save_cobj(storage, mols, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        (targeted,) = DB().select_one(
            'SELECT targeted FROM molecular_db WHERE id = %s', (moldb_id,)
        )
        moldb_defs.append({'id': moldb_id, 'cobj': cobject, 'targeted': targeted})

    return moldb_defs
def create_partition(bucket, key, entry):
    if key.endswith('/'):
        logger.debug(f'Discarding object "{key}" as it is a prefix folder (0.0B)')
        return

    obj_size = keys_dict[bucket][key]

    if chunk_number:
        chunk_rest = obj_size % chunk_number
        obj_chunk_size = (obj_size // chunk_number) + \
            round((chunk_rest / chunk_number) + 0.5)
    elif chunk_size:
        obj_chunk_size = chunk_size
    else:
        obj_chunk_size = obj_size

    size = total_partitions = 0

    ci = obj_size
    cz = obj_chunk_size
    parts = ci // cz + (ci % cz > 0)
    logger.debug('Creating {} partitions from object {} ({})'.format(
        parts, key, sizeof_fmt(obj_size)))

    while size < obj_size:
        brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
        brange = None if obj_size == obj_chunk_size else brange

        partition = entry.copy()
        partition['obj'] = CloudObject(sb, bucket, key)
        partition['obj'].data_byte_range = brange
        partition['obj'].chunk_size = obj_chunk_size
        partition['obj'].part = total_partitions
        partitions.append(partition)

        total_partitions += 1
        size += obj_chunk_size

    parts_per_object.append(total_partitions)
def put_cobject(self, body, bucket=None, key=None):
    """
    Put CloudObject into storage.

    :param body: data content
    :param bucket: destination bucket
    :param key: destination key

    :return: CloudObject instance
    """
    prefix = os.environ.get('LITHOPS_EXECUTION_ID', '')
    coid = hex(next(self._created_cobjects_n))[2:]
    name = '{}/cloudobject_{}'.format(prefix, coid)
    key = key or '/'.join([TEMP_PREFIX, name])
    bucket = bucket or self.bucket
    self.storage_handler.put_object(bucket, key, body)

    return CloudObject(self.backend, bucket, key)
def test_delete_cloudobject(self):
    logger.info('Testing Storage.delete_cloudobject')
    sb = STORAGE_CONFIG['backend']
    bucket = STORAGE_CONFIG['bucket']

    test_keys = sorted([
        PREFIX + '/foo/baz',
        PREFIX + '/foo/bar/baz',
        PREFIX + '/foo_baz',
        PREFIX + '/bar',
        PREFIX + '/to_be_deleted',
    ])
    for key in test_keys:
        STORAGE.put_object(bucket, key, key.encode())

    cloudobject = CloudObject(sb, bucket, PREFIX + '/to_be_deleted')
    STORAGE.delete_cloudobject(cloudobject)

    all_bucket_keys = STORAGE.list_keys(bucket)
    self.assertFalse(PREFIX + '/to_be_deleted' in all_bucket_keys)
def _upload_moldbs_from_files(file_paths, storage, sm_storage):
    moldb_defs = []
    for file_path in file_paths:
        bucket, raw_key = _choose_cos_location(file_path, sm_storage, 'moldb')
        key = raw_key + '_formulas'
        try:
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            mols = read_moldb_file(file_path).formula
            cobject = save_cobj(storage, mols, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        moldb_defs.append({'id': Path(file_path).stem, 'cobj': cobject, 'targeted': False})

    return moldb_defs
def put_cloudobject(self, body: Union[str, bytes, TextIO, BinaryIO],
                    bucket: Optional[str] = None,
                    key: Optional[str] = None) -> CloudObject:
    """
    Put a CloudObject into storage.

    :param body: Data content, can be a string or byte array or a text/bytes file-like object
    :param bucket: Destination bucket
    :param key: Destination key

    :return: CloudObject instance
    """
    prefix = os.environ.get('__LITHOPS_SESSION_ID', '')
    coid = hex(next(self._created_cobjects_n))[2:]
    coname = 'cloudobject_{}'.format(coid)
    name = '/'.join([prefix, coname]) if prefix else coname
    key = key or '/'.join([TEMP_PREFIX, name])
    bucket = bucket or self.bucket
    self.storage_handler.put_object(bucket, key, body)

    return CloudObject(self.backend, bucket, key)
def test_delete_cloudobjects(self):
    logger.info('Testing Storage.delete_cloudobjects')
    sb = STORAGE_CONFIG['backend']
    bucket = STORAGE_CONFIG['bucket']

    test_keys = sorted([
        PREFIX + '/foo/baz',
        PREFIX + '/foo/bar/baz',
        PREFIX + '/foo_baz',
        PREFIX + '/bar',
        PREFIX + '/to_be_deleted1',
        PREFIX + '/to_be_deleted2',
        PREFIX + '/to_be_deleted3',
    ])

    cloudobjects = []
    keys_to_delete = [
        PREFIX + '/to_be_deleted1',
        PREFIX + '/to_be_deleted2',
        PREFIX + '/to_be_deleted3',
    ]
    for key in keys_to_delete:
        cobject = CloudObject(sb, bucket, key)
        cloudobjects.append(cobject)

    for key in test_keys:
        STORAGE.put_object(bucket, key, key.encode())

    STORAGE.delete_cloudobjects(cloudobjects)

    all_bucket_keys = STORAGE.list_keys(bucket)
    self.assertTrue(all(key not in all_bucket_keys for key in keys_to_delete))
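A hedged round-trip sketch tying put_cloudobject to the delete tests above, assuming a configured Lithops backend; the bucket and key are placeholders.

from lithops import Storage

storage = Storage()
# Hypothetical bucket/key; put_cloudobject returns a CloudObject handle
cobj = storage.put_cloudobject(b'hello world', bucket='my-temp-bucket', key='tmp/hello.bin')
print(storage.get_cloudobject(cobj))  # b'hello world'
storage.delete_cloudobject(cobj)      # removes the backing object, as exercised in the tests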
def __init__(self, backend, bucket, key):
    CloudObject.__init__(self, backend, bucket, key)
def upload_if_needed(storage, src, target_bucket, target_prefix=None):
    example_prefix = 'cos://embl-datasets/'
    if src.startswith(example_prefix):
        can_access_directly = (
            storage.backend in ('ibm_cos', 'cos')
            and object_exists(storage, 'embl-datasets', src[len(example_prefix):])
        )
        if not can_access_directly:
            # If using the sample datasets with a non-COS storage backend, use HTTPS instead
            logger.info(f'Translating IBM COS path to public HTTPS path for example file "{src}"')
            src = src.replace(
                example_prefix,
                'https://s3.us-east.cloud-object-storage.appdomain.cloud/embl-datasets/',
            )

    if '://' in src:
        backend, path = src.split('://', maxsplit=1)
        bucket, key = path.split('/', maxsplit=1)
    else:
        backend = None
        bucket = None
        filename = Path(src).name
        key = f'{target_prefix}/{filename}' if target_prefix else filename

    if backend not in ('https', 'http', None):
        # If it's not HTTP / filesystem, assume it's a bucket/key that Lithops can find
        assert object_exists(storage, bucket, key), f'Could not resolve input path "{src}"'
        return CloudObject(storage.backend, bucket, key)

    if object_exists(storage, target_bucket, key):
        # If the file would have to be uploaded, but there's already a copy in the storage bucket, use it
        logger.debug(f'Found input file already uploaded at "{storage.backend}://{target_bucket}/{key}"')
        return CloudObject(storage.backend, target_bucket, key)
    else:
        # Upload from HTTP or filesystem
        if backend in ('https', 'http'):
            r = requests.get(src, stream=True)
            r.raise_for_status()
            stream = r.raw
        else:
            src_path = Path(src)
            assert src_path.exists(), f'Could not find input file "{src}"'
            stream = src_path.open('rb')

        logger.info(f'Uploading "{src}" to "{storage.backend}://{target_bucket}/{key}"')
        if hasattr(storage.get_client(), 'upload_fileobj'):
            # Try a streaming upload through boto3 interface
            storage.get_client().upload_fileobj(Fileobj=stream, Bucket=target_bucket, Key=key)
            return CloudObject(storage.backend, target_bucket, key)
        else:
            # Fall back to buffering the entire object in memory for other backends
            data = stream.read()
            return storage.put_cloudobject(data, target_bucket, key)
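A CloudObject is essentially a (backend, bucket, key) record, so a caller might consume the return value as sketched here; the path and bucket names are assumptions, not values from the code above.

# Hypothetical follow-up to upload_if_needed (names are illustrative)
imzml_cobj = upload_if_needed(storage, 'input/example.imzML', 'my-pipeline-bucket', 'uploads')
body = storage.get_object(imzml_cobj.bucket, imzml_cobj.key, stream=True)  # stream it back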