def parse_input_path_for_lithops(sm_config, input_path):
    if input_path.startswith('s3://') or input_path.startswith('s3a://'):
        backend = 'aws_s3'
        bucket, prefix = split_s3_path(input_path)
    else:
        backend = 'ibm_cos'
        bucket, prefix = split_cos_path(input_path)

    storage = Storage(sm_config['lithops'], backend)
    if backend == 'aws_s3' and sm_config['lithops']['aws_s3'][
            'endpoint'].startswith('http://'):
        # WORKAROUND for local Minio access
        # Lithops forces the url to HTTPS, so overwrite the S3 client with a fixed client
        # https://github.com/lithops-cloud/lithops/issues/708
        storage.storage_handler.s3_client = get_s3_client()

    keys_in_path = storage.list_keys(bucket, prefix)
    imzml_keys = [
        key for key in keys_in_path if key.lower().endswith('.imzml')
    ]
    ibd_keys = [key for key in keys_in_path if key.lower().endswith('.ibd')]

    debug_info = f'Path {input_path} had keys: {keys_in_path}'
    assert len(imzml_keys) == 1, f"Couldn't determine imzML file. {debug_info}"
    assert len(ibd_keys) == 1, f"Couldn't determine ibd file. {debug_info}"

    imzml_cobject = CloudObject(storage.backend, bucket, imzml_keys[0])
    ibd_cobject = CloudObject(storage.backend, bucket, ibd_keys[0])
    return storage, imzml_cobject, ibd_cobject
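
A minimal usage sketch for `parse_input_path_for_lithops`, assuming a configured Lithops environment; the `sm_config` fragment, bucket and prefix below are hypothetical placeholders that only illustrate the keys the function reads:

# Hypothetical config fragment: only the keys read by the function are shown.
sm_config = {
    'lithops': {
        'aws_s3': {'endpoint': 'https://s3.amazonaws.com'},
        'ibm_cos': {},
    }
}

# Resolve an input prefix into the .imzML/.ibd pair as CloudObjects.
storage, imzml_cobj, ibd_cobj = parse_input_path_for_lithops(
    sm_config, 's3://my-bucket/datasets/sample1/')
print(imzml_cobj.bucket, imzml_cobj.key)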
Example #2
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size,
                                chunk_number):
    """
    Create partitions from bucket/s
    """
    partitions = []
    parts_per_object = []

    if chunk_number:
        logger.debug('Chunk number set to {}'.format(chunk_number))
    elif chunk_size:
        logger.debug('Chunk size set to {}'.format(chunk_size))
    else:
        logger.debug('Chunk size and chunk number not set')

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:

                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                ci = obj_size
                cz = obj_chunk_size
                parts = ci // cz + (ci % cz > 0)
                logger.debug(
                    'Creating {} partitions from object {} ({})'.format(
                        parts, key, sizeof_fmt(obj_size)))

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
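
The `chunk_number` branch above is effectively a ceiling division: each chunk gets the floor share of the object plus a rounded-up fraction of the remainder. A small standalone sketch of that arithmetic with made-up numbers:

import math

def chunk_size_for(obj_size, chunk_number):
    # Same arithmetic as the partitioner above.
    chunk_rest = obj_size % chunk_number
    return (obj_size // chunk_number) + round((chunk_rest / chunk_number) + 0.5)

obj_size = 100 * 1024 ** 2          # a 100 MiB object (made-up size)
obj_chunk_size = chunk_size_for(obj_size, chunk_number=3)
parts = math.ceil(obj_size / obj_chunk_size)
print(obj_chunk_size, parts)        # 34952534 bytes per chunk -> 3 partitions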
Example #3
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size,
                                chunk_number):
    """
    Create partitions from bucket/s
    """
    logger.debug('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if chunk_size or chunk_number:
            logger.debug(
                'Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.debug('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                logger.debug(
                    'Creating partitions from object {} size {}'.format(
                        key, obj_size))

                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
Example #4
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of objects keys
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except Exception:
            raise Exception('Object key "{}" does not exist in "{}" bucket'.format(key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            chunk_size = obj_size // chunk_number + chunk_rest

        if chunk_size and chunk_size < CHUNK_SIZE_MIN:
            chunk_size = None

        total_partitions = 0

        if chunk_size is not None and obj_size > chunk_size:
            size = 0
            while size < obj_size:
                brange = (size, size+chunk_size+CHUNK_THRESHOLD)
                size += chunk_size
                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions = total_partitions + 1
        else:
            partition = entry
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
Example #5
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size,
                             chunk_number):
    """
    Create partitions from a list of objects keys
    """
    if chunk_size or chunk_number:
        logger.debug('Creating chunks from object keys')

    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except Exception:
            raise Exception(
                'Object key "{}" does not exist in "{}" bucket'.format(
                    key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
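
For intuition, the byte ranges produced by the loop above overlap by `CHUNK_THRESHOLD` bytes, presumably so a record that straddles a chunk boundary can still be read to completion by the partition that owns it. A standalone sketch with made-up sizes (`CHUNK_THRESHOLD` is assumed to be 128 KiB here):

CHUNK_THRESHOLD = 128 * 1024   # assumed value for illustration

obj_size = 1_000_000           # made-up object size
obj_chunk_size = 400_000       # made-up chunk size

size = 0
ranges = []
while size < obj_size:
    ranges.append((size, size + obj_chunk_size + CHUNK_THRESHOLD))
    size += obj_chunk_size

print(ranges)
# [(0, 531072), (400000, 931072), (800000, 1331072)] -> 3 partitions,
# each extended CHUNK_THRESHOLD bytes past its nominal end.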
Example #6
def _upload_if_needed(src_path,
                      storage,
                      sm_storage,
                      storage_type,
                      s3_client=None,
                      use_db_mutex=True):
    """
    Uploads the object from `src_path` if it doesn't already exist in its translated COS path.
    Returns a CloudObject for the COS object
    """
    bucket, key = _choose_cos_location(src_path, sm_storage, storage_type)

    with ExitStack() as stack:
        if use_db_mutex:
            # Lock during upload to prevent parallel jobs from uploading the same file simultaneously
            stack.enter_context(DBMutex().lock(bucket + key, timeout=1200))

        try:
            storage.head_object(bucket, key)
            logger.debug(f'{src_path} already uploaded')
            return CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {src_path}...')
            if src_path.startswith('s3a://'):
                assert s3_client, 'S3 client must be supplied to support s3a:// paths'
                src_bucket, src_key = split_s3_path(src_path)

                obj = s3_client.get_object(Bucket=src_bucket, Key=src_key)
                if hasattr(storage.get_client(), 'upload_fileobj'):
                    # Try streaming upload to IBM COS
                    transfer_config = TransferConfig(
                        multipart_chunksize=20 * MB,
                        max_concurrency=20,
                        io_chunksize=1 * MB)
                    storage.get_client().upload_fileobj(Fileobj=obj['Body'],
                                                        Bucket=bucket,
                                                        Key=key,
                                                        Config=transfer_config)
                    cobject = CloudObject(storage.backend, bucket, key)
                else:
                    # Fall back to buffering the entire object in memory for other backends
                    cobject = storage.put_cloudobject(obj['Body'].read(),
                                                      bucket, key)
            else:
                cobject = storage.put_cloudobject(open(src_path, 'rb'), bucket,
                                                  key)
            logger.info(f'Uploading {src_path}...Done')
            return cobject
Example #7
def _upload_moldbs_from_db(moldb_ids, storage, sm_storage):
    moldb_defs = []
    bucket, prefix = sm_storage['moldb']
    # Sort the moldbs because the centroids cache key is affected by their order
    for moldb_id in sorted(moldb_ids):
        key = f'{prefix}/{moldb_id}'
        try:
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            mols_query = DB().select(
                'SELECT DISTINCT formula FROM molecule WHERE moldb_id = %s',
                (moldb_id, ))
            mols = [mol for mol, in mols_query]
            cobject = save_cobj(storage, mols, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        (targeted, ) = DB().select_one(
            'SELECT targeted FROM molecular_db WHERE id = %s', (moldb_id, ))
        moldb_defs.append({
            'id': moldb_id,
            'cobj': cobject,
            'targeted': targeted
        })

    return moldb_defs
Example #8
    def create_partition(bucket, key, entry):

        if key.endswith('/'):
            logger.debug(
                f'Discarding object "{key}" as it is a prefix folder (0.0B)')
            return

        obj_size = keys_dict[bucket][key]

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        ci = obj_size
        cz = obj_chunk_size
        parts = ci // cz + (ci % cz > 0)
        logger.debug('Creating {} partitions from object {} ({})'.format(
            parts, key, sizeof_fmt(obj_size)))

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)
Example #9
    def put_cobject(self, body, bucket=None, key=None):
        """
        Put CloudObject into storage.
        :param body: data content
        :param bucket: destination bucket
        :param key: destination key
        :return: CloudObject instance
        """
        prefix = os.environ.get('LITHOPS_EXECUTION_ID', '')
        coid = hex(next(self._created_cobjects_n))[2:]
        name = '{}/cloudobject_{}'.format(prefix, coid)
        key = key or '/'.join([TEMP_PREFIX, name])
        bucket = bucket or self.bucket
        self.storage_handler.put_object(bucket, key, body)

        return CloudObject(self.backend, bucket, key)
Example #10
    def test_delete_cloudobject(self):
        logger.info('Testing Storage.delete_cloudobject')
        sb = STORAGE_CONFIG['backend']
        bucket = STORAGE_CONFIG['bucket']
        test_keys = sorted([
            PREFIX + '/foo/baz',
            PREFIX + '/foo/bar/baz',
            PREFIX + '/foo_baz',
            PREFIX + '/bar',
            PREFIX + '/to_be_deleted',
        ])
        for key in test_keys:
            STORAGE.put_object(bucket, key, key.encode())
        cloudobject = CloudObject(sb, bucket, PREFIX + '/to_be_deleted')
        STORAGE.delete_cloudobject(cloudobject)
        all_bucket_keys = STORAGE.list_keys(bucket)
        self.assertFalse(PREFIX + '/to_be_deleted' in all_bucket_keys)
Example #11
def _upload_moldbs_from_files(file_paths, storage, sm_storage):
    moldb_defs = []
    for file_path in file_paths:
        bucket, raw_key = _choose_cos_location(file_path, sm_storage, 'moldb')
        key = raw_key + '_formulas'
        try:
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            mols = read_moldb_file(file_path).formula
            cobject = save_cobj(storage, mols, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        moldb_defs.append({
            'id': Path(file_path).stem,
            'cobj': cobject,
            'targeted': False
        })

    return moldb_defs
Example #12
    def put_cloudobject(self,
                        body: Union[str, bytes, TextIO, BinaryIO],
                        bucket: Optional[str] = None,
                        key: Optional[str] = None) -> CloudObject:
        """
        Put a CloudObject into storage.

        :param body: Data content, can be a string or byte array or a text/bytes file-like object
        :param bucket: Destination bucket
        :param key: Destination key

        :return: CloudObject instance
        """
        prefix = os.environ.get('__LITHOPS_SESSION_ID', '')
        coid = hex(next(self._created_cobjects_n))[2:]
        coname = 'cloudobject_{}'.format(coid)
        name = '/'.join([prefix, coname]) if prefix else coname
        key = key or '/'.join([TEMP_PREFIX, name])
        bucket = bucket or self.bucket
        self.storage_handler.put_object(bucket, key, body)

        return CloudObject(self.backend, bucket, key)
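
A short round-trip usage sketch for `put_cloudobject`, assuming a valid Lithops configuration is available; `get_cloudobject` and `delete_cloudobject` are the companion calls in the same Storage API:

from lithops import Storage

storage = Storage()   # assumes a Lithops config file or dict is available

# With no bucket/key given, a temporary key is generated under the
# session prefix, as shown in put_cloudobject above.
cobj = storage.put_cloudobject(b'hello cloudobject')

data = storage.get_cloudobject(cobj)   # read the content back as bytes
assert data == b'hello cloudobject'

storage.delete_cloudobject(cobj)       # clean up the temporary object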
Example #13
    def test_delete_cloudobjects(self):
        logger.info('Testing Storage.delete_cloudobjects')
        sb = STORAGE_CONFIG['backend']
        bucket = STORAGE_CONFIG['bucket']
        test_keys = sorted([
            PREFIX + '/foo/baz', PREFIX + '/foo/bar/baz', PREFIX + '/foo_baz',
            PREFIX + '/bar', PREFIX + '/to_be_deleted1',
            PREFIX + '/to_be_deleted2', PREFIX + '/to_be_deleted3'
        ])
        cloudobjects = []
        keys_to_delete = [
            PREFIX + '/to_be_deleted1', PREFIX + '/to_be_deleted2',
            PREFIX + '/to_be_deleted3'
        ]
        for key in keys_to_delete:
            cobject = CloudObject(sb, bucket, key)
            cloudobjects.append(cobject)
        for key in test_keys:
            STORAGE.put_object(bucket, key, key.encode())

        STORAGE.delete_cloudobjects(cloudobjects)
        all_bucket_keys = STORAGE.list_keys(bucket)
        self.assertTrue(
            all(key not in all_bucket_keys for key in keys_to_delete))
Example #14
    def __init__(self, backend, bucket, key):
        CloudObject.__init__(self, backend, bucket, key)

def upload_if_needed(storage, src, target_bucket, target_prefix=None):
    example_prefix = 'cos://embl-datasets/'
    if src.startswith(example_prefix):
        can_access_directly = (storage.backend in ('ibm_cos', 'cos')
                               and object_exists(storage, 'embl-datasets',
                                                 src[len(example_prefix):]))
        if not can_access_directly:
            # If using the sample datasets with a non-COS storage backend, use HTTPS instead
            logger.info(
                f'Translating IBM COS path to public HTTPS path for example file "{src}"'
            )
            src = src.replace(
                example_prefix,
                'https://s3.us-east.cloud-object-storage.appdomain.cloud/embl-datasets/'
            )

    if '://' in src:
        backend, path = src.split('://', maxsplit=1)
        bucket, key = path.split('/', maxsplit=1)
    else:
        backend = None
        bucket = None
        filename = Path(src).name
        key = f'{target_prefix}/{filename}' if target_prefix else filename

    if backend not in ('https', 'http', None):
        # If it's not HTTP / filesystem, assume it's a bucket/key that Lithops can find
        assert object_exists(storage, bucket,
                             key), f'Could not resolve input path "{src}"'
        return CloudObject(storage.backend, bucket, key)

    if object_exists(storage, target_bucket, key):
        # If the file would have to be uploaded, but there's already a copy in the storage bucket, use it
        logger.debug(
            f'Found input file already uploaded at "{storage.backend}://{target_bucket}/{key}"'
        )
        return CloudObject(storage.backend, target_bucket, key)
    else:
        # Upload from HTTP or filesystem
        if backend in ('https', 'http'):
            r = requests.get(src, stream=True)
            r.raise_for_status()
            stream = r.raw
        else:
            src_path = Path(src)
            assert src_path.exists(), f'Could not find input file "{src}"'
            stream = src_path.open('rb')

        logger.info(
            f'Uploading "{src}" to "{storage.backend}://{target_bucket}/{key}"'
        )
        if hasattr(storage.get_client(), 'upload_fileobj'):
            # Try a streaming upload through boto3 interface
            storage.get_client().upload_fileobj(Fileobj=stream,
                                                Bucket=target_bucket,
                                                Key=key)
            return CloudObject(storage.backend, target_bucket, key)
        else:
            # Fall back to buffering the entire object in memory for other backends
            data = stream.read()
            return storage.put_cloudobject(data, target_bucket, key)
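
A hedged usage sketch for `upload_if_needed`; the storage setup, bucket name, local path and URL below are placeholders, not values from the original code:

from lithops import Storage

storage = Storage()   # assumes a configured Lithops storage backend

# Local file: uploaded under the target prefix unless a copy already exists.
local_cobj = upload_if_needed(storage, '/data/sample.imzML',
                              target_bucket='my-upload-bucket',
                              target_prefix='uploads')

# HTTPS source: streamed into the target bucket (or buffered in memory
# when the backend client has no upload_fileobj).
http_cobj = upload_if_needed(storage,
                             'https://example.com/datasets/sample.imzML',
                             target_bucket='my-upload-bucket')

print(local_cobj.bucket, local_cobj.key)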