Example #1
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size,
                                chunk_number):
    """
    Create partitions from bucket/s
    """
    partitions = []
    parts_per_object = []

    if chunk_number:
        logger.debug('Chunk number set to {}'.format(chunk_number))
    elif chunk_size:
        logger.debug('Chunk size set to {}'.format(chunk_size))
    else:
        logger.debug('Chunk size and chunk number not set')

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:

                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                # Number of partitions = ceil(obj_size / obj_chunk_size)
                parts = obj_size // obj_chunk_size + (obj_size % obj_chunk_size > 0)
                logger.debug(
                    'Creating {} partitions from object {} ({})'.format(
                        parts, key, sizeof_fmt(obj_size)))

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
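
The chunk-size arithmetic above, (obj_size // chunk_number) + round((chunk_rest / chunk_number) + 0.5), is a roundabout ceiling division: round(0.5) is 0 in Python 3, so the second term contributes 1 exactly when there is a remainder. A minimal standalone sketch checking the equivalence against math.ceil (no Lithops imports required):

import math

def obj_chunk_size(obj_size, chunk_number):
    # Same arithmetic as in the function above
    chunk_rest = obj_size % chunk_number
    return (obj_size // chunk_number) + round((chunk_rest / chunk_number) + 0.5)

for size, n in [(100, 3), (1024, 4), (7, 2), (10, 10)]:
    assert obj_chunk_size(size, n) == math.ceil(size / n)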
Example #2
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size,
                                chunk_number):
    """
    Create partitions from bucket/s
    """
    logger.debug('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if chunk_size or chunk_number:
            logger.debug(
                'Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.debug('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                logger.debug(
                    'Creating partitions from object {} size {}'.format(
                        key, obj_size))

                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
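
Each partition carries a data_byte_range that overruns the chunk end by CHUNK_THRESHOLD bytes, presumably so the consumer has room to finish a record that straddles the chunk boundary. A minimal sketch of how a downstream reader might turn that tuple into an HTTP Range header; range_header is a hypothetical helper, and clamping to the object size is an assumption, not part of the partitioner:

def range_header(data_byte_range, obj_size):
    # data_byte_range is None when the partition covers the whole object
    if data_byte_range is None:
        return None
    start, end = data_byte_range
    # The upper bound may overrun the object by CHUNK_THRESHOLD bytes;
    # clamp it so the ranged GET stays valid
    end = min(end, obj_size - 1)
    return 'bytes={}-{}'.format(start, end)

print(range_header((8, 13), 10))   # -> 'bytes=8-9'
print(range_header(None, 10))      # -> None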
Example #3
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of object keys
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except KeyError:
            raise Exception('Object key "{}" does not exist in "{}" bucket'.format(key, bucket))

        if chunk_number:
            # Use a local variable so the chunk_size parameter keeps its
            # original value for the remaining keys in the list
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = obj_size // chunk_number + chunk_rest
        else:
            obj_chunk_size = chunk_size

        if obj_chunk_size and obj_chunk_size < CHUNK_SIZE_MIN:
            obj_chunk_size = None

        total_partitions = 0

        if obj_chunk_size is not None and obj_size > obj_chunk_size:
            size = 0
            while size < obj_size:
                brange = (size, size+obj_chunk_size+CHUNK_THRESHOLD)
                size += obj_chunk_size
                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = obj_chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions += 1
        else:
            # Copy the entry so the caller's input dict is not mutated
            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
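
Note that this variant computes the chunk size as obj_size // chunk_number + chunk_rest, adding the whole remainder to every chunk rather than spreading it as in the ceiling-division variants above. A standalone arithmetic check of the consequence: with a large remainder, fewer chunks than requested can be produced:

def parts_produced(obj_size, chunk_number):
    # Chunk size as computed by this variant
    chunk_rest = obj_size % chunk_number
    chunk_size = obj_size // chunk_number + chunk_rest
    # Iterations of the while loop above (ceiling division)
    return -(-obj_size // chunk_size)

print(parts_produced(100, 8))   # chunk_size = 12 + 4 = 16 -> 7 parts, not 8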
Example #4
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size,
                             chunk_number):
    """
    Create partitions from a list of object keys
    """
    if chunk_size or chunk_number:
        logger.debug('Creating chunks from object keys')

    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except KeyError:
            raise Exception(
                'Object key "{}" does not exist in "{}" bucket'.format(
                    key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
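
A hypothetical invocation of the function above. The backend, bucket, key and sizes are illustrative only, and the sketch assumes the module-level CloudObject, CHUNK_THRESHOLD and utils.split_object_url are in scope:

map_func_args_list = [{'obj': 'cos://my-bucket/data/part-000.csv'}]
keys_dict = {'my-bucket': {'data/part-000.csv': 4 * 1024 * 1024}}

partitions, parts_per_object = _split_objects_from_keys(
    map_func_args_list, keys_dict, chunk_size=1024 * 1024, chunk_number=None)

# Expected: 4 partitions of 1 MiB each, with byte ranges that overrun the
# chunk end by CHUNK_THRESHOLD bytes; parts_per_object == [4]
for p in partitions:
    print(p['obj'].key, p['obj'].part, p['obj'].data_byte_range)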
Example #5
def create_partitions(config, internal_storage, map_iterdata, chunk_size, chunk_number):
    """
    Create the partitions from the objects stored in the Cloud
    """
    logger.debug('Starting partitioner')

    parts_per_object = None

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()
    urls = set()

    logger.debug("Parsing input data")
    for elem in map_iterdata:
        if 'url' in elem:
            urls.add(elem['url'])
        elif 'obj' in elem:
            if isinstance(elem['obj'], CloudObject):
                elem['obj'] = '{}://{}/{}'.format(elem['obj'].backend,
                                                  elem['obj'].bucket,
                                                  elem['obj'].key)
            sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
            if sb is None:
                sb = internal_storage.backend
                elem['obj'] = '{}://{}'.format(sb, elem['obj'])
            if obj_name:
                obj_names.add((bucket, prefix))
            elif prefix:
                prefixes.add((bucket, prefix))
            else:
                buckets.add(bucket)
            sbs.add(sb)

    if len(sbs) > 1:
        raise Exception('Processing objects from multiple storage backends '
                        'at a time is not supported. Current storage '
                        'backends: {}'.format(sbs))

    if [bool(prefixes), bool(obj_names), bool(urls), bool(buckets)].count(True) > 1:
        raise Exception('You must provide as input data a list of buckets, '
                        'a list of buckets with object prefix, a list of keys '
                        'or a list of urls. Intermingled types are not allowed.')

    if not urls:
        # process objects from an object store. No url
        sb = sbs.pop()
        if sb == internal_storage.backend:
            storage = internal_storage.storage
        else:
            storage = Storage(config=config, backend=sb)
        objects = {}
        if obj_names:
            for bucket, prefix in obj_names:
                logger.debug("Listing objects in '{}://{}/'"
                             .format(sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                prefix = prefix + '/' if prefix else prefix
                objects[bucket].extend(storage.list_objects(bucket, prefix))
        elif prefixes:
            for bucket, prefix in prefixes:
                logger.debug("Listing objects in '{}://{}/'"
                             .format(sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                prefix = prefix + '/' if prefix else prefix
                objects[bucket].extend(storage.list_objects(bucket, prefix))
        elif buckets:
            for bucket in buckets:
                logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
                objects[bucket] = storage.list_objects(bucket)

        keys_dict = {}
        for bucket in objects:
            keys_dict[bucket] = {}
            for obj in objects[bucket]:
                keys_dict[bucket][obj['Key']] = obj['Size']

    if buckets or prefixes:
        partitions, parts_per_object = _split_objects_from_buckets(map_iterdata, keys_dict, chunk_size, chunk_number)

    elif obj_names:
        partitions, parts_per_object = _split_objects_from_keys(map_iterdata, keys_dict, chunk_size, chunk_number)

    elif urls:
        partitions, parts_per_object = _split_objects_from_urls(map_iterdata, chunk_size, chunk_number)

    else:
        raise ValueError('You did not provide any bucket or object key/url')

    return partitions, parts_per_object
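
A hypothetical call, assuming a configured Lithops deployment where config and internal_storage come from the framework; the bucket URL is illustrative:

# Bucket-plus-prefix form: every object listed under 'logs/' is split
# into roughly 8 byte-range partitions
iterdata = [{'obj': 'cos://my-bucket/logs/'}]

partitions, parts_per_object = create_partitions(
    config, internal_storage, iterdata, chunk_size=None, chunk_number=8)

print(len(partitions), parts_per_object)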
Example #6
def _split_objects_from_object_storage(map_func_args_list, chunk_size,
                                       chunk_number, internal_storage, config):
    """
    Create partitions from a list of buckets or object keys
    """
    if chunk_number:
        logger.debug('Chunk number set to {}'.format(chunk_number))
    elif chunk_size:
        logger.debug('Chunk size set to {}'.format(chunk_size))
    else:
        logger.debug('Chunk size and chunk number not set')

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()

    for elem in map_func_args_list:
        if isinstance(elem['obj'], CloudObject):
            elem['obj'] = '{}://{}/{}'.format(elem['obj'].backend,
                                              elem['obj'].bucket,
                                              elem['obj'].key)
        sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
        if sb is None:
            sb = internal_storage.backend
            elem['obj'] = '{}://{}'.format(sb, elem['obj'])
        if obj_name:
            obj_names.add((bucket, prefix))
        elif prefix:
            prefixes.add((bucket, prefix))
        else:
            buckets.add(bucket)
        sbs.add(sb)

    if len(sbs) > 1:
        raise Exception(
            'Processing objects from multiple storage backends is not '
            'supported. Current storage backends: {}'.format(sbs))
    sb = sbs.pop()
    if sb == internal_storage.backend:
        storage = internal_storage.storage
    else:
        storage = Storage(config=config, backend=sb)

    objects = {}

    if obj_names:
        for bucket, prefix in obj_names:
            logger.debug("Listing objects in '{}://{}'".format(
                sb, '/'.join([bucket, prefix])))
            if bucket not in objects:
                objects[bucket] = []
            prefix = prefix + '/' if prefix else prefix
            objects[bucket].extend(storage.list_objects(bucket, prefix))
        logger.debug("Total objects found: {}".format(len(objects[bucket])))

    elif prefixes:
        for bucket, prefix in prefixes:
            logger.debug("Listing objects in '{}://{}'".format(
                sb, '/'.join([bucket, prefix])))
            if bucket not in objects:
                objects[bucket] = []
            prefix = prefix + '/' if prefix else prefix
            objects[bucket].extend(storage.list_objects(bucket, prefix))
        logger.debug("Total objects found: {}".format(len(objects[bucket])))

    elif buckets:
        for bucket in buckets:
            logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
            objects[bucket] = storage.list_objects(bucket)
        logger.debug("Total objects found: {}".format(len(objects[bucket])))

    if all(len(objects[bucket]) == 0 for bucket in objects):
        raise Exception(
            f'No objects found in bucket/s: {", ".join(objects.keys())}')

    keys_dict = {}
    for bucket in objects:
        keys_dict[bucket] = {}
        for obj in objects[bucket]:
            keys_dict[bucket][obj['Key']] = obj['Size']

    partitions = []
    parts_per_object = []

    def create_partition(bucket, key, entry):

        if key.endswith('/'):
            logger.debug(
                f'Discarding object "{key}" as it is a prefix folder (0.0B)')
            return

        obj_size = keys_dict[bucket][key]

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        # Number of partitions = ceil(obj_size / obj_chunk_size)
        parts = obj_size // obj_chunk_size + (obj_size % obj_chunk_size > 0)
        logger.debug('Creating {} partitions from object {} ({})'.format(
            parts, key, sizeof_fmt(obj_size)))

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)

    for entry in map_func_args_list:
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if obj_name:
            # each entry is an object key
            key = '/'.join([prefix, obj_name]) if prefix else obj_name
            create_partition(bucket, key, entry)

        else:
            # each entry is a bucket
            for key in keys_dict[bucket]:
                create_partition(bucket, key, entry)

    return partitions, parts_per_object
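
For inspection, a small helper that summarizes the plan any of these functions return. describe_partitions is a hypothetical name; it relies only on the CloudObject attributes (bucket, key, part, data_byte_range) assigned by the code above:

def describe_partitions(partitions, parts_per_object):
    # One line per partition, plus the per-object part counts
    for p in partitions:
        obj = p['obj']
        print('{}/{} part={} range={}'.format(
            obj.bucket, obj.key, obj.part, obj.data_byte_range))
    print('Parts per object: {}'.format(parts_per_object))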