Example no. 1
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size,
                                chunk_number):
    """
    Create partitions from bucket/s
    """
    logger.info('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if chunk_size or chunk_number:
            logger.info(
                'Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.info('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            # startswith enforces real prefix semantics; a plain substring
            # test would also match the prefix in the middle of a key
            if key.startswith(prefix) and obj_size > 0:
                logger.debug(
                    'Creating partitions from object {} size {}'.format(
                        key, obj_size))
                total_partitions = 0
                size = 0

                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    chunk_size = obj_size // chunk_number + chunk_rest

                if chunk_size and chunk_size < CHUNK_SIZE_MIN:
                    chunk_size = None

                if chunk_size is not None and obj_size > chunk_size:
                    while size < obj_size:
                        brange = (size, size + chunk_size + CHUNK_THRESHOLD)
                        size += chunk_size
                        partition = entry.copy()
                        partition['obj'] = CloudObject(sb, bucket, key)
                        partition['obj'].data_byte_range = brange
                        partition['obj'].chunk_size = chunk_size
                        partition['obj'].part = total_partitions
                        partitions.append(partition)
                        total_partitions += 1
                else:
                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = None
                    partition['obj'].chunk_size = chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)
                    total_partitions = 1

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
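The chunk arithmetic above is subtle: when chunk_number is given, the remainder of the integer division is added to every chunk, so the resulting number of partitions can be smaller than the requested chunk_number. Below is a minimal, self-contained sketch of just that byte-range computation; the CHUNK_THRESHOLD overlap value is an assumption made for illustration, since the real module defines its own constant.

# Minimal sketch of the byte-range split used above. The CHUNK_THRESHOLD
# value is assumed for illustration; the real partitioner defines its own.
CHUNK_THRESHOLD = 128

def byte_ranges(obj_size, chunk_size=None, chunk_number=None):
    # Same arithmetic as _split_objects_from_buckets
    if chunk_number:
        chunk_rest = obj_size % chunk_number
        chunk_size = obj_size // chunk_number + chunk_rest
    ranges = []
    size = 0
    if chunk_size is not None and obj_size > chunk_size:
        while size < obj_size:
            ranges.append((size, size + chunk_size + CHUNK_THRESHOLD))
            size += chunk_size
    else:
        ranges.append(None)  # whole object, no byte range
    return ranges

# chunk_number=4 over a 10-byte object yields only 3 partitions, because
# the division remainder is folded into every chunk:
print(byte_ranges(10, chunk_number=4))  # [(0, 132), (4, 136), (8, 140)]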
Example no. 2
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size,
                             chunk_number):
    """
    Create partitions from a list of objects keys
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except KeyError:
            raise Exception(
                'Object key "{}" does not exist in "{}" bucket'.format(
                    key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            chunk_size = obj_size // chunk_number + chunk_rest

        if chunk_size and chunk_size < CHUNK_SIZE_MIN:
            chunk_size = None

        total_partitions = 0

        if chunk_size is not None and obj_size > chunk_size:
            size = 0
            while size < obj_size:
                brange = (size, size + chunk_size + CHUNK_THRESHOLD)
                size += chunk_size
                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions += 1
        else:
            # Copy the entry so the caller's dict is not mutated in place
            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
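Both splitters consume the same data shapes: keys_dict maps each bucket name to a {key: size} dictionary, and every map_func_args_list entry carries an 'obj' URL. A hypothetical example of those inputs follows (backend name, bucket, keys and sizes are all made up for illustration):

# Hypothetical input shapes for the splitters above; all names and
# sizes are invented for illustration.
map_func_args_list = [
    {'obj': 'cos://my-bucket/data/part-0.csv'},
    {'obj': 'cos://my-bucket/data/part-1.csv'},
]
keys_dict = {
    'my-bucket': {
        'data/part-0.csv': 4 * 1024 ** 2,  # 4 MiB
        'data/part-1.csv': 9 * 1024 ** 2,  # 9 MiB
    }
}
# With chunk_size=None and chunk_number=None each key becomes a single
# partition, so parts_per_object would be [1, 1].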
Example no. 3
def create_partitions(pywren_config, map_iterdata, chunk_size, chunk_number):
    """
    Method that returns the function that will create the partitions of the objects in the Cloud
    """
    logger.debug('Starting partitioner')

    parts_per_object = None

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()
    urls = set()

    logger.debug("Parsing input data")
    for elem in map_iterdata:
        if 'url' in elem:
            urls.add(elem['url'])
        elif 'obj' in elem:
            sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
            if obj_name:
                obj_names.add((bucket, prefix))
            elif prefix:
                prefixes.add((bucket, prefix))
            else:
                buckets.add(bucket)
            sbs.add(sb)

    if len(sbs) > 1:
        raise Exception(
            'Currently, only one storage backend can be processed at a time. '
            'Current storage backends: {}'.format(sbs))

    # Cast to bool: non-empty sets are truthy but never equal to True,
    # so counting True over the raw sets would always yield 0
    if [bool(prefixes), bool(obj_names), bool(urls), bool(buckets)].count(True) > 1:
        raise Exception(
            'You must provide as input data a list of buckets, '
            'a list of buckets with an object prefix, a list of keys '
            'or a list of urls. Intermingled types are not allowed.')

    if not urls:
        # Process objects from an object store (no URLs were provided)
        sb = sbs.pop()
        storage_handler = Storage(pywren_config, sb).get_storage_handler()
        objects = {}
        if obj_names:
            for bucket, prefix in obj_names:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                objects[bucket].extend(
                    storage_handler.list_objects(bucket, prefix))
        elif prefixes:
            for bucket, prefix in prefixes:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                objects[bucket].extend(
                    storage_handler.list_objects(bucket, prefix))
        elif buckets:
            for bucket in buckets:
                logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
                objects[bucket] = storage_handler.list_objects(bucket)

        keys_dict = {}
        for bucket in objects:
            keys_dict[bucket] = {}
            for obj in objects[bucket]:
                keys_dict[bucket][obj['Key']] = obj['Size']

    if buckets or prefixes:
        partitions, parts_per_object = _split_objects_from_buckets(
            map_iterdata, keys_dict, chunk_size, chunk_number)

    elif obj_names:
        partitions, parts_per_object = _split_objects_from_keys(
            map_iterdata, keys_dict, chunk_size, chunk_number)

    elif urls:
        partitions, parts_per_object = _split_objects_from_urls(
            map_iterdata, chunk_size, chunk_number)

    else:
        raise ValueError('You did not provide any bucket or object key/url')

    return partitions, parts_per_object
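create_partitions starts by classifying each map_iterdata element according to how specific its 'obj' URL is: a full object key, a bucket plus prefix, or a bare bucket. The sketch below mimics that classification, assuming utils.split_object_url returns (backend, bucket, prefix, obj_name) as the code above implies; the real helper may cover more cases.

from urllib.parse import urlparse

def split_object_url_sketch(obj_url):
    # Rough re-implementation of what utils.split_object_url appears to
    # do, for illustration only
    parsed = urlparse(obj_url)          # e.g. 'cos://bucket/prefix/key.csv'
    sb = parsed.scheme                  # storage backend name
    bucket = parsed.netloc
    path = parsed.path.strip('/')
    if not path:
        return sb, bucket, '', ''       # bare bucket
    if obj_url.endswith('/'):
        return sb, bucket, path, ''     # bucket + prefix
    prefix, _, obj_name = path.rpartition('/')
    return sb, bucket, prefix, obj_name  # full object key

print(split_object_url_sketch('cos://my-bucket/'))            # ('cos', 'my-bucket', '', '')
print(split_object_url_sketch('cos://my-bucket/logs/'))       # ('cos', 'my-bucket', 'logs', '')
print(split_object_url_sketch('cos://my-bucket/logs/a.txt'))  # ('cos', 'my-bucket', 'logs', 'a.txt')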