Example 1
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size,
                                chunk_number):
    """
    Create partitions from bucket/s
    """
    logger.info('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if chunk_size or chunk_number:
            logger.info(
                'Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.info('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                logger.debug(
                    'Creating partitions from object {} size {}'.format(
                        key, obj_size))
                total_partitions = 0
                size = 0

                if chunk_number:
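                    # split the object into chunk_number parts, folding the
                    # division remainder into the chunk size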
                    chunk_rest = obj_size % chunk_number
                    chunk_size = obj_size // chunk_number + chunk_rest

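                # disable chunking if the resulting chunk would be too small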
                if chunk_size and chunk_size < CHUNK_SIZE_MIN:
                    chunk_size = None

                if chunk_size is not None and obj_size > chunk_size:
                    while size < obj_size:
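                        # each byte range extends CHUNK_THRESHOLD bytes past the
                        # nominal chunk end, overlapping the next partition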
                        brange = (size, size + chunk_size + CHUNK_THRESHOLD)
                        size += chunk_size
                        partition = entry.copy()
                        partition['obj'] = CloudObject(sb, bucket, key)
                        partition['obj'].data_byte_range = brange
                        partition['obj'].chunk_size = chunk_size
                        partition['obj'].part = total_partitions
                        partitions.append(partition)
                        total_partitions = total_partitions + 1
                else:
                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = None
                    partition['obj'].chunk_size = chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)
                    total_partitions = 1

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
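The arithmetic above drives how each object is sliced: when chunk_number is given, chunk_size becomes obj_size // chunk_number plus the division remainder, and every byte range is extended by CHUNK_THRESHOLD, presumably so a consumer can read past its nominal boundary. A minimal standalone sketch of that range computation, with CHUNK_THRESHOLD and CHUNK_SIZE_MIN values assumed purely for illustration:

# Standalone sketch of the slicing logic above; the two constants are
# illustrative assumptions, not the library's configured values.
CHUNK_THRESHOLD = 128 * 1024   # assumed overlap between consecutive ranges
CHUNK_SIZE_MIN = 1024 * 1024   # assumed minimum useful chunk size


def byte_ranges(obj_size, chunk_size=None, chunk_number=None):
    """Return the (start, end) byte ranges one object would be split into."""
    if chunk_number:
        # fold the division remainder into the chunk size, as above
        chunk_size = obj_size // chunk_number + obj_size % chunk_number
    if chunk_size and chunk_size < CHUNK_SIZE_MIN:
        chunk_size = None
    if chunk_size is None or obj_size <= chunk_size:
        # single partition: the code above sets data_byte_range = None here
        return [(0, obj_size)]
    ranges, start = [], 0
    while start < obj_size:
        # each range overlaps the next partition by CHUNK_THRESHOLD bytes
        ranges.append((start, start + chunk_size + CHUNK_THRESHOLD))
        start += chunk_size
    return ranges


print(byte_ranges(10 * 1024 ** 2, chunk_number=3))   # three overlapping ranges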
Example 2
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size,
                             chunk_number):
    """
    Create partitions from a list of object keys
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except KeyError:
            raise Exception(
                'Object key "{}" does not exist in "{}" bucket'.format(
                    key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            chunk_size = obj_size // chunk_number + chunk_rest

        if chunk_size and chunk_size < CHUNK_SIZE_MIN:
            chunk_size = None

        total_partitions = 0

        if chunk_size is not None and obj_size > chunk_size:
            size = 0
            while size < obj_size:
                brange = (size, size + chunk_size + CHUNK_THRESHOLD)
                size += chunk_size
                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions = total_partitions + 1
        else:
            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
Example 3
    def put_object(self, content, bucket=None, key=None):
        """
        Put a temporary data object into storage.
        :param content: data content to store (pickled before upload)
        :param bucket: destination bucket (defaults to self.bucket)
        :param key: destination key (auto-generated if not provided)
        :return: CloudObject instance
        """
        prefix = self.tmp_obj_prefix or 'tmp'
        key = key or 'data_{}.pickle'.format(self.tmp_obj_count)
        key = '/'.join([prefix, key])
        bucket = bucket or self.bucket
        body = pickle.dumps(content)
        self.storage_handler.put_object(bucket, key, body)
        self.tmp_obj_count += 1
        return CloudObject(self.backend, bucket, key)
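A hedged usage sketch: internal_storage below stands for an already configured instance of the class this method belongs to (the instance name is assumed, not taken from the source).

# Hypothetical, already configured instance of the class above
data = {'results': [1, 2, 3]}

# auto-generated key: <tmp_obj_prefix>/data_<n>.pickle
cobj = internal_storage.put_object(data)

# explicit destination bucket and key
cobj = internal_storage.put_object(data, bucket='my-bucket',
                                   key='run-42/data.pickle')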
Example 4
    def put_cobject(self, body, bucket=None, key=None):
        """
        Put CloudObject into storage.
        :param body: data content
        :param bucket: destination bucket
        :param key: destination key
        :return: CloudObject instance
        """
        prefix = os.environ.get('PYWREN_EXECUTION_ID', '')
        name = '{}/cloudobject_{}'.format(prefix, self.cloudobject_count)
        key = key or '/'.join([TEMP_PREFIX, name])
        bucket = bucket or self.bucket
        self.storage_handler.put_object(bucket, key, body)
        self.cloudobject_count += 1

        return CloudObject(self.backend, bucket, key)
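A hedged usage sketch for comparison with put_object above: put_cobject takes an already serialized body, so the caller handles encoding; storage is an assumed, already configured instance, not a name taken from the source.

import pickle

payload = pickle.dumps({'model': 'weights'})     # caller serializes the body

# auto-generated key under TEMP_PREFIX/<PYWREN_EXECUTION_ID>/cloudobject_<n>
cobj = storage.put_cobject(payload)

# explicit destination
cobj = storage.put_cobject(b'raw bytes', bucket='my-bucket',
                           key='artifacts/blob.bin')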