def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from bucket/s.

    Every object listed in keys_dict for the entry's bucket whose key
    contains the entry's prefix and whose size is greater than zero becomes
    either one partition (whole object) or several byte-range partitions.

    :param map_func_args_list: list of dicts; the 'obj' value of each entry is
        an object URL parsed with utils.split_object_url (each entry is a bucket)
    :param keys_dict: mapping of bucket -> {object key: object size}
    :param chunk_size: requested chunk size; None or 0 disables size-based
        splitting. Values below CHUNK_SIZE_MIN are ignored (object not split)
    :param chunk_number: requested number of chunks per object; when truthy
        it overrides chunk_size with
        obj_size // chunk_number + obj_size % chunk_number, so the resulting
        number of parts can differ from chunk_number
    :return: tuple (partitions, parts_per_object); partitions is a list of
        copies of the input entries whose 'obj' is a CloudObject carrying
        data_byte_range, chunk_size and part; parts_per_object holds the
        number of partitions generated for each processed object
    """
    logger.info('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, _ = utils.split_object_url(entry['obj'])

        if chunk_size or chunk_number:
            logger.info('Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.info('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            if prefix not in key or obj_size <= 0:
                continue

            logger.debug('Creating partitions from object {} size {}'
                         .format(key, obj_size))

            # Per-object value: the caller's chunk_size is never overwritten,
            # so one object's outcome cannot leak into the next iteration.
            obj_chunk_size = chunk_size
            if chunk_number:
                chunk_rest = obj_size % chunk_number
                obj_chunk_size = obj_size // chunk_number + chunk_rest

            if obj_chunk_size and obj_chunk_size < CHUNK_SIZE_MIN:
                obj_chunk_size = None

            total_partitions = 0
            # Truthiness test (not "is not None"): a chunk size of 0 would
            # never advance the offset below and loop forever.
            if obj_chunk_size and obj_size > obj_chunk_size:
                size = 0
                while size < obj_size:
                    # The range end overshoots the chunk by CHUNK_THRESHOLD;
                    # presumably so a reader can finish a record that crosses
                    # the chunk boundary -- confirm against the consumer.
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    size += obj_chunk_size
                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)
                    total_partitions = total_partitions + 1
            else:
                # Whole object as a single partition (part index 0)
                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = None
                partition['obj'].chunk_size = obj_chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions = 1

            parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of objects keys.

    Each entry yields either one partition (whole object) or several
    byte-range partitions. The input entries are never modified: every
    partition is a shallow copy of its entry with 'obj' replaced.

    :param map_func_args_list: list of dicts; the 'obj' value of each entry is
        an object URL parsed with utils.split_object_url (each entry is a key)
    :param keys_dict: mapping of bucket -> {object key: object size}
    :param chunk_size: requested chunk size; None or 0 disables size-based
        splitting. Values below CHUNK_SIZE_MIN are ignored (object not split)
    :param chunk_number: requested number of chunks per object; when truthy
        it overrides chunk_size with
        obj_size // chunk_number + obj_size % chunk_number, so the resulting
        number of parts can differ from chunk_number
    :return: tuple (partitions, parts_per_object); partitions is a list of
        dicts whose 'obj' is a CloudObject carrying data_byte_range,
        chunk_size and part; parts_per_object holds the number of partitions
        generated for each entry
    :raises Exception: if the bucket or the object key of an entry is not
        present in keys_dict
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')

    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except KeyError:
            raise Exception('Object key "{}" does not exist in "{}" bucket'
                            .format(key, bucket))

        # Per-object value: the caller's chunk_size is never overwritten,
        # so one object's outcome cannot leak into the next iteration.
        obj_chunk_size = chunk_size
        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = obj_size // chunk_number + chunk_rest

        if obj_chunk_size and obj_chunk_size < CHUNK_SIZE_MIN:
            obj_chunk_size = None

        total_partitions = 0
        # Truthiness test (not "is not None"): a chunk size of 0 would
        # never advance the offset below and loop forever.
        if obj_chunk_size and obj_size > obj_chunk_size:
            size = 0
            while size < obj_size:
                # The range end overshoots the chunk by CHUNK_THRESHOLD;
                # presumably so a reader can finish a record that crosses
                # the chunk boundary -- confirm against the consumer.
                brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                size += obj_chunk_size
                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = obj_chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions = total_partitions + 1
        else:
            # Copy the entry so the caller's dict keeps its original 'obj'
            # value, matching the chunked branch above.
            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def put_object(self, content, bucket=None, key=None):
    """
    Pickle a temporal data object and put it into storage.

    :param content: data content; serialized with pickle.dumps before upload
    :param bucket: destination bucket; self.bucket is used when not given
    :param key: data key; when not given, 'data_<self.tmp_obj_count>.pickle'
        is used. The final key is always placed under self.tmp_obj_prefix
        (or 'tmp' when that attribute is empty)
    :return: CloudObject instance pointing at the stored object
    """
    folder = self.tmp_obj_prefix or 'tmp'

    if not key:
        default_name = 'data_{}'.format(self.tmp_obj_count)
        key = '{}.pickle'.format(default_name)
    full_key = '/'.join([folder, key])

    target_bucket = bucket or self.bucket
    payload = pickle.dumps(content)
    self.storage_handler.put_object(target_bucket, full_key, payload)

    # The counter advances on every successful upload, including those
    # made with an explicit key.
    self.tmp_obj_count += 1

    return CloudObject(self.backend, target_bucket, full_key)
def put_cobject(self, body, bucket=None, key=None):
    """
    Put CloudObject into storage.

    :param body: data content, handed to the storage handler unchanged
    :param bucket: destination bucket; self.bucket is used when not given
    :param key: destination key; when not given it is built as
        '<TEMP_PREFIX>/<PYWREN_EXECUTION_ID>/cloudobject_<self.cloudobject_count>',
        where PYWREN_EXECUTION_ID is read from the environment
        (empty string if unset)
    :return: CloudObject instance pointing at the stored object
    """
    execution_id = os.environ.get('PYWREN_EXECUTION_ID', '')
    default_name = '{}/cloudobject_{}'.format(execution_id,
                                              self.cloudobject_count)

    if not key:
        key = '/'.join([TEMP_PREFIX, default_name])
    if not bucket:
        bucket = self.bucket

    self.storage_handler.put_object(bucket, key, body)

    # The counter advances on every successful upload, including those
    # made with an explicit key.
    self.cloudobject_count += 1

    return CloudObject(self.backend, bucket, key)