def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from bucket/s.

    :param map_func_args_list: list of map-function arg dicts; each entry's
        'obj' is a bucket (or bucket/prefix) URL.
    :param keys_dict: mapping {bucket: {key: size}} with the objects
        discovered beforehand.
    :param chunk_size: target chunk size in bytes, or None for no chunking.
    :param chunk_number: desired number of chunks per object, or None.
        Takes precedence over chunk_size when set.
    :return: tuple (partitions, parts_per_object) where partitions is the
        list of per-chunk arg dicts and parts_per_object holds the number
        of partitions created for each processed object.
    """
    logger.info('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        if chunk_size or chunk_number:
            logger.info('Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.info('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                logger.debug('Creating partitions from object {} size {}'.format(key, obj_size))
                total_partitions = 0

                # Compute the chunk size for THIS object only. The original
                # code mutated the chunk_size parameter in place, so one
                # object's computed value leaked into later iterations.
                if chunk_number:
                    # Ceiling division so the object is split into (at most)
                    # chunk_number chunks. The previous formula
                    # (obj_size // n + obj_size % n) added the whole
                    # remainder to every chunk and could yield far fewer
                    # chunks than requested.
                    obj_chunk_size = -(-obj_size // chunk_number)
                else:
                    obj_chunk_size = chunk_size

                # Chunks below the minimum are not worth splitting.
                if obj_chunk_size and obj_chunk_size < CHUNK_SIZE_MIN:
                    obj_chunk_size = None

                if obj_chunk_size is not None and obj_size > obj_chunk_size:
                    size = 0
                    while size < obj_size:
                        # Ranges overlap by CHUNK_THRESHOLD so a record cut
                        # at the boundary can be completed by the reader.
                        brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                        size += obj_chunk_size
                        partition = entry.copy()
                        partition['obj'] = CloudObject(sb, bucket, key)
                        partition['obj'].data_byte_range = brange
                        partition['obj'].chunk_size = obj_chunk_size
                        partition['obj'].part = total_partitions
                        partitions.append(partition)
                        total_partitions += 1
                else:
                    # Object processed whole: no byte range.
                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = None
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)
                    total_partitions = 1

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of object keys.

    :param map_func_args_list: list of map-function arg dicts; each entry's
        'obj' is a full object key URL.
    :param keys_dict: mapping {bucket: {key: size}} with the objects
        discovered beforehand.
    :param chunk_size: target chunk size in bytes, or None for no chunking.
    :param chunk_number: desired number of chunks per object, or None.
        Takes precedence over chunk_size when set.
    :return: tuple (partitions, parts_per_object) where partitions is the
        list of per-chunk arg dicts and parts_per_object holds the number
        of partitions created for each key.
    :raises Exception: if a key is not present in its bucket.
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except KeyError:
            # Narrowed from a blanket `except Exception`: only a missing
            # bucket or key can fail here, and both raise KeyError.
            raise Exception('Object key "{}" does not exist in "{}" bucket'.format(key, bucket))

        # Compute the chunk size for THIS object only; do not mutate the
        # chunk_size parameter (it would leak into later iterations).
        if chunk_number:
            # Ceiling division so the object is split into (at most)
            # chunk_number chunks; the previous formula added the whole
            # remainder to every chunk and could create fewer chunks
            # than requested.
            obj_chunk_size = -(-obj_size // chunk_number)
        else:
            obj_chunk_size = chunk_size

        # Chunks below the minimum are not worth splitting.
        if obj_chunk_size and obj_chunk_size < CHUNK_SIZE_MIN:
            obj_chunk_size = None

        total_partitions = 0

        if obj_chunk_size is not None and obj_size > obj_chunk_size:
            size = 0
            while size < obj_size:
                # Ranges overlap by CHUNK_THRESHOLD so a record cut at the
                # boundary can be completed by the reader.
                brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                size += obj_chunk_size
                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = obj_chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)
                total_partitions += 1
        else:
            # Copy the entry (the original aliased it, mutating the
            # caller's iterdata) — consistent with the chunked branch
            # and with _split_objects_from_buckets.
            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def create_partitions(pywren_config, map_iterdata, chunk_size, chunk_number):
    """
    Create the partitions of the objects in the Cloud.

    Classifies each iterdata element as a bucket, a bucket+prefix, an
    object key or a url, discovers object sizes from the storage backend
    when needed, and dispatches to the matching _split_objects_* helper.

    :param pywren_config: configuration used to build the storage handler.
    :param map_iterdata: iterable of dicts containing an 'obj' or 'url' key.
    :param chunk_size: target chunk size in bytes, or None.
    :param chunk_number: desired number of chunks per object, or None.
    :return: tuple (partitions, parts_per_object).
    :raises Exception: if more than one storage backend is referenced, or
        if input types are intermingled.
    :raises ValueError: if no bucket, object key or url is provided.
    """
    logger.debug('Starting partitioner')

    parts_per_object = None

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()
    urls = set()

    logger.debug("Parsing input data")
    for elem in map_iterdata:
        if 'url' in elem:
            urls.add(elem['url'])
        elif 'obj' in elem:
            sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
            if obj_name:
                obj_names.add((bucket, prefix))
            elif prefix:
                prefixes.add((bucket, prefix))
            else:
                buckets.add(bucket)
            sbs.add(sb)

    # Guard against empty input before sbs.pop() raises a bare KeyError.
    if not sbs and not urls:
        raise ValueError('You did not provide any bucket or object key/url')

    if len(sbs) > 1:
        raise Exception(
            'Currently we only support to process one storage backend at a time. '
            'Current storage backends: {}'.format(sbs))

    # Count the non-empty input categories. The original tested
    # [prefixes, obj_names, urls, buckets].count(True), which is always 0
    # because list.count uses ==, and a non-empty set never equals True —
    # intermingled inputs were silently accepted.
    if sum(1 for group in (prefixes, obj_names, urls, buckets) if group) > 1:
        raise Exception(
            'You must provide as an input data a list of buckets, '
            'a list of buckets with object prefix, a list of keys '
            'or a list of urls. Intermingled types are not allowed.')

    if not urls:
        # process objects from an object store. No url
        sb = sbs.pop()
        storage_handler = Storage(pywren_config, sb).get_storage_handler()
        objects = {}
        if obj_names:
            for bucket, prefix in obj_names:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                objects.setdefault(bucket, []).extend(
                    storage_handler.list_objects(bucket, prefix))
        elif prefixes:
            for bucket, prefix in prefixes:
                logger.debug("Listing objects in '{}://{}'".format(
                    sb, '/'.join([bucket, prefix])))
                objects.setdefault(bucket, []).extend(
                    storage_handler.list_objects(bucket, prefix))
        elif buckets:
            for bucket in buckets:
                logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
                objects[bucket] = storage_handler.list_objects(bucket)

        # Index object sizes by bucket and key for the split helpers.
        keys_dict = {}
        for bucket in objects:
            keys_dict[bucket] = {}
            for obj in objects[bucket]:
                keys_dict[bucket][obj['Key']] = obj['Size']

    if buckets or prefixes:
        partitions, parts_per_object = _split_objects_from_buckets(
            map_iterdata, keys_dict, chunk_size, chunk_number)
    elif obj_names:
        partitions, parts_per_object = _split_objects_from_keys(
            map_iterdata, keys_dict, chunk_size, chunk_number)
    elif urls:
        partitions, parts_per_object = _split_objects_from_urls(
            map_iterdata, chunk_size, chunk_number)
    else:
        raise ValueError('You did not provide any bucket or object key/url')

    return partitions, parts_per_object