def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from bucket/s
    """
    partitions = []
    parts_per_object = []

    if chunk_number:
        logger.debug('Chunk number set to {}'.format(chunk_number))
    elif chunk_size:
        logger.debug('Chunk size set to {}'.format(chunk_size))
    else:
        logger.debug('Chunk size and chunk number not set')

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                ci = obj_size
                cz = obj_chunk_size
                parts = ci // cz + (ci % cz > 0)
                logger.debug('Creating {} partitions from object {} ({})'
                             .format(parts, key, sizeof_fmt(obj_size)))

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
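# --- Hedged example (not part of the original module) ----------------------
# A minimal sketch of the chunk-size arithmetic used in the chunk_number
# branch above. The object size and chunk_number are made-up values chosen
# only to show the rounded-up division and the resulting partition count.
def _example_chunk_arithmetic():
    obj_size = 1000                     # hypothetical object size in bytes
    chunk_number = 3                    # hypothetical requested chunk count

    chunk_rest = obj_size % chunk_number                  # 1000 % 3 -> 1
    obj_chunk_size = (obj_size // chunk_number) + \
        round((chunk_rest / chunk_number) + 0.5)          # 333 + 1 -> 334

    # same "ceil division" the partitioner uses to report the partition count
    parts = obj_size // obj_chunk_size + (obj_size % obj_chunk_size > 0)
    return obj_chunk_size, parts                          # (334, 3)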
def _split_objects_from_buckets(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from bucket/s
    """
    logger.debug('Creating dataset chunks from bucket/s ...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # Each entry is a bucket
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])

        if chunk_size or chunk_number:
            logger.debug('Creating chunks from objects within: {}'.format(bucket))
        else:
            logger.debug('Discovering objects within: {}'.format(bucket))

        for key, obj_size in keys_dict[bucket].items():
            if prefix in key and obj_size > 0:
                logger.debug('Creating partitions from object {} size {}'
                             .format(key, obj_size))

                if chunk_number:
                    chunk_rest = obj_size % chunk_number
                    obj_chunk_size = (obj_size // chunk_number) + \
                        round((chunk_rest / chunk_number) + 0.5)
                elif chunk_size:
                    obj_chunk_size = chunk_size
                else:
                    obj_chunk_size = obj_size

                size = total_partitions = 0

                while size < obj_size:
                    brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
                    brange = None if obj_size == obj_chunk_size else brange

                    partition = entry.copy()
                    partition['obj'] = CloudObject(sb, bucket, key)
                    partition['obj'].data_byte_range = brange
                    partition['obj'].chunk_size = obj_chunk_size
                    partition['obj'].part = total_partitions
                    partitions.append(partition)

                    total_partitions += 1
                    size += obj_chunk_size

                parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of object keys
    """
    if chunk_size or chunk_number:
        logger.info('Creating chunks from object keys...')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except Exception:
            raise Exception('Object key "{}" does not exist in "{}" bucket'
                            .format(key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            chunk_size = obj_size // chunk_number + chunk_rest

        if chunk_size and chunk_size < CHUNK_SIZE_MIN:
            chunk_size = None

        total_partitions = 0

        if chunk_size is not None and obj_size > chunk_size:
            size = 0
            while size < obj_size:
                brange = (size, size + chunk_size + CHUNK_THRESHOLD)
                size += chunk_size

                partition = entry.copy()
                partition['obj'] = CloudObject(sb, bucket, key)
                partition['obj'].data_byte_range = brange
                partition['obj'].chunk_size = chunk_size
                partition['obj'].part = total_partitions
                partitions.append(partition)

                total_partitions = total_partitions + 1
        else:
            partition = entry
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = None
            partition['obj'].chunk_size = chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)
            total_partitions = 1

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def _split_objects_from_keys(map_func_args_list, keys_dict, chunk_size, chunk_number):
    """
    Create partitions from a list of object keys
    """
    if chunk_size or chunk_number:
        logger.debug('Creating chunks from object keys')
    partitions = []
    parts_per_object = []

    for entry in map_func_args_list:
        # each entry is a key
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        key = '/'.join([prefix, obj_name]) if prefix else obj_name

        try:
            obj_size = keys_dict[bucket][key]
        except Exception:
            raise Exception('Object key "{}" does not exist in "{}" bucket'
                            .format(key, bucket))

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)

    return partitions, parts_per_object
def create_partitions(config, internal_storage, map_iterdata, chunk_size, chunk_number):
    """
    Method that creates the partitions of the objects in the Cloud
    """
    logger.debug('Starting partitioner')

    parts_per_object = None

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()
    urls = set()

    logger.debug("Parsing input data")
    for elem in map_iterdata:
        if 'url' in elem:
            urls.add(elem['url'])
        elif 'obj' in elem:
            if type(elem['obj']) == CloudObject:
                elem['obj'] = '{}://{}/{}'.format(elem['obj'].backend,
                                                  elem['obj'].bucket,
                                                  elem['obj'].key)
            sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
            if sb is None:
                sb = internal_storage.backend
                elem['obj'] = '{}://{}'.format(sb, elem['obj'])
            if obj_name:
                obj_names.add((bucket, prefix))
            elif prefix:
                prefixes.add((bucket, prefix))
            else:
                buckets.add(bucket)
            sbs.add(sb)

    if len(sbs) > 1:
        raise Exception('Only one storage backend can be processed at a time. '
                        'Current storage backends: {}'.format(sbs))

    if [bool(prefixes), bool(obj_names), bool(urls), bool(buckets)].count(True) > 1:
        raise Exception('You must provide as input data a list of buckets, '
                        'a list of buckets with an object prefix, a list of keys '
                        'or a list of urls. Intermingled types are not allowed.')

    if not urls:
        # process objects from an object store. No url
        sb = sbs.pop()
        if sb == internal_storage.backend:
            storage = internal_storage.storage
        else:
            storage = Storage(config=config, backend=sb)

        objects = {}
        if obj_names:
            for bucket, prefix in obj_names:
                logger.debug("Listing objects in '{}://{}/'"
                             .format(sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                prefix = prefix + '/' if prefix else prefix
                objects[bucket].extend(storage.list_objects(bucket, prefix))
        elif prefixes:
            for bucket, prefix in prefixes:
                logger.debug("Listing objects in '{}://{}/'"
                             .format(sb, '/'.join([bucket, prefix])))
                if bucket not in objects:
                    objects[bucket] = []
                prefix = prefix + '/' if prefix else prefix
                objects[bucket].extend(storage.list_objects(bucket, prefix))
        elif buckets:
            for bucket in buckets:
                logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
                objects[bucket] = storage.list_objects(bucket)

        keys_dict = {}
        for bucket in objects:
            keys_dict[bucket] = {}
            for obj in objects[bucket]:
                keys_dict[bucket][obj['Key']] = obj['Size']

    if buckets or prefixes:
        partitions, parts_per_object = _split_objects_from_buckets(
            map_iterdata, keys_dict, chunk_size, chunk_number)
    elif obj_names:
        partitions, parts_per_object = _split_objects_from_keys(
            map_iterdata, keys_dict, chunk_size, chunk_number)
    elif urls:
        partitions, parts_per_object = _split_objects_from_urls(
            map_iterdata, chunk_size, chunk_number)
    else:
        raise ValueError('You did not provide any bucket or object key/url')

    return partitions, parts_per_object
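# --- Hedged example (not part of the original module) ----------------------
# Sketch of how create_partitions() might be driven by a caller. The 'config'
# dict, the 'internal_storage' handle, the 's3://...' URL and the 64 MiB
# chunk size are all assumptions used only for illustration.
def _example_create_partitions_usage(config, internal_storage):
    map_iterdata = [{'obj': 's3://my-bucket/my-prefix/'}]   # hypothetical input
    chunk_size = 64 * 1024 ** 2                             # ~64 MiB per partition
    partitions, parts_per_object = create_partitions(
        config, internal_storage, map_iterdata, chunk_size, None)
    for p in partitions:
        # every partition carries a CloudObject with its part index and byte range
        print(p['obj'].key, p['obj'].part, p['obj'].data_byte_range)
    return partitions, parts_per_object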
def _split_objects_from_object_storage(map_func_args_list, chunk_size, chunk_number, internal_storage, config):
    """
    Create partitions from a list of buckets or object keys
    """
    if chunk_number:
        logger.debug('Chunk number set to {}'.format(chunk_number))
    elif chunk_size:
        logger.debug('Chunk size set to {}'.format(chunk_size))
    else:
        logger.debug('Chunk size and chunk number not set')

    sbs = set()
    buckets = set()
    prefixes = set()
    obj_names = set()

    for elem in map_func_args_list:
        if type(elem['obj']) == CloudObject:
            elem['obj'] = '{}://{}/{}'.format(elem['obj'].backend,
                                              elem['obj'].bucket,
                                              elem['obj'].key)
        sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj'])
        if sb is None:
            sb = internal_storage.backend
            elem['obj'] = '{}://{}'.format(sb, elem['obj'])
        if obj_name:
            obj_names.add((bucket, prefix))
        elif prefix:
            prefixes.add((bucket, prefix))
        else:
            buckets.add(bucket)
        sbs.add(sb)

    if len(sbs) > 1:
        raise Exception(
            'Processing objects from multiple storage backends is not supported. '
            'Current storage backends: {}'.format(sbs))

    sb = sbs.pop()
    if sb == internal_storage.backend:
        storage = internal_storage.storage
    else:
        storage = Storage(config=config, backend=sb)

    objects = {}
    if obj_names:
        for bucket, prefix in obj_names:
            logger.debug("Listing objects in '{}://{}'"
                         .format(sb, '/'.join([bucket, prefix])))
            if bucket not in objects:
                objects[bucket] = []
            prefix = prefix + '/' if prefix else prefix
            objects[bucket].extend(storage.list_objects(bucket, prefix))
            logger.debug("Total objects found: {}".format(len(objects[bucket])))
    elif prefixes:
        for bucket, prefix in prefixes:
            logger.debug("Listing objects in '{}://{}'"
                         .format(sb, '/'.join([bucket, prefix])))
            if bucket not in objects:
                objects[bucket] = []
            prefix = prefix + '/' if prefix else prefix
            objects[bucket].extend(storage.list_objects(bucket, prefix))
            logger.debug("Total objects found: {}".format(len(objects[bucket])))
    elif buckets:
        for bucket in buckets:
            logger.debug("Listing objects in '{}://{}'".format(sb, bucket))
            objects[bucket] = storage.list_objects(bucket)
            logger.debug("Total objects found: {}".format(len(objects[bucket])))

    if all([len(objects[bucket]) == 0 for bucket in objects]):
        raise Exception(
            f'No objects found in bucket(s): {", ".join(objects.keys())}')

    keys_dict = {}
    for bucket in objects:
        keys_dict[bucket] = {}
        for obj in objects[bucket]:
            keys_dict[bucket][obj['Key']] = obj['Size']

    partitions = []
    parts_per_object = []

    def create_partition(bucket, key, entry):
        if key.endswith('/'):
            logger.debug(
                f'Discarding object "{key}" as it is a prefix folder (0.0B)')
            return

        obj_size = keys_dict[bucket][key]

        if chunk_number:
            chunk_rest = obj_size % chunk_number
            obj_chunk_size = (obj_size // chunk_number) + \
                round((chunk_rest / chunk_number) + 0.5)
        elif chunk_size:
            obj_chunk_size = chunk_size
        else:
            obj_chunk_size = obj_size

        size = total_partitions = 0

        ci = obj_size
        cz = obj_chunk_size
        parts = ci // cz + (ci % cz > 0)
        logger.debug('Creating {} partitions from object {} ({})'
                     .format(parts, key, sizeof_fmt(obj_size)))

        while size < obj_size:
            brange = (size, size + obj_chunk_size + CHUNK_THRESHOLD)
            brange = None if obj_size == obj_chunk_size else brange

            partition = entry.copy()
            partition['obj'] = CloudObject(sb, bucket, key)
            partition['obj'].data_byte_range = brange
            partition['obj'].chunk_size = obj_chunk_size
            partition['obj'].part = total_partitions
            partitions.append(partition)

            total_partitions += 1
            size += obj_chunk_size

        parts_per_object.append(total_partitions)

    for entry in map_func_args_list:
        sb, bucket, prefix, obj_name = utils.split_object_url(entry['obj'])
        if obj_name:
            # each entry is an object key
            key = '/'.join([prefix, obj_name]) if prefix else obj_name
            create_partition(bucket, key, entry)
        else:
            # each entry is a bucket
            for key in keys_dict[bucket]:
                create_partition(bucket, key, entry)

    return partitions, parts_per_object
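# --- Hedged example (not part of the original module) ----------------------
# Sketch of how a downstream worker might consume a partition's byte range.
# 'storage_get_object' is a hypothetical stand-in for whatever object-store
# client the worker actually uses; only the Range-header arithmetic reflects
# the partitioner logic above.
def _example_read_partition(storage_get_object, partition):
    cobj = partition['obj']
    if cobj.data_byte_range is None:
        # the object fits in a single partition: fetch it whole
        return storage_get_object(cobj.bucket, cobj.key)
    start, end = cobj.data_byte_range
    # adjacent ranges overlap by CHUNK_THRESHOLD bytes so a record cut at the
    # chunk boundary can still be completed by the reader
    headers = {'Range': 'bytes={}-{}'.format(start, end)}
    return storage_get_object(cobj.bucket, cobj.key, extra_args=headers)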