def list_bucket(prefix, bucket, backend, debug, config):
    """List objects stored under `prefix` in `bucket` and print a table.

    Args:
        prefix: key prefix to filter objects by.
        bucket: name of the storage bucket to list.
        backend: storage backend identifier passed to `Storage`.
        debug: when truthy, enable DEBUG-level logging.
        config: optional path to a YAML config file; loaded when provided.
    """
    if config:
        config = load_yaml_config(config)
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(config=config, backend=backend)

    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket, prefix=prefix)

    # Column width follows the longest key; fall back to 10 for an empty
    # listing (same value the original hard-coded in the empty branch).
    width = max((len(obj['Key']) for obj in objects), default=10)

    # Header and separator are printed in both cases, so emit them once.
    print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
    print('-' * width, '\t', '-' * 20, '\t', '-' * 9)

    if objects:
        for obj in objects:
            key = obj['Key']
            date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
            size = sizeof_fmt(obj['Size'])
            print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
        print()
        print('Total objects: {}'.format(len(objects)))
    else:
        print('\nThe bucket is empty')
def list_bucket(bucket, backend, debug):
    """List all objects in `bucket` and print a formatted table.

    Args:
        bucket: name of the storage bucket to list.
        backend: storage backend identifier passed to `Storage`.
        debug: when truthy, enable DEBUG-level logging.
    """
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)

    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket)

    # BUG FIX: the original `max([...])` raised ValueError on an empty
    # bucket; use a default width instead (matches the sibling command).
    width = max((len(obj['Key']) for obj in objects), default=10)

    print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
    print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
    for obj in objects:
        key = obj['Key']
        date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
        size = sizeof_fmt(obj['Size'])
        print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
    print()
def generate_command(number, prefix, partitions, image):
    """Generate sorted-benchmark record partitions in object storage.

    Fans out `generate_records` over `partitions` workers, then verifies
    that every expected partition object exists and has the expected size.

    Args:
        number: records per partition (partition size = record_size * number).
        prefix: key prefix under which the partitions are written.
        partitions: number of partition objects to generate.
        image: runtime image for the `FunctionExecutor`.
    """
    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        futures = fexec.map(generate_records,
                            range(partitions),
                            extra_args=[number, prefix],
                            include_modules=['util'])
        # Block until all generators finish; the return values themselves
        # are not needed, only the uploads they perform.
        fexec.get_result(fs=futures)

    # `record_size` is a module-level constant — presumably bytes per
    # record; TODO confirm against util.
    partition_size = record_size * number

    # Check if all files have been uploaded.
    # NOTE(review): these asserts are stripped under `python -O`; kept
    # as-is to preserve the exception type (AssertionError) for callers.
    storage_client = Storage()
    partition_list = storage_client.list_objects(bucket, prefix + '/')
    assert len(
        partition_list
    ) == partitions, f'partition_list: {len(partition_list)}; partitions: {partitions}'
    for info in partition_list:
        assert info[
            'Size'] == partition_size, f'partition size: {partition_size} \ninfo: {info}'
    print('Done!')
def sort_command(input_prefix, output_prefix, max_parallelism, image):
    """Distributed radix sort of the objects under `input_prefix`.

    Runs `num_shuffles` radix-partitioning rounds (plan chosen by
    `make_plan` from the total input size), then sorts each final
    category partition and writes the result under `output_prefix`.
    Verifies that total output size equals total input size.

    Args:
        input_prefix: key prefix of the unsorted input objects.
        output_prefix: key prefix for the sorted output objects.
        max_parallelism: worker cap for the `FunctionExecutor`.
        image: runtime image for the `FunctionExecutor`.
    """
    storage_client = Storage()
    with FunctionExecutor(runtime=image, workers=max_parallelism) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        input_info_list = storage_client.list_objects(bucket, input_prefix + '/')
        input_size = sum(info['Size'] for info in input_info_list)
        # make_plan returns how many shuffle rounds to run and how many
        # radix values each category holds in the final round.
        (num_shuffles, last_values_per_category) = make_plan(input_size)

        current_values_per_category = 1
        # One work item per input object; round 0 writes to the first
        # intermediate prefix and starts with an empty category stack.
        current_keys_list = [{
            'keys_list': [key_name],
            'prefix': input_prefix + '-intermediate0',
            'category_stack': []
        } for key_name in storage_client.list_keys(bucket, input_prefix + '/')]

        for current_shuffle in range(num_shuffles):
            # The last shuffle round may use a coarser category width.
            if current_shuffle == num_shuffles - 1:
                current_values_per_category = last_values_per_category

            radix_sort_futures = fexec.map(
                radix_sort_by_byte,
                current_keys_list,
                extra_args={'values_per_category': current_values_per_category},
                include_modules=['util'])
            radix_sort_results = fexec.get_result(fs=radix_sort_futures)

            # Group the intermediate keys by their full category stack.
            categories_keys_lists = {}
            for res in radix_sort_results:
                intermediate_keys_list = res['keys_list']
                input_category_stack = res['category_stack']
                for key_name in intermediate_keys_list:
                    # Category id is encoded as the third-from-last '/'
                    # separated path segment of the intermediate key.
                    category_id = int(key_name.rsplit(sep='/', maxsplit=3)[-3])
                    new_category_stack = input_category_stack + [category_id]
                    new_category_stack_str = '/'.join(
                        [str(x) for x in new_category_stack])
                    if new_category_stack_str in categories_keys_lists:
                        categories_keys_lists[new_category_stack_str].append(
                            key_name)
                    else:
                        categories_keys_lists[new_category_stack_str] = [
                            key_name
                        ]

            # Partition category lists.
            # Attach prefix metadata so that the sorter knows what to
            # name the files it writes.
            each_category_size = input_size / (
                (256 / current_values_per_category) * (current_shuffle + 1))
            num_partitions_per_category = math.ceil(each_category_size /
                                                    buffer_size_to_categorize)
            current_keys_list = []
            for category_stack_str, cat_keys_list in categories_keys_lists.items():
                for sub_list in np.array_split(cat_keys_list,
                                               num_partitions_per_category):
                    partition_entry = {
                        'keys_list': sub_list,
                        'prefix': f'{input_prefix}-intermediate{str(current_shuffle + 1)}',
                        'category_stack': [int(x) for x in category_stack_str.split('/')]
                    }
                    current_keys_list.append(partition_entry)

        # When the final round used single-value categories, the last
        # byte is already in order and the sorter can skip it.
        consider_last_byte_sorted = False
        if last_values_per_category == 1:
            consider_last_byte_sorted = True

        # Final outputs go to the user-requested prefix.
        for entry in current_keys_list:
            entry['prefix'] = output_prefix

        sorted_keys_list = sorted(current_keys_list,
                                  key=lambda x: x['category_stack'])
        sort_category_futures = fexec.map(
            sort_category,
            sorted_keys_list,
            extra_args={'consider_last_byte_sorted': consider_last_byte_sorted},
            include_modules=['util'])
        # Block until every category is sorted and uploaded; the return
        # values themselves are not used.
        fexec.get_result(fs=sort_category_futures)

    # Check if size of output matches size of input.
    # NOTE(review): assert is stripped under `python -O`; kept to
    # preserve the original exception type.
    output_info_list = storage_client.list_objects(bucket, output_prefix)
    output_size = sum(info['Size'] for info in output_info_list)
    assert input_size == output_size, f'input size: {input_size}, output_size: {output_size}'
    print('Done!')