Example #1
import logging

from lithops import Storage
from lithops.config import load_yaml_config
from lithops.utils import setup_lithops_logger, sizeof_fmt

logger = logging.getLogger(__name__)


def list_bucket(prefix, bucket, backend, debug, config):
    if config:
        config = load_yaml_config(config)
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(config=config, backend=backend)
    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket, prefix=prefix)

    if objects:
        width = max([len(obj['Key']) for obj in objects])

        print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
        print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
        for obj in objects:
            key = obj['Key']
            date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
            size = sizeof_fmt(obj['Size'])
            print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
        print()
        print('Total objects: {}'.format(len(objects)))
    else:
        width = 10
        print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
        print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
        print('\nThe bucket is empty')
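
A minimal, hypothetical invocation of this command-style function; the bucket and backend values below are illustrative placeholders, not values taken from the original source:

# Placeholder arguments: adjust bucket/backend to your deployment.
# config=None falls back to the default Lithops configuration lookup.
list_bucket(prefix='data/', bucket='my-bucket', backend='aws_s3',
            debug=False, config=None)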
Example #2
import logging
import math

import numpy as np

from lithops import FunctionExecutor, Storage
from lithops.utils import setup_lithops_logger, sizeof_fmt

logger = logging.getLogger(__name__)

# generate_records, radix_sort_by_byte, sort_category, make_plan, record_size
# and buffer_size_to_categorize are defined elsewhere in the original module
# (not shown in this listing).


def list_bucket(bucket, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket)

    # Guard the header width: max() on an empty sequence raises ValueError
    width = max((len(obj['Key']) for obj in objects), default=10)

    print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
    print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
    for obj in objects:
        key = obj['Key']
        date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
        size = sizeof_fmt(obj['Size'])
        print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
    print()
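
Both list_bucket variants format object sizes with sizeof_fmt, imported above from lithops.utils. As a rough sketch of what such a human-readable-size helper does (an illustrative version, not necessarily the library's exact code):

def sizeof_fmt_sketch(num, suffix='B'):
    # Illustrative only: step through binary unit prefixes until the value
    # fits, e.g. 1536 -> '1.5KiB'.
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return f'{num:3.1f}{unit}{suffix}'
        num /= 1024.0
    return f'{num:.1f}Yi{suffix}'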
def generate_command(number, prefix, partitions, image):
    bucket = None
    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        futures = fexec.map(generate_records,
                            range(partitions),
                            extra_args=[number, prefix],
                            include_modules=['util'])
        results = fexec.get_result(fs=futures)
        # print(results)

    partition_size = record_size * number

    # Check if all files have been uploaded
    storage_client = Storage()
    partition_list = storage_client.list_objects(bucket, prefix + '/')
    assert len(partition_list) == partitions, \
        f'partition_list: {len(partition_list)}; partitions: {partitions}'
    for info in partition_list:
        assert info['Size'] == partition_size, \
            f'partition size: {partition_size} \ninfo: {info}'

    print('Done!')
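
generate_records itself is not shown in this listing. Given how it is mapped above (one call per partition id, with number and prefix as extra arguments), a worker along the following lines would fit. Everything here is an assumption for illustration: the 100-byte record size, the key layout, and the use of Lithops' reserved storage parameter, which the framework injects into map functions that declare it.

import os

RECORD_SIZE = 100  # assumed gensort-style 100-byte records


def generate_records_sketch(partition_id, number, prefix, storage):
    # Hypothetical worker matching the map call above: writes `number`
    # fixed-size records as a single object '<prefix>/<partition_id>'
    # in the configured storage bucket.
    payload = os.urandom(RECORD_SIZE) * number  # placeholder record content
    storage.put_object(storage.bucket, f'{prefix}/{partition_id}', payload)
    return len(payload)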
def sort_command(input_prefix, output_prefix, max_parallelism, image):
    storage_client = Storage()
    bucket = None
    input_info_list = None

    with FunctionExecutor(runtime=image, workers=max_parallelism) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        input_info_list = storage_client.list_objects(bucket,
                                                      input_prefix + '/')
        input_size = sum(info['Size'] for info in input_info_list)
        (num_shuffles, last_values_per_category) = make_plan(input_size)

        current_values_per_category = 1
        current_prefix = input_prefix
        current_keys_list = [{
            'keys_list': [key_name],
            'prefix': input_prefix + '-intermediate0',
            'category_stack': []
        } for key_name in storage_client.list_keys(bucket, input_prefix + '/')]
        for current_shuffle in range(num_shuffles):
            # Change values per category of last shuffle
            if current_shuffle == num_shuffles - 1:
                current_values_per_category = last_values_per_category

            radix_sort_futures = fexec.map(
                radix_sort_by_byte,
                current_keys_list,
                extra_args={'values_per_category': current_values_per_category},
                include_modules=['util'])
            radix_sort_results = fexec.get_result(fs=radix_sort_futures)

            categories_keys_lists = {}
            for res in radix_sort_results:
                intermediate_keys_list = res['keys_list']
                input_category_stack = res['category_stack']
                for key_name in intermediate_keys_list:
                    category_id = int(key_name.rsplit(sep='/', maxsplit=3)[-3])
                    new_category_stack = input_category_stack + [category_id]
                    new_category_stack_str = '/'.join(
                        str(x) for x in new_category_stack)
                    # Group intermediate keys by their category stack
                    categories_keys_lists.setdefault(
                        new_category_stack_str, []).append(key_name)

            # Partition category lists
            # Attach prefix metadata so that sorter knows what to name files
            each_category_size = input_size / (
                (256 / current_values_per_category) * (current_shuffle + 1))
            num_partitions_per_category = math.ceil(
                each_category_size / buffer_size_to_categorize)

            current_keys_list = []
            for category_stack_str, cat_keys_list in categories_keys_lists.items():
                for sub_list in np.array_split(cat_keys_list,
                                               num_partitions_per_category):
                    current_keys_list.append({
                        'keys_list': sub_list,
                        'prefix': f'{input_prefix}-intermediate{current_shuffle + 1}',
                        'category_stack': [int(x) for x in category_stack_str.split('/')]
                    })

        # The last byte is already sorted if each category holds a single value
        consider_last_byte_sorted = (last_values_per_category == 1)
        for entry in current_keys_list:
            entry['prefix'] = output_prefix
        sorted_keys_list = sorted(current_keys_list,
                                  key=lambda x: x['category_stack'])
        sort_category_futures = fexec.map(
            sort_category,
            sorted_keys_list,
            extra_args={'consider_last_byte_sorted': consider_last_byte_sorted},
            include_modules=['util'])
        results = fexec.get_result(fs=sort_category_futures)
        # print(results)

    # Check that the total size of the output matches the size of the input

    output_info_list = storage_client.list_objects(bucket, output_prefix)
    output_size = sum(info['Size'] for info in output_info_list)
    assert input_size == output_size, f'input size: {input_size}, output_size: {output_size}'

    print('Done!')
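
Putting the two commands together, a hypothetical end-to-end run could look like the following; all argument values are placeholders:

# Generate 8 partitions of 10000 records each, then sort them.
generate_command(number=10000, prefix='sort-input', partitions=8,
                 image='my-lithops-runtime')
sort_command(input_prefix='sort-input', output_prefix='sort-output',
             max_parallelism=64, image='my-lithops-runtime')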