def _info_dataset(ktk_cube_dataset_id, ds, cube):
    """
    Print a summary of one dataset of the cube via ``click.echo``.

    Emits, in order: an empty line, a headline with the dataset ID, the
    partition keys, the number of partitions, the dataset metadata as
    indented JSON, the dimension columns and the payload columns.

    Parameters
    ----------
    ktk_cube_dataset_id
        Identifier of the dataset; only used for the headline.
    ds
        Dataset object. Must provide ``load_partition_indices()``; the object
        returned by that call is the one that gets inspected.
    cube
        Cube object providing ``dimension_columns`` and ``partition_columns``.
    """
    click.echo("")
    click.echo(h("Dataset: {}".format(ktk_cube_dataset_id)))

    # Everything below reads from the object returned by
    # ``load_partition_indices``, not from the object passed in.
    loaded_ds = ds.load_partition_indices()
    schema = loaded_ds.schema

    dataset_columns = get_dataset_columns(loaded_ds)
    dimension_set = set(cube.dimension_columns)
    cube_columns = dimension_set | set(cube.partition_columns)
    # Payload = columns of this dataset that are neither dimension nor
    # partition columns of the cube.
    payload_cols = sorted(dataset_columns - cube_columns)
    # Only the dimension columns that are actually present in this dataset.
    dim_cols = sorted(dimension_set & dataset_columns)

    click.echo(
        b("Partition Keys:") + _collist_string(loaded_ds.partition_keys, schema)
    )

    partition_count = len(loaded_ds.partitions)
    click.echo(b("Partitions:") + " {}".format(partition_count))

    # Serialize with sorted keys so the output is stable, then prefix every
    # line so the JSON appears nested below the "Metadata:" label.
    metadata_json = json.dumps(
        loaded_ds.metadata, indent=2, sort_keys=True, separators=(",", ": ")
    )
    metadata_lines = [" {}".format(line) for line in metadata_json.split("\n")]
    click.echo(b("Metadata:") + "\n{}".format("\n".join(metadata_lines)))

    click.echo(b("Dimension Columns:") + _collist_string(dim_cols, schema))
    click.echo(b("Payload Columns:") + _collist_string(payload_cols, schema))
def stats(ctx, include, exclude):
    """
    Collect technical statistic from cube.
    """
    # NOTE(review): the docstring above is kept verbatim on purpose; it is
    # presumably rendered by click as the command's help text -- confirm
    # before rewording it.
    #
    # ctx: context whose ``obj`` mapping holds "cube", "store" and "datasets".
    # include / exclude: forwarded unchanged to ``filter_items`` to select the
    # datasets for which statistics are collected.
    context = ctx.obj
    cube = context["cube"]
    store = context["store"]
    candidates = set(context["datasets"].keys())
    selected = filter_items("dataset", candidates, include, exclude)

    try:
        bag = collect_stats_bag(cube=cube, store=store, datasets=selected)
        computed = bag.compute()
    except RuntimeError as e:
        # Report the failure as a usage error instead of a raw RuntimeError.
        raise click.UsageError("Failed to collect stats: {e}".format(e=e))

    # Only the first element of the computed result is evaluated.
    per_dataset = computed[0]

    total_blobsize = 0
    total_files = 0
    needs_separator = False
    for ktk_cube_dataset_id in sorted(per_dataset.keys()):
        dataset_stats = per_dataset[ktk_cube_dataset_id]

        # Blank line between dataset sections, but not before the first one.
        if needs_separator:
            click.echo("")
        needs_separator = True

        click.echo(h(ktk_cube_dataset_id))
        for what in sorted(dataset_stats.keys()):
            label = b("{}:".format(what))
            click.echo(label + " {:,}".format(dataset_stats[what]))

        total_blobsize += dataset_stats["blobsize"]
        total_files += dataset_stats["files"]

    click.echo("")
    click.echo(h("__total__"))
    for label, total in (("blobsize:", total_blobsize), ("files:", total_files)):
        click.echo(b(label) + " {:,}".format(total))
def info(ctx):
    """
    Show certain infos about the cube.
    """
    # NOTE(review): the docstring above is kept verbatim on purpose; it is
    # presumably rendered by click as the command's help text -- confirm
    # before rewording it.
    #
    # ctx: context whose ``obj`` mapping holds "cube" and "datasets".
    cube = ctx.obj["cube"]
    datasets = ctx.obj["datasets"]

    # Column listings of the cube overview use the seed dataset's schema.
    seed_schema = datasets[cube.seed_dataset].schema

    def _overview_lines():
        # Generator: every line is built right before it is echoed, so
        # output produced so far is not lost if a later line fails to build.
        yield h("Infos")
        yield b("UUID Prefix:") + " {}".format(cube.uuid_prefix)
        yield b("Dimension Columns:") + _collist_string(
            cube.dimension_columns, seed_schema
        )
        yield b("Partition Columns:") + _collist_string(
            cube.partition_columns, seed_schema
        )
        yield b("Index Columns:") + _collist_string_index(cube, datasets)
        yield b("Seed Dataset:") + " {}".format(cube.seed_dataset)

    for line in _overview_lines():
        click.echo(line)

    # One detail section per dataset, in sorted order of dataset ID.
    for dataset_id in sorted(datasets.keys()):
        dataset = datasets[dataset_id]
        _info_dataset(dataset_id, dataset, cube)