Example #1
import json

from hail.utils import hadoop_ls, hadoop_open


def get_rows_data(rows_files):
    """Collect partition bounds and file sizes from a table's 'rows' files."""
    file_sizes = []
    partition_bounds = []
    parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
    if parts_file:
        parts = hadoop_ls(parts_file[0])
        for i, x in enumerate(parts):
            index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
            if i < len(parts) - 1:
                test_index = parts[i + 1]['path'].split(
                    f'{parts_file[0]}/part-')[1].split('-')[0]
                # Several files can share a part index; count only the last.
                if test_index == index:
                    continue
            file_sizes.append(x['size_bytes'])
    metadata_file = [
        x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')
    ]
    if metadata_file:
        with hadoop_open(metadata_file[0], 'rb') as f:
            rows_meta = json.loads(f.read())
            try:
                partition_bounds = [(x['start']['locus']['contig'],
                                     x['start']['locus']['position'],
                                     x['end']['locus']['contig'],
                                     x['end']['locus']['position'])
                                    for x in rows_meta['jRangeBounds']]
            except KeyError:
                # Older metadata may lack partition bounds; report sizes only.
                pass
    return partition_bounds, file_sizes
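
A minimal usage sketch, under assumed inputs: `hadoop_ls` on a written table's `rows` directory yields the file listing that `get_rows_data` expects. The bucket path below is hypothetical.

# Usage sketch; 'gs://my-bucket/dataset.ht' is a hypothetical path.
rows_files = hadoop_ls('gs://my-bucket/dataset.ht/rows')
bounds, sizes = get_rows_data(rows_files)
print(f'{len(sizes)} partition files, {sum(sizes):,} bytes total')
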
Example #2
import json

import numpy as np
import pandas as pd
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, Div, HoverTool, Panel, Tabs
from bokeh.palettes import Spectral8
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

from hail.utils import hadoop_ls, hadoop_open
from hail.utils.java import warning  # Hail's log-warning helper


def hail_metadata(t_path):
    """Create a metadata plot for a Hail Table or MatrixTable.

    Parameters
    ----------
    t_path : str
        Path to the Hail Table or MatrixTable files.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure` or :class:`bokeh.models.widgets.panels.Tabs` or :class:`bokeh.models.layouts.Column`
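
    Examples
    --------
    A minimal sketch; the path below is hypothetical.

    >>> fig = hail_metadata('gs://my-bucket/dataset.mt')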
    """
    def get_rows_data(rows_files):
        file_sizes = []
        partition_bounds = []
        parts_file = [
            x['path'] for x in rows_files if x['path'].endswith('parts')
        ]
        if parts_file:
            parts = hadoop_ls(parts_file[0])
            for i, x in enumerate(parts):
                index = x['path'].split(f'{parts_file[0]}/part-')[1].split(
                    '-')[0]
                if i < len(parts) - 1:
                    test_index = parts[i + 1]['path'].split(
                        f'{parts_file[0]}/part-')[1].split('-')[0]
                    if test_index == index:
                        continue
                file_sizes.append(x['size_bytes'])
        metadata_file = [
            x['path'] for x in rows_files
            if x['path'].endswith('metadata.json.gz')
        ]
        if metadata_file:
            with hadoop_open(metadata_file[0], 'rb') as f:
                rows_meta = json.loads(f.read())
                try:
                    partition_bounds = [(x['start']['locus']['contig'],
                                         x['start']['locus']['position'],
                                         x['end']['locus']['contig'],
                                         x['end']['locus']['position'])
                                        for x in rows_meta['jRangeBounds']]
                except KeyError:
                    pass
        return partition_bounds, file_sizes

    def scale_file_sizes(file_sizes):
        # Multiplying by 1.1 nudges sizes just under a unit boundary
        # up to the larger unit when choosing the per-file scale.
        min_file_size = min(file_sizes) * 1.1
        total_file_size = sum(file_sizes)
        all_scales = [('T', 1e12), ('G', 1e9), ('M', 1e6), ('K', 1e3),
                      ('', 1e0)]
        for overall_scale, overall_factor in all_scales:
            if total_file_size > overall_factor:
                total_file_size /= overall_factor
                break
        for scale, factor in all_scales:
            if min_file_size > factor:
                file_sizes = [x / factor for x in file_sizes]
                break
        total_file_size = f'{total_file_size:.1f} {overall_scale}B'
        return total_file_size, file_sizes, scale
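    # For example: scale_file_sizes([2e6, 3e6]) returns
    # ('5.0 MB', [2.0, 3.0], 'M').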

    files = hadoop_ls(t_path)

    rows_file = [x['path'] for x in files if x['path'].endswith('rows')]
    entries_file = [x['path'] for x in files if x['path'].endswith('entries')]
    success_file = [
        x['modification_time'] for x in files if x['path'].endswith('SUCCESS')
    ]

    metadata_file = [
        x['path'] for x in files if x['path'].endswith('metadata.json.gz')
    ]
    if not metadata_file:
        raise FileNotFoundError('No metadata.json.gz file found.')

    with hadoop_open(metadata_file[0], 'rb') as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta['components']['partition_counts'][
            'counts']

    if not rows_file:
        raise FileNotFoundError('No rows directory found.')
    rows_files = hadoop_ls(rows_file[0])

    data_type = 'Table'
    if entries_file:
        data_type = 'MatrixTable'
        rows_file = [
            x['path'] for x in rows_files if x['path'].endswith('rows')
        ]
        rows_files = hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(
        row_file_sizes)

    panel_size = 480
    subpanel_size = 120

    if not row_partition_bounds:
        warning('Table is not partitioned. Only plotting file sizes.')
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(right=row_file_sizes_hist,
                         left=0,
                         bottom=row_file_sizes_edges[:-1],
                         top=row_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        p_file_size.yaxis.axis_label = f'File size ({row_scale}B)'
        return p_file_size

    all_data = {
        # -1 is a sentinel width for partitions that span chromosomes.
        'partition_widths': [
            -1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds
        ],
        'partition_bounds': [
            f'{x[0]}:{x[1]}-{x[2]}:{x[3]}' for x in row_partition_bounds
        ],
        'spans_chromosome': [
            'Spans chromosomes' if x[0] != x[2] else 'Within chromosome'
            for x in row_partition_bounds
        ],
        'row_file_sizes': row_file_sizes,
        'row_file_sizes_human': [
            f'{x:.1f} {row_scale}B' for x in row_file_sizes
        ],
        'rows_per_partition': rows_per_partition,
        'index': list(range(len(rows_per_partition)))
    }

    if entries_file:
        entries_rows_files = hadoop_ls(entries_file[0])
        entries_rows_file = [
            x['path'] for x in entries_rows_files if x['path'].endswith('rows')
        ]
        if entries_rows_file:
            entries_files = hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(
                entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(
                entry_file_sizes)
            all_data['entry_file_sizes'] = entry_file_sizes
            all_data['entry_file_sizes_human'] = [
                f'{x:.1f} {entry_scale}B' for x in entry_file_sizes
            ]

    title = f'{data_type}: {t_path}'

    msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_file_size}<br/>"
    if success_file and success_file[0]:
        msg += success_file[0]

    tools = "hover,save,pan,box_zoom,reset,wheel_zoom"

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = 'Number of rows'
    p.yaxis.axis_label = f'File size ({row_scale}B)'
    color_map = factor_cmap('spans_chromosome',
                            palette=Spectral8,
                            factors=list(set(all_data['spans_chromosome'])))
    p.scatter('rows_per_partition',
              'row_file_sizes',
              color=color_map,
              legend='spans_chromosome',
              source=source)
    p.legend.location = 'bottom_right'
    p.select_one(HoverTool).tooltips = [
        (x, f'@{x}') for x in ('rows_per_partition', 'row_file_sizes_human',
                               'partition_bounds', 'index')
    ]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range,
                                  plot_width=panel_size,
                                  plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range,
                         plot_width=subpanel_size,
                         plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(
        all_data['rows_per_partition'], bins=50)
    p_rows_per_partition.quad(top=rows_per_partition_hist,
                              bottom=0,
                              left=rows_per_partition_edges[:-1],
                              right=rows_per_partition_edges[1:],
                              fill_color="#036564",
                              line_color="#033649")
    row_file_sizes_hist, row_file_sizes_edges = np.histogram(
        all_data['row_file_sizes'], bins=50)
    p_file_size.quad(right=row_file_sizes_hist,
                     left=0,
                     bottom=row_file_sizes_edges[:-1],
                     top=row_file_sizes_edges[1:],
                     fill_color="#036564",
                     line_color="#033649")

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if 'entry_file_sizes' in all_data:
        title = f'Statistics for {data_type}: {t_path}'

        msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_entry_file_size}<br/>"
        if success_file and success_file[0]:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = 'Number of rows'
        p.yaxis.axis_label = f'File size ({entry_scale}B)'
        color_map = factor_cmap('spans_chromosome',
                                palette=Spectral8,
                                factors=list(set(
                                    all_data['spans_chromosome'])))
        p.scatter('rows_per_partition',
                  'entry_file_sizes',
                  color=color_map,
                  legend='spans_chromosome',
                  source=source)
        p.legend.location = 'bottom_right'
        p.select_one(HoverTool).tooltips = [
            (x, f'@{x}')
            for x in ('rows_per_partition', 'entry_file_sizes_human',
                      'partition_bounds', 'index')
        ]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range,
                                      plot_width=panel_size,
                                      plot_height=subpanel_size)
        p_rows_per_partition.quad(top=rows_per_partition_hist,
                                  bottom=0,
                                  left=rows_per_partition_edges[:-1],
                                  right=rows_per_partition_edges[1:],
                                  fill_color="#036564",
                                  line_color="#033649")
        p_file_size = figure(y_range=p.y_range,
                             plot_width=subpanel_size,
                             plot_height=panel_size)

        entry_file_sizes_hist, entry_file_sizes_edges = np.histogram(
            all_data['entry_file_sizes'], bins=50)
        p_file_size.quad(right=entry_file_sizes_hist,
                         left=0,
                         bottom=entry_file_sizes_edges[:-1],
                         top=entry_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        entries_grid = gridplot([[p_rows_per_partition, p_stats],
                                 [p, p_file_size]])

        return Tabs(tabs=[
            Panel(child=entries_grid, title='Entries'),
            Panel(child=rows_grid, title='Rows')
        ])
    else:
        return rows_grid
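
A usage sketch for rendering the result; the path is hypothetical, and `bokeh.io.show` displays the returned figure, tabs, or grid in a browser.

from bokeh.io import show

# Hypothetical path; any written Table (.ht) or MatrixTable (.mt) works.
show(hail_metadata('gs://my-bucket/dataset.mt'))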