Ejemplo n.º 1
0
def make_sizeplots(
    *,
    output_directory: str,
    target_y_header: str,
    make_pngs: bool = True,
    make_pdf: bool = True,
) -> None:
    """Write trellised scatter plots of *target_y_header* vs. model size and training time.

    :param output_directory: Directory into which the charts are written.
    :param target_y_header: Column of the collation used as the y-axis.
    :param make_pngs: Whether PNG files should be emitted.
    :param make_pdf: Whether PDF files should be emitted.
    """
    collation_df = read_collation()
    sns.set(style='whitegrid')
    # One trellis per size-like x-axis.
    for x_header in (MODEL_BYTES, 'training_time'):
        pkp.make_sizeplots_trellised(
            df=collation_df,
            target_x_header=x_header,
            target_y_header=target_y_header,
            output_directory=output_directory,
            make_png=make_pngs,
            make_pdf=make_pdf,
            name=f'trellis_scatter_{x_header}',
        )
Ejemplo n.º 2
0
def main():
    """Print a GitHub-flavored Markdown table summarizing HPO and replicate statistics."""
    hpo_df = collate_hpo_experiments()
    # Total wall-clock time spent on HPO trials, rounded to whole hours.
    trial_durations = pd.to_datetime(hpo_df['datetime_complete']) - pd.to_datetime(hpo_df['datetime_start'])
    total_hours = round(trial_durations.sum() / pd.Timedelta('1 hour'))

    best_replicates_df = read_collation()

    # Training + evaluation seconds across all replicates, rounded to whole hours.
    replicate_seconds = (
        best_replicates_df['training_time'].sum()
        + best_replicates_df['evaluation_time'].sum()
    )
    replicates_hours = round(pd.Timedelta(seconds=replicate_seconds) / pd.Timedelta('1 hour'))

    columns = [
        'searcher', 'dataset',
        'inverse_relations',
        'model', 'loss', 'regularizer', 'optimizer', 'training_approach',
        'negative_sampler',
    ]
    # Number of distinct experimental configurations represented.
    configurations = len(best_replicates_df[columns].drop_duplicates().index)

    # Per-column cardinalities, as (column, n_unique) pairs.
    uniques_df = best_replicates_df[columns].nunique().reset_index()

    # Report only columns that actually vary (and pluralize the label);
    # inverse_relations is a boolean toggle, not a component, so skip it.
    rows = [
        (column.replace('_', ' ').title() + 's', n_unique)
        for column, n_unique in uniques_df.values
        if n_unique > 1 and column != 'inverse_relations'
    ]
    rows.extend([
        ('HPO Configurations', configurations),
        ('HPO Experiments', len(hpo_df.index)),
        ('HPO Compute Hours', total_hours),
        ('Replicate Experiments', len(best_replicates_df.index)),
        ('Replicate Compute Hours', replicates_hours),
    ])
    print(tabulate(rows, headers=['header', 'count'], tablefmt='github'))
Ejemplo n.º 3
0
def make_plots(
    *,
    target_header: str,
    make_png: bool = True,
    make_pdf: bool = True,
):
    """Collate all HPO results in a single table."""
    df = read_collation()

    # Drop columns that carry no signal for plotting, if present.
    for constant_column in ('searcher', 'evaluator'):
        if constant_column in df.columns:
            del df[constant_column]

    # 1D sliced summaries, drawn on a white grid.
    sns.set_style("whitegrid")
    summary_1d_directory = os.path.join(SUMMARY_DIRECTORY, '1D-slices')
    os.makedirs(summary_1d_directory, exist_ok=True)
    pkp.write_1d_sliced_summaries(
        df=df,
        target_header=target_header,
        output_directory=summary_1d_directory,
        make_pdf=make_pdf,
        make_png=make_png,
    )

    # Regenerate the README with one dataset image per line.
    readme_path = os.path.join(HERE, 'README.md')
    with open(readme_path, 'w') as file:
        print(f'# Ablation Results\n', file=file)
        print(f'Output at {time.asctime()}\n', file=file)
        for dataset in sorted(df['dataset'].unique()):
            print(
                f'<img src="summary/1D-slices/dataset_{dataset}.png" alt="{dataset}"/>\n',
                file=file)

    # Remaining summaries use a dark grid.
    sns.set_style("darkgrid")
    dataset_optimizer_directory = os.path.join(
        SUMMARY_DIRECTORY, 'dataset_optimizer_model_summary')
    os.makedirs(dataset_optimizer_directory, exist_ok=True)
    pkp.write_dataset_optimizer_model_summaries(
        df=df,
        target_header=target_header,
        output_directory=dataset_optimizer_directory,
        make_pdf=make_pdf,
        make_png=make_png,
    )
    pkp.write_1d_sliced_summaries_stratified(
        df=df,
        target_header=target_header,
        output_directory=SUMMARY_DIRECTORY,
        make_pdf=make_pdf,
        make_png=make_png,
    )
    pkp.write_2d_summaries(
        df=df,
        target_header=target_header,
        output_directory=SUMMARY_DIRECTORY,
        make_pdf=make_pdf,
        make_png=make_png,
    )

    # Trellised size plots of the target metric against model size.
    sizeplot_dir = os.path.join(SUMMARY_DIRECTORY, 'sizeplots')
    os.makedirs(sizeplot_dir, exist_ok=True)
    pkp.make_sizeplots_trellised(
        df=df,
        target_x_header='model_bytes',
        target_y_header=target_header,
        output_directory=sizeplot_dir,
        make_pdf=make_pdf,
        make_png=make_png,
    )
Ejemplo n.º 4
0
def main():
    """Make interpretation at top 5, 10, and 15 best."""
    target = 'hits@10'
    collation_df = read_collation()
    do_gold(df=collation_df, target=target)
    do_top(df=collation_df, target=target)
Ejemplo n.º 5
0
def make_plots(
    *,
    df: Optional[pd.DataFrame] = None,
    target_header: str,
    output_directory: str,
    make_pngs: bool = True,
    make_pdfs: bool = True,
):
    """Write per-dataset/optimizer bar, box, and heatmap charts for *target_header*.

    :param df: Collated results table; loaded via :func:`read_collation` when ``None``.
        NOTE(review): the passed DataFrame is mutated in place (columns dropped/added)
        — pass a copy if the caller still needs it.
    :param target_header: Column of ``df`` used as the plotted metric.
    :param output_directory: Directory into which all figures are written.
    :param make_pngs: Whether PNG files should be emitted.
    :param make_pdfs: Whether PDF files should be emitted.
    """
    if df is None:
        df = read_collation()
    # Drop columns irrelevant to these charts.
    del df['training_time']
    del df['evaluation_time']
    del df['model_bytes']
    del df['searcher']  # always same
    # Find losses that appear with more than one training approach, so their
    # combined label can disambiguate; losses used with a single approach keep
    # their bare name.
    loss_loops = set(map(tuple, df[['loss', 'training_approach']].values))
    loss_loops_counter = Counter(loss for loss, _ in loss_loops)
    loss_mult = {
        loss
        for loss, count in loss_loops_counter.items() if count > 1
    }
    df['loss_training_approach'] = [
        (f'{loss} ({training_approach})' if loss in loss_mult else loss)
        for loss, training_approach in df[['loss', 'training_approach']].values
    ]

    # One figure set per (dataset, optimizer) pair.
    it = tqdm(df.groupby(['dataset', 'optimizer']),
              desc='Making dataset/optimizer figures')
    for (dataset, optimizer), sub_df in it:
        it.write(
            f'creating trellised barplots: dataset/optimizer ({dataset}/{optimizer})'
        )
        pkp.write_experimental_heatmap(
            df=sub_df,
            dataset=dataset,
            optimizer=optimizer,
            target_header=target_header,
            output_directory=output_directory,
            name=f'{dataset}_{optimizer}_heat',
        )
        pkp.write_dataset_optimizer_barplots(
            df=sub_df,
            dataset=dataset,
            optimizer=optimizer,
            target_header=target_header,
            output_directory=output_directory,
            name=f'{dataset}_{optimizer}',
            make_pngs=make_pngs,
            make_pdfs=make_pdfs,
        )

        # Loss / Model / (Training Loop Chart | Inverse)
        for hue in ('training_approach', 'inverse_relations'):
            it.write(f'creating barplot: loss/model/{hue} barplot')
            pkp.make_loss_plot_barplot(
                df=sub_df,
                target_header=target_header,
                hue=hue,
                output_directory=output_directory,
                dataset=dataset,
                name=f'{dataset}_{optimizer}_model_loss_{hue}',
                make_pngs=make_pngs,
                make_pdfs=make_pdfs,
            )

        # Three-way breakdown: loss on the y-axis, one column per model,
        # colored by training approach.
        y, col, hue = 'loss', 'model', 'training_approach',
        it.write(f'creating barplot: {y}/{col}/{hue}')
        pkp.plot_3d_barplot(
            df=sub_df,
            dataset=dataset,
            optimizer=optimizer,
            y=y,
            hue=hue,
            col=col,
            target_header=target_header,
            slice_dir=output_directory,
            name=f'{dataset}_{optimizer}_{y}_{col}_{hue}',
            make_pngs=make_pngs,
            make_pdfs=make_pdfs,
        )

        # Collapse replicates: take the median metric per unique configuration.
        gkey = [
            c for c in sub_df.columns if c not in {target_header, 'replicate'}
        ]
        gdf = sub_df.groupby(gkey)[target_header].median().reset_index()

        # 2-way plots
        for y, hue in [
            ('loss_training_approach', 'inverse_relations'),
            ('loss', 'inverse_relations'),
            ('loss', 'training_approach'),
            ('training_approach', 'inverse_relations'),
        ]:
            it.write(f'creating barplot: {y}/{hue} aggregated')
            # Aggregated
            pkp.make_2way_boxplot(
                df=gdf,
                target_header=target_header,
                y=y,
                hue=hue,
                slice_dir=output_directory,
                dataset=dataset,
                name=f'{dataset}_{optimizer}_{y}_{hue}_agg',
                make_pngs=make_pngs,
                make_pdfs=make_pdfs,
            )

            # Same plot over the raw (per-replicate) data.
            it.write(f'creating barplot: {y}/{hue}')
            pkp.make_2way_boxplot(
                df=sub_df,
                target_header=target_header,
                y=y,
                hue=hue,
                slice_dir=output_directory,
                dataset=dataset,
                name=f'{dataset}_{optimizer}_{y}_{hue}',
                make_pngs=make_pngs,
                make_pdfs=make_pdfs,
            )

    # Per-dataset summary charts, raw and replicate-aggregated.
    it = tqdm(df.groupby('dataset'), desc=f'Making 1D slice plots for dataset')
    for dataset, sub_df in it:
        it.write(f'creating summary chart for {dataset}')
        pkp.make_summary_chart(
            df=sub_df,
            target_header=target_header,
            slice_dir=output_directory,
            dataset=dataset,
            make_pngs=make_pngs,
            make_pdfs=make_pdfs,
            name=dataset,
        )

        # Median over replicates, as above.
        gkey = [
            c for c in sub_df.columns if c not in {target_header, 'replicate'}
        ]
        gdf = sub_df.groupby(gkey)[target_header].median().reset_index()

        it.write(f'creating summary chart for {dataset} (aggregated)')
        pkp.make_summary_chart(
            df=gdf,
            target_header=target_header,
            slice_dir=output_directory,
            dataset=dataset,
            make_pngs=make_pngs,
            make_pdfs=make_pdfs,
            name=f'{dataset}_agg',
        )