Example #1
0
def main(*args):
    """Entry point: pull package data from the spreadsheets, publish the
    artifacts to S3, and send change notifications.

    Positional args are accepted for CLI compatibility but ignored.
    """
    # Clients for S3 artifact storage and the Google Sheets data sources.
    storage = s3Client(S3_BUCKET, SENDER_EMAIL, RECIPIENT_EMAIL)
    sheets = gsClient(VFXPY_SPREADSHEET_KEY, COMMUNITY_SPREADSHEET_KEY)

    print("Starting...")
    relevant = remove_irrelevant_packages(get_packages(sheets))
    save_to_file(storage, relevant)
    generate_svg_wheel(storage, relevant)
    compare_and_notify(storage, sheets)
    print("Exiting...")
def clean_data():
    """Roughly clean the raw export file.

    Reads ``file_path`` line by line, repairs records that were broken
    across several physical lines (continuation lines neither start with
    the header ``"ID"`` nor match ``data_pattern``), filters them through
    ``__parse_data``, and writes surviving records to
    ``pure_data_file_path`` in chunks so memory usage stays bounded.
    """
    logging.info("clean data begin")

    # Records accepted so far but not yet flushed to disk.
    tmp_data_list = []
    # The (possibly multi-line) record currently being accumulated.
    tmp_line = ""

    def _flush_pending_record():
        # Parse the buffered record and queue it for writing; spill the
        # queue to disk once it exceeds page_size to cap memory usage.
        # (Extracted: this logic used to be duplicated verbatim.)
        if not tmp_line.strip():
            return
        data, need = __parse_data(tmp_line)
        if need:
            tmp_data_list.append(data)
            if len(tmp_data_list) > page_size:
                save_to_file(tmp_data_list, pure_data_file_path)
                tmp_data_list.clear()

    with open(file_path, "rb") as file:
        for line in file:
            # Decode the raw bytes and strip any newline variant.
            line = line.decode(encoding="utf-8").replace("\r\n", "\n").replace("\n", "")
            if not line.startswith("\"ID\"") and re.match(data_pattern, line) is None:
                # Neither the header nor the start of a new record: this is a
                # continuation of the previous record caused by a stray line
                # break, so append it and keep reading.
                tmp_line += line
                continue
            else:
                # A new record starts here, so the buffered one is complete:
                # process it, then start buffering the new record.
                _flush_pending_record()
                tmp_line = line.replace("\r\n", "\n").replace("\n", "")

    # The last record has no following record to trigger processing,
    # so handle it explicitly.
    _flush_pending_record()

    # Write whatever remains in the (under-page_size) tail chunk.
    save_to_file(tmp_data_list, pure_data_file_path)
    tmp_data_list.clear()
    logging.info("clean data end")
Example #3
0
        res_false = res[number_of_tests].loc[res[number_of_tests].correct_pred == False]
        res_unseen = res_unseen_bay[number_of_tests]

        all_uncs = [[res_correct[unc_name], res_false[unc_name], res_unseen[unc_name]] for unc_name in unc_names]

        unc_labels = ('true', 'false', 'unseen')
        for idx, (unc_name, uncs) in enumerate(zip(unc_names, all_uncs)):
            fig = plt.figure(figsize=(7, 5))
            # ax = fig.add_subplot(len(unc_names), 1, idx+1)
            ax = fig.add_subplot(111)
            ax.set_title(f'{unc_name}, rho={arguments["rho"]}, std={arguments["std_prior"]}, T={number_of_tests}, {arguments["loss_type"]}\n'
                         f'Accuracy: {round(res[number_of_tests].correct_pred.mean()*100, 2)}%')
            uncs_to_show = [unc.loc[unc < 1*unc.max()] for unc in uncs]
            g.plot_uncertainty(
                ax,
                unc_name,
                uncs,
                unc_labels,
            )
            ax.legend()

            (save_path / f'{arguments["trainset"]}/images/{arguments["loss_type"]}/{unc_name}/').mkdir(exist_ok=True, parents=True)
            fig.savefig(save_path/f'{arguments["trainset"]}/images/{arguments["loss_type"]}/{unc_name}/{arguments["loss_type"]}_{unc_name}_rho{arguments["rho"]}_std{arguments["std_prior"]}_T{number_of_tests}.png')
            (save_path / f'{arguments["trainset"]}/pickles/{arguments["loss_type"]}/{unc_name}/').mkdir(exist_ok=True, parents=True)
            u.save_to_file(fig, save_path/f'{arguments["trainset"]}/pickles/{arguments["loss_type"]}/{unc_name}/{arguments["loss_type"]}_{unc_name}_rho{arguments["rho"]}_std{arguments["std_prior"]}_T{number_of_tests}.pkl')
            plt.close(fig)

        print('This exp time elapsed:', round(time() - start_exp), 's')
        print('Total time elapsed:', round(time() - start_tot), 's')
        start_exp = time()
Example #4
0
def main(*args):
    """Build the wheel-support chart: fetch, filter, annotate, persist, render."""
    top_packages = get_top_packages()
    charted = remove_irrelevant_packages(top_packages, TO_CHART)
    annotate_wheels(charted)
    save_to_file(charted)
    generate_svg_wheel(charted, TO_CHART)
def main(
    exp_nbs=exp_nbs,
    path_to_results=save_path,
    path_to_exps=path_to_exps,
    n=n,
    nb_of_runs=nb_of_runs,
    number_of_tests=number_of_tests,
    verbose=verbose,
    nb_of_random=nb_of_random,
    do_recompute_outputs=do_recompute_outputs,
    save_csv=save_csv,
    device='cpu',
):
    """Compute uncertainty deadzones for a list of experiments.

    For each experiment number, either recompute the network outputs
    (``do_recompute_outputs=True``) or load previously saved results,
    derive the deadzone of each uncertainty measure (determinist nets:
    us/pe; bayesian nets: vr/pe/mi), and accumulate the rows in a
    ``deadzones`` DataFrame, optionally persisted as ``deadzones.pkl``
    and ``deadzones.csv``.

    Defaults are bound to module-level configuration at definition time.
    """
    save_path = pathlib.Path(path_to_results)
    # Keep recomputed and pre-saved results in separate subdirectories.
    if do_recompute_outputs:
        save_path = save_path / 'recomputed'
    else:
        save_path = save_path / 'saved_from_polyaxon'
    save_path.mkdir(exist_ok=True, parents=True)
    # Saved results are fixed, so repeating runs would add duplicate rows.
    if not do_recompute_outputs:
        nb_of_runs = 1

    # Resume from a previous deadzones file when one exists.
    if not os.path.exists(save_path / 'deadzones.pkl'):
        deadzones = pd.DataFrame(columns=['group_nb', 'exp_nb', 'unc_name'])
    else:
        deadzones = load_from_file(save_path / 'deadzones.pkl', )
        deadzones.to_csv(save_path / 'deadzones.csv')

    # Experiments whose saved outputs could not be loaded and were recomputed.
    recomputed_exps = []

    start_time = time()

    for repeat_idx in range(nb_of_runs):
        for exp_nb in exp_nbs:
            print(
                f'Repeat number {repeat_idx + 1} / {nb_of_runs}, Exp nb {exp_nb}'
            )
            arguments = get_args(exp_nb, path_to_exps)
            # A missing 'rho' (or the literal value) marks a non-bayesian net.
            determinist = arguments.get('rho', 'determinist') == 'determinist'

            def recompute_outputs(deadzones):
                """Re-run the trained model, append its deadzone rows, return the frame."""
                bay_net_trained, arguments, group_nb = get_trained_model_and_args_and_groupnb(
                    exp_nb, exp_path=path_to_exps)
                bay_net_trained.to(device)

                arguments['number_of_tests'] = number_of_tests

                all_eval_outputs, _ = get_seen_outputs_and_labels(
                    bay_net_trained,
                    arguments,
                    device=device,
                    verbose=verbose,
                )

                all_outputs_unseen = get_unseen_outputs(
                    bay_net_trained,
                    arguments,
                    nb_of_random,
                    device=device,
                    verbose=verbose,
                )

                # Determinist nets expose us/pe; bayesian nets expose vr/pe/mi.
                if determinist:
                    dzs = get_deadzones(
                        all_eval_outputs, all_outputs_unseen,
                        get_all_uncertainty_measures_not_bayesian, n)
                    iterator = zip(['us', 'pe'], dzs)
                else:
                    dzs = get_deadzones(all_eval_outputs, all_outputs_unseen,
                                        get_all_uncertainty_measures_bayesian,
                                        n)
                    iterator = zip(['vr', 'pe', 'mi'], dzs)
                for unc_name, dz in iterator:
                    # pd.concat replaces DataFrame.append (removed in pandas 2.0);
                    # the resulting frame is identical.
                    deadzones = pd.concat((deadzones,
                                           pd.DataFrame.from_dict({
                                               'group_nb': [group_nb],
                                               'exp_nb': [exp_nb],
                                               'trainset': [arguments.get('trainset', 'mnist')],
                                               'type_of_unseen': [arguments['type_of_unseen']],
                                               'epoch': [arguments['epoch']],
                                               'number_of_tests': [arguments['number_of_tests']],
                                               'unc_name': [unc_name],
                                               f'dz_{n}': [dz],
                                           })))
                return deadzones

            if do_recompute_outputs:
                deadzones = recompute_outputs(deadzones)

            else:
                try:
                    results, arguments, group_nb = get_res_args_groupnb(
                        exp_nb, exp_path=path_to_exps)
                except RuntimeError as e:
                    if e.__str__() == "Attempting to deserialize object on a CUDA device but torch.cuda.is_available() " \
                                      "is False. If you are running on a CPU-only machine, please use torch.load with " \
                                      "map_location='cpu' to map your storages to the CPU.":
                        # BUG FIX: recompute_outputs requires and returns the frame;
                        # the bare call used to raise TypeError and drop the rows.
                        deadzones = recompute_outputs(deadzones)
                        recomputed_exps.append(exp_nb)
                        continue
                    else:
                        raise e

                def seen_and_unseen_and_n(results, unc, n):
                    """Fetch the seen/unseen uncertainty columns (or -1 placeholders)."""
                    return (results.get(
                        get_unc_key(results.columns, f'seen {unc}'),
                        [torch.tensor([-1], dtype=torch.float)])[0],
                            results.get(
                                get_unc_key(results.columns, f'unseen {unc}'),
                                [torch.tensor([-1], dtype=torch.float)])[0], n)

                try:
                    dz_pe = get_deadzone_from_unc(
                        *seen_and_unseen_and_n(results, 'pe', n))
                except Exception:
                    # -1 marks "deadzone unavailable"; was a bare except before.
                    dz_pe = -1

                if determinist:
                    dz_us = get_deadzone_from_unc(
                        *seen_and_unseen_and_n(results, 'us', n))
                    iterator = zip(['us', 'pe'], [dz_us, dz_pe])
                else:
                    dz_vr = get_deadzone_from_unc(
                        *seen_and_unseen_and_n(results, 'vr', n))
                    dz_mi = get_deadzone_from_unc(
                        *seen_and_unseen_and_n(results, 'mi', n))
                    iterator = zip(['vr', 'pe', 'mi'], [dz_vr, dz_pe, dz_mi])
                for unc_name, dz in iterator:
                    deadzones = pd.concat((deadzones,
                                           pd.DataFrame.from_dict({
                                               'deadzone_number': [n],
                                               'group_nb': [group_nb],
                                               'trainset': [arguments.get('trainset', 'mnist')],
                                               'exp_nb': [exp_nb],
                                               'type_of_unseen': [arguments['type_of_unseen']],
                                               'epoch': [arguments['epoch']],
                                               'number_of_tests': [arguments['number_of_tests']],
                                               'unc_name': [unc_name],
                                               f'dz_{n}': [dz],
                                           })))

            print(f'Time Elapsed:{round(time() - start_time)} s.')

    deadzones.exp_nb = deadzones.exp_nb.astype('int')

    if save_csv:

        save_to_file(arguments, save_path / 'arguments.pkl')
        # BUG FIX: sort_values returns a new frame; the result was discarded.
        deadzones = deadzones.sort_values('exp_nb')
        deadzones.to_pickle(save_path / 'deadzones.pkl')
        deadzones.to_csv(save_path / 'deadzones.csv')

    print(deadzones)
    default=20)
# CLI options for the training run ('parser' is created earlier in the file).
parser.add_argument('--loss_type',
                    help='which loss to use',
                    choices=['exp', 'uniform', 'criterion'],
                    type=str,
                    default='uniform')
parser.add_argument('--std_prior',
                    help='the standard deviation of the prior',
                    type=float,
                    default=0.1)
# No default: args.delta is None unless supplied on the command line.
parser.add_argument('--delta',
                    help='probability upper bound of error higher that risk',
                    type=float)

args = parser.parse_args()
# Persist the raw CLI arguments so the run can be reproduced later.
save_to_file(vars(args), './output/arguments.pkl')

# Unpack the parsed arguments into the module-level names used below.
trainset = args.trainset
rho = args.rho
epoch = args.epoch
batch_size = args.batch_size
number_of_tests = args.number_of_tests
loss_type = args.loss_type
std_prior = args.std_prior
# Same prior std used for both entries of the prior pair.
stds_prior = (std_prior, std_prior)
delta = args.delta
# 50 evenly spaced risk levels in [0.01, 0.5].
risks = np.linspace(0.01, 0.5, 50)

if torch.cuda.is_available():
    device = 'cuda'
else:
          f'Mutual Information:{unseen_mi.mean()}')
    res = pd.concat((res,
                     pd.DataFrame.from_dict({
                         'sigma_initial': [log(1 + exp(rho))],
                         'seen_uncertainty_vr': [eval_vr],
                         'seen_uncertainty_pe': [eval_pe],
                         'seen_uncertainty_mi': [eval_mi],
                         'unseen_uncertainty_vr': [unseen_vr],
                         'unseen_uncertainty_pe': [unseen_pe],
                         'unseen_uncertainty_mi': [unseen_mi],
                     })),
                    axis=1)

# Move any CUDA tensors held by the results frame back to CPU before pickling,
# so the pickle can be loaded on CPU-only machines.
convert_df_to_cpu(res)

# Persist run artifacts under the experiment's output directory.
save_to_file(arguments, f'{output_file}/arguments.pkl')
if args.save_loss:
    save_to_file(loss, f'{output_file}/loss.pkl')
if args.save_observables:
    save_to_file(observables, f'{output_file}/TrainingLogs.pkl')
if args.save_outputs:
    torch.save(all_outputs_unseen, f'{output_file}/unseen_outputs.pt')
    torch.save(all_outputs_eval, f'{output_file}/seen_outputs.pt')
# torch.save(res, f'{output_file}/results.pt')
res.to_pickle(f'{output_file}/results.pkl')
# Final weights from the last epoch, plus the best weights tracked by the
# observables object during training.
torch.save(bay_net.state_dict(), f'{output_file}/final_weights.pt')
torch.save(observables.max_weights, f'{output_file}/best_weights.pt')
# One-row CSV of the arguments for quick human inspection.
pd.DataFrame.from_dict({k: [v]
                        for k, v in arguments.items()
                        }).to_csv(f'{output_file}/arguments.csv')
Example #8
0
def main(*args):
    """Fetch the top packages, drop irrelevant ones, and render the SVG wheel."""
    packages = get_top_packages()
    packages = remove_irrelevant_packages(packages, TO_CHART)
    annotate_wheels(packages)
    save_to_file(packages)
    generate_svg_wheel(packages, TO_CHART)
Example #9
0
def train_vcae(n_epochs,
               model,
               train_iterator,
               val_iterator,
               optimizer,
               device,
               criterion,
               save_best=True,
               verbose=True,
               is_nf=False,
               nf=None):
    """Train a VCAE (optionally wrapped in a normalizing flow) for n_epochs.

    Runs one train and one validation pass per epoch, appends per-epoch mean
    losses to the experiment log file, optionally checkpoints the best model
    by validation loss, and returns the path of the best checkpoint.
    """
    if is_nf:
        model_name = 'NormalizingFlow' + model.__class__.__name__
    else:
        model_name = model.__class__.__name__
    writer, experiment_name, best_model_path = setup_experiment(model_name,
                                                                log_dir="./tb")

    mb = master_bar(range(n_epochs))

    train_history, val_history = [], []
    best_val_loss = float('+inf')

    for epoch in mb:
        # One optimizing pass over the training data, then one evaluation
        # pass (no optimizer) over the validation data.
        train_loss = run_epoch(model, train_iterator, optimizer, criterion,
                               mb, phase='train', epoch=epoch, writer=writer,
                               is_nf=is_nf, nf=nf, device=device)
        val_loss = run_epoch(model, val_iterator, None, criterion,
                             mb, phase='val', epoch=epoch, writer=writer,
                             is_nf=is_nf, nf=nf, device=device)

        # Persist per-epoch mean losses for offline analysis.
        dict_saver = {'train_loss_mean': train_loss,
                      'test_loss_mean': val_loss}
        file_to_save_path = ''.join(
            [LOG_PATH, FILE_NAME, experiment_name, FILE_EXCITON])
        save_to_file(file_to_save_path, dict_saver)

        # Checkpoint whenever validation loss improves.
        if save_best and (val_loss < best_val_loss):
            best_val_loss = val_loss
            save_model(nf if is_nf else model, best_model_path)

        if verbose:
            # Keep a history for real-time plotting in the notebook.
            train_history.append(train_loss)
            val_history.append(val_loss)

            mb.main_bar.comment = f'EPOCHS, best_loss:{best_val_loss}'
            mb.child.comment = f"train_loss:{round(train_loss, 3)}, val_loss:{round(val_loss, 3)}"
            plot_loss_update(epoch, n_epochs, mb, train_history, val_history)

    return best_model_path
from src.utils import get_file_and_dir_path_in_dir, load_from_file, save_to_file

# Directory holding the deterministic CIFAR-10 experiment outputs.
path_to_exps = 'output/determinist_cifar10'

# Collect every saved 'arguments.pkl' below the experiment directory.
files, _ = get_file_and_dir_path_in_dir(path_to_exps, 'arguments.pkl')

# Force each experiment to a single test pass (a deterministic net needs no
# repeated sampling), rewriting each pickle in place.
for file in files:
    args = load_from_file(file)
    args['number_of_tests'] = 1
    print(file, 'changed')
    save_to_file(args, file)
Example #11
0
# Accuracy-vs-coverage curves on the training set: one figure per experiment.
if show_fig or save_fig:
    for exp_nb in exp_nbs:
        fig = plot_acc_cov(number_of_tests_to_print,
                           exp_nb,
                           results_train,
                           figsize=figsize)
        arguments = get_args(exp_nb, path)
        # Title carries the experiment's hyperparameters (minus the noisy keys).
        fig.suptitle(
            f'Trainset: Acc-Coverage, w.r.t. nb of tests and uncertainty measure\n'
            f'{dict({k: v for k, v in arguments.items() if k not in ["type_of_unseen", "number_of_tests"]})}',
            wrap=True)
        if save_fig:
            save_png_path.mkdir(exist_ok=True, parents=True)
            save_pkl_path.mkdir(exist_ok=True, parents=True)
            fig.savefig(save_png_path / f'{exp_nb}-acc-coverage-train.png')
            # Pickling the figure allows reloading/editing it with matplotlib later.
            save_to_file(fig,
                         save_pkl_path / f'{exp_nb}-acc-coverage-train.pkl')
        if show_fig:
            fig.show()

# Same curves for the evaluation (test) set.
results_eval = pd.read_csv(save_csv_path / 'results_eval.csv')
if show_fig or save_fig:
    for exp_nb in exp_nbs:
        fig = plot_acc_cov(number_of_tests_to_print,
                           exp_nb,
                           results_eval,
                           figsize=figsize)
        arguments = get_args(exp_nb, path)
        fig.suptitle(
            f'Testset: Acc-Coverage, w.r.t. nb of tests and uncertainty measure\n'
            f'{dict({k: v for k, v in arguments.items() if k not in ["type_of_unseen", "number_of_tests"]})}',
            wrap=True)
Example #12
0
                                            arguments, group_nb, exp_nb)

            fig1 = utils.compute_density_train_seen_unseen(
                arguments=arguments,
                all_outputs_train=all_outputs_train,
                all_outputs_seen=all_outputs_seen,
                all_outputs_unseen=all_outputs_unseen,
                show_fig=show_fig,
                save_fig=save_fig,
                save_path=save_path_hists,
                figsize=figsize,
            )
            if save_fig:
                print('Saving figure...')
                fig1.savefig(save_path_hists)
                save_to_file(fig1, str(save_path_hists).replace('png', 'pkl'))
                print('Figure saved.')
            if show_fig:
                print('Showing figures...')
                fig1.show()
                print('Figure shown.')
            print('Done')

        if do_train_correct_false:
            print('Do train correct false...')
            save_path_hists = get_save_path(save_path, 'train_correct_false',
                                            arguments, group_nb, exp_nb)

            fig2 = utils.compute_density_correct_false(
                arguments=arguments,
                all_outputs=all_outputs_train,
    # ax4.set_title(f'softmax output unseen. VR: {round(vr_unseen.item(), 4)}, PE: {round(pe_unseen.item(), 4)}, MI: {round(mi_unseen.item(), 4)}')
    if is_cifar:
        ax4.set_xticks(range(10))
        ax4.set_xticklabels(cifar_labels)
        ax4.tick_params(axis='x', rotation=45)

    fig.show()

    save_path = 'results/images/softmax_output'
    save_path = pathlib.Path(save_path)
    save_fig = False

    if save_fig:
        save_path.mkdir(exist_ok=True, parents=True)
        fig.savefig(save_path / f'softmax_output_{exp}_{img_index_seen}.png')
        u.save_to_file(
            fig, save_path / f'softmax_output_{exp}_{img_index_seen}.pkl')

# %% Compute det outputs

reload_modules()

trainset = 'cifar10'
res_det = pd.DataFrame()

det_net_trained, arguments, _ = su.get_trained_model_and_args_and_groupnb(
    f'determinist_{trainset}', f'output/')
evalloader_seen = su.get_evalloader_seen(arguments, shuffle=False)

labels, all_outputs = e.eval_bayesian(
    det_net_trained,
    evalloader_seen,
            # cur_auc = metrics.auc(xs_means.iloc[xs_means.argsort()], ys_means.iloc[xs_means.argsort()])
            axs.set_title(
                title +
                f'Bay AUC: {round(100 * aucs[idx_unc].mean(), 2)} +- {round(100 * 1.97 * aucs[idx_unc].std() / 5, 2)} %, T={number_of_tests}'
            )
            axs.legend(handles=legend_elements)

            fig.suptitle(f'')
            if save_fig:
                (save_path / f'{unc_name}').mkdir(exist_ok=True, parents=True)
                fig.savefig(
                    save_path / f'{unc_name}' /
                    f'roc_{typ}_{arg["loss_type"]}_{unc_name}_T{number_of_tests}.png'
                )
                save_to_file(
                    fig, save_path / f'{unc_name}' /
                    f'roc_{typ}_{arg["loss_type"]}_{unc_name}_T{number_of_tests}.pkl'
                )
                lp = arg['loss_type']
                print(
                    f"Fig saved in {save_path / f'{unc_name}' / f'roc_{typ}_{lp}_{unc_name}_T{number_of_tests}.png'}"
                )
            plt.close(fig)

            # res.loc[
            #     [res.rho == rho, res.std_prior == std_prior, res.T == number_of_tests, res.unc_name == unc_name],
            #     [f'{trainset}+_{typ}']
            # ] = [aucs[idx_unc].mean(), 1.97 * aucs[idx_unc].std() / 5]

    print(aucs.mean(1))
    print(aucs.std(1) * 1.97 / 5)
    print(aucs_det.mean(1))