Esempio n. 1
0
def main(cfg, expt_dir, hyperparameters):
    """Grid-search ``hyperparameters`` and optionally run the final test phase.

    Every combination of the candidate values in ``hyperparameters`` is
    trained with ``typical_train`` inside its own MLflow run.  The values
    returned per combination are stored next to the combination itself in
    ``val_results.csv`` under ``expt_dir``, and the configuration used for
    each combination is dumped to ``<pattern>.txt``.

    When ``cfg.test`` is truthy, the combination with the highest ``uar`` is
    applied to ``cfg``, ``typical_experiment`` is run, and its ``'test'``
    predictions are written both as raw scores and as arg-max labels.

    Args:
        cfg: Experiment configuration object.  ``expt_id``, ``n_parallel``
            and ``test`` are read; ``expt_id``, ``n_jobs`` and
            ``train.class_names`` may be overwritten on the caller's object.
        expt_dir: Output directory; must support the ``/`` operator.
            NOTE(review): the directory is assumed to exist already -
            confirm against the caller.
        hyperparameters: Mapping of parameter name to an iterable of
            candidate values.

    Returns:
        None.
    """
    if cfg.expt_id == 'timestamp':
        cfg.expt_id = dt.today().strftime('%Y-%m-%d_%H:%M')

    logging.basicConfig(level=logging.DEBUG,
                        format="[%(name)s] [%(levelname)s] %(message)s",
                        filename=expt_dir / 'expt.log')

    cfg.train.class_names = LABELS
    dataset_cls = ManifestDataSet
    # Column names for the values returned by ``typical_train``; presumably
    # loss and UAR in this order - verify against ``typical_train``.
    val_metric_names = ['loss', 'uar']

    cfg = create_manifest(cfg, expt_dir)
    process_func = None
    groups = None

    param_names = list(hyperparameters.keys())
    n_params = len(param_names)
    patterns = list(itertools.product(*hyperparameters.values()))
    val_results = pd.DataFrame(
        np.zeros((len(patterns), n_params + len(val_metric_names))),
        columns=param_names + val_metric_names)

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(hyperparameters)

    def pattern_name(pattern):
        """Return a file-name-safe identifier for one combination."""
        return '_'.join(str(p).replace('/', '-') for p in pattern)

    def apply_hyperparameters(target_cfg, pattern):
        """Set every hyperparameter of ``pattern`` on ``target_cfg``."""
        for param, value in zip(param_names, pattern):
            target_cfg = set_hyperparameter(target_cfg, param, value)
        return target_cfg

    def configure_for_pattern(run_cfg, pattern):
        """Apply ``pattern`` plus its model path and log id to ``run_cfg``."""
        run_cfg = apply_hyperparameters(run_cfg, pattern)
        name = pattern_name(pattern)
        run_cfg.train.model.model_path = str(expt_dir / f"{name}.pth")
        run_cfg.train.log_id = name
        return run_cfg

    def experiment(pattern, cfg):
        """Train one combination and return ``(result_series, val_pred)``."""
        cfg = configure_for_pattern(cfg, pattern)

        with mlflow.start_run():
            result_series, val_pred, _ = typical_train(cfg, load_func,
                                                       label_func,
                                                       process_func,
                                                       dataset_cls, groups)
            mlflow.log_params(dict(zip(param_names, pattern)))

        return result_series, val_pred

    # Each run gets its own deep copy so combinations cannot leak into each
    # other.  The sequential branch exists for debugging.
    if cfg.n_parallel == 1:
        result_pred_list = [
            experiment(pattern, deepcopy(cfg)) for pattern in patterns
        ]
    else:
        cfg.n_jobs = 0
        result_pred_list = Parallel(n_jobs=cfg.n_parallel, verbose=0)([
            delayed(experiment)(pattern, deepcopy(cfg)) for pattern in patterns
        ])

    val_results.iloc[:, :n_params] = patterns
    result_list = np.array([result for result, _ in result_pred_list])
    val_results.iloc[:, n_params:] = result_list
    pp.pprint(val_results)
    pp.pprint(val_results.iloc[:, n_params:].describe())

    val_results.to_csv(expt_dir / 'val_results.csv', index=False)
    print(f"Devel results saved into {expt_dir / 'val_results.csv'}")

    # Dump the configuration that was actually used for each combination.
    # The base ``cfg`` never received the per-pattern values (the runs worked
    # on deep copies), so it is re-derived here for every pattern.
    for pattern in patterns:
        pattern_cfg = configure_for_pattern(deepcopy(cfg), pattern)
        dump_dict(expt_dir / f'{pattern_name(pattern)}.txt', pattern_cfg)

    # Train with train + devel dataset
    if cfg.test:
        best_trial_idx = val_results['uar'].argmax()
        best_pattern = patterns[best_trial_idx]
        cfg = apply_hyperparameters(cfg, best_pattern)

        dump_dict(expt_dir / 'best_parameters.txt',
                  dict(zip(param_names, best_pattern)))

        metrics, pred_dict_list, _ = typical_experiment(
            cfg, load_func, label_func, process_func, dataset_cls, groups)

        sub_name = f"uar-{metrics[-1]:.4f}_sub_{pattern_name(best_pattern)}.csv"
        test_pred = pred_dict_list['test']
        # Raw per-class scores first, then the arg-max labels as submission.
        pd.DataFrame(test_pred).to_csv(expt_dir / f'{sub_name}_prob.csv',
                                       index=False,
                                       header=None)
        pd.DataFrame(test_pred.argmax(axis=1)).to_csv(expt_dir / sub_name,
                                                      index=False,
                                                      header=None)
        print(f"Submission file is saved in {expt_dir / sub_name}")

    mlflow.end_run()
Esempio n. 2
0
def main(expt_conf, hyperparameters) -> float:
    """Grid-search ``hyperparameters`` and optionally run the final test phase.

    An output directory ``<parent of this file's directory>/output/<expt_id>``
    is created.  Every combination of the candidate values in
    ``hyperparameters`` is trained with ``typical_train`` inside its own
    MLflow run.  Results are written to ``val_results.csv`` and the
    configuration used for each combination is dumped to ``<pattern>.txt``.

    When ``expt_conf['test']`` is truthy, the combination with the highest
    ``uar`` is applied to ``expt_conf``, ``typical_experiment`` is run, and
    its ``'test'`` predictions are written as raw scores and as arg-max
    labels shifted by +1.

    Args:
        expt_conf: Dict-like experiment configuration.  It is modified in
            place (``expt_id``, ``log_dir``, ``class_names``,
            ``sample_rate``, ``n_jobs`` and, in the test phase, the best
            hyperparameters).
        hyperparameters: Mapping of parameter name to an iterable of
            candidate values; the names are used as keys of ``expt_conf``.

    Returns:
        The highest ``uar`` value recorded in the validation results.
    """
    if expt_conf['expt_id'] == 'timestamp':
        expt_conf['expt_id'] = dt.today().strftime('%Y-%m-%d_%H:%M')

    expt_dir = Path(
        __file__).resolve().parents[1] / 'output' / expt_conf['expt_id']
    expt_dir.mkdir(exist_ok=True, parents=True)
    expt_conf['log_dir'] = str(expt_dir / 'tensorboard')

    logging.basicConfig(level=logging.DEBUG,
                        format="[%(name)s] [%(levelname)s] %(message)s",
                        filename=expt_dir / 'expt.log')

    if expt_conf['n_classes'] == 2:
        expt_conf['class_names'] = [0, 1]
    else:
        expt_conf['class_names'] = [0, 1, 2]

    # Column names for the values returned by ``typical_train``; presumably
    # loss and UAR in this order - verify against ``typical_train``.
    val_metric_names = ['loss', 'uar']

    dataset_cls = ManifestWaveDataSet

    param_names = list(hyperparameters.keys())
    n_params = len(param_names)
    patterns = list(itertools.product(*hyperparameters.values()))
    val_results = pd.DataFrame(
        np.zeros((len(patterns), n_params + len(val_metric_names))),
        columns=param_names + val_metric_names)

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(hyperparameters)

    one_audio_sec = 10
    expt_conf['sample_rate'] = 4000
    seq_len = 50
    load_func = set_load_func(expt_conf['sample_rate'], one_audio_sec)
    process_func = set_process_func(expt_conf['model_type'], seq_len)
    expt_conf = set_data_paths(expt_dir, expt_conf)

    groups = None

    def pattern_name(pattern):
        """Return a file-name-safe identifier for one combination."""
        return '_'.join(str(p).replace('/', '-') for p in pattern)

    def configure_for_pattern(run_conf, pattern):
        """Apply ``pattern`` plus its model path and log id to ``run_conf``."""
        for param, value in zip(param_names, pattern):
            run_conf[param] = value
        name = pattern_name(pattern)
        run_conf['model_path'] = str(expt_dir / f"{name}.pth")
        run_conf['log_id'] = name
        return run_conf

    def experiment(pattern, expt_conf):
        """Train one combination and return ``(result_series, val_pred)``."""
        expt_conf = configure_for_pattern(expt_conf, pattern)

        with mlflow.start_run():
            result_series, val_pred, _ = typical_train(expt_conf, load_func,
                                                       label_func,
                                                       process_func,
                                                       dataset_cls, groups)
            mlflow.log_params(dict(zip(param_names, pattern)))
            mlflow.log_artifacts(expt_dir)

        return result_series, val_pred

    # Each run gets its own deep copy so combinations cannot leak into each
    # other.  The sequential branch exists for debugging.
    if expt_conf['n_parallel'] == 1:
        result_pred_list = [
            experiment(pattern, deepcopy(expt_conf)) for pattern in patterns
        ]
    else:
        expt_conf['n_jobs'] = 0
        result_pred_list = Parallel(
            n_jobs=expt_conf['n_parallel'], verbose=0)([
                delayed(experiment)(pattern, deepcopy(expt_conf))
                for pattern in patterns
            ])

    # Hyperparameter values are stored as strings in the results table.
    val_results.iloc[:, :n_params] = [[str(param) for param in p]
                                      for p in patterns]
    result_list = np.array([result for result, _ in result_pred_list])
    val_results.iloc[:, n_params:] = result_list
    pp.pprint(val_results)
    pp.pprint(val_results.iloc[:, n_params:].describe())

    val_results.to_csv(expt_dir / 'val_results.csv', index=False)
    print(f"Devel results saved into {expt_dir / 'val_results.csv'}")

    # Dump the configuration that was actually used for each combination.
    # The base ``expt_conf`` never received the per-pattern values (the runs
    # worked on deep copies), so it is re-derived here for every pattern.
    for pattern in patterns:
        pattern_conf = configure_for_pattern(deepcopy(expt_conf), pattern)
        dump_dict(expt_dir / f'{pattern_name(pattern)}.txt', pattern_conf)

    # Train with train + devel dataset
    if expt_conf['test']:
        best_trial_idx = val_results['uar'].argmax()
        best_pattern = patterns[best_trial_idx]
        for param, value in zip(param_names, best_pattern):
            expt_conf[param] = value
        dump_dict(expt_dir / 'best_parameters.txt',
                  dict(zip(param_names, best_pattern)))

        metrics, pred_dict_list, _ = typical_experiment(
            expt_conf, load_func, label_func, process_func, dataset_cls,
            groups)

        sub_name = f"sub_{pattern_name(best_pattern)}.csv"
        test_pred = pred_dict_list['test']
        pd.DataFrame(test_pred).to_csv(expt_dir / f'{sub_name}_prob.csv',
                                       index=False,
                                       header=None)
        # Submitted labels are the arg-max index shifted by one.
        pd.DataFrame(test_pred.argmax(axis=1) + 1).to_csv(
            expt_dir / sub_name, index=False, header=None)
        print(f"Submission file is saved in {expt_dir / sub_name}")

    mlflow.end_run()

    # Honour the declared ``-> float`` contract instead of implicitly
    # returning None; ``uar`` is the criterion used to pick the best trial.
    return float(val_results['uar'].max())
Esempio n. 3
0
def main(expt_conf, hyperparameters, typical_train_func):
    """Grid-search ``hyperparameters`` and optionally evaluate an ensemble.

    The output directory ``<parent of this file's directory>/output/<expt_id>``
    is created if needed.  Every combination of the candidate values in
    ``hyperparameters`` is trained with ``typical_train_func`` inside its
    own MLflow run.  Results are written to ``val_results.csv`` and the
    configuration used for each combination is dumped to ``<pattern>.txt``.

    When ``expt_conf['test']`` is truthy, the combination with the highest
    ``uar`` is applied to ``expt_conf`` and ``typical_experiment`` is run.
    Its per-model ``'test'`` predictions are combined by summing scores and
    taking the arg-max (``return_prob`` truthy) or by majority vote
    (otherwise).  The ensemble is scored with ``balanced_accuracy_score``,
    saved as a submission CSV, and one summary line is appended to a
    ``results_*.csv`` file in the parent of the experiment directory.

    Args:
        expt_conf: Dict-like experiment configuration.  It is modified in
            place (``expt_id``, ``class_names``, ``sample_rate``, ``n_jobs``
            and, in the test phase, the best hyperparameters).
        hyperparameters: Mapping of parameter name to an iterable of
            candidate values; the names are used as keys of ``expt_conf``.
        typical_train_func: Callable invoked as
            ``typical_train_func(expt_conf, load_func, label_func,
            process_func, dataset_cls, groups)``; its first two return
            values are used as the result row and validation predictions.

    Returns:
        None.
    """
    if expt_conf['expt_id'] == 'timestamp':
        expt_conf['expt_id'] = dt.today().strftime('%Y-%m-%d_%H:%M')
    expt_dir = Path(
        __file__).resolve().parents[1] / 'output' / expt_conf['expt_id']
    # The log file below is opened inside expt_dir, so it has to exist first.
    expt_dir.mkdir(exist_ok=True, parents=True)

    logging.basicConfig(level=logging.DEBUG,
                        format="[%(name)s] [%(levelname)s] %(message)s",
                        filename=expt_dir / 'expt.log')

    expt_conf['class_names'] = [0, 1]
    # Column names for the values returned by ``typical_train_func``;
    # presumably loss and UAR in this order - verify against the callee.
    val_metric_names = ['loss', 'uar']

    expt_conf['sample_rate'] = 44100

    expt_conf, groups = set_data_paths(expt_conf)

    param_names = list(hyperparameters.keys())
    n_params = len(param_names)
    patterns = list(itertools.product(*hyperparameters.values()))
    val_results = pd.DataFrame(
        np.zeros((len(patterns), n_params + len(val_metric_names))),
        columns=param_names + val_metric_names)
    dataset_cls = ManifestWaveDataSet
    process_func = None

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(hyperparameters)

    def pattern_name(pattern):
        """Return a file-name-safe identifier for one combination."""
        return '_'.join(str(p).replace('/', '-') for p in pattern)

    def configure_for_pattern(run_conf, pattern):
        """Apply ``pattern`` plus its model path and log id to ``run_conf``."""
        for param, value in zip(param_names, pattern):
            run_conf[param] = value
        name = pattern_name(pattern)
        run_conf['model_path'] = str(expt_dir / f"{name}.pth")
        run_conf['log_id'] = name
        return run_conf

    def experiment(pattern, expt_conf):
        """Train one combination and return ``(result_series, val_pred)``."""
        expt_conf = configure_for_pattern(expt_conf, pattern)

        with mlflow.start_run():
            result_series, val_pred, _ = typical_train_func(
                expt_conf, load_func, label_func, process_func, dataset_cls,
                groups)
            mlflow.log_params(dict(zip(param_names, pattern)))

        return result_series, val_pred

    # Each run gets its own deep copy so combinations cannot leak into each
    # other.  The sequential branch exists for debugging.
    if expt_conf['n_parallel'] == 1:
        result_pred_list = [
            experiment(pattern, deepcopy(expt_conf)) for pattern in patterns
        ]
    else:
        expt_conf['n_jobs'] = 0
        result_pred_list = Parallel(
            n_jobs=expt_conf['n_parallel'], verbose=0)([
                delayed(experiment)(pattern, deepcopy(expt_conf))
                for pattern in patterns
            ])

    val_results.iloc[:, :n_params] = patterns
    result_list = np.array([result for result, _ in result_pred_list])
    val_results.iloc[:, n_params:] = result_list
    pp.pprint(val_results)
    pp.pprint(val_results.iloc[:, n_params:].describe())

    val_results.to_csv(expt_dir / 'val_results.csv', index=False)
    print(f"Devel results saved into {expt_dir / 'val_results.csv'}")

    # Dump the configuration that was actually used for each combination.
    # The base ``expt_conf`` never received the per-pattern values (the runs
    # worked on deep copies), so it is re-derived here for every pattern.
    for pattern in patterns:
        pattern_conf = configure_for_pattern(deepcopy(expt_conf), pattern)
        dump_dict(expt_dir / f'{pattern_name(pattern)}.txt', pattern_conf)

    # Train with train + devel dataset
    if expt_conf['test']:
        best_trial_idx = val_results['uar'].argmax()
        best_pattern = patterns[best_trial_idx]
        for param, value in zip(param_names, best_pattern):
            expt_conf[param] = value
        dump_dict(expt_dir / 'best_parameters.txt',
                  dict(zip(param_names, best_pattern)))

        metrics, pred_dict_list, _ = typical_experiment(
            expt_conf, load_func, label_func, process_func, dataset_cls,
            groups)

        # One entry per model along axis 0.
        stacked_pred = np.array(
            [pred_dict['test'] for pred_dict in pred_dict_list])
        if expt_conf['return_prob']:
            # Sum the per-class scores over models, then pick the best class.
            ensemble_pred = np.argmax(stacked_pred.sum(axis=0), axis=1)
        else:
            # Majority vote over models.  Older SciPy keeps the reduced axis
            # as length 1 while SciPy >= 1.11 drops it, so only strip the
            # leading axis when it is still present.
            ensemble_pred = np.asarray(stats.mode(stacked_pred, axis=0)[0])
            if ensemble_pred.ndim == stacked_pred.ndim:
                ensemble_pred = ensemble_pred[0]

        _, test_labels = load_func(expt_conf['test_path'])
        uar = balanced_accuracy_score(test_labels, ensemble_pred)
        print(f'{uar:.05f}')
        print(
            f'Confusion matrix: \n{confusion_matrix(test_labels, ensemble_pred)}'
        )
        sub_name = f"sub_{pattern_name(best_pattern)}_{uar:.04f}.csv"
        pd.DataFrame(ensemble_pred).to_csv(expt_dir / sub_name, index=False)
        print(f"Submission file is saved in {expt_dir / sub_name}")

        # Append one summary line per experiment to a results file shared by
        # all experiments with the same model type, target and test data kind.
        result_file_name = f"results_{expt_conf['model_type']}_{expt_conf['target']}_{expt_conf['test_data_kind']}.csv"
        with open(expt_dir.parent / result_file_name, 'a') as f:
            f.write(
                f"{expt_conf['n_splits']},{expt_conf['feature']},{val_results['uar'].max()},{uar}\n"
            )

    mlflow.end_run()