def main(cfg, expt_dir, hyperparameters):
    """Grid-search hyperparameters, save devel results, then optionally
    retrain on train+devel and write a test submission.

    Parameters
    ----------
    cfg : attribute-style config object (e.g. OmegaConf namespace)
        Mutated in place with class names, manifest paths and, on the
        test path, the best hyperparameters.
    expt_dir : pathlib.Path
        Output directory for logs, model weights and result CSVs.
        Assumed to exist already (created by the caller).
    hyperparameters : dict[str, list]
        Parameter name -> candidate values; the search grid is the
        Cartesian product of all value lists.
    """
    if cfg.expt_id == 'timestamp':
        cfg.expt_id = dt.today().strftime('%Y-%m-%d_%H:%M')

    logging.basicConfig(level=logging.DEBUG,
                        format="[%(name)s] [%(levelname)s] %(message)s",
                        filename=expt_dir / 'expt.log')

    cfg.train.class_names = LABELS
    dataset_cls = ManifestDataSet
    metrics_names = {
        'train': ['loss', 'uar'],
        'val': ['loss', 'uar'],
        'test': ['loss', 'uar']
    }
    cfg = create_manifest(cfg, expt_dir)
    process_func = None

    patterns = list(itertools.product(*hyperparameters.values()))
    val_results = pd.DataFrame(
        np.zeros((len(patterns),
                  len(hyperparameters) + len(metrics_names['val']))),
        columns=list(hyperparameters.keys()) + metrics_names['val'])

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(hyperparameters)

    groups = None

    def _pattern_name(pattern):
        # File-system-safe identifier for one grid point ('/' breaks paths).
        return '_'.join(str(p).replace('/', '-') for p in pattern)

    def experiment(pattern, cfg):
        """Train one grid point on a private cfg copy; return
        (validation metric series, validation predictions)."""
        for i, param in enumerate(hyperparameters.keys()):
            cfg = set_hyperparameter(cfg, param, pattern[i])

        cfg.train.model.model_path = str(
            expt_dir / f"{_pattern_name(pattern)}.pth")
        cfg.train.log_id = f"{_pattern_name(pattern)}"

        with mlflow.start_run():
            result_series, val_pred, _ = typical_train(
                cfg, load_func, label_func, process_func, dataset_cls,
                groups)
            mlflow.log_params({
                hyperparameter: value
                for hyperparameter, value in zip(hyperparameters.keys(),
                                                 pattern)
            })

        return result_series, val_pred

    # For debugging
    if cfg.n_parallel == 1:
        result_pred_list = [
            experiment(pattern, deepcopy(cfg)) for pattern in patterns
        ]
    else:
        # Disable nested parallelism inside each joblib worker.
        cfg.n_jobs = 0
        result_pred_list = Parallel(n_jobs=cfg.n_parallel, verbose=0)([
            delayed(experiment)(pattern, deepcopy(cfg))
            for pattern in patterns
        ])

    val_results.iloc[:, :len(hyperparameters)] = patterns
    result_list = np.array([result for result, pred in result_pred_list])
    val_results.iloc[:, len(hyperparameters):] = result_list
    pp.pprint(val_results)
    pp.pprint(val_results.iloc[:, len(hyperparameters):].describe())
    val_results.to_csv(expt_dir / 'val_results.csv', index=False)
    # Fix: this f-string was split across physical lines in the original,
    # which is a syntax error; rejoined onto one line.
    print(f"Devel results saved into {expt_dir / 'val_results.csv'}")

    for pattern in patterns:
        # NOTE(review): experiment() mutates a deepcopy, so this dumps the
        # same base cfg for every pattern, not the pattern-specific one —
        # confirm whether per-pattern configs were intended here.
        dump_dict(expt_dir / f'{_pattern_name(pattern)}.txt', cfg)

    # Train with train + devel dataset
    if cfg.test:
        best_trial_idx = val_results['uar'].argmax()
        best_pattern = patterns[best_trial_idx]
        for i, param in enumerate(hyperparameters.keys()):
            cfg = set_hyperparameter(cfg, param, best_pattern[i])
        dump_dict(expt_dir / 'best_parameters.txt',
                  {p: v for p, v in zip(hyperparameters.keys(),
                                        best_pattern)})

        metrics, pred_dict_list, _ = typical_experiment(
            cfg, load_func, label_func, process_func, dataset_cls, groups)

        sub_name = (f"uar-{metrics[-1]:.4f}_sub_"
                    f"{_pattern_name(best_pattern)}.csv")
        # Raw class probabilities alongside the hard-label submission.
        pd.DataFrame(pred_dict_list['test']).to_csv(
            expt_dir / f'{sub_name}_prob.csv', index=False, header=None)
        pd.DataFrame(pred_dict_list['test'].argmax(axis=1)).to_csv(
            expt_dir / sub_name, index=False, header=None)
        print(f"Submission file is saved in {expt_dir / sub_name}")

    mlflow.end_run()
def main(expt_conf, hyperparameters) -> None:
    """Grid-search an audio model over *hyperparameters*, save devel
    results and optionally write a 1-based-label test submission.

    Parameters
    ----------
    expt_conf : dict
        Experiment configuration; mutated in place (paths, class names,
        sample rate, best hyperparameters on the test path).
    hyperparameters : dict[str, list]
        Parameter name -> candidate values; grid is the Cartesian product.

    Notes
    -----
    Return annotation corrected from ``-> float`` to ``-> None``: the
    function never returns a value.
    """
    if expt_conf['expt_id'] == 'timestamp':
        expt_conf['expt_id'] = dt.today().strftime('%Y-%m-%d_%H:%M')

    expt_dir = Path(
        __file__).resolve().parents[1] / 'output' / expt_conf['expt_id']
    # expt_dir is already a Path — no need to re-wrap in Path() (original
    # did `Path(expt_dir).mkdir(...)`).
    expt_dir.mkdir(exist_ok=True, parents=True)
    expt_conf['log_dir'] = str(expt_dir / 'tensorboard')

    logging.basicConfig(level=logging.DEBUG,
                        format="[%(name)s] [%(levelname)s] %(message)s",
                        filename=expt_dir / 'expt.log')

    if expt_conf['n_classes'] == 2:
        expt_conf['class_names'] = [0, 1]
    else:
        expt_conf['class_names'] = [0, 1, 2]

    metrics_names = {
        'train': ['loss', 'uar'],
        'val': ['loss', 'uar'],
        'test': ['loss', 'uar']
    }
    dataset_cls = ManifestWaveDataSet

    patterns = list(itertools.product(*hyperparameters.values()))
    val_results = pd.DataFrame(
        np.zeros((len(patterns),
                  len(hyperparameters) + len(metrics_names['val']))),
        columns=list(hyperparameters.keys()) + metrics_names['val'])

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(hyperparameters)

    # Fixed audio front-end settings for this experiment.
    one_audio_sec = 10
    expt_conf['sample_rate'] = 4000
    seq_len = 50
    load_func = set_load_func(expt_conf['sample_rate'], one_audio_sec)
    process_func = set_process_func(expt_conf['model_type'], seq_len)
    expt_conf = set_data_paths(expt_dir, expt_conf)
    groups = None

    def _pattern_name(pattern):
        # File-system-safe identifier for one grid point ('/' breaks paths).
        return '_'.join(str(p).replace('/', '-') for p in pattern)

    def experiment(pattern, expt_conf):
        """Train one grid point on a private conf copy; return
        (validation metric series, validation predictions)."""
        for i, param in enumerate(hyperparameters.keys()):
            expt_conf[param] = pattern[i]

        expt_conf['model_path'] = str(
            expt_dir / f"{_pattern_name(pattern)}.pth")
        expt_conf['log_id'] = f"{_pattern_name(pattern)}"

        with mlflow.start_run():
            result_series, val_pred, _ = typical_train(
                expt_conf, load_func, label_func, process_func,
                dataset_cls, groups)
            mlflow.log_params({
                hyperparameter: value
                for hyperparameter, value in zip(hyperparameters.keys(),
                                                 pattern)
            })
            mlflow.log_artifacts(expt_dir)

        return result_series, val_pred

    # For debugging
    if expt_conf['n_parallel'] == 1:
        result_pred_list = [
            experiment(pattern, deepcopy(expt_conf)) for pattern in patterns
        ]
    else:
        # Disable nested parallelism inside each joblib worker.
        expt_conf['n_jobs'] = 0
        result_pred_list = Parallel(
            n_jobs=expt_conf['n_parallel'], verbose=0)([
                delayed(experiment)(pattern, deepcopy(expt_conf))
                for pattern in patterns
            ])

    # Stringify pattern values so mixed-type grids fit the DataFrame.
    val_results.iloc[:, :len(hyperparameters)] = [[str(param)
                                                   for param in p]
                                                  for p in patterns]
    result_list = np.array([result for result, pred in result_pred_list])
    val_results.iloc[:, len(hyperparameters):] = result_list
    pp.pprint(val_results)
    pp.pprint(val_results.iloc[:, len(hyperparameters):].describe())
    val_results.to_csv(expt_dir / 'val_results.csv', index=False)
    print(f"Devel results saved into {expt_dir / 'val_results.csv'}")

    for pattern in patterns:
        # NOTE(review): experiment() mutates a deepcopy, so this dumps the
        # same base conf for every pattern — confirm intent.
        dump_dict(expt_dir / f'{_pattern_name(pattern)}.txt', expt_conf)

    # Train with train + devel dataset
    if expt_conf['test']:
        best_trial_idx = val_results['uar'].argmax()
        best_pattern = patterns[best_trial_idx]
        for i, param in enumerate(hyperparameters.keys()):
            expt_conf[param] = best_pattern[i]
        dump_dict(expt_dir / 'best_parameters.txt',
                  {p: v for p, v in zip(hyperparameters.keys(),
                                        best_pattern)})

        metrics, pred_dict_list, _ = typical_experiment(
            expt_conf, load_func, label_func, process_func, dataset_cls,
            groups)

        sub_name = f"sub_{_pattern_name(best_pattern)}.csv"
        # Raw class probabilities alongside the hard-label submission.
        pd.DataFrame(pred_dict_list['test']).to_csv(
            expt_dir / f'{sub_name}_prob.csv', index=False, header=None)
        # +1 shifts 0-based argmax indices to the 1-based submission labels.
        pd.DataFrame(pred_dict_list['test'].argmax(axis=1) + 1).to_csv(
            expt_dir / sub_name, index=False, header=None)
        print(f"Submission file is saved in {expt_dir / sub_name}")

    mlflow.end_run()
def main(expt_conf, hyperparameters, typical_train_func):
    """Grid-search via *typical_train_func*, then (on the test path)
    ensemble fold predictions, score UAR, and append a results row.

    Parameters
    ----------
    expt_conf : dict
        Experiment configuration; mutated in place.
    hyperparameters : dict[str, list]
        Parameter name -> candidate values; grid is the Cartesian product.
    typical_train_func : callable
        Training entry point with the same signature as ``typical_train``.
    """
    if expt_conf['expt_id'] == 'timestamp':
        expt_conf['expt_id'] = dt.today().strftime('%Y-%m-%d_%H:%M')

    expt_dir = Path(
        __file__).resolve().parents[1] / 'output' / expt_conf['expt_id']
    # Fix: sibling driver creates this directory; without it
    # logging.basicConfig(filename=...) raises FileNotFoundError on a
    # fresh expt_id.
    expt_dir.mkdir(exist_ok=True, parents=True)

    logging.basicConfig(level=logging.DEBUG,
                        format="[%(name)s] [%(levelname)s] %(message)s",
                        filename=expt_dir / 'expt.log')

    expt_conf['class_names'] = [0, 1]
    metrics_names = {
        'train': ['loss', 'uar'],
        'val': ['loss', 'uar'],
        'test': ['loss', 'uar']
    }
    expt_conf['sample_rate'] = 44100
    expt_conf, groups = set_data_paths(expt_conf)

    patterns = list(itertools.product(*hyperparameters.values()))
    val_results = pd.DataFrame(
        np.zeros((len(patterns),
                  len(hyperparameters) + len(metrics_names['val']))),
        columns=list(hyperparameters.keys()) + metrics_names['val'])

    dataset_cls = ManifestWaveDataSet
    process_func = None

    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(hyperparameters)

    def _pattern_name(pattern):
        # File-system-safe identifier for one grid point ('/' breaks paths).
        return '_'.join(str(p).replace('/', '-') for p in pattern)

    def experiment(pattern, expt_conf):
        """Train one grid point on a private conf copy; return
        (validation metric series, validation predictions)."""
        for i, param in enumerate(hyperparameters.keys()):
            expt_conf[param] = pattern[i]

        expt_conf['model_path'] = str(
            expt_dir / f"{_pattern_name(pattern)}.pth")
        expt_conf['log_id'] = f"{_pattern_name(pattern)}"

        with mlflow.start_run():
            result_series, val_pred, _ = typical_train_func(
                expt_conf, load_func, label_func, process_func,
                dataset_cls, groups)
            mlflow.log_params({
                hyperparameter: value
                for hyperparameter, value in zip(hyperparameters.keys(),
                                                 pattern)
            })
            # mlflow.log_artifacts(expt_dir)

        return result_series, val_pred

    # For debugging
    if expt_conf['n_parallel'] == 1:
        result_pred_list = [
            experiment(pattern, deepcopy(expt_conf)) for pattern in patterns
        ]
    else:
        # Disable nested parallelism inside each joblib worker.
        expt_conf['n_jobs'] = 0
        result_pred_list = Parallel(
            n_jobs=expt_conf['n_parallel'], verbose=0)([
                delayed(experiment)(pattern, deepcopy(expt_conf))
                for pattern in patterns
            ])

    val_results.iloc[:, :len(hyperparameters)] = patterns
    result_list = np.array([result for result, pred in result_pred_list])
    val_results.iloc[:, len(hyperparameters):] = result_list
    pp.pprint(val_results)
    pp.pprint(val_results.iloc[:, len(hyperparameters):].describe())
    val_results.to_csv(expt_dir / 'val_results.csv', index=False)
    print(f"Devel results saved into {expt_dir / 'val_results.csv'}")

    for pattern in patterns:
        # NOTE(review): experiment() mutates a deepcopy, so this dumps the
        # same base conf for every pattern — confirm intent.
        dump_dict(expt_dir / f'{_pattern_name(pattern)}.txt', expt_conf)

    # Train with train + devel dataset
    if expt_conf['test']:
        best_trial_idx = val_results['uar'].argmax()
        best_pattern = patterns[best_trial_idx]
        for i, param in enumerate(hyperparameters.keys()):
            expt_conf[param] = best_pattern[i]
        dump_dict(expt_dir / 'best_parameters.txt',
                  {p: v for p, v in zip(hyperparameters.keys(),
                                        best_pattern)})

        # Dead code removed: `train_df = pd.read_csv(...)` was computed
        # and never used (an unnecessary file read).
        metrics, pred_dict_list, _ = typical_experiment(
            expt_conf, load_func, label_func, process_func, dataset_cls,
            groups)

        if expt_conf['return_prob']:
            # Sum fold-wise probabilities, then take the argmax class.
            ensemble_pred = np.argmax(np.array([
                pred_dict['test'] for pred_dict in pred_dict_list
            ]).sum(axis=0), axis=1)
        else:
            # Majority vote over fold-wise hard predictions.
            ensemble_pred = stats.mode(np.array(
                [pred_dict['test'] for pred_dict in pred_dict_list]),
                axis=0)[0][0]

        _, test_labels = load_func(expt_conf['test_path'])
        uar = balanced_accuracy_score(test_labels, ensemble_pred)
        print(f'{uar:.05f}')
        print(
            f'Confusion matrix: \n{confusion_matrix(test_labels, ensemble_pred)}'
        )

        sub_name = f"sub_{_pattern_name(best_pattern)}_{uar:.04f}.csv"
        pd.DataFrame(ensemble_pred).to_csv(expt_dir / sub_name, index=False)
        print(f"Submission file is saved in {expt_dir / sub_name}")

        # Append one summary row (devel best UAR, test UAR) to the shared
        # per-model results file one level above expt_dir.
        result_file_name = f"results_{expt_conf['model_type']}_{expt_conf['target']}_{expt_conf['test_data_kind']}.csv"
        with open(expt_dir.parent / result_file_name, 'a') as f:
            f.write(
                f"{expt_conf['n_splits']},{expt_conf['feature']},{val_results['uar'].max()},{uar}\n"
            )

    mlflow.end_run()