def generate_data(n=10000, d=11, data_type='Syn1', seed=1, output_dir='.'):
    """Generate a synthetic dataset, write it to CSV, and build CV partitions.

    :param n: Number of samples
    :param d: input dimension
    :param data_type: the name of the syn dataset ('Syn1'..'Syn6')
    :param seed: random seed for numpy
    :param output_dir: base directory; data is written to
        ``<output_dir>/<data_type>_<d>/data.csv``
    :raises ValueError: if ``data_type`` is not one of 'Syn1'..'Syn6'
    """
    np.random.seed(seed)
    # X generation
    X = np.random.randn(n, d)
    # Y generation
    if data_type in ['Syn1', 'Syn2', 'Syn3']:
        Y = Basic_Label_Generation(X, data_type)
    elif data_type in ['Syn4', 'Syn5', 'Syn6']:
        Y = Complex_Label_Generation(X, data_type)
    else:
        # Fail fast with a clear message instead of an unbound-`Y` NameError below.
        raise ValueError('Unknown data_type: {}'.format(data_type))
    # Labels are appended as the last column, matching what
    # create_dataset_partitions expects for stratification.
    data = np.concatenate([X, np.expand_dims(Y, axis=1)], axis=1)
    output_dir = os.path.join(output_dir, '{}_{}'.format(data_type, str(d)))
    create_dir(output_dir)
    output_path = os.path.join(output_dir, 'data.csv')
    pd.DataFrame(data=data).to_csv(output_path, index=False)
    create_dataset_partitions(output_path)
def cross_validation(config, model_handler, score_config, folds, gpus_list, grid_params, seeds_arr, output_dir='./grid_search'):
    """Run a seeded, k-fold cross-validated grid search.

    For every seed in ``seeds_arr`` and every fold in ``folds``: load that
    fold's train/val/test split, run a distributed grid search, and keep the
    test score of the experiment with the best validation score. A per-seed
    summary is written to ``output_dir``; fold-validity flags are printed at
    the end. Mutates ``config`` (``random_seed``, ``experiment_number``).
    """
    create_dir(output_dir)
    # Running counter of experiments launched so far, shared across seeds/folds.
    config.setdefault('experiment_number', 0)
    fold_validity = []
    for seed in seeds_arr:
        test_scores = []
        val_scores = []
        test_experiments = []
        for k in folds:
            config['random_seed'] = seed
            data = read_train_val_test_folds(
                config['csv'], k,
                apply_standardization=config['apply_standardization'])
            # NOTE(review): 'grid_parmas' matches the callee's (misspelled)
            # keyword argument, so it is reproduced verbatim here.
            all_results, is_valid = distributed_grid_search(
                gpus_list, model_handler, copy.deepcopy(config),
                grid_parmas=copy.deepcopy(grid_params),
                name='iter_{}_seed_{}_conf'.format(k, seed),
                train_args=model_handler_params(data, score_config),
                output_dir=output_dir)
            config['experiment_number'] += len(all_results)
            best_val, best_experiment, matching_test = get_best_val_result(
                all_results, score_config['score_increases'])
            test_scores.append(matching_test)
            val_scores.append(best_val)
            test_experiments.append(best_experiment)
            fold_validity.append({'fold': k, 'seed': seed, 'is valid': is_valid})
        write_cv_results(test_scores, val_scores, test_experiments,
                         output_dir, name='_{}'.format(str(seed)))
    for valid_desc in fold_validity:
        print(valid_desc)
def create_dataset_partitions(csv_path, dataset_name=None, train_proportion=0.7, k_folds=5, seed=1):
    """Write k stratified train/val/test partitions of a CSV to disk.

    The last column is treated as the label. For each of the ``k_folds``
    StratifiedKFold splits, the held-out indices become ``test.csv`` and the
    remainder is further split (stratified) into ``train.csv`` / ``val.csv``
    under ``<csv dir>/cross_validation/fold_<i>/``.
    """
    print(csv_path)
    np.random.seed(seed=seed)
    output_dir = os.path.dirname(csv_path) + '/cross_validation'
    for directory in (output_dir,
                      output_dir + '/StratifiedKFold',
                      output_dir + '/seed_{}'.format(seed)):
        create_dir(directory)
    df = pd.read_csv(csv_path)
    if dataset_name is not None:
        # Optional dataset-specific preprocessing hook.
        dataset_handler = DatasetHandler.get_dataset_handler(dataset_name)
        df = dataset_handler(df)
    # Deterministic row shuffle before splitting.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    labels = df.iloc[:, -1].values
    splitter = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed)
    # Share of the data left for validation after the test fold and train share.
    val_proportion = 1 - (1. / k_folds) - train_proportion
    for fold_idx, (train_val_idx, test_idx) in enumerate(splitter.split(df, labels)):
        fold_frame = df.copy()
        test_df = fold_frame.iloc[test_idx]
        train_val_df = fold_frame.iloc[train_val_idx]
        train_val_labels = train_val_df.iloc[:, -1].values
        train_df, val_df, _, _ = train_test_split(
            train_val_df, train_val_labels,
            random_state=seed, stratify=train_val_labels,
            test_size=val_proportion / (val_proportion + train_proportion))
        fold_dir = os.path.join(output_dir, 'fold_{}'.format(fold_idx))
        create_dir(fold_dir)
        train_df.to_csv(os.path.join(fold_dir, 'train.csv'), index=False)
        val_df.to_csv(os.path.join(fold_dir, 'val.csv'), index=False)
        test_df.to_csv(os.path.join(fold_dir, 'test.csv'), index=False)
def create_experiment_directory(config, copy_model=True, return_sub_dirs=False):
    """Create the numbered experiment directory tree and snapshot the config.

    Builds ``<experiments_dir>/<experiment_number>`` with ``weights`` and
    ``logs`` subdirectories and dumps the configuration as JSON. When
    ``copy_model`` is set, the model source file is copied into the experiment
    directory for reproducibility. Returns the experiment directory, or the
    tuple (experiment, weights, logs) when ``return_sub_dirs`` is True.
    """
    experiment_dir = config['experiments_dir'] + '/{}'.format(
        config['experiment_number'])
    weights_dir = os.path.join(experiment_dir, "weights")
    logs_dir = os.path.join(experiment_dir, "logs")
    for directory in (experiment_dir, weights_dir, logs_dir):
        create_dir(directory)
    write_config(config, experiment_dir + '/configuration.json')
    if copy_model:
        # Snapshot the exact model source used for this run.
        model_source = config['models_dir'] + '/model{}.py'.format(
            config['model_number'])
        copyfile(model_source, experiment_dir + '/model.py')
    if return_sub_dirs:
        return experiment_dir, weights_dir, logs_dir
    return experiment_dir
# Sweep every ablation configuration over the benchmark competitions,
# running a cross-validated grid search for each (model, competition) pair.
for study_conf in ablation_study_configs:
    model_number = study_conf['model_number']
    grid_params = study_conf['grid']
    experiment_name = study_conf['exp_name']
    competitions = [CompetitionsHandler.GESTURE_PHASE,
                    CompetitionsHandler.EYE_MOVEMENTS,
                    CompetitionsHandler.GAS]
    for competition_name in competitions:
        shared_config['experiments_dir'] = '{}/{}Competitions/{}/experiments/{}'.format(
            base_dir, model, competition_name, experiment_name)
        output_dir = os.path.join(shared_config['experiments_dir'], 'grid_search')
        create_dir(shared_config['experiments_dir'])
        shared_config['model_number'] = model_number
        shared_config['model_name'] = model
        shared_config['competition_name'] = competition_name
        # Competition-specific config overrides the shared defaults.
        config, score_config = CompetitionsHandler.get_configs(competition_name)
        config = CompetitionsHandler.merge_configs(shared_config, config)
        cross_validation(config, ModelHandler, score_config,
                         folds=folds, gpus_list=gpus_list,
                         grid_params=grid_params, output_dir=output_dir,
                         seeds_arr=seeds_arr)
'FCN_with_feature_selection': FCN_with_feature_selection_grid}  # tail of the grid map started above this chunk

if __name__ == '__main__':
    # Entry point: sweep every synthetic dataset x model type x input dimension
    # and run a cross-validated grid search for each combination.
    base_dir = 'path/to/base/dir'  # NOTE(review): placeholder — must be set before running
    model_type_arr = ['FCN', 'FCN_with_oracle_mask', 'FCN_with_feature_selection']
    syn_names = ['Syn1', 'Syn2', 'Syn3', 'Syn4', 'Syn5', 'Syn6']
    d_arr = [11, 50, 100, 150, 200, 250, 300]  # input dimensions to sweep
    folds = [0, 1, 2, 3, 4]
    gpus_list = ['0']
    seeds_arr = [1, 2, 3]
    for syn in syn_names:
        for model_type in model_type_arr:
            for d in d_arr:
                experiment_name = 'exp_{}_model_{}_d_{}'.format(syn, model_type, d)
                # NOTE(review): `config` and `score_config` are not defined in this
                # chunk — presumably created earlier in the file; verify before use.
                config['experiments_dir'] = '{}/FeatureSelectionSynExp/experiments/{}'.format(base_dir, experiment_name)
                config['csv'] = 'data/FeatureSelectionSynExp/{}_{}/data.csv'.format(syn, d)
                config['input_dim'] = d
                # Oracle mask of the truly relevant features for this syn dataset.
                config['mask'] = Ground_Truth_Mask_Generation(d, syn)
                config['model_type'] = model_type
                output_dir = os.path.join(config['experiments_dir'], 'grid_search')
                create_dir(config['experiments_dir'])
                cross_validation(config, ModelHandler, score_config,
                                 folds=folds, gpus_list=gpus_list,
                                 grid_params=grid_map[model_type],
                                 output_dir=output_dir, seeds_arr=seeds_arr)