import numpy as np

from api import Benchmark


def read_data(bench_dir, datasets):
    bench = Benchmark(bench_dir, cache=False)
    # Assumes every dataset exposes the same number of configurations.
    n_configs = bench.get_number_of_configs(datasets[0])
    # One validation-accuracy curve per (dataset, config) pair.
    data = [
        bench.query(dataset_name=d, tag="Train/val_accuracy", config_id=ind)
        for d in datasets for ind in range(n_configs)
    ]
    configs = [
        bench.query(dataset_name=d, tag="config", config_id=ind)
        for d in datasets for ind in range(n_configs)
    ]
    dataset_names = [d for d in datasets for ind in range(n_configs)]
    # The final value of each curve is the regression target.
    y = np.array([curve[-1] for curve in data])
    return np.array(configs), y, np.array(dataset_names)
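# Hypothetical usage of the loader above (the file name matches the snippet
# further below; the dataset names are illustrative): configurations become
# the features and the final validation accuracy the target.
configs, y, dataset_names = read_data("data_2k_lw.json",
                                      ["Fashion-MNIST", "adult"])
print(configs.shape, y.shape, dataset_names.shape)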
def read_data(bench_dir, dataset_name):
    bench = Benchmark(bench_dir, cache=False)
    n_configs = bench.get_number_of_configs(dataset_name)

    # Query API: collect the config and every "Train/" curve per configuration.
    data = []
    for config_id in range(n_configs):
        data_point = dict()
        data_point["config"] = bench.query(
            dataset_name=dataset_name, tag="config", config_id=config_id)
        for tag in bench.get_queriable_tags(dataset_name=dataset_name,
                                            config_id=config_id):
            if tag.startswith("Train/"):
                data_point[tag] = bench.query(
                    dataset_name=dataset_name, tag=tag, config_id=config_id)
        data.append(data_point)

    # Split: 50% train, 25% validation, 25% test (the data is already shuffled).
    indices = np.arange(n_configs)
    ind_train = indices[0:int(np.floor(0.5 * n_configs))]
    ind_val = indices[int(np.floor(0.5 * n_configs)):int(np.floor(0.75 * n_configs))]
    ind_test = indices[int(np.floor(0.75 * n_configs)):]
    array_data = np.array(data)
    train_data = array_data[ind_train]
    val_data = array_data[ind_val]
    test_data = array_data[ind_test]

    # Cut curves for validation and test.
    cut_position = 11
    val_data, val_targets = cut_data(val_data, cut_position)
    test_data, test_targets = cut_data(test_data, cut_position)
    # Cut training curves at 51, as the last value is repeated.
    train_data, train_targets = cut_data(train_data, 51)

    return train_data, val_data, test_data, train_targets, val_targets, test_targets
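# `cut_data` is called above but not defined in this snippet. A minimal
# sketch, assuming it truncates every "Train/" curve at `cut_position` and
# takes the final validation accuracy of the full curve as the target:
def cut_data(data, cut_position):
    inputs, targets = [], []
    for data_point in data:
        cut_point = dict(data_point)
        for tag, curve in data_point.items():
            if tag.startswith("Train/"):
                cut_point[tag] = curve[:cut_position]
        inputs.append(cut_point)
        targets.append(data_point["Train/val_accuracy"][-1])
    return np.array(inputs), np.array(targets)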
import os

import pandas as pd

# Install all requirements first, then import the benchmark API.
from api import Benchmark

# Download from https://ndownloader.figshare.com/files/21188598 and unzip.
bench_dir = "data_2k_lw.json"
bench = Benchmark(bench_dir, cache=True)

path = "data/runs/mlp_results/"
dataset_names = bench.get_dataset_names()
openml_task_ids = bench.get_openml_task_ids()
os.makedirs(path, exist_ok=True)

for task in dataset_names:
    nrun = bench.get_number_of_configs(task)
    # One row per configuration: the config plus its final metrics.
    rows = []
    for config_id in range(nrun):
        row = dict(bench.query(dataset_name=task, tag="config",
                               config_id=config_id))
        for tag in ("final_val_accuracy", "final_test_accuracy",
                    "final_val_balanced_accuracy",
                    "final_test_balanced_accuracy"):
            row[tag] = bench.query(dataset_name=task, tag=tag,
                                   config_id=config_id)
        rows.append(row)
    df = pd.DataFrame(rows)
    data = []
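    # The original script is truncated here (`data` is initialized but never
    # filled). A hypothetical continuation that persists each per-task frame
    # under `path` (the file naming is an assumption, not from the original):
    df.to_csv(os.path.join(path, task + ".csv"), index=False)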