def operate(self, dm_list: typing.List, phase='train'):
    # The input of a NaiveSelectorOperator is a list of DataManager
    """Stack the features of every DataManager column-wise, then apply the
    wrapped feature selector.

    In the 'train' phase the selector is fitted on the stacked training data
    and a new DataManager is built from the selected features; in any other
    phase the already-fitted selector only transforms the stacked test data.

    :param dm_list: list of DataManager objects; their X matrices are stacked
        horizontally, so all must have the same number of rows.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager holding the selected features.
    """
    self.check_phase(phase)
    x = None
    y = None
    if phase == 'train':
        for dm in dm_list:
            if x is None:
                # Labels are taken from the first DataManager only.
                x = dm.train_X
                y = dm.train_y
            else:
                x = np.hstack((x, dm.train_X))
        x = self.selector.fit_transform(x, y)
        # NOTE(review): 'spilt' looks like a typo for 'split', but it matches
        # the DataManager constructor used elsewhere in this project — confirm
        # against the DataManager signature before renaming.
        dm = DataManager(x, y, spilt=False)
    else:
        for dm in dm_list:
            if x is None:
                x = dm.test_X
            else:
                x = np.hstack((x, dm.test_X))
        x = self.selector.transform(x)
        dm = DataManager()
        dm.test_X = x
    return dm
def predict_from_dirctory(self, dirname, target_shape=(224, 224, 3), **kwargs):
    """Predict on images stored under `dirname`.

    Builds a DataManager that points at the directory (with the expected
    image shape) and delegates to the parent predict.

    NOTE(review): the method name misspells 'directory', but it is part of the
    public interface and is therefore kept.

    :param dirname: directory containing the images to predict on.
    :param target_shape: (height, width, channels) the images are resized to.
    :return: self, to allow fluent chaining.
    """
    dm = DataManager()
    dm.test_dir = dirname
    dm.target_shape = target_shape
    super().predict(dm, **kwargs)
    return self
def test_cash_module():
    """Smoke-test the CASH regression pipeline on the Boston dataset.

    Trains a Regressor (TPE optimizer, blending ensemble, xgboost/mlp
    excluded) on an 80/20 split and collects the test MSE. Relies on the
    module-level `args` for ensemble_size and run_count.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.regressor import Regressor
    from alphaml.datasets.rgs_dataset.dataset_loader import load_data
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    import random

    result = []
    for i in range(1):
        x, y, _ = load_data('boston')
        train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2)
        dm = DataManager(train_x, train_y)
        cls = Regressor(
            exclude_models=['xgboost', 'mlp'],
            # include_models=['mlp'],
            optimizer='tpe',
            ensemble_method='blending',
            ensemble_size=args.ensemble_size,
        ).fit(dm, metric='mse', update_mode=2, runcount=args.run_count)
        pred = cls.predict(test_x)
        print(pred)
        result.append(mean_squared_error(test_y, pred))
    print(result)
def test_hyperspace():
    """Evaluate update modes 2 and 3 of the ts_smbo optimizer on every
    dataset in the module-level `datasets`.

    Each (dataset, run) pair is fitted once per update mode and the
    training-set predictions are printed. Exceptions are caught and printed
    so a long sweep is not aborted by one failing configuration.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                # np.random.random_integers was deprecated and later removed
                # from NumPy; randint over [0, MAX_INT) is an equivalent
                # seed source.
                seed = np.random.randint(MAX_INT)
                for update_mode in [2, 3]:
                    task_format = dataset + '_mode_%d_%d' % (update_mode, run_id)
                    cls = Classifier(optimizer='ts_smbo', seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count,
                        task_name=task_format, update_mode=update_mode)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
def test_estimator():
    """Run a gradient-boosting-only SMBO search on each dataset from the
    module-level `args` and print the training-set predictions.

    A results directory data/<dataset-id>/ is created when missing.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    rep_num = args.rep
    run_count = args.run_count
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        # makedirs also creates the parent 'data' directory when missing,
        # matching the other experiment scripts; os.mkdir would fail there.
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)
        # NOTE(review): task_format keeps an unformatted '%d' placeholder —
        # there is no run loop here, so it is passed through verbatim; confirm
        # this is intended.
        task_format = dataset + '_est_%d'
        X, y, _ = load_data(dataset)
        dm = DataManager(X, y)
        # np.random.random_integers was removed from NumPy; randint over
        # [0, MAX_INT) is an equivalent seed source.
        seed = np.random.randint(MAX_INT)
        for optimizer in ['smbo']:
            cls = Classifier(include_models=['gradient_boosting'],
                             optimizer=optimizer, seed=seed).fit(
                dm, metric='accuracy', runcount=run_count,
                task_name=task_format)
            print(cls.predict(X))
def operate(self, dm_list: typing.List, phase='train'):
    """Derive a single new feature: the number of zero-valued columns in
    each sample.

    The same counting logic previously appeared twice (once per phase); it
    is factored into one local helper so train and test cannot drift apart.

    :param dm_list: list containing exactly one DataManager.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager whose X is an (n_samples, 1) count matrix;
        labels are carried over in the 'train' phase.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]

    def _zero_counts(x):
        # For each sample, count how many of its columns equal 0.
        counts = np.zeros((len(x), 1))
        for i, sample in enumerate(x):
            cnt = 0
            for column in sample:
                if column == 0:
                    cnt += 1
            counts[i] = cnt
        return counts

    result_dm = DataManager()
    if phase == 'train':
        result_dm.train_X = _zero_counts(dm.train_X)
        result_dm.train_y = dm.train_y
    else:
        result_dm.test_X = _zero_counts(dm.test_X)
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    # The input of a AutoCrossOperator is a DataManager
    """Apply AutoCross feature crossing to the single DataManager in dm_list.

    In the 'train' phase the AutoCross model is fitted on an internal 80/20
    split of the training data (stratified by label when self.stratify is
    set) over the one-hot and numerical columns, and then the *full*
    training matrix is transformed. In any other phase the previously
    fitted model only transforms the test data.

    :param dm_list: list containing exactly one DataManager.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager holding the crossed features.
    """
    assert len(dm_list) == 1
    dm = dm_list[0]
    assert isinstance(dm, DataManager)
    self.check_phase(phase)
    feature_types = dm.feature_types
    onehot_index = [i for i in range(len(feature_types))
                    if feature_types[i] == "One-Hot"]
    numerical_index = [i for i in range(len(feature_types))
                       if feature_types[i] == 'Discrete' or feature_types[i] == 'Float']
    if phase == 'train':
        from sklearn.model_selection import train_test_split
        if self.stratify:
            train_x, val_x, train_y, val_y = train_test_split(
                dm.train_X, dm.train_y, test_size=0.2, stratify=dm.train_y)
        else:
            train_x, val_x, train_y, val_y = train_test_split(
                dm.train_X, dm.train_y, test_size=0.2)
        x = dm.train_X
        self.autocross.fit(train_x, val_x, train_y, val_y,
                           onehot_index, numerical_index)
        result_dm = DataManager()
        # The fitted model transforms the full (unsplit) training matrix.
        result_dm.train_X = self.autocross.transform(x)
        result_dm.train_y = dm.train_y
    else:
        x = dm.test_X
        result_dm = DataManager()
        result_dm.test_X = self.autocross.transform(x)
    return result_dm
def one_hot(dm: DataManager) -> DataManager:
    """
    Convert the categorical features to float with one-hot encoding.

    Train/validation/test matrices are stacked so the encoder sees every
    category present anywhere in the data, encoded in one pass, then split
    back into their original partitions. dm.feature_types is rewritten so
    the new indicator columns come first (typed "One-Hot"), followed by the
    remaining original columns.

    :param dm: DataManager with train_X set (val/test optional).
    :return: the same DataManager, mutated in place.
    :raises ValueError: if dm has no training data.
    """
    feature_types = dm.feature_types
    categorical_index = [i for i in range(len(feature_types))
                         if feature_types[i] == "Categorical"]
    other_index = [i for i in range(len(feature_types))
                   if feature_types[i] != "Categorical"]
    encoder = OneHotEncoder(handle_unknown="ignore")
    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()
    # Validate before calling len(): the previous order raised a confusing
    # TypeError (len(None)) instead of the intended ValueError.
    if train_x is None:
        raise ValueError("train_x has no value!!!")
    train_size = len(train_x)
    valid_size = 0
    test_size = 0
    if valid_x is not None and test_x is not None:
        x = np.concatenate([train_x, valid_x, test_x])
        valid_size = len(valid_x)
        test_size = len(test_x)
    elif valid_x is not None:
        x = np.concatenate([train_x, valid_x])
        valid_size = len(valid_x)
    else:
        # NOTE(review): a test set without a validation set is ignored here
        # (only train_x is encoded) — confirm this combination cannot occur.
        x = train_x
    categorical_x = x[:, categorical_index]
    other_x = x[:, other_index]
    encoder.fit(categorical_x)
    categorical_x = encoder.transform(categorical_x).toarray()
    categorical_features = ["One-Hot"] * categorical_x.shape[1]
    other_features = [feature_types[i] for i in other_index]
    # np.float was removed in NumPy 1.24; the builtin float is equivalent.
    x = np.hstack((categorical_x, other_x)).astype(float)
    dm.feature_types = np.concatenate((categorical_features, other_features))
    train_x, valid_x, test_x = _split_data(x, train_size, valid_size, test_size)
    if valid_size == 0:
        valid_x = None
    if test_size == 0:
        test_x = None
    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x
    return dm
def evaluate_c():
    """Experiment 5: sweep the `param` value p of the mono_smbo optimizer.

    For each dataset, each run id and each p in {1, 4, 10, 14, 16, 20},
    fit a Classifier on an 80/20 stratified split and record the test
    accuracy. Per-dataset results are pickled under data/<dataset-id>/.
    Relies on the module-level `args`, `get_seeds` and `load_data`.
    """
    rep_num = 10
    run_count = 500
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_c'
    print(rep_num, run_count, datasets, task_id)
    for dataset in datasets:
        # Make directories.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        result = dict()
        seeds = get_seeds(dataset, start_id + rep_num)
        for run_id in range(start_id, start_id + rep_num):
            seed = seeds[run_id]
            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            # Test each optimizer algorithm:
            for p in [1, 4, 10, 14, 16, 20]:
                task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count, run_id, p)
                mode = 3
                optimizer = 'mono_smbo'
                print('Test %s optimizer => %s' % (optimizer, task_name))
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer, seed=seed).fit(
                    dm, metric='accuracy', runcount=run_count,
                    task_name=task_name, update_mode=mode, param=p)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, run_id, p, optimizer)
                result[key_id] = acc
        # Display and save the test result.
        print(result)
        with open(
                'data/%s/%s_test_%s_%d_%d_%d.pkl' % (
                    dataset_id, dataset, task_id, run_count, rep_num, start_id),
                'wb') as f:
            pickle.dump(result, f)
def test_categorical_indexer():
    """Exercise categorical_indexer on a tiny mixed-type DataManager and
    print the converted train/val/test matrices."""
    dm = DataManager()
    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]
    dm.train_X = np.array([["a", 1, "python", 4.5],
                           ["b", 2, "c++", 6.8],
                           ["c", 10, "java", 4.8]])
    dm.val_X = np.array([["a", 1, "scala", 4.5],
                         ["c", 2, "c++", 6.8],
                         ["d", 10, "python", 4.8]])
    dm.test_X = np.array([["a", 1, "scala", 4.5]])

    dm = categorical_indexer(dm)

    separator = "----------------------------"
    print(dm.feature_types)
    print(dm.train_X)
    print(separator)
    print(dm.val_X)
    print(separator)
    print(dm.test_X)
def operate(self, dm_list: typing.List, phase='train'):
    '''
    Rank features by importance and keep only the top self.kbest columns.

    The X matrices of all DataManagers in dm_list are stacked column-wise.
    In the 'train' phase the underlying selector model is (re)fitted first.
    The ranking is then derived from the fitted model in *both* phases:
    feature_importances_ for the random-forest selector, or coef_ for the
    lasso selector (aggregated with the column-wise L1 norm when coef_ is
    2-D), and the columns are reordered by decreasing score before
    truncation.

    NOTE(review): the lasso branch ranks by signed coefficients, not
    |coef_| — large negative weights rank last; confirm this is intended.

    :param dm_list: list of DataManager objects with equal row counts.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager holding the kbest selected columns.
    '''
    x = None
    y = None
    if phase == 'train':
        for dm in dm_list:
            if x is None:
                # Labels come from the first DataManager only.
                x = dm.train_X
                y = dm.train_y
            else:
                x = np.hstack((x, dm.train_X))
        self.selector.fit(x, y)
    else:
        for dm in dm_list:
            if x is None:
                x = dm.test_X
            else:
                x = np.hstack((x, dm.test_X))
    if self.model == self.RANDOM_FOREST:
        self.sorted_features = np.argsort(
            self.selector.feature_importances_)[::-1]
    elif self.model == self.LASSO_REGRESSION:
        if self.selector.coef_.ndim == 1:
            self.sorted_features = np.argsort(self.selector.coef_)[::-1]
        else:
            # Multi-output lasso: aggregate per-feature weights with L1 norm.
            importances = np.linalg.norm(self.selector.coef_, axis=0, ord=1)
            self.sorted_features = np.argsort(importances)[::-1]
    x = x[:, self.sorted_features[:self.kbest]]
    dm = DataManager()
    if phase == 'train':
        dm.train_X = x
        dm.train_y = y
    else:
        dm.test_X = x
    return dm
def test_impute_dm():
    """Exercise impute_dm on a tiny mixed-type table: "???" placeholders in
    the categorical training columns and np.nan in numeric validation/test
    cells, then print the imputed matrices.
    """
    train_x = np.array([["a", 1, "python", 4.5],
                        ["b", 2, "c++", 6.8],
                        ["c", 10, "java", 4.8]])
    valid_x = np.array([["a", 1, "scala", 4.5],
                        ["c", 2, "c++", 6.8],
                        ["d", 10, "python", 4.8]])
    test_x = np.array([["a", 1, "scala", 4.5]])
    # Convert to object dtype BEFORE inserting the missing values: assigning
    # np.nan into a unicode array silently stores the string 'nan' instead of
    # a real NaN. (np.object was removed in NumPy 1.24; the builtin `object`
    # is the supported spelling.)
    train_x = train_x.astype(object)
    valid_x = valid_x.astype(object)
    test_x = test_x.astype(object)
    train_x[2][0] = "???"
    train_x[2][2] = "???"
    valid_x[0][1] = np.nan
    test_x[0][-1] = np.nan
    dm = DataManager()
    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]
    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x
    dm = impute_dm(dm, "???")
    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
def fit(self, data: DataManager, **kwargs):
    """Prepare label encodings and the network output size for an image
    task, then delegate to the parent fit.

    Behaviour per kwargs['task_type']:
      - 'img_multiclass': train/val labels are index-mapped and one-hot
        encoded; the class count comes from the reverse mapping.
      - 'img_binary': train/val labels are mapped to binary form; one
        output unit.
      - 'img_multilabel-indicator': labels are already indicator vectors;
        the class count is read from them.

    When no in-memory training data is present (directory-based flow), the
    input shape is taken from data.target_shape instead of train_X.

    :param data: DataManager carrying either in-memory arrays or directory
        metadata.
    :return: whatever the parent fit returns.
    """
    from alphaml.engine.evaluator.dl_evaluator import BaseImgEvaluator
    task_type = kwargs['task_type']
    if data.train_X is None and data.train_y is None:
        # Directory-based flow: shape is declared on the DataManager.
        inputshape = data.target_shape
        classnum = None
    else:
        inputshape = data.train_X.shape[1:]
        classnum = None
    if task_type == 'img_multiclass':
        data.train_y, self.map_dict, self.rev_map_dict = map_label(data.train_y)
        data.train_y = to_categorical(data.train_y)
        # Validation labels reuse the mapping learned from the training set.
        data.val_y, _, _ = map_label(data.val_y, self.map_dict)
        data.val_y = to_categorical(data.val_y)
        classnum = len(self.rev_map_dict)
    elif task_type == 'img_binary':
        data.train_y, self.map_dict, self.rev_map_dict = map_label(
            data.train_y, if_binary=True)
        data.val_y, _, _ = map_label(data.val_y, self.map_dict, if_binary=True)
        classnum = 1
    elif task_type == 'img_multilabel-indicator':
        classnum = get_classnum(data.train_y)
    self.evaluator = BaseImgEvaluator(inputshape, classnum)
    return super().fit(data, **kwargs)
def operate(self, dm_list: typing.List, phase='train'):
    """Merge a list of DataManagers by stacking their feature matrices
    column-wise; no selection or transformation is applied.

    In the 'train' phase the labels of the first DataManager are kept and a
    new DataManager is built from the stacked training data; otherwise the
    stacked test data is returned in a fresh DataManager.

    :param dm_list: list of DataManager objects with equal row counts.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager holding the merged features.
    """
    self.check_phase(phase)
    x = None
    y = None
    if phase == 'train':
        for dm in dm_list:
            if x is None:
                x = dm.train_X
                y = dm.train_y
            else:
                x = np.hstack((x, dm.train_X))
        # NOTE(review): 'spilt' looks like a typo for 'split', but it matches
        # the DataManager constructor used elsewhere in this project — confirm.
        dm = DataManager(x, y, spilt=False)
    else:
        for dm in dm_list:
            if x is None:
                x = dm.test_X
            else:
                x = np.hstack((x, dm.test_X))
        dm = DataManager()
        dm.test_X = x
    return dm
def test_cash_module():
    """Train an MLP-only classifier on lyqdata.xlsx and report AUC on
    lyqtestdata.xlsx; the score list is pickled to result.pkl.

    Column 0 of each sheet is the integer label; the remaining columns are
    features. Training rows start at sheet row 2, test rows at row 1.
    Relies on the module-level `args` for ensemble_size and run_count.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    from sklearn.metrics import roc_auc_score

    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("lyqdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []
        # NOTE(review): the inner loops shadow the outer loop variable `i`;
        # harmless while the outer range stays 1.
        for i in range(2, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))
        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'random_forest', 'logistic_regression', 'mlp'],
            include_models=['mlp'],
            optimizer='smbo',
            cross_valid=False,
            ensemble_method='ensemble_selection',
            ensemble_size=args.ensemble_size,
            save_dir='data/save_models'
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)
        sheet = xlrd.open_workbook("lyqtestdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_test = []
        y_test = []
        for i in range(1, nrows):
            X_test.append(sheet.row_values(i, start_colx=1))
            y_test.append(int(sheet.cell_value(i, 0)))
        pred = cls.predict_proba(X_test)
        # AUC on the positive-class probability column.
        result.append(roc_auc_score(y_test, pred[:, 1:2]))
    print(result)
    import pickle
    with open('result.pkl', 'wb') as f:
        pickle.dump(result, f)
def test_cash_module():
    """Train a bagging-ensemble classifier on the Keratoconus spreadsheet
    and report multi-class AUC on a stratified 20% hold-out split; the
    score list is pickled to result.pkl.

    Column 0 of the sheet is the integer label; the remaining columns are
    features. Relies on the module-level `args` for ensemble_size and
    run_count.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder

    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("ybai_Keratoconus_TJ_20190425.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []
        for i in range(1, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))
        # Fit the label encoder on the full label set (before the split) so
        # every class is represented when scoring below.
        encoder = OneHotEncoder()
        encoder.fit(np.reshape(y_train, (len(y_train), 1)))
        X_train, X_test, y_train, y_test = train_test_split(
            X_train, y_train, test_size=0.2, stratify=y_train)
        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'xgboost', 'random_forest', 'logistic_regression', 'mlp'],
            optimizer='smbo',
            ensemble_method='bagging',
            ensemble_size=args.ensemble_size,
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)
        pred = cls.predict_proba(X_test)
        print(pred)
        # One-hot encode the held-out labels to score multi-class AUC.
        y_test = encoder.transform(np.reshape(y_test, (len(y_test), 1))).toarray()
        result.append(roc_auc_score(y_test, pred))
    print(result)
    import pickle
    with open('result.pkl', 'wb') as f:
        pickle.dump(result, f)
def test_no_free_lunch():
    """For each dataset and each algorithm in the module-level algo_list,
    run a single-model SMBO search and print the training-set predictions.

    Uses an 80/20 stratified split per run; relies on the module-level
    `datasets`, `algo_list`, `rep_num`, `run_count`, `get_seeds` and
    `train_test_split`.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data

    for dataset in datasets:
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(rep_num):
            seed = seeds[run_id]
            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            for algo in algo_list:
                for optimizer in ['smbo']:
                    task_format = dataset + '_' + algo + '_%d_%d'
                    cls = Classifier(
                        include_models=[algo], optimizer=optimizer,
                        seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count,
                        task_name=task_format % (run_count, run_id))
                    print(cls.predict(X))
def operate(self, dm_list: typing.List, phase='train'):
    # The input of a PCAOperator is a DataManager
    """Project the numerical ('Float'/'Discrete') columns onto the PCA basis.

    In the 'train' phase the PCA model is fitted on the training data; in
    any other phase the model fitted during training is reused.

    :param dm_list: list containing exactly one DataManager.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager holding the projected features; labels are
        carried over in the 'train' phase.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]
    feature_types = dm.feature_types
    numerical_index = [i for i in range(len(feature_types))
                       if feature_types[i] == "Float" or feature_types[i] == "Discrete"]
    if phase == 'train':
        x = dm.train_X
        result_dm = DataManager()
        result_dm.train_X = self.pca.fit_transform(x[:, numerical_index])
        result_dm.train_y = dm.train_y
    else:
        x = dm.test_X
        result_dm = DataManager()
        # Bug fix: reuse the PCA fitted during training. Calling
        # fit_transform here refitted the projection on the test data,
        # producing components inconsistent with the training features.
        result_dm.test_X = self.pca.transform(x[:, numerical_index])
    return result_dm
def fit_from_directory(self, dirname, target_shape=(224, 224, 3), valid_split=0.1, **kwargs):
    """Fit on images stored on disk.

    `dirname` may be a single directory (split into train/validation using
    `valid_split`) or a two-element list/tuple of (train_dir, valid_dir);
    a None second element falls back to the single-directory behaviour.
    The task type is forced to 'img_multilabel-indicator' and the metric
    defaults to 'acc'.

    :return: self, to allow fluent chaining.
    :raises ValueError: if a list/tuple of the wrong length is given.
    """
    dm = DataManager()
    if not isinstance(dirname, (list, tuple)):
        # Single directory: the manager performs the train/valid split.
        dm.train_valid_dir = dirname
    elif len(dirname) != 2:
        raise ValueError(
            "Expected one directory or a list or tuple of two directories for training and validation!"
        )
    elif dirname[1] is None:
        dm.train_valid_dir = dirname[0]
    else:
        dm.train_dir, dm.valid_dir = dirname
    dm.target_shape = target_shape
    dm.split_size = valid_split
    kwargs['task_type'] = 'img_multilabel-indicator'
    kwargs['metric'] = kwargs.get('metric', 'acc')
    super().fit(dm, **kwargs)
    return self
def test_hyperspace():
    """Measure how hyperspace size affects SMBO: for each dataset, run the
    classifier restricted to growing prefixes of the module-level algo_list
    (1, 2, 4, 8 and 12 algorithms) and print its training-set predictions.

    Exceptions are caught and printed so a long sweep is not aborted by one
    failing configuration.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                # np.random.random_integers was deprecated and later removed
                # from NumPy; randint over [0, MAX_INT) is an equivalent
                # seed source.
                seed = np.random.randint(MAX_INT)
                for n_est in [1, 2, 4, 8, 12]:
                    algos = algo_list[:n_est]
                    task_format = dataset + '_hp_%d_%d' % (n_est, run_id)
                    cls = Classifier(
                        include_models=algos, optimizer='smbo',
                        seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count,
                        task_name=task_format)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
def operate(self, dm_list: typing.List, phase='train') -> DataManager:
    """Generate polynomial interaction features from the numerical columns.

    Only 'Float'/'Discrete' columns are expanded. PolynomialFeatures emits
    a bias column followed by the raw inputs and then the generated terms;
    the leading bias + raw columns are sliced away so only the newly
    generated features remain. The transformer is fitted in the 'train'
    phase and reused afterwards.

    :param dm_list: list containing exactly one DataManager.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager holding only the generated features.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
    self.check_phase(phase)
    dm = dm_list[0]
    types = dm.feature_types
    num_idx = [idx for idx, kind in enumerate(types) if kind in ("Float", "Discrete")]
    # Columns to drop from the transformer output: 1 bias + the raw inputs.
    skip = len(num_idx) + 1
    result_dm = DataManager()
    if phase == 'train':
        expanded = self.polynomialfeatures.fit_transform(dm.train_X[:, num_idx])
        result_dm.train_X = expanded[:, skip:]
        result_dm.train_y = dm.train_y
    else:
        expanded = self.polynomialfeatures.transform(dm.test_X[:, num_idx])
        result_dm.test_X = expanded[:, skip:]
    return result_dm
def operate(self, dm_list: typing.List, phase='train'):
    # The input of a ImputeOperator is a pd.Dataframe
    """Impute a raw DataFrame and wrap the result in a DataManager.

    In the 'train' phase the label column (self.label_col) is moved to the
    last position so the result can be split into train_X / train_y; in any
    other phase the whole imputed matrix becomes test_X and no label column
    is declared.

    :param dm_list: list containing exactly one pd.DataFrame.
    :param phase: 'train' or a test phase accepted by self.check_phase.
    :return: a new DataManager with column types registered via set_col_type.
    """
    assert len(dm_list) == 1 and isinstance(dm_list[0], pd.DataFrame)
    self.check_phase(phase)
    input_df = dm_list[0]
    df = self.impute_df(input_df)
    dm = DataManager()
    label_col = df.columns[self.label_col] if phase == 'train' else None
    dm.set_col_type(df, label_col)
    data = df.values
    if phase == 'train':
        # Swap label index to -1
        swap_list = list(range(data.shape[1]))
        del (swap_list[self.label_col])
        swap_list.append(self.label_col)
        data = data[:, swap_list]
        dm.train_X = data[:, :-1]
        dm.train_y = data[:, -1]
    else:
        dm.test_X = data
    return dm
from alphaml.engine.components.feature_engineering.auto_feature import AutoFeature
from alphaml.estimators.classifier import Classifier
from alphaml.engine.components.data_manager import DataManager
from time import time

warnings.filterwarnings("ignore")

# CLI: --generated_feature controls how many AutoCross features to add
# (<= 0 disables augmentation); --dataset names the dataset to load.
parser = argparse.ArgumentParser()
parser.add_argument("--generated_feature", type=int, default=1)
parser.add_argument("--dataset", type=str)
args = parser.parse_args()

x, y, c = load_data(args.dataset)
dm = DataManager(x, y)

# Baseline: plain logistic regression on the original features.
lr = LogisticRegression()
lr.fit(dm.train_X, dm.train_y)
y_pred = lr.predict(dm.val_X)
print("original lr accu:", accuracy_score(dm.val_y, y_pred), flush=True)

# Optionally augment the data with AutoCross-generated features.
if args.generated_feature > 0:
    af = AutoFeature("accuracy", "auto_cross")
    af.fit(dm, args.generated_feature)
    dm = af.transform(dm)

# Run the AutoML classifier on the (possibly augmented) data and time it.
clf = Classifier()
start_time = time()
clf.fit(dm, metric="accuracy", runcount=50)
print("alphaml time:", time() - start_time)
def test_claim():
    """Compare a full CASH search against a single-model search restricted
    to the estimator that performed best in the full run.

    For each dataset/run: run SMBO over all models, load the recorded
    (config, perf) history, pick the best estimator (breaking perf ties by
    the estimator that reached the best value most often), then rerun with
    only that estimator for as many evaluations as it originally received.
    Both perf histories are collected in perfs_list and summarized.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    perfs_list = list()
    for dataset in datasets:
        for run_id in range(rep_num):
            X, y, _ = load_data(dataset)
            dm = DataManager(X, y)
            # np.random.random_integers was removed from NumPy; randint over
            # [0, MAX_INT) is an equivalent seed source.
            seed = np.random.randint(MAX_INT)
            task_format = dataset + '_claim_%d'
            for optimizer in ['smbo']:
                cls = Classifier(optimizer=optimizer, seed=seed).fit(
                    dm, metric='accuracy', runcount=run_count,
                    task_name=task_format % run_id)
                print(cls.predict(X))
            file_id = 'data/%s/%s_claim_%d_%s.data' % (dataset, dataset, run_id, 'smac')
            with open(file_id, 'rb') as f:
                data = pickle.load(f)
            best_id = np.argmax(data['perfs'])
            best_value = data['perfs'][best_id]
            if data['perfs'].count(best_value) > 1:
                # Tie on the best perf: count, per estimator, how often it
                # reached the best value and pick the most frequent one.
                stats = dict()
                for conf, perf in zip(data['configs'], data['perfs']):
                    if perf == best_value:
                        est = conf['estimator']
                        if est not in stats:
                            stats[est] = 0
                        stats[est] += 1
                # Bug fix: np.argmax over a dict view sees one 0-d object
                # and always returned 0; materialize the counts first.
                tmp_id = np.argmax(list(stats.values()))
                best_estimator = list(stats.keys())[tmp_id]
                print('=' * 20, best_value, stats)
            else:
                best_estimator = data['configs'][best_id]['estimator']
                print('=' * 20, data['perfs'][best_id], data['configs'][best_id])
            # Give the single-model rerun the same budget the winning
            # estimator consumed in the full search.
            run_cnts = len([
                item for item in data['configs']
                if item['estimator'] == best_estimator
            ])
            task_format = dataset + '_claim_single_%d'
            cls = Classifier(include_models=[best_estimator],
                             optimizer=optimizer, seed=seed).fit(
                dm, metric='accuracy', runcount=run_cnts,
                task_name=task_format % run_id)
            print(cls.predict(X))
            file_id = 'data/%s/%s_claim_single_%d_%s.data' % (
                dataset, dataset, run_id, 'smac')
            with open(file_id, 'rb') as f:
                data_s = pickle.load(f)
            print('=' * 20 + 'single', max(data_s['perfs']))
            perfs_list.append((data['perfs'], data_s['perfs']))
    for item in perfs_list:
        item1, item2 = item
        print(len(item1), max(item1), len(item2), max(item2))
        print('=' * 50)
    print(perfs_list)
def test_auto():
    """Repeatedly fit the AutoML classifier on breast-cancer data with a
    fixed three-model pool, five runs each for the ts_smac and smac
    optimizers."""
    from sklearn.datasets import load_breast_cancer
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier

    X, y = load_breast_cancer(return_X_y=True)
    # Classifier(exclude_models=['libsvm_svc']).fit(DataManager(X, y))
    model_pool = ['adaboost', 'gradient_boosting', 'random_forest']
    for optimizer in ('ts_smac', 'smac'):
        for _ in range(5):
            Classifier(include_models=model_pool,
                       optimizer=optimizer).fit(DataManager(X, y))
import pandas as pd
import warnings

from alphaml.engine.components.data_manager import DataManager
from alphaml.engine.components.feature_engineering.auto_feature import AutoFeature

warnings.filterwarnings("ignore")

# Santander customer-transaction data: drop the row id column, then split
# the training frame into feature matrix and target vector.
home_path = os.path.expanduser('~')
train_path = os.path.join(home_path, "datasets/santander/train.csv")
test_path = os.path.join(home_path, "datasets/santander/test.csv")
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_train.drop(labels=["ID_code"], axis=1, inplace=True)
df_test.drop(labels=["ID_code"], axis=1, inplace=True)
x_train = df_train.drop(labels=["target"], axis=1).values
y_train = df_train["target"].values
x_test = df_test.values
# Free the large DataFrames early; only the raw arrays are needed below.
del df_train
del df_test

dm = DataManager(x_train, y_train)
dm.test_X = x_test
# Generate 100 new features optimized for AUC.
auto_feature = AutoFeature(metrics="auc")
dm = auto_feature.fit(dm, generated_num=100)
def test_normalize(dm):
    """Normalize-rescale the given DataManager and print the result."""
    dm = normalize(dm)
    print("after normalize rescale\n")
    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)


if __name__ == '__main__':
    np.random.seed(19941125)
    # Build a small random DataManager covering all feature kinds.
    dm = DataManager()
    dm.train_X = np.random.rand(5, 5)
    dm.val_X = np.random.rand(3, 5)
    dm.test_X = np.random.rand(2, 5)
    dm.feature_types = ["Discrete", "One-Hot", "Float", "Float", "Categorical"]
    print("Original data......\n")
    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)
    print("start test MinMaxScaler.......\n")
    test_minmax(dm)
    # NOTE(review): the script appears to end after this announcement — the
    # StandardScaler test call may live in a later chunk of this file.
    print("start test StandardScaler......\n")
def test_exp2_evaluation():
    """Experiment 2: evaluate the cmab_ts and rl_smbo optimizers.

    For each dataset, each optimizer spec ('rl_<mode>_<eta>' strings are
    parsed into rl_smbo parameters) and each run id, fit a Classifier on an
    80/20 stratified split and record the test accuracy. Per-optimizer
    results are pickled under data/<dataset-id>/. Relies on the
    module-level `args`, `get_seeds`, `load_data` and `train_test_split`.
    """
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp_2_evaluation"
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        dm = DataManager(X_train, y_train)
        # optimizer_algos = ['cmab_ts', 'rl_1_0.3', 'rl_2_1', 'rl_3_0']
        optimizer_algos = ['cmab_ts', 'rl_2_1', 'rl_3_0']
        # Test each optimizer algorithm:
        for opt_algo in optimizer_algos:
            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('rl'):
                if len(opt_algo.split('_')) == 3:
                    _, mode, eta = opt_algo.split('_')
                    mode = int(mode)
                    optimizer = 'rl_smbo'
                    eta = float(eta)
                else:
                    raise ValueError('Wrong params!')
            else:
                optimizer = opt_algo
            print('Test optimizer: %s' % optimizer)
            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
                seed = seeds[run_id]
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer, seed=seed).fit(
                    dm, metric='accuracy', runcount=run_count, runtime=None,
                    task_name=task_name, update_mode=mode, param=eta)
                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id, optimizer)
                result[key_id] = [cash_test_acc]
            print(result)
            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' % (
                        dataset_id, dataset, opt_algo, task_id,
                        run_count, rep_num, start_id),
                    'wb') as f:
                pickle.dump(result, f)
def test_exp4_runtime():
    """Experiment 4: runtime comparison of mono_smbo, smbo and tpe.

    When a runtime budget B (> 0) is given via `args.B`, run_count is
    zeroed and the budgeted run is used instead. tpe has no runcount-based
    budget of its own, so it must run last: its evaluation count is set to
    the minimum of the mean counts the other optimizers actually achieved.
    For every run an EnsembleSelection model is additionally built from the
    recorded (config, perf) history and scored on validation and test data.
    Per-optimizer results are pickled under data/<dataset-id>/.
    """
    rep_num = args.rep
    run_count = args.run_count
    B = args.B
    if B > 0:
        # Runtime-budgeted mode: the count budget is disabled.
        run_count = 0
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp4_runtime"
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        dm = DataManager(X_train, y_train)
        runcount_dict = dict()
        tpe_runcount = 0.
        optimizer_algos = ['mono_smbo_4', 'smbo', 'tpe']
        # optimizer_algos = ['mono_smbo_3_0']
        # Test each optimizer algorithm:
        assert optimizer_algos[-1] == 'tpe'
        for opt_algo in optimizer_algos:
            # if algo is tpe, we need to estimate its runcount in one hour.
            if opt_algo != 'tpe':
                runcount_dict[opt_algo] = list()
            else:
                count_list = list()
                for key in runcount_dict.keys():
                    count_list.append(np.mean(runcount_dict[key]))
                assert len(count_list) > 0
                tpe_runcount = np.min(count_list)
                print('=' * 50, tpe_runcount)
            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('mono_smbo'):
                mode = 2
                if len(opt_algo.split('_')) == 3:
                    _, _, mode = opt_algo.split('_')
                    mode = int(mode)
                eta = 10
                optimizer = 'mono_smbo'
            else:
                optimizer = opt_algo
            print('Test optimizer: %s' % optimizer)
            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                if B > 0:
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, B, run_count, run_id)
                else:
                    task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
                seed = seeds[run_id]
                runcount_const = run_count if opt_algo != 'tpe' else tpe_runcount
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer, seed=seed).fit(
                    dm, metric='accuracy', runcount=runcount_const, runtime=B,
                    task_name=task_name, update_mode=mode, param=eta)
                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)
                # Load CASH intermediate infos.
                if optimizer == 'smbo':
                    file_id = 'smac'
                elif optimizer == 'tpe':
                    file_id = 'hyperopt'
                elif optimizer == 'mono_smbo':
                    file_id = 'mm_bandit_%d_smac' % mode
                else:
                    raise ValueError('Invalid optimizer!')
                tmp_task_id = '%s_%d' % (task_id, B) if B > 0 else task_id
                tmp_configs, tmp_perfs = load_infos(
                    dataset, tmp_task_id, run_count, run_id, file_id)
                if opt_algo != 'tpe':
                    # Record how many evaluations this optimizer achieved so
                    # tpe's budget can be derived from it later.
                    runcount_dict[opt_algo].append(len(tmp_configs))
                model_infos = (tmp_configs, tmp_perfs)
                ensemble_size = 50
                task_type = type_of_target(dm.train_y)
                if optimizer == 'tpe':
                    task_type = 'hyperopt_' + task_type
                metric = accuracy_score
                ensemble_model = EnsembleSelection(
                    model_infos, ensemble_size, task_type, metric, n_best=20)
                ensemble_model.fit(dm)
                ens_val_pred = ensemble_model.predict(dm.val_X)
                ens_val_acc = accuracy_score(ens_val_pred, dm.val_y)
                ens_pred = ensemble_model.predict(X_test)
                ens_test_acc = accuracy_score(ens_pred, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id, optimizer)
                result[key_id] = [cash_test_acc, ens_val_acc, ens_test_acc]
            print(result)
            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' % (
                        dataset_id, dataset, opt_algo, task_id,
                        run_count, rep_num, start_id),
                    'wb') as f:
                pickle.dump(result, f)
def test_cash_module():
    """Benchmark a configurable list of CASH optimizers on the given
    datasets.

    Optimizer specs from `args.opt_algo` are parsed into an optimizer name
    plus parameters (update mode, eta, r) using underscore-separated
    encodings, e.g. 'rl_<mode>_<eta>', 'sh_<eta>', 'mcmc_ts_<mode>_<eta>_<r>',
    'mono_smbo_<mode>_<r>'. Each (dataset, run) pair is fitted on an 80/20
    stratified split and the test accuracy is saved per dataset under
    data/<dataset-id>/. Relies on module-level `args`, `get_seeds`,
    `load_data` and `train_test_split`.
    """
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    optimizer_algos = args.opt_algo.split(',')
    task_id = args.task_id
    print(rep_num, run_count, datasets, optimizer_algos, task_id)
    result = dict()
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(start_id, rep_num):
            task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
            seed = seeds[run_id]
            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            # Test each optimizer algorithm:
            for optimizer in optimizer_algos:
                # Parse the parameters for each optimizer.
                mode = 2
                eta, r = 2, 2
                if optimizer.startswith('baseline'):
                    optimizer, mode = optimizer.split('_')
                    mode = 1 if mode == 'rand' else 2
                if optimizer.startswith('sh'):
                    if len(optimizer.split('_')) == 2:
                        optimizer, eta = optimizer.split('_')
                        eta = float(eta)
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('rl'):
                    if len(optimizer.split('_')) == 3:
                        _, mode, eta = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'rl_smbo'
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('ts_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                    optimizer = 'ts_smbo'
                if optimizer.startswith('mcmc_ts'):
                    _, _, mode, eta, r = optimizer.split('_')
                    mode = int(mode)
                    eta = int(eta)
                    r = int(r)
                    optimizer = 'mcmc_ts_smbo'
                if optimizer.startswith('ucb_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                    optimizer = 'ucb_smbo'
                if optimizer.startswith('mono_smbo'):
                    mode = 2
                    if len(optimizer.split('_')) == 4:
                        _, _, mode, r = optimizer.split('_')
                        mode, r = int(mode), int(r)
                    eta = 10
                    optimizer = 'mono_smbo'
                print('Test %s optimizer => %s' % (optimizer, task_name))
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer, seed=seed).fit(
                    dm, metric='accuracy', runcount=run_count,
                    task_name=task_name, update_mode=mode, eta=eta, r=r,
                    param=eta)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id, optimizer)
                result[key_id] = acc
        # Display and save the test result.
        print(result)
        with open(
                'data/%s/%s_test_result_%s_%d_%d_%d.pkl' % (
                    dataset_id, dataset_id, task_id,
                    run_count, rep_num, start_id),
                'wb') as f:
            pickle.dump(result, f)