def test_estimator():
    """Run a single-estimator (gradient boosting) SMBO benchmark per dataset.

    Reads ``args.rep``, ``args.run_count`` and ``args.datasets`` from the
    module-level argument namespace; artifacts land under 'data/<dataset_id>'.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    rep_num = args.rep
    run_count = args.run_count
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # NOTE(review): the '%d' placeholder is never interpolated before the
        # literal template string is passed as task_name — confirm intended.
        task_format = dataset + '_est_%d'
        X, y, _ = load_data(dataset)
        dm = DataManager(X, y)
        # FIX: np.random.random_integers is deprecated and removed in modern
        # NumPy; randint(1, MAX_INT + 1) draws from the same inclusive range
        # [1, MAX_INT].
        seed = np.random.randint(1, MAX_INT + 1)
        for optimizer in ['smbo']:
            cls = Classifier(include_models=['gradient_boosting'],
                             optimizer=optimizer,
                             seed=seed).fit(dm, metric='accuracy',
                                            runcount=run_count,
                                            task_name=task_format)
            print(cls.predict(X))
def test_hyperspace():
    """Benchmark the ts_smbo optimizer under update modes 2 and 3.

    Iterates module-level ``datasets`` and run ids in
    ``range(start_run, rep_num)``; any exception aborts the sweep with a
    best-effort message rather than propagating.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                # FIX: np.random.random_integers is deprecated/removed;
                # randint(1, MAX_INT + 1) matches its inclusive [1, MAX_INT].
                seed = np.random.randint(1, MAX_INT + 1)
                for update_mode in [2, 3]:
                    task_format = dataset + '_mode_%d_%d' % (update_mode, run_id)
                    cls = Classifier(optimizer='ts_smbo',
                                     seed=seed).fit(dm, metric='accuracy',
                                                    runcount=run_count,
                                                    task_name=task_format,
                                                    update_mode=update_mode)
                    print(cls.predict(X))
    except Exception as e:
        # Deliberate best-effort: report and stop instead of crashing the run.
        print(e)
        print('Exit!')
def evaluate_c():
    """Experiment 5 (eval_c): sweep the `param` value p for the mono_smbo
    optimizer on each dataset; test accuracy per (run_id, p) is pickled
    under data/<dataset_id>/.

    Uses module-level `args`, `os`, `get_seeds`, `load_data`,
    `train_test_split`, `DataManager`, `Classifier` and `pickle`.
    """
    rep_num = 10
    run_count = 500
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_c'
    print(rep_num, run_count, datasets, task_id)
    for dataset in datasets:
        # Make directories.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        result = dict()
        # One seed per run id; the same seed is reused across all p values
        # so that only p varies within a run.
        seeds = get_seeds(dataset, start_id + rep_num)
        for run_id in range(start_id, start_id + rep_num):
            seed = seeds[run_id]
            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            # Test each optimizer algorithm:
            for p in [1, 4, 10, 14, 16, 20]:
                task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count, run_id, p)
                mode = 3
                optimizer = 'mono_smbo'
                print('Test %s optimizer => %s' % (optimizer, task_name))
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm, metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_name,
                                                update_mode=mode, param=p)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, run_id, p, optimizer)
                result[key_id] = acc
        # Display and save the test result.
        print(result)
        with open(
                'data/%s/%s_test_%s_%d_%d_%d.pkl' % (
                    dataset_id, dataset, task_id, run_count, rep_num, start_id),
                'wb') as f:
            pickle.dump(result, f)
def test_auto():
    """Smoke-test the CASH search on breast-cancer data: five repetitions
    each with the 'ts_smac' and then the 'smac' optimizer, restricted to
    three tree-based model families."""
    from sklearn.datasets import load_breast_cancer
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier

    features, labels = load_breast_cancer(return_X_y=True)
    # Classifier(exclude_models=['libsvm_svc']).fit(DataManager(features, labels))
    candidate_models = ['adaboost', 'gradient_boosting', 'random_forest']

    for optimizer_name in ['ts_smac', 'smac']:
        for _ in range(5):
            Classifier(include_models=candidate_models,
                       optimizer=optimizer_name).fit(DataManager(features, labels))
def test_cash_module():
    """Train an MLP-only AlphaML classifier on 'lyqdata.xlsx' and evaluate
    AUC on 'lyqtestdata.xlsx'; the AUC list is pickled to result.pkl.

    Column 0 of each sheet holds the integer label; features start at
    column 1. Uses module-level `args` (ensemble_size, run_count).
    NOTE(review): this file defines several functions named
    test_cash_module — later definitions shadow earlier ones.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    from sklearn.metrics import roc_auc_score
    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("lyqdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []
        # Rows before index 2 are skipped — presumably header rows; verify
        # against the spreadsheet layout.
        for i in range(2, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))
        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'random_forest', 'logistic_regression', 'mlp'],
            include_models=['mlp'],
            optimizer='smbo',
            cross_valid=False,
            ensemble_method='ensemble_selection',
            ensemble_size=args.ensemble_size,
            save_dir='data/save_models'
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)
        # Test sheet: data rows start at index 1 here.
        sheet = xlrd.open_workbook("lyqtestdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_test = []
        y_test = []
        for i in range(1, nrows):
            X_test.append(sheet.row_values(i, start_colx=1))
            y_test.append(int(sheet.cell_value(i, 0)))
        pred = cls.predict_proba(X_test)
        # AUC against column 1 (kept 2-D via the 1:2 slice) of the
        # predicted probabilities.
        result.append(roc_auc_score(y_test, pred[:, 1:2]))
    print(result)
    import pickle
    with open('result.pkl', 'wb') as f:
        pickle.dump(result, f)
def test_cash_module():
    """Train an AlphaML bagging ensemble on the Keratoconus spreadsheet and
    report multi-column AUC on a held-out 20% split; AUCs are pickled to
    result.pkl.

    Column 0 holds the integer label; features start at column 1. Uses
    module-level `args` (ensemble_size, run_count) and `np`.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder
    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("ybai_Keratoconus_TJ_20190425.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []
        # Row 0 is skipped — presumably a header row.
        for i in range(1, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))
        # Fit the one-hot encoder on the FULL label set before splitting so
        # the later transform of y_test knows every class.
        encoder = OneHotEncoder()
        encoder.fit(np.reshape(y_train, (len(y_train), 1)))
        X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                            test_size=0.2,
                                                            stratify=y_train)
        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'xgboost', 'random_forest', 'logistic_regression', 'mlp'],
            optimizer='smbo',
            ensemble_method='bagging',
            ensemble_size=args.ensemble_size,
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)
        pred = cls.predict_proba(X_test)
        print(pred)
        # One-hot the test labels so roc_auc_score compares them against the
        # full predicted-probability matrix.
        y_test = encoder.transform(np.reshape(y_test, (len(y_test), 1))).toarray()
        result.append(roc_auc_score(y_test, pred))
    print(result)
    import pickle
    with open('result.pkl', 'wb') as f:
        pickle.dump(result, f)
def test_cash_module():
    """Fit an ensemble classifier on the Santander training CSV and write a
    Kaggle-style submission file with one (ID, prediction) row per test
    example. Uses module-level `pd` and `Classifier`."""
    df = pd.read_csv("data/cls_data/santander/train.csv")
    df = df.drop(columns=["ID"])
    cls = Classifier(include_models=['xgboost', 'random_forest', 'decision_tree'],
                     optimizer='baseline',
                     ensemble_method='ensemble_selection',
                     ensemble_size=30,
                     ).fit(df, metric='auc', runcount=1)
    df = pd.read_csv("data/cls_data/santander/test.csv")
    # Snapshot the raw values BEFORE dropping 'ID': column 0 of `data` is the
    # ID written into the submission rows below.
    data = df.values
    df = df.drop(columns=["ID"])
    pred2 = cls.predict(df)
    print(pred2)
    import csv
    with open('data/cls_data/santander/submission.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ID', 'TARGET'])
        for i in range(len(pred2)):
            line = [int(data[i, 0]), pred2[i]]
            writer.writerow(line)
def test_no_free_lunch():
    """For every dataset and repetition, run a single-model SMBO search for
    each algorithm in the module-level `algo_list` and print the fitted
    model's predictions on the full feature matrix.

    Uses module-level `datasets`, `rep_num`, `run_count`, `algo_list`,
    `get_seeds` and `train_test_split`.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data

    for dataset in datasets:
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(rep_num):
            seed = seeds[run_id]
            # Hold out 20% as a (currently unused) stratified test split.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            data_mgr = DataManager(X_train, y_train)
            for algo in algo_list:
                for optimizer in ['smbo']:
                    # Task name encodes dataset, algorithm, budget and run.
                    task_format = dataset + '_' + algo + '_%d_%d'
                    task_name = task_format % (run_count, run_id)
                    model = Classifier(include_models=[algo],
                                       optimizer=optimizer,
                                       seed=seed).fit(data_mgr,
                                                      metric='accuracy',
                                                      runcount=run_count,
                                                      task_name=task_name)
                    print(model.predict(X))
def test_hyperspace():
    """Benchmark SMBO while growing the candidate-algorithm pool
    (first n_est entries of the module-level `algo_list`).

    NOTE(review): this redefines test_hyperspace — an earlier definition in
    this file is shadowed. Any exception aborts the sweep with a best-effort
    message rather than propagating.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                # FIX: np.random.random_integers is deprecated/removed;
                # randint(1, MAX_INT + 1) matches its inclusive [1, MAX_INT].
                seed = np.random.randint(1, MAX_INT + 1)
                for n_est in [1, 2, 4, 8, 12]:
                    algos = algo_list[:n_est]
                    task_format = dataset + '_hp_%d_%d' % (n_est, run_id)
                    cls = Classifier(include_models=algos,
                                     optimizer='smbo',
                                     seed=seed).fit(dm, metric='accuracy',
                                                    runcount=run_count,
                                                    task_name=task_format)
                    print(cls.predict(X))
    except Exception as e:
        # Deliberate best-effort: report and stop instead of crashing the run.
        print(e)
        print('Exit!')
def test_claim():
    """Compare a full CASH search against re-running only the winning
    estimator with the same evaluation budget it consumed.

    For each dataset/run: run SMBO over all models, load the saved search
    trace, pick the best estimator (breaking performance ties by frequency),
    then run a single-model search with that estimator for as many counts as
    it received originally. Collected (full, single) perf traces are printed
    at the end.
    """
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    perfs_list = list()
    for dataset in datasets:
        for run_id in range(rep_num):
            X, y, _ = load_data(dataset)
            dm = DataManager(X, y)
            # FIX: np.random.random_integers is deprecated/removed;
            # randint(1, MAX_INT + 1) matches its inclusive [1, MAX_INT].
            seed = np.random.randint(1, MAX_INT + 1)
            task_format = dataset + '_claim_%d'
            for optimizer in ['smbo']:
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm, metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_format % run_id)
                print(cls.predict(X))

            # Load the search trace written by the run above.
            file_id = 'data/%s/%s_claim_%d_%s.data' % (dataset, dataset, run_id, 'smac')
            with open(file_id, 'rb') as f:
                data = pickle.load(f)
            best_id = np.argmax(data['perfs'])
            best_value = data['perfs'][best_id]
            if data['perfs'].count(best_value) > 1:
                # Tie on best perf: choose the estimator appearing most often
                # among the tied configurations.
                stats = dict()
                for conf, perf in zip(data['configs'], data['perfs']):
                    if perf == best_value:
                        est = conf['estimator']
                        if est not in stats:
                            stats[est] = 0
                        stats[est] += 1
                # BUG FIX: np.argmax(stats.values()) wraps the dict view in a
                # 0-d object array and always returns 0; materialize the
                # counts as a list so argmax sees a 1-D array.
                tmp_id = np.argmax(list(stats.values()))
                best_estimator = list(stats.keys())[tmp_id]
                print('=' * 20, best_value, stats)
            else:
                best_estimator = data['configs'][best_id]['estimator']
                print('=' * 20, data['perfs'][best_id], data['configs'][best_id])

            # Budget = number of evaluations the winner received originally.
            run_cnts = len([
                item for item in data['configs']
                if item['estimator'] == best_estimator
            ])
            task_format = dataset + '_claim_single_%d'
            # `optimizer` deliberately reuses the loop variable ('smbo').
            cls = Classifier(include_models=[best_estimator],
                             optimizer=optimizer,
                             seed=seed).fit(dm, metric='accuracy',
                                            runcount=run_cnts,
                                            task_name=task_format % run_id)
            print(cls.predict(X))
            file_id = 'data/%s/%s_claim_single_%d_%s.data' % (
                dataset, dataset, run_id, 'smac')
            with open(file_id, 'rb') as f:
                data_s = pickle.load(f)
            print('=' * 20 + 'single', max(data_s['perfs']))
            perfs_list.append((data['perfs'], data_s['perfs']))
    for item in perfs_list:
        item1, item2 = item
        print(len(item1), max(item1), len(item2), max(item2))
    print('=' * 50)
    print(perfs_list)
def test_cash_module():
    """Main CASH benchmark driver: run each optimizer named in
    ``args.opt_algo`` on each dataset/run and pickle test accuracies.

    Optimizer names are encoded strings ('baseline_<mode>', 'sh_<eta>',
    'rl_<mode>_<eta>', 'ts_smbo[_x_<mode>]', 'mcmc_ts_<mode>_<eta>_<r>',
    'ucb_smbo[_x_<mode>]', 'mono_smbo[_x_<mode>_<r>]') parsed below into
    (optimizer, mode, eta, r) before fitting.
    """
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    optimizer_algos = args.opt_algo.split(',')
    task_id = args.task_id
    print(rep_num, run_count, datasets, optimizer_algos, task_id)
    result = dict()
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(start_id, rep_num):
            task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
            seed = seeds[run_id]
            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            # Test each optimizer algorithm:
            for optimizer in optimizer_algos:
                # Parse the parameters for each optimizer.
                mode = 2
                eta, r = 2, 2
                # NOTE(review): a bare 'baseline' (no suffix) would make this
                # 2-value unpack raise — callers appear to always pass
                # 'baseline_<mode>'; confirm.
                if optimizer.startswith('baseline'):
                    optimizer, mode = optimizer.split('_')
                    mode = 1 if mode == 'rand' else 2
                if optimizer.startswith('sh'):
                    if len(optimizer.split('_')) == 2:
                        optimizer, eta = optimizer.split('_')
                        eta = float(eta)
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('rl'):
                    if len(optimizer.split('_')) == 3:
                        # NOTE(review): eta stays a string here (no float());
                        # compare with test_exp2_evaluation which converts it.
                        _, mode, eta = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'rl_smbo'
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('ts_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                    optimizer = 'ts_smbo'
                if optimizer.startswith('mcmc_ts'):
                    _, _, mode, eta, r = optimizer.split('_')
                    mode = int(mode)
                    eta = int(eta)
                    r = int(r)
                    optimizer = 'mcmc_ts_smbo'
                if optimizer.startswith('ucb_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                    optimizer = 'ucb_smbo'
                if optimizer.startswith('mono_smbo'):
                    mode = 2
                    if len(optimizer.split('_')) == 4:
                        _, _, mode, r = optimizer.split('_')
                        mode, r = int(mode), int(r)
                        eta = 10
                    optimizer = 'mono_smbo'
                print('Test %s optimizer => %s' % (optimizer, task_name))
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm, metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_name,
                                                update_mode=mode,
                                                eta=eta, r=r, param=eta)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id, optimizer)
                result[key_id] = acc
    # Display and save the test result.
    # NOTE(review): the path uses dataset_id from the LAST dataset iterated.
    print(result)
    with open(
            'data/%s/%s_test_result_%s_%d_%d_%d.pkl' % (
                dataset_id, dataset_id, task_id, run_count, rep_num, start_id),
            'wb') as f:
        pickle.dump(result, f)
# Script entry: compare a plain logistic-regression baseline against an
# AlphaML classifier, optionally augmenting the data with generated
# (auto-cross) features first. Relies on module-level imports for
# `warnings`, `argparse`, `load_data`, `DataManager`, `LogisticRegression`,
# `accuracy_score`, `AutoFeature` and `Classifier`.
from time import time

warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser()
parser.add_argument("--generated_feature", type=int, default=1)
parser.add_argument("--dataset", type=str)
args = parser.parse_args()
x, y, c = load_data(args.dataset)
dm = DataManager(x, y)
# Baseline: logistic-regression accuracy on the validation split.
lr = LogisticRegression()
lr.fit(dm.train_X, dm.train_y)
y_pred = lr.predict(dm.val_X)
print("original lr accu:", accuracy_score(dm.val_y, y_pred), flush=True)
# Optionally generate crossed features; args.generated_feature doubles as
# the feature-generation budget passed to AutoFeature.fit.
if args.generated_feature > 0:
    af = AutoFeature("accuracy", "auto_cross")
    af.fit(dm, args.generated_feature)
    dm = af.transform(dm)
clf = Classifier()
start_time = time()
clf.fit(dm, metric="accuracy", runcount=50)
print("alphaml time:", time() - start_time)
print("dataset:", args.dataset)
print("generated data:", args.generated_feature, ", alphaml score:",
      clf.score(dm.val_X, dm.val_y))
def test_exp4_runtime():
    """Experiment 4 (runtime): compare mono_smbo, smbo and tpe under either a
    fixed runcount or a time budget B, then build an ensemble from each
    search trace and record CASH/ensemble accuracies per run.

    TPE has no runcount semantics comparable to the others, so its budget is
    estimated as the minimum mean evaluation count observed for the
    preceding optimizers — hence tpe MUST be last in optimizer_algos.
    Results are pickled per opt_algo under data/<dataset_id>/.
    """
    rep_num = args.rep
    run_count = args.run_count
    B = args.B
    # A positive time budget B replaces the evaluation-count budget.
    if B > 0:
        run_count = 0
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp4_runtime"
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        dm = DataManager(X_train, y_train)
        runcount_dict = dict()
        tpe_runcount = 0.
        optimizer_algos = ['mono_smbo_4', 'smbo', 'tpe']
        # optimizer_algos = ['mono_smbo_3_0']
        # Test each optimizer algorithm:
        assert optimizer_algos[-1] == 'tpe'
        for opt_algo in optimizer_algos:
            # if algo is tpe, we need to estimate its runcount in one hour.
            if opt_algo != 'tpe':
                runcount_dict[opt_algo] = list()
            else:
                count_list = list()
                for key in runcount_dict.keys():
                    count_list.append(np.mean(runcount_dict[key]))
                assert len(count_list) > 0
                tpe_runcount = np.min(count_list)
                print('=' * 50, tpe_runcount)
            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('mono_smbo'):
                mode = 2
                if len(opt_algo.split('_')) == 3:
                    _, _, mode = opt_algo.split('_')
                    mode = int(mode)
                    eta = 10
                optimizer = 'mono_smbo'
            else:
                optimizer = opt_algo
            print('Test optimizer: %s' % optimizer)
            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                if B > 0:
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, B, run_count, run_id)
                else:
                    task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
                seed = seeds[run_id]
                # TPE uses the estimated budget; everyone else the fixed one.
                runcount_const = run_count if opt_algo != 'tpe' else tpe_runcount
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm, metric='accuracy',
                                                runcount=runcount_const,
                                                runtime=B,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=eta)
                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)
                # Load CASH intermediate infos.
                if optimizer == 'smbo':
                    file_id = 'smac'
                elif optimizer == 'tpe':
                    file_id = 'hyperopt'
                elif optimizer == 'mono_smbo':
                    file_id = 'mm_bandit_%d_smac' % mode
                else:
                    raise ValueError('Invalid optimizer!')
                tmp_task_id = '%s_%d' % (task_id, B) if B > 0 else task_id
                tmp_configs, tmp_perfs = load_infos(dataset, tmp_task_id,
                                                    run_count, run_id, file_id)
                # Record how many evaluations this optimizer actually did —
                # feeds the TPE budget estimate above.
                if opt_algo != 'tpe':
                    runcount_dict[opt_algo].append(len(tmp_configs))
                model_infos = (tmp_configs, tmp_perfs)
                ensemble_size = 50
                task_type = type_of_target(dm.train_y)
                if optimizer == 'tpe':
                    task_type = 'hyperopt_' + task_type
                metric = accuracy_score
                # Build an ensemble from the search trace and score it on
                # validation and test data.
                ensemble_model = EnsembleSelection(model_infos, ensemble_size,
                                                   task_type, metric, n_best=20)
                ensemble_model.fit(dm)
                ens_val_pred = ensemble_model.predict(dm.val_X)
                ens_val_acc = accuracy_score(ens_val_pred, dm.val_y)
                ens_pred = ensemble_model.predict(X_test)
                ens_test_acc = accuracy_score(ens_pred, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id, optimizer)
                result[key_id] = [cash_test_acc, ens_val_acc, ens_test_acc]
                print(result)
            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' % (
                        dataset_id, dataset, opt_algo, task_id,
                        run_count, rep_num, start_id),
                    'wb') as f:
                pickle.dump(result, f)
def test_exp2_evaluation():
    """Experiment 2: evaluate cmab_ts and rl_smbo optimizer variants on each
    dataset; test-set accuracy per run is pickled per optimizer under
    data/<dataset_id>/."""
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp_2_evaluation"
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)
        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        dm = DataManager(X_train, y_train)
        # optimizer_algos = ['cmab_ts', 'rl_1_0.3', 'rl_2_1', 'rl_3_0']
        optimizer_algos = ['cmab_ts', 'rl_2_1', 'rl_3_0']
        # Test each optimizer algorithm:
        for opt_algo in optimizer_algos:
            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            # 'rl_<mode>_<eta>' strings map to the 'rl_smbo' optimizer.
            if opt_algo.startswith('rl'):
                if len(opt_algo.split('_')) == 3:
                    _, mode, eta = opt_algo.split('_')
                    mode = int(mode)
                    optimizer = 'rl_smbo'
                    eta = float(eta)
                else:
                    raise ValueError('Wrong params!')
            else:
                optimizer = opt_algo
            print('Test optimizer: %s' % optimizer)
            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
                seed = seeds[run_id]
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm, metric='accuracy',
                                                runcount=run_count,
                                                runtime=None,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=eta)
                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id, optimizer)
                result[key_id] = [cash_test_acc]
                print(result)
            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' % (
                        dataset_id, dataset, opt_algo, task_id,
                        run_count, rep_num, start_id),
                    'wb') as f:
                pickle.dump(result, f)
# Titanic preprocessing + AlphaML training script chunk. Relies on
# module-level `df`, `train_size`, `y_train`, `home_path`, `pd`,
# `DataManager` and `Classifier` defined earlier in the file.
df["Sex"] = df["Sex"].replace(["male", "female"], [0, 1])
df.drop(columns="Ticket", axis=1, inplace=True)
# BUG FIX: the original per-row loop assigned via chained indexing
# (df["Cabin"][i] = ...), which hits pandas' SettingWithCopy pitfall and
# assumes a RangeIndex. This vectorized form keeps identical semantics:
# 0.0 where Cabin == "C23 C25 C27", else 1.0 (NaN compares unequal -> 1.0),
# already cast to float as the original astype("float") did.
df["Cabin"] = (df["Cabin"] != "C23 C25 C27").astype("float")
df = pd.get_dummies(df)
x = df.values
# First train_size rows are training data; the rest is the test set.
x_train = x[:train_size]
x_test = x[train_size:]
dm = DataManager()
dm.train_X = x_train
dm.train_y = y_train
clf = Classifier(optimizer="smbo")
clf.fit(dm, metric="accuracy", runcount=200)
submission = pd.read_csv(home_path + "/datasets/titanic/gender_submission.csv")
submission["Survived"] = clf.predict(x_test)
submission.to_csv(home_path + "/datasets/titanic/xgboost.csv", index=False)
def evaluate_k():
    """Experiment 5 (eval_k): vary the candidate-pool size k (prefixes of
    algo_list, k in {15,12,8,4,2,1}) for several CASH optimizers and record
    test accuracy per (run_id, k). Results are pickled per optimizer under
    data/<dataset_id>/."""
    algo_list = [
        'xgboost', 'liblinear_svc', 'gradient_boosting', 'decision_tree',
        'passive_aggressive', 'qda', 'random_forest', 'sgd', 'extra_trees',
        'lda', 'gaussian_nb', 'libsvm_svc', 'logistic_regression', 'adaboost',
        'k_nearest_neighbors'
    ]
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_k'
    print(rep_num, run_count, datasets, task_id)
    for dataset in datasets:
        # Make directories.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        dm = DataManager(X_train, y_train)
        # opt_algos = ['mono_smbo_3_0', 'smbo', 'baseline_2', 'tpe']
        opt_algos = ['mono_smbo_3_0', 'smbo', 'baseline_2']
        for algo in opt_algos:
            result = dict()
            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                seed = seeds[run_id]
                # Test each optimizer algorithm:
                for n_est in [15, 12, 8, 4, 2, 1]:
                    # Candidate pool = first n_est algorithms.
                    algos = algo_list[:n_est]
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count, run_id, n_est)
                    mode, param = 3, None
                    if algo.startswith('mono_smbo'):
                        optimizer = 'mono_smbo'
                        mode, param = 3, 10
                    elif algo.startswith('baseline'):
                        optimizer = 'baseline'
                        mode = 2
                    else:
                        optimizer = algo
                    print('Test %s optimizer => %s' % (optimizer, task_name))
                    # Construct the AutoML classifier.
                    cls = Classifier(optimizer=optimizer,
                                     seed=seed,
                                     include_models=algos).fit(
                        dm, metric='accuracy', runcount=run_count,
                        task_name=task_name, update_mode=mode, param=param)
                    acc = cls.score(X_test, y_test)
                    key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, n_est, run_id, optimizer)
                    result[key_id] = acc
            # Display and save the test result.
            print(result)
            with open(
                    'data/%s/%s_test_%s_%d_%d_%d.pkl' % (
                        dataset_id, dataset, algo, run_count, rep_num, start_id),
                    'wb') as f:
                pickle.dump(result, f)