def run(dataset_train, dataset_test, pop_size, gens, cross_rate, fb, max_time=1200):
    """Fit a Feat regressor on the training split and report its errors.

    The last column of each dataset is used as the target and all
    preceding columns as features.

    Returns:
        A tuple ``(train_rmse, test_rmse, model)`` where the two errors come
        from the ``RMSE`` helper and ``model`` is ``est_gp.get_model()``.
    """
    train_features = dataset_train[:, :-1]
    train_target = dataset_train[:, -1]
    test_features = dataset_test[:, :-1]
    test_target = dataset_test[:, -1]

    feat_settings = dict(
        obj="fitness,complexity",
        pop_size=pop_size,
        gens=gens,
        max_time=max_time,
        max_stall=50,
        batch_size=10000,
        ml="LinearRidgeRegression",
        sel='lexicase',
        surv='nsga2',
        max_depth=10,
        # cap the representation size at twice the feature count, at most 50
        max_dim=min(train_features.shape[1] * 2, 50),
        # random_state=random_seed,
        functions="+,-,*,/,sqrt,sin,cos,tanh,exp,log,^,x,kd",
        otype="f",
        backprop=True,
        iters=10,
        n_threads=1,
        verbosity=1,
        # tuned parameters
        cross_rate=cross_rate,
        fb=fb,
        root_xo_rate=0.75,
        softmax_norm=False,
    )
    est_gp = Feat(**feat_settings)
    est_gp.fit(train_features, train_target)

    train_error = RMSE(est_gp.predict(train_features), train_target)
    test_error = RMSE(est_gp.predict(test_features), test_target)
    fitted_model = est_gp.get_model()
    return train_error, test_error, fitted_model
def test_saving_loading(self):
    """Check that a fitted estimator survives a save/load round trip.

    Predictions, representation, model string, coefficients and parameters
    of the reloaded estimator are compared with the original, and the
    reloaded estimator is finally refitted.
    """
    tolerance = 0.0001
    tmp_path = 'Feat_tmp.json'

    self.debug("Pickle Feat object")
    reg = clone(self.reg)
    reg.fit(self.X, self.yr)
    initial_pred = reg.predict(self.X)

    reg.save(tmp_path)
    loaded_reg = Feat().load(tmp_path)
    loaded_pred = loaded_reg.predict(self.X)

    # predictions must agree element-wise within the tolerance
    gaps = np.abs(initial_pred - loaded_pred)
    for idx, gap in enumerate(gaps):
        if gap > tolerance:
            print('pred:', initial_pred[idx], 'loaded:', loaded_pred[idx],
                  'diff:', gap)
        assert gap < tolerance

    assert reg.get_representation() == loaded_reg.get_representation()
    assert reg.get_model() == loaded_reg.get_model()
    assert (reg.get_coefs() == loaded_reg.get_coefs()).all()

    # every original parameter must be present and equal after loading;
    # floats are compared within the tolerance, everything else exactly
    loaded_params = loaded_reg.get_params()
    for name, value in reg.get_params().items():
        if name not in loaded_params.keys():
            print(name, 'not in ', loaded_params.keys())
        assert name in loaded_params.keys()

        if isinstance(value, float):
            if np.abs(loaded_params[name] - value) > tolerance:
                print('loaded_params[', name, '] =', loaded_params[name],
                      '\nwhich is different from:', value)
            assert np.abs(loaded_params[name] - value) < tolerance
        elif loaded_params[name] != value:
            print('loaded_params[', name, '] =', loaded_params[name],
                  '\nwhich is different from:', value)
            assert loaded_params[name] == value

    # the reloaded estimator must still be trainable
    loaded_reg.fit(self.X, self.yr)
# Load the heart data; every column except 'class' is a feature.
df = pd.read_csv('../d_heart.csv')
df.describe()
X = df.drop(columns='class').values
y = df['class'].values

# Plain (unshuffled) k-fold splitter.
n_splits = 5
kf = KFold(n_splits=n_splits)
kf.get_n_splits(X)

# Feat classifier; disabled alternatives kept from the original:
#   max_stall=20, max_dim=X.shape[1], ml='CART'
clf = Feat(
    max_depth=6,
    max_dim=min(50, 2 * X.shape[1]),
    pop_size=200,
    ml='LR',
    verbosity=1,
    shuffle=True,
    classification=True,
    backprop=True,
    random_state=42,
)
# Logistic-regression estimator fitted on the same folds.
lr = LR()

rocs, aucs = [], []
lr_rocs, lr_aucs = [], []

for train_idx, test_idx in kf.split(X):
    clf.fit(X[train_idx], y[train_idx])
    lr.fit(X[train_idx], y[train_idx])
X, y, names = read_file(dataset, classification=False) # parameter variation hyper_params = [{ 'hillclimb': [True], 'iters': [1, 10, 100], }, { 'backprop': [True], 'iters': [1, 10, 100], }] # create the classifier clf = Feat(pop_size=100, gens=100, ml="LinearRidgeRegression", sel='simanneal', surv='simanneal', max_depth=10, max_dim=min([X.shape[1] * 2, 50]), random_state=random_seed, n_threads=1, verbosity=1, logfile=save_file.split('.csv')[0] + '_' + str(random_seed) + '.csv') #functions # 10-fold CV score for the pipeline clf_name = 'FeatSimAnneal' # evaluate the model evaluate_model(dataset, save_file, random_seed, clf, clf_name, hyper_params,
'fb': [0.0,0.25,0.5,0.75,1.0] } # create the classifier clf = Feat(obj="fitness,complexity", residual_xo=True, pop_size=500, gens=200, max_time=3600, max_stall=50, use_batch=True, batch_size=1000, ml = "LinearRidgeRegression", sel='lexicase', surv='nsga2', max_depth=10, max_dim=min([X.shape[1]*2,50]), random_state=random_seed, backprop=True, iters=10, n_threads=1, verbosity=2, # tuned parameters cross_rate= 0.75, fb = 0.0, root_xo_rate = 0.5, softmax_norm = False # logfile=save_file.split('.csv')[0]+'_'+str(random_seed)+'.log' ) #functions # 10-fold CV score for the pipeline clf_name = 'FeatResXO'
import numpy as np
from feat import Feat
from sklearn.model_selection import KFold

# Patient table: 'id' is dropped, 'class' is the label, the rest are features.
df = pd.read_csv('d_example_patients.csv')
df.drop('id', axis=1, inplace=True)
X = df.drop(columns='class').values
y = df['class'].values
# File passed to fit/score together with the row indices of each fold.
zfile = 'd_example_patients_long.csv'

kf = KFold(n_splits=3)
kf.get_n_splits(X)

clf = Feat(
    max_depth=5,
    max_dim=min(50, 2 * X.shape[1]),
    verbosity=1,
    shuffle=True,
    ml='LR',
    classification=True,
    functions=("max,+,-,*,/,exp,log,and,or,not,=,<,>,ite,"
               "mean,median,min,variance,skew,kurtosis,slope,count"),
    random_state=42,
)

scores = []
for train_idx, test_idx in kf.split(X):
    clf.fit(X[train_idx], y[train_idx], zfile, train_idx)
    fold_score = clf.score(X[test_idx], y[test_idx], zfile, test_idx)
    scores.append(fold_score)

print('scores:', scores)
class TestFeatWrapper(unittest.TestCase):
    """Output-shape and smoke checks for the Feat estimator wrapper."""

    def setUp(self):
        """Create a fresh estimator and load the diabetes features/targets."""
        self.v = verbosity
        self.clf = Feat(verbosity=self.v)
        dataset = load_diabetes()
        self.X = dataset.data
        self.y = dataset.target

    # Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        """predict() returns one value per training sample."""
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        self.debug("Predicting the Results")
        predictions = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        self.assertEqual(len(predictions), len(self.y))

    # Test 2: Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        """fit_predict() returns one value per training sample."""
        self.debug("Calling fit_predict from Feat")
        predictions = self.clf.fit_predict(self.X, self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        self.assertEqual(len(predictions), len(self.y))

    # Test 3: Assert the length of labels returned from transform
    def test_transform_length(self):
        """transform() keeps the number of rows."""
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        transformed = self.clf.transform(self.X)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        self.assertEqual(transformed.shape[0], self.X.shape[0])

    # Test 4: Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        """fit_transform() keeps the number of rows."""
        self.debug("In wrappertest.py...Calling fit transform")
        transformed = self.clf.fit_transform(self.X, self.y)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        self.assertEqual(transformed.shape[0], self.X.shape[0])

    # Test 5: Transform with Z
    def test_transform_length_z(self, zfile=None, zids=None):
        """transform() with the optional zfile/zids arguments keeps the rows."""
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        transformed = self.clf.transform(self.X, zfile, zids)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        self.assertEqual(transformed.shape[0], self.X.shape[0])

    def debug(self, message):
        """Print ``message`` only when the stored verbosity is positive."""
        if not self.v > 0:
            return
        print(message)

    def test_coefs(self):
        """get_coefs() returns a non-empty collection after fitting."""
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X, self.y)
        coefficients = self.clf.get_coefs()
        print('coefs:', coefficients)
        self.assertTrue(len(coefficients) > 0)
hyper_params = [{ 'cross_rate': [0.25, 0.5, 0.75] }, { 'fb': [0.25, 0.5, 0.75] }] # create the classifier clf = Feat(obj="fitness,complexity,corr", pop_size=500, gens=200, max_time=600, max_stall=50, use_batch=True, batch_size=1000, ml="LinearRidgeRegression", sel='lexicase', surv='nsga2', max_depth=6, max_dim=min([X.shape[1] * 2, 50]), random_state=random_seed, backprop=True, iters=10, n_threads=1, verbosity=2, logfile=save_file.split('.csv')[0] + '_' + str(random_seed) + '.csv') #functions # 10-fold CV score for the pipeline clf_name = 'FeatCorr' # evaluate the model evaluate_model(dataset, save_file,
class TestFeatWrapper(unittest.TestCase):
    """Output-shape and smoke checks for the single-threaded Feat wrapper."""

    def setUp(self):
        """Create a fresh estimator and load the diabetes features/targets."""
        self.v = verbosity
        self.clf = Feat(verbosity=self.v, n_threads=1)
        dataset = load_diabetes()
        self.X = dataset.data
        self.y = dataset.target

    # Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        """predict() returns one value per training sample."""
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        self.debug("Predicting the Results")
        predictions = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        self.assertEqual(len(predictions), len(self.y))

    # Test 2: Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        """fit_predict() returns one value per training sample."""
        self.debug("Calling fit_predict from Feat")
        predictions = self.clf.fit_predict(self.X, self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        self.assertEqual(len(predictions), len(self.y))

    # Test 3: Assert the length of labels returned from transform
    def test_transform_length(self):
        """transform() keeps the number of rows."""
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        transformed = self.clf.transform(self.X)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        self.assertEqual(transformed.shape[0], self.X.shape[0])

    # Test 4: Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        """fit_transform() keeps the number of rows."""
        self.debug("In wrappertest.py...Calling fit transform")
        transformed = self.clf.fit_transform(self.X, self.y)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        self.assertEqual(transformed.shape[0], self.X.shape[0])

    # Test 5: Transform with Z
    def test_transform_length_z(self, zfile=None, zids=None):
        """transform() with the optional zfile/zids arguments keeps the rows."""
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        transformed = self.clf.transform(self.X, zfile, zids)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        self.assertEqual(transformed.shape[0], self.X.shape[0])

    def debug(self, message):
        """Print ``message`` only when the stored verbosity is positive."""
        if not self.v > 0:
            return
        print(message)

    def test_coefs(self):
        """get_coefs() returns a non-empty collection after fitting."""
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X, self.y)
        coefficients = self.clf.get_coefs()
        print('coefs:', coefficients)
        self.assertTrue(len(coefficients) > 0)

    def test_dataframe(self):
        """Fitting on a DataFrame records its column names as feature_names."""
        self.debug("In wrappertest.py...Calling test_dataframe")
        column_names = [f'fishy{i}' for i in range(self.X.shape[1])]
        dfX = pd.DataFrame(data=self.X, columns=column_names, index=None)
        dfy = pd.DataFrame(data={'label': self.y})

        self.clf.fit(dfX, dfy['label'])

        # feature_names is compared against the comma-joined names as bytes
        expected_names = ','.join(dfX.columns).encode()
        assert self.clf.feature_names == expected_names

    # Test: Assert the length of labels returned from predict
    def test_predict_stats_length(self):
        """Every entry of clf.stats has one value per generation."""
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        print("Num generations is ", self.clf.gens)
        for key in self.clf.stats:
            n_entries = len(self.clf.stats[key])
            print("Length for ", key, "is ", n_entries)
            self.assertEqual(len(self.clf.stats[key]), self.clf.gens)
import pandas as pd
from pmlb import fetch_data

# Tab-separated MNIST table with the label in the 'class' column.
df = pd.read_csv('mnist.csv', sep='\t')
print(df.columns)
X = df.drop(columns='class').values
y = df['class'].values

from feat import Feat

ft = Feat(classification=True, verbosity=2)

# The first 60000 rows are used for fitting, the remainder for scoring.
ft.fit(X[:60000], y[:60000])
holdout_score = ft.score(X[60000:], y[60000:])
print(holdout_score)
# Patient table: 'id' is dropped, 'class' is the label, the rest are features.
df = pd.read_csv('d_example_patients.csv')
df.drop('id', axis=1, inplace=True)
X = df.drop(columns='class').values
y = df['class'].values
# File passed to fit/score together with the row indices of each fold.
zfile = 'd_example_patients_long.csv'

kf = StratifiedKFold(n_splits=3)
kf.get_n_splits(X)

clf = Feat(
    max_depth=5,
    max_dim=min(50, 2 * X.shape[1]),
    gens=20,
    pop_size=100,
    verbosity=1,
    shuffle=True,
    ml='LR',
    classification=True,
    # comma-joined names of all feature columns
    feature_names=','.join(df.drop(columns='class').columns),
    functions=("+,-,*,/,exp,log,and,or,not,=,<,<=,>,>=,ite,split,split_c,"
               "mean,median,max,min,variance,skew,kurtosis,slope,count"),
    backprop=True,
    iters=10,
    random_state=42,
)

scores = []
for train_idx, test_idx in kf.split(X, y):
    # print('train_idx:',train_idx)
    clf.fit(X[train_idx], y[train_idx], zfile, train_idx)
    fold_score = clf.score(X[test_idx], y[test_idx], zfile, test_idx)
    scores.append(fold_score)

print('scores:', scores)
def train_predict(self, data, time_budget, n_class, schema):
    """Train several GNN models within a time budget and predict test labels.

    The method preprocesses ``data``, builds per-model inputs, trains up to
    ``3 * 4`` models (sage/gat/tagc/gcn, repeated three times) for as many
    epochs as the remaining time allows, and averages two prediction sets
    obtained from ``get_preds``.

    Args:
        data: Mapping that provides at least ``'fea_table'``, ``'edge_file'``
            and ``'train_label'``; ``'test_label'`` is also read when the
            module-level ``offline`` flag is truthy.
        time_budget: Total time allowance; compared against elapsed
            ``time.time()`` differences, so presumably seconds.
        n_class: Forwarded to ``ProcessData.pre_process``.
        schema: Forwarded to ``ProcessData.pre_process``.

    Returns:
        ``preds`` when ``offline`` is falsy; otherwise the tuple
        ``(preds, valid_acc1, valid_acc2, preds1, preds2)``.

    Note:
        ``offline`` and ``split_mode`` are read from the enclosing module;
        they are not defined in this method.
    """
    s1 = time.time()
    seed = SEED
    fix_seed(seed)
    LOGGER.info(f'time_budget:{time_budget}')
    LOGGER.info(f'n_class:{n_class}')
    LOGGER.info(f'node:{data["fea_table"].shape[0]}')
    LOGGER.info(f'edge:{data["edge_file"].shape[0]}')

    # pre-process data
    process_data = ProcessData(data)
    table = process_data.pre_process(time_budget, n_class, schema)

    # Feature Dimension Reduction
    feat = Feat()
    process_data.drop_unique_columns(table)
    drop_sum_columns = process_data.drop_excessive_columns(table)
    feat.fit_transform(table, drop_sum_columns)
    LOGGER.info(
        f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}'
    )

    # NOTE: this flag does not appear to be used here
    table.large_features = False
    if table.ori_columns.shape[0] > 500:
        table.large_features = True

    model_type_list = ['sage', 'gat', 'tagc', 'gcn']
    repeat = 3
    # names are "<type><running index>", e.g. sage0, gat1, ..., sage4, ...
    model_name_list = [
        f'{model_type_list[i]}{i+len(model_type_list)*j}'
        for j in range(repeat) for i in range(len(model_type_list))
    ]
    model_type_list = model_type_list * repeat

    LOGGER.info('use node embedding')
    categories = [
        'node_index', 'degree_bins', 'bin_2-neighbor_mean_degree_bins'
    ]

    # Build table.<model>_data from table.<model>_columns for every model
    # type; attribute access replaces the former eval()/exec() on strings.
    for model in set(model_type_list):
        model_columns = getattr(table, f'{model}_columns')
        LOGGER.info(f'{model} feature num:{model_columns.shape[0]}')
        setattr(
            table, f'{model}_data',
            process_data.process_gnn_data(table, model_columns, categories))

    allmodel = AllModel()

    table.lr_epoch = 16
    table.lr_list = [0.05, 0.03, 0.01, 0.0075, 0.005, 0.003, 0.001, 0.0005]

    train_valid_idx_list, valid_idx_list = split_train_and_valid(
        table, train_rate=0.8, seed=SEED, mode=split_mode)
    train_idx, test_idx = split_train_and_test(table)
    test_idx = test_idx.sort_values()

    run_model = []
    run_type = []
    run_time = {}
    for i in range(len(model_type_list)):
        seed = SEED * (i + 1)
        fix_seed(seed)
        model_type = model_type_list[i]
        model_name = model_name_list[i]

        if model_type not in run_time:
            # First model of this type: measure timings. The estimates used
            # for this model include run_lr_time; the cached values stored
            # for later models of the same type exclude it.
            init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(
                table, model_type, model_name, train_idx, test_idx, seed=seed)
            run_lr_time = len(table.lr_list) * (
                init_time + table.lr_epoch * one_epoch_time)
            run_time500 = init_time * (2) + one_epoch_time * (
                500 + early_stopping_rounds) * 2 + run_lr_time
            run_time300 = init_time * (2) + one_epoch_time * (
                300 + early_stopping_rounds) * 2 + run_lr_time
            run_time150 = init_time * (2) + one_epoch_time * (
                150 + early_stopping_rounds) * 2 + run_lr_time
            run_time[model_type] = (run_time500 - run_lr_time,
                                    run_time300 - run_lr_time,
                                    run_time150 - run_lr_time,
                                    early_stopping_rounds, init_time,
                                    one_epoch_time, run_lr_time)
        else:
            (run_time500, run_time300, run_time150, early_stopping_rounds,
             init_time, one_epoch_time, run_lr_time) = run_time[model_type]

        s2 = time.time()
        LOGGER.info(
            f"time_budget:{time_budget}s,used time:{s2-s1:.2f}s,{model_name} model will use {run_time500:.2f}s|{run_time300:.2f}s|{run_time150:.2f}s"
        )

        # Pick the largest epoch count whose estimate (plus a 5 s margin)
        # still fits into the remaining budget.
        num_boost_round = None
        for epochs, estimate in ((500, run_time500), (300, run_time300),
                                 (150, run_time150)):
            if s2 - s1 + estimate + 5 < time_budget:
                LOGGER.info(f'train {epochs} epoch')
                num_boost_round = epochs
                break

        if num_boost_round is None:
            if len(allmodel.valid_models[0]) == 0:
                # Nothing trained yet: squeeze in whatever epoch count the
                # remaining time allows.
                this_epoch = int((
                    (time_budget - (s2 - s1 + 5) - run_lr_time) / 2 -
                    init_time) / (one_epoch_time) - early_stopping_rounds)
                LOGGER.info(f'short time train {this_epoch} epoch')
                num_boost_round = this_epoch
            elif time_budget - (s2 - s1) < 5:
                LOGGER.info('never train; break')
                break
            else:
                LOGGER.info('no train this model; continue')
                continue

        allmodel.V37_fit_transform(table,
                                   model_type,
                                   model_name,
                                   train_valid_idx_list,
                                   valid_idx_list,
                                   train_idx,
                                   test_idx,
                                   mode=split_mode,
                                   num_boost_round=num_boost_round,
                                   seed=seed)
        run_model.append(model_name)
        run_type.append(model_type)

    if offline:
        if table.especial:
            df = table.df[['node_index', 'is_test']]
            df = df.merge(data['test_label'], how='left', on='node_index')
            test_label = df.loc[(df['is_test'] == 1) &
                                (table.directed_mask.tolist()),
                                'label'].astype('int').values
        else:
            test_label = data['test_label']['label'].values
    else:
        test_label = None

    def _expand_especial(partial_preds):
        """Return predictions for all test rows in the ``especial`` case.

        Rows where ``table.directed_mask`` is falsy receive the most
        frequent label among those rows; test rows where the mask is truthy
        receive ``partial_preds``.
        """
        # explicit copy: assigning a column to a slice of table.df would
        # otherwise trigger pandas' SettingWithCopy behaviour
        frame = table.df[['label', 'is_test']].copy()
        frame['preds'] = int(
            frame.loc[[not i for i in table.directed_mask.tolist()],
                      'label'].value_counts().index[0])
        frame.loc[(frame['is_test'] == 1) &
                  (table.directed_mask.tolist()), 'preds'] = partial_preds
        return frame.loc[frame['is_test'] == 1, 'preds'].values

    preds1, valid_acc1 = get_preds(0, run_model, run_type, allmodel,
                                   model_name_list, table, test_label,
                                   valid_idx_list)
    preds2, valid_acc2 = get_preds(1, run_model, run_type, allmodel,
                                   model_name_list, table, test_label,
                                   valid_idx_list)
    # average the two prediction sets, then take the arg-max class
    preds = (preds1 + preds2) / 2
    preds = preds.argmax(axis=1).flatten()

    if table.especial:
        LOGGER.info(f'preds\n{preds}')
        preds = _expand_especial(preds)

    LOGGER.info(
        f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}"
    )
    df_preds = pd.Series(preds, name='preds')
    LOGGER.info(
        f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")

    if offline:
        preds1 = preds1.argmax(axis=1).flatten()
        preds2 = preds2.argmax(axis=1).flatten()
        if table.especial:
            LOGGER.info(f'preds1\n{preds1}')
            preds1 = _expand_especial(preds1)
            LOGGER.info(f'preds2\n{preds2}')
            preds2 = _expand_especial(preds2)

        # per-degree accuracy report on the test rows
        df_test = table.df[['degree', 'label', 'is_test']]
        df_test = df_test.loc[df_test['is_test'] == 1].copy()
        df_test['preds'] = preds
        df_test['label'] = data['test_label']['label'].values
        df_test['acc'] = df_test['preds'] == df_test['label']
        pd.set_option('display.max_rows', 1000)
        print(df_test.groupby('degree')['acc'].mean())
        return preds, valid_acc1, valid_acc2, preds1, preds2
    else:
        return preds
from sklearn.linear_model import LogisticRegression as LR

# Load the heart data; every column except 'class' is a feature.
df = pd.read_csv('../d_heart.csv')
df.describe()
X = df.drop(columns='class').values
y = df['class'].values

# Plain (unshuffled) k-fold splitter.
n_splits = 5
kf = KFold(n_splits=n_splits)
kf.get_n_splits(X)

# Feat classifier with an explicit function set; disabled alternative kept
# from the original: max_dim=X.shape[1]
clf = Feat(
    max_depth=6,
    max_dim=min(50, 2 * X.shape[1]),
    pop_size=500,
    verbosity=0,
    shuffle=True,
    classification=True,
    functions=("+,-,*,/,exp,log,and,or,not,xor,=,<,>,ite,"
               "gauss,gauss2d,sign,logit,tanh"),
    random_state=42,
)
# Logistic-regression estimator fitted on the same folds.
lr = LR()

rocs, aucs = [], []
lr_rocs, lr_aucs = [], []

for train_idx, test_idx in kf.split(X):
    clf.fit(X[train_idx], y[train_idx])
    lr.fit(X[train_idx], y[train_idx])
import pandas as pd
import numpy as np
from feat import Feat
import sys

# The seed is the first command-line argument. sys.argv entries are strings,
# so convert it before handing it to random_state.
seed = int(sys.argv[1])

# Load the heart data; every column except 'class' is a feature.
df = pd.read_csv('../examples/d_heart.csv', sep=',')
df.describe()
X = df.drop('class', axis=1).values
y = df['class'].values

# Small single-feature classifier seeded from the command line.
clf = Feat(max_depth=3,
           max_dim=1,
           gens=100,
           pop_size=200,
           verbosity=2,
           shuffle=True,
           classification=True,
           functions="+,-,*,/,exp,log,and,or,not,=,<,>,ite",
           random_state=seed,
           softmax_norm=True)

clf.fit(X, y)
class TestFeatWrapper(unittest.TestCase):
    """Output-shape, pickling and archive checks for the Feat wrapper."""

    def setUp(self):
        """Create a fresh estimator and load the diabetes features/targets."""
        self.v = verbosity
        self.clf = Feat(verbosity=verbosity, n_threads=1)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target

    # Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        """predict() returns one value per training sample."""
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        self.debug("Predicting the Results")
        pred = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual(actual_length, expected_length)

    # Test 2: Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        """fit_predict() returns one value per training sample."""
        self.debug("Calling fit_predict from Feat")
        pred = self.clf.fit_predict(self.X, self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual(actual_length, expected_length)

    # Test 3: Assert the length of labels returned from transform
    def test_transform_length(self):
        """transform() keeps the number of rows."""
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X)

        self.debug(
            "Comparing the length of labls in transform vs actual feature set "
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    # Test 4: Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        """fit_transform() keeps the number of rows."""
        self.debug("In wrappertest.py...Calling fit transform")
        trans_X = self.clf.fit_transform(self.X, self.y)

        self.debug(
            "Comparing the length of labls in transform vs actual feature set "
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    # Test 5: Transform with Z
    def test_transform_length_z(self, zfile=None, zids=None):
        """transform() with the optional zfile/zids arguments keeps the rows."""
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X, zfile, zids)

        self.debug(
            "Comparing the length of labls in transform vs actual feature set "
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    def debug(self, message):
        """Print ``message`` only when the stored verbosity is positive."""
        if (self.v > 0):
            print(message)

    def test_coefs(self):
        """get_coefs() returns a non-empty collection after fitting."""
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X, self.y)
        coefs = self.clf.get_coefs()
        self.assertTrue(len(coefs) > 0)

    def test_dataframe(self):
        """Fitting on a DataFrame records its column names as feature_names."""
        self.debug("In wrappertest.py...Calling test_dataframe")
        dfX = pd.DataFrame(
            data=self.X,
            columns=['fishy' + str(i) for i in np.arange(self.X.shape[1])],
            index=None)
        dfy = pd.DataFrame(data={'label': self.y})

        self.clf.fit(dfX, dfy['label'])

        # feature_names is compared against the comma-joined names as bytes
        assert (self.clf.feature_names == ','.join(dfX.columns).encode())

    # Test: Assert the length of labels returned from predict
    def test_predict_stats_length(self):
        """Every entry of clf.stats has one value per generation."""
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        for key in self.clf.stats:
            self.assertEqual(len(self.clf.stats[key]), self.clf.gens)

    # Test ability to pickle feat model
    def test_pickling(self):
        """An (unfitted) estimator keeps its parameters across pickling."""
        self.debug("Pickle Feat object")

        with open('test_pickle.pkl', 'wb') as f:
            pickle.dump(self.clf, f)

        with open('test_pickle.pkl', 'rb') as f:
            loaded_clf = pickle.load(f)

        assert (loaded_clf.get_params() == self.clf.get_params())

    def test_archive(self):
        """test archiving ability"""
        self.debug("Test archive")

        self.clf.classification = True
        self.clf.ml = b'LR'
        # Binarise the target at its median. The builtin int is used as the
        # dtype because the np.int alias no longer exists in NumPy >= 1.24.
        self.clf.fit(self.X, np.array(self.y > np.median(self.y), dtype=int))
        archive = self.clf.get_archive()
        preds = self.clf.predict_archive(self.X)
        probs = self.clf.predict_proba_archive(self.X)

        # archive entries and their predictions must line up by id
        for arch, pred, prob in zip(archive, preds, probs):
            self.assertTrue(arch['id'] == pred['id'])
            self.assertTrue(arch['id'] == prob['id'])

    def test_lr_l1(self):
        """testing l1 penalized LR"""
        self.clf.classification = True
        self.clf.ml = b'L1_LR'
        # Same median binarisation as in test_archive; dtype=int replaces the
        # removed np.int alias.
        self.clf.fit(self.X, np.array(self.y > np.median(self.y), dtype=int))

        self.assertEqual(len(self.clf.predict(self.X)), len(self.y))
def setUp(self):
    """Create a single-threaded estimator and load the diabetes data."""
    self.v = verbosity
    self.clf = Feat(verbosity=self.v, n_threads=1)
    dataset = load_diabetes()
    self.X, self.y = dataset.data, dataset.target
}, { 'cross_rate': [0.0], 'fb': [0.0,0.25,0.5,0.75,1.0], 'softmax_norm': [True,False] }] # create the classifier clf = Feat(obj="fitness,complexity", pop_size=500, gens=100, max_time=600, max_stall=10, #use_batch=True, #batch_size=500, ml = "LinearRidgeRegression", sel='lexicase', surv='nsga2', max_depth=6, max_dim=min([X.shape[1]*2,50]), random_state=random_seed, backprop=True, iters=10, n_threads=1, verbosity=1)#, #logfile=save_file.split('.csv')[0]+'_'+str(random_seed)+'.log') #functions # 10-fold CV score for the pipeline clf_name = 'Feat' # evaluate the model evaluate_model(dataset, save_file, random_seed, clf, clf_name, hyper_params, classification=False)