Example #1
import numpy as np

from feat import Feat


def run(dataset_train, dataset_test, pop_size, gens, cross_rate, fb, max_time=1200):
    """Fit a Feat regressor and return (train RMSE, test RMSE, model string)."""

    Xtrain, ytrain = dataset_train[:, :-1], dataset_train[:, -1]
    Xtest,  ytest  = dataset_test[:, :-1],  dataset_test[:, -1]
    
    est_gp = Feat(obj="fitness,complexity",
                  pop_size=pop_size,
                  gens=gens,
                  max_time=max_time,
                  max_stall=50,
                  batch_size=10000,
                  ml="LinearRidgeRegression",
                  sel='lexicase',
                  surv='nsga2',
                  max_depth=10,
                  max_dim=min([Xtrain.shape[1] * 2, 50]),
                  # random_state=random_seed,
                  functions="+,-,*,/,sqrt,sin,cos,tanh,exp,log,^,x,kd",
                  otype="f",
                  backprop=True,
                  iters=10,
                  n_threads=1,
                  verbosity=1,
                  # tuned parameters
                  cross_rate=cross_rate,
                  fb=fb,
                  root_xo_rate=0.75,
                  softmax_norm=False)
    
    est_gp.fit(Xtrain, ytrain)
    
    return RMSE(est_gp.predict(Xtrain), ytrain), RMSE(est_gp.predict(Xtest), ytest), est_gp.get_model()
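The RMSE helper called above is never defined in this snippet; a minimal sketch of what it presumably computes, reusing the numpy import from above (an assumption, not code from the original source):

def RMSE(yhat, y):
    # root-mean-squared error between predictions and targets (assumed helper)
    return np.sqrt(np.mean((np.asarray(yhat) - np.asarray(y)) ** 2))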
Example #2
    def test_saving_loading(self):
        self.debug("Pickle Feat object")
    
        reg = clone(self.reg) 
        reg.fit(self.X, self.yr)
        initial_pred = reg.predict(self.X)
        reg.save('Feat_tmp.json')

        loaded_reg = Feat().load('Feat_tmp.json')
        loaded_pred = loaded_reg.predict(self.X)
        diff = np.abs(initial_pred-loaded_pred)
        for i,d in enumerate(diff):
            if d > 0.0001:
                print('pred:',initial_pred[i],'loaded:',loaded_pred[i],
                      'diff:',d)
            assert(d < 0.0001)

        assert(reg.get_representation() == loaded_reg.get_representation())
        assert(reg.get_model() == loaded_reg.get_model())
        assert((reg.get_coefs() == loaded_reg.get_coefs()).all())
        loaded_params = loaded_reg.get_params()

        for k,v in reg.get_params().items():
            if k not in loaded_params.keys():
                print(k,'not in ',loaded_params.keys())
                assert(k in loaded_params.keys())
            if isinstance(v,float):
                if np.abs(loaded_params[k] - v) > 0.0001:
                    print('loaded_params[',k,'] =',
                      loaded_params[k], '\nwhich is different from:', v)
                assert(np.abs(loaded_params[k] - v) < 0.0001)
            elif loaded_params[k] != v:
                print('loaded_params[',k,'] =',
                      loaded_params[k], '\nwhich is different from:', v)
                assert(loaded_params[k] == v)

        loaded_reg.fit(self.X, self.yr)
Example #3
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import KFold

from feat import Feat

df = pd.read_csv('../d_heart.csv')
df.describe()
X = df.drop('class', axis=1).values
y = df['class'].values
n_splits = 5
kf = KFold(n_splits=n_splits)
kf.get_n_splits(X)

clf = Feat(
    max_depth=6,
    # max_stall=20,
    # max_dim=X.shape[1],
    max_dim=min(50, 2 * X.shape[1]),
    pop_size=200,
    # ml='CART',
    ml='LR',
    verbosity=1,
    shuffle=True,
    classification=True,
    backprop=True,
    random_state=42)
lr = LR()
rocs = []
aucs = []
lr_rocs = []
lr_aucs = []

for train_idx, test_idx in kf.split(X):
    clf.fit(X[train_idx], y[train_idx])
    lr.fit(X[train_idx], y[train_idx])
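The snippet breaks off before the ROC lists are filled; a plausible continuation of the loop body (a sketch only: it assumes Feat's sklearn-style wrapper exposes predict_proba, and tolerates either a 1-D probability vector or sklearn's (n, 2) layout):

import numpy as np                          # with the other imports
from sklearn.metrics import auc, roc_curve  # with the other imports

    # continuing the body of the cross-validation loop above
    probs = np.asarray(clf.predict_proba(X[test_idx]))
    if probs.ndim == 2:
        probs = probs[:, 1]  # sklearn convention: column 1 is P(class == 1)
    lr_probs = lr.predict_proba(X[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y[test_idx], probs)
    lr_fpr, lr_tpr, _ = roc_curve(y[test_idx], lr_probs)
    rocs.append((fpr, tpr))
    aucs.append(auc(fpr, tpr))
    lr_rocs.append((lr_fpr, lr_tpr))
    lr_aucs.append(auc(lr_fpr, lr_tpr))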
Example #4
X, y, names = read_file(dataset, classification=False)
# parameter variation
hyper_params = [{
    'hillclimb': [True],
    'iters': [1, 10, 100],
}, {
    'backprop': [True],
    'iters': [1, 10, 100],
}]
# create the classifier
clf = Feat(pop_size=100,
           gens=100,
           ml="LinearRidgeRegression",
           sel='simanneal',
           surv='simanneal',
           max_depth=10,
           max_dim=min([X.shape[1] * 2, 50]),
           random_state=random_seed,
           n_threads=1,
           verbosity=1,
           logfile=save_file.split('.csv')[0] + '_' + str(random_seed) +
           '.csv')
# 10-fold CV score for the pipeline
clf_name = 'FeatSimAnneal'
# evaluate the model
evaluate_model(dataset,
               save_file,
               random_seed,
               clf,
               clf_name,
               hyper_params,
               classification=False)
Example #5
                   'fb': [0.0, 0.25, 0.5, 0.75, 1.0]
                   }
    # create the classifier
    clf = Feat(obj="fitness,complexity",
               residual_xo=True,
               pop_size=500,
               gens=200,
               max_time=3600,
               max_stall=50,
               use_batch=True,
               batch_size=1000,
               ml="LinearRidgeRegression",
               sel='lexicase',
               surv='nsga2',
               max_depth=10,
               max_dim=min([X.shape[1] * 2, 50]),
               random_state=random_seed,
               backprop=True,
               iters=10,
               n_threads=1,
               verbosity=2,
               # tuned parameters
               cross_rate=0.75,
               fb=0.0,
               root_xo_rate=0.5,
               softmax_norm=False
               # logfile=save_file.split('.csv')[0]+'_'+str(random_seed)+'.log'
               )
    # 10-fold CV score for the pipeline
    clf_name = 'FeatResXO'
Example #6
import numpy as np
import pandas as pd

from feat import Feat
from sklearn.model_selection import KFold

df = pd.read_csv('d_example_patients.csv')
df.drop('id', axis=1, inplace=True)
X = df.drop('class', axis=1).values
y = df['class'].values
zfile = 'd_example_patients_long.csv'
kf = KFold(n_splits=3)
kf.get_n_splits(X)

clf = Feat(
    max_depth=5,
    max_dim=min(50, 2 * X.shape[1]),
    verbosity=1,
    shuffle=True,
    ml='LR',
    classification=True,
    functions=
    "max,+,-,*,/,exp,log,and,or,not,=,<,>,ite,mean,median,min,variance,skew,kurtosis,slope,count",
    random_state=42)
scores = []
for train_idx, test_idx in kf.split(X):
    clf.fit(X[train_idx], y[train_idx], zfile, train_idx)
    scores.append(clf.score(X[test_idx], y[test_idx], zfile, test_idx))

print('scores:', scores)
Example #7
class TestFeatWrapper(unittest.TestCase):

    def setUp(self):
        self.v = verbosity
        self.clf = Feat(verbosity=self.v)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target
        
    #Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X,self.y)

        self.debug("Predicting the Results")
        pred = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 2:  Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        self.debug("Calling fit_predict from Feat")
        pred = self.clf.fit_predict(self.X,self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 3:  Assert the length of labels returned from transform
    def test_transform_length(self):
        self.debug("Calling fit")
        self.clf.fit(self.X,self.y)
        trans_X = self.clf.transform(self.X)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )

    #Test 4:  Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        self.debug("In wrappertest.py...Calling fit transform")
        trans_X = self.clf.fit_transform(self.X,self.y)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )
        
    #Test 5:  Transform with Z
    def test_transform_length_z(self,zfile=None,zids=None):
        self.debug("Calling fit")
        self.clf.fit(self.X,self.y)
        trans_X = self.clf.transform(self.X,zfile,zids)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )

    def debug(self, message):
        if self.v > 0:
            print(message)

    def test_coefs(self):
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X,self.y)
        coefs = self.clf.get_coefs()
        print('coefs:',coefs)
        self.assertTrue( len(coefs)>0 )
Example #8
hyper_params = [{
    'cross_rate': [0.25, 0.5, 0.75]
}, {
    'fb': [0.25, 0.5, 0.75]
}]
# create the classifier
clf = Feat(obj="fitness,complexity,corr",
           pop_size=500,
           gens=200,
           max_time=600,
           max_stall=50,
           use_batch=True,
           batch_size=1000,
           ml="LinearRidgeRegression",
           sel='lexicase',
           surv='nsga2',
           max_depth=6,
           max_dim=min([X.shape[1] * 2, 50]),
           random_state=random_seed,
           backprop=True,
           iters=10,
           n_threads=1,
           verbosity=2,
           logfile=save_file.split('.csv')[0] + '_' + str(random_seed) +
           '.csv')
# 10-fold CV score for the pipeline
clf_name = 'FeatCorr'
# evaluate the model
evaluate_model(dataset,
               save_file,
               random_seed,
               clf,
               clf_name,
               hyper_params,
               classification=False)
Example #9
class TestFeatWrapper(unittest.TestCase):

    def setUp(self):
        self.v = verbosity
        self.clf = Feat(verbosity=verbosity, n_threads=1)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target
        
    #Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X,self.y)

        self.debug("Predicting the Results")
        pred = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 2:  Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        self.debug("Calling fit_predict from Feat")
        pred = self.clf.fit_predict(self.X,self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual( actual_length , expected_length )

    #Test 3:  Assert the length of labels returned from transform
    def test_transform_length(self):
        self.debug("Calling fit")
        self.clf.fit(self.X,self.y)
        trans_X = self.clf.transform(self.X)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )

    #Test 4:  Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        self.debug("In wrappertest.py...Calling fit transform")
        trans_X = self.clf.fit_transform(self.X,self.y)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )
        
    #Test 5:  Transform with Z
    def test_transform_length_z(self,zfile=None,zids=None):
        self.debug("Calling fit")
        self.clf.fit(self.X,self.y)
        trans_X = self.clf.transform(self.X,zfile,zids)

        self.debug("Comparing the length of labls in transform vs actual feature set ")
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual( actual_value , expected_value )

    def debug(self, message):
        if self.v > 0:
            print(message)

    def test_coefs(self):
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X,self.y)
        coefs = self.clf.get_coefs()
        print('coefs:',coefs)
        self.assertTrue( len(coefs)>0 )

    def test_dataframe(self):
        self.debug("In wrappertest.py...Calling test_dataframe")
        dfX = pd.DataFrame(data=self.X,columns=['fishy'+str(i) 
                                        for i in np.arange(self.X.shape[1])],
                                        index=None)
        dfy = pd.DataFrame(data={'label':self.y})

        self.clf.fit(dfX,dfy['label'])
        assert(self.clf.feature_names == ','.join(dfX.columns).encode())

    #Test: Assert the length of labels returned from predict
    def test_predict_stats_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X,self.y)

        print("Num generations is ", self.clf.gens)
        for key in self.clf.stats:
            print("Length for ", key, "is ", len(self.clf.stats[key]))
            self.assertEqual(len(self.clf.stats[key]), self.clf.gens)
Example #10
import pandas as pd
from feat import Feat
from pmlb import fetch_data

df = pd.read_csv('mnist.csv', sep='\t')
print(df.columns)
X = df.drop('class', axis=1).values
y = df['class'].values

ft = Feat(classification=True, verbosity=2)

ft.fit(X[:60000], y[:60000])

print(ft.score(X[60000:], y[60000:]))
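The fetch_data import above goes unused; the same data could presumably be pulled straight from PMLB instead of a local TSV (a sketch, assuming the PMLB repository carries this dataset under the name 'mnist'):

X, y = fetch_data('mnist', return_X_y=True)  # downloads and caches the PMLB copy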
Example #11
import pandas as pd
from sklearn.model_selection import StratifiedKFold

from feat import Feat

df = pd.read_csv('d_example_patients.csv')
df.drop('id', axis=1, inplace=True)
X = df.drop('class', axis=1).values
y = df['class'].values
zfile = 'd_example_patients_long.csv'
kf = StratifiedKFold(n_splits=3)
kf.get_n_splits(X)

clf = Feat(
    max_depth=5,
    max_dim=min(50, 2 * X.shape[1]),
    gens=20,
    pop_size=100,
    verbosity=1,
    shuffle=True,
    ml='LR',
    classification=True,
    feature_names=','.join(df.drop('class', axis=1).columns),
    functions="+,-,*,/,exp,log,and,or,not,=,<,<=,>,>=,ite,split,split_c,"
    "mean,median,max,min,variance,skew,kurtosis,slope,count",
    backprop=True,
    iters=10,
    random_state=42)
scores = []

for train_idx, test_idx in kf.split(X, y):
    clf.fit(X[train_idx], y[train_idx], zfile, train_idx)
    scores.append(clf.score(X[test_idx], y[test_idx], zfile, test_idx))

print('scores:', scores)
Example #12
    def train_predict(self, data, time_budget, n_class, schema):
        s1 = time.time()
        seed = SEED
        fix_seed(seed)
        LOGGER.info(f'time_budget:{time_budget}')
        LOGGER.info(f'n_class:{n_class}')
        LOGGER.info(f'node:{data["fea_table"].shape[0]}')
        LOGGER.info(f'edge:{data["edge_file"].shape[0]}')

        #pre-process data
        process_data = ProcessData(data)
        table = process_data.pre_process(time_budget, n_class, schema)

        # Feature Dimension Reduction
        feat = Feat()

        process_data.drop_unique_columns(table)
        drop_sum_columns = process_data.drop_excessive_columns(table)

        feat.fit_transform(table, drop_sum_columns)
        LOGGER.info(
            f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}'
        )

        # this doesn't seem to be used anywhere
        table.large_features = False
        if table.ori_columns.shape[0] > 500:
            table.large_features = True

        model_type_list = ['sage', 'gat', 'tagc', 'gcn']

        repeat = 3
        model_name_list = [
            f'{model_type_list[i]}{i+len(model_type_list)*j}'
            for j in range(repeat) for i in range(len(model_type_list))
        ]
        model_type_list = model_type_list * repeat

        LOGGER.info('use node embedding')
        categories = [
            'node_index', 'degree_bins', 'bin_2-neighbor_mean_degree_bins'
        ]

        for model in set(model_type_list):
            LOGGER.info(
                f"""{model} feature num:{eval(f'table.{model}_columns.shape[0]')}"""
            )
            exec(
                f'table.{model}_data = process_data.process_gnn_data(table,table.{model}_columns,categories)'
            )

        allmodel = AllModel()

        table.lr_epoch = 16

        table.lr_list = [0.05, 0.03, 0.01, 0.0075, 0.005, 0.003, 0.001, 0.0005]

        train_valid_idx_list, valid_idx_list = split_train_and_valid(
            table, train_rate=0.8, seed=SEED, mode=split_mode)
        train_idx, test_idx = split_train_and_test(table)

        test_idx = test_idx.sort_values()
        run_model = []
        run_type = []
        run_time = {}
        for i in range(len(model_type_list)):
            seed = SEED * (i + 1)
            fix_seed(seed)
            model_type = model_type_list[i]
            model_name = model_name_list[i]
            if model_type not in run_time:
                init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(
                    table,
                    model_type,
                    model_name,
                    train_idx,
                    test_idx,
                    seed=seed)
                run_lr_time = len(table.lr_list) * (
                    init_time + table.lr_epoch * one_epoch_time)
                run_time500 = init_time * (2) + one_epoch_time * (
                    500 + early_stopping_rounds) * 2 + run_lr_time
                run_time300 = init_time * (2) + one_epoch_time * (
                    300 + early_stopping_rounds) * 2 + run_lr_time
                run_time150 = init_time * (2) + one_epoch_time * (
                    150 + early_stopping_rounds) * 2 + run_lr_time
                run_time[model_type] = (run_time500 - run_lr_time,
                                        run_time300 - run_lr_time,
                                        run_time150 - run_lr_time,
                                        early_stopping_rounds, init_time,
                                        one_epoch_time, run_lr_time)
            else:
                run_time500, run_time300, run_time150, early_stopping_rounds, init_time, one_epoch_time, run_lr_time = run_time[
                    model_type]
            s2 = time.time()
            LOGGER.info(
                f"time_budget:{time_budget}s,used time:{s2-s1:.2f}s,{model_name} model will use {run_time500:.2f}s|{run_time300:.2f}s|{run_time150:.2f}s"
            )
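            # A worked illustration of the budget arithmetic above (hypothetical
            # numbers, not from the source): with init_time=2s, one_epoch_time=0.1s,
            # early_stopping_rounds=50, 8 learning rates and lr_epoch=16,
            # run_lr_time = 8 * (2 + 16 * 0.1) = 28.8s and
            # run_time500 = 2*2 + 0.1*(500+50)*2 + 28.8 = 142.8s; the branches
            # below pick the largest epoch budget that still fits in time_budget.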
            if s2 - s1 + run_time500 + 5 < time_budget:
                LOGGER.info('train 500 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=500,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time300 + 5 < time_budget:
                LOGGER.info('train 300 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=300,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time150 + 5 < time_budget:
                LOGGER.info('train 150 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=150,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif len(allmodel.valid_models[0]) == 0:
                this_epoch = int((
                    (time_budget -
                     (s2 - s1 + 5) - run_lr_time) / 2 - init_time) /
                                 (one_epoch_time) - early_stopping_rounds)
                LOGGER.info(f'short time train {this_epoch} epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=this_epoch,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif time_budget - (s2 - s1) < 5:
                LOGGER.info('never train; break')
                break
            else:
                LOGGER.info('no train this model; continue')
                continue

        if offline:
            if table.especial:
                df = table.df[['node_index', 'is_test']]
                df = df.merge(data['test_label'], how='left', on='node_index')
                test_label = df.loc[(df['is_test'] == 1) &
                                    (table.directed_mask.tolist()),
                                    'label'].astype('int').values
            else:
                test_label = data['test_label']['label'].values
        else:
            test_label = None

        preds1, valid_acc1 = get_preds(0, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds2, valid_acc2 = get_preds(1, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds = (preds1 + preds2) / 2

        preds = preds.argmax(axis=1).flatten()

        if table.especial:
            LOGGER.info(f'preds\n{preds}')
            df = table.df[['label', 'is_test']]
            df['preds'] = int(
                df.loc[[not i for i in table.directed_mask.tolist()],
                       'label'].value_counts().index[0])
            df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                   'preds'] = preds
            preds = df.loc[df['is_test'] == 1, 'preds'].values

        LOGGER.info(
            f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}"
        )
        df_preds = pd.Series(preds, name='preds')
        LOGGER.info(
            f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")

        if offline:
            preds1 = preds1.argmax(axis=1).flatten()
            preds2 = preds2.argmax(axis=1).flatten()
            if table.especial:
                LOGGER.info(f'preds1\n{preds1}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(
                    df.loc[[not i for i in table.directed_mask.tolist()],
                           'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                       'preds'] = preds1
                preds1 = df.loc[df['is_test'] == 1, 'preds'].values

                LOGGER.info(f'preds2\n{preds2}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(
                    df.loc[[not i for i in table.directed_mask.tolist()],
                           'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                       'preds'] = preds2
                preds2 = df.loc[df['is_test'] == 1, 'preds'].values

            df_test = table.df[['degree', 'label', 'is_test']]
            df_test = df_test.loc[df_test['is_test'] == 1]
            df_test['preds'] = preds
            df_test['label'] = data['test_label']['label'].values
            df_test['acc'] = df_test['preds'] == df_test['label']

            pd.set_option('display.max_rows', 1000)
            print(df_test.groupby('degree')['acc'].mean())

            return preds, valid_acc1, valid_acc2, preds1, preds2
        else:
            return preds
Example #13
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import KFold

from feat import Feat

df = pd.read_csv('../d_heart.csv')
df.describe()
X = df.drop('class', axis=1).values
y = df['class'].values
n_splits = 5
kf = KFold(n_splits=n_splits)
kf.get_n_splits(X)

clf = Feat(
    max_depth=6,
    # max_dim=X.shape[1],
    max_dim=min(50, 2 * X.shape[1]),
    pop_size=500,
    verbosity=0,
    shuffle=True,
    classification=True,
    functions=
    "+,-,*,/,exp,log,and,or,not,xor,=,<,>,ite,gauss,gauss2d,sign,logit,tanh",
    random_state=42)
lr = LR()
rocs = []
aucs = []
lr_rocs = []
lr_aucs = []

for train_idx, test_idx in kf.split(X):
    clf.fit(X[train_idx], y[train_idx])
    lr.fit(X[train_idx], y[train_idx])
Example #14
import sys

import numpy as np
import pandas as pd

from feat import Feat

seed = int(sys.argv[1])  # random seed passed on the command line

df = pd.read_csv('../examples/d_heart.csv', sep=',')
df.describe()
X = df.drop('class', axis=1).values
y = df['class'].values
clf = Feat(max_depth=3,
           max_dim=1,
           gens=100,
           pop_size=200,
           verbosity=2,
           shuffle=True,
           classification=True,
           functions="+,-,*,/,exp,log,and,or,not,=,<,>,ite",
           random_state=seed,
           softmax_norm=True)
clf.fit(X, y)
Example #15
class TestFeatWrapper(unittest.TestCase):
    def setUp(self):
        self.v = verbosity
        self.clf = Feat(verbosity=verbosity, n_threads=1)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target

    #Test 1: Assert the length of labels returned from predict
    def test_predict_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        self.debug("Predicting the Results")
        pred = self.clf.predict(self.X)

        self.debug("Comparing the Length of labls in Predicted vs Actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual(actual_length, expected_length)

    #Test 2:  Assert the length of labels returned from fit_predict
    def test_fitpredict_length(self):
        self.debug("Calling fit_predict from Feat")
        pred = self.clf.fit_predict(self.X, self.y)

        self.debug("Comparing the length of labls in fit_predict vs actual ")
        expected_length = len(self.y)
        actual_length = len(pred)
        self.assertEqual(actual_length, expected_length)

    #Test 3:  Assert the length of labels returned from transform
    def test_transform_length(self):
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X)

        self.debug(
            "Comparing the length of labls in transform vs actual feature set "
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    #Test 4:  Assert the length of labels returned from fit_transform
    def test_fit_transform_length(self):
        self.debug("In wrappertest.py...Calling fit transform")
        trans_X = self.clf.fit_transform(self.X, self.y)

        self.debug(
            "Comparing the length of labls in transform vs actual feature set "
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    #Test 5:  Transform with Z
    def test_transform_length_z(self, zfile=None, zids=None):
        self.debug("Calling fit")
        self.clf.fit(self.X, self.y)
        trans_X = self.clf.transform(self.X, zfile, zids)

        self.debug(
            "Comparing the length of labls in transform vs actual feature set "
        )
        expected_value = self.X.shape[0]
        actual_value = trans_X.shape[0]
        self.assertEqual(actual_value, expected_value)

    def debug(self, message):
        if (self.v > 0):
            print(message)

    def test_coefs(self):
        self.debug("In wrappertest.py...Calling test_coefs")
        self.clf.fit(self.X, self.y)
        coefs = self.clf.get_coefs()
        self.assertTrue(len(coefs) > 0)

    def test_dataframe(self):
        self.debug("In wrappertest.py...Calling test_dataframe")
        dfX = pd.DataFrame(
            data=self.X,
            columns=['fishy' + str(i) for i in np.arange(self.X.shape[1])],
            index=None)
        dfy = pd.DataFrame(data={'label': self.y})

        self.clf.fit(dfX, dfy['label'])
        assert (self.clf.feature_names == ','.join(dfX.columns).encode())

    #Test: Assert the length of labels returned from predict
    def test_predict_stats_length(self):
        self.debug("Fit the Data")
        self.clf.fit(self.X, self.y)

        for key in self.clf.stats:
            self.assertEqual(len(self.clf.stats[key]), self.clf.gens)

    #Test ability to pickle feat model
    def test_pickling(self):
        self.debug("Pickle Feat object")

        with open('test_pickle.pkl', 'wb') as f:
            pickle.dump(self.clf, f)

        with open('test_pickle.pkl', 'rb') as f:
            loaded_clf = pickle.load(f)

        assert (loaded_clf.get_params() == self.clf.get_params())

    def test_archive(self):
        """test archiving ability"""
        self.debug("Test archive")

        self.clf.classification = True
        self.clf.ml = b'LR'
        self.clf.fit(self.X, np.array(self.y > np.median(self.y),
                                      dtype=int))
        archive = self.clf.get_archive()
        preds = self.clf.predict_archive(self.X)
        probs = self.clf.predict_proba_archive(self.X)

        for arch, pred, prob in zip(archive, preds, probs):
            self.assertTrue(arch['id'] == pred['id'])
            self.assertTrue(arch['id'] == prob['id'])

    def test_lr_l1(self):
        """testing l1 penalized LR"""
        self.clf.classification = True
        self.clf.ml = b'L1_LR'
        self.clf.fit(self.X, np.array(self.y > np.median(self.y),
                                      dtype=int))

        self.assertEqual(len(self.clf.predict(self.X)), len(self.y))
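These wrapper tests (here and in Examples #7 and #9) rely on module-level names the snippets don't show; a minimal preamble that would make the class self-contained (an assumption about the surrounding file, not copied from it):

import pickle
import unittest

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

from feat import Feat

verbosity = 0  # raise to 1 to see the debug() messages

with the usual runner at the bottom of the file:

if __name__ == '__main__':
    unittest.main()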
Example #16
    def setUp(self):
        self.v = verbosity
        self.clf = Feat(verbosity=verbosity, n_threads=1)
        diabetes = load_diabetes()
        self.X = diabetes.data
        self.y = diabetes.target
Example #17
                   },
                   {
                    'cross_rate': [0.0],
                    'fb': [0.0, 0.25, 0.5, 0.75, 1.0],
                    'softmax_norm': [True, False]
                   }]
    # create the classifier
    clf = Feat(obj="fitness,complexity",
               pop_size=500,
               gens=100,
               max_time=600,
               max_stall=10,
               # use_batch=True,
               # batch_size=500,
               ml="LinearRidgeRegression",
               sel='lexicase',
               surv='nsga2',
               max_depth=6,
               max_dim=min([X.shape[1] * 2, 50]),
               random_state=random_seed,
               backprop=True,
               iters=10,
               n_threads=1,
               verbosity=1)
               # logfile=save_file.split('.csv')[0]+'_'+str(random_seed)+'.log'
    # 10-fold CV score for the pipeline
    clf_name = 'Feat'
    # evaluate the model
    evaluate_model(dataset, save_file, random_seed, clf, clf_name, hyper_params,
                   classification=False)