Example #1
def tpot_fit(events, X_all, y_all, num_threads):
    inner_cv = PurgedKFold(
        n_splits=5,
        t1=events["t1"],
        pct_embargo=0,
        random_state=42,
    )
    clf = tpot.TPOTClassifier(
        generations=500,
        population_size=60,
        offspring_size=None,
        # mutation_rate=0.9,
        # crossover_rate=0.1,
        scoring='neg_log_loss',
        cv=inner_cv,
        # subsample=1.0,
        n_jobs=num_threads,
        max_time_mins=500,
        max_eval_time_mins=5,
        random_state=42,
        periodic_checkpoint_folder='checks',
        # use_dask=True,
        config_dict='TPOT light',
        verbosity=2,
    )

    clf.fit(X_all, y_all)
    clf.export('tpot_pipeline.py')
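
Note: `PurgedKFold` and `events` above come from financial-ML tooling that is not shown. As a self-contained sanity check of just the TPOT call, here is a minimal sketch on synthetic data (the `'TPOT light'` configuration restricts the search to fast operators):

import tpot
from sklearn.datasets import make_classification

# Synthetic stand-in for X_all / y_all; PurgedKFold is omitted here.
X, y = make_classification(n_samples=200, n_features=10, random_state=42)

clf = tpot.TPOTClassifier(
    generations=2,             # tiny search, just to verify the setup
    population_size=10,
    scoring='neg_log_loss',
    cv=3,                      # plain k-fold; swap in PurgedKFold for overlapping labels
    config_dict='TPOT light',  # restricted, fast operator set
    random_state=42,
    verbosity=2,
)
clf.fit(X, y)
clf.export('tpot_pipeline.py')  # writes the best pipeline as a Python script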
Example #2
    def train(
        self,
        train_file: Union[str, Path],
        validation_file: Optional[Union[str, Path]] = None,
        workdir: Optional[Union[str, Path]] = None,
    ) -> Dict[str, float]:
        X_train, y_train = self.load_data(train_file)
        assert y_train is not None

        with tempfile.TemporaryDirectory() as tempdir:
            workdir = Path(workdir or tempdir)
            log_file_name = workdir / "tpot.log"
            pipeline_file_name = workdir / "fitted_pipeline.pkl"
            pipeline_code_file_name = workdir / "pipeline.py"

            with open(log_file_name, "w") as log_file:
                teeing_log_file = TeeingIO(log_file, sys.stdout)
                if self._task == "classification":
                    model = tpot.TPOTClassifier(
                        log_file=teeing_log_file,
                        **self._kwargs,
                    )
                else:
                    model = tpot.TPOTRegressor(
                        log_file=teeing_log_file,
                        **self._kwargs,
                    )

                model.fit(X_train, y_train)

            with open(log_file_name) as log_file:
                tpot_log = log_file.read()

            model.export(str(pipeline_code_file_name))
            self._estimator = model.fitted_pipeline_
            with open(pipeline_file_name, "wb") as pipeline_file:
                pickle.dump(self._estimator, pipeline_file)

        metrics = self._get_metrics_from_log(tpot_log)

        if validation_file is not None:
            X_val, y_val = self.load_data(validation_file)
            assert y_val is not None
            metrics["validation_score"] = model.score(X_val, y_val)

        return metrics
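
`TeeingIO` is not shown above; a minimal sketch, assuming it is a small file-like helper that duplicates writes to several streams (TPOT's `log_file` argument accepts any file-like object):

class TeeingIO:
    """Assumed implementation: duplicate writes to every wrapped stream."""

    def __init__(self, *streams):
        self.streams = streams

    def write(self, data):
        for stream in self.streams:
            stream.write(data)

    def flush(self):
        for stream in self.streams:
            stream.flush()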
Example #3
def demo():
    """
    Fit a small TPOT search on the MAGIC Gamma Telescope data and
    export the best pipeline.
    """
    telescope = pandas.read_csv('data/MAGIC Gamma Telescope Data.csv')
    telescope_shuffle = telescope.iloc[numpy.random.permutation(
        len(telescope))]
    tele = telescope_shuffle.reset_index(drop=True)
    tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
    tele_class = tele['Class'].values
    training_indices, testing_indices = sklearn.model_selection.train_test_split(
        tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)
    validation_indices = testing_indices
    # Use a distinct name: assigning to `tpot` inside the function would
    # shadow the tpot module and raise UnboundLocalError on this line.
    pipeline_optimizer = tpot.TPOTClassifier(generations=5, verbosity=2)
    pipeline_optimizer.fit(
        tele.drop('Class', axis=1).loc[training_indices].values,
        tele.loc[training_indices, 'Class'].values)
    pipeline_optimizer.score(
        tele.drop('Class', axis=1).loc[validation_indices].values,
        tele.loc[validation_indices, 'Class'].values)
    pipeline_optimizer.export('pipeline.py')
Example #4
def get_robust_tpot(
    max_time_mins=5,
    scoring="f1_macro",
    cv=5,
    random_state=42,
    n_jobs=1,
    verbose=False,
):
    clf = RobustSearch(
        search_model=tpot.TPOTClassifier(
            scoring=scoring,
            cv=cv,
            n_jobs=n_jobs,
            max_time_mins=max_time_mins,
            random_state=random_state,
            verbosity=2,
            disable_update_check=True,
        ),
        verbose=verbose,
    )
    return clf
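
`RobustSearch` is defined elsewhere in this project; a minimal sketch under the assumption that it simply wraps the search model and fails loudly when no pipeline was fitted:

from sklearn.base import BaseEstimator

class RobustSearch(BaseEstimator):
    # Hypothetical wrapper, not the project's actual implementation.
    def __init__(self, search_model, verbose=False):
        self.search_model = search_model
        self.verbose = verbose

    def fit(self, X, y):
        self.search_model.fit(X, y)
        if getattr(self.search_model, 'fitted_pipeline_', None) is None:
            raise RuntimeError('search finished without a fitted pipeline')
        return self

    def predict(self, X):
        return self.search_model.predict(X)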
Example #5
    def __init__(self, nrows=100, ncols=10, seed=42, max_time_mins=1):
        self.pipelines = []

        print("Generating pipelines for tests")
        np.random.seed(seed)
        X, y = sklearn.datasets.make_classification(nrows, ncols)
        self.X = X
        self.y = y
        print("TPOT-generated pipelines")
        clf = tpot.TPOTClassifier(
            max_time_mins=max_time_mins,
            verbosity=3,
            random_state=seed,
        )
        clf.fit(self.X, self.y)
        ga_pipelines = [
            e["pipeline_obj"] for e in clf.evaluated_individuals_.values()
        ]
        assert len(ga_pipelines) > 0
        self.pipelines.extend(ga_pipelines)

        print("Random-search-generated pipeliens")
        clf2 = RandomSearch(
            tpot.config.classifier_config_dict,
            4,
            max_time_mins=max_time_mins,
            max_time_mins_per_pipeline=1,
            max_retries=100,
            cv=3,
            scoring="f1_macro",
            random_state=seed,
        )
        clf2.fit(self.X, self.y)
        rand_pipelines = [
            e["pipeline_obj"] for e in clf2.evaluated_individuals_.values()
        ]
        assert len(rand_pipelines) > 0
        self.pipelines.extend(rand_pipelines)
        np.random.shuffle(self.pipelines)
Example #6
def get_robust_tpot(
    config_dict=None,
    max_time_mins=5,
    scoring="f1_macro",
    cv=5,
    random_state=42,
    n_jobs=-1,
    check_point_folder=None,
    verbosity=2,
):
    clf = RobustSearch(search_model=tpot.TPOTClassifier(
        config_dict=config_dict,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        max_time_mins=max_time_mins,
        # Cap the time spent evaluating a single pipeline; otherwise one
        # pipeline can blow the budget and leave us without any fitted pipeline.
        max_eval_time_mins=MAX_TIME_MINS_PER_PIPELINE,
        random_state=random_state,
        verbosity=verbosity,
        disable_update_check=True,
    ))
    return clf
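
`MAX_TIME_MINS_PER_PIPELINE` is likewise defined elsewhere; presumably a module-level constant along these lines:

# Assumed constant (not shown above): per-pipeline evaluation budget in minutes.
MAX_TIME_MINS_PER_PIPELINE = 1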
Example #7
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   titanic_train1, y_train)
X_train = utils.select_features(rf_final_estimator,
                                titanic_train1,
                                threshold='mean')

tpot_estimator = tpot.TPOTClassifier(
    generations=10,
    population_size=40,
    verbosity=2,
    early_stop=3,
    random_state=100,
    cv=5,
    scoring='accuracy',
    periodic_checkpoint_folder='E:/checkpoint')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)

titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator,
                               titanic_test1,
                               threshold='mean')
Example #8
data_dir = 'C:/Users/akhram/Desktop/Algo/Problems/Classification/DontOverFitII'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(train.info())
print(train.columns)

sns.countplot(x='target', data=train)

# keep only the feature columns (drop id and target)
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(train1, y, test_size=0.1, random_state=1)

tpot_estimator = tpot.TPOTClassifier(generations=10, population_size=40, 
                                     verbosity=2, early_stop=2, 
                                     random_state=100,
                                     cv=5, scoring='roc_auc',
                                     config_dict=None, warm_start=True,
                                     periodic_checkpoint_folder='E:/checkpoint')

tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.evaluated_individuals_)
print(tpot_estimator.fitted_pipeline_)

print(tpot_estimator.score(X_eval, y_eval))

test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:, 1:]
Example #9
config = eval(config_var_name)

if template_str:
    print(">> TEMPLATE STRING:      ", template_str)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y.astype(np.float64),
                                                    train_size=0.8,
                                                    test_size=0.2)

if conf_type == 'template':
    # USE TEMPLATE
    # (No stacking)
    clf_t = tpot.TPOTClassifier(
        generations=100,
        population_size=100,
        verbosity=2,
        config_dict=config_nn,  # We can be permissive when template_str is set
        template=template_str)
else:
    # USE CONFIG DICT
    # (Stacking allowed)
    clf_t = tpot.TPOTClassifier(
        generations=100,
        population_size=100,
        verbosity=2,
        config_dict=config,
    )

start_t = time.time()
print(">> BEGIN TRAINING AT:     {0:.2f}".format(start_t))
clf_t.fit(X_train, y_train)
Example #10
    def __init__(self, **kwargs):
        self.clf = tpot.TPOTClassifier(**kwargs)
Example #11
        # Split the positive class into selection and hold-out sets.
        X_sel_1, X_hof_1, y_sel_1, y_hof_1 = \
            sklearn.model_selection.train_test_split(X_1, y_1, test_size=0.2)
        X_0 = X[ind_0]
        y_0 = y[ind_0]
        # Hold out as many negatives as there are positive hold-out rows.
        X_sel_0, X_hof_0, y_sel_0, y_hof_0 = \
            sklearn.model_selection.train_test_split(
                X_0, y_0, test_size=y_hof_1.shape[0])
        # Downsample the remaining negatives to match the positive count.
        X_sel_0_tr, X_sel_0_drop, y_sel_0_tr, y_sel_0_drop = \
            sklearn.model_selection.train_test_split(
                X_sel_0, y_sel_0, train_size=y_sel_1.shape[0])
        # Balanced selection and hold-out sets.
        X_sel = numpy.concatenate((X_sel_0_tr, X_sel_1), axis=0)
        y_sel = numpy.concatenate((y_sel_0_tr, y_sel_1), axis=0)
        X_hof = numpy.concatenate((X_hof_0, X_hof_1), axis=0)
        y_hof = numpy.concatenate((y_hof_0, y_hof_1), axis=0)
        print(f"{cities.iloc[i, 0]}")
        print(f"{y_sel_1.shape[0]}/{y_sel.shape[0]}")
        print(f"{y_hof_1.shape[0]}/{y_hof.shape[0]}")
        log.write("{},{}/{},{}/{}\n".format(cities.iloc[i, 0],
                                            y_sel_1.shape[0], y_sel.shape[0],
                                            y_hof_1.shape[0], y_hof.shape[0]))
        clf = tpot.TPOTClassifier(n_jobs=job)
        clf.fit(X_sel, y_sel)
        max_score = clf.score(X_sel, y_sel)
        test_score = clf.score(X_hof, y_hof)
        print(f"{cities.iloc[i, 0]} {max_score} {test_score}")
        with open(f"tex/{cities.iloc[i, 0]}_tpot.tex", "w") as fout:
            fout.write(
                latex_template.format(cities.iloc[i, 0], max_score,
                                      test_score))
connection.close()
Example #12
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import tpot

X, y = datasets.load_iris(return_X_y=True)

#model = tpot.TPOTClassifier(generations=50, population_size=50, verbosity=3, config_dict='TPOT light')
model = tpot.TPOTClassifier(generations=50, population_size=50, verbosity=3)
model.fit(X, y)

model.export('iris-tpot-result.py')

pipe = model._toolbox.compile(expr=model._optimized_pipeline)
cv_pred = cross_val_predict(pipe, X, y, cv=5)
print("Score: %.4f" % f1_score(y, cv_pred, average='micro'))
Example #13
def build_dicts(tpot_dict, loss_dict, train, targets, reload=False):
    tpot_mdr_classifier_config_dict = {

        # Classifiers
        'mdr.MDRClassifier': {
            'tie_break': [0, 1],
            'default_label': [0, 1]
        },

        # Feature Selectors
        'skrebate.ReliefF': {
            'n_features_to_select': range(1, 6),
            'n_neighbors': [2, 10, 50, 100, 250, 500]
        },
        'skrebate.SURF': {
            'n_features_to_select': range(1, 6)
        },
        'skrebate.SURFstar': {
            'n_features_to_select': range(1, 6)
        },
        'skrebate.MultiSURF': {
            'n_features_to_select': range(1, 6)
        }
    }

    train = train.copy()
    targets = targets.copy()

    for col in train.keys():
        if not is_numeric_dtype(train[col]):
            train[col] = LabelEncoder().fit_transform(train[col])

    for col in targets.keys():

        if col == 'sig_id': continue

        if col in tpot_dict:
            print(f'\tAlready fitted {col} with loss {loss_dict[col]}')
            continue

        print(f'Fitting {col}...')

        t = time()

        inp = train.copy().drop(columns=['sig_id'])
        lbls = targets[col].copy()

        if lbls.sum() > 1:
            inp, lbls = repeat_sample(inp, lbls, 2)

        X_train, X_test, y_train, y_test = train_test_split(inp,
                                                            lbls,
                                                            stratify=lbls)

        clf = tpot.TPOTClassifier(generations=5,
                                  population_size=50,
                                  verbosity=3,
                                  warm_start=True)

        clf.fit(X_train, y_train)
        loss = clf.score(X_test, y_test)

        tpot_dict[col] = clf
        loss_dict[col] = loss

        clf.export(f'input/tpot/{col}')
        with open('input/tpot/loss_dict', 'wb+') as hand:
            pickle.dump(loss_dict, hand)

        print('{}\t\t{}\t\t{:.5f}\n'.format(
            str(datetime.timedelta(seconds=time() - t))[:7], col, loss))

    total_loss = 0
    for v in loss_dict.values():
        total_loss += v
    print(f'Average loss: {total_loss / (len(targets.keys())-1)}')

    return tpot_dict, loss_dict
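
`repeat_sample` is not shown; a plausible sketch, assuming it oversamples the positive class by simple repetition so that stratified splitting has enough positives:

import pandas as pd

def repeat_sample(inp, lbls, times):
    # Hypothetical helper: append `times` extra copies of the positive rows.
    pos = lbls == 1
    inp_rep = pd.concat([inp] + [inp[pos]] * times, ignore_index=True)
    lbls_rep = pd.concat([lbls] + [lbls[pos]] * times, ignore_index=True)
    return inp_rep, lbls_rep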
Example #14
data = pd.read_csv('model2.csv', index_col=0)
print(data.columns)
data = data.drop('UniqueCarrier', axis=1)

#clean the data ( data preprocessing )
data_shuffle = data.iloc[np.random.permutation(len(data))]
data = data_shuffle.reset_index(drop=True)

# obtaining the class values
data_class = data['Class'].values

# Split data with 75% for training and 25% for testing
training_indices, validation_indices = train_test_split(
    data.index, stratify=data_class, train_size=0.75, test_size=0.25)

# Genetic programming begins !!!
tpot = tp.TPOTClassifier(generations=5,
                         verbosity=2)  # optimization runs for 5 generations
tpot.fit(
    data.drop('Class', axis=1).loc[training_indices].values,
    data.loc[training_indices, 'Class'].values)

# Score accuracy
tpot.score(
    data.drop('Class', axis=1).loc[validation_indices].values,
    data.loc[validation_indices, 'Class'].values)

# Export the best pipeline as pipeline.py
tpot.export('pipeline.py')
Example #15
# scores = cross_val_score(clf, train_data, train_lable, cv=5, verbose=5)
# print(scores.mean())

# 0.8927205970809264
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
# scores = cross_val_score(clf, train_data, train_lable, cv=5, verbose=5)
# print(scores.mean())

# clf = autosklearn.classification.AutoSklearnClassifier()
# scores = cross_val_score(clf, train_data, train_lable, cv=5, verbose=5)
# print(scores.mean())

# clf = autokeras.classifier()

clf = tpot.TPOTClassifier(verbosity=3,
                          periodic_checkpoint_folder="tpot",
                          warm_start=True)

clf.fit(train_data, train_lable)
test_data, test_ID = handle_data(test_data,
                                 test_data_handled_path,
                                 need_shuffle=False)
# print(test_data)
# print(test_ID)
predict = clf.predict(test_data)
# print(predict)
result = pd.DataFrame({'ID': test_ID, 'pred': predict})
result.to_csv(result_data_path, index=False)

clf.export('tpot_pipeline.py')
Example #16
          'learning_rate': np.arange(0.0001, 0.1, 0.0001),
          'n_estimators': np.arange(1, 200, 1),
          'nthread': [6],
          'gamma': np.arange(0.00001, 0.1, 0.00001),
          'subsample': np.arange(0.1, 2, 0.1),
          'reg_lambda': np.arange(0.1, 200, 1),
          'reg_alpha': np.arange(1, 200, 1),
          'min_child_weight': np.arange(1, 200, 1),
          'colsample_bytree': np.arange(0.1, 2, 0.1),
          'colsample_bylevel': np.arange(0.1, 2, 0.1)
          }}


tpot_estimator = tpot.TPOTClassifier(generations=5, population_size=100,
                                     offspring_size=250,
                                     verbosity=2, early_stop=3, random_state=1,
                                     config_dict=dict,
                                     cv=5, scoring='accuracy')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.evaluated_individuals_)
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator.score(X_train, y_train))

print(tpot_estimator.score(X_eval, y_eval))


titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.info())

titanic_test['Age_imputed'] = age_imputer.transform(titanic_test[['Age']])
titanic_test['Fare'] = fare_imputer.transform(titanic_test[['Fare']])
Example #17
    'n_estimators': np.arange(1, 200, 1),
    'nthread': [6],
    'gamma': np.arange(0.00001, 0.1, 0.00001),
    'subsample': np.arange(0.1, 2, 0.1),
    'reg_lambda': np.arange(0.1, 200, 1),
    'reg_alpha': np.arange(1, 200, 1),
    'min_child_weight': np.arange(1, 200, 1),
    'colsample_bytree': np.arange(0.1, 2, 0.1),
    'colsample_bylevel': np.arange(0.1, 2, 0.1)
}

tpot_estimator = tpot.TPOTClassifier(
    generations=5,
    population_size=100,
    offspring_size=250,
    verbosity=2,
    early_stop=3,
    config_dict={'xgboost.XGBClassifier': params},
    cv=5,
    scoring='accuracy')
tpot_estimator.fit(X_train, y_train)
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)

titanic_test1 = titanic[titanic_train.shape[0]:]
X_test = utils.select_features(rf_final_estimator,
                               titanic_test1,
                               threshold='mean')