import typing

import numpy as np
import pandas as pd
from river import base
from river.utils import dict2numpy  # note: helper location may vary across river versions
from tpot import TPOTClassifier


class OnlineTpotClassifier(base.Classifier):

    def __init__(self, n_training_samples, classes: list, max_time_mins: int = 15):
        self.n_training_samples = n_training_samples
        self.max_time_mins = max_time_mins
        self.training_samples_x = []
        self.training_samples_y = []
        self.estimator = None
        self.classes = classes

    def learn_one(self, x: dict, y: base.typing.ClfTarget, **kwargs) -> base.Classifier:
        if self.estimator is None:
            self.training_samples_x.append(x)
            self.training_samples_y.append(y)
        if len(self.training_samples_x) >= self.n_training_samples and self.estimator is None:
            # Enough samples buffered: run the TPOT search once, then free the buffer.
            x_train = np.stack([dict2numpy(i) for i in self.training_samples_x])
            self.estimator = TPOTClassifier(max_time_mins=self.max_time_mins)
            self.estimator.fit(x_train, self.training_samples_y)
            self.training_samples_y = []
            self.training_samples_x = []
        return self

    def predict_proba_one(self, x: dict) -> typing.Dict[base.typing.ClfTarget, float]:
        if self.estimator is not None:
            y_pred = self.estimator.predict_proba([list(x.values())])[0]
            return {self.classes[i]: p for i, p in enumerate(y_pred)}
        # No fitted pipeline yet: fall back to a uniform distribution over the classes.
        return {c: 1 / len(self.classes) for c in self.classes}

    def predict_proba_many(self, X):
        # predict_proba returns a 2-D array, so wrap it in a DataFrame rather than a Series.
        return pd.DataFrame(self.estimator.predict_proba(X), columns=self.classes)

    @property
    def _multiclass(self):
        return True

    def learn_many(self, X, y):
        # Assumes the underlying estimator exposes partial_fit; TPOTClassifier itself does not.
        self.estimator.partial_fit(X=X.values, y=y.values, classes=self.classes)
        return self

    def predict_one(self, x):
        if self.estimator is not None:
            return self.estimator.predict([list(x.values())])[0]
        return self.classes[0]

    def predict_many(self, X):
        return pd.Series(self.estimator.predict(X))
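A minimal usage sketch for the wrapper above, assuming it is driven incrementally the way a river classifier would be; the toy stream, buffer size, and time budget are illustrative placeholders, not part of the original code.

# Hypothetical driver for OnlineTpotClassifier: buffer samples one at a time, then
# let TPOT search once the buffer is full; predictions stay uniform until then.
stream = [({'x1': 0.1, 'x2': 1.0}, 0), ({'x1': 0.9, 'x2': 0.2}, 1)] * 10

model = OnlineTpotClassifier(n_training_samples=10, classes=[0, 1], max_time_mins=1)
for x, y in stream:
    print(model.predict_proba_one(x))  # {0: 0.5, 1: 0.5} until the TPOT search has run
    model.learn_one(x, y)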
def build_classifier(data, feature_pipeline, generations, population_size, name):
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    categories = pandas.unique(y)
    config = make_tpot_pmml_config(classifier_config_dict)
    config = filter_config(config)
    del config["sklearn.naive_bayes.GaussianNB"]  # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
    del config["sklearn.neighbors.KNeighborsClassifier"]
    del config["sklearn.svm.LinearSVC"]  # Does not support classifier.predict_proba(Xt)
    del config["sklearn.tree.DecisionTreeClassifier"]
    classifier = TPOTClassifier(generations=generations, population_size=population_size,
                                random_state=13, config_dict=config, verbosity=2)
    classifier.fit(Xt, y)
    pipeline = make_pmml_pipeline(
        Pipeline(steps=feature_pipeline.steps + classifier.fitted_pipeline_.steps),
        active_fields=X.columns.values, target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if len(categories) > 0:
        probabilities = DataFrame(classifier.predict_proba(Xt),
                                  columns=["probability(" + str(category) + ")" for category in categories])
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
class TpotEstimator(BaseEstimator):

    def __init__(self, task, **kwargs):
        super(TpotEstimator, self).__init__(task)
        if task == 'regression':
            self.tpot = TPOTRegressor(**kwargs)
        else:
            self.tpot = TPOTClassifier(**kwargs)
        self.name = 'tpot'
        self.label_encoder = None
        self.obj_cols = None

    def train(self, X, y, X_test):
        self.obj_cols = column_object_category_bool(X)
        self.label_encoder = SafeOrdinalEncoder()
        X[self.obj_cols] = self.label_encoder.fit_transform(X[self.obj_cols])
        self.tpot.fit(X, y)

    def predict_proba(self, X):
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        proba = self.tpot.predict_proba(X)
        print(f'proba.shape:{proba.shape}')
        return proba

    def predict(self, X):
        X[self.obj_cols] = self.label_encoder.transform(X[self.obj_cols])
        return self.tpot.predict(X)
def build_classifier(data, name):
    X, y = data
    categories = pandas.unique(y)
    config = make_tpot_pmml_config(classifier_config_dict)
    del config["sklearn.neighbors.KNeighborsClassifier"]
    classifier = TPOTClassifier(generations=1, population_size=3, random_state=13,
                                config_dict=config, verbosity=2)
    classifier.fit(X, y)
    pipeline = make_pmml_pipeline(classifier.fitted_pipeline_,
                                  active_fields=X.columns.values, target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(classifier.predict(X), columns=[y.name])
    if len(categories) > 0:
        probabilities = DataFrame(classifier.predict_proba(X),
                                  columns=["probability(" + str(category) + ")" for category in categories])
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name + ".csv")
def run_AutoTpot(self):
    # Running the AutoTpot pipeline
    automl = TPOTClassifier(generations=1, verbosity=2, config_dict='TPOT sparse')
    automl.fit(self.train, self.y_train)

    # TPOT produces ready-to-run, standalone Python code for the best-performing model,
    # in the form of a scikit-learn pipeline.
    # Exporting the best models
    automl.export(os.path.join(self.args.save_dir, 'tpot-sportswear.py'))

    print('The best pipeline discovered through auto-tpot is {}'.format(automl.fitted_pipeline_))
    print('Saving the best model discovered through TPOT.')

    # Dumping ensemble of the models
    joblib.dump(automl, os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle'))

    # Calculating time per prediction
    # Start time ******************************************************************************
    start = timeit.default_timer()

    # Predicting label, confidence probability on the test data set
    predictions = automl.predict(self.test)
    predictions_prob = automl.predict_proba(self.test)

    # Binary class values: rounding them to 0 or 1
    predictions = [round(value) for value in predictions]

    end = timeit.default_timer()
    # End Time ******************************************************************************
    print('Time per prediction : {}'.format((end - start) / self.test.shape[0]))

    self.visualize(predictions, automl)
def test_predict_proba2():
    """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float)"""
    tpot_obj = TPOTClassifier()
    pipeline_string = (
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict_proba(testing_features)
    rows = result.shape[0]
    columns = result.shape[1]

    try:
        for i in range(rows):
            for j in range(columns):
                float_range(result[i][j])
        assert True
    except Exception:
        assert False
class TPOTClassifierModel(Model):

    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        self._model.fit(X, y)
        # TODO: This is required by DESlib which is kind of annoying.
        # They call check_if_fitted(model, 'classes_'), meaning these have
        # to act more like general sklearn models
        #
        # If using more classifiers, the creation of a ClassifierModel
        # base class is probably required to ensure consistency
        self.classes_ = self._model.fitted_pipeline_.classes_

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        if not isinstance(self._model, TPOTClassifier):
            raise RuntimeError(
                'Due to TPOT being unpicklable, saving this'
                + ' means only the actual sklearn.Pipeline'
                + ' was saved. Calling fit will fit this pipeline'
                + ' rather than the TPOT algorithm. If this is'
                + ' desired behaviour, please use `_force_fit`'
                + ' instead')
        self._force_fit(X, y)

    def save(self, path: str) -> None:
        # See comment above class
        if isinstance(self._model, TPOTClassifier):
            self._model = self._model.fitted_pipeline_
        with open(path, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, path: str):
        with open(path, 'rb') as file:
            model = pickle.load(file)
        return cast(TPOTClassifierModel, model)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict_proba(X)
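A minimal round-trip sketch for the wrapper above, assuming the project-local `Model` base accepts `(name, model_params)` as shown; the synthetic dataset, file name, and tiny TPOT budget are illustrative placeholders.

# Hypothetical usage of TPOTClassifierModel: fit once, persist (which swaps in the
# fitted sklearn pipeline, per `save`), then reload the lightweight copy for inference.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
model = TPOTClassifierModel('tpot', {'generations': 1, 'population_size': 5, 'verbosity': 0})
model.fit(X, y)                    # allowed: _model is still a TPOTClassifier here
model.save('tpot_model.pkl')       # pickles the fitted sklearn pipeline only
restored = TPOTClassifierModel.load('tpot_model.pkl')
proba = restored.predict_proba(X)  # served directly by the restored pipeline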
def test_predict_proba():
    """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes)"""
    tpot_obj = TPOTClassifier()
    pipeline_string = (
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
        ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict_proba(testing_features)
    num_labels = np.amax(testing_classes) + 1

    assert result.shape == (testing_features.shape[0], num_labels)
class TPOTModelParam(ModelParamObject):

    def __init__(self, **kwargs):
        ModelParamObject.__init__(self)
        self.model = TPOTClassifier(**kwargs)
        self.fitted = False
        self.to_grid_search = False

    def optimize(self, X, y):
        """In the TPOT case, this runs the standard TPOT optimization algorithm."""
        print("Performing TPOT genetic optimization.")
        self.model.fit(X, y)
        self.optimized = True

    def predict_proba(self, X):
        return self.model.predict_proba(X)
def main():
    df_train = pd.read_csv(os.getenv('PREPARED_TRAINING'))
    df_valid = pd.read_csv(os.getenv('PREPARED_VALIDATING'))
    df_test = pd.read_csv(os.getenv('PREPARED_TESTING'))

    feature_cols = list(df_train.columns[:-1])
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values
    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values
    X_test = df_test[feature_cols].values

    prefix = os.getenv('STORING')
    tsne_data = np.load(os.path.join(prefix, 'tsne_2d_5p.npz'))
    tsne_train = tsne_data['train']
    tsne_valid = tsne_data['valid']
    tsne_test = tsne_data['test']

    # concat features
    X_train_concat = np.concatenate([X_train, tsne_train], axis=1)
    X_valid_concat = np.concatenate([X_valid, tsne_valid], axis=1)
    X_test_concat = np.concatenate([X_test, tsne_test], axis=1)

    tpot = TPOTClassifier(max_time_mins=int(os.getenv('TIME_LIMIT_ALL', '1440')),
                          max_eval_time_mins=int(os.getenv('TIME_LIMIT_PART', '5')),
                          population_size=100,
                          scoring='log_loss',
                          cv=3,
                          verbosity=2,
                          random_state=67)
    tpot.fit(X_train_concat, y_train)
    loss = tpot.score(X_valid_concat, y_valid)
    print(loss)

    tpot.export(os.path.join(prefix, 'tpot_pipeline.py'))

    p_test = tpot.predict_proba(X_test_concat)
    df_pred = pd.DataFrame({'id': df_test['id'], 'probability': p_test[:, 1]})
    csv_path = os.getenv('PREDICTING')
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
class TPOTBaselineModel(Model):

    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        self._model.fit(X, y)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        if not isinstance(self._model, TPOTClassifier):
            raise RuntimeError(
                'Due to TPOT being unpicklable, saving this'
                + ' means only the actual sklearn.Pipeline'
                + ' was saved. Calling fit will fit this pipeline'
                + ' rather than the TPOT algorithm. If this is'
                + ' desired behaviour, please use `_force_fit`'
                + ' instead')
        self._force_fit(X, y)

    def save(self, path: str) -> None:
        # See comment above class
        if isinstance(self._model, TPOTClassifier):
            self._model = self._model.fitted_pipeline_
        with open(path, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, path: str):
        with open(path, 'rb') as file:
            model = pickle.load(file)
        return cast(TPOTBaselineModel, model)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        # TODO: May cause issues if an SVC or SVM model is best
        return self._model.predict_proba(X)
def process_tpot(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Function that trains and tests data using tpot"""

    from tpot import TPOTClassifier
    from tpot import TPOTRegressor

    from ..config import classifier_config_dict

    # Register Timer
    def handler(signum, frame):
        raise SystemExit('Time limit exceeded, sending system exit...')

    signal.signal(signal.SIGALRM, handler)

    # default cv is 5
    if m_type == 'classification':
        automl = TPOTClassifier(generations=100,
                                population_size=100,
                                config_dict=classifier_config_dict,
                                verbosity=3,
                                max_time_mins=int(10800 / 60),
                                scoring='f1_weighted',
                                n_jobs=N_CORES,
                                random_state=seed)
    else:
        automl = TPOTRegressor(generations=100,
                               population_size=100,
                               verbosity=3,
                               max_time_mins=int(10800 / 60),
                               scoring='neg_mean_squared_error',
                               n_jobs=N_CORES,
                               random_state=seed)

    # Set timer
    # for long running processes TPOT sometimes does not end even with generations
    signal.alarm(TIME_PER_TASK + GRACE_PERIOD)
    automl.fit(X_train.values, y_train.values)
    signal.alarm(0)

    return (automl.predict_proba(X_test.values) if m_type == 'classification'
            else automl.predict(X_test.values))
def StackerTPOT(X_trainval, y_trainval, X_test, y_test):
    import tpot
    from sklearn.model_selection import RepeatedStratifiedKFold
    from sklearn.metrics import brier_score_loss
    from tpot import TPOTClassifier

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    model = TPOTClassifier(generations=5, population_size=50, cv=cv,
                           scoring='neg_brier_score', verbosity=2, random_state=1, n_jobs=-1)
    model.fit(X_trainval, y_trainval)
    model.export('tpot_best_model.py')

    # Now calculate the Brier score of the best model on the test data.
    # Note: predict_proba columns follow classes_, so verify that column 0
    # really is the probability of pos_label=1 before trusting this score.
    y_prob = model.predict_proba(X_test)
    return brier_score_loss(y_test, y_prob[:, 0], pos_label=1)
import multiprocessing
import uuid
from os.path import join

if __name__ == '__main__':
    multiprocessing.set_start_method('forkserver')
    import pandas as pd
    from sklearn import metrics
    from tpot import TPOTClassifier
    from kirgsn import reducing

    path_input_extended = 'input'

    tpot = TPOTClassifier(generations=10,
                          population_size=40,
                          verbosity=2,
                          scoring='roc_auc',
                          cv=5,
                          # max_time_mins=60*3,
                          random_state=1990,
                          n_jobs=-1,
                          periodic_checkpoint_folder='out')

    train = pd.read_csv(join(path_input_extended, 'train.csv'), na_values="-1")
    test = pd.read_csv(join(path_input_extended, 'test.csv'), na_values="-1")

    cols = [c for c in train.columns if c not in ['id', 'target']]
    tpot.fit(train[cols], train['target'])
    tpot.export('out/tpotted.py')

    test['target'] = tpot.predict_proba(test[cols])[:, 1]
    # Submission file tagged with a shortened UUID
    test[['id', 'target']].to_csv(
        'out/submissions/tpot_{}.csv.gz'.format(str(uuid.uuid4()).split(sep='-')[0]),
        index=False, float_format='%.5f', compression='gzip')
class TpotBaseline(object):

    def __init__(self, *, input_path, output_path, output_file_name):
        self.__input_path = input_path
        self.__output_path = output_path
        self.__output_file_name = output_file_name

        self.__train, self.__test = [None for _ in range(2)]
        self.__sample_submission = None
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None

        self.__categorical_index = None
        self.__numeric_index = None
        self.__encoder = None
        self.__imputer = None
        self.__clf = None

    def data_prepare(self):
        self.__train = pd.read_csv(os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path, "test_feature_df.csv"))
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path, "sample_submission_one.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop("TARGET", axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns]

        # Categorical variables: fill missing values, then encode them numerically
        self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
        self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__train_feature.iloc[:, self.__categorical_index].fillna("missing"))
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__test_feature.iloc[:, self.__categorical_index].fillna("missing"))

        self.__encoder = ce.TargetEncoder()
        self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index], self.__train_label)
        self.__train_feature.iloc[:, self.__categorical_index] = self.__encoder.transform(
            self.__train_feature.iloc[:, self.__categorical_index])
        self.__test_feature.iloc[:, self.__categorical_index] = self.__encoder.transform(
            self.__test_feature.iloc[:, self.__categorical_index])

        # Numeric variables: impute missing values with the median
        self.__imputer = Imputer(strategy="median")
        self.__imputer.fit(self.__train_feature.iloc[:, self.__numeric_index])
        self.__train_feature.iloc[:, self.__numeric_index] = self.__imputer.transform(
            self.__train_feature.iloc[:, self.__numeric_index])
        self.__test_feature.iloc[:, self.__numeric_index] = self.__imputer.transform(
            self.__test_feature.iloc[:, self.__numeric_index])

    def model_fit(self):
        self.__clf = TPOTClassifier(scoring="roc_auc", n_jobs=-1, verbosity=2)
        self.__clf.fit(self.__train_feature.values, self.__train_label.values)

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__clf.predict_proba(self.__test_feature.values)[:, 1]
        self.__sample_submission.to_csv(
            os.path.join(self.__output_path, self.__output_file_name), index=False)
        self.__clf.export(os.path.join(self.__output_path, "tpot_baseline.py"))
def autoframe(
    task,
    metalearning,
    prepb,
    feat_type,
    resultsfile,
    X_train,
    y_train,
    X_test,
    y_test,
    dataset,
    framework,
    foldn,
    ncore,
    timeforjob,
    dirt,
    meta,
    fitmetrics,
    outputdir,
    target,
):
    shape = [X_train.shape, y_train.shape, X_test.shape, y_test.shape]
    start = time.time()
    if framework == 'autosklearn':
        if task == "bt" or task == "bre":
            automl = autoclf(metalearning, framework, feat_type, timeforjob,
                             foldn, ncore, X_train, y_train, fitmetrics)
            y_pred_prob = automl.predict_proba(X_test)
            y_pred = automl.predict(X_test)  # needed by save_prob/metric below
        elif task == "it":
            automl = autoreg(metalearning, framework, feat_type, timeforjob,
                             foldn, ncore, X_train, y_train, fitmetrics)
            y_pred_prob = []
            y_pred = automl.predict(X_test)
    ###################################################################
    elif framework == 'tpot':
        if task == "bt" or task == "bre":
            tpot = TPOTClassifier(max_time_mins=int(timeforjob / 60),
                                  max_eval_time_mins=float(timeforjob / 60),
                                  n_jobs=ncore,
                                  verbosity=2)
            tpot.fit(X_train, y_train)
            y_pred_prob = tpot.predict_proba(X_test)
            y_pred = tpot.predict(X_test)  # needed by save_prob/metric below
            automl = tpot
        elif task == "it":
            tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
            tpot.fit(X_train, y_train)  # fit before predicting
            y_pred_prob = []
            automl = tpot
            y_pred = tpot.predict(X_test)
            print(tpot.score(X_test, y_test))
    end = time.time()
    timespend = float(end - start)
    ###################################################################
    ###################################################################
    save_prob(timeforjob, dataset, resultsfile, foldn, y_pred, y_pred_prob, outputdir)
    metrics = metric(task, y_test, y_pred, y_pred_prob)
    print(dataset)
    get_run_info(
        metalearning,
        automl,
        dataset,
        shape,
        timeforjob,
        ncore,
        foldn,
        framework,
        resultsfile,
        fitmetrics,
        metrics,
        timespend,
        prepb,
        outputdir,
        target,
    )
scoring="roc_auc", cv=5, n_jobs=-1, verbosity=2, random_state=0) tpot_cls.fit(x_train, y_train) # Print best pipeline steps print('\nBest pipeline steps:', end='\n') for idx, (name, transform) in enumerate(tpot_cls.fitted_pipeline_.steps, start=1): # Print idx and transform print(f'{idx}. {transform}') # y_pred_prob is a 2-D array of probability of being labeled as 0 (first column of array) vs 1 (2nd column in array) y_pred_prob = tpot_cls.predict_proba(x_test) # AUC score for tpot model tpot_roc_auc_score = roc_auc_score(y_test, y_pred_prob[:, 1]) print(f'\nAUC score for TPOT Best Model: {tpot_roc_auc_score:.4f}') #Exporting the model tpot_cls.export("tpot_best_model.py") # X_train's variance, rounding the output to 3 decimal places print('\n', x_train.var().round(3)) #Lowering the variance of the feature with high variance using log transformation x_train_normed, x_test_normed = x_train.copy(), x_test.copy() # Log normalization
def automl(market):
    print "evaluating market ", market

    dt = datetime.now()
    #rng = sri.KISS(123958, 34987243, 3495825239, 2398172431)
    rng = sri.KISS(dt.microsecond)

    csv = pd.read_csv('data/' + market + '.csv')
    print csv.shape
    #print csv.columns

    # Dropping rows with label 0
    # Doing binary logistic regression here
    # in F_AD there are only 181 0 labels
    # but there could be more.
    csv = csv[(csv.y != 0)]
    y = csv['y']
    dates = csv['date']
    # lookback 255. 254 to 0. Exclude 0.
    X = csv.loc[:, '254':'1']  # Accuracy: 0.5172

    #seed = 342
    #seed = 3165278097
    seed = next(rng)
    print "seed:", seed

    cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=seed)

    # time: 6252.53945184
    tpot_config = {
        'xgboost.sklearn.XGBClassifier': {
            #'gamma': [0,0.5,1.0],
            #'subsample': [0.4,0.6,0.8,1.0],
            #'colsample_bylevel': [0.5,0.75,1.0],
            'max_depth': [1],      # [1,2,3]
            'learning_rate': [1],  # [1,0.1,0.01]
            #'silent': [1.0],
            'nthread': [-1],
            #'n_estimators': [50,75,100,125,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]}
            'n_estimators': [100]
        }
    }

    # default: gen=5, pop=20
    # target: gen=10, pop=100
    pipeline_optimizer = TPOTClassifier(
        generations=2,
        population_size=20,
        cv=cv,
        n_jobs=-1,
        random_state=seed,
        verbosity=3,
        periodic_checkpoint_folder='checkpoints',
        config_dict=tpot_config)

    start_time = timeit.default_timer()
    pipeline_optimizer.fit(X, y)
    elapsed = timeit.default_timer() - start_time
    print "time:", elapsed

    # pseudo test
    X_test = csv.loc[:9, '254':'1']  # Accuracy: 0.5172
    print X_test.shape

    # Performance on test set. Might (probably will) differ from best pipeline score.
    #print X_test
    joblib.dump(X_test, "xtest.txt")
    print(pipeline_optimizer.predict(X_test))
    print(pipeline_optimizer.predict_proba(X_test))

    # Write out best pipeline as python code
    t = time.localtime()
    timestamp = time.strftime('%m%d%Y_%H%M%S', t)
    pipeline_optimizer.export('export/tpot_exported_pipeline_' + market + '_' + timestamp + '_' + str(seed) + '.py')

    # Serialize Pipe as JSON. Because dump string does not ASCII encode bytearrays.
    clfname = 'clfs/pipe_' + market + '_' + timestamp + '_' + str(seed) + '.json'
    frozen = jsonpickle.encode(pipeline_optimizer.fitted_pipeline_)
    f = open(clfname, 'w')
    f.write(frozen)
    f.close()

    f = open(clfname, 'r')
    frozen = f.read()
    f.close()
    thawed = jsonpickle.decode(frozen)
    #thawed = jsonpickle.decode(jsonstring)
    print "*** debug"
    print thawed.predict(X_test)
    print thawed.predict_proba(X_test)
with open("./classification_tables.pkl", 'rb') as fp: datasets = pickle.load(fp) data = datasets[0]['data'] X = data.iloc[:, :-1].to_numpy(dtype=int) y = data.iloc[:, -1].to_numpy(dtype=int) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) tpot = TPOTClassifier(generations=20, population_size=20, n_jobs=8, verbosity=2, scoring='balanced_accuracy') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) output_fname = f"pipeline_{datasets[0]['assay']}.py" tpot.export(output_fname) print("## EXPORTED FILE: ##") with open(output_fname, 'r') as fp: print(fp.read()) print("## END EXPORTED FILE ##") print("## PFHXS PREDICTED PROBABILITY:") print(tpot.predict_proba(pfhxs)) print("## PFHXS PREDICTED CLASS:") print(tpot.predict(pfhxs))
print('y_valid', y_valid)

t1 = time.time()
print('TPOT...!')
tpot = TPOTClassifier(
    max_time_mins=60 * 10,
    population_size=100,
    scoring='roc_auc',
    cv=3,
    verbosity=2,
    random_state=67,
    n_jobs=-1)
tpot.fit(x_t, y_train)
tpot.export('./tpot_pipeline.py')
print('accuracy is {}'.format(tpot.score(x_v, y_valid)))

probab = tpot.predict_proba(x_v)
probab = probab[:, 1]
print('AUC Score is {}'.format(roc_auc_score(y_valid, probab)))
t2 = time.time()
print('Total time taken by TPOT:', int(t2 - t1))

check_x = x_v.set_index(X_valid['AGREEMENTID'])
check_x.set_index(X_valid['AGREEMENTID'], inplace=True)
check_y = pd.DataFrame(y_valid).set_index(X_valid['AGREEMENTID'])
check_pred = pd.DataFrame(tpot.predict(x_v)).set_index(X_valid['AGREEMENTID'])
# rf_y_pred = RF.predict(Xtest)

# RF optimization
tpot_classifier = TPOTClassifier(generations=5, population_size=24, offspring_size=12,
                                 verbosity=2, early_stop=12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': parameters},
                                 cv=4, scoring='balanced_accuracy')

# Model fitting
tpot_classifier.fit(Xtrain, ytrain)

# Prediction
rf_y_pred = tpot_classifier.predict(Xtest)
rf_y_prob = [probs[1] for probs in tpot_classifier.predict_proba(Xtest)]

# Validation and performance metrics
print('RF')
print(confusion_matrix(ytest, rf_y_pred))
print('kappa', cohen_kappa_score(ytest, rf_y_pred))
report = precision_recall_fscore_support(ytest, rf_y_pred, average='weighted')
auc_test_RF[fold] = roc_auc_score(ytest, rf_y_pred, average='weighted')
kappa_test_RF[fold] = cohen_kappa_score(ytest, rf_y_pred)
f1_test_RF[fold] = report[2]
acc_test_RF[fold] = report[0]  # note: report[0] is the weighted precision, not accuracy

# Compute area under the curve
fpr, tpr, _ = roc_curve(ytest, rf_y_prob)
roc_auc = auc(fpr, tpr)
tpotC.fit(X_train, y_train)
fin = process_time()
print("Elapsed time in seconds : ", fin - debut)

X_test = pd.read_csv('/home/bench/notebooks/data/TomWilliams/x_test_IoT_Botnet.csv')
predictions = tpotC.predict(X_test)

y_test = pd.read_csv('/home/bench/notebooks/data/TomWilliams/y_test_IoT_Botnet.csv')
y_pred = pd.DataFrame(data=predictions)
y_pred.to_csv("y_pred_IoT_Botnet_Tpot.csv")

probs = tpotC.predict_proba(X_test)
probs = probs[:, 1]
# Use a distinct name so the sklearn `auc` function is not shadowed below
auc_score = roc_auc_score(y_test, probs)
print('AUC: %.2f' % auc_score)

fpr, tpr, _ = roc_curve(y_test, probs)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw,
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data.astype(np.float64), iris.target.astype(np.float64),
    train_size=0.75, test_size=0.25)
print(type(X_train))
print(type(X_test))
print(type(y_train))
print(type(y_test))

tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.predict(X_test))
print(tpot.predict_proba(X_test))
print(tpot.score(X_test, y_test))
class BotClassifier:

    FEATURE_IMPORTANCE_CONFIG = {
        # Classifiers
        'sklearn.tree.DecisionTreeClassifier': {
            'criterion': ["gini", "entropy"],
            'max_depth': range(1, 11),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21)
        },
        'sklearn.ensemble.ExtraTreesClassifier': {
            'n_estimators': [100],
            'criterion': ["gini", "entropy"],
            'max_features': np.arange(0.05, 1.01, 0.05),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21),
            'bootstrap': [True, False]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [100],
            'criterion': ["gini", "entropy"],
            'max_features': np.arange(0.05, 1.01, 0.05),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21),
            'bootstrap': [True, False]
        },
        'sklearn.ensemble.GradientBoostingClassifier': {
            'n_estimators': [100],
            'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            'max_depth': range(1, 11),
            'min_samples_split': range(2, 21),
            'min_samples_leaf': range(1, 21),
            'subsample': np.arange(0.05, 1.01, 0.05),
            'max_features': np.arange(0.05, 1.01, 0.05)
        },
        'xgboost.XGBClassifier': {
            'n_estimators': [100],
            'max_depth': range(1, 11),
            'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
            'subsample': np.arange(0.05, 1.01, 0.05),
            'min_child_weight': range(1, 21),
            'nthread': [1]
        },
    }
    FEATURE_IMPORTANCE_TEMPLATE = "Classifier"

    def __init__(
        self,
        number_of_generations: int = 3,
        population_size: int = 10,
        scoring: str = "accuracy",  # "accuracy", "f1", "precision", "recall", "roc_auc"
        cv: Union[int, List[Tuple[List[int], List[int]]]] = 5,
        verbosity: int = 0,  # 0, 1, 2, 3
        number_of_jobs: int = -1,  # -1 = number of cores
        is_feature_importances: bool = False,
    ) -> None:
        config = self.FEATURE_IMPORTANCE_CONFIG if is_feature_importances else None
        template = self.FEATURE_IMPORTANCE_TEMPLATE if is_feature_importances else None
        self.classifier = TPOTClassifier(
            generations=number_of_generations,
            population_size=population_size,
            scoring=scoring,
            cv=cv,
            verbosity=verbosity,
            n_jobs=number_of_jobs,
            config_dict=config,
            template=template,
        )
        self.is_feature_importances = is_feature_importances

    def fit(self, features: pd.DataFrame, classes: pd.DataFrame) -> None:
        self.feature_names = features.columns
        self.classifier.fit(features, classes)

    def predict(self, features: pd.DataFrame) -> np.ndarray:
        return self.classifier.predict(features)

    def predict_proba(self, features: pd.DataFrame) -> np.ndarray:
        return self.classifier.predict_proba(features)

    def score(self, testing_features: pd.DataFrame, testing_classes: pd.DataFrame) -> float:
        return self.classifier.score(testing_features, testing_classes)

    def export(self, output_file_name: str) -> None:
        self.classifier.export(output_file_name)

    def scores(self, testing_features: pd.DataFrame, testing_classes: pd.DataFrame) -> Dict[str, float]:
        # labels = testing_classes["label"].tolist()
        print(1)
        classifier_predictions = self.predict(testing_features)
        print(2)
        classifier_prob_predictions = self.predict_proba(testing_features)[:, 1]
        print(3)
        scores_dict = {
            "accuracy": accuracy_score(testing_classes, classifier_predictions),
            "precision": precision_score(testing_classes, classifier_predictions),
            "recall": recall_score(testing_classes, classifier_predictions),
            "f1": f1_score(testing_classes, classifier_predictions),
            "roc_auc": roc_auc_score(testing_classes, classifier_prob_predictions),
        }
        print(4)
        return scores_dict

    def get_fitted_pipeline(self) -> Pipeline:
        fitted_pipeline = self.classifier.fitted_pipeline_
        return fitted_pipeline

    def get_pareto_front_fitted_pipelines(self) -> Dict[str, Pipeline]:
        try:
            return self.classifier.pareto_front_fitted_pipelines_
        except Exception as e:
            print(f"cannot get pareto_front_fitted_pipelines_\n{e}")
            return {}

    def get_evaluated_individuals(self) -> Dict[str, Dict[str, Union[int, float, Tuple[str, ...]]]]:
        try:
            return self.classifier.evaluated_individuals_
        except Exception as e:
            print(f"cannot get evaluated_individuals_\n{e}")
            return {}

    def get_feature_importances(self) -> Dict[str, float]:
        if True or self.is_feature_importances:
            try:
                classifier_with_feature_importance = self.get_fitted_pipeline()[-1]
                feature_importances = classifier_with_feature_importance.feature_importances_
                return dict(zip(self.feature_names, feature_importances))
            except Exception as e:
                print(f"cannot get feature importances\n{e}")
                return
        else:
            print("is_feature_importances is passed as False, cannot retrieve feature importances")
            return
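A minimal usage sketch for the wrapper above, assuming only scikit-learn and pandas are available; the synthetic dataset and small TPOT budget are illustrative placeholders, not part of the original code.

# Illustrative only: exercise BotClassifier on synthetic binary data with a tiny budget.
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
features = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
labels = pd.Series(y)

bot = BotClassifier(number_of_generations=1, population_size=5, scoring="roc_auc")
bot.fit(features, labels)
print(bot.scores(features, labels))   # accuracy / precision / recall / f1 / roc_auc
print(bot.get_fitted_pipeline())      # the winning sklearn pipeline found by TPOT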
# with joblib.parallel_backend("dask"):
tpot.fit(X_train, y_train)
tpot.score(X_test, y_test)

# Winning pipelines
print(tpot.fitted_pipeline_)

# copy file
# tpot.export('tpot_mnist_pipeline.py')

# Get predictions
y_predict = tpot.predict(X_test)

# Probability of malignant tissue produced by the model
y_prob = [probs[1] for probs in tpot.predict_proba(X_test)]

# Accuracy on test set
print("Test accuracy: %s\n" % (accuracy_score(y_test, y_predict).round(2)))

# Confusion matrix test set
conf_mat = pd.DataFrame(confusion_matrix(y_test, y_predict),
                        columns=['Predicted NO', 'Predicted YES'],
                        index=['Actual NO', 'Actual YES'])
print(conf_mat)

# Compute area under the curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# only use if you can visualise
# Auto_ml
START_EXPERIMENT = time.time()

automl = TPOTClassifier(
    max_time_mins=(TIME_LIMIT // 60),
    scoring='roc_auc',
    verbosity=1,
    random_state=RANDOM_SEED,
)
automl.fit(
    X_train,
    y_train,
)

try:
    predictions = automl.predict_proba(X_test)
except RuntimeError:
    # TPOT raises RuntimeError when the winning pipeline does not implement
    # predict_proba; fall back to hard class predictions in that case.
    predictions = automl.predict(X_test)

y_test_predict_proba = predictions[:, 1]
y_test_predict = automl.predict(X_test)

print('AUC: ', roc_auc_score(y_test, y_test_predict_proba))

END_EXPERIMENT = time.time()

#preds = pd.DataFrame(predictions)
#preds['Y'] = y_test.reset_index(drop=True)
#preds.to_csv(f'./result/predicts/{DATASET_NAME}_{MODEL_NAME}_predict_proba_exp_{EXPERIMENT}.csv', index=False,)

metrics.append({
    'AUC':
float_tab = float_tab[float_tab['date'] > '2009']
float_tab.to_csv(os.path.join(path_output, 'data.csv'), index=False, encoding='utf8')

X_train, X_test, y_train, y_test = split_by_date(float_tab, 'date')

pipeline_optimizer = TPOTClassifier()
pipeline_optimizer = TPOTClassifier(generations=10, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline_batiment_detail.py')

# Check whether the 150 top-ranked predictions are classified correctly:
proba = pipeline_optimizer.predict_proba(X_test)
proba_etre_insalubre = [x[1] for x in proba]
proba = pd.Series(proba_etre_insalubre)
prediction = proba.rank(ascending=False, method='min') < 150 + 1
y_test.loc[prediction.values].value_counts(normalize=True)

autre_approche = pipeline_optimizer.predict(X_test) == 1
y_test.loc[autre_approche].value_counts(normalize=True)
autre_approche = pipeline_optimizer.predict(X_test) == 0
y_test.loc[autre_approche].value_counts(normalize=True)
X_train.info()

# In[17]:

tpot = TPOTClassifier(generations=10, population_size=20, verbosity=2,
                      scoring='roc_auc', random_state=42,
                      disable_update_check=True, config_dict='TPOT light')
tpot.fit(X_train, y_train)

# In[18]:

tpot_auc_score = roc_auc_score(y_test, tpot.predict_proba(X_test)[:, 1])
print(f'\nAUC score: {tpot_auc_score:.4f}')

# In[19]:

print('\nBest pipeline steps:', end='\n')
for idx, (name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Print idx and transform
    print(f'{idx}. {transform}')

# In[20]:

tpot.fitted_pipeline_

# In[21]:
    model_specific_preprocessing=True)
X_test = lr.preprocess(
    X_test,
    is_train=False,
    vect_max_features=hyperparams['vectorizer_dict_size'],
    model_specific_preprocessing=True)

t = Thread(target=sleep, args=(hours * 60 * 60, ))
t.start()

# optimize roc_auc metric
clf = TPOTClassifier(scoring='roc_auc', random_state=0, verbosity=2,
                     config_dict=classifier_config_dict, population_size=20)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:, 1]

with open(f'tpot_pred/{data_name}.pickle', 'wb') as f:
    pickle.dump(y_pred, f)

# get prediction
auc = roc_auc_score(y_test.values, y_pred)
res[data_name] = auc
print(data_name, auc)
print('#' * 100)
print(res)

with open(f"tpot_pred/result.pickle", 'wb') as f:
    pickle.dump(res, f)