def test_predict():
    """Check that TPOT's predict raises ValueError when nothing has been optimized yet."""
    unfitted = TPOTClassifier()

    try:
        unfitted.predict(testing_features)
    except ValueError:
        # Expected outcome: predicting without a fitted pipeline is rejected.
        return

    assert False  # Should be unreachable
class OnlineTpotClassifer(base.Classifier):
    """Online-style wrapper that fits a TPOT classifier on a buffered warm-up set.

    Samples passed to ``learn_one`` are buffered until ``n_training_samples``
    of them have been collected; a ``TPOTClassifier`` is then fitted once on
    that buffer and the buffer is released.  While no estimator exists,
    ``predict_proba_one`` returns a uniform distribution and ``predict_one``
    returns the first entry of ``classes``.

    Parameters
    ----------
    n_training_samples
        Number of buffered samples that triggers the one-off TPOT fit.
    classes
        Class labels, used as keys/columns of the probability outputs.
        NOTE(review): probabilities are matched to ``classes`` by position,
        so this presumably must follow the estimator's class order - confirm.
    max_time_mins
        Time budget forwarded to ``TPOTClassifier``.
    """

    def __init__(self, n_training_samples, classes: list, max_time_mins: int = 15):
        self.n_training_samples = n_training_samples
        self.max_time_mins = max_time_mins
        self.training_samples_x = []
        self.training_samples_y = []
        self.estimator = None
        self.classes = classes

    def learn_one(self, x: dict, y: base.typing.ClfTarget, **kwargs) -> base.Classifier:
        """Buffer one sample and fit TPOT once enough samples are available.

        Samples received after the estimator has been created are ignored.
        """
        if self.estimator is not None:
            return self

        self.training_samples_x.append(x)
        self.training_samples_y.append(y)

        if len(self.training_samples_x) >= self.n_training_samples:
            x_train = np.stack([dict2numpy(i) for i in self.training_samples_x])
            self.estimator = TPOTClassifier(max_time_mins=self.max_time_mins)
            self.estimator.fit(x_train, self.training_samples_y)
            # The warm-up buffer is no longer needed once the model is fitted.
            self.training_samples_y = []
            self.training_samples_x = []
        return self

    def predict_proba_one(self, x: dict) -> typing.Dict[base.typing.ClfTarget, float]:
        """Return class probabilities for one sample (uniform before fitting)."""
        if self.estimator is None:
            return {c: 1 / len(self.classes) for c in self.classes}
        y_pred = self.estimator.predict_proba([list(x.values())])[0]
        return {self.classes[i]: p for i, p in enumerate(y_pred)}

    def predict_proba_many(self, X):
        """Return class probabilities for a batch, one column per class.

        A DataFrame is required here: ``predict_proba`` yields a 2-D array and
        ``pd.Series`` does not accept a ``columns`` argument.
        """
        return pd.DataFrame(self.estimator.predict_proba(X), columns=self.classes)

    @property
    def _multiclass(self):
        return True

    def learn_many(self, X, y):
        """Forward a mini-batch to the estimator's ``partial_fit``.

        NOTE(review): ``self.estimator`` is ``None`` until the warm-up fit in
        ``learn_one`` has run, and it is unclear whether ``TPOTClassifier``
        provides ``partial_fit`` - confirm before relying on this method.
        """
        self.estimator.partial_fit(X=X.values, y=y.values, classes=self.classes)
        return self

    def predict_one(self, x):
        """Return the predicted label for one sample (``classes[0]`` before fitting)."""
        if self.estimator is None:
            return self.classes[0]
        return self.estimator.predict([list(x.values())])[0]

    def predict_many(self, X):
        """Return the predicted labels for a batch as a ``pd.Series``."""
        return pd.Series(self.estimator.predict(X))
def run_AutoTpot(self):
    """Run a one-generation sparse TPOT search, persist it and time inference.

    Fits on ``self.train`` / ``self.y_train``, exports the best pipeline as a
    Python script into ``self.args.save_dir``, dumps the fitted TPOT object
    into ``self.args.checkpoint_dir``, prints the average time per test row
    and finally hands the rounded predictions to ``self.visualize``.
    """
    automl = TPOTClassifier(config_dict='TPOT sparse', verbosity=2, generations=1)
    automl.fit(self.train, self.y_train)

    # TPOT can emit the winning pipeline as standalone scikit-learn code.
    export_path = os.path.join(self.args.save_dir, 'tpot-sportswear.py')
    automl.export(export_path)
    print('The best pipeline discovered through auto-tpot is {}'.format(automl.fitted_pipeline_))

    print('Saving the best model discovered through TPOT.')
    checkpoint_path = os.path.join(self.args.checkpoint_dir, 'auto-tpot.pickle')
    joblib.dump(automl, checkpoint_path)

    # Timed section: labels, probabilities and rounding of the labels.
    started_at = timeit.default_timer()
    raw_predictions = automl.predict(self.test)
    predictions_prob = automl.predict_proba(self.test)
    # Round each predicted value to the nearest integer class label.
    predictions = []
    for value in raw_predictions:
        predictions.append(round(value))
    finished_at = timeit.default_timer()

    elapsed = finished_at - started_at
    print('Time per prediction : {}'.format(elapsed / self.test.shape[0]))
    self.visualize(predictions, automl)
class AutomlInstance:
    """Fit a TPOT classifier on an OpenML dataset and expose its evaluated pipelines.

    Parameters
    ----------
    openML_id
        OpenML data id passed to ``fetch_openml``.
    scoring_function
        Forwarded to ``TPOTClassifier(scoring=...)``.
    memory_path
        Optional path forwarded to ``TPOTClassifier(memory=...)``.  When it
        points to an existing file the classifier is created with
        ``warm_start=True`` and without a time limit.
    max_time
        Forwarded as ``max_time_mins`` unless the warm-start branch is taken.
    """

    def __init__(self, openML_id, scoring_function, memory_path=None, max_time=None):
        self.y_class_dict = None
        self.X_train, self.X_test, self.y_train, self.y_test = self.get_dataset(openML_id)
        if memory_path is not None and Path(memory_path).is_file():
            self.tpot = TPOTClassifier(memory=memory_path, warm_start=True,
                                       scoring=scoring_function, verbosity=3)
        else:
            # memory=None is TPOT's default, so this also covers "no memory path".
            self.tpot = TPOTClassifier(memory=memory_path, max_time_mins=max_time,
                                       scoring=scoring_function, verbosity=3)
        self.tpot.fit(self.X_train, self.y_train)

    def predict(self, X):
        """Return the fitted TPOT model's predictions for ``X``."""
        return self.tpot.predict(X)

    def get_segments(self) -> 'List[Segment]':
        """Refit every evaluated TPOT pipeline and wrap its test predictions.

        Pipelines whose compilation, fitting or prediction raises
        ``ValueError`` or ``RuntimeError`` are reported and skipped.
        """
        segments = []
        for model in self.tpot.evaluated_individuals_:
            try:
                classifier = self.tpot._toolbox.compile(
                    creator.Individual.from_string(model, self.tpot._pset))
                classifier.fit(self.X_train, self.y_train)
                y_pred = classifier.predict(self.X_test)
                segments.append(Segment(y_ground=self.y_test, y_pred=y_pred))
            except (ValueError, RuntimeError):
                print("One classifier could not be evaluated.")
        return segments

    def get_dataset(self, openMl_id, test_size=0.2):
        """Download the dataset, split it, impute missing values and encode labels.

        Also records ``dataset_categories``, ``feature_names_X`` and
        ``target_categories`` on the instance.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)`` with integer-encoded targets.
        """
        # One download serves data, target and metadata.  The category
        # metadata used to be read from the hard-coded dataset id 31.
        openml_data = openml.fetch_openml(data_id=openMl_id, return_X_y=False)
        X, y = openml_data.data, openml_data.target
        self.dataset_categories = openml_data.categories
        self.feature_names_X = openml_data.feature_names
        self.target_categories = numpy.unique(y)

        # Build the label mapping from the full target so that a class missing
        # from the training split cannot break the encoding of the test split.
        if self.y_class_dict is None:
            self._create_class_dict(y)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        # Fit the imputer on the training split only and reuse its statistics
        # for the test split, so no test information leaks into preprocessing.
        imp = Imputer()
        imp.fit(X_train)
        X_train = imp.transform(X_train)
        X_test = imp.transform(X_test)

        y_train = self._y_string_2_int(y_train)
        y_test = self._y_string_2_int(y_test)
        return X_train, X_test, y_train, y_test

    def _y_string_2_int(self, y: numpy.ndarray):
        """Map labels to their integer codes, creating the mapping on first use."""
        if self.y_class_dict is None:
            self._create_class_dict(y)
        label_to_code = {label: code for code, label in self.y_class_dict.items()}
        return numpy.array([label_to_code[val] for val in y])

    def _create_class_dict(self, y: numpy.ndarray):
        """Store ``{code: label}`` for the sorted unique labels of ``y``."""
        self.y_class_dict = dict(enumerate(numpy.unique(y).tolist()))
def run_tpot(zeros, ones):
    """Run a TPOT search on PCA-reduced data and write a classification report.

    The data built by ``make_all_data(zeros, ones)`` is split 90/10, reduced to
    15 principal components, and used for a 50-generation TPOT search scored
    with F1.  The best pipeline is exported to ``tpot_ecog_pipeline.py`` and
    the test-set classification report is written to ``tpot_metrics.txt``.
    """
    all_data, y = make_all_data(zeros, ones)
    X_train, X_test, y_train, y_test = train_test_split(all_data, y, test_size=.1)

    pca = PCA(n_components=15)
    X_train = pca.fit_transform(X_train)
    # Project the test set with the components learned on the training set;
    # refitting here would place train and test in different feature spaces.
    X_test = pca.transform(X_test)

    # Optional settings kept for reference:
    #   subsample=.5, periodic_checkpoint_folder='tpot_checkpoint'
    tpot = TPOTClassifier(
        n_jobs=-1,
        generations=50,
        verbosity=3,
        scoring='f1',
        max_eval_time_mins=30,
        memory='auto')
    tpot.fit(X_train, y_train)
    tpot.export('tpot_ecog_pipeline.py')

    results = tpot.predict(X_test)
    # The context manager guarantees the report is flushed and the file closed.
    with open('tpot_metrics.txt', 'w') as out_file:
        out_file.write(sklearn.metrics.classification_report(y_test, results))
def run_tpot(X, y, target_ft, time_budget=30, include_preprocessors=None, n_jobs=1):
    """Run a time-limited TPOT search and report accuracy and weighted F1.

    Parameters
    ----------
    X, y
        Features and target; split with ``train_test_split(random_state=1)``.
    target_ft
        Not used by this function.
    time_budget
        Budget that is converted to whole minutes by dividing by 60
        (presumably seconds - confirm with callers).  At least one minute is
        always granted, because the integer division used to turn any budget
        below 60 (including the default) into ``max_time_mins=0``.
    include_preprocessors
        When falsy the search is restricted to ``template='Classifier'``.
    n_jobs
        Forwarded to ``TPOTClassifier``.

    Returns
    -------
    tuple
        ``(str(metrics_list), result_row)`` where ``result_row`` holds the F1
        score, the accuracy and the value returned by ``export()``.
    """
    print(n_jobs)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)

    tpot_kwargs = dict(
        max_time_mins=max(1, time_budget // 60),
        generations=None,
        use_dask=False,
        n_jobs=n_jobs,
    )
    if not include_preprocessors:
        # Without preprocessors only a bare classifier step is searched.
        tpot_kwargs['template'] = 'Classifier'
    pipeline_optimizer = TPOTClassifier(**tpot_kwargs)

    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)
    acc = sklearn.metrics.accuracy_score(y_test, y_hat)
    f1_s = sklearn.metrics.f1_score(y_test, y_hat, average='weighted')

    metrs = [
        "Accuracy score - " + str(acc),
        "F1 score - " + str(f1_s),
    ]
    res = ["", "", "", "", f1_s, acc, "", pipeline_optimizer.export()]
    return str(metrs), res
def build_classifier(data, name):
    """Fit a small TPOT search, store it as a PMML pipeline and dump its predictions.

    ``data`` is an ``(X, y)`` pair.  The pickled pipeline goes to
    ``name + ".pkl"`` and the predictions (plus one probability column per
    distinct target value) to ``name + ".csv"``.
    """
    X, y = data
    categories = pandas.unique(y)

    config = make_tpot_pmml_config(classifier_config_dict)
    config.pop("sklearn.neighbors.KNeighborsClassifier")

    classifier = TPOTClassifier(
        config_dict=config,
        generations=1,
        population_size=3,
        random_state=13,
        verbosity=2,
    )
    classifier.fit(X, y)

    pipeline = make_pmml_pipeline(
        classifier.fitted_pipeline_,
        active_fields=X.columns.values,
        target_fields=[y.name],
    )
    print(repr(pipeline))
    store_pkl(pipeline, name + ".pkl")

    result = DataFrame(classifier.predict(X), columns=[y.name])
    if len(categories):
        proba_columns = ["probability(" + label + ")" for label in map(str, categories)]
        probabilities = DataFrame(classifier.predict_proba(X), columns=proba_columns)
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name + ".csv")
def build_classifier(data, feature_pipeline, generations, population_size, name):
    """Fit TPOT on pre-transformed features and store the combined PMML pipeline.

    ``data`` is an ``(X, y)`` pair; ``X`` is passed through
    ``feature_pipeline`` and cast to float before the search.  The stored
    pipeline chains the feature steps with the steps of the best TPOT
    pipeline.  Predictions and per-category probabilities are written with
    ``store_csv``.
    """
    X, y = data
    Xt = feature_pipeline.fit_transform(X).astype(float)
    categories = pandas.unique(y)

    config = filter_config(make_tpot_pmml_config(classifier_config_dict))
    unsupported = (
        # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
        "sklearn.naive_bayes.GaussianNB",
        "sklearn.neighbors.KNeighborsClassifier",
        # Does not support classifier.predict_proba(Xt)
        "sklearn.svm.LinearSVC",
        "sklearn.tree.DecisionTreeClassifier",
    )
    for estimator_key in unsupported:
        del config[estimator_key]

    classifier = TPOTClassifier(
        config_dict=config,
        generations=generations,
        population_size=population_size,
        random_state=13,
        verbosity=2,
    )
    classifier.fit(Xt, y)

    combined_steps = feature_pipeline.steps + classifier.fitted_pipeline_.steps
    pipeline = make_pmml_pipeline(
        Pipeline(steps=combined_steps),
        active_fields=X.columns.values,
        target_fields=[y.name],
    )
    print(repr(pipeline))
    store_pkl(pipeline, name)

    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if len(categories):
        proba_columns = ["probability(" + label + ")" for label in map(str, categories)]
        probabilities = DataFrame(classifier.predict_proba(Xt), columns=proba_columns)
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
def train(X, Y, test_size=0.2, auto_ml=False, use_best_classifier=False,
          classifier_name=None):
    """Train one classifier on a random split and print its evaluation.

    The model is chosen in this order: a 6-generation TPOT search when
    ``auto_ml`` is set; the entry ``classifier_name`` of the module-level
    ``classifiers`` table when ``use_best_classifier`` is set; otherwise a
    depth-3 entropy decision tree.

    Returns a one-element list holding the fitted classifier.
    """
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        X, Y, test_size=test_size)
    # Feature scaling (StandardScaler) is intentionally not applied here.

    if auto_ml:
        classifier = TPOTClassifier(generations=6, verbosity=2)
    elif use_best_classifier and classifier_name:
        # Work on a copy so popping "estimator" leaves the shared table intact.
        params = copy.deepcopy(classifiers)[classifier_name]
        estimator = params.pop("estimator")
        classifier = estimator(**params)
    else:
        classifier = tree.DecisionTreeClassifier(max_depth=3, criterion="entropy")

    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    report = metrics.classification_report(y_test, predicted)
    print("Classification report for classifier %s:\n%s\n"
          % (classifier.__class__, report))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predicted))

    return [classifier]
class TpotEstimator(BaseEstimator):
    """Estimator wrapper that ordinal-encodes selected columns and delegates to TPOT.

    A ``TPOTRegressor`` is created when ``task == 'regression'``, otherwise a
    ``TPOTClassifier``; ``kwargs`` are forwarded to the TPOT constructor.
    """

    def __init__(self, task, **kwargs):
        super(TpotEstimator, self).__init__(task)
        if task == 'regression':
            self.tpot = TPOTRegressor(**kwargs)
        else:
            self.tpot = TPOTClassifier(**kwargs)
        self.name = 'tpot'
        self.label_encoder = None
        self.obj_cols = None

    def _encode(self, X):
        """Return a copy of ``X`` with ``obj_cols`` replaced by their encoded values.

        Working on a copy keeps the caller's frame untouched, so the same raw
        frame can safely be passed to ``predict`` and ``predict_proba``.
        Assumes ``X`` is a pandas DataFrame - TODO confirm.
        """
        encoded = X.copy()
        encoded[self.obj_cols] = self.label_encoder.transform(encoded[self.obj_cols])
        return encoded

    def train(self, X, y, X_test):
        """Fit the encoder on the columns chosen by ``column_object_category_bool`` and fit TPOT.

        ``X_test`` is accepted for interface compatibility and is not used.
        """
        self.obj_cols = column_object_category_bool(X)
        self.label_encoder = SafeOrdinalEncoder()
        # Encode a copy rather than overwriting the caller's columns in place.
        X = X.copy()
        X[self.obj_cols] = self.label_encoder.fit_transform(X[self.obj_cols])
        self.tpot.fit(X, y)

    def predict_proba(self, X):
        """Return TPOT's class probabilities for ``X`` and print their shape."""
        proba = self.tpot.predict_proba(self._encode(X))
        print(f'proba.shape:{proba.shape}')
        return proba

    def predict(self, X):
        """Return TPOT's predictions for ``X``."""
        return self.tpot.predict(self._encode(X))
def cli(erv_data):
    """Train TPOT on an ERV expression CSV and save a confusion-matrix report.

    The ``class`` column is converted to category codes, column 1 of the
    value matrix is used as the target and columns 2 onward as features.
    The best pipeline is exported to ``tpot_exported_pipeline.py`` and the
    confusion matrix is saved as HTML under the name ``report``.
    """
    df = pd.read_csv(erv_data)

    # Remember code -> class name so predictions can be reported by name.
    class_column = df['class'].astype("category")
    class_codes = dict(enumerate(class_column.cat.categories))
    df["class"] = class_column.cat.codes

    values = df.values
    splits = train_test_split(values[:, 2:], values[:, 1],
                              train_size=0.75, test_size=0.25)
    # Everything is cast to float before being handed to TPOT.
    X_train, X_test, y_train, y_test = (part.astype(float) for part in splits)

    pipeline_optimizer = TPOTClassifier(cv=2, verbosity=2, n_jobs=-1)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    print(f"Validation Accuracy: {pipeline_optimizer.score(X_test, y_test)}")

    actual_names = [class_codes[code] for code in y_test]
    # Each test row is predicted on its own, reshaped to a single-sample matrix.
    predicted_codes = []
    for row in X_test:
        predicted_codes.append(pipeline_optimizer.predict(row.reshape(1, -1))[0])
    predicted_names = [class_codes[code] for code in predicted_codes]

    cm = ConfusionMatrix(actual_names, predicted_names)
    cm.save_html("report")
def TPOT(df, task, timelife):
    """Fit a one-minute TPOT model on ``df`` and return hold-out metrics.

    Parameters
    ----------
    df
        Input data; copied, passed through ``fill_and_to_category`` and split
        into features and target by ``return_X_y``.
    task
        ``'classification'`` selects ``TPOTClassifier``; anything else
        selects ``TPOTRegressor``.
    timelife
        Forwarded to TPOT as ``generations``.

    Returns
    -------
    tuple
        ``(accuracy, f1, pipelines)`` for classification (weighted F1 when
        there are more than two classes) or ``(rmse, r2, pipelines)`` for
        regression, where ``pipelines`` is the result of ``get_stat(model)``.
    """
    df_new = copy.copy(df)
    # Silences pandas' chained-assignment warning process-wide.
    pd.options.mode.chained_assignment = None
    df_new = fill_and_to_category(df_new)
    X, y, _ = return_X_y(df_new)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1)

    tpot_kwargs = dict(generations=timelife, cv=5, max_time_mins=1,
                       random_state=1, verbosity=2, n_jobs=1)

    if task == 'classification':
        model = TPOTClassifier(**tpot_kwargs)
        model.fit(X_train, y_train)

        # Fit the encoder once on every known label and reuse it for both
        # vectors.  Fitting it separately on the truth and on the predictions
        # gives inconsistent codes whenever a class is missing from one of them.
        le = LabelEncoder().fit(y)
        y_test = le.transform(y_test)
        y_pred = le.transform(model.predict(X_test))
        pipelines = get_stat(model)

        average = 'weighted' if len(le.classes_) > 2 else 'binary'
        return (accuracy_score(y_test, y_pred),
                f1_score(y_test, y_pred, average=average),
                pipelines)

    model = TPOTRegressor(**tpot_kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pipelines = get_stat(model)
    return (np.sqrt(mean_squared_error(y_test, y_pred)),
            r2_score(y_test, y_pred),
            pipelines)
def clfWithTpot(X, y):
    """Fit a 10-generation TPOT search on a 90/10 split and print its results.

    Prints the hold-out score and the confusion matrix, and exports the best
    pipeline to ``exported_pipeline.py``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)

    # TPOT is fed plain numpy arrays.
    train_features, train_labels = np.array(X_train), np.array(y_train)
    test_features, test_labels = np.array(X_test), np.array(y_test)

    my_tpot = TPOTClassifier(generations=10, verbosity=2)
    my_tpot.fit(train_features, train_labels)

    holdout_score = my_tpot.score(test_features, test_labels)
    print(holdout_score)
    my_tpot.export('exported_pipeline.py')

    predictions = my_tpot.predict(test_features)
    print(confusion_matrix(y_test, predictions))
def test_predict_2():
    """Check that predict returns a 1-D array with one entry per testing row."""
    tpot_obj = TPOTClassifier()

    # Install a hand-built decision-tree pipeline instead of running a search.
    individual = creator.Individual.from_string(
        'DecisionTreeClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=individual)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict(testing_features)

    expected_shape = (testing_features.shape[0],)
    assert result.shape == expected_shape
def run_tpot(X, y, target_ft, time_budget=30, include_preprocessors=None):
    """Run a time-limited TPOT search and return its metrics as a string.

    ``time_budget`` is divided by 60 to obtain TPOT's ``max_time_mins``.
    ``target_ft`` and ``include_preprocessors`` are not used by this function.

    Returns the ``str()`` of a two-element list with the accuracy and the
    macro-averaged F1 score on the hold-out split.
    """
    splits = sklearn.model_selection.train_test_split(X, y, random_state=1)
    X_train, X_test, y_train, y_test = splits

    budget_minutes = time_budget / 60
    pipeline_optimizer = TPOTClassifier(generations=None, max_time_mins=budget_minutes)
    pipeline_optimizer.fit(X_train, y_train)
    y_hat = pipeline_optimizer.predict(X_test)

    accuracy = sklearn.metrics.accuracy_score(y_test, y_hat)
    macro_f1 = sklearn.metrics.f1_score(y_test, y_hat, average='macro')
    metrs = [
        "Accuracy score - " + str(accuracy),
        "F1 score - " + str(macro_f1),
    ]
    return str(metrs)
def test_predict_2():
    """Verify that TPOT's predict output has shape (num_testing_rows,)."""
    tpot_obj = TPOTClassifier()

    # Bypass the search: compile and fit a fixed decision-tree pipeline.
    pipeline_string = 'DecisionTreeClassifier(input_matrix)'
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        pipeline_string, tpot_obj._pset)
    compiled = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline = compiled
    compiled.fit(training_features, training_classes)

    result = tpot_obj.predict(testing_features)

    num_rows = testing_features.shape[0]
    assert result.shape == (num_rows, )
def TPOT_Classifier():
    """Run a 390-minute TPOT search on the module-level split and return its score.

    Reads ``x_train``, ``y_train``, ``x_test`` and ``y_test`` from the
    enclosing scope, exports the best pipeline to
    ``tpot_assignment_pipeline.py`` and prints the score, the true labels and
    the predictions.
    """
    search_settings = {
        'verbosity': 2,
        'max_time_mins': 390,
        'population_size': 40,
    }
    tpot = TPOTClassifier(**search_settings)
    tpot.fit(x_train, y_train)
    tpot.export('tpot_assignment_pipeline.py')

    TPOT_predict = tpot.predict(x_test)
    score = tpot.score(x_test, y_test)

    for item in (score, y_test, TPOT_predict):
        print(item)
    return score
def test_predict_2():
    """Check that predict yields one prediction per testing row for a tuned tree."""
    tpot_obj = TPOTClassifier()

    # A decision tree with explicit hyper-parameters, in TPOT's string syntax.
    pipeline_string = (
        'DecisionTreeClassifier(input_matrix, '
        'DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5)'
    )
    individual = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._optimized_pipeline = individual
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=individual)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    result = tpot_obj.predict(testing_features)

    expected_shape = (testing_features.shape[0],)
    assert result.shape == expected_shape
class TPOTClassifierModel(Model):
    """Model wrapper around a ``TPOTClassifier``.

    ``save`` pickles this wrapper with the fitted scikit-learn pipeline in
    place of the TPOT object (the error text in ``fit`` states that TPOT
    itself cannot be pickled), so an instance restored through ``load`` holds
    a plain pipeline in ``_model``.
    """

    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit whatever ``_model`` currently is and refresh ``classes_``.

        Works for both the TPOT search object and the bare pipeline held by a
        loaded instance; the latter has no ``fitted_pipeline_`` attribute, so
        its own ``classes_`` is used.
        """
        self._model.fit(X, y)
        # TODO: This is required by DESlib which is kind of annoying.
        # They call check_if_fitted(model, 'classes_'), meaning these have
        # to act more like general sklearn models
        #
        # If using more classifiers, the creation of a ClassifierModel
        # base class is probably required to ensure consistency
        fitted = getattr(self._model, 'fitted_pipeline_', self._model)
        self.classes_ = fitted.classes_

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Run the TPOT search.

        Raises
        ------
        RuntimeError
            If ``_model`` is no longer a ``TPOTClassifier`` (i.e. the instance
            was saved); use ``_force_fit`` to refit the stored pipeline.
        """
        if not isinstance(self._model, TPOTClassifier):
            raise RuntimeError(
                'Due to TPOT being unpickelable, saving this'
                + ' means only the actual sklearn.Pipeline'
                + ' was saved. Calling fit will fit this pipeline'
                + ' rather than the TPOT algorithm. If this is'
                + ' desired behaviour, please use `_force_fit`'
                + ' instead')
        self._force_fit(X, y)

    def save(self, path: str) -> None:
        """Pickle this wrapper to ``path``.

        Side effect: ``_model`` is replaced by the fitted pipeline on this
        instance as well, not only in the pickled copy.
        """
        if isinstance(self._model, TPOTClassifier):
            self._model = self._model.fitted_pipeline_

        with open(path, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, path: str):
        """Unpickle a wrapper from ``path``.

        NOTE: ``pickle.load`` can execute arbitrary code; only load files
        from trusted sources.
        """
        with open(path, 'rb') as file:
            model = pickle.load(file)
        return cast(TPOTClassifierModel, model)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped model's predictions."""
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped model's class probabilities."""
        return self._model.predict_proba(X)
def Classifier(x, y):
    """Fit a 10-minute TPOT search on ``(x, y)`` and return its hold-out score.

    The hold-out data ``x_test`` / ``y_test`` is read from the enclosing
    scope.  The best pipeline is exported to ``tpot_pipeline.py``.
    """
    x_train, y_train = x, y

    tpot = TPOTClassifier(population_size=50, max_time_mins=10, verbosity=2)
    tpot.fit(x_train, y_train)
    tpot.export('tpot_pipeline.py')

    # Predictions are computed but only the score is returned.
    TPOT_predict = tpot.predict(x_test)
    return tpot.score(x_test, y_test)
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit TPOT with default settings, record the elapsed time and write a submission.

    Writes three files: the exported pipeline
    (``tpot_pipeline_dont_overfit.py``), the value returned by
    ``timer(start_time)`` (``<name_dataset>_tpot``) and the predictions
    (``<name_dataset>_tpot_submission.csv`` with ``id`` / ``target`` columns).
    """
    tp = TPOTClassifier(verbosity=3)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    # Named ``elapsed`` so the ``time`` module name is not shadowed locally.
    elapsed = timer(start_time)

    preds = tp.predict(X_test)

    # The context manager closes the file even if the write fails.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv', index=False)
class TPOTBaselineModel(Model):
    """Baseline model backed by a ``TPOTClassifier``.

    Saving swaps the TPOT object for its fitted scikit-learn pipeline before
    pickling; afterwards ``fit`` refuses to run and ``_force_fit`` has to be
    used to refit that pipeline.
    """

    def __init__(self, name: str, model_params: Dict[str, Any]) -> None:
        super().__init__(name, model_params)
        self._model = TPOTClassifier(**model_params)

    def _force_fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit whatever is currently stored in ``_model``."""
        self._model.fit(X, y)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Run the TPOT search; raise ``RuntimeError`` once the model was saved."""
        if isinstance(self._model, TPOTClassifier):
            self._force_fit(X, y)
            return

        message = (
            'Due to TPOT being unpickelable, saving this'
            ' means only the actual sklearn.Pipeline'
            ' was saved. Calling fit will fit this pipeline'
            ' rather than the TPOT algorithm. If this is'
            ' desired behaviour, please use `_force_fit`'
            ' instead'
        )
        raise RuntimeError(message)

    def save(self, path: str) -> None:
        """Pickle this wrapper to ``path``, keeping only the fitted pipeline."""
        model_is_tpot = isinstance(self._model, TPOTClassifier)
        if model_is_tpot:
            # Replaces the TPOT object on this instance, not just in the file.
            self._model = self._model.fitted_pipeline_

        with open(path, 'wb') as handle:
            pickle.dump(self, handle)

    @classmethod
    def load(cls, path: str):
        """Unpickle a wrapper from ``path`` (only use with trusted files)."""
        with open(path, 'rb') as handle:
            restored = pickle.load(handle)
        return cast(TPOTBaselineModel, restored)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped model's predictions."""
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped model's class probabilities."""
        # TODO: May cause issues if SVG or SVM model is best
        return self._model.predict_proba(X)
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a small seeded TPOT search, record the elapsed time and write a submission.

    Writes the exported pipeline to ``tpot_pipeline_<name_dataset>.py``, the
    value returned by ``timer(start_time)`` to
    ``time_files/<name_dataset>_tpot`` and the predictions to
    ``submit_files/<name_dataset>_tpot_submission.csv``.  This function does
    not create the ``time_files`` / ``submit_files`` directories.
    """
    tp = TPOTClassifier(generations=5, population_size=20, random_state=42, verbosity=2)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_' + name_dataset + '.py')
    # Named ``elapsed`` so the ``time`` module name is not shadowed locally.
    elapsed = timer(start_time)

    preds = tp.predict(X_test)

    # The context manager closes the file even if the write fails.
    with open("time_files/" + name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv("submit_files/" + name_dataset + '_' + 'tpot_submission' + '.csv',
                      index=False)
def process_tpot(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Function that trains and tests data using tpot

    A ``SIGALRM`` watchdog of ``TIME_PER_TASK + GRACE_PERIOD`` seconds guards
    the fit; when it fires, ``SystemExit`` is raised from the signal handler.

    Parameters
    ----------
    X_train, X_test, y_train
        Objects exposing ``.values`` (passed to TPOT as arrays).
    df_types
        Not used by this function.
    m_type
        ``'classification'`` selects ``TPOTClassifier`` (weighted F1);
        anything else selects ``TPOTRegressor`` (negative MSE).
    seed
        Random state forwarded to TPOT.

    Returns
    -------
    Class probabilities for classification, predictions otherwise.
    """
    from tpot import TPOTClassifier
    from tpot import TPOTRegressor
    from ..config import classifier_config_dict

    # Register Timer
    def handler(signum, frame):
        raise SystemExit('Time limit exceeded, sending system exit...')

    signal.signal(signal.SIGALRM, handler)

    # default cv is 5
    if m_type == 'classification':
        automl = TPOTClassifier(generations=100,
                                population_size=100,
                                config_dict=classifier_config_dict,
                                verbosity=3,
                                max_time_mins=int(10800/60),
                                scoring='f1_weighted',
                                n_jobs=N_CORES,
                                random_state=seed)
    else:
        automl = TPOTRegressor(generations=100,
                               population_size=100,
                               verbosity=3,
                               max_time_mins=int(10800/60),
                               scoring='neg_mean_squared_error',
                               n_jobs=N_CORES,
                               random_state=seed)

    # Set timer
    # for long running processes TPOT sometimes does not end even with generations
    signal.alarm(TIME_PER_TASK+GRACE_PERIOD)
    try:
        automl.fit(X_train.values, y_train.values)
    finally:
        # Always cancel the pending alarm, so a failure inside fit cannot
        # leave a watchdog armed that later kills unrelated code.
        signal.alarm(0)

    return (automl.predict_proba(X_test.values) if m_type == 'classification' else
            automl.predict(X_test.values))
def main():
    """Benchmark TPOT on a list of CSV datasets over repeated random splits.

    For every split and dataset a 75/25 train/test split is drawn (with a
    seed that increases by one per run), a TPOT classifier is fitted and the
    error rate, macro F1 and fit time are accumulated.  The per-dataset
    averages over all splits are written to ``tpot-results2.csv``.
    """
    # set up the path to the data sets and the data were are going to experiment
    # with
    base_path = '/scratch/ditzler/Git/ClassificationDatasets/csv/'
    data_setz = [
        # 'bank',
        'blood',
        'breast-cancer-wisc-diag',
        'breast-cancer-wisc-prog',
        'breast-cancer-wisc',
        'breast-cancer',
        'congressional-voting',
        'conn-bench-sonar-mines-rocks',
        'credit-approval',
        'cylinder-bands',
        'echocardiogram',
        # 'fertility',
        'haberman-survival',
        'heart-hungarian',
        'hepatitis',
        'ionosphere',
        'mammographic',
        'molec-biol-promoter',
        'musk-1',
        'oocytes_merluccius_nucleus_4d',
        'oocytes_trisopterus_nucleus_2f',
        'ozone',
        'parkinsons',
        'pima',
        # 'pittsburg-bridges-T-OR-D';
        'planning',
        'ringnorm',
        # 'spambase',
        'spectf_train',
        'statlog-australian-credit',
        'statlog-german-credit',
        'statlog-heart',
        'titanic',
        # 'twonorm',
        'vertebral-column-2clases']

    # n_splitz is the number of repeated random splits; the arrays below
    # accumulate one value per dataset across those splits.
    n_splitz = 10
    errors = np.zeros((len(data_setz),))
    fms = np.zeros((len(data_setz),))
    times = np.zeros((len(data_setz),))

    m = 0  # split seed, incremented after every dataset run
    for n in range(n_splitz):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Spilt ' + str(n) + ' of ' + str(n_splitz))
        for i, data_set in enumerate(data_setz):
            print(' ' + data_set)
            df = pd.read_csv(base_path + data_set + '.csv', sep=',')
            # ``.values`` replaces ``as_matrix()``, which newer pandas removed.
            data = df.values
            # The last column is the label, everything before it the features.
            X = data[:, :-1]
            y = data[:, -1]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, train_size=0.75, test_size=0.25, random_state=m)
            m += 1

            ts = time.time()
            tpot = TPOTClassifier(generations=10, population_size=25, verbosity=1)
            tpot.fit(X_train, y_train)
            times[i] += (time.time() - ts)

            errors[i] += (1-tpot.score(X_test, y_test))
            yhat = tpot.predict(X_test)
            fms[i] += f1_score(y_test, yhat, average='macro')

    # Turn the accumulated sums into averages over the splits.
    errors /= n_splitz
    fms /= n_splitz
    times /= n_splitz

    df = pd.DataFrame({'errors': errors, 'fms': fms, 'times': times})
    df.to_csv(path_or_buf='tpot-results2.csv', sep=',')
    return None
# Probabilities for all classes are computed once and reused further down.
probab_all = tpot.predict_proba(x_v)
# Column 1 is used as the score for the AUC computation.
probab = probab_all[:, 1]
print('AUC Score is {}'.format(roc_auc_score(y_valid, probab)))
t2 = time.time()
print('Total time taken by TPOT:', int(t2-t1))

# Index features, truth, predictions and probabilities by agreement id so
# that rows belonging to the same agreement can be aggregated.
agreement_ids = X_valid['AGREEMENTID']
check_x = x_v.set_index(agreement_ids)
check_y = pd.DataFrame(y_valid).set_index(agreement_ids)
check_pred = pd.DataFrame(tpot.predict(x_v)).set_index(agreement_ids)
check_probab = pd.DataFrame(probab_all).set_index(agreement_ids)

# Per agreement: most frequent true label and most frequent predicted label.
new_y = check_y.reset_index().groupby(['AGREEMENTID'])['FORECLOSURE'].agg(lambda x: stats.mode(x)[0][0])
new_pred = check_pred.reset_index().groupby(['AGREEMENTID'])[0].agg(lambda x: stats.mode(x)[0][0])
# Per agreement: mean probability of column 1.  ``.mean().to_frame('probab')``
# yields the same one-column frame the dict-renaming ``agg`` used to return;
# that form is rejected by pandas >= 1.0.
new_probab = check_probab.reset_index().groupby(['AGREEMENTID'])[1].mean().to_frame('probab')

print('new_accuracy is {}'.format(np.mean(new_pred==new_y)))
print('new roc auc is {}'.format(roc_auc_score(new_y,new_probab)))
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tpot import TPOTClassifier

# Load the raw training and test sets; the literal 'na' marks missing values.
X_train = read_csv('input/aps_failure_training_set.csv', na_values='na')
X_test = read_csv('input/aps_failure_test_set.csv', na_values='na')

# deal with missing values and constant features and normalize
X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test)
print(f'Data loaded: {len(X_train)} training observations, {len(X_test)} testing observations')

X_train, y_train = balance_data(X_train, y_train, n_samples = 2500)
print(f'Balanced training data ({2500/1000}/1): {len(X_train)} training observations, {len(X_test)} testing observations')


def scania_scorer(y_true, y_pred):
    """Return the misclassification cost: 10 per false positive, 500 per false negative."""
    _, false_pos, false_neg, _ = confusion_matrix(y_true, y_pred).ravel()
    return 10*false_pos + 500*false_neg


# The cost is something to minimise, hence greater_is_better=False.
custom_scania_scorer = make_scorer(scania_scorer, greater_is_better=False)

tpot = TPOTClassifier(
    scoring=custom_scania_scorer,
    generations=100,
    population_size=100,
    early_stop=10,
    use_dask=True,
    n_jobs=-1,
    memory='auto',
    random_state=42,
    verbosity=3,
)
tpot.fit(X_train, y_train)

y_pred = tpot.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Total cost: " + str(scania_scorer(y_test, y_pred)))
print(tpot.score(X_test, y_test))
tpot.export('tpot_scania_pipeline.py')
# # Alternative without one-hot encoding
# train_data['Sex']=train_data['Sex'].replace(['male','female'],[0,1])
# test_data['Sex']=test_data['Sex'].replace(['male','female'],[0,1])
# train_data['Embarked']=train_data['Embarked'].replace(['S','Q','C'],[0,1,2])
# test_data['Embarked']=test_data['Embarked'].replace(['S','Q','C'],[0,1,2])

# Feature columns used by the model
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Training and test data
train_features = train_data[features]
train_y = train_data['Survived']
test_features = test_data[features]

# One-hot encode the features.  The full spelling orient='records' is used
# because the abbreviated 'record' is no longer accepted by current pandas.
dv = DictVectorizer(sparse=False)
train_dv = dv.fit_transform(train_features.to_dict(orient='records'))
test_dv = dv.transform(test_features.to_dict(orient='records'))

# Train the model
TP = TPOTClassifier()
TP.fit(train_dv, train_y)
pred = TP.predict(test_dv)
# (Per-passenger results can be inspected by zipping test_data['Name'] with pred.)

# Accuracy measured on the training set.  The variable name presumably dates
# from an earlier decision-tree version of this script.
acc_decision_tree = round(TP.score(train_dv, train_y), 6)
print(u'score准确率为 %.4lf' % acc_decision_tree)
START_EXPERIMENT = time.time() automl = TPOTClassifier( max_time_mins=(TIME_LIMIT // 60), scoring='roc_auc', verbosity=1, random_state=RANDOM_SEED, ) automl.fit( X_train, y_train, ) try: predictions = automl.predict_proba(X_test) except RuntimeError: predictions = automl.predict(X_test) y_test_predict_proba = predictions[:, 1] y_test_predict = automl.predict(X_test) print('AUC: ', roc_auc_score(y_test, y_test_predict_proba)) END_EXPERIMENT = time.time() #preds = pd.DataFrame(predictions) #preds['Y'] = y_test.reset_index(drop=True) #preds.to_csv(f'./result/predicts/{DATASET_NAME}_{MODEL_NAME}_predict_proba_exp_{EXPERIMENT}.csv', index=False,) metrics.append({ 'AUC': round(roc_auc_score(y_test, y_test_predict_proba), 4), 'log_loss':
def _report_missing(frame, banner):
    """Print ``banner`` followed by the NaN count of each raw input column."""
    print(banner)
    columns = (
        ('Fare', 'Fare'),
        ('Class', 'Pclass'),
        ('Age', 'Age'),
        ('Sibling', 'SibSp'),
        ('Parent/Child', 'Parch'),
        ('Sex', 'Sex'),
        ('Embarked', 'Embarked'),
    )
    for label, column in columns:
        print('{} NaNs: {}'.format(label, int(frame[column].isnull().sum())))
    print('*******************************************')


def _fill_from_train_group_means(train, test, column, groupings):
    """Return ``test[column]`` with its NaNs filled from ``train`` means.

    ``groupings`` is a sequence of key-column lists, tried from the most to
    the least specific.  For each one, the mean of ``train[column]`` per
    group is looked up for every test row and used only where the test value
    is still missing.  Whatever remains missing afterwards (a group that
    never occurs in ``train``) falls back to the overall train mean.
    Values already present in ``test`` are never modified.
    """
    filled = test[column].copy()
    for keys in groupings:
        if not filled.isnull().any():
            break
        group_means = train.groupby(keys)[column].mean().reset_index(
            name='_group_mean')
        matched = test[keys].merge(group_means, on=keys,
                                   how='left')['_group_mean']
        # A left merge against unique group keys keeps one row per test row,
        # in test order; restore the test index so fillna aligns row by row.
        matched.index = test.index
        filled = filled.fillna(matched)
    return filled.fillna(train[column].mean())


def main():
    """Clean the Titanic data, search a TPOT pipeline and write predictions.

    Reads ``../input/train.csv`` and ``../input/test.csv``, imputes missing
    values, encodes the categorical columns, fits a ``TPOTClassifier`` on a
    75/25 split of the training data, exports the best pipeline to
    ``Titanic_TPOT_Classifier.py`` and writes the test-set predictions to
    ``Titanic_Results.csv``.  The run is interactive: it pauses for Enter
    after several of the plots and printouts.
    """
    # On Python 2 the built-in input() evaluates what is typed, so a bare
    # Enter raises; raw_input() is the plain line reader there.  On Python 3
    # raw_input does not exist and input() already reads a plain line.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    fine_groups = ['Pclass', 'Sex', 'SibSp', 'Parch']
    coarse_groups = ['Pclass', 'Sex']

    # Import Data
    train = pd.read_csv(
        "../input/train.csv",
        dtype={"Age": np.float64},
    )
    test = pd.read_csv(
        "../input/test.csv",
        dtype={"Age": np.float64},
    )

    # Check to see if there are any null training set values
    _report_missing(train, '********Checking NaNs for Train Data********')

    pt.plot_distribution(train, var='Age', target='Survived', row='Sex')
    pt.plot_distribution(train, var='Fare', target='Survived', row='Sex')
    read_line()

    # Replacing on fine grouping
    print('Replacing Age NaNs with categorical means for Class, Sex, '
          'Siblings, Parent/Child')
    train['Age'] = train.groupby(fine_groups)['Age'].transform(
        lambda x: x.fillna(x.mean()))
    print('Checking Age NaNs after first replacement: {}'.format(
        int(train['Age'].isnull().sum())))

    # Replacing on less granular grouping
    train['Age'] = train.groupby(coarse_groups)['Age'].transform(
        lambda x: x.fillna(x.mean()))
    print('Checking Age NaNs after replacement: {}'.format(
        int(train['Age'].isnull().sum())))
    train['Age'] = train['Age'].astype(int)

    train['Embarked'] = train['Embarked'].fillna('S')
    print('Checking Embarked NaNs after replacement: {}'.format(
        int(train['Embarked'].isnull().sum())))

    # NOTE(review): both histograms are drawn after imputation, so the
    # "Before Replacement" plot shows the same data as the "After" one.
    seaborn_hist(train, 'Age', 'Age Distributions Before Replacement')
    seaborn_hist(train, 'Age', 'Age Distributions After Replacement')

    # --- Use train average information to replace NaNs in test set ---
    # Check to see if there are any null testing set values
    _report_missing(test, '********Checking NaNs for Test Data********')

    # Only the missing test values are filled, using the train group means
    # looked up by each test row's own group.
    print('Replacing Age NaNs with categorical means for Class, Sex, '
          'Siblings, Parent/Child')
    test['Age'] = _fill_from_train_group_means(
        train, test, 'Age', [fine_groups, coarse_groups])
    print('Checking Age NaNs after first replacement: {}'.format(
        int(test['Age'].isnull().sum())))
    test['Age'] = test['Age'].astype(int)

    print('Replacing Fare NaNs with categorical means for Class, Sex, '
          'Siblings, Parent/Child')
    test['Fare'] = _fill_from_train_group_means(
        train, test, 'Fare', [fine_groups, coarse_groups])
    print('Checking Fare NaNs after first replacement: {}'.format(
        int(test['Fare'].isnull().sum())))

    # Create a Family Size column
    train['Family_Size'] = train['SibSp'] + train['Parch']

    # Creating Titles column in DataFrame
    titles = sorted(set(train['Name'].map(get_title)))
    print('List of titles in data')
    print('{} : {}'.format(len(titles), titles))
    train['Title'] = train['Name'].map(get_title)
    train['Title'] = train.apply(replace_titles, axis=1)
    print('*******************************************')

    # Determine the number of cabins reserved per person
    # print train['Deck'].unique()
    # raw_input('press enter...')
    # train['Cabin_Length'] = train['Cabin'].str.split(' ').str.len()
    # train['Cabin_Length'].fillna(0,inplace=True)

    column_vals = [
        'Sex', 'Fare', 'Age', 'Pclass', 'Family_Size', 'Title', 'Embarked'
    ]
    mean_analysis(train, column_vals)
    print('*******************************************')

    # --- Creating deck from cabin, age label bands, fare label bands,
    # --- titles from names, and applying LabelEncoder to categorical
    # --- variables
    # Convert Categorical Variables to Numerical
    le_age = LabelEncoder()
    le_fare = LabelEncoder()
    le_title = LabelEncoder()
    le_embarked = LabelEncoder()
    le_deck = LabelEncoder()
    le_sex = LabelEncoder()
    le_fam = LabelEncoder()

    # Deck is the first character of the cabin; 'Z' stands for "no cabin".
    train['Deck'] = train['Cabin'].str[0].fillna('Z')

    age_labels = [
        'Band_1', 'Band_2', 'Band_3', 'Band_4', 'Band_5', 'Band_6', 'Band_7',
        'Band_8', 'Band_9', 'Band_10'
    ]
    train['AgeBand'] = pd.cut(train['Age'], bins=10, labels=age_labels)
    train['AgeBand'] = le_age.fit_transform(train['AgeBand'])

    fare_labels = [
        'Band_1', 'Band_2', 'Band_3', 'Band_4', 'Band_5', 'Band_6', 'Band_7',
        'Band_8', 'Band_9', 'Band_10'
    ]
    train['FareBand'] = pd.cut(train['Fare'], bins=10, labels=fare_labels)
    train['FareBand'] = le_fare.fit_transform(train['FareBand'])

    fam_size_labels = ['Band_1', 'Band_2', 'Band_3']
    train['FamilySizeBand'] = pd.cut(train['Family_Size'],
                                     bins=3,
                                     labels=fam_size_labels)
    train['FamilySizeBand'] = le_fam.fit_transform(train['FamilySizeBand'])

    train['Title'] = le_title.fit_transform(train['Title'])
    train['Embarked'] = le_embarked.fit_transform(train['Embarked'])
    train['Deck'] = le_deck.fit_transform(train['Deck'])
    train['Sex'] = le_sex.fit_transform(train['Sex'])

    train.drop([
        'PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'FareBand',
        'Cabin', 'Family_Size'
    ],
               inplace=True,
               axis=1)
    train.drop(['AgeBand', 'Deck', 'Title'], inplace=True, axis=1)

    # survived = train['Survived']
    # train.drop(['Survived'],inplace=True,axis=1)
    # scaler = preprocessing.StandardScaler().fit(train)
    # train = pd.DataFrame(scaler.transform(train))
    # train['Survived'] = survived
    # del survived

    print(train.head())
    read_line()

    colormap = plt.cm.viridis
    plt.figure(figsize=(12, 12))
    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    sns.heatmap(train.corr(),
                linewidths=0.1,
                vmax=1.0,
                square=True,
                cmap=colormap,
                linecolor='white',
                annot=True)
    # pt.plot_correlation_map(train)
    plt.show(block=False)
    print(train.corr()['Survived'])
    read_line()

    # --- Cleaning Test Set ---
    test['Deck'] = test['Cabin'].str[0].fillna('Z')
    test['Family_Size'] = test['SibSp'] + test['Parch']

    # NOTE(review): pd.cut derives the bin edges from the test column's own
    # range here, so a band label may cover a different interval than it
    # does in train -- confirm whether the train edges should be reused.
    test['AgeBand'] = pd.cut(test['Age'], bins=10, labels=age_labels)
    test['AgeBand'] = le_age.transform(test['AgeBand'])
    test['FareBand'] = pd.cut(test['Fare'], bins=10, labels=fare_labels)
    test['FareBand'] = le_fare.transform(test['FareBand'])
    test['FamilySizeBand'] = pd.cut(test['Family_Size'],
                                    bins=3,
                                    labels=fam_size_labels)
    test['FamilySizeBand'] = le_fam.transform(test['FamilySizeBand'])

    test['Title'] = test['Name'].map(get_title)
    test['Title'] = test.apply(replace_titles, axis=1)
    test['Title'] = le_title.transform(test['Title'])
    test['Embarked'] = le_embarked.transform(test['Embarked'])
    test['Deck'] = le_deck.transform(test['Deck'])
    test['Sex'] = le_sex.transform(test['Sex'])

    # Keep the passenger ids before they are dropped from the feature frame.
    results = pd.DataFrame(columns=['PassengerId', 'Survived'])
    results['PassengerId'] = test['PassengerId']

    test.drop([
        'PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'FareBand',
        'Cabin', 'Family_Size'
    ],
              inplace=True,
              axis=1)
    test.drop(['AgeBand', 'Deck', 'Title'], inplace=True, axis=1)
    # test = pd.DataFrame(scaler.transform(test))

    # --- Model Training ---
    train.rename(columns={'Survived': 'class'}, inplace=True)
    X_train = train.drop(['class'], axis=1)
    y_train = train['class']
    print(y_train.head())
    read_line()

    # Cross Validation
    train_data, test_data, train_target, test_target = train_test_split(
        X_train, y_train, test_size=0.25, random_state=0)

    pipeline_optimizer = TPOTClassifier(generations=10,
                                        population_size=25,
                                        random_state=42,
                                        cv=5,
                                        verbosity=2,
                                        n_jobs=3,
                                        scoring='f1')
    pipeline_optimizer.fit(train_data, train_target)
    print(pipeline_optimizer.score(test_data, test_target))
    pipeline_optimizer.export('Titanic_TPOT_Classifier.py')
    results['Survived'] = pipeline_optimizer.predict(test)

    # Earlier experiments, kept disabled for reference.
    # Trying a bunch of different classifiers
    # classifiers = [LinearSVC(), KNeighborsClassifier(), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), MLPClassifier(alpha=1), GaussianNB(), SVC(gamma=2, C=1), AdaBoostClassifier(), XGBClassifier()]
    # print 'Classifier score:', clf.score(test_data,test_target.values.ravel())
    # # """ Model Analysis """
    # # # cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    # # # title = "Learning Curves (XGBoost)"
    # # # plot_learning_curve(clf, title, X_train, y_train.values, ylim=(0.7, 1.01), cv=cv, n_jobs=4)
    # # # y_pred = clf.predict(X_train)
    # # # print 'done in %0.3fs' % (time() - t0)
    # # # print confusion_matrix(y_train,y_pred)
    # # # print '*******************************************'
    # # # print classification_report(y_train,y_pred)
    # # # print 0.5 * (precision_recall_fscore_support(y_train,y_pred)[2][0] + precision_recall_fscore_support(y_train,y_pred)[2][1])
    # # # print '*******************************************'
    # # # print max(evaluation(classifiers,train_data,train_target,test_data,test_target)[:][1])
    # # _ , f_scores = evaluation(classifiers,train_data,train_target,test_data,test_target)
    # # print f_scores.index(max(f_scores))
    # # model = classifiers[-1]#f_scores.index(max(f_scores))]
    # # print model
    # # rfecv = RFECV( estimator = model , step = 1 , cv = StratifiedKFold( 2 ) , scoring = 'accuracy' )
    # # rfecv.fit( train_data , train_target )
    # # print rfecv.score( train_data , train_target ) , rfecv.score( test_data , test_target )
    # # print "Optimal number of features : %d" % rfecv.n_features_
    # # input()
    # # # Plot number of features VS. cross-validation scores
    # # plt.figure()
    # # plt.xlabel( "Number of features selected" )
    # # plt.ylabel( "Cross validation score (nb of correct classifications)" )
    # # plt.plot( range( 1 , len( rfecv.grid_scores_ ) + 1 ) , rfecv.grid_scores_ )
    # # plt.show(block=False)
    # # print X_train.head()
    # # X_train = pd.DataFrame(rfecv.transform(X_train))
    # # print rfecv.ranking_
    # # print X_train.head()
    # # input()
    # # # """ Model Predicting """
    # # print 'Fitting model to full training set...'
    # # model.fit(X_train,y_train)
    # # X_test = test[['Pclass','Sex','Embarked','Title','Deck','AgeBand','FareBand','FamilySizeBand']]
    # # X_test = pd.DataFrame(rfecv.transform(pd.DataFrame(scaler.transform(X_test))))
    # # results['Survived'] = model.predict(X_test)

    # --- Ensemble Stacking ---
    # results['Survived'] = ensemble_stacking(train,test)

    results.to_csv('Titanic_Results.csv', sep=',', index=False)
    plt.show(block=False)
    read_line('Press [enter] to close.')
# Hold out 20% of the samples.  The split is done on row positions so the
# same partition can be used to slice the rows of data_concat.
X_train_index, X_test_index, y_train_string, y_test_string = train_test_split(
    list(range(nImg)), image_order[:, 0], test_size=0.2)
X_train = data_concat[X_train_index, :]
X_test = data_concat[X_test_index, :]
y_train = Label(y_train_string)
y_test = Label(y_test_string)

# """ -------------- TPOT does its magic -------------------------------------
# """
from tpot import TPOTClassifier

clf = TPOTClassifier(verbosity=2, n_jobs=-1, config_dict='TPOT light')
clf.fit(X_train, y_train)

# Score and confusion matrix are each computed once and reused for both the
# printout and the stored result.
score1 = clf.score(X_test, y_test)
print(score1)
predictions = clf.predict(X_test)
confusion_matrix1 = confusion_matrix(y_test, predictions)
print(confusion_matrix1)

# One Counter feeds both lines, so the keys and values printed below are
# guaranteed to be in matching order.
test_label_counts = Counter(y_test_string)
print('Order of which element that was the most of=',
      test_label_counts.keys())
print('How many that were of each unique element',
      test_label_counts.values())
print(
    '-------------------------------------PERSON 2-------------------------------------------------------'
)
""" -------------- Data is imported, concated and normalized-------------------- """
def evaluate_tpot(dataset, task_type, run_id, time_limit, seed=1, use_fe=True):
    """Run one time-boxed TPOT search on ``dataset`` and pickle the outcome.

    Args:
        dataset: Name handed to ``load_train_test_data`` and used in the
            output file name.
        task_type: ``'cls'`` selects ``TPOTClassifier`` with balanced
            accuracy; any other value selects ``TPOTRegressor`` with negated
            mean squared error.
        run_id: Integer identifying this repetition; printed and used in the
            output file name.
        time_limit: Search budget, divided by 60 and truncated to give
            ``max_time_mins``.
        seed: ``random_state`` of the TPOT estimator.
        use_fe: When false, the mindware TPOT configuration is imported.

    Besides its arguments the function reads the module-level names ``args``,
    ``space_type``, ``max_eval_time`` and ``save_dir``.  It returns nothing;
    the result list is written to a ``.pkl`` file under ``save_dir``.
    """
    n_job = args.n_job
    is_classification = task_type == 'cls'

    # Construct the ML model.
    if not use_fe:
        from mindware.utils.tpot_config import classifier_config_dict
        # NOTE(review): ``config`` is not read anywhere below, so apart from
        # this import ``use_fe`` does not influence the search -- confirm
        # whether that is intended.
        config = classifier_config_dict

    _task_type = MULTICLASS_CLS if is_classification else REGRESSION

    # Pick the search space matching the task and the configured size.
    if is_classification:
        if space_type == 'large':
            from tpot.config.classifier import classifier_config_dict
        elif space_type == 'small':
            from tpot.config.classifier_small import classifier_config_dict
        else:
            from tpot.config.classifier_extremely_small import classifier_config_dict
        config_dict = classifier_config_dict
    else:
        if space_type == 'large':
            from tpot.config.regressor import regressor_config_dict
        elif space_type == 'small':
            from tpot.config.regressor_small import regressor_config_dict
        else:
            from tpot.config.regressor_extremely_small import regressor_config_dict
        config_dict = regressor_config_dict

    # Everything except the estimator class, the scoring metric and the
    # target dtype is shared between the two task types.
    search_settings = dict(
        config_dict=config_dict,
        generations=10000,
        population_size=20,
        verbosity=2,
        n_jobs=n_job,
        cv=0.2,
        max_eval_time_mins=max_eval_time,
        max_time_mins=int(time_limit / 60),
        random_state=seed,
    )
    if is_classification:
        automl = TPOTClassifier(scoring='balanced_accuracy', **search_settings)
        target_dtype = 'int'
    else:
        automl = TPOTRegressor(scoring='neg_mean_squared_error',
                               **search_settings)
        target_dtype = 'float64'

    raw_data, test_raw_data = load_train_test_data(dataset,
                                                   task_type=_task_type)
    X_train, y_train = raw_data.data
    X_test, y_test = test_raw_data.data
    X_train = X_train.astype('float64')
    y_train = y_train.astype(target_dtype)
    X_test = X_test.astype('float64')
    y_test = y_test.astype(target_dtype)

    start_time = time.time()
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)
    pareto_front = automl._pareto_front

    if is_classification:
        score_func = balanced_accuracy_score
    else:
        score_func = mean_squared_error

    # Validation score: the largest second weighted fitness value among the
    # entries of the Pareto front.
    valid_score = max(fitness.wvalues[1] for fitness in pareto_front.keys)
    test_score = score_func(y_test, y_hat)

    print('Run ID         : %d' % run_id)
    print('Dataset        : %s' % dataset)
    print('Val/Test score : %f - %f' % (valid_score, test_score))

    scores = automl.scores
    times = automl.times

    # The 'large' space carries no prefix in the file name.
    if space_type != 'large':
        _space_type = '%s_' % space_type
    else:
        _space_type = ''
    file_name = '%s%s_tpot_%s_false_%d_1_%d.pkl' % (
        _space_type, task_type, dataset, time_limit, run_id)
    save_path = save_dir + file_name

    with open(save_path, 'wb') as f:
        pickle.dump(
            [dataset, valid_score, test_score, times, scores, start_time], f)