def test_score():
    """Ensure that TPOT.score raises a ValueError when no optimized pipeline exists.

    The TPOT object is scored immediately after construction, without being
    fitted first. The test passes only if that call raises ValueError.

    Raises:
        AssertionError: if score() returns normally instead of raising.
    """
    tpot_obj = TPOT()

    try:
        tpot_obj.score(testing_features, testing_classes)
    except ValueError:
        pass
    else:
        # Raised explicitly (rather than `assert False` inside the try body) so
        # the failure carries a message and still fires when Python runs with
        # -O, which strips assert statements.
        raise AssertionError(
            'TPOT.score() did not raise ValueError without an optimized pipeline')
def test_score_2():
    """Check TPOT.score against a pre-computed value for a hand-built pipeline"""

    # Closeness check, after http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        gap = abs(a - b)
        largest = max(abs(a), abs(b))
        return gap <= max(rel_tol * largest, abs_tol)

    expected = 0.981993770448  # Assumes use of the TPOT balanced_accuracy function

    model = TPOT()
    model._training_classes = training_classes
    model._training_features = training_features
    model.pbar = tqdm(total=1, disable=True)

    # Reify pipeline with known score
    pipeline_str = '_logistic_regression(input_df, 1.0, 0, True)'
    model._optimized_pipeline = creator.Individual.from_string(pipeline_str, model._pset)

    # Get score from TPOT
    actual = model.score(testing_features, testing_classes)

    assert isclose(expected, actual)
def test_score_2():
    """Verify that scoring a manually fitted pipeline through TPOT yields the recorded value"""
    reference = 0.986318199045  # Assumes use of the TPOT balanced_accuracy function

    tpot_instance = TPOT()
    tpot_instance.pbar = tqdm(total=1, disable=True)

    # Reify pipeline with known score: parse it, compile it, fit it on the training data
    tpot_instance._optimized_pipeline = creator.Individual.from_string(
        'RandomForestClassifier(input_matrix)',
        tpot_instance._pset)
    tpot_instance._fitted_pipeline = tpot_instance._toolbox.compile(
        expr=tpot_instance._optimized_pipeline)
    tpot_instance._fitted_pipeline.fit(training_features, training_classes)

    # Get score from TPOT
    observed = tpot_instance.score(testing_features, testing_classes)

    # Same closeness rule as http://stackoverflow.com/questions/5595425/
    # with rel_tol=1e-09 and abs_tol=0.0, written out inline
    rel_tol = 1e-09
    abs_tol = 0.0
    gap = abs(reference - observed)
    allowed = max(rel_tol * max(abs(reference), abs(observed)), abs_tol)
    assert gap <= allowed
def test_score_2():
    """Confirm that TPOT.score reproduces a known score for a fixed pipeline"""
    tpot_under_test = TPOT()
    tpot_under_test._training_classes = training_classes
    tpot_under_test._training_features = training_features
    tpot_under_test.pbar = tqdm(total=1, disable=True)

    # Assumes use of the TPOT balanced_accuracy function
    target_score = 0.981993770448

    # Reify pipeline with known score
    tpot_under_test._optimized_pipeline = creator.Individual.from_string(
        '_logistic_regression(input_df, 1.0, 0, True)',
        tpot_under_test._pset,
    )

    # Get score from TPOT
    computed_score = tpot_under_test.score(testing_features, testing_classes)

    # Closeness rule from http://stackoverflow.com/questions/5595425/
    # (rel_tol=1e-09, abs_tol=0.0), expanded in place
    difference = abs(target_score - computed_score)
    magnitude = max(abs(target_score), abs(computed_score))
    assert difference <= max(1e-09 * magnitude, 0.0)
def test_score_2():
    """Assert that TPOT.score matches the recorded score of a fixed, pre-fitted pipeline"""

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        difference = abs(a - b)
        scale = max(abs(a), abs(b))
        return difference <= max(rel_tol * scale, abs_tol)

    scorer = TPOT()
    scorer.pbar = tqdm(total=1, disable=True)

    # Reify pipeline with known score
    pipeline_text = 'RandomForestClassifier(input_matrix)'
    scorer._optimized_pipeline = creator.Individual.from_string(pipeline_text, scorer._pset)
    scorer._fitted_pipeline = scorer._toolbox.compile(expr=scorer._optimized_pipeline)
    scorer._fitted_pipeline.fit(training_features, training_classes)

    # Assumes use of the TPOT balanced_accuracy function
    known = 0.986318199045

    # Get score from TPOT and compare
    assert isclose(known, scorer.score(testing_features, testing_classes))
# Separate the label column ('Class') from the remaining columns.
# NOTE(review): load_gametes is presumably a pandas DataFrame loaded earlier
# in this file -- confirm against the code above this fragment.
phenotype = load_gametes['Class']
individuals = load_gametes.drop('Class', axis=1)

# 75% / 25% train/test split; no random_state is passed, so the split is
# not pinned to a fixed seed here.
X_train, X_test, y_train, y_test = train_test_split(individuals, phenotype, train_size=0.75, test_size=0.25)

# Expert Knowledge Filter & MDR
tpot = TPOT(generations=200, population_size=200, verbosity=2, expert_source=load_ekf)

# The timed window (t1..t2) covers both fit() and the score()/print call.
t1 = time.time()
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
t2 = time.time()
print("Time lapsed: ", t2 - t1)

# Alternative runs, kept commented out.

# MDR Only
#tpot = TPOT(generations=500, population_size=350, verbosity=2, expert_source=None)
#t1 = time.time()
#tpot.fit(X_train, y_train)
#print(tpot.score(X_test, y_test))
#t2 = time.time()
#print("Time lapsed: ", t2 - t1)

# Random Forest
#clf = RandomForestClassifier(max_depth=5, max_features=len(X_train.columns),
#                             n_estimators=1000)
#clf.fit(X_train, y_train)
# Fit TPOT on the MAGIC Gamma Telescope CSV, score it on held-out rows,
# and export the resulting pipeline to pipeline.py.
from tpot import TPOT
from sklearn.cross_validation import train_test_split
import pandas as pd
# Was `import numpy as numpy`, which left the `np` name used below undefined
# and made the shuffle line raise NameError.
import numpy as np

telescope = pd.read_csv("MAGIC Gamma Telescope Data.csv")

# Clean the data: shuffle the rows, then renumber the index from 0
telescope_shuffle = telescope.iloc[np.random.permutation(len(telescope))]
tele = telescope_shuffle.reset_index(drop=True)

# Store classes as integers ('g' -> 0, 'h' -> 1)
tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
tele_class = tele['Class'].values

# Split the row labels 75/25, stratified on the class values.
# (The original chained assignment unpacked the same result twice; a single
# unpack is enough because only these two names are used below.)
training_indices, validation_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)

# Every column except the label, computed once and reused for fit and score
tele_features = tele.drop('Class', axis=1)

# Find best model
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(tele_features.loc[training_indices].values,
         tele.loc[training_indices, "Class"].values)

# Score the accuracy on the held-out rows; printed so the value is not
# silently discarded when this runs as a script
print(tpot.score(tele_features.loc[validation_indices].values,
                 tele.loc[validation_indices, 'Class'].values))

# Export generated code
tpot.export('pipeline.py')
data_class = data['Class'].values

# Stratified 75/25 split of the row labels into training and validation sets;
# testing_indices is kept as a second name for the validation labels
training_indices, validation_indices = train_test_split(
    data.index,
    stratify=data_class,
    train_size=0.75,
    test_size=0.25)
testing_indices = validation_indices

# Let genetic programming find the best ML model and hyperparameters.
# Verbosity 2 shows a loading bar.
tpot = TPOT(generations=5, verbosity=2)
training_inputs = data.drop('Class', axis=1).loc[training_indices].values
training_labels = data.loc[training_indices, 'Class'].values
tpot.fit(training_inputs, training_labels)

# Score the accuracy on the validation rows
validation_inputs = data.drop('Class', axis=1).loc[validation_indices].values
validation_labels = data.loc[validation_indices, 'Class'].values
tpot.score(validation_inputs, validation_labels)

# Export the generated code
tpot.export('pipeline.py')

## compute sigmoid nonlinearity
## normalizes numbers given
#def sigmoid(x):
#    output = 1/(1+np.exp(-x))
#    return output
#
## convert output of sigmoid function to its derivative
## used to calculate training later on
#def sigmoid_output_to_derivative(output):
#    return output*(1-output)
from tpot import TPOT
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

# Load the iris data and hold out a quarter of the samples for scoring
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    train_size=0.75,
    test_size=0.25,
)

# Fit TPOT on the training portion
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(X_train, y_train)

# Report the held-out score, then export to tpot_iris_pipeline.py
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)
tpot.export('tpot_iris_pipeline.py')
import ch9util
from tpot import TPOT

# Train/test partitions come from the ch9util helper
X_train, X_test, y_train, y_test = ch9util.rain_split()

tpot = TPOT(
    generations=7,
    population_size=110,
    verbosity=2,
)
tpot.fit(X_train, y_train)

# score() is handed both the training and the test partitions here
rain_score = tpot.score(X_train, y_train, X_test, y_test)
print(rain_score)
# Drop the columns listed in drop_list, then keep every 300th row of the
# first 3,000,000 rows to shrink the working set.
train = train.drop(drop_list,axis=1)
train = train[0:3000000:300]
train.info(memory_usage='deep')

# Everything except "hotel_cluster" is used as input; "hotel_cluster" is the target.
X = train.drop("hotel_cluster",axis=1).values
y = train.loc[: , "hotel_cluster"].values

# The frame is no longer needed once X and y are extracted; release it
# before the split and the fit below.
del train
import gc
gc.collect()

# 75% / 25% train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,test_size=0.25)
print("got here!")

# seems to have a problem with pop <5
# gen 1-> really means two generations!
my_tpot = TPOT(generations=20,verbosity=2,population_size=5)

# The timed window (start..end) covers fit() and export(); score() runs after it.
# NOTE(review): time.clock() was deprecated in Python 3.3 and removed in 3.8 --
# confirm which interpreter this script targets.
start = time.clock()
print(start)
my_tpot.fit(X_train, y_train)
my_tpot.export('tpot_expedia_pipeline.py')
end = time.clock()
duration = end - start

score = my_tpot.score(X_test, y_test)
print(duration,score)