Esempio n. 1
0
def test_score():
    """Ensure that the TPOT score function raises a ValueError when no optimized pipeline exists.

    ``score`` is called immediately after constructing the TPOT object,
    before any pipeline has been assigned to it.

    Raises:
        AssertionError: if ``score`` returns without raising ``ValueError``.
    """

    tpot_obj = TPOT()

    try:
        tpot_obj.score(testing_features, testing_classes)
    except ValueError:
        # Expected outcome: there is no optimized pipeline to score with.
        return

    # Raise explicitly instead of using `assert False`: assert statements are
    # removed under `python -O`, which would let this test pass vacuously.
    raise AssertionError('TPOT.score() did not raise ValueError without an optimized pipeline')
Esempio n. 2
0
def test_score():
    """Ensure that the TPOT score function raises a ValueError when no optimized pipeline exists.

    The TPOT object is scored right after construction, so no optimized
    pipeline has been set on it.

    Raises:
        AssertionError: if the ``score`` call completes without a ``ValueError``.
    """

    tpot_obj = TPOT()

    try:
        tpot_obj.score(testing_features, testing_classes)
    except ValueError:
        pass  # The error we are checking for was raised.
    else:
        # An explicit raise survives `python -O`, whereas `assert False`
        # would be compiled away and silently disable this check.
        raise AssertionError('TPOT.score() did not raise ValueError without an optimized pipeline')
Esempio n. 3
0
def test_score_2():
    """Check that TPOT.score reproduces a known score for a hand-built pipeline."""

    # Reference value; assumes use of the TPOT balanced_accuracy function
    expected_score = 0.981993770448

    tpot_obj = TPOT()
    tpot_obj._training_classes = training_classes
    tpot_obj._training_features = training_features
    tpot_obj.pbar = tqdm(total=1, disable=True)

    # Build the pipeline individual directly from its string representation
    pipeline_string = '_logistic_regression(input_df, 1.0, 0, True)'
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        pipeline_string, tpot_obj._pset)

    # Score the held-out data through the public API
    actual_score = tpot_obj.score(testing_features, testing_classes)

    # Tolerance comparison taken from http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        allowed = max(rel_tol * max(abs(a), abs(b)), abs_tol)
        return abs(a - b) <= allowed

    assert isclose(expected_score, actual_score)
Esempio n. 4
0
def test_score_2():
    """Check that TPOT.score returns the expected value for a fixed, pre-fitted pipeline."""

    # Reference value; assumes use of the TPOT balanced_accuracy function
    expected_score = 0.986318199045

    tpot_obj = TPOT()
    tpot_obj.pbar = tqdm(total=1, disable=True)

    # Create the individual from its string form, compile it, then fit it
    # on the training data so that score() has a fitted pipeline to use.
    pipeline_string = 'RandomForestClassifier(input_matrix)'
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(
        expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Score the held-out data through the public API
    actual_score = tpot_obj.score(testing_features, testing_classes)

    # Tolerance comparison taken from http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        allowed = max(rel_tol * max(abs(a), abs(b)), abs_tol)
        return abs(a - b) <= allowed

    assert isclose(expected_score, actual_score)
Esempio n. 5
0
def test_score_2():
    """Verify the score TPOT reports for a fixed logistic-regression pipeline string."""

    # Tolerance comparison taken from http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        relative_bound = rel_tol * max(abs(a), abs(b))
        return abs(a - b) <= max(relative_bound, abs_tol)

    tpot_obj = TPOT()
    tpot_obj._training_classes = training_classes
    tpot_obj._training_features = training_features
    tpot_obj.pbar = tqdm(total=1, disable=True)

    # Assumes use of the TPOT balanced_accuracy function
    known_score = 0.981993770448

    # Install a pipeline whose score is known in advance
    individual = creator.Individual.from_string(
        '_logistic_regression(input_df, 1.0, 0, True)', tpot_obj._pset)
    tpot_obj._optimized_pipeline = individual

    # Ask TPOT for the score on the testing data and compare
    score = tpot_obj.score(testing_features, testing_classes)
    assert isclose(known_score, score)
Esempio n. 6
0
def test_score_2():
    """Verify the score TPOT reports for a fixed, already-fitted random forest pipeline."""

    # Tolerance comparison taken from http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        relative_bound = rel_tol * max(abs(a), abs(b))
        return abs(a - b) <= max(relative_bound, abs_tol)

    tpot_obj = TPOT()
    tpot_obj.pbar = tqdm(total=1, disable=True)

    # Assumes use of the TPOT balanced_accuracy function
    known_score = 0.986318199045

    # Install a pipeline whose score is known in advance, compile it and
    # fit it on the training data.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        'RandomForestClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Ask TPOT for the score on the testing data and compare
    score = tpot_obj.score(testing_features, testing_classes)
    assert isclose(known_score, score)
# Separate the 'Class' column (target) from the remaining columns (inputs)
phenotype = load_gametes['Class']
individuals = load_gametes.drop('Class', axis=1)

# 75% / 25% train/test partition
X_train, X_test, y_train, y_test = train_test_split(
    individuals, phenotype, train_size=0.75, test_size=0.25)

# Expert Knowledge Filter & MDR
tpot = TPOT(
    generations=200,
    population_size=200,
    verbosity=2,
    expert_source=load_ekf,
)

# The timed section covers fitting, scoring and printing the score
t1 = time.time()
tpot.fit(X_train, y_train)
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)
t2 = time.time()
print("Time lapsed: ", t2 - t1)

# MDR Only
#tpot = TPOT(generations=500, population_size=350, verbosity=2, expert_source=None)
#t1 = time.time()
#tpot.fit(X_train, y_train)
#print(tpot.score(X_test, y_test))
#t2 = time.time()
#print("Time lapsed: ", t2 - t1)

# Random Forest
#clf = RandomForestClassifier(max_depth=5, max_features=len(X_train.columns),
#                             n_estimators=1000)
#clf.fit(X_train, y_train)
from tpot import TPOT
from sklearn.cross_validation import train_test_split
import pandas as pd
# Bind NumPy to `np`: the script calls `np.random.permutation` below, and the
# previous alias (`numpy`) left `np` undefined.
import numpy as np

# Load the dataset from the current working directory
telescope = pd.read_csv("MAGIC Gamma Telescope Data.csv")

# Clean the data: shuffle the rows, then rebuild a fresh index
telescope_shuffle = telescope.iloc[np.random.permutation(len(telescope))]
tele = telescope_shuffle.reset_index(drop=True)

# Store classes, mapping the labels 'g' -> 0 and 'h' -> 1
tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
tele_class = tele['Class'].values

# Split the row labels 75% / 25%, stratified on the class values
training_indices, validation_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)
# The original chained assignment also bound this name to the same
# held-out split; keep it available.
testing_indices = validation_indices

# Find the best model
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(tele.drop('Class', axis=1).loc[training_indices].values,
         tele.loc[training_indices, "Class"].values)

# Score the accuracy on the held-out rows (the return value is not printed)
tpot.score(tele.drop('Class', axis=1).loc[validation_indices].values,
           tele.loc[validation_indices, 'Class'].values)

# Export generated code
tpot.export('pipeline.py')
data_class = data['Class'].values

# Partition the row labels 75% / 25%, stratified on the class values.
# `testing_indices` is bound to the same held-out split as
# `validation_indices`.
training_indices, validation_indices = train_test_split(
    data.index,
    stratify=data_class,
    train_size=0.75,
    test_size=0.25,
)
testing_indices = validation_indices

# Let genetic programming search for the best ML model and hyperparameters.
# Verbosity 2 shows a loading bar.
tpot = TPOT(generations=5, verbosity=2)
train_features = data.drop('Class', axis=1).loc[training_indices].values
train_classes = data.loc[training_indices, 'Class'].values
tpot.fit(train_features, train_classes)

# Score the accuracy on the held-out rows (the return value is not printed)
validation_features = data.drop('Class', axis=1).loc[validation_indices].values
validation_classes = data.loc[validation_indices, 'Class'].values
tpot.score(validation_features, validation_classes)

# Export the generated code
tpot.export('pipeline.py')

## compute sigmoid nonlinearity
## normalizes numbers given
#def sigmoid(x):
#    output = 1/(1+np.exp(-x))
#    return output
#
## convert output of sigmoid function to its derivative
## used to calculate training later on
#def sigmoid_output_to_derivative(output):
#    return output*(1-output)
Esempio n. 10
0
from tpot import TPOT
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
# Load the iris dataset and hold out 25% of it for scoring
iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.75, test_size=0.25)

# Run TPOT for 5 generations with verbosity level 2
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(X_train, y_train)

# Report the score on the held-out portion
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)

# Write the resulting pipeline out as a Python file
tpot.export('tpot_iris_pipeline.py')
Esempio n. 11
0
import ch9util
from tpot import TPOT

# Train/test partitions as returned by ch9util.rain_split()
X_train, X_test, y_train, y_test = ch9util.rain_split()

tpot = TPOT(generations=7, population_size=110, verbosity=2)
tpot.fit(X_train, y_train)

# NOTE(review): score() is given both the training and the testing
# partitions here; presumably this targets a TPOT release whose score()
# takes four arguments - confirm against the installed version.
result = tpot.score(X_train, y_train, X_test, y_test)
print(result)
Esempio n. 12
0
# Remove the columns named in drop_list, then take a row slice with step 300
train = train.drop(drop_list, axis=1)
train = train[0:3000000:300]
train.info(memory_usage='deep')

# Split off the "hotel_cluster" column as the target; the rest are inputs
X = train.drop("hotel_cluster", axis=1).values
y = train.loc[:, "hotel_cluster"].values

# Drop the reference to `train` and force a garbage collection
del train
import gc
gc.collect()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25)

print("got here!")

# Original author's notes: seems to have a problem with pop < 5, and
# "gen 1" really means two generations!
my_tpot = TPOT(generations=20, verbosity=2, population_size=5)

# time.clock() was removed in Python 3.8. Use time.perf_counter() where it
# exists and fall back to time.clock() only on interpreters that lack it.
try:
    timer = time.perf_counter
except AttributeError:
    timer = time.clock

# The timed section covers fitting and exporting; scoring happens afterwards
start = timer()
print(start)
my_tpot.fit(X_train, y_train)
my_tpot.export('tpot_expedia_pipeline.py')
end = timer()
duration = end - start
score = my_tpot.score(X_test, y_test)
print(duration, score)