Example #1
File: tests.py Project: val922/tpot
def test_score_3():
    """Assert that the TPOTRegressor score function outputs a known score for a fix pipeline"""

    tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')
    known_score = 12.3727966005  # Assumes use of mse

    # Reify pipeline with known score

    pipeline_string = (
        "ExtraTreesRegressor("
        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
        "ExtraTreesRegressor__n_estimators=100)")
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(
        expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)
    # Get score from TPOT
    score = tpot_obj.score(testing_features_r, testing_classes_r)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    assert isclose(known_score, score)
Example #2
def generate_optimal_pipeline(features, outcomes, output_dir, memory=None):
    """
    Search for an optimal regression pipeline on the given data
    Parameters:
    - features: The feature set to train on. In scikit-learn this is often referred to as X.
    - outcomes: The labeled outcomes we are trying to predict. In scikit-learn this is often referred to as y.
    - output_dir: The directory that TPOT can write to. This is where the cache and intermediate saves are written.
    - memory: Optional cache passed through to TPOTRegressor's memory argument.
    Output:
    The TPOT object returned by fitting TPOTRegressor https://epistasislab.github.io/tpot/api/
    """
    if (features.shape[0] * features.shape[1] > (50000 * 250)):
        config_dict = 'TPOT light'
    else:
        config_dict = None

    pipeline_optimizer = TPOTRegressor(
        generations=GENERATIONS,
        population_size=POPULATION_SIZE,
        verbosity=get_verbosity('tpot'),
        random_state=RANDOM_STATE,
        template='Selector-Transformer-Regressor',
        n_jobs=N_JOBS,
        warm_start=True,
        memory=memory,
        config_dict=config_dict,
        periodic_checkpoint_folder='{DIR}/tpot-intermediate-save/'.format(
            DIR=output_dir))

    pipeline_optimizer.fit(features, outcomes)
    return pipeline_optimizer
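A minimal driver for generate_optimal_pipeline, sketched under stated assumptions: GENERATIONS, POPULATION_SIZE, RANDOM_STATE, N_JOBS and the get_verbosity helper are module-level names the function references, so illustrative stand-ins are defined here; the source project's real values may differ.

import os
import tempfile
from sklearn.datasets import make_regression

GENERATIONS = 2          # stand-in for the source project's constant
POPULATION_SIZE = 10     # stand-in
RANDOM_STATE = 42        # stand-in
N_JOBS = 1               # stand-in

def get_verbosity(name):
    return 2             # stand-in for the source project's helper

X, y = make_regression(n_samples=200, n_features=10, random_state=42)
with tempfile.TemporaryDirectory() as output_dir:
    # Pre-create the checkpoint folder the function points TPOT at
    os.makedirs(os.path.join(output_dir, 'tpot-intermediate-save'), exist_ok=True)
    optimizer = generate_optimal_pipeline(X, y, output_dir)
    print(optimizer.score(X, y))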
Example #3
def works():
    config_dict = {
        'sklearn.gaussian_process.GaussianProcessRegressor': {
            'alpha': [1e1, 1, 1e-1]
        }
    }
    model = TPOTRegressor(config_dict=config_dict,
                          crossover_rate=0.1,
                          cv=5,
                          disable_update_check=False,
                          early_stop=None,
                          generations=10,
                          max_eval_time_mins=5,
                          max_time_mins=None,
                          mutation_rate=0.9,
                          n_jobs=-1,
                          offspring_size=None,
                          population_size=100,
                          random_state=None,
                          scoring=None,
                          subsample=1.0,
                          use_dask=False,
                          verbosity=3,
                          warm_start=False)
    model.fit(X, y)
Example #5
    def _train(self):
        '''
        Calculate the residuals of the current training batch, then retrain on
        everything
        '''
        # Instantiate the preprocessor and TPOT if we haven't done so already
        if not hasattr(self, 'preprocessor'):
            self._train_preprocessor()
        if not hasattr(self, 'tpot'):
            self.tpot = TPOTRegressor(generations=2,
                                      population_size=32,
                                      offspring_size=32,
                                      verbosity=2,
                                      scoring='neg_median_absolute_error',
                                      n_jobs=16,
                                      warm_start=True)
            features = self.preprocessor.transform(self.training_batch)
            energies = [doc['energy'] for doc in self.training_batch]
            self.tpot.fit(features, energies)

        # Calculate and save the residuals of this next batch
        features = self.preprocessor.transform(self.training_batch)
        tpot_predictions = self.tpot.predict(features)
        dft_energies = np.array([doc['energy'] for doc in self.training_batch])
        residuals = tpot_predictions - dft_energies
        self.residuals.extend(list(residuals))

        # Retrain
        self.training_set.extend(self.training_batch)
        self.__train_tpot()
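The warm_start=True flag is what makes this incremental retraining work: each later call to fit() resumes the evolutionary search from the previous population instead of starting over. A minimal standalone sketch with illustrative settings:

from sklearn.datasets import make_regression
from tpot import TPOTRegressor

X, y = make_regression(n_samples=200, n_features=8, random_state=0)
tpot = TPOTRegressor(generations=1, population_size=10, warm_start=True,
                     random_state=0, verbosity=0)
tpot.fit(X, y)  # initial evolutionary search
tpot.fit(X, y)  # resumes from the surviving population rather than restarting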
Example #6
def fails():
    config_dict = {
        'example_tpot.MyGP': {
            'alpha': [1e1, 1, 1e-1],
            # 'mu_x': np.logspace(-2, 4, 10),
            # 'mu_y': np.logspace(-2, 4, 10),
        }
    }
    model = TPOTRegressor(config_dict=config_dict,
                          crossover_rate=0.1,
                          cv=5,
                          disable_update_check=False,
                          early_stop=None,
                          generations=10,
                          max_eval_time_mins=5,
                          max_time_mins=None,
                          mutation_rate=0.9,
                          n_jobs=-1,
                          offspring_size=None,
                          population_size=100,
                          random_state=None,
                          scoring=None,
                          subsample=1.0,
                          use_dask=False,
                          verbosity=3,
                          warm_start=False)
    model.fit(X, y)
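The works()/fails() contrast comes down to how TPOT resolves config_dict keys: each key is a dotted import path that TPOT must be able to import, so a custom operator like 'example_tpot.MyGP' has to live in an importable module and satisfy the scikit-learn estimator contract. A purely hypothetical sketch of what such an example_tpot.py could contain (the source project's MyGP may differ):

# example_tpot.py -- hypothetical module; TPOT imports operators by dotted path
from sklearn.gaussian_process import GaussianProcessRegressor

class MyGP(GaussianProcessRegressor):
    def __init__(self, alpha=1e-10, mu_x=1.0, mu_y=1.0):
        super().__init__(alpha=alpha)
        # Extra hyperparameters exposed to TPOT's search space
        self.mu_x = mu_x
        self.mu_y = mu_y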
Example #7
    def discover(self, limit_time=None, random_state=42):
        """Perform the discovery of a pipeline.

        Args:
            limit_time (int): In minutes, the maximum time to wait for the
                generation of the pipeline. If None, ignored.
            random_state (int): The number to seed the random state with.
                Defaults to 42.

        Returns:
            sklearn.pipeline.Pipeline: The resulting pipeline.

        """
        # Define the arguments as a dictionary
        arguments = {
            "generations": 5,
            "population_size": 20,
            "cv": 5,
            "random_state": random_state,
            "verbosity": 2,
            "max_time_mins": limit_time,
        }

        # If the search space is defined, then use it
        if isinstance(self.search_space, dict):
            arguments['config_dict'] = self.search_space

        if self.evaluation_metric is not None:
            arguments['scoring'] = self.evaluation_metric

        # If TPOT parameters were passed in initially as a dict, extend the arguments with them
        if self._passed_tpot_params is not None \
                and isinstance(self._passed_tpot_params, dict):
            arguments.update(self._passed_tpot_params)

        # Create classifier or regressor, depending on the associated problem
        if self.dataset.is_classification_problem():
            self._tpot_optimizer = TPOTClassifier(**arguments)

        if self.dataset.is_regression_problem():
            self._tpot_optimizer = TPOTRegressor(**arguments)

        # Create the train_test split, for now...

        x_train, x_val, y_train, y_val = self.dataset.train_test_split()

        # Fit TPOT so we discover the pipeline
        self._tpot_optimizer.fit(x_train, y_train)

        # Store the score obtained on our validation set
        self.validation_score = self._tpot_optimizer.score(x_val, y_val)

        return self._tpot_optimizer.fitted_pipeline_
Example #8
    def fit_single_output(row):
        tpot = TPOTRegressor(generations=generations,
                             population_size=population_size,
                             verbosity=2,
                             n_jobs=1,
                             config_dict='TPOT light')
        fit_model = tpot.fit(X, row).fitted_pipeline_
        print(tpot.score(X, row))
        return fit_model
Example #9
def build_regressor(data, name):
	X, y = data
	config = make_tpot_pmml_config(regressor_config_dict)
	del config["sklearn.neighbors.KNeighborsRegressor"]
	regressor = TPOTRegressor(generations = 3, population_size = 3, random_state = 13, config_dict = config, verbosity = 2)
	regressor.fit(X, y)
	pipeline = make_pmml_pipeline(regressor.fitted_pipeline_, active_fields = X.columns.values, target_fields = [y.name])
	print(repr(pipeline))
	store_pkl(pipeline, name)
	result = DataFrame(regressor.predict(X), columns = [y.name])
	store_csv(result, name)
Example #10
    def __init__(self, **kwargs):

        self.task = kwargs['task']
        self.speed = kwargs['speed']
        self.max_eval_time = 3*self.speed/2
        self.test_size = kwargs['test_size']
        if self.task == 'Classification':
            self.tpot_model = TPOTClassifier(generations=self.speed, population_size=self.speed*5, 
                verbosity=2, n_jobs=-1, max_eval_time_mins=self.max_eval_time, early_stop=1)
        else:
            self.tpot_model = TPOTRegressor(generations=self.speed, population_size=self.speed*5, 
                verbosity=2, n_jobs=-1, max_eval_time_mins=self.max_eval_time, early_stop=1)
Example #11
def fit_model0_adsorption_energies(adsorbate):
    '''
    Create and save a modeling pipeline to predict adsorption energies.

    Arg:
        adsorbate   String indicating which adsorbate you want to fit the model
                    for
    Saves:
        pipeline    An `sklearn.pipeline.Pipeline` object that is fit to our
                    data and can be used to make predictions on adsorption
                    energies.  The pipeline is automatically saved to our GASdb
                    cache location, which is specified as 'gasdb_path' in the
                    `gaspyrc.json` file.
    '''
    model_name = 'model0'

    print('[%s] Making %s pipeline/regression for %s...' %
          (datetime.utcnow(), model_name, adsorbate))

    # Fit the transformers and models
    docs = get_adsorption_docs(adsorbate=adsorbate)
    energies_dft = np.array([doc['energy'] for doc in docs])
    inner_fingerprinter = fingerprinters.InnerShellFingerprinter()
    outer_fingerprinter = fingerprinters.OuterShellFingerprinter()
    fingerprinter = fingerprinters.StackedFingerprinter(
        inner_fingerprinter, outer_fingerprinter)
    scaler = StandardScaler()
    pca = PCA()
    preprocessing_pipeline = Pipeline([('fingerprinter', fingerprinter),
                                       ('scaler', scaler), ('pca', pca)])
    features = preprocessing_pipeline.fit_transform(docs)
    tpot = TPOTRegressor(generations=2,
                         population_size=32,
                         offspring_size=32,
                         verbosity=2,
                         scoring='neg_median_absolute_error',
                         n_jobs=16)
    tpot.fit(features, energies_dft)

    # Make the pipeline
    steps = [('fingerprinter', fingerprinter), ('scaler', scaler),
             ('pca', pca)]
    for step in tpot.fitted_pipeline_.steps:
        steps.append(step)
    pipeline = Pipeline(steps)

    # Save the pipeline
    file_name = GASDB_LOCATION + '/pipeline_%s_%s.pkl' % (adsorbate,
                                                          model_name)
    with open(file_name, 'wb') as file_handle:
        pickle.dump(pipeline, file_handle)
Example #12
def tpot(use_dask=True):
    # TODO: Add some documentation...
    # TODO: Investigate why tpot crashes when using Dask (probably a RAM problem).
    if use_dask:
        client = Client("tcp://192.168.1.94:8786")
        print(client)
    tpot_reg = TPOTRegressor(generations=TPOT_GENERATIONS,
                             population_size=TPOT_POPULATION_SIZE,
                             random_state=SEED,
                             cv=CV,
                             use_dask=use_dask,
                             verbosity=2,
                             memory="auto")
    df = pd.read_csv("elo/data/augmented_train.csv")
    print(df.sample(5))
    # TODO: Find a better way to impute inf and missing values.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(df.median())
    X = df.drop(FEATS_EXCLUDED, axis=1, errors='ignore').values
    y = df.loc[:, "target"].values

    if use_dask:
        with ProgressBar() as pbar, Profiler() as prof:
            tpot_reg.fit(X, y)
    else:
        tpot_reg.fit(X, y)
    export_path = str(
        Path('elo/data/tpot_few_generations_augmented_dataset.py').absolute())
    tpot_reg.export(export_path)
    return tpot_reg
Example #13
    def run_example(self):

        train = pd.read_csv("./data/churn-train.csv")
        #dummy_train = pd.get_dummies(train[categorical_cols])
        categorical_feature_mask = train.dtypes == object
        categorical_cols = train.columns[categorical_feature_mask].tolist()
        le = LabelEncoder()
        #le.fit(train[categorical_cols])
        #le.transform(train[categorical_cols])
        train[categorical_cols] = train[categorical_cols].apply(
            lambda col: le.fit_transform(col))
        # numpy
        X_train = train.drop(columns=['churn_probability']).to_numpy()
        y_train = train["churn_probability"].to_numpy()

        test = pd.read_csv("./data/churn-test.csv")
        #dummy_new = pd.get_dummies(test[categorical_cols])
        test[categorical_cols] = test[categorical_cols].apply(
            lambda col: le.fit_transform(col))
        X_test = test.drop(columns=['churn_probability']).to_numpy()
        y_test = test["churn_probability"].to_numpy()

        tpot = TPOTRegressor(generations=5,
                             population_size=50,
                             verbosity=2,
                             random_state=42,
                             scoring='neg_mean_absolute_error',
                             cv=5)
        tpot.fit(X_train, y_train)
        print(tpot.score(X_test, y_test))
        tpot.export('tpot_iris_pipeline.py')

        return tpot.score(X_test, y_test)
Example #14
def rolling_forecasts(data, target):
    """
    Fits the rolling forecast model
    :param data: feature DataFrame
    :param target: variable to be forecasted
    :return: the fitted TPOT model
    """
    model = TPOTRegressor(generations=5, population_size=50, verbosity=2)
    model.fit(data.values, target)
    # for i in range(0, ldf.shape[0] - window):
    #     model.fit(ldf.values[i:i + window, :], ldf['target'].values[i:i + window])

    return model
Example #15
    def __init__(self, dataset_path, json_path, n_jobs=1, config_dict=None, task="Classification"):
        self.scores = []
        self.datasets_path = dataset_path

        self.JSON = json_path

        if task == "Classification":
            self.tpot = TPOTClassifier(population_size=1, generations=0, verbosity=0,
                                       n_jobs=n_jobs, config_dict=config_dict, warm_start=True)
        elif task == "Regression":
            self.tpot = TPOTRegressor(population_size=1, generations=0, verbosity=0,
                                      n_jobs=n_jobs, config_dict=config_dict, warm_start=True)
        else:
            raise ValueError

        self.tpot._fit_init()  # Create _pset(PrimitiveSet)
Example #16
def test_export_pipeline_5():
    """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with SelectFromModel."""
    tpot_obj = TPOTRegressor()
    pipeline_string = (
        'DecisionTreeRegressor(SelectFromModel(input_matrix, '
        'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, '
        'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,'
        'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'].values, random_state=42)

exported_pipeline = make_pipeline(
    SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.05, n_estimators=100), threshold=0.05),
    DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators,
                                            tpot_obj._pset)
Example #17
def createRegressor(config):
    tpot = TPOTRegressor(
        generations=int(config["generations"]),
        population_size=int(config["population_size"]),
        offspring_size=None
        if config["offspring_size"] is None else int(config["offspring_size"]),
        mutation_rate=config["mutation_rate"],
        crossover_rate=config["crossover_rate"],
        scoring=config["scoring"],
        cv=int(config["cv"]),
        subsample=config["subsample"],
        n_jobs=int(config["n_jobs"]),
        max_time_mins=None
        if config["max_time_mins"] is None else int(config["max_time_mins"]),
        max_eval_time_mins=config["max_eval_time_mins"],
        random_state=None
        if config["random_state"] is None else int(config["random_state"]),
        config_dict=config["config_dict"],
        warm_start=config["warm_start"],
        memory=config["memory"],
        use_dask=config["use_dask"],
        periodic_checkpoint_folder=config["periodic_checkpoint_folder"],
        early_stop=None
        if config["early_stop"] is None else int(config["early_stop"]),
        verbosity=config["verbosity"],
        disable_update_check=config["disable_update_check"])
    return tpot
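A hypothetical config dictionary for exercising createRegressor; the keys map one-to-one onto TPOTRegressor's constructor arguments, so any value TPOT itself accepts is valid here (values below are illustrative only):

config = {
    "generations": 2, "population_size": 10, "offspring_size": None,
    "mutation_rate": 0.9, "crossover_rate": 0.1,
    "scoring": "neg_mean_squared_error", "cv": 5, "subsample": 1.0,
    "n_jobs": 1, "max_time_mins": None, "max_eval_time_mins": 5,
    "random_state": 42, "config_dict": None, "warm_start": False,
    "memory": None, "use_dask": False, "periodic_checkpoint_folder": None,
    "early_stop": None, "verbosity": 2, "disable_update_check": False,
}
tpot = createRegressor(config)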
Example #18
def runTPOT(X, y, metric, algo):
    aml_config_dict = aml_config()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25)

    if algo == "Classifier":
        pipeline_optimizer = TPOTClassifier(generations=1,
                                            population_size=5,
                                            verbosity=2,
                                            warm_start=True)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    elif algo == 'Regressor':

        def aml_reg_scorer(y_true, y_pred):
            rmse = sqrt(mean_squared_error(y_true, y_pred))
            return rmse

        aml_custom_scorer = make_scorer(aml_reg_scorer,
                                        greater_is_better=False)
        pipeline_optimizer = TPOTRegressor(generations=1,
                                           population_size=5,
                                           verbosity=2,
                                           warm_start=True,
                                           scoring=aml_custom_scorer)
        pipeline_optimizer.fit(X_train, y_train)
        print(pipeline_optimizer.score(X_test, y_test))
    else:
        raise Exception('Incorrect Problem Type')
    return pipeline_optimizer, pipeline_optimizer.score(X_test, y_test), len(
        pipeline_optimizer.evaluated_individuals_)
Example #19
def test_set_param_recursive_2():
    """Assert that set_param_recursive sets \"random_state\" to 42 in nested estimator in SelectFromModel."""
    pipeline_string = (
        'DecisionTreeRegressor(SelectFromModel(input_matrix, '
        'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, '
        'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,'
        'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)'
    )
    tpot_obj = TPOTRegressor()
    tpot_obj._fit_init()
    deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
    set_param_recursive(sklearn_pipeline.steps, 'random_state', 42)

    assert getattr(getattr(sklearn_pipeline.steps[0][1], 'estimator'), 'random_state') == 42
    assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42
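The utility under test is importable directly (from tpot.export_utils in recent TPOT releases), so the same behavior can be shown on a hand-built pipeline equivalent to the one above; a minimal sketch:

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from tpot.export_utils import set_param_recursive

pipe = make_pipeline(
    SelectFromModel(estimator=ExtraTreesRegressor(n_estimators=100),
                    threshold=0.05),
    DecisionTreeRegressor(max_depth=8),
)
# Walks every step, including nested estimators, and sets the parameter wherever it exists
set_param_recursive(pipe.steps, 'random_state', 42)
print(pipe.steps[0][1].estimator.random_state)  # 42
print(pipe.steps[1][1].random_state)            # 42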
Example #20
    def build_pipeline(self):
        """
        Makes a pipeline based on data_config
        This is needed because TPOT does not perform automatic data encoding
        """
        categorical_list = infer_categoricals(self.X)
        preprocessing_steps = []
        if self.data_config.get("text_columns"):
            print(f"Applying TFIDF to text columns: {self.data_config.get('text_columns')}")
            preprocessing_steps.append(make_pipeline(
                ColumnSelector(cols=self.data_config.get("text_columns"), drop_axis=True),
                TfidfVectorizer()
            ))
            categorical_list = [c for c in categorical_list if c not in self.data_config["text_columns"]]
        if categorical_list:
            print(f"Applying One Hot Encoding to categorical columns: {categorical_list}")
            preprocessing_steps.append(make_pipeline(
                ColumnSelector(cols=categorical_list),
                OneHotEncoder(handle_unknown="impute")
            ))
        if preprocessing_steps:
            preprocessing_steps = make_union(*preprocessing_steps)
            preprocessing_steps = make_pipeline(preprocessing_steps, SimpleImputer())
        else:
            preprocessing_steps = SimpleImputer()
        if self.problem_type == "classification":
            automl = TPOTClassifier(**self.automl_settings)
        else:
            automl = TPOTRegressor(**self.automl_settings)
        automl_pipeline = make_pipeline(
            preprocessing_steps,
            automl
        )
        return automl_pipeline
Example #21
    def run(self, train_path, test_path, target, task):
        train = pd.read_csv(train_path)
        X_train = train.drop(columns=[target]).to_numpy()
        y_train = train[target].to_numpy()

        test = pd.read_csv(test_path)
        X_test = test.drop(columns=[target]).to_numpy()
        y_test = test[target].to_numpy()

        if task == "class":
            tpot = TPOTClassifier(max_time_mins=60,
                                  verbosity=3,
                                  random_state=42,
                                  scoring='roc_auc',
                                  cv=5,
                                  n_jobs=-1,
                                  early_stop=3)
        elif task == "reg":
            tpot = TPOTRegressor(max_time_mins=60,
                                 verbosity=3,
                                 random_state=42,
                                 scoring='neg_mean_absolute_error',
                                 cv=5,
                                 n_jobs=-1,
                                 early_stop=3)

        tpot.fit(X_train, y_train)
        print(tpot.score(X_test, y_test))
        tpot.export('/home/lferreira/autoautoml/data/churn/tpot/plasma.py')

        return tpot.score(X_test, y_test)
Example #22
File: tests.py Project: val922/tpot
def test_config_dict_params():
    """Assert that TPOT uses TPOT's lite dictionary of operators when config_dict is \'TPOT light\' or \'TPOT MDR\'"""
    tpot_obj = TPOTClassifier(config_dict='TPOT light')
    assert tpot_obj.config_dict == classifier_config_dict_light

    tpot_obj = TPOTClassifier(config_dict='TPOT MDR')
    assert tpot_obj.config_dict == tpot_mdr_classifier_config_dict

    tpot_obj = TPOTRegressor(config_dict='TPOT light')
    assert tpot_obj.config_dict == regressor_config_dict_light

    try:
        tpot_obj = TPOTRegressor(config_dict='TPOT MDR')
        assert False
    except TypeError:
        assert True
Example #23
File: tests.py Project: val922/tpot
def test_init_default_scoring():
    """Assert that TPOT intitializes with the correct default scoring function"""

    tpot_obj = TPOTRegressor()
    assert tpot_obj.scoring_function == 'neg_mean_squared_error'

    tpot_obj = TPOTClassifier()
    assert tpot_obj.scoring_function == 'accuracy'
Example #24
def regressor(verbosity, max_time_mins, max_eval_time_mins, config_dict,
              warm_start, scoring):
    if verbosity:
        model_def = TPOTRegressor(verbosity=2,
                                  max_time_mins=max_time_mins,
                                  max_eval_time_mins=max_eval_time_mins,
                                  config_dict=config_dict,
                                  warm_start=warm_start,
                                  scoring=scoring)
    else:
        model_def = TPOTRegressor(verbosity=0,
                                  max_time_mins=max_time_mins,
                                  max_eval_time_mins=max_eval_time_mins,
                                  config_dict=config_dict,
                                  warm_start=warm_start,
                                  scoring=scoring)
    return model_def
Example #25
    def train(self, params):
        #print('Training the model')
        #self.clf = RandomForestRegressor() # 0.625
        #self.clf = RandomForestClassifier() # 0.5
        #self.clf = GradientBoostingRegressor() # 0.730
        #self.clf = BaggingRegressor(KNeighborsClassifier(),max_samples=0.5, max_features=0.5) # MemoryError
        #self.clf = AdaBoostRegressor() # 0.674
        #clf = KNeighborsRegressor() # MemoryError
        #self.clf = MLPRegressor(hidden_layer_sizes = (5,)) # MemoryError
        #self.clf = GradientBoostingRegressor(**params)
        #self.clf = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task= 75000, per_run_time_limit= 7500 )
        self.clf = TPOTRegressor(generations=5,
                                 population_size=50,
                                 verbosity=2,
                                 n_jobs=3)
        self.clf.fit(self.x, self.y)
        self.clf.export('tpot_best_pipeline.py')
Example #26
def train_tpot(name, X, y, gen, cores):

    test_name = str('gen_' + str(gen) + name + '_' + time.strftime('%y%m%d'))

    print('Training with TPOT .... ', test_name)
    t1 = time.time()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        test_size=0.25)
    tpot = TPOTRegressor(generations=gen,
                         population_size=50,
                         verbosity=2,
                         n_jobs=cores)
    tpot.fit(X_train, y_train.reshape(-1, ))

    print(tpot.score(X_test, y_test))
    t2 = time.time()
    delta_time = t2 - t1
    print('Time to train...:', delta_time)

    print('Saving the model ...')
    tpot.export('trained_models/' + test_name + '.py')
    joblib.dump(tpot.fitted_pipeline_, 'trained_models/' + test_name + '.pk1')
    print(test_name, ' saved ... ')
Example #27
    def fit(self, df, target, **fit_kwargs):
        """
        Train a TPOTRegressor or TPOTClassifier by fitting on a dataframe.

        Args:
            df (pandas.DataFrame): The df to be used for training.
            target (str): The key used to identify the machine learning target.
            **fit_kwargs: Keyword arguments to be passed to the TPOT backend.
                These arguments must be valid arguments to the TPOTBase class.

        Returns:
            TPOTAdaptor (self)

        """
        # Prevent goofy pandas casting by casting to native
        y = df[target].values
        X = df.drop(columns=target).values

        # Determine learning type based on whether classification or regression
        self.mode = regression_or_classification(df[target])

        mltype_str = "Classifier" if self.mode == AMM_CLF_NAME else "Regressor"
        self.tpot_kwargs["template"] = self.tpot_kwargs.get(
            "template", "Selector-Transformer-{}".format(mltype_str))

        if self.mode == AMM_CLF_NAME:
            self.tpot_kwargs["config_dict"] = self.tpot_kwargs.get(
                "config_dict", TPOT_CLASSIFIER_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "balanced_accuracy"
            self._backend = TPOTClassifier(**self.tpot_kwargs)
        elif self.mode == AMM_REG_NAME:
            self.tpot_kwargs["config_dict"] = self.tpot_kwargs.get(
                "config_dict", TPOT_REGRESSOR_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "neg_mean_absolute_error"
            self._backend = TPOTRegressor(**self.tpot_kwargs)
        else:
            raise ValueError("Learning type {} not recognized as a valid mode "
                             "for {}".format(self.mode,
                                             self.__class__.__name__))
        self._features = df.drop(columns=target).columns.tolist()
        self._fitted_target = target
        self._backend = self._backend.fit(X, y, **fit_kwargs)
        return self
Example #28
def tpot_test(conf):
    from tpot import TPOTRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import TimeSeriesSplit

    p.load_config(conf)
    ds = dl.load_price_data()
    ds = add_features(ds)

    X = ds[p.feature_list][:-1]
    y = ds['DR'].shift(-1)[:-1]

    # Split Train and Test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        test_size=0.2)

    tpot = TPOTRegressor(n_jobs=-1,
                         verbosity=2,
                         max_time_mins=60,
                         cv=TimeSeriesSplit(n_splits=3))

    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('./tpot_out.py')
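TPOT's cv argument accepts any scikit-learn CV splitter object, not just an integer fold count, which is what makes the TimeSeriesSplit above work: each fold trains on the past and validates on the future, so no shuffling leaks later data into training. A minimal sketch with synthetic data and illustrative settings:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from tpot import TPOTRegressor

rng = np.random.RandomState(0)
X = rng.rand(300, 5)
y = rng.rand(300)

tpot = TPOTRegressor(generations=2, population_size=10, random_state=0,
                     cv=TimeSeriesSplit(n_splits=3), verbosity=0)
tpot.fit(X, y)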
Example #29
def train_and_pickle_best_model(target, X, y, val_X, val_y):
    print('AutoML Search for good model for {}'.format(target))
    pipeline_optimizer = TPOTRegressor(
        generations=10,
        population_size=150,
        cv=3,
        random_state=0xDEADBEEF,
        verbosity=3,
        scoring='r2',
        n_jobs=-1,
        early_stop=5,
        periodic_checkpoint_folder='tpot_checkpoint')
    pipeline_optimizer.fit(X, y)
    new_preds = pipeline_optimizer.predict(val_X)
    mae = mean_absolute_error(val_y, new_preds)
    rmse = sqrt(mean_squared_error(val_y, new_preds))
    r2 = r2_score(val_y, new_preds)
    print("TPOT mae:", mae)
    print("TPOT rmse:", rmse)
    print("TPOT R^2 score:", r2)
    pipeline_optimizer.export(
        'models/tpot_exported_pipeline_{}.py'.format(target))
    dump(pipeline_optimizer.fitted_pipeline_,
         'models/{}-best-model-automl.joblib'.format(target))
    return r2, mae, rmse
Example #30
    def fit(self):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, train_size=self.train_size, random_state=0)

        tpot = TPOTRegressor(generations=self.generation,
                             population_size=self.generation,
                             verbosity=3,
                             warm_start=True,
                             config_dict=self.config_dict)
        startTime = datetime.datetime.now()
        tpot.fit(X_train, y_train)

        endTime = datetime.datetime.now()

        predict_score = tpot.score(X_test, y_test)
        cost_time = endTime - startTime

        return predict_score, cost_time
Example #31
def functionRegression(sparkDF, listOfFeatures, label):
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    df = sparkDF.toPandas()
    # Keep only the requested feature columns plus the label
    df = df[df.columns.intersection(listOfFeatures + [label])]
    X = df.drop(label, axis=1).values
    y = df[label].values
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=1,
                                                        test_size=0.2)
    tpotModel = TPOTRegressor(verbosity=3,
                              generations=10,
                              max_time_mins=15,
                              n_jobs=-1,
                              random_state=25,
                              population_size=15)
    tpotModel.fit(X_train, y_train)
    print(tpotModel.score(X_test, y_test))
Example #32
def test_score_3():
    """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline"""

    tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')
    tpot_obj._pbar = tqdm(total=1, disable=True)
    known_score = 8.9673743407873712  # Assumes use of mse
    # Reify pipeline with known score
    tpot_obj._optimized_pipeline = creator.Individual.\
        from_string('ExtraTreesRegressor(GradientBoostingRegressor(input_matrix, 100.0, 0.11), 0.17999999999999999)', tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)

    # Get score from TPOT
    score = tpot_obj.score(testing_features_r, testing_classes_r)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    assert isclose(known_score, score)
Example #33
def test_sample_weight_func():
    """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights"""

    tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')

    # Reify pipeline with known score

    pipeline_string = ("ExtraTreesRegressor("
        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
        "ExtraTreesRegressor__n_estimators=100)")
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)

    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)

    # make up a sample weight
    training_classes_r_weight = np.array(range(1, len(training_classes_r)+1))
    training_classes_r_weight_dict = set_sample_weight(tpot_obj._fitted_pipeline.steps, training_classes_r_weight)

    np.random.seed(42)
    cv_score1 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

    np.random.seed(42)
    cv_score2 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

    np.random.seed(42)
    cv_score_weight = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict)

    np.random.seed(42)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict)
    # Get score from TPOT
    known_score = 12.643383517 # Assumes use of mse
    score = tpot_obj.score(testing_features_r, testing_classes_r)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
    assert np.allclose(cv_score1, cv_score2)
    assert not np.allclose(cv_score1, cv_score_weight)
    assert isclose(known_score, score)
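Under the hood this test leans on scikit-learn's fit-parameter routing: Pipeline.fit forwards keyword arguments of the form '<step_name>__sample_weight' to that step's fit method, which is essentially the dictionary set_sample_weight builds. A standalone sketch of the mechanism, independent of TPOT:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

X = np.random.rand(100, 4)
y = np.random.rand(100)
weights = np.arange(1, 101, dtype=float)

pipe = Pipeline([('gbr', GradientBoostingRegressor(n_estimators=10))])
# Routed to GradientBoostingRegressor.fit(X, y, sample_weight=weights)
pipe.fit(X, y, gbr__sample_weight=weights)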
Example #34
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]
test.drop('Item_Outlet_Sales',axis=1,inplace=True)

## removing id variables 
tpot_train = train.drop(['Outlet_Identifier','Item_Type','Item_Identifier'],axis=1)
tpot_test = test.drop(['Outlet_Identifier','Item_Type','Item_Identifier'],axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train.drop('Item_Outlet_Sales',axis=1,inplace=True)

# finally building model using tpot library
from tpot import TPOTRegressor

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

tpot.export(data+'tpot_boston_pipeline.py')

## predicting using tpot optimised pipeline
tpot_pred = tpot.predict(tpot_test)
sub1 = pd.DataFrame(data=tpot_pred)

#sub1.index = np.arange(0, len(test)+1)
sub1 = sub1.rename(columns = {0:'Item_Outlet_Sales'})
sub1['Item_Identifier'] = test['Item_Identifier']
sub1['Outlet_Identifier'] = test['Outlet_Identifier']
sub1.columns = ['Item_Outlet_Sales','Item_Identifier','Outlet_Identifier']
sub1 = sub1[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
Example #35
def model_dev(train_set,matchups,spreads):
	""" Create the testing set for the algo creation """
	# Create a sample set to pass into the machine learning algorithm
	X = train_set[['rush_attempt_diff', 'turn_diff', 'yards_diff', 'third_diff', 'sack_diff', 'sack_ydiff', 'poss_diff', 'p_attempt_diff']].copy()
	# X = df[['poss_diff', 'third_diff', 'turn_diff', 'pass_diff', 'rush_diff']].copy()

	# Create results vector (a home win = 1, a home loss or tie = 0)
	train_set.rename(columns={'result_spread':'class'},inplace=True)
	y = train_set['class']#np.array(np.where(df['home_score'] > df['away_score'], 1, 0))

	""" Train, test, and predict the algorithm """
	# Scale the sample data
	scaler = preprocessing.StandardScaler().fit(X)
	X = scaler.transform(X)

	# Delete the dataframe to clear memory
	del train_set

	# Split out training and testing data sets
	X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.25,random_state=0)

	# alphas = [0.1, 0.3, 0.9, 1.0, 1.3, 1.9, 2.0, 2.3, 2.9]
	# for alpha in alphas:
	# 	reg = linear_model.Ridge(alpha = alpha)
	# 	reg.fit(X_train,y_train)
	# 	print 'alpha = ',alpha,', score = ',reg.score(X_test,y_test)
	# input()
	pipeline_optimizer = TPOTRegressor(generations = 5, population_size = 10, random_state = 42, cv = 5, verbosity = 2, n_jobs = 3)#, scoring = 'f1')
	pipeline_optimizer.fit(X_train,y_train)
	print(pipeline_optimizer.score(X_test,y_test))
	pipeline_optimizer.export('NFL_ML_TPOT_Regressor.py')

	# Remove the 'week' 'home_team' and 'away_team' columns from matchups as they are not used in the algorithm
	matchups.drop(['week', 'home_team', 'away_team'], axis=1, inplace=True)


	"""
	for feat in range(1,len(matchups.columns)):
		for c in C_vec:
			# Create the classifier and check the score
			# clf = LogisticRegression()
			clf = linear_model.LogisticRegression(C=c,random_state=42)
			selector = RFE(clf)
			selector = selector.fit(X_train,y_train)

			# Calculate probabilities using the predict_proba method for logistic regression
			probabilities = selector.predict_proba(scaler.transform(matchups))

			# Vectorize the spread_conversion function and apply the function to the probabilities result vector
			vfunc = np.vectorize(spread_conversion)
			predicted_spreads = np.apply_along_axis(vfunc,0,probabilities[:,0])

			# If the actual line for the home team is lower than the predicted line then you would take the away team, otherwise take the home team
			bet_vector = np.array(np.where(predicted_spreads > spreads,0,1))

			# Create the actual result vector where a tie counts as a loss for the home team
			game_result = np.array(np.where(home_score.ix[:,0] + predicted_spreads[:] > away_score.ix[:,0], 1, 0))

			# Check to see where the bet_vector equals the actual game result with the spread included
			result = np.array(np.where(bet_vector == game_result,1,0))

			prob_result = float(np.sum(result)) / len(result)

			# print 'Number of features =', feat, 'C =',c,'  Percent correct =',prob_result

			if prob_result > prob_val:
				prob_val = prob_result
				C_val = c
				feat_val = feat

	print 'Score =',selector.score(X_test,y_test)
	# print prob_val, C_val, feat

	clf = linear_model.LogisticRegression(C=C_val,random_state=42)
	clf = clf.fit(X_train,y_train)
	probabilities = clf.predict_proba(scaler.transform(matchups))
	vfunc = np.vectorize(spread_conversion)
	predicted_spreads = np.apply_along_axis(vfunc,0,probabilities[:,0])
	"""

	predicted_spreads = pd.DataFrame(pipeline_optimizer.predict(scaler.transform(matchups)),columns = ['results'])
	bet_vector = np.array(np.where(predicted_spreads > spreads,0,1))
	print(spreads)
	print(predicted_spreads)
	print(bet_vector)
Example #36
TPOTSelectPercentile, TPOTSelectPercentile_args = TPOTOperatorClassFactory(
    test_operator_key_1,
    classifier_config_dict[test_operator_key_1]
)

TPOTSelectFromModel, TPOTSelectFromModel_args = TPOTOperatorClassFactory(
    test_operator_key_2,
    classifier_config_dict[test_operator_key_2]
)

mnist_data = load_digits()
training_features, testing_features, training_target, testing_target = \
    train_test_split(mnist_data.data.astype(np.float64), mnist_data.target.astype(np.float64), random_state=42)

tpot_obj = TPOTClassifier()
tpot_obj._fit_init()

tpot_obj_reg = TPOTRegressor()
tpot_obj_reg._fit_init()

def test_export_random_ind():
    """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 39."""
    tpot_obj = TPOTClassifier(random_state=39, config_dict="TPOT light")
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline = tpot_obj._toolbox.individual()
    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)