Example #1
def train_and_pickle_best_model(target, X, y, val_X, val_y):
    print('AutoML Search for good model for {}'.format(target))
    pipeline_optimizer = TPOTRegressor(
        generations=10,
        population_size=150,
        cv=3,
        random_state=0xDEADBEEF,
        verbosity=3,
        scoring='r2',
        n_jobs=-1,
        early_stop=5,
        periodic_checkpoint_folder='tpot_checkpoint')
    pipeline_optimizer.fit(X, y)
    new_preds = pipeline_optimizer.predict(val_X)
    mae = mean_absolute_error(val_y, new_preds)
    rmse = sqrt(mean_squared_error(val_y, new_preds))
    r2 = r2_score(val_y, new_preds)
    print("TPOT mae:", mae)
    print("TPOT rmse:", rmse)
    print("TPOT R^2 score:", r2)
    pipeline_optimizer.export(
        'models/tpot_exported_pipeline_{}.py'.format(target))
    dump(pipeline_optimizer.fitted_pipeline_,
         'models/{}-best-model-automl.joblib'.format(target))
    return r2, mae, rmse
Example #2
def tpot_regression(x_calib,
                    y_calib,
                    x_prod,
                    y_prod,
                    results_direct,
                    cv_folds,
                    error_metric,
                    num_jobs,
                    gens,
                    pop,
                    mins,
                    mins_per_pipeline,
                    verbose,
                    early_stop_generations,
                    tpot_config_dict,
                    model_name='tpot_best'):

    checkpoint_folder = results_direct + 'checkpoint_folder/'
    if not Path(checkpoint_folder).is_dir():
        os.mkdir(checkpoint_folder)

    ml_model = TPOTRegressor(generations=gens,
                             population_size=pop,
                             scoring=error_metric,
                             max_time_mins=mins,
                             cv=cv_folds,
                             verbosity=verbose,
                             n_jobs=num_jobs,
                             early_stop=early_stop_generations,
                             max_eval_time_mins=mins_per_pipeline,
                             config_dict=tpot_config_dict,
                             periodic_checkpoint_folder=checkpoint_folder)

    ml_model.fit(x_calib, y_calib)

    # save entire pipeline
    ml_model.export(results_direct + model_name + '.py')
    joblib.dump(ml_model.fitted_pipeline_,
                results_direct + model_name + '.sav')

    # for cross-validation errors, see the exported model .py file

    # production - results and errors
    y_prod_predict = ml_model.predict(x_prod)
    np.save(results_direct + model_name + '_prod_predicted.npy',
            y_prod_predict)

    df_prod_errors = pd.DataFrame(index=[
        'Mean Squared Error', 'Median Absolute Error',
        'Correlation Coefficient', 'R2'
    ])
    df_prod_errors['TPOT Best'] = [
        mean_squared_error(y_prod, y_prod_predict),
        median_absolute_error(y_prod, y_prod_predict),
        np.corrcoef(y_prod, y_prod_predict)[0][-1],
        r2_score(y_prod, y_prod_predict)
    ]
    df_prod_errors.to_csv(results_direct + model_name + '_prod_errors.csv')
Example #3
def build_regressor(data, name):
	X, y = data
	config = make_tpot_pmml_config(regressor_config_dict)
	del config["sklearn.neighbors.KNeighborsRegressor"]
	regressor = TPOTRegressor(generations = 3, population_size = 3, random_state = 13, config_dict = config, verbosity = 2)
	regressor.fit(X, y)
	pipeline = make_pmml_pipeline(regressor.fitted_pipeline_, active_fields = X.columns.values, target_fields = [y.name])
	print(repr(pipeline))
	store_pkl(pipeline, name)
	result = DataFrame(regressor.predict(X), columns = [y.name])
	store_csv(result, name)
Example #4
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name,
                  target_name):
    tp = TPOTRegressor(verbosity=2)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    tp.export('tpot_pipeline_dont_overfit.py')
    time = timer(start_time)
    preds = tp.predict(X_test)

    time_out = open(name_dataset + '_' + 'tpot', "w")
    time_out.write(time)
    time_out.close()

    submission = pd.DataFrame({id_name: id_test, target_name: preds})

    submission.to_csv('submission_' + name_dataset + '_' + 'tpot.csv',
                      index=False)
Example #5
def model_selection_and_HPO(dataframe, target="job_performance", test_size=0.25, r_seed=123):
    """ Pass in the dataframe that has gone through feature selection
    Uses the TPOT regressor module from TPOT to perform MS and HPO. As this modeling uses some element
    of stochasticity, it may provide different results every time. The longer you run this,
    the more similar the final models will look like in the end.
    
    Finally outputs a .py file with the selected model and its hyperparameters, for which we can import.
    """
    from sklearn.model_selection import train_test_split
    import timeit
    from tpot import TPOTRegressor
    from sklearn.metrics import (
        confusion_matrix,
        roc_auc_score,
        precision_recall_fscore_support,
        accuracy_score,
    )

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        dataframe.loc[:, dataframe.columns != target].values,
        dataframe[target].values.ravel(),
        test_size=test_size,
        random_state=r_seed)
    
    y_train = y_train.ravel()
    y_test = y_test.ravel()

    # model selection and hyperparameter optimization with TPOT Regressor
    tpot_regressor = TPOTRegressor(generations=20, 
                                   population_size=50, 
                                   cv=10,
                                   random_state=r_seed, 
                                   verbosity=2, 
                                   memory='auto')
    
    start_time = timeit.default_timer()
    tpot_regressor.fit(X_train, y_train)
    y_pred = tpot_regressor.predict(X_test)
    end_time = timeit.default_timer()

    print(f"Total runtime for the Employee dataset: {end_time-start_time}s")
    print("TPOT Score: {}".format(tpot_regressor.score(X_test, y_test)))

    tpot_regressor.export('tpot_exported_pipeline.py')
Example #6
def build_regressor(data, feature_pipeline, generations, population_size,
                    name):
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    config = make_tpot_pmml_config(regressor_config_dict)
    config = filter_config(config)
    del config["sklearn.neighbors.KNeighborsRegressor"]
    regressor = TPOTRegressor(generations=generations,
                              population_size=population_size,
                              random_state=13,
                              config_dict=config,
                              verbosity=2)
    regressor.fit(Xt, y)
    pipeline = Pipeline(steps=feature_pipeline.steps +
                        regressor.fitted_pipeline_.steps)
    pipeline = make_pmml_pipeline(pipeline,
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(regressor.predict(Xt), columns=[y.name])
    store_csv(result, name)
Example #7
    def callback(self, channel, method, properties, body):
        with self.lock:
            (symbol, X_train, X_test, y_train, y_test,
             folds_index) = decode_data(body)
            channel.basic_ack(delivery_tag=method.delivery_tag)
        logger.info("data received %s %d", symbol, folds_index)
        tpot = TPOTRegressor(memory='auto',
                             generations=100,
                             population_size=100,
                             n_jobs=-1,
                             max_time_mins=20,
                             max_eval_time_mins=20,
                             config_dict='TPOT light')
        try:
            tpot.fit(X_train, y_train)
        except Exception as e:
            logger.error(e)
            data = (None, None, None, None)
            with self.lock:
                channel.basic_publish(exchange='',
                                      routing_key='tpot_pipelines',
                                      body=encode_data(data))
            return

        test_prediction = tpot.predict(X_test)
        test_prediction_error = abs((y_test - test_prediction) * 100 / y_test)
        score = tpot.score(X_test, y_test)
        logger.info("sending result of %s %s", symbol, folds_index)
        try:
            data = (tpot.fitted_pipeline_, score, folds_index, symbol)
            with self.lock:
                channel.basic_publish(exchange='',
                                      routing_key='tpot_pipelines',
                                      body=encode_data(data))
        except Exception:
            import pdb
            pdb.set_trace()
Example #8
# (the opening of this call was truncated; reconstructed to mirror tpotReg2 below)
tpotReg1 = TPOTRegressor(generations=50,
                         population_size=50,
                         max_time_mins=5,
                         scoring='r2',
                         verbosity=3,
                         n_jobs=4)

tpotReg2 = TPOTRegressor(generations=50,
                         population_size=50,
                         max_time_mins=5,
                         scoring='r2',
                         verbosity=3,
                         n_jobs=4)

tpotReg1.fit(X_train, y_train1)
tpotReg2.fit(X_train, y_train2)

y_pred1 = tpotReg1.predict(X_test)
y_pred2 = tpotReg2.predict(X_test)
y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

printMetrics(y_true=y_test, y_pred=y_pred)
val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

y_pred1 = tpotReg1.predict(X_train)
y_pred2 = tpotReg2.predict(X_train)
y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

printMetrics(y_true=y_train, y_pred=y_pred)
metrics = getMetrics(y_true=y_train, y_pred=y_pred)

tpotReg1.export('tpot_pipeline1.py')
tpotReg2.export('tpot_pipeline2.py')
Example #9
y_train = y_train.ravel()
y_test = y_test.ravel()

## TPOT Model Performance

tpot_regressor_pipeline_selector = TPOTRegressor(
    generations=20,
    population_size=50,
    offspring_size=None,
    cv=10,
    random_state=42,
    verbosity=2,
    memory="auto",
    warm_start=True,
    use_dask=False,
    periodic_checkpoint_folder=PERIODIC_CHECKPOINT_FOLDER,
)

tpot_regressor_pipeline_selector.fit(X_train, y_train)
y_pred = tpot_regressor_pipeline_selector.predict(X_test)


def save_best_pipeline(selected_pipeline, filename):
    selected_pipeline.export(
        os.path.join(PERIODIC_CHECKPOINT_FOLDER, f"{filename}.py"))


# tpot_regressor_pipeline_selector.export(os.path.join(PERIODIC_CHECKPOINT_FOLDER,'tpot_exported_pipeline.py'))

save_best_pipeline(tpot_regressor_pipeline_selector, "tpot_exported_pipeline")
Example #10
    def scoring(y_real, y_predicted):
        return sum(y_predicted)[-1] / (len(y_predicted) - 1)
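    # Note (assumption, not from the original source): depending on the TPOT
    # version, a bare (y_true, y_pred) metric like `scoring` above may need to
    # be wrapped with sklearn's make_scorer before being passed as `scoring=`:
    #
    #     from sklearn.metrics import make_scorer
    #     custom_scorer = make_scorer(scoring, greater_is_better=True)
    #     model = TPOTRegressor(scoring=custom_scorer, ...)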

    for i in range(10):
        print('#' * 80)
        print(f'# GENERATION {i + 1}')
        print('#' * 80)
        x = np.array(walker.state_history[:-1])
        y = np.array([
            list(a) + [r]
            for a, r in zip(walker.action_history, walker.reward_history)
        ])

        walker.save_history(f'sillywalker{i+1}')
        model = TPOTRegressor(generations=5,
                              population_size=20,
                              scoring=scoring,
                              verbosity=2,
                              config_dict=regressor_config_dict_light)
        model.fit(x, y)
        for _ in range(10):
            while not walker.done:
                s = walker.state
                prediction = model.predict(np.array([s]))[0]
                print(prediction)

                action = Action(*prediction[:-1])
                walker.step(action)

            walker.reset()
Example #11
print(TDmodel_gmd_F.score(gmd_X_td_F, gmd_y_td_F))

# PS R2
gmd_df_ps_F = PS_data_F
gmd_df_ps_F = gmd_df_ps_F.dropna()
gmd_X_ps_F = gmd_df_ps_F.drop(['ageAtScan1', 'goassessDxpmr7', 'sex'], axis=1)
for column in gmd_X_ps_F:
    gmd_X_ps_F[column] = pd.to_numeric(gmd_X_ps_F[column], errors='coerce')
gmd_y_ps_F = gmd_df_ps_F.ageAtScan1
gmd_y_ps_F = pd.to_numeric(gmd_y_ps_F, errors='coerce')
print(TDmodel_gmd_F.score(gmd_X_ps_F, gmd_y_ps_F))

# Create new columns in dataframe
# --- TD
# 1) real and predicted
gmd_df_td_F['pred_age'] = TDmodel_gmd_F.predict(gmd_X_td_F)
real_age_td_F = gmd_df_td_F.ageAtScan1
pred_age_td_F = TDmodel_gmd_F.predict(gmd_X_td_F)
gmd_df_td_F['diff_real_pred_age'] = real_age_td_F - pred_age_td_F
gmd_df_td_F['real_over18'] = gmd_df_td_F.ageAtScan1 >= 216
gmd_df_td_F['pred_over18'] = gmd_df_td_F.pred_age >= 216
# 2) age group indicators
gmd_df_td_F['8_9'] = ((gmd_df_td_F.ageAtScan1 >= 96) &
                      (gmd_df_td_F.ageAtScan1 < 120))
gmd_df_td_F['10_11'] = ((gmd_df_td_F.ageAtScan1 >= 120) &
                        (gmd_df_td_F.ageAtScan1 < 144))
gmd_df_td_F['12_13'] = ((gmd_df_td_F.ageAtScan1 >= 144) &
                        (gmd_df_td_F.ageAtScan1 < 168))
gmd_df_td_F['14_15'] = ((gmd_df_td_F.ageAtScan1 >= 168) &
                        (gmd_df_td_F.ageAtScan1 < 192))
gmd_df_td_F['16_17'] = ((gmd_df_td_F.ageAtScan1 >= 192) &
                        (gmd_df_td_F.ageAtScan1 < 216))
Example #12
def model_dev(train_set,matchups,spreads):
	""" Create the testing set for the algo creation """
	# Create a sample set to pass into the machine learning algorithm
	X = train_set[['rush_attempt_diff', 'turn_diff', 'yards_diff', 'third_diff', 'sack_diff', 'sack_ydiff', 'poss_diff', 'p_attempt_diff']].copy()
	# X = df[['poss_diff', 'third_diff', 'turn_diff', 'pass_diff', 'rush_diff']].copy()

	# Create results vector (a home win = 1, a home loss or tie = 0)
	train_set.rename(columns={'result_spread':'class'},inplace=True)
	y = train_set['class']#np.array(np.where(df['home_score'] > df['away_score'], 1, 0))

	""" Train, test, and predict the algorithm """
	# Scale the sample data
	scaler = preprocessing.StandardScaler().fit(X)
	X = scaler.transform(X)

	# Delete the dataframe to clear memory
	del train_set

	# Split out training and testing data sets
	X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.25,random_state=0)

	# alphas = [0.1, 0.3, 0.9, 1.0, 1.3, 1.9, 2.0, 2.3, 2.9]
	# for alpha in alphas:
	# 	reg = linear_model.Ridge(alpha = alpha)
	# 	reg.fit(X_train,y_train)
	# 	print 'alpha = ',alpha,', score = ',reg.score(X_test,y_test)
	# input()
	pipeline_optimizer = TPOTRegressor(generations = 5, population_size = 10, random_state = 42, cv = 5, verbosity = 2, n_jobs = 3)#, scoring = 'f1')
	pipeline_optimizer.fit(X_train,y_train)
	print(pipeline_optimizer.score(X_test, y_test))
	pipeline_optimizer.export('NFL_ML_TPOT_Regressor.py')

	# Remove the 'week' 'home_team' and 'away_team' columns from matchups as they are not used in the algorithm
	matchups.drop(['week', 'home_team', 'away_team'], axis=1, inplace=True)


	"""
	for feat in range(1,len(matchups.columns)):
		for c in C_vec:
			# Create the classifier and check the score
			# clf = LogisticRegression()
			clf = linear_model.LogisticRegression(C=c,random_state=42)
			selector = RFE(clf)
			selector = selector.fit(X_train,y_train)

			# Calculate probabilities using the predict_proba method for logistic regression
			probabilities = selector.predict_proba(scaler.transform(matchups))

			# Vectorize the spread_conversion function and apply the function to the probabilities result vector
			vfunc = np.vectorize(spread_conversion)
			predicted_spreads = np.apply_along_axis(vfunc,0,probabilities[:,0])

			# If the actual line for the home team is lower than the predicted line then you would take the away team, otherwise take the home team
			bet_vector = np.array(np.where(predicted_spreads > spreads,0,1))

			# Create the actual result vector where a tie counts as a loss for the home team
			game_result = np.array(np.where(home_score.ix[:,0] + predicted_spreads[:] > away_score.ix[:,0], 1, 0))

			# Check to see where the bet_vector equals the actual game result with the spread included
			result = np.array(np.where(bet_vector == game_result,1,0))

			prob_result = float(np.sum(result)) / len(result)

			# print 'Number of features =', feat, 'C =',c,'  Percent correct =',prob_result

			if prob_result > prob_val:
				prob_val = prob_result
				C_val = c
				feat_val = feat

	print 'Score =',selector.score(X_test,y_test)
	# print prob_val, C_val, feat

	clf = linear_model.LogisticRegression(C=C_val,random_state=42)
	clf = clf.fit(X_train,y_train)
	probabilities = clf.predict_proba(scaler.transform(matchups))
	vfunc = np.vectorize(spread_conversion)
	predicted_spreads = np.apply_along_axis(vfunc,0,probabilities[:,0])
	"""

	predicted_spreads = pd.DataFrame(pipeline_optimizer.predict(scaler.transform(matchups)),columns = ['results'])
	bet_vector = np.array(np.where(predicted_spreads > spreads,0,1))
	print(spreads)
	print(predicted_spreads)
	print(bet_vector)
Example #13
y = df.pop('progression')
X = df

#y.head()

#split training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#specify model
regr = linear_model.LinearRegression()
regr = TPOTRegressor(generations=5, population_size=50, verbosity=2, n_jobs=-1)
#regr = linear_model.Ridge()
#regr = linear_model.Lasso()

#train the model using all data
#regr.fit(X, y)

# Train the model using the training sets
regr.fit(X_train, y_train)

#Explained variance score: 1 is perfect prediction
regr.score(X, y)
regr.score(X_train, y_train)
regr.score(X_test, y_test)

#Generate predictions, then append to df, then write to Excel
results = X_test
y_pred = regr.predict(X_test)
results['progression'] = y_test
results['pred_progression'] = y_pred
results.to_excel(r'diabetes.xls', header=True, index=True)
Example #14
class TPOTGaussianAdsorptionDiscoverer(AdsorptionDiscovererBase):
    '''
    This discoverer uses a Gaussian selection method with a TPOT model to select
    new sampling points.

    ...sorry for the awful code. This is a hack-job and I know it.
    '''
    # The width of the Gaussian selection curve
    stdev = 0.1

    def _train(self):
        '''
        Calculate the residuals of the current training batch, then retrain on
        everything
        '''
        # Instantiate the preprocessor and TPOT if we haven't done so already
        if not hasattr(self, 'preprocessor'):
            self._train_preprocessor()
        if not hasattr(self, 'tpot'):
            self.tpot = TPOTRegressor(generations=2,
                                      population_size=32,
                                      offspring_size=32,
                                      verbosity=2,
                                      scoring='neg_median_absolute_error',
                                      n_jobs=16,
                                      warm_start=True)
            features = self.preprocessor.transform(self.training_batch)
            energies = [doc['energy'] for doc in self.training_batch]
            self.tpot.fit(features, energies)

        # Calculate and save the residuals of this next batch
        features = self.preprocessor.transform(self.training_batch)
        tpot_predictions = self.tpot.predict(features)
        dft_energies = np.array([doc['energy'] for doc in self.training_batch])
        residuals = tpot_predictions - dft_energies
        self.residuals.extend(list(residuals))

        # Retrain
        self.training_set.extend(self.training_batch)
        self.__train_tpot()

    def _train_preprocessor(self):
        '''
        Trains the preprocessing pipeline and assigns it to the `preprocessor`
        attribute.
        '''
        # Open the cached preprocessor
        try:
            cache_name = 'caches/preprocessor.pkl'
            with open(cache_name, 'rb') as file_handle:
                self.preprocessor = pickle.load(file_handle)

        # If there is no cache, then remake it
        except FileNotFoundError:
            inner_fingerprinter = fingerprinters.InnerShellFingerprinter()
            outer_fingerprinter = fingerprinters.OuterShellFingerprinter()
            fingerprinter = fingerprinters.StackedFingerprinter(
                inner_fingerprinter, outer_fingerprinter)
            scaler = StandardScaler()
            pca = PCA()
            preprocessing_pipeline = Pipeline([
                ('fingerprinter', fingerprinter), ('scaler', scaler),
                ('pca', pca)
            ])
            preprocessing_pipeline.fit(self.training_batch)
            self.preprocessor = preprocessing_pipeline

            # Cache it for next time
            with open(cache_name, 'wb') as file_handle:
                pickle.dump(preprocessing_pipeline, file_handle)

    def __train_tpot(self):
        '''
        Train TPOT using the `training_set` attached to the class
        '''
        # Cache the current point for (manual) warm-starts, because there's a
        # solid chance that TPOT might cause a segmentation fault.
        cache_name = 'caches/%.3i_discovery_cache.pkl' % self.next_batch_number
        with open(cache_name, 'wb') as file_handle:
            cache = {
                'training_set': self.training_set,
                'sampling_space': self.sampling_space,
                'residuals': self.residuals,
                'regret_history': self.regret_history,
                'next_batch_number': self.next_batch_number,
                'training_batch': self.training_batch
            }
            pickle.dump(cache, file_handle)

        # Instantiate the preprocessor and TPOT if we haven't done so already
        if not hasattr(self, 'preprocessor'):
            self._train_preprocessor()
        if not hasattr(self, 'tpot'):
            self.tpot = TPOTRegressor(generations=2,
                                      population_size=32,
                                      offspring_size=32,
                                      verbosity=2,
                                      scoring='neg_median_absolute_error',
                                      n_jobs=16,
                                      warm_start=True)

        # [Re-]train
        features = self.preprocessor.transform(self.training_set)
        energies = [doc['energy'] for doc in self.training_set]
        self.tpot.fit(features, energies)
        self.next_batch_number += 1

        # Try to address some memory issues by collecting garbage
        _ = gc.collect()  # noqa: F841

    def _choose_next_batch(self):
        '''
        Choose the next batch "randomly", where the probability of selecting
        sites are weighted using a combination of a Gaussian distribution and
        TPOT's prediction of their distance from the optimal energy. Snippets
        were stolen from the GASpy_feedback module.
        '''
        # Use the energies to calculate probabilities of selecting each site
        features = self.preprocessor.transform(self.sampling_space)
        energies = self.tpot.predict(features)
        gaussian_distribution = norm(loc=self.optimal_value, scale=self.stdev)
        probability_densities = [
            gaussian_distribution.pdf(energy) for energy in energies
        ]

        # Perform a weighted shuffling of the sampling space such that sites
        # with better energies are more likely to be early in the list
        self.sampling_space = self.weighted_shuffle(self.sampling_space,
                                                    probability_densities)

        self._pop_next_batch()

    @staticmethod
    def weighted_shuffle(sequence, weights):
        '''
        This function will shuffle a sequence using weights to increase the chances
        of putting higher-weighted elements earlier in the list. Credit goes to
        Nicky Van Foreest, whose function I based this off of.

        Args:
            sequence    A sequence of elements that you want shuffled
            weights     A sequence that is the same length as the `sequence` that
                        contains the corresponding probability weights for
                        selecting/choosing each element in `sequence`
        Returns:
            shuffled_list   A list whose elements are identical to those in the
                            `sequence` argument, but randomly shuffled such that
                            the elements with higher weights are more likely to
                            be in the front/start of the list.
        '''
        shuffled_list = np.empty_like(sequence)

        # Pack the elements in the sequences and their respective weights
        pairings = list(zip(sequence, weights))
        for i in range(len(pairings)):

            # Randomly choose one of the elements, and get the corresponding index
            cumulative_weights = np.cumsum([weight for _, weight in pairings])
            rand = random.random() * cumulative_weights[-1]
            j = bisect_right(cumulative_weights, rand)

            # Pop the element out so we don't re-select
            try:
                shuffled_list[i], _ = pairings.pop(j)

            # Hack a quick fix to some errors I don't feel like solving
            except IndexError:
                try:
                    shuffled_list[i], _ = pairings.pop(-1)
                except IndexError:
                    break

        return shuffled_list.tolist()
Example #15
# Data Extraction
df = data_extract_e('e_20190609_15.pkl')

# Data Transformation and Engineering
df = feature_eng(df)
df = extract_queues(df)
dept_encoder, queue_encoder = fit_labels(df)
df = feature_transform(df, queue_encoder, dept_encoder)

# Training/Test Split
x, y = data_filter(df)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2468)

# Using TPOT AutoML
tpot = TPOTRegressor(n_jobs=-1,
                     verbosity=1,
                     config_dict=xgb_config.xgb_config_dict)
tpot = tpot.fit(x_train, y_train)
y_pred = tpot.predict(x_train)
print('XGB TPOT training R2 score: ', r2_score(y_train, y_pred))
print('XGB TPOT training negative MSE: ', tpot.score(x_train, y_train))

y_pred = tpot.predict(x_test)
print('XGB TPOT test R2 score: ', r2_score(y_test, y_pred))
print('XGB TPOT test negative MSE: ', tpot.score(x_test, y_test))

tpot.export('xgb_tpot.py')
Example #16
auto_classifier.fit(X_train, y_train)


# In[ ]:


#print("The cross-validation MSE")
#print(auto_classifier.score(X_valid, y_valid))


# In[ ]:


# Now do the prediction
test_result = auto_classifier.predict(test[feature_names].values)
sub = pd.DataFrame()
sub['id'] = test['id']
sub['trip_duration'] = np.exp(test_result)
sub.to_csv('NYCTaxi_TpotModels.csv', index=False)
sub.head()


# In[ ]:


# Export the model
auto_classifier.export('NYCTaxi_pipeline.py')


# That is it for now. You can run this locally with more generations, a larger population, etc. to get a better result. Because of Kaggle's time limits, I could not choose parameters that take longer to run.
Example #17
test = combi[train.shape[0]:]
test.drop('Item_Outlet_Sales',axis=1,inplace=True)

## removing id variables 
tpot_train = train.drop(['Outlet_Identifier','Item_Type','Item_Identifier'],axis=1)
tpot_test = test.drop(['Outlet_Identifier','Item_Type','Item_Identifier'],axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train.drop('Item_Outlet_Sales',axis=1,inplace=True)

# finally building model using tpot library
from tpot import TPOTRegressor

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

tpot.export(data+'tpot_boston_pipeline.py')

## predicting using tpot optimised pipeline
tpot_pred = tpot.predict(tpot_test)
sub1 = pd.DataFrame(data=tpot_pred)

#sub1.index = np.arange(0, len(test)+1)
sub1 = sub1.rename(columns = {'0':'Item_Outlet_Sales'})
sub1['Item_Identifier'] = test['Item_Identifier']
sub1['Outlet_Identifier'] = test['Outlet_Identifier']
sub1.columns = ['Item_Outlet_Sales','Item_Identifier','Outlet_Identifier']
sub1 = sub1[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
sub1.to_csv('tpot.csv',index=False)
Example #18
plt.show()

# %% tpot

testSL = to_supervised(test, n_input, n_outputs)
trainSL = to_supervised(train, n_input, n_outputs)
testSL[0].shape = (testSL[0].shape[0], testSL[0].shape[1] * testSL[0].shape[2])
trainSL[0].shape = (
    trainSL[0].shape[0],
    trainSL[0].shape[1] * trainSL[0].shape[2],
)
(X_train, y_train) = trainSL
(X_test, y_test) = testSL
tpot = TPOTRegressor(generations=20,
                     population_size=100,
                     verbosity=2,
                     random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export("tpot_boston_pipeline.py")

# %% Plot
predictions = tpot.predict(X_test)
plt.plot(np.squeeze(predictions), label="Predictions")
plt.plot(np.array(test)[-1 * predictions.shape[0]:][:, 0], label="dlGDP_csa")
plt.title("dlGDP forecasts")
plt.ylabel("dlGDP_csa")
plt.xlabel("Quarter")
plt.legend(loc="upper left")
plt.show()
Example #19
    # y_test_pred = reg.predict(X_test).squeeze()
    # print(reg.evaluate(X, y))
    # print(reg.evaluate(X_test, y_test))

    # TPOT
    from tpot import TPOTRegressor
    tpot = TPOTRegressor(
        # scoring=None, use_dask=True,
        generations=5,
        population_size=50,
        n_jobs=4,
        verbosity=2,
        random_state=42,
    )
    tpot.fit(X, y)
    y_pred = tpot.predict(X)
    y_test_pred = tpot.predict(X_test)
    print(tpot.score(X, y))
    print(tpot.score(X_test, y_test))

    y_pred_ss = pd.Series(y_pred, index=train_data.index[idx_good])
    strategy_train = train_data[selected_factor_names +
                                ['hmo_s1']].assign(y_pred=y_pred_ss)
    y_test_pred_ss = pd.Series(y_test_pred,
                               index=test_data.index[idx_test_good])
    strategy_test = test_data[selected_factor_names +
                              ['hmo_s1']].assign(y_pred=y_test_pred_ss)
    test_date0 = test_date[0]

    strategy = pd.concat([strategy_train, strategy_test])
    # strategy = strategy_train.copy()
Example #20
y_train_all = train['y']
del train['ID']
del train['y']
id_test = test['ID']
del test['ID']

print('train:', train.shape, ', test:', test.shape)
random_state = 42
X_train, X_val, y_train, y_val = train_test_split(train,
                                                  y_train_all,
                                                  test_size=0.2,
                                                  random_state=random_state)

pipeline_optimizer = TPOTRegressor(generations=5,
                                   population_size=100,
                                   offspring_size=None,
                                   scoring='r2',
                                   cv=5,
                                   subsample=0.95,
                                   n_jobs=1,
                                   random_state=random_state,
                                   verbosity=2)

pipeline_optimizer.fit(X_train.values, y_train.values)
print(pipeline_optimizer.score(X_val.values, y_val.values))
pipeline_optimizer.export('./tpot_exported_models/tpot_exported_pipeline.py')
predict_y = pipeline_optimizer.predict(test.values)
df_sub = pd.DataFrame({'ID': id_test, 'y': predict_y})
df_sub.to_csv('tpot_pipeline_result.csv', index=False)
Example #21
class TPOT(BaseModel):
    '''
    This is our wrapper for fingerprinting sites and then using TPOT to predict
    adsorption energies from those fingerprints.
    '''
    def __init__(self):
        '''
        Instantiate the preprocessing pipeline and the TPOT model
        '''
        # Instantiate the fingerprinter
        inner_fingerprinter = fingerprinters.InnerShellFingerprinter()
        outer_fingerprinter = fingerprinters.OuterShellFingerprinter()
        fingerprinter = fingerprinters.StackedFingerprinter(inner_fingerprinter,
                                                            outer_fingerprinter)
        scaler = StandardScaler()
        pca = PCA()
        preprocessing_pipeline = Pipeline([('fingerprinter', fingerprinter),
                                           ('scaler', scaler),
                                           ('pca', pca)])
        self.preprocessor = preprocessing_pipeline

        # Instantiate TPOT
        self.tpot = TPOTRegressor(generations=2,
                                  population_size=32,
                                  offspring_size=32,
                                  verbosity=2,
                                  scoring='neg_median_absolute_error',
                                  n_jobs=16,
                                  warm_start=True)

    def train(self, docs, energies):
        '''
        Trains both the preprocessor and TPOT in series

        Args:
            docs        List of dictionaries from
                        `gaspy.gasdb.get_adsorption_docs`
            energies    List of floats containing the adsorption energies of
                        `docs`
        '''
        features = self.preprocessor.fit_transform(docs)
        self.tpot.fit(features, energies)

        # Try to address some memory issues by collecting garbage
        _ = gc.collect()  # noqa: F841

    def predict(self, docs):
        '''
        Use the whole fingerprinting and TPOT pipeline to make adsorption
        energy predictions

        Args:
            docs        List of dictionaries from
                        `gaspy.gasdb.get_adsorption_docs`
        Returns:
            predictions     `np.array` of TPOT's predictions of each doc
            uncertainties   `np.array` that contains the "uncertainty
                            prediction" for each site. In this case, it'll
                            just be TPOT's RMSE
        '''
        # Point predictions
        features = self.preprocessor.transform(docs)
        try:
            predictions = np.array(self.tpot.predict(features))
        # In case we need to make a prediction from a loaded state
        except AttributeError:
            predictions = np.array(self.tpot.fitted_pipeline_.predict(features))

        # "Uncertainties" will just be the RMSE
        residuals = np.array([prediction - doc['energy']
                              for prediction, doc in zip(predictions, docs)])
        rmse = np.sqrt((residuals**2).mean())
        uncertainties = np.array([rmse for _ in predictions])

        return predictions, uncertainties

    def save(self):
        '''
        Saves the state of the model into some pickles
        '''
        with open(self._fingerprinter_cache, 'wb') as file_handle:
            pickle.dump(self.preprocessor, file_handle)
        with open(self._pipeline_cache, 'wb') as file_handle:
            pickle.dump(self.tpot.fitted_pipeline_, file_handle)

    def load(self):
        '''
        Loads a previous state of the model from some pickles
        '''
        with open(self._fingerprinter_cache, 'rb') as file_handle:
            self.preprocessor = pickle.load(file_handle)
        with open(self._pipeline_cache, 'rb') as file_handle:
            self.tpot.fitted_pipeline_ = pickle.load(file_handle)

    @property
    def _fingerprinter_cache(self):
        return 'fingerprinter.pkl'

    @property
    def _pipeline_cache(self):
        return 'tpot_pipeline.pkl'
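
A minimal usage sketch for this wrapper (the call sequence is illustrative; `docs`, `energies`, and `new_docs` are assumed inputs as described in the docstrings, not taken from the source):

model = TPOT()
model.train(docs, energies)  # docs from gaspy.gasdb.get_adsorption_docs, energies as floats
predictions, uncertainties = model.predict(new_docs)  # uncertainties are the RMSE over these docs' residuals
model.save()  # pickles the fingerprinter and the fitted TPOT pipeline
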
Example #22
class simpleEstimator:
    def __init__(self):
        print('Initializing')
        self.application_file = 'application_train.csv'
        self.application_test = 'application_test.csv'
        self.additional_data = [
            'bureau_preprocessed.csv'
        ]  # 0.617 after I added this score
        self.outfile = 'submission.csv'

    def submit(self):
        ofile = open(self.outfile, 'w')
        ofile.write('SK_ID_CURR,TARGET\n')
        print('Preparing submission')
        df = pd.read_csv(self.application_test, quotechar='"')
        for additional in self.additional_data:
            dfadd = pd.read_csv(additional)
            df = pd.merge(df, dfadd, on='SK_ID_CURR', how='left')
        tmat = df.values
        index = tmat[:, 0]
        x = tmat[:, 1:]
        for i in range(0, x.shape[0]):
            for j in range(0, x.shape[1]):
                x[i, j] = self.strtonum(x[i, j])
        y = self.predict(x)
        for i in range(0, len(y)):
            oline = str(index[i]) + ',' + str(max(min(y[i], 1), 0)) + '\n'
            ofile.write(oline)
        ofile.close()

    def prepare(self):
        #print(df.shape)
        df = pd.read_csv(self.application_file, quotechar='"')
        for additional in self.additional_data:
            dfadd = pd.read_csv(additional)
            df = pd.merge(df, dfadd, on='SK_ID_CURR', how='left')
        #print(df.shape)
        tmat = df.values
        self.x = tmat[:, 2:]
        self.y = np.array(tmat[:, 1], dtype=np.float64)
        #self.y = np.array(y.astype('float'), dtype = np.float64)

        index = tmat[:, 0]
        for i in range(0, self.x.shape[0]):
            for j in range(0, self.x.shape[1]):
                self.x[i, j] = self.strtonum(self.x[i, j])
        #for i in range(len(self.y)):
        # self.y[i] = self.strtonum(self.y[i])

    def gridsearch(self, parameters):
        svc = GradientBoostingRegressor()
        self.clf = GridSearchCV(svc, parameters, verbose=1, n_jobs=4)
        self.clf.fit(self.x, self.y)

    def train(self, params):
        #print('Training the model')
        #self.clf = RandomForestRegressor() # 0.625
        #self.clf = RandomForestClassifier() # 0.5
        #self.clf = GradientBoostingRegressor() # 0.730
        #self.clf = BaggingRegressor(KNeighborsClassifier(),max_samples=0.5, max_features=0.5) # MemoryError
        #self.clf = AdaBoostRegressor() # 0.674
        #clf = KNeighborsRegressor() # MemoryError
        #self.clf = MLPRegressor(hidden_layer_sizes = (5,)) # MemoryError
        #self.clf = GradientBoostingRegressor(**params)
        #self.clf = autosklearn.regression.AutoSklearnRegressor(time_left_for_this_task= 75000, per_run_time_limit= 7500 )
        self.clf = TPOTRegressor(generations=5,
                                 population_size=50,
                                 verbosity=2,
                                 n_jobs=3)
        self.clf.fit(self.x, self.y)
        self.clf.export('tpot_best_pipeline.py')

    def predict(self, x):
        return self.clf.predict(x)

    def test(self):
        #print('Testing the model')
        ypred = self.predict(self.x)
        rmse = self.get_rmse(ypred, self.y)
        print('OOS RMSE: ' + str(rmse))
        return rmse

    def strtonum(self, st):
        try:
            f = float(st)
            if math.isnan(f):
                return -10000
            return f
        except:
            sta = [ord(x) for x in st]
            cs = 0
            for i in sta:
                cs += i
            return cs

    def get_rmse(self, a, b):
        mse = 0
        for i in range(0, len(a)):
            mse += (a[i] - b[i])**2
        mse = math.sqrt(mse / len(a))
        return mse
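
A short usage sketch for this estimator (illustrative only; the call order is inferred from the methods above, and `params` is ignored by the TPOT branch of train()):

est = simpleEstimator()
est.prepare()         # load and merge the CSVs into self.x / self.y
est.train(params={})  # fits TPOTRegressor and exports tpot_best_pipeline.py
est.test()            # RMSE on the training data
est.submit()          # writes submission.csv
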
Example #23
finaltrainset = train_df[usable_columns].values
finaltestset = test_df[usable_columns].values

from tpot import TPOTRegressor
auto_classifier = TPOTRegressor(generations=3, population_size=8, verbosity=2)
from sklearn.model_selection import train_test_split

# Split training data to train and validate
X_train, X_valid, y_train, y_valid = train_test_split(finaltrainset,
                                                      y_train,
                                                      train_size=0.75,
                                                      test_size=0.25)

auto_classifier.fit(X_train, y_train)

cv_score = auto_classifier.score(X_valid, y_valid)

print("The cross-validation accuracy")
print(cv_score)

# we need access to the pipeline to get the probabilities
test_result = auto_classifier.predict(finaltestset)
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = test_result

sub.to_csv(base_output_path + 'tpot_analysis_{}.csv'.format(cv_score),
           index=False)

auto_classifier.export(base_output_path + 'tpot_pipeline.py')
Example #24
# (the opening of this call was truncated; `auto_tpot = TPOTRegressor(` is
#  reconstructed from its use below, and other arguments may have been cut)
auto_tpot = TPOTRegressor(mutation_rate=0.9,
                          crossover_rate=0.1,
                          scoring="neg_mean_squared_error",
                          cv=5,
                          n_jobs=1,
                          max_time_mins=5,
                          verbosity=2,
                          config_dict=tpot_config)

auto_tpot.fit(features=X_train, target=y_train)

auto_tpot.fitted_pipeline_
auto_tpot.pareto_front_fitted_pipelines_
auto_tpot.evaluated_individuals_

y_hat = auto_tpot.predict(features=X_test)

# H2O AUTOML

import h2o
from h2o.automl import H2OAutoML

# Shart h2o cluster
h2o.init(max_mem_size="8G")

# Upload to h2o
df_train_h2o = h2o.H2OFrame(
    pd.concat([X_train, pd.DataFrame({"target": y_train})], axis=1))
df_test_h2o = h2o.H2OFrame(X_test)

features = X_train.columns.values.tolist()
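
The snippet is cut off here; a hypothetical continuation of the H2O AutoML comparison might look like the following (the runtime budget and seed are assumptions; only the "target" column name comes from the frame built above):

aml = H2OAutoML(max_runtime_secs=3600, seed=42)  # assumed budget and seed
aml.train(x=features, y="target", training_frame=df_train_h2o)
preds = aml.leader.predict(df_test_h2o)  # predictions from the best model found
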
Example #25
x_train, x_test, y_train, y_test = train_test_split(train_new,
                                                    train_class,
                                                    train_size=0.75,
                                                    test_size=0.25)

#Instantiate tpot instance
tpot = TPOTRegressor(verbosity=3, generations=10, population_size=50)

#call fit function
tpot.fit(x_train, y_train)

#call the score function on cv data
print('TPOT score: {}'.format(tpot.score(x_test, y_test)))

#Predict temps for each month for next 5 years
submission = tpot.predict(test)

#create dataframe of results for each month/years
final = pd.DataFrame({
    'year': test[:, 0],
    'month': test[:, 1],
    'Pred': submission
})

#export pipeline
export_filename = 'BTC Pipeline.py'
tpot.export(export_filename)

#export predicted values
final_filename = 'btc_pred.csv'
final.to_csv(final_filename, index=False)
Example #26
from tpot import TPOTRegressor
X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
 train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_boston_pipeline.py')





## predicting using tpot optimised pipeline

tpot_pred = tpot.predict(tpot_test)
sub1 = pd.DataFrame(data=tpot_pred)
#sub1.index = np.arange(0, len(test)+1)
sub1 = sub1.rename(columns = {'0':'Item_Outlet_Sales'})
sub1['Item_Identifier'] = test['Item_Identifier']
sub1['Outlet_Identifier'] = test['Outlet_Identifier']
sub1.columns = ['Item_Outlet_Sales','Item_Identifier','Outlet_Identifier']
sub1 = sub1[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']]
sub1.to_csv('tpot.csv',index=False)






Example #27
numeric_df = pd.DataFrame(X)
numeric_df.index = all_df.index
combined_df = process_categorical(numeric_df, all_df, categorical_features)

X = combined_df.values  # .as_matrix() was removed from pandas; .values is equivalent

from sklearn.decomposition import PCA

test_n = df.shape[0]

pca = PCA()
pca.fit(X[:test_n, :], price)
X = pca.transform(X)

X_train = X[:test_n, :]
X_train, X_val, y_train, y_val = ms.train_test_split(X_train,
                                                     price,
                                                     test_size=0.3,
                                                     random_state=0)
X_test = X[test_n:, :]

# housing = load_boston()
# X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target,
#                                                     train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
y_predicted = tpot.predict(X_test)
sdf['SalePrice'] = y_predicted
sdf.to_csv('submission.csv')
# tpot.export('tpot_kaggle_housing_pipeline.py')
Example #28
class TPOTAdaptor(DFMLAdaptor, LoggableMixin):
    """
    A dataframe adaptor for the TPOT classifiers and regressors.

    Args:
        tpot_kwargs: All kwargs accepted by a TPOTRegressor/TPOTClassifier
            or TPOTBase object.

            Note that you can limit the models that TPOT explores by setting
            config_dict directly. For example, if you only want to use random
            forest:
        config_dict = {
            'sklearn.ensemble.RandomForestRegressor': {
                'n_estimators': [100],
                'max_features': np.arange(0.05, 1.01, 0.05),
                'min_samples_split': range(2, 21),
                'min_samples_leaf': range(1, 21),
                'bootstrap': [True, False]
                },
            }

        logger (Logger, bool): A custom logger object to use for logging.
            Alternatively, if set to True, the default automatminer logger will
            be used. If set to False, then no logging will occur.

    Attributes:
        The following attributes are set during fitting.

        mode (str): Either AMM_REG_NAME (regression) or AMM_CLF_NAME
            (classification)
        features (list): The features labels used to develop the ml model.
        ml_data (dict): The raw ml data used for training.
        best_pipeline (sklearn.Pipeline): The best fitted pipeline found.
        best_models (OrderedDict): The best model names and their scores.
        backend (TPOTBase): The TPOT object interface used for ML training.
        is_fit (bool): If True, the adaptor and backend are fit to a dataset.
        models (OrderedDict): The raw sklearn-style models output by TPOT.
        fitted_target (str): The target name in the df used for training.
    """
    def __init__(self, logger=True, **tpot_kwargs):
        tpot_kwargs['cv'] = tpot_kwargs.get('cv', 5)
        tpot_kwargs['n_jobs'] = tpot_kwargs.get('n_jobs', -1)
        tpot_kwargs['verbosity'] = tpot_kwargs.get('verbosity', 2)

        self.mode = None
        self._backend = None
        self.tpot_kwargs = tpot_kwargs
        self.fitted_target = None
        self._features = None
        self.models = None
        self._logger = self.get_logger(logger)
        self.is_fit = False
        self.random_state = tpot_kwargs.get('random_state', None)
        self._ml_data = None
        self.greater_score_is_better = None

    @log_progress(AMM_LOG_FIT_STR)
    @set_fitted
    def fit(self, df, target, **fit_kwargs):
        """
        Train a TPOTRegressor or TPOTClassifier by fitting on a dataframe.

        Args:
            df (pandas.DataFrame): The df to be used for training.
            target (str): The key used to identify the machine learning target.
            **fit_kwargs: Keyword arguments to be passed to the TPOT backend.
                These arguments must be valid arguments to the TPOTBase class.

        Returns:
            TPOTAdaptor (self)

        """
        # Prevent goofy pandas casting by casting to native
        y = df[target].values.tolist()
        X = df.drop(columns=target).values.tolist()

        # Determine learning type based on whether classification or regression
        self.mode = regression_or_classification(df[target])

        if self.mode == AMM_CLF_NAME:
            self.tpot_kwargs['config_dict'] = self.tpot_kwargs.get(
                'config_dict', TPOT_CLASSIFIER_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "balanced_accuracy"
            self._backend = TPOTClassifier(**self.tpot_kwargs)
        elif self.mode == AMM_REG_NAME:
            self.tpot_kwargs['config_dict'] = self.tpot_kwargs.get(
                'config_dict', TPOT_REGRESSOR_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "neg_mean_absolute_error"
            self._backend = TPOTRegressor(**self.tpot_kwargs)
        else:
            raise ValueError("Learning type {} not recognized as a valid mode "
                             "for {}".format(self.mode,
                                             self.__class__.__name__))
        self._features = df.drop(columns=target).columns.tolist()
        self._ml_data = {"X": X, "y": y}
        self.fitted_target = target
        self._backend = self._backend.fit(X, y, **fit_kwargs)
        return self

    @property
    @check_fitted
    def best_models(self):
        """
        The best models found by TPOT, in order of descending performance.

        If you want a pipeline you can use to make predictions, use the
        best_pipeline.

        Performance is evaluated based on the TPOT scoring. This can be changed
        by passing a "scoring" kwarg into the __init__ method.

        Returns:
            best_models_and_scores (dict): Keys are names of models. Values
                are the best internal cv scores of that model with the
                best hyperparameter combination found.

        """
        self.greater_score_is_better = is_greater_better(
            self.backend.scoring_function)

        # Get list of evaluated model names, cast to set and back
        # to get unique model names, instantiate ordered model dictionary
        evaluated_models = [
            key.split('(')[0]
            for key in self.backend.evaluated_individuals_.keys()
        ]
        model_names = list(set(evaluated_models))
        models = OrderedDict({model: [] for model in model_names})

        # This makes a dict of model names mapped to all runs of that model
        for key, val in self.backend.evaluated_individuals_.items():
            models[key.split('(')[0]].append(val)

        # For each base model type sort the runs by best score
        for model_name in model_names:
            models[model_name].sort(key=lambda x: x['internal_cv_score'],
                                    reverse=self.greater_score_is_better)

        # Gets a simplified dict of the model to only its best run
        # Sort the best individual models by type to best models overall
        best_models = OrderedDict(
            sorted({model: models[model][0]
                    for model in models}.items(),
                   key=lambda x: x[1]['internal_cv_score'],
                   reverse=self.greater_score_is_better))

        # Mapping of top models to just their score
        scores = {
            model: best_models[model]['internal_cv_score']
            for model in best_models
        }
        # Sorted dict of top models just mapped to their top scores
        best_models_and_scores = OrderedDict(
            sorted(scores.items(),
                   key=lambda x: x[1],
                   reverse=self.greater_score_is_better))
        self.models = models
        return best_models_and_scores

    @log_progress(AMM_LOG_PREDICT_STR)
    @check_fitted
    def predict(self, df, target):
        """
        Predict the target property of materials given a df of features.

        The predictions are appended to the dataframe in a column called:
            "{target} predicted"

        Args:
            df (pandas.DataFrame): Contains all features needed for ML (i.e.,
                all features contained in the training dataframe).
            target (str): The property to be predicted. Should match the target
                used for fitting. May or may not be present in the argument
                dataframe.

        Returns:
            (pandas.DataFrame): The argument dataframe plus a column containing
                the predictions of the target.

        """
        if target != self.fitted_target:
            raise AutomatminerError(
                "Argument dataframe target {} is different "
                "from the fitted dataframe target! {}"
                "".format(target, self.fitted_target))
        elif not all([f in df.columns for f in self._features]):
            not_in_model = [f for f in self._features if f not in df.columns]
            not_in_df = [f for f in df.columns if f not in self._features]
            raise AutomatminerError(
                "Features used to build model are different"
                " from df columns! Features located in "
                "model not located in df: \n{} \n Features "
                "located in df not in model: \n{}"
                "".format(not_in_df, not_in_model))
        else:
            X = df[self._features].values  # rectify feature order
            y_pred = self._backend.predict(X)
            df[target + " predicted"] = y_pred
            return df

    @property
    @check_fitted
    def best_pipeline(self):
        return self._backend.fitted_pipeline_

    @property
    @check_fitted
    def features(self):
        return self._features

    @property
    @check_fitted
    def ml_data(self):
        return self._ml_data

    @property
    @check_fitted
    def backend(self):
        return self._backend
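
A minimal usage sketch for this adaptor (the dataframe, target name, and time budget below are illustrative assumptions, not taken from the source; kwargs are forwarded to TPOT):

adaptor = TPOTAdaptor(max_time_mins=60)
adaptor.fit(train_df, "bulk_modulus")                    # hypothetical featurized df with a "bulk_modulus" column
predicted_df = adaptor.predict(test_df, "bulk_modulus")  # adds a "bulk_modulus predicted" column
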
Example #29
                                                        test_size=0.25,
                                                        random_state=seed)
    tpot = TPOTRegressor(generations=gen,
                         population_size=50,
                         verbosity=2,
                         n_jobs=cores)
    tpot.fit(X_train, y_train.reshape(-1, ))

    features_readable = list()
    for t in range(len(test_features[test_no])):
        features_readable.append(d[test_features[test_no][t]])

    x = linspace(n1 + 1, n1 + sample_n, sample_n)
    fig, ax = plt.subplots()
    fig.set_size_inches(22, 13)
    plt.plot(x, tpot.predict(X_test)[n1:n1 + sample_n])
    plt.plot(x, y_test[n1:n1 + sample_n])

    ax.set(xlabel='sample no',
           ylabel='FO flow m3/h',
           title='Training number:' + str(test_no) + '\nFeatures: \n ' +
           str(features_readable))
    ax.grid()

    fig.savefig("results/test_no_" + str(test_no) + ".png")

# In[218]:

# Train linear  models
#
Example #30
# (the opening of this call was truncated; `model = TPOTRegressor(` is
#  reconstructed from its use below, and other arguments may have been cut)
model = TPOTRegressor(
    population_size=100,
    n_jobs=4,
    verbosity=2,
    cv=3,
    early_stop=3
)
model.fit(X_train, y_train.values)


# In[5]:


def rmsle_metric(y_test, y_pred) : 
    assert len(y_test) == len(y_pred)
    y_test = np.exp(y_test)-1
    y_pred = np.exp(y_pred)-1
    rmsle = np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))
    return rmsle

y_pred = model.predict(X_test)
print(rmsle_metric(y_test, y_pred))


# In[6]:


import joblib  # sklearn.externals.joblib has been removed from scikit-learn; use joblib directly

joblib.dump(model.fitted_pipeline_, 'PCA_y_log_TPOT_1_475.pkl')

Example #31
# the optimization process if there is no improvement
# verbosity - integer, default=0,
# how much information TPOT communicates while it's running, verbosity=2 means
# TPOT will print more information and provide a progress bar
# verbosity=3 means TPOT will print everything and provide a progress bar
# max_time_mins - integer or None, default=None,
# it defines how many minutes TPOT has to optimize the pipeline
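# NOTE: the TPOTRegressor call that the comments above describe was cut off from
# this snippet; the definition below is a hypothetical reconstruction and every
# value in it is an assumption, not taken from the original source.
from tpot import TPOTRegressor

tpot = TPOTRegressor(generations=100,
                     population_size=100,
                     scoring='r2',
                     max_time_mins=60,
                     early_stop=5,
                     verbosity=2,
                     n_jobs=-1)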

# Start a timer
import time
start = time.time()

tpot.fit(X_train, y_train)
tpot.export('TPOT_RF_Pers_E_Shopping.py')

results = tpot.predict(X_test)
y_pred_GP = results

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_GP))
#print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_GP))
print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred_GP)))

# New score
errors = abs(y_test - y_pred_GP)
mape = 100 * (errors / y_test)
# Calculate and display accuracy_tpot
accuracy_tpot = 100 - np.mean(mape)
print('Accuracy_tpot:', round(accuracy_tpot, 2), '%.')

print('Improvement of Accuracy with TPOT_Regression of: {:0.2f}%.'.format(