Example 1

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV

# CV_split, exclude, Get_Error, and Amplitude_Tuning are project helpers
# defined elsewhere.

def Model(Encoding, Scores, Run_name, step_size, loop_dict, var_dict,
          round_data, ElasticNet_dict, l1_ratios, All_data):
    Pearson_correlations = []
    Data = Encoding.copy()  #copy so the original Encoding is not modified#
    Data_sets = CV_split(Data, 5)  # The Big 5#
    for cv_round in range(len(Data_sets)):
        score_dict = Scores.copy()  #reset to the randomized starting scores each round#
        Test_set = Data_sets[cv_round]
        Train_set = exclude(Data_sets, cv_round)  #keeps every fold except the test fold#
        Train_set = pd.concat(Train_set)  #all training folds into one dataframe#
        X = Train_set.iloc[:, :-1].copy()  #features; copy avoids SettingWithCopyWarning#
        X['Intercept'] = 1  #add intercept#
        y = pd.DataFrame(Train_set['pMeas'])  #targets#
        AM_EndOfLoopError = []
        AM_EndOfLoopError.append(Get_Error(
            X, y, score_dict))  # The Error Before AM Tuning #
        """AM Tuning Looping Starts Here and Adds a value to End of Loop Error"""
        Loop_num = 1
        AM_EndOfLoopError.append(
            Amplitude_Tuning(X, y, step_size, score_dict, Loop_num, Run_name,
                             cv_round, loop_dict, var_dict))
        round_data[cv_round] = loop_dict
        while ((AM_EndOfLoopError[-1] - AM_EndOfLoopError[-2]) /
               (AM_EndOfLoopError[-2])) < -0.001:
            Loop_num += 1
            AM_EndOfLoopError.append(
                Amplitude_Tuning(X, y, step_size, score_dict, Loop_num,
                                 Run_name, cv_round, loop_dict, var_dict))
            round_data[cv_round] = loop_dict
        loop_dict['AM Time Series Data'] = AM_EndOfLoopError
        loop_dict['Final Scores'] = score_dict
        """  AM Tuning is now Finished for the CV_split, Elastic Net is Next """
        # note: 'normalize' was removed in scikit-learn 1.2; on newer versions
        # scale the features beforehand instead
        EN = ElasticNetCV(l1_ratio=l1_ratios,
                          cv=5,
                          copy_X=True,
                          normalize=True,
                          random_state=23)
        X_train = X.copy()
        X_train.replace(score_dict, inplace=True)
        y_train = y.copy()
        X_test = Test_set.iloc[:, :-1].copy()  #copy avoids SettingWithCopyWarning#
        X_test.replace(score_dict, inplace=True)
        X_test['Intercept'] = 1
        y_test = pd.DataFrame(Test_set['pMeas'])
        EN.fit(X_train, y_train.values.ravel())  #fit on a 1-D target to avoid a DataConversionWarning#
        y_pred = pd.DataFrame(EN.predict(X_test))
        Pearson_correlations.append(np.corrcoef(y_test.T, y_pred.T)[0][1])
        """Save Everything """
        ElasticNet_dict["y_pred"] = y_pred
        ElasticNet_dict['y_test'] = y_test
        ElasticNet_dict['Alpha'] = EN.alpha_
        ElasticNet_dict['l1_ratio'] = EN.l1_ratio_
        ElasticNet_dict['Parameters'] = EN.get_params()
        ElasticNet_dict["AlphaSpace"] = EN.alphas_
        loop_dict['ElasticNet'] = ElasticNet_dict
        round_data[cv_round] = loop_dict  #note: loop_dict is reused across CV rounds, so every round_data entry references the same dict#
    All_data[Run_name] = round_data
    np.save("All Data.npy", All_data)  #dict payload; reload with np.load(..., allow_pickle=True)#
    return np.mean(Pearson_correlations)
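
The amplitude-tuning loop above keeps iterating while each pass still reduces the error by more than 0.1% relative to the previous pass. A minimal sketch of that stopping rule in isolation, with a hypothetical tune_step callable standing in for Amplitude_Tuning:

# Minimal sketch of the relative-improvement stopping rule used in Model().
# `tune_step` is a hypothetical stand-in for one Amplitude_Tuning pass; it
# must return the error measured after that pass.
def run_until_converged(tune_step, initial_error, tol=0.001):
    errors = [initial_error, tune_step()]
    # the signed relative change is negative while the error is shrinking;
    # stop once the improvement falls below `tol` (0.1%)
    while (errors[-1] - errors[-2]) / errors[-2] < -tol:
        errors.append(tune_step())
    return errors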
Example 2

import os

import joblib
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, NuSVR

# Method excerpt from a larger class; the self.* helpers (fit_model1, save,
# to_dict, logger, static_data, ...) are defined elsewhere.
# EvolutionaryAlgorithmSearchCV appears to be a customized variant of the
# sklearn-deap class (the rated/init_params/path_group arguments are not part
# of the stock API).

    def train(self, cvs, init_params=[], FS=False, inner_jobs=1):
        print('training with deap...')

        X = np.vstack((cvs[0][0], cvs[0][2], cvs[0][4]))

        if len(cvs[0][1].shape) == 1 and len(cvs[0][5].shape) == 1:
            y = np.hstack((cvs[0][1], cvs[0][3], cvs[0][5]))
        else:
            y = np.vstack((cvs[0][1], cvs[0][3], cvs[0][5])).ravel()
        self.D, self.N = X.shape

        if 'elasticnet' in self.model_type.lower():
            X_train = cvs[0][0]
            y_train = cvs[0][1].reshape(-1, 1)
            X_val = cvs[0][2]
            y_val = cvs[0][3].reshape(-1, 1)
            X_test = cvs[0][4]
            y_test = cvs[0][5].reshape(-1, 1)
            # note: the validation and test folds are stacked into the training
            # data here, so the test-set accuracy below is measured on data the
            # model has already seen during fit
            X_train = np.vstack((X_train, X_val, X_test))
            y_train = np.vstack((y_train, y_val, y_test))
            model = ElasticNetCV(cv=5, max_iter=4000)
            model.fit(X_train, y_train.ravel())

            # get_params() reports constructor settings; the fitted penalty is
            # in model.alpha_ / model.l1_ratio_
            self.best_params = model.get_params()
            ypred = model.predict(X_test).ravel()
            if self.rated is None:
                self.accuracy = np.mean(np.abs(ypred - y_test.ravel()) / y_test.ravel())
            else:
                self.accuracy = np.mean(np.abs(ypred - y_test.ravel()))
            self.acc_test = self.accuracy
            self.model = model

            self.logger.info('Best params')
            self.logger.info(self.best_params)
            self.logger.info('Final mae %s', str(self.acc_test))
            self.logger.info('Final rms %s', str(self.accuracy))
            self.logger.info('finish train for model %s', self.model_type)
            self.istrained = True
            self.save(self.model_dir)

            return self.to_dict()

        else:
            if 'xgb' in self.model_type.lower():
                params = {'learning_rate': np.logspace(-5, -1, num=6, base=10),
                          'max_depth': np.unique(np.linspace(1, 150, num=50).astype('int')),
                          'colsample_bytree': np.linspace(0.4, 1.0, num=60),
                          'colsample_bynode': np.linspace(0.4, 1.0, num=60),
                          'subsample': np.linspace(0.2, 1.0, num=6),
                          'gamma': np.linspace(0.001, 2, num=20),
                          'reg_alpha': np.linspace(0, 1.0, num=12)}
                model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
                ngen = self.static_data['sklearn']['gen']
                npop = self.static_data['sklearn']['pop']
            elif 'rf' in self.model_type.lower():
                if FS:
                    params = {
                        'max_depth': [1, 2, 3, 5, 10, 16, 24, 36, 52, 76, 96, 128, 150],
                    }
                    model = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42,
                                                  max_features=2 / 3)
                    ngen = 2
                    npop = 4
                else:
                    params = {
                        'max_depth': np.unique(np.linspace(1, 130, num=50).astype('int')),
                        'max_features': ['auto', 'sqrt', 'log2', None, 0.8, 0.6, 0.4],
                        'min_samples_leaf': np.unique(np.linspace(1, 20, num=20).astype('int')),
                        'min_samples_split': np.unique(np.linspace(2, 100, num=20).astype('int')),
                    }
                    model = RandomForestRegressor(n_estimators=500, random_state=42)
                    ngen = self.static_data['sklearn']['gen']
                    npop = self.static_data['sklearn']['pop']
            elif self.model_type.lower() == 'svm':
                params = {'C': np.logspace(-2, 3, num=100, base=10),
                          'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                          'gamma': list(np.linspace(0.001, 2, num=100)) + ['scale', 'auto']}
                model = SVR(max_iter=1000000)
                ngen = self.static_data['sklearn']['gen']
                npop = self.static_data['sklearn']['pop']
            elif self.model_type.lower() == 'nusvm':
                params = {'nu': np.linspace(0.01, 0.99, num=10),
                          'C': np.logspace(-1, 5, num=100, base=10),
                          'gamma': np.linspace(0.01, 10, num=100)}
                model = NuSVR(max_iter=1000000)
                ngen = self.static_data['sklearn']['gen']
                npop = self.static_data['sklearn']['pop']
            elif 'mlp' in self.model_type.lower():
                if not self.is_combine:
                    params = {'hidden_layer_sizes': np.linspace(4, 800, num=50).astype('int'),
                              'alpha': np.linspace(1e-5, 1e-1, num=4),
                              }
                else:
                    params = {'hidden_layer_sizes': np.linspace(4, 250, num=50).astype('int'),
                              'activation': ['identity', 'tanh', 'relu'],
                              'alpha': np.linspace(1e-5, 1e-1, num=4),
                              }

                model = MLPRegressor(max_iter=1000, early_stopping=True)
                ngen = 5
                npop = self.static_data['sklearn']['pop']

        if self.path_group is not None:
            ncpus = joblib.load(os.path.join(self.path_group, 'total_cpus.pickle'))
            gpu_status = joblib.load(os.path.join(self.path_group, 'gpu_status.pickle'))

            njobs = int(ncpus - gpu_status)
            cpu_status = njobs
            joblib.dump(cpu_status, os.path.join(self.path_group, 'cpu_status.pickle'))
        else:
            njobs = self.njobs

        cv = EvolutionaryAlgorithmSearchCV(estimator=model,
                                           params=params,
                                           scoring='neg_root_mean_squared_error',
                                           cv=3,
                                           rated=self.rated,
                                           verbose=1,
                                           population_size=npop,
                                           gene_mutation_prob=0.8,
                                           gene_crossover_prob=0.8,
                                           tournament_size=3,
                                           generations_number=ngen,
                                           refit=False,
                                           init_params=init_params,
                                           n_jobs=njobs,
                                           path_group=self.path_group)

        cv.fit(cvs)

        self.best_params = cv.best_params_

        self.accuracy, self.acc_test = self.fit_model1(model, self.best_params, cvs)

        self.model = model
        self.model.set_params(**self.best_params)
        self.model.fit(X, y.ravel())

        self.logger.info('Best params')
        self.logger.info(self.best_params)
        self.logger.info('Final mae %s', str(self.acc_test))
        self.logger.info('Final rms %s', str(self.accuracy))
        self.logger.info('finish train for model %s', self.model_type)
        self.istrained = True
        self.save(self.model_dir)

        return self.to_dict()
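
For reference, the accuracy rule used in both branches of train() normalizes by the observed values when no rated capacity is set, and falls back to a plain mean absolute error otherwise. A standalone sketch of that logic (eval_accuracy is a hypothetical helper, not part of the original class):

import numpy as np

# When `rated` is None the error is divided by the true values (a MAPE-style
# score); otherwise the plain mean absolute error is returned, mirroring the
# self.rated branches in train().
def eval_accuracy(y_true, y_pred, rated=None):
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if rated is None:
        return np.mean(np.abs(y_pred - y_true) / y_true)
    return np.mean(np.abs(y_pred - y_true))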
Example 3

import math
import random

import joblib
import numpy as np
from sklearn.linear_model import ElasticNetCV

# `stc` (used below to stringify sequences for the report) is a project helper
# defined elsewhere.

class ElasticNet:
    """Wrapper for SciKitLearn linear_model.ElasticNetCV to help with model optimization"""

    def __init__(self, x='numpy_array', y='predictor', sample_labels=None,
                 test_split=.2, sk_elastic_net_kwargs=None, regression_site_labels=None, test_samples=None,
                 output_name=None, output_directory=None):
        assert isinstance(sk_elastic_net_kwargs, dict)
        assert isinstance(sample_labels, list)
        assert isinstance(regression_site_labels, list)
        self.x = x
        self.y = y
        self.test_split = test_split
        self.en_kwargs = sk_elastic_net_kwargs
        # test_container/ train_container [array, outcomes, labels]
        self.test_container = [[], [], []]
        self.train_container = [[], [], []]
        self.en_model = None
        self.sample_labels = sample_labels
        self.regression_site_labels = regression_site_labels
        self.model_stats = None
        self.input_test_samples = test_samples
        self.output_name = output_name
        self.output_directory = output_directory
        self.run()

    def run(self):
        self.set_test_samples()
        if self.input_test_samples:
            self.test_split = 0.1
        if self.test_split == 0:
            self.test_container = self.train_container
        self.fit_model()
        self.get_model_stats()
        self.model_output()

    def set_test_samples(self):
        if self.input_test_samples:
            test_samples = self.input_test_samples
        else:
            test_size = int(round(len(self.y) * self.test_split, 0))
            test_samples = random.sample(self.sample_labels, test_size)

        # test_container/ validation_container [test_array, test_outcomes, labels]

        for x_row, outcome, label in zip(self.x, self.y, self.sample_labels):
            if label in test_samples:
                self.test_container[0].append(x_row)
                self.test_container[1].append(outcome)
                self.test_container[2].append(label)
            else:
                self.train_container[0].append(x_row)
                self.train_container[1].append(outcome)
                self.train_container[2].append(label)
        self.test_container[0] = np.asarray(self.test_container[0])
        self.test_container[1] = np.asarray(self.test_container[1])
        self.train_container[0] = np.asarray(self.train_container[0])
        self.train_container[1] = np.asarray(self.train_container[1])

    def fit_model(self):
        self.en_model = ElasticNetCV(**self.en_kwargs).fit(self.train_container[0], self.train_container[1])

    def get_model_stats(self):
        regression_sites = []
        for site in zip(self.regression_site_labels, list(self.en_model.coef_)):
            if not math.isclose(site[1], 0):
                regression_sites.append(site[0])
        model_score = self.en_model.score(self.test_container[0], self.test_container[1])
        predicted_values = self.en_model.predict(self.test_container[0])
        self.model_stats = (regression_sites, model_score, predicted_values)

    def model_output(self):
        """
        """
        kwarg_pair = ['Test Split:%s\n' % str(self.test_split)]
        for key, value in self.en_model.get_params().items():
            kwarg_pair.append('%s:%s' % (key, str(value)))

        output_path = '%s%s' % (self.output_directory, self.output_name)

        joblib.dump(self.en_model, output_path + '.model')

        with open(output_path + '.model_info.txt', 'w') as out:
            out.write('%s\n' % self.output_name)
            out.write('Model Score (R^2) = %s\n' % str(self.model_stats[1]))
            out.write('%s\n' % '\t'.join(kwarg_pair))
            out.write('Test Samples \t%s\n' % '\t'.join(stc(self.test_container[2])))
            out.write('Test Samples Predicted Values \t%s\n' % '\t'.join(stc(self.model_stats[2])))
            out.write('Test Samples Actual Values \t%s\n' % '\t'.join(stc(self.test_container[1])))
            out.write('Training Samples \t%s\n' % '\t'.join(stc(self.train_container[2])))
            out.write('Training Samples Actual Values \t%s\n' % '\t'.join(stc(self.train_container[1])))
            out.write('Regression Sites \t%s\n' % '\t'.join(stc(self.model_stats[0])))
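
A minimal, hypothetical usage sketch of the wrapper above. The arrays, labels, and output paths are invented for illustration, and it assumes the project's stc string-conversion helper is importable where model_output() runs:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 10))                   # 50 samples, 10 regression sites
y = rng.normal(size=50)                         # one continuous outcome per sample
samples = ['sample_%d' % i for i in range(50)]  # sample labels
sites = ['site_%d' % i for i in range(10)]      # regression-site labels

# Construction runs the whole pipeline: split, fit, score, write report.
model = ElasticNet(x=X, y=y,
                   sample_labels=samples,
                   regression_site_labels=sites,
                   test_split=0.2,
                   sk_elastic_net_kwargs={'cv': 5, 'max_iter': 5000},
                   output_name='demo_run',
                   output_directory='./')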
Example 4

# notebook-style script using ElasticNetCV directly; `y_train_path`, `X`, and
# `seed` come from earlier cells not shown here
import pandas as pd
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import KFold

y = pd.read_csv(y_train_path, index_col=False)
y = y.iloc[:, 0]

#%%

# feature selection affected the linear model greatly, so let's automate the
# process: instead of plain LinearRegression, use ElasticNet, which combines
# L1 and L2 regularization and thereby acts as a kind of automatic feature
# selection

# start by optimizing the parameters with cross-validation; scikit-learn has a
# dedicated estimator (ElasticNetCV) that does this efficiently

glmnet = ElasticNetCV(cv=70, random_state=seed)
glmnet.fit(X, y)
# note: get_params() returns constructor settings, not fitted values; the
# cross-validated penalty actually chosen lives in glmnet.alpha_ and
# glmnet.l1_ratio_
glmnet_best_params = glmnet.get_params()

#%%

# Defining the cross-validation scheme; with n_splits=70 every individual row
# forms its own fold (effectively leave-one-out)
crossvalidation = KFold(n_splits=70, shuffle=True, random_state=seed)

# Defining list of scoring methods
scoring = ["neg_mean_squared_error", "neg_mean_absolute_error"]

#%%

glmnet_model = ElasticNet()  # sklearn's ElasticNet, not the wrapper class from Example 3
# keep only those ElasticNetCV settings that ElasticNet also accepts
glmnet_best_params_matching = {
    key: glmnet_best_params[key]
    for key in glmnet_model.get_params().keys() if key in glmnet_best_params
}
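
Because get_params() only carries constructor settings, the cross-validated penalty itself is lost in the transfer above. A minimal alternative sketch, reusing the fitted glmnet from the earlier cell:

# Alternative sketch: transfer the penalty actually selected by cross-validation.
# alpha_ and l1_ratio_ are attributes that ElasticNetCV sets during fit().
glmnet_model = ElasticNet(alpha=glmnet.alpha_, l1_ratio=glmnet.l1_ratio_)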