def _test_ridge_cv(filter_):
    """Smoke-test RidgeCV on the diabetes data, with and without an explicit CV.

    ``filter_`` adapts ``X_diabetes`` (e.g. to a sparse container) before
    fitting.  Checks that ``coef_`` is 1-D and ``intercept_`` is a scalar
    ``np.float64`` under both the default (efficient LOO) CV and ``KFold(5)``.
    """
    def _fit_predict_check(estimator):
        # Shared fit/predict/shape-check sequence for both CV configurations.
        estimator.fit(filter_(X_diabetes), y_diabetes)
        estimator.predict(filter_(X_diabetes))
        assert len(estimator.coef_.shape) == 1
        assert type(estimator.intercept_) == np.float64

    ridge_cv = RidgeCV()
    _fit_predict_check(ridge_cv)

    # Same checks again with an explicit 5-fold cross-validation splitter.
    ridge_cv.set_params(cv=KFold(5))
    _fit_predict_check(ridge_cv)
def linear_model(self, train_set, test_set, type = 'ridge_regression'): print ' <<<<<<<<<<<<<<<<<<<<<< linear_model >>>>>>>>>>>>>>>>>>>>>>>> ' xTrain, xTest, yTrain, yTest, yPrevTest, yPrevTrain, yPrevIndex = self.prepare_data(train_set,test_set) if type == 'ridge_regression': print ' <<< ridge-regression >>> ' cvParams = {'ridgecv': [{'alphas': np.array([1, .1, .01, .001, .0001, 10, 100, 1000, 10000, 100000, 100000, 1000000])}]} model = RidgeCV() model.set_params(**dict((k, v[0] if isinstance(v, list) else v) for k,v in cvParams['ridgecv'][0].iteritems())) else: print ' <<< linear_regression >>>' model = linear_model.LinearRegression() model.fit(xTrain, yTrain) pred_test = model.predict(xTest) pred_train = model.predict(xTrain) print 'test MSE: ', mean_squared_error(yTest, pred_test) print 'train MSE: ', mean_squared_error(yTrain, pred_train) print 'test MAE: ', mean_absolute_error(yTest, pred_test) print 'train MAE: ', mean_absolute_error(yTrain, pred_train) print 'test accuracy: ' , sum(1 for x,y in zip(np.sign(pred_test - yPrevTest),np.sign(yTest - yPrevTest)) if x == y) / float(len(yTest)) print 'train accuracy: ' , sum(1 for x,y in zip(np.sign(pred_train - yPrevTrain),np.sign(yTrain - yPrevTrain)) if x == y) / float(len(yTrain)) coef = model.coef_ print 'coef: ' print coef if type == 'ridge_regression': print 'best alpha: ' print model.alpha_ return pred_train, pred_test
def __init__(self, num_dists=2, sigma=0.1, base_learner=None, **kwargs):
    """Configure the estimator.

    Parameters
    ----------
    num_dists : number of distributions (stored as-is).
    sigma : noise/width parameter (stored as-is).
    base_learner : optional pre-built learner; defaults to a RidgeCV with
        no intercept, a fixed alpha grid, efficient LOO CV, and stored CV
        values.
    **kwargs : forwarded to ``base_learner.set_params``; ``fit_intercept``
        defaults to False unless explicitly supplied.
    """
    self.num_dists = num_dists
    self.sigma = sigma

    if base_learner is None:
        # Default learner: intercept-free ridge with a coarse alpha grid.
        base_learner = RidgeCV(
            fit_intercept=False,
            alphas=[0.001, 0.01, 0.1, 100, 1000],
            cv=None,
            store_cv_values=True,
        )

    # Force fit_intercept=False unless the caller overrides it explicitly.
    kwargs.setdefault('fit_intercept', False)
    self.base_learner = base_learner.set_params(**kwargs)

    # Populated later during fitting.
    self.R = None
    self.model = None
class Model:
    """Train/score pipeline wrapper: splits data, optionally selects features
    via Lasso, fits an 'lr' or 'svr' model (per ``Params``), and reports
    cross-validated and hold-out scores.

    NOTE(review): construction runs the whole pipeline (set_model ->
    feature_selection -> fit_model) as a side effect of ``__init__``.
    """

    # Class-level configuration read once from the shared Params object.
    params = Params()
    testsize = float(params.get_data_params()['testsize'])
    random_state = int(params.get_data_params()['randomstate'])

    def __init__(self, X, y, **kwargs):
        """Split (X, y) into train/validation sets and immediately run the
        model-selection / feature-selection / fitting pipeline.

        Required kwargs: 'featureselection', 'evalmetric',
        'featureselectioncvfolds', 'gridsearchcvfolds', 'gridsearchcv'
        (KeyError if any is missing).
        """
        self.X, self.x_val, self.y, self.y_val = train_test_split(
            X, y, test_size=self.testsize, random_state=self.random_state)
        self.model = None
        self.FeatureSelectionType = kwargs['featureselection']
        # Assumes X is a pandas DataFrame (``.columns``, ``.loc`` used below).
        self.features = self.X.columns
        self.model_coefficients = None
        self.EvalMetrics = kwargs['evalmetric']
        self.FeatSelCvFolds = kwargs['featureselectioncvfolds']
        self.CvFolds = kwargs['gridsearchcvfolds']
        self.gridSearch = kwargs['gridsearchcv']
        # Pipeline runs at construction time.
        self.set_model()
        self.feature_selection()
        self.fit_model()

    def __repr__(self):
        return "Model(" + str(self.model) + ")"

    def set_model(self):
        """Instantiate the estimator named by Params ('lr' or 'svr') and
        apply its configured parameters; unknown names leave model as None."""
        model_name = self.params.get_model()
        if model_name == 'lr':
            self.model = LinearRegression()
            params = self.params.get_linear_reg()
            # 'regularization' is a pipeline switch (see fit_model), not a
            # LinearRegression constructor argument — strip it first.
            del params['regularization']
            self.model = self.model.set_params(**params)
        elif model_name == 'svr':
            self.model = SVR()
            self.model = self.model.set_params(**self.params.get_svr())
        else:
            pass

    def lasso(self):
        """Select features via LassoCV + SelectFromModel, then shrink both
        the training and validation matrices to the surviving columns."""
        estimator = LassoCV(cv=5, max_iter=10000)
        selector = SelectFromModel(estimator)
        selector = selector.fit(self.X, self.y)
        bool_mask = selector.get_support()
        self.features = list(compress(self.X.columns, bool_mask))
        # Keep train and validation sets column-aligned.
        self.X = self.X.loc[:, bool_mask]
        self.x_val = self.x_val.loc[:, bool_mask]

    def feature_selection(self):
        """Dispatch on the configured feature-selection strategy; anything
        other than 'lasso' keeps the full column set."""
        if self.FeatureSelectionType.lower() == "lasso":
            self.lasso()
        else:
            self.features = self.X.columns

    def fit_model(self):
        """Fit self.model, optionally grid-searching SVR hyper-parameters or
        swapping LinearRegression for RidgeCV when Params requests ridge."""
        print("Fitting model..")
        model_type = self.params.get_model()
        if self.gridSearch and model_type == 'svr':
            params_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                           'gamma': [0.0001, 0.001, 0.01, 0.1, 1],
                           'kernel': ['linear', 'rbf'],
                           }
            reg = GridSearchCV(self.model, params_grid, cv=5)
            reg.fit(self.X, self.y)
            # Refit a fresh copy of the base model with the winning params.
            self.model = self.model.set_params(**reg.best_params_)
            self.model.fit(self.X, self.y)
        elif model_type == 'lr':
            regularization = self.params.get_linear_reg()['regularization']
            if regularization.lower() == 'ridge':
                # Carry LinearRegression params over to RidgeCV, minus the
                # two that RidgeCV does not accept.
                params = self.model.get_params()
                del params['copy_X']
                del params['n_jobs']
                self.model = RidgeCV(cv=5)
                self.model = self.model.set_params(**params)
                self.model.fit(self.X, self.y)
            else:
                self.model.fit(self.X, self.y)
        else:
            self.model.fit(self.X, self.y)

    def score_model(self):
        """Print the fitted model, its predictors/coefficients, and a table
        of cross-validated vs. hold-out r2/MAE/MSE/RMSE scores."""
        cv_fold = int(self.params.get_test_params()['testcvfold'])
        train_score_result = []
        val_score_result = []
        score_result = pd.DataFrame()
        y_pred = self.model.predict(self.x_val)
        scoring_list = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error']
        for scoring in scoring_list:
            # abs() flips sklearn's negated error metrics back to positive.
            train_score_result.append(
                np.abs(np.mean(cross_val_score(self.model, self.X, self.y,
                                               scoring=scoring, cv=cv_fold))))
        # RMSE derived from the MSE entry (index 2).
        train_score_result.append(np.sqrt(train_score_result[2]))
        val_score_result.append(r2_score(self.y_val, y_pred))
        val_score_result.append(mean_absolute_error(self.y_val, y_pred))
        val_score_result.append(mean_squared_error(self.y_val, y_pred))
        val_score_result.append(np.sqrt(val_score_result[2]))
        # NOTE(review): the column label uses FeatSelCvFolds but the scores
        # above were computed with cv_fold from the test params — confirm
        # which fold count the label should report.
        score_result['mean {}-fold cv'.format(self.FeatSelCvFolds)] = train_score_result
        score_result['validation score'] = val_score_result
        score_result.index = ['r2', 'MAE', 'MSE', 'RMSE']
        print("""
#########################################################
########     Model and final parameters     #######
#########################################################\n
{}
""".format(self.model)
              )
        if self.params.get_model() == 'lr':
            coef_feature = pd.DataFrame()
            coef_feature['Coefficients'] = self.model.coef_
            coef_feature['Feature'] = self.X.columns
        else:
            print("Predictors used:")
            coef_feature = self.X.columns.values
        print("{}".format(coef_feature)
              )
        print("""
#########################################################
#########     Score     #########
#########################################################\n
{}
""".format(score_result))
def pre_linear_model( self, train_set, test_set , old_pred_train , old_pred_test , type = 'ridge_regression' ): print ' <<<<<<<<<<<<<<<<<<<<<< pre_linear_model >>>>>>>>>>>>>>>>>>>>>>>> ' xTrain, xTest, yTrain, yTest, yPrevTest, yPrevTrain, yPrevIndex = self.prepare_data(train_set,test_set, remove_prev_label= True) yTest = yTest - old_pred_test yTrain = yTrain - old_pred_train if type == 'ridge_regression': print ' <<< ridge-regression >>> ' cvParams = {'ridgecv': [{'alphas': np.array([1, .1, .01, .001, .0001, 10, 100, 1000, 10000, 100000, 100000, 1000000, 10000000, 100000000, 1000000000 ])}]} model = RidgeCV() model.set_params(**dict((k, v[0] if isinstance(v, list) else v) for k,v in cvParams['ridgecv'][0].iteritems())) else: print ' <<< linear_regression >>>' model = linear_model.LinearRegression() model.fit(xTrain, yTrain) pred_test = model.predict(xTest) pred_train = model.predict(xTrain) yTest = yTest + old_pred_test pred_test = pred_test + old_pred_test yTrain = yTrain +old_pred_train pred_train = pred_train + old_pred_train print 'test MSE: ', mean_squared_error(yTest, pred_test) print 'train MSE: ', mean_squared_error(yTrain, pred_train) print 'test MAE: ', mean_absolute_error(yTest, pred_test) print 'train MAE: ', mean_absolute_error(yTrain, pred_train) print 'test accuracy: ' , sum(1 for x,y in zip(np.sign(pred_test - yPrevTest),np.sign(yTest - yPrevTest)) if x == y) / float(len(yTest)) print 'train accuracy: ' , sum(1 for x,y in zip(np.sign(pred_train - yPrevTrain),np.sign(yTrain - yPrevTrain)) if x == y) / float(len(yTrain)) coef = model.coef_ print 'coef: ' print coef if type == 'ridge_regression': print 'best alpha: ' print model.alpha_ a = [] a.append(yPrevTest.tolist()) a.append(yTest) a.append(pred_test) a = np.transpose(a) print a[:25,:] print a[25:50,:] return pred_train, pred_test