def background_null_distance_resid_correlation(x, y, z, data, model=linear_model.LassoLarsCV(), Reps=100):
    backgrounds = []
    X, Y, Z = prepare_data_for_regression_stat(x, y, z, data)
    n = X.shape[0]
    UCMD_A = DistCor.U_centered_matrix(DistCor.dist_matrix(X))
    triu_indices = np.triu_indices_from(UCMD_A, k=1)
    UCMD_B = DistCor.U_centered_matrix(DistCor.dist_matrix(Y))
    U_vector_B = UCMD_B[triu_indices].reshape(-1, 1)
    indices = np.arange(n)
    if Z is not None:
        UCMD_Cs = list(map(DistCor.U_centered_matrix, map(DistCor.dist_matrix, Z.T)))
        U_matrix_C = np.vstack([UCMD_Cs[i][triu_indices] for i in range(len(UCMD_Cs))]).T
        for i in range(Reps):
            np.random.shuffle(indices)
            shuffled_UCMD_A = UCMD_A[:, indices][indices]
            shuffled_U_vector_A = shuffled_UCMD_A[triu_indices].reshape(-1, 1)
            background_statistic = reg_correlation(shuffled_U_vector_A, U_vector_B, U_matrix_C, model)
            backgrounds.append(background_statistic)
    else:
        for i in range(Reps):
            np.random.shuffle(indices)
            shuffled_UCMD_A = UCMD_A[:, indices][indices]
            shuffled_U_vector_A = shuffled_UCMD_A[triu_indices].reshape(-1, 1)
            background_statistic = np.corrcoef(shuffled_U_vector_A.flatten(), U_vector_B.flatten())[0][1]
            backgrounds.append(background_statistic)
    return backgrounds
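# Hedged usage sketch (not part of the original source): the list returned
# above is a permutation null distribution, so a hypothetical caller can turn
# an observed statistic into a one-sided permutation p-value like this:
import numpy as np

def permutation_p_value(observed_statistic, backgrounds):
    # The +1 terms give the standard bias-corrected permutation p-value,
    # which can never be exactly zero.
    backgrounds = np.asarray(backgrounds)
    return (1 + np.sum(backgrounds >= observed_statistic)) / (1 + len(backgrounds))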
def train_lassolars_model(train_x, train_y, predict_x):
    print_title("LassoLars Regressor")
    reg = linear_model.LassoLarsCV(cv=10, n_jobs=3, max_iter=2000, normalize=False)
    reg.fit(train_x, train_y)
    print("alphas and cv_alphas: {0} and {1}".format(reg.alphas_.shape, reg.cv_alphas_.shape))
    print("alphas[%d]: %s" % (len(reg.cv_alphas_), reg.cv_alphas_))
    print("mse shape: {0}".format(reg.cv_mse_path_.shape))
    # print("mse: %s" % np.mean(_mse, axis=0))
    # print("mse: %s" % np.mean(_mse, axis=1))
    # index = np.where(reg.alphas_ == reg.alpha_)
    # print("itemindex: %s" % index)
    index = np.where(reg.cv_alphas_ == reg.alpha_)
    _mse_v = np.mean(reg.cv_mse_path_[index, :])
    print("mse value: %f" % _mse_v)
    print("best alpha: %f" % reg.alpha_)
    best_alpha = reg.alpha_
    # Refit a plain LassoLars model at the cross-validated alpha.
    reg = linear_model.LassoLars(alpha=best_alpha)
    reg.fit(train_x, train_y)
    n_nonzeros = (reg.coef_ != 0).sum()
    print("Non-zero coefs: %d" % n_nonzeros)
    predict_y = reg.predict(predict_x)
    return {'y': predict_y, "coef": reg.coef_}
def choose_optimizer(self, LassoType='Lasso', RegCoef=0.00001, cv=5, criterion='aic',
                     maxiter=10000, tolerance=0.0001, normalize=True):
    if LassoType == 'Lasso':
        lin = linear_model.Lasso(alpha=RegCoef, max_iter=maxiter, normalize=normalize, tol=tolerance)
    elif LassoType == 'LassoCV':
        lin = linear_model.LassoCV(cv=cv, normalize=normalize, max_iter=maxiter)
    elif LassoType == 'LassoLarsCV':
        lin = linear_model.LassoLarsCV(cv=cv, normalize=normalize, max_iter=maxiter)
    elif LassoType == 'LarsCV':
        lin = linear_model.LarsCV(cv=cv, normalize=normalize, max_iter=maxiter)
    elif LassoType == 'LassoLarsIC':
        lin = linear_model.LassoLarsIC(criterion=criterion, normalize=normalize, max_iter=maxiter)
    else:
        raise ValueError("Unknown LassoType: %r" % LassoType)
    return lin
def scale2fitting(scale, x_train, y_train, x_test, y_test):
    global test_a, test_b
    # Choose gamma: `scale` is the upper bound on the bandwidth (typically 2^n);
    # the concrete value of `scale` is picked by cross-validation, and gamma ~ U[0, scale].
    parameters = {}
    # `parameters` stores the coefficients (coef_), the x_train locations, and gamma.
    parameters['x_train'] = x_train
    singleScale = np.random.uniform(0, scale, size=1)
    gamma = np.ones([x_train.shape[0]]) * singleScale
    parameters['gamma'] = gamma
    trainMap = featureMap(x_train, gamma, x_train)
    try:
        F1 = lm.LassoLarsCV(cv=5, normalize=False)
        F1.fit(trainMap, y_train)
        parameters['coef'] = F1.coef_
        y_train_fit = F1.predict(trainMap)
        mseTrain = (float(1) / len(y_train)) * np.linalg.norm((y_train_fit - y_train), ord=2) ** 2
        rmseTrain = mseTrain ** 0.5
        testMap = featureMap(x_test, gamma, x_train)
        y_test_fit = F1.predict(testMap)
        mseTest = (float(1) / len(y_test)) * np.linalg.norm((y_test_fit - y_test), ord=2) ** 2
        rmseTest = mseTest ** 0.5
        return {'mseTrain': mseTrain, 'mseTest': mseTest,
                'rmseTrain': rmseTrain, 'rmseTest': rmseTest,
                'parameters': parameters, 'scale': scale,
                'x_train': x_train, 'gamma': gamma, 'model': F1}
    except Exception:
        print('lasso/lars error')
        return {'mseTest': 999999999}
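# The `featureMap` helper used above is not shown in this snippet. A plausible
# stand-in (an assumption, not the original implementation) is a random RBF
# feature map centered at the training points, one bandwidth gamma_j per center:
import numpy as np

def featureMap(X, gamma, centers):
    # phi(x)_j = exp(-gamma_j * ||x - c_j||^2)
    sq_dists = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-gamma[None, :] * sq_dists)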
def test_lasso():
    alphaNum = 6
    print('*' * 80)
    inputData = pd.read_hdf('./rise_DM_fraud/dev1/preprocessing/preprocessing_result.h5')
    target = 'fpd'
    Y = inputData[target]
    X = inputData.drop(target, axis=1)
    X.fillna(-999, inplace=True)
    lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, Y)
    # Note: cv.StratifiedKFold(y=..., n_folds=...) and RandomizedLasso are APIs
    # from older scikit-learn releases (pre-0.20 / pre-0.21).
    skf = cv.StratifiedKFold(y=Y, n_folds=5)
    for i, (_, test_index) in enumerate(skf):
        print('Fold', i)
        test_X = X.iloc[test_index, :]
        test_Y = Y[test_index]
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], alphaNum)
        clf = linear_model.RandomizedLasso(alphas, random_state=33, n_jobs=1).fit(test_X, test_Y)
        featureImportance = pd.DataFrame(
            sorted(zip(map(lambda x: round(x, 4), clf.scores_), X.columns), reverse=True),
            columns=['importance', 'name'])
        featureImportance.to_csv(
            './rise_DM_fraud/dev1/feature_ranking/feature_importance_lasso_fold_%d.csv' % (i + 1),
            index=False)
def test_sk_LassoLarsCV():
    print("Testing sklearn, LassoLarsCV...")
    mod = linear_model.LassoLarsCV()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "LassoLarsCV test"}
    fv = X[0, :]
    upload(mod, fv, docs)
def reg_correlation_with_residuals(X, Y, Z, model=linear_model.LassoLarsCV()):
    model.fit(Z, X.ravel())
    X_res = X.ravel() - model.predict(Z)
    model.fit(Z, Y.ravel())
    Y_res = Y.ravel() - model.predict(Z)
    if np.isclose(np.linalg.norm(X_res), 0) or np.isclose(np.linalg.norm(Y_res), 0):
        # Degenerate residuals: report zero correlation, but keep the return
        # arity consistent with the branch below.
        return 0.0, X_res, Y_res
    return np.corrcoef(X_res.flatten(), Y_res.flatten())[0][1], X_res, Y_res
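# Hedged usage sketch (not in the original source): a synthetic sanity check
# that the residual correlation is near zero when X and Y depend only on Z.
# All names below are illustrative.
import numpy as np

rng = np.random.RandomState(0)
Z_demo = rng.randn(200, 3)
X_demo = Z_demo @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.randn(200)
Y_demo = Z_demo @ np.array([0.3, 1.0, -1.0]) + 0.01 * rng.randn(200)
r_demo, _, _ = reg_correlation_with_residuals(X_demo, Y_demo, Z_demo)
print("residual correlation (expected near 0):", r_demo)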
def test_lars_cv_max_iter():
    with warnings.catch_warnings(record=True) as w:
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = diabetes.data
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5)
        lars_cv.fit(X, y)
    assert len(w) == 0
def _train(self):
    x = self._train_set.features
    y = self._train_set.outputs
    self._transform = preprocessing.PolynomialFeatures(1)
    clf = linear_model.LassoLarsCV(fit_intercept=True)
    clf.fit(self._transform.fit_transform(x, y), y)
    self._model = clf.predict
def test_lars_cv_max_iter():
    with warnings.catch_warnings(record=True) as w:
        X = diabetes.data
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5)
        lars_cv.fit(X, y)
    # Expected single FutureWarning for deprecation of n_splits=3
    assert_true(len(w) != 0)
def test_model_lasso_lars_cv(self):
    model, X = fit_regression_model(linear_model.LassoLarsCV())
    model_onnx = convert_sklearn(
        model, "lasso lars cv",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X, model, model_onnx, basename="SklearnLassoLarsCV-Dec4")
def _noise_filtering(X, target, good_cols=[], problem_type="regression"):
    """
    Trains a prediction model with additional noise features and selects only
    those of the original features that have a higher coefficient than any of
    the noise features.

    Inputs:
        - X: n x d numpy array with d features
        - target: n dimensional array with targets corresponding to the data points in X
        - good_cols: list of column names for the features in X
        - problem_type: str, either "regression" or "classification" (default: "regression")
    Returns:
        - good_cols: list of noise filtered column names
    """
    n_feat = X.shape[1]
    # fill in default column names first, so the consistency check below can pass
    if not good_cols:
        good_cols = list(range(n_feat))
    assert len(good_cols) == n_feat, "number of column names does not match the number of features in X."
    # perform noise filtering on these features
    if problem_type == "regression":
        model = lm.LassoLarsCV(cv=5, eps=1e-8)
    elif problem_type == "classification":
        model = lm.LogisticRegressionCV(cv=5, penalty="l1", solver="saga", class_weight="balanced")
    else:
        print("[featsel] WARNING: Unknown problem_type %r - not performing noise filtering." % problem_type)
        model = None
    if model is not None:
        X = _add_noise_features(X)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # TODO: remove if sklearn least_angle issue is fixed
            try:
                model.fit(X, target)
            except ValueError:
                rand_idx = np.random.permutation(X.shape[0])
                model.fit(X[rand_idx], target[rand_idx])
            # model.fit(X, target)
        if problem_type == "regression":
            coefs = np.abs(model.coef_)
        else:
            # model.coef_ is n_classes x n_features, but we need n_features
            coefs = np.max(np.abs(model.coef_), axis=0)
        weights = dict(zip(good_cols, coefs[:len(good_cols)]))
        # only include features that are more important than our known noise features
        noise_w_thr = np.max(coefs[n_feat:])
        good_cols = [c for c in good_cols if weights[c] > noise_w_thr]
    return good_cols
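# The `_add_noise_features` helper called above is not shown in this snippet.
# A minimal sketch of what such a helper might do (an assumption, modeled on
# the idea in the docstring: append meaningless columns whose fitted
# coefficients set the noise threshold):
import numpy as np

def _add_noise_features(X):
    # Append shuffled copies of up to 10 original columns; any weight the
    # model assigns to them is, by construction, noise.
    rng = np.random.RandomState(42)
    cols = rng.permutation(X.shape[1])[:10]
    noise = np.column_stack([rng.permutation(X[:, c]) for c in cols])
    return np.hstack([X, noise])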
def test_lars_cv():
    # Test the LassoLarsCV object by checking that the optimal alpha
    # increases as the number of samples decreases (the loop below fits on
    # progressively shorter prefixes of the data).
    # This property is not actually guaranteed in general and is just a
    # property of the given dataset, with the given steps chosen.
    old_alpha = 0
    lars_cv = linear_model.LassoLarsCV()
    for length in (400, 200, 100):
        X = diabetes.data[:length]
        y = diabetes.target[:length]
        lars_cv.fit(X, y)
        np.testing.assert_array_less(old_alpha, lars_cv.alpha_)
        old_alpha = lars_cv.alpha_
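# Hedged standalone sketch (not part of the test suite) of the property this
# test exercises: fit LassoLarsCV on prefixes of the diabetes data of growing
# size and inspect the selected alpha_.
from sklearn import datasets, linear_model

diabetes_demo = datasets.load_diabetes()
for length in (100, 200, 400):
    m = linear_model.LassoLarsCV().fit(diabetes_demo.data[:length],
                                       diabetes_demo.target[:length])
    print(length, m.alpha_)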
def cross_validated_estimators_tests():
    models = [
        linear_model.ElasticNetCV(),
        linear_model.LarsCV(),
        linear_model.LassoCV(),
        linear_model.LassoLarsCV(),
        linear_model.LogisticRegressionCV(),
        linear_model.OrthogonalMatchingPursuitCV(),
        linear_model.RidgeClassifierCV(),
        linear_model.RidgeCV()
    ]
    for model in models:
        cross_validated_estimators(model)
def test_model_lasso_lars_cv(self):
    model, X = _fit_model(linear_model.LassoLarsCV())
    model_onnx = convert_sklearn(model, "lasso lars cv",
                                 [("input", FloatTensorType(X.shape))])
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X.astype(numpy.float32),
        model,
        model_onnx,
        basename="SklearnLassoLarsCV-Dec4",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
def predict_profit(feature_pred=None):
    df = clean.doit()
    df = df[df['title_year'] >= 1990]
    df.keys()
    # Encode the content rating as an integer feature.
    m = preprocessing.LabelEncoder()
    u = m.fit_transform(df['content_rating'])
    y = pd.Series(u, index=df.index)
    ya = pd.DataFrame({"Rating": y})
    df = df.join(ya)
    # Split the pipe-separated genres into columns and label-encode each.
    s = df['genres'].str.split('|').apply(pd.Series, 1)
    s = s.fillna('')
    le = defaultdict(preprocessing.LabelEncoder)
    genres_num = s.apply(lambda x: le[x.name].fit_transform(x))
    df = df.join(genres_num)
    feature = df.loc[:, [
        'bud', 'director_avg_profit', 'director_movie_count',
        'actor_1_avg_profit', 'actor_1_movie_count',
        'actor_2_avg_profit', 'actor_2_movie_count',
        'actor_3_avg_profit', 'actor_3_movie_count'
    ]]  # ,'title_year',0,1,2,3,4,'Rating']]
    label = df['profit']
    feat_train, feat_test, lab_train, lab_test = train_test_split(feature, label, random_state=1)
    regress = linear_model.LassoLarsCV(cv=10, precompute=False)
    regress.fit(feat_train, lab_train)
    sco = cross_val_score(regress, feat_test, lab_test, cv=10)
    cross_score = sco.mean()
    print("cross validated score:", cross_score)
    print("coefficients:", regress.coef_)
    print("intercept:", regress.intercept_)
    plt.clf()
    plt.scatter(feat_train['actor_1_avg_profit'], lab_train, color='blue', label='training data')
    plt.scatter(feat_test['actor_1_avg_profit'], lab_test, color='red', label='testing data')
    plt.plot(feat_test['actor_1_avg_profit'], regress.predict(feat_test), color='black', linewidth=2)
    plt.xlabel('director_profit')
    plt.ylabel('profit_of_movie')
    plt.show()
    with open("prediction.pickle", "wb") as f:
        pickle.dump(regress, f)
def test_model_lasso_lars_cv(self):
    model, X = fit_regression_model(linear_model.LassoLarsCV())
    model_onnx = convert_sklearn(
        model, "lasso lars cv",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnLassoLarsCV-Dec4",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
def RunLassoLARS(args, verbose=True):
    '''
    Run a LassoLARS model.

    args should unpack to (trainX, trainY, testX): you know what those are.
    The regularization strength alpha is chosen by 5-fold cross-validation.
    '''
    trainX, trainY, testX = args
    if verbose:
        print('\nChoosing best alpha and fitting the model')
    model = linear_model.LassoLarsCV(cv=5, verbose=int(verbose), normalize=False)
    model.fit(trainX, trainY)
    if verbose:
        print('\nUsing alpha =', model.alpha_, '\nProducing estimates')
    predictions = model.predict(testX)
    if verbose:
        print('\nComplete.')
    return predictions
def test_lars_cv_max_iter(recwarn):
    warnings.simplefilter('always')
    with np.errstate(divide='raise', invalid='raise'):
        X = diabetes.data
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5)
        lars_cv.fit(X, y)
    # Check that there is no warning in general and no ConvergenceWarning
    # in particular.
    # Materialize the string representation of the warning to get a more
    # informative error message in case of AssertionError.
    recorded_warnings = [str(w) for w in recwarn]
    assert recorded_warnings == []
def sklearn_liner_model_regressions(xTrain, xTest, yTrain, yTest):
    modelForConsideration: DataFrame = pd.DataFrame()
    LinerModels = [
        linear_model.ARDRegression(),
        linear_model.BayesianRidge(),
        linear_model.ElasticNet(),
        linear_model.ElasticNetCV(),
        linear_model.HuberRegressor(),
        linear_model.Lars(),
        linear_model.LarsCV(),
        linear_model.Lasso(),
        linear_model.LassoCV(),
        linear_model.LassoLars(),
        linear_model.LassoLarsCV(),
        linear_model.LassoLarsIC(),
        linear_model.LinearRegression(),
        linear_model.MultiTaskLasso(),
        linear_model.MultiTaskElasticNet(),
        linear_model.MultiTaskLassoCV(),
        linear_model.MultiTaskElasticNetCV(),
        linear_model.OrthogonalMatchingPursuit(),
        linear_model.OrthogonalMatchingPursuitCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.Perceptron(),
        linear_model.RANSACRegressor(),
        linear_model.Ridge(),
        linear_model.RidgeClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.RidgeCV(),
        linear_model.SGDClassifier(),
        linear_model.SGDRegressor(),
        linear_model.TheilSenRegressor(),
        # The entries below are path/solver *functions*, not estimators with a
        # fit/predict interface, so they cannot be looped over like the models
        # above (and some would raise at call time without arguments); they
        # are left commented out.
        # linear_model.enet_path(xTrain, yTrain),
        # linear_model.lars_path(xTrain, yTrain),
        # linear_model.lasso_path(xTrain, yTrain),
        # linear_model.LogisticRegression(),
        # linear_model.LogisticRegressionCV(),
        # linear_model.logistic_regression_path(xTrain, yTrain),
        # linear_model.orthogonal_mp(xTrain, yTrain),
        # linear_model.orthogonal_mp_gram(),
        # linear_model.ridge_regression()
    ]
    for model in LinerModels:
        modelName: str = model.__class__.__name__
        try:
            # print(f"Preparing Model {modelName}")
            if modelName == "LogisticRegression":
                model = linear_model.LogisticRegression(random_state=0)
            model.fit(xTrain, yTrain)
            yTrainPredict = model.predict(xTrain)
            yTestPredict = model.predict(xTest)
            errorList = calculate_prediction_error(modelName, yTestPredict, yTest,
                                                   yTrainPredict, yTrain)
            if errorList["Test Average Error"][0] < 30 and errorList["Train Average Error"][0] < 30:
                try:
                    modelForConsideration = modelForConsideration.append(errorList)
                except Exception as e:
                    print(e)
        except (Exception, ArithmeticError) as e:
            print(f"Error occurred while preparing Model {modelName}")
    return modelForConsideration
def _l1_graph_setup(X, positive, alpha):
    n, d = X.shape
    # Choose an efficient Lasso solver
    if alpha is not None:
        if positive or d < n:
            clf = linear_model.Lasso(positive=positive, alpha=alpha)
        else:
            clf = linear_model.LassoLars(alpha=alpha)
    else:
        cv = min(d, 3)
        if positive or d < n:
            clf = linear_model.LassoCV(positive=positive, cv=cv)
        else:
            clf = linear_model.LassoLarsCV(cv=cv)
    # Normalize all samples
    X = X / np.linalg.norm(X, ord=2, axis=1)[:, None]
    return clf, X
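# Hedged usage sketch (not part of the original module): how the returned
# solver is typically applied to build an l1 graph, encoding each normalized
# sample as a sparse linear combination of all the others. The helper name
# `_l1_graph_weights` is hypothetical.
import numpy as np

def _l1_graph_weights(clf, X):
    n = X.shape[0]
    W = np.zeros((n, n))
    for i in range(n):
        others = np.delete(np.arange(n), i)
        # Columns of the design matrix are the other samples.
        clf.fit(X[others].T, X[i])
        W[i, others] = clf.coef_
    return W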
def make_prediction(train, test):
    y_train = train.SalePrice.values.tolist()
    train = train.drop('SalePrice', 1)
    # test = train.drop('Id', 1)
    # train = train.drop('Id', 1)
    x_train = train.values.tolist()
    x_test = test.values.tolist()
    model = linear_model.LassoLarsCV(normalize=False)
    model = model.fit(x_train, y_train)
    answer = model.predict(test.values.tolist())
    df = return_csv_from_arr(answer)
    df.to_csv(predictionsFolder + 'submission.csv', index=False)
def build(self, **kwargs):
    """
    Builds and returns the estimator.

    Args:
        **kwargs (key-value arguments): Ignored in this implementation. Added for
            compatibility with :func:`mlaut.estimators.nn_estimators.Deep_NN_Classifier`.
            Hyperparameters are taken from ``self._hyperparameters`` instead.

    Returns:
        `sklearn pipeline` object: pipeline for transforming the features and training the estimator
    """
    estimator = linear_model.LassoLarsCV(
        max_n_alphas=self._hyperparameters['max_n_alphas'],
        cv=self._num_cv_folds,
        n_jobs=self._n_jobs)
    return self._create_pipeline(estimator=estimator)
def test_lars_cv_max_iter(recwarn):
    warnings.simplefilter("always")
    with np.errstate(divide="raise", invalid="raise"):
        X = diabetes.data
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5)
        lars_cv.fit(X, y)
    # Check that there is no warning in general and no ConvergenceWarning
    # in particular.
    # Materialize the string representation of the warning to get a more
    # informative error message in case of AssertionError.
    recorded_warnings = [str(w) for w in recwarn]
    # FIXME: when 'normalize' is removed, replace the assertions below with:
    # assert len(recorded_warnings) == 0
    assert len(recorded_warnings) == 1
    assert "normalize' will be set to False in version 1.2" in recorded_warnings[0]
def _estimate_model(self):
    """Estimates lasso regression object.

    Returns
    -------
    model : sklearn lasso regression or lasso cv object
        Fitted lasso model.
    """
    ### Lars Algorithm
    if self.solver == "Lars":
        self.underlying = linear_model.LassoLars(fit_intercept=self.intercept, normalize=False)
        if self.cv_folds == 'IC':
            # For AIC/BIC. criterion kwarg should be provided.
            model = linear_model.LassoLarsIC(fit_intercept=self.intercept,
                                             normalize=False, **self.kwargs)
        elif self.cv_folds is not None:
            model = linear_model.LassoLarsCV(fit_intercept=self.intercept,
                                             cv=self.cv_folds, normalize=False,
                                             **self.kwargs)
        else:
            model = linear_model.Lasso(fit_intercept=self.intercept, **self.kwargs)
    ### Coordinate Descent Algorithm
    elif self.solver == "Coordinate Descent":
        self.underlying = linear_model.Lasso(fit_intercept=self.intercept)
        if self.cv_folds is not None:
            model = linear_model.LassoCV(fit_intercept=self.intercept,
                                         cv=self.cv_folds, **self.kwargs)
        else:
            model = linear_model.Lasso(fit_intercept=self.intercept, **self.kwargs)
    else:
        raise NotImplementedError('Solver not implemented. Choices are Lars or Coordinate Descent.')
    # self.model.fit(np.asanyarray(self.x_train.values, order='F'), self.y_train)
    model.fit(self.x_train, self.y_train)
    return model
def cross_validate_model(X_train, Y_train):
    """
    Here we perform cross validation of models to choose the best one.
    """
    # Divide the training and testing data
    train, test, y_actual, y_predict = train_test_split(X_train, Y_train,
                                                        test_size=0.5,
                                                        random_state=42)
    # List the regression methods to use.
    clf_random_forest = ensemble.RandomForestRegressor(n_estimators=50)
    clf_adaboost_reg = ensemble.AdaBoostRegressor(n_estimators=50)
    clf_lasso_larscv = sklinear.LassoLarsCV(cv=9)
    clf_ridge = sklinear.RidgeCV()
    clf_elastic_net = sklinear.ElasticNet()
    clf_extra_tree = ensemble.ExtraTreesRegressor(n_estimators=50)
    clf_mlpr = neural_network.MLPRegressor(solver='adam')
    # Add the above methods in an array; more amenable for looping.
    # (Note: clf_ridge is instantiated but not part of the comparison below.)
    methods = [clf_random_forest, clf_adaboost_reg, clf_lasso_larscv,
               clf_elastic_net, clf_extra_tree, clf_mlpr]
    methods_label = ['clf_random_forest', 'clf_adaboost_reg', 'clf_lasso_larscv',
                     'clf_elastic_net', 'clf_extra_tree', 'clf_mlpr']
    method_mse = np.zeros((len(methods), 1))
    # Fit and predict for each method
    for i in range(len(methods)):
        methods[i].fit(train, y_actual)
        method_predict = methods[i].predict(test)
        method_mse[i] = metrics.mean_squared_error(y_predict, method_predict)
        print('MSE for %s while cross validation : %f' % (methods_label[i], method_mse[i]))
    # We return the method which has the minimum mse
    return np.argmin(method_mse)
def __init__(self, params=None):
    self.clf = xgb.sklearn.XGBRegressor(
        max_depth=3,
        learning_rate=0.1,
        n_estimators=300,
        silent=True,
        objective='reg:linear',
        nthread=1,
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        subsample=1,
        colsample_bytree=1,
        colsample_bylevel=.25,  # .5
        reg_alpha=0,  # 1
        reg_lambda=.5,  # .2
        scale_pos_weight=1,
        base_score=0.5,
        seed=0,
        missing=None)
    self.clf2 = linear_model.LassoLarsCV(fit_intercept=True)
def _model_fitting_cv(cls, x, y, num_cv, plotting=False):
    # Compute paths
    # print("Computing regularization path using the Lars lasso...")
    model = linear_model.LassoLarsCV(cv=num_cv).fit(x, y)
    # Display results
    if plotting:
        import matplotlib.pyplot as plt
        m_log_alphas = -np.log10(model.cv_alphas_)
        plt.figure(figsize=(20, 10))
        plt.plot(m_log_alphas, model.cv_mse_path_, ':')
        plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                    label='alpha CV')
        plt.legend()
        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean square error')
        plt.axis('tight')
        plt.savefig('cross_validation', dpi=None, facecolor='w', edgecolor='w',
                    orientation='portrait', papertype=None, format=None,
                    transparent=False, bbox_inches=None, pad_inches=0.1,
                    frameon=None)
        plt.plot()
    return model
def __init__(self, method, yrange, params, i=0):
    # TODO: yrange doesn't currently do anything. Remove or do something with it!
    self.algorithm_list = [
        'PLS',
        'GP',
        'OLS',
        'OMP',
        'Lasso',
        'Elastic Net',
        'Ridge',
        'Bayesian Ridge',
        'ARD',
        'LARS',
        'LASSO LARS',
        'SVR',
        'KRR',
    ]
    self.method = method
    self.outliers = None
    self.ransac = False
    print(params)
    if self.method[i] == 'PLS':
        self.model = PLSRegression(**params[i])
    if self.method[i] == 'OLS':
        self.model = linear.LinearRegression(**params[i])
    if self.method[i] == 'OMP':
        # check whether to do CV or not
        self.do_cv = params[i]['CV']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # Remove CV parameter
        params_temp.pop('CV')
        if self.do_cv is False:
            self.model = linear.OrthogonalMatchingPursuit(**params_temp)
        else:
            params_temp.pop('precompute')
            self.model = linear.OrthogonalMatchingPursuitCV(**params_temp)
    if self.method[i] == 'LASSO':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # check whether to do CV or not
        try:
            self.do_cv = params[i]['CV']
            # Remove CV parameter
            params_temp.pop('CV')
        except KeyError:
            self.do_cv = False
        if self.do_cv is False:
            self.model = linear.Lasso(**params_temp)
        else:
            params_temp.pop('alpha')
            self.model = linear.LassoCV(**params_temp)
    if self.method[i] == 'Elastic Net':
        params_temp = copy.copy(params[i])
        try:
            self.do_cv = params[i]['CV']
            params_temp.pop('CV')
        except KeyError:
            self.do_cv = False
        if self.do_cv is False:
            self.model = linear.ElasticNet(**params_temp)
        else:
            params_temp['l1_ratio'] = [.1, .5, .7, .9, .95, .99, 1]
            self.model = linear.ElasticNetCV(**params_temp)
    if self.method[i] == 'Ridge':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        try:
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # Remove CV parameter
            params_temp.pop('CV')
        except KeyError:
            self.do_cv = False
        if self.do_cv:
            self.model = linear.RidgeCV(**params_temp)
        else:
            self.model = linear.Ridge(**params_temp)
    if self.method[i] == 'BRR':
        self.model = linear.BayesianRidge(**params[i])
    if self.method[i] == 'ARD':
        self.model = linear.ARDRegression(**params[i])
    if self.method[i] == 'LARS':
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        try:
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # Remove CV parameter
            params_temp.pop('CV')
        except KeyError:
            self.do_cv = False
        if self.do_cv is False:
            self.model = linear.Lars(**params_temp)
        else:
            self.model = linear.LarsCV(**params_temp)
    if self.method[i] == 'LASSO LARS':
        model = params[i]['model']
        params_temp = copy.copy(params[i])
        params_temp.pop('model')
        if model == 0:
            self.model = linear.LassoLars(**params_temp)
        elif model == 1:
            self.model = linear.LassoLarsCV(**params_temp)
        elif model == 2:
            self.model = linear.LassoLarsIC(**params_temp)
        else:
            print("Something went wrong: 'model' should be 0, 1, or 2")
    if self.method[i] == 'SVR':
        self.model = svm.SVR(**params[i])
    if self.method[i] == 'KRR':
        self.model = kernel_ridge.KernelRidge(**params[i])
    if self.method[i] == 'GP':
        # get the method for dimensionality reduction and the number of components
        self.reduce_dim = params[i]['reduce_dim']
        self.n_components = params[i]['n_components']
        # create a temporary set of parameters
        params_temp = copy.copy(params[i])
        # Remove parameters not accepted by Gaussian Process
        params_temp.pop('reduce_dim')
        params_temp.pop('n_components')
        self.model = GaussianProcess(**params_temp)
def fit_transform(self, X, y):
    """
    Fits the regression model and returns a new dataframe with the additional features.

    Inputs:
        - X: pandas dataframe or numpy array with original features (n_datapoints x n_features)
        - y: pandas dataframe or numpy array with targets for all n_datapoints
    Returns:
        - new_df: new pandas dataframe with all the original features (except categorical
          features transformed into multiple 0/1 columns) and the most promising engineered
          features. This df can then be used to train your final model.

    Please ensure that X only contains valid feature columns (including possible categorical
    variables). Note: we strongly encourage you to name your features X1 ... Xn or something
    simple like this before passing a DataFrame to this model. This can help avoid potential
    problems with sympy later on. The data should only contain finite values (no NaNs etc.).
    """
    # store column names as they'll be lost in the other check
    cols = [str(c) for c in X.columns] if isinstance(X, pd.DataFrame) else []
    # check input variables
    X, target = check_X_y(X, y, y_numeric=self.problem_type == "regression", dtype=None)
    if not cols:
        # the additional zeros in the name are because of the variable check in
        # _generate_features, where we check if the column name occurs in the expression.
        # this would lead to many false positives if we have features x1 and x10...x19
        # instead of x001...x019.
        cols = ["x%03i" % i for i in range(X.shape[1])]
    self.original_columns_ = cols
    # transform X into a dataframe (again)
    df = pd.DataFrame(X, columns=cols)
    # possibly convert categorical columns
    df = self._transform_categorical_cols(df)
    # if we're not given specific feateng_cols, then just take all columns except categorical
    if self.feateng_cols:
        fcols = []
        for c in self.feateng_cols:
            if c not in self.original_columns_:
                raise ValueError("[AutoFeat] feateng_col %r not in df.columns" % c)
            if c in self.categorical_cols_map_:
                fcols.extend(self.categorical_cols_map_[c])
            else:
                fcols.append(c)
        self.feateng_cols_ = fcols
    else:
        self.feateng_cols_ = list(df.columns)
    # convert units to proper pint units
    if self.units:
        # need units for only and all feateng columns
        self.units = {c: self.units[c] if c in self.units else "" for c in self.feateng_cols_}
        # apply pi-theorem -- additional columns are not used for regular feature engineering (for now)!
        df = self._apply_pi_theorem(df)
    # subsample data points and targets in case we'll generate too many features:
    # (n_rows * n_cols * 32/8) / 1000000000 <= max_gb
    n_cols = n_cols_generated(len(self.feateng_cols_), self.feateng_steps, len(self.transformations))
    n_gb = (len(df) * n_cols) / 250000000
    if self.verbose:
        print("[AutoFeat] The %i step feature engineering process could generate up to %i features." % (self.feateng_steps, n_cols))
        print("[AutoFeat] With %i data points this new feature matrix would use about %.2f gb of space." % (len(df), n_gb))
    if self.max_gb and n_gb > self.max_gb:
        n_rows = int(self.max_gb * 250000000 / n_cols)
        if self.verbose:
            print("[AutoFeat] As you specified a limit of %.1d gb, the number of data points is subsampled to %i" % (self.max_gb, n_rows))
        subsample_idx = np.random.permutation(list(df.index))[:n_rows]
        df_subs = df.iloc[subsample_idx]
        df_subs.reset_index(drop=True, inplace=True)
        target_sub = target[subsample_idx]
    else:
        df_subs = df.copy()
        target_sub = target.copy()
    # generate features
    df_subs, self.feature_formulas_ = engineer_features(df_subs, self.feateng_cols_,
                                                        _parse_units(self.units, verbose=self.verbose),
                                                        self.feateng_steps, self.transformations,
                                                        self.verbose)
    # select predictive features
    if self.featsel_runs <= 0:
        if self.verbose:
            print("[AutoFeat] WARNING: Not performing feature selection.")
        good_cols = df_subs.columns
    else:
        if self.problem_type in ("regression", "classification"):
            good_cols = select_features(df_subs, target_sub, self.featsel_runs, None,
                                        self.problem_type, self.n_jobs, self.verbose)
            # if no features were selected, take the original features
            if not good_cols:
                good_cols = list(df.columns)
        else:
            print("[AutoFeat] WARNING: Unknown problem_type %r - not performing feature selection." % self.problem_type)
            good_cols = df_subs.columns
    # filter out those columns that were original features or generated otherwise
    self.new_feat_cols_ = [c for c in good_cols if c not in list(df.columns)]
    self.good_cols_ = good_cols
    # re-generate all good features again; for all data points this time
    self.feature_functions_ = {}
    df = self._generate_features(df, self.new_feat_cols_)
    # filter out unnecessary junk from self.feature_formulas_
    self.feature_formulas_ = {f: self.feature_formulas_[f] for f in self.new_feat_cols_ + self.feateng_cols_}
    self.feature_functions_ = {f: self.feature_functions_[f] for f in self.new_feat_cols_}
    self.all_columns_ = list(df.columns)
    # train final prediction model on all selected features
    if self.verbose:
        # final dataframe contains original columns and good additional columns
        print("[AutoFeat] Final dataframe with %i feature columns (%i new)."
              % (len(df.columns), len(df.columns) - len(self.original_columns_)))
    # train final prediction model
    if self.problem_type == "regression":
        model = lm.LassoLarsCV(cv=5)
    elif self.problem_type == "classification":
        model = lm.LogisticRegressionCV(cv=5, class_weight="balanced")
    else:
        print("[AutoFeat] WARNING: Unknown problem_type %r - not fitting a prediction model." % self.problem_type)
        model = None
    if model is not None:
        if self.verbose:
            print("[AutoFeat] Training final %s model." % self.problem_type)
        X = df[self.good_cols_].to_numpy()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model.fit(X, target)
        self.prediction_model_ = model
        # sklearn requires a "classes_" attribute
        if self.problem_type == "classification":
            self.classes_ = model.classes_
        if self.verbose:
            if self.problem_type == "regression":
                coefs = model.coef_
            else:
                # model.coef_ is n_classes x n_features, but we need n_features
                coefs = np.max(np.abs(model.coef_), axis=0)
            weights = dict(zip(self.good_cols_, coefs))
            print("[AutoFeat] Trained model: largest coefficients:")
            print(model.intercept_)
            for c in sorted(weights, key=lambda x: abs(weights[x]), reverse=True):
                if abs(weights[c]) < 1e-5:
                    break
                print("%.6f * %s" % (weights[c], c))
            print("[AutoFeat] Final score: %.4f" % model.score(X, target))
    if self.always_return_numpy:
        return df.to_numpy()
    return df
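# Hedged usage sketch: this method appears to come from the autofeat package;
# the class and parameter names below are assumptions not confirmed by the
# snippet above, so the example is left commented out.
#
# import numpy as np
# import pandas as pd
# from autofeat import AutoFeatRegressor
#
# X_df = pd.DataFrame(np.random.rand(100, 3), columns=["x001", "x002", "x003"])
# y = 2 * X_df["x001"] - X_df["x002"] + 0.1 * np.random.randn(100)
# afreg = AutoFeatRegressor(feateng_steps=2)
# df_new = afreg.fit_transform(X_df, y)  # original features + selected engineered ones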