def apply_algorithm(algorithm, reductions, x_features, y_labels, model_tasks, components_count, graphs, params): model = None if algorithm == 'random_forest': model = RandomForestClassifier( ) # has methods: decision_path(X) & apply elif algorithm == 'xgb': model = XGBClassifier() elif algorithm == 'knn': model = KNeighborsClassifier() elif algorithm == 'kmeans': model = KMeans() elif algorithm == 'linear_regression': model = LinearRegression() elif algorithm == 'logreg': model = LogisticRegression() elif algorithm == 'ridge': model = Ridge() elif algorithm == 'lasso': model = Lasso() elif algorithm == 'elastic_net': model = ElasticNet() elif algorithm == 'dirichlet': model = LatentDirichletAllocation() elif algorithm == 'lda': model = LinearDiscriminantAnalysis() elif algorithm == 'mlp': model = MLPClassifier() else: print('unhandled algorithm', algorithm) ''' algorithm will also be null if we're just applying reductions ''' if model is not None or 'reduce' in model_tasks: results = {} if algorithm and model: for k, v in params[algorithm].items(): model = model.set_params(**{k: v}) ''' train the original model ''' result = model_train(algorithm, model, x_features, y_labels, model_tasks, components_count, graphs) if result: results['original'] = result if 'reduce' in model_tasks: ''' reduce features and train a new model on each reduced feature set ''' iterative_reduced_features = x_features if 'iterative_reduce' in model_tasks else None for reduction in reductions: features_to_reduce = iterative_reduced_features if 'iterative_reduce' in model_tasks else x_features reduced_feature_result = reduce_features( features_to_reduce, y_labels, components_count, algorithm) if reduced_feature_result: print('starting features to reduce', features_to_reduce, 'reduced features', reduced_feature_result['features']) ''' update iteratively reduced feature set ''' iterative_reduced_features = reduced_feature_result[ 'features'] if 'iterative_reduce' in model_tasks else iterative_reduced_features ''' train new model on reduced features ''' if 'score' in model_tasks or 'train' in model_tasks: reduced_feature_model = model_train( algorithm, model, reduced_feature_result['features'], y_labels, model_tasks, components_count, graphs) if reduced_feature_model: iteration_name = ''.join(['iteration', reduction]) results[iteration_name] = reduced_feature_model if results: if 'apply_most_reduced_features' in model_tasks: ''' find most reduced feature set across all reduced feature sets & train new model on that most reduced set''' reduced_features = filter_reduced_features(results) if reduced_features: reduced_feature_model = model_train( algorithm, model, reduced_features, y_labels, model_tasks, components_count, graphs) if reduced_feature_model: results[ 'iteration_most_reduced'] = reduced_feature_model return results return False
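# Illustrative sketch (added, not part of the original source) of the params structure
# that the set_params loop in apply_algorithm assumes: a dict keyed by algorithm name
# whose values are keyword arguments for that estimator. Names and values below are
# placeholders only.
from sklearn.linear_model import ElasticNet

example_params = {'elastic_net': {'alpha': 0.1, 'l1_ratio': 0.5, 'max_iter': 5000}}
model = ElasticNet()
for k, v in example_params['elastic_net'].items():
    model = model.set_params(**{k: v})   # same pattern as the loop above
# equivalently: model.set_params(**example_params['elastic_net'])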
def run_fit(self): # Display ConvergenceWarning only once and not for every item it occurs warnings.simplefilter("once", category=ConvergenceWarning) # initialize the ElasticNet model self.model = ElasticNet(alpha=1e-4, l1_ratio=self.l1_ratio, positive=self.positive_only, fit_intercept=False, copy_X=False, precompute=True, selection='random', max_iter=100, tol=1e-4) URM_train = check_matrix(self.URM, 'csc', dtype=np.float32) n_items = URM_train.shape[1] # Use array as it reduces memory requirements compared to lists dataBlock = 10000000 rows = np.zeros(dataBlock, dtype=np.int32) cols = np.zeros(dataBlock, dtype=np.int32) values = np.zeros(dataBlock, dtype=np.float32) numCells = 0 start_time = time.time() start_time_printBatch = start_time # fit each item's factors sequentially (not in parallel) for currentItem in range(n_items): # get the target column y = URM_train[:, currentItem].toarray() if y.sum() == 0.0: continue # set the j-th column of X to zero start_pos = URM_train.indptr[currentItem] end_pos = URM_train.indptr[currentItem + 1] current_item_data_backup = URM_train.data[start_pos:end_pos].copy() URM_train.data[start_pos:end_pos] = 0.0 # fit one ElasticNet model per column self.model.fit(URM_train, y) nonzero_model_coef_index = self.model.sparse_coef_.indices nonzero_model_coef_value = self.model.sparse_coef_.data local_topK = min(len(nonzero_model_coef_value) - 1, self.topK) relevant_items_partition = ( -nonzero_model_coef_value ).argpartition(local_topK)[0:local_topK] relevant_items_partition_sorting = np.argsort( -nonzero_model_coef_value[relevant_items_partition]) ranking = relevant_items_partition[ relevant_items_partition_sorting] for index in range(len(ranking)): if numCells == len(rows): rows = np.concatenate( (rows, np.zeros(dataBlock, dtype=np.int32))) cols = np.concatenate( (cols, np.zeros(dataBlock, dtype=np.int32))) values = np.concatenate( (values, np.zeros(dataBlock, dtype=np.float32))) rows[numCells] = nonzero_model_coef_index[ranking[index]] cols[numCells] = currentItem values[numCells] = nonzero_model_coef_value[ranking[index]] numCells += 1 # finally, replace the original values of the j-th column URM_train.data[start_pos:end_pos] = current_item_data_backup elapsed_time = time.time() - start_time new_time_value, new_time_unit = seconds_to_biggest_unit( elapsed_time) if time.time( ) - start_time_printBatch > 300 or currentItem == n_items - 1: print( "Processed {} ( {:.2f}% ) in {:.2f} {}. Items per second: {:.2f}" .format(currentItem + 1, 100.0 * float(currentItem + 1) / n_items, new_time_value, new_time_unit, float(currentItem) / elapsed_time)) sys.stdout.flush() sys.stderr.flush() start_time_printBatch = time.time() # generate the sparse weight matrix self.W_sparse = sps.csr_matrix( (values[:numCells], (rows[:numCells], cols[:numCells])), shape=(n_items, n_items), dtype=np.float32)
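# A small self-contained illustration (added, not from the original source) of the
# top-K selection used above: argpartition first extracts the K largest coefficients
# in O(n), then only those K values are fully sorted.
import numpy as np

coef = np.array([0.05, 0.9, 0.0, 0.4, 0.7], dtype=np.float32)
topK = 3
local_topK = min(len(coef) - 1, topK)
partition = (-coef).argpartition(local_topK)[0:local_topK]   # indices of the 3 largest, unordered
ranking = partition[np.argsort(-coef[partition])]            # order those 3 by decreasing value
print(ranking)   # -> [1 4 3], i.e. coefficients 0.9, 0.7, 0.4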
def test_coefficients_graph_regression():
    model = ElasticNet()
    model.fit(X_train, y_train)
    mr.coefficients_graph(X_train, X_test, model, "regression", "regression_test")
# Model quality is judged by root mean squared error (RMSE) on the test set.
test_Y_pred = lassoLarsIC.predict(test_X)
print "Test set score:", lassoLarsIC.score(test_X, test_Y)
print "Test set MSE:", mean_squared_error(test_Y, test_Y_pred)
print "Test set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
print "Test set R2:", r2_score(test_Y, test_Y_pred)
tss, rss, ess, r2 = xss(Y, lassoLarsIC.predict(X))
print "TSS (Total Sum of Squares): ", tss
print "RSS (Residual Sum of Squares): ", rss
print "ESS (Explained Sum of Squares): ", ess
print "R^2: ", r2

print "\n********** Testing the ElasticNet class **********"
# When initializing ElasticNet, specify the hyperparameters alpha and rho (l1_ratio);
# their defaults are 1.0 and 0.5 respectively.
elasticNet = ElasticNet(alpha=1.0, l1_ratio=0.5)
# Fit the training set
elasticNet.fit(train_X, train_Y)
# Print the model coefficients and intercept
print "Coefficients:", elasticNet.coef_
print "Intercept:", elasticNet.intercept_
print 'Training set R2: ', r2_score(train_Y, elasticNet.predict(train_X))
# For linear regression models, mean squared error (MSE) or root mean squared error
# (RMSE) on the test set is commonly used to judge model quality.
test_Y_pred = elasticNet.predict(test_X)
print "Test set score:", elasticNet.score(test_X, test_Y)
print "Test set MSE:", mean_squared_error(test_Y, test_Y_pred)
print "Test set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
print "Test set R2:", r2_score(test_Y, test_Y_pred)
def set_linear_regressors(self): self.estimators = [ # LinearRegression(), Ridge(), RidgeCV(), Lasso(), # MultiTaskLasso(), ElasticNet(), ElasticNetCV(), # MultiTaskElasticNet(), Lars(), LassoLars(), OrthogonalMatchingPursuit(), BayesianRidge(), # ARDRegression(), SGDRegressor(), PassiveAggressiveRegressor(), HuberRegressor(), RandomForestRegressor(), GradientBoostingRegressor() ] self.estimator_params = {} self.estimator_params['LinearRegression'] = { 'fit_intercept': [False, True], 'normalize': [False, True], 'n_jobs': [1, 2, 3, 4] } self.estimator_params['Ridge'] = { 'alpha': [1, 3, 6, 10], 'fit_intercept': [False, True], 'normalize': [False, True] } self.estimator_params['Lasso'] = { 'alpha': [1, 3, 6, 10], 'fit_intercept': [False, True], 'normalize': [False, True], 'precompute': [False, True] } self.estimator_params['Lars'] = { 'fit_intercept': [False, True], 'verbose': [1, 3, 6, 10], 'normalize': [False, True], 'precompute': [False, True] } self.estimator_params['LassoLars'] = { 'alpha': [1, 3, 6, 10], 'fit_intercept': [False, True], 'verbose': [1, 3, 6, 10], 'normalize': [False, True], 'precompute': [False, True] } self.estimator_params['OrthogonalMatchingPursuit'] = { 'n_nonzero_coefs': [1, 3, 6, 10], 'fit_intercept': [False, True], 'normalize': [False, True], 'precompute': [False, True] } self.estimator_params['BayesianRidge'] = { 'alpha': [0.0000001, 0.00001, 0.001, 0.1], 'fit_intercept': [False, True], 'normalize': [False, True], 'precompute': [False, True] } self.estimator_params['SGDRegressor'] = { 'alpha': [0.0000001, 0.00001, 0.001, 0.1], 'penalty': ['none', 'l2', 'l1', 'elasticnet'], 'fit_intercept': [False, True] } self.estimator_params['HuberRegressor'] = { 'alpha': [0.0000001, 0.00001, 0.001, 0.1], 'epsilon': [1, 1.35, 2, 5], 'fit_intercept': [False, True] } self.estimator_params['HuberRegressor'] = { 'alpha': [0.0000001, 0.00001, 0.001, 0.1], 'epsilon': [1, 1.35, 2, 5], 'fit_intercept': [False, True] } self.estimator_params['RandomForestRegressor'] = { 'n_estimators': [60, 100, 150], 'max_features': ['log2', 'sqrt', 'auto'], 'criterion': ['mse', 'mae'], # 'max_depth': [None, 8, 32, 64], # 'min_samples_split': [0.1, 0.2, 0.5, 0.7, 1.0], # 'min_samples_leaf': [1,2,5] } self.estimator_params['GradientBoostingRegressor'] = { 'n_estimators': [150], # 'loss': ['ls', 'lad','huber','quantile'], # 'criterion': ['mse', 'mae'], # 'max_depth': [None, 3, 5, 8], # 'min_samples_split': [0.2, 0.5, 0.9, 1.0], # 'min_samples_leaf': [1, 2, 3, 5], # 'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5], # 'alpha': [0.5, 0.7, 0.9, 1.0, 1.5] }
from sklearn.kernel_ridge import KernelRidge from sklearn.pipeline import make_pipeline from sklearn.preprocessing import RobustScaler from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone from sklearn.model_selection import KFold, cross_val_score, train_test_split from sklearn.metrics import mean_squared_error import xgboost as xgb import lightgbm as lgb import sklearn import tensorflow as tf from tensorflow import keras ## regressors set - up KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1)) ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =5) XGBoost = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213, silent=1, random_state =7, nthread = -1) LightgBoost = lgb.LGBMRegressor(objective='regression',num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin = 55, bagging_fraction = 0.8, bagging_freq = 5, feature_fraction = 0.2319, feature_fraction_seed=9, bagging_seed=9,
def objective_function(args): n_components = args['n_components'] quantiles = args['quantiles'] if args['preprocessing'] == 'NoTransform': X, Y, scaler = transform(dataset) elif args['preprocessing'] == 'MinMaxScaler': X, Y, scaler = transform(dataset) elif args['preprocessing'] == 'StandardScaler': X, Y, scaler = standard_scaler(dataset) elif args['preprocessing'] == 'RobustScaler': X, Y, scaler = robust_scaler(dataset) elif args['preprocessing'] == 'QuantileTransformer': X, Y, scaler = quantile_transformer(dataset, quantiles) elif args['preprocessing'] == 'PowerTransformer': X, Y, scaler = power_transformer(dataset) elif args['preprocessing'] == 'PCA': X, Y, scaler = pca_transform(dataset, n_components) if args['preprocessing'] != 'PCA': k_features = args['k_features'] else: k_features = X.shape[1] if args['model'] == RandomForestRegressor: n_estimators = args['params']['n_estimators'] max_depth = args['params']['max_depth'] min_samples_split = args['params']['min_samples_split'] min_samples_leaf = args['params']['min_samples_leaf'] min_weight_fraction_leaf = args['params']['min_weight_fraction_leaf'] max_features = args['params']['max_features'] max_leaf_nodes = args['params']['max_leaf_nodes'] estimator = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, max_leaf_nodes = max_leaf_nodes, min_weight_fraction_leaf = min_weight_fraction_leaf, max_features = max_features, n_jobs = -1) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == AdaBoostRegressor: learning_rate = args['params']['learning_rate'] n_estimators = args['params']['n_estimators'] loss = args['params']['loss'] max_depth = args['params']['base_estimator']['max_depth'] min_samples_split = args['params']['base_estimator']['min_samples_split'] min_samples_leaf = args['params']['base_estimator']['min_samples_leaf'] min_weight_fraction_leaf = args['params']['base_estimator']['min_weight_fraction_leaf'] max_features = args['params']['base_estimator']['max_features'] estimator = AdaBoostRegressor(base_estimator = DecisionTreeRegressor(max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, min_weight_fraction_leaf = min_weight_fraction_leaf, max_features = max_features), learning_rate = learning_rate, n_estimators = n_estimators, loss = loss) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == ExtraTreesRegressor: n_estimators = args['params']['n_estimators'] max_depth = args['params']['max_depth'] min_samples_split = args['params']['min_samples_split'] max_features = args['params']['max_features'] min_samples_leaf = args['params']['min_samples_leaf'] min_weight_fraction_leaf = args['params']['min_weight_fraction_leaf'] max_leaf_nodes = args['params']['max_leaf_nodes'] estimator = ExtraTreesRegressor(n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split, max_features = max_features, max_leaf_nodes = max_leaf_nodes, min_weight_fraction_leaf = min_weight_fraction_leaf, min_samples_leaf = min_samples_leaf, n_jobs = -1) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == GradientBoostingRegressor: loss = args['params']['loss'] learning_rate = args['params']['learning_rate'] n_estimators = args['params']['n_estimators'] subsample = args['params']['subsample'] min_samples_split = 
args['params']['min_samples_split'] max_depth = args['params']['max_depth'] tol = args['params']['tol'] estimator = GradientBoostingRegressor(loss = loss, n_estimators = n_estimators, subsample = subsample, min_samples_split = min_samples_split, learning_rate = learning_rate, max_depth = max_depth, tol = tol) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == SGDRegressor: loss = args['params']['loss'] penalty = args['params']['penalty'] alpha = args['params']['alpha'] l1_ratio = args['params']['l1_ratio'] tol = args['params']['tol'] learning_rate = args['params']['learning_rate'] power_t = args['params']['power_t'] estimator = SGDRegressor(loss = loss, penalty = penalty, alpha = alpha, max_iter = 13000, l1_ratio = l1_ratio, tol = tol, learning_rate = learning_rate, power_t = power_t) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == ElasticNet: alpha = args['params']['alpha'] l1_ratio = args['params']['l1_ratio'] tol = args['params']['tol'] estimator = ElasticNet(alpha = alpha, l1_ratio = l1_ratio, tol = tol) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == Ridge: alpha = args['params']['alpha'] tol = args['params']['tol'] solver = args['params']['solver'] estimator = Ridge(alpha = alpha, tol = tol, solver = solver) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == KNeighborsRegressor: n_neighbors = args['params']['n_neighbors'] weights = args['params']['weights'] algorithm = args['params']['algorithm'] leaf_size = args['params']['leaf_size'] p = args['params']['p'] estimator = KNeighborsRegressor(n_neighbors = n_neighbors, weights = weights, algorithm = algorithm, leaf_size = leaf_size, p = p, n_jobs = -1) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == GaussianProcessRegressor: alpha = args['params']['alpha'] estimator = GaussianProcessRegressor(alpha = alpha) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == SVR: kernel = args['params']['kernel'] if kernel == 'poly': degree = args['params']['degree'] else: degree = 3 if kernel == 'rbf' or 'poly' or 'sigmoid': gamma = args['params']['gamma'] else: gamma = 'auto' tol = args['params']['tol'] C = args['params']['C'] shrinking = args['params']['shrinking'] estimator = SVR(kernel = kernel, degree = degree, gamma = gamma, tol = tol, C = C, shrinking = shrinking) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False) elif args['model'] == xgb: booster = args['params']['booster'] eta = args['params']['eta'] gamma = args['params']['gamma'] max_depth = args['params']['max_depth'] n_estimators = args['params']['n_estimators'] min_child_weight = args['params']['min_child_weight'] subsample = args['params']['subsample'] alpha = args['params']['alpha'] random_state = args['params']['random_state'] colsample_bytree = args['params']['colsample_bytree'] colsample_bylevel = args['params']['colsample_bylevel'] colsample_bynode = args['params']['colsample_bynode'] reg_lambda = args['params']['reg_lambda'] grow_policy = args['params']['grow_policy'] if booster == 'dart': sample_type = args['params']['sample_type'] normalize_type = args['params']['normalize_type'] rate_drop = args['params']['rate_drop'] skip_drop = args['params']['skip_drop'] if args['preprocessing'] != 
'PCA': k_features = args['k_features'] else: k_features = sample(scope.int(hp.quniform('k_features', 1, X.shape[1], 1))) if booster == 'gbtree': estimator = xgb.XGBRegressor(booster = booster, eta = eta, gamma = gamma, max_depth = max_depth, n_estimators = n_estimators, min_child_weight = min_child_weight, subsample = subsample, alpha = alpha, random_state = random_state, colsample_bytree = colsample_bytree, colsample_bylevel = colsample_bylevel, grow_policy = grow_policy, colsample_bynode = colsample_bynode, reg_lambda = reg_lambda, n_jobs = -1) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False, scoring = metrics_names[eval_metric]) elif booster == 'dart': num_round = 50 estimator = xgb.XGBRegressor(booster = booster, eta = eta, gamma = gamma, max_depth = max_depth, n_estimators = n_estimators, min_child_weight = min_child_weight, subsample = subsample, alpha = alpha, random_state = random_state, colsample_bytree = colsample_bytree, sample_type = sample_type, normalize_type = normalize_type, rate_drop = rate_drop, skip_drop = skip_drop, colsample_bylevel = colsample_bylevel, grow_policy = grow_policy, colsample_bynode = colsample_bynode, reg_lambda = reg_lambda, n_jobs = -1) reg = SFS(estimator, cv = 4, k_features = k_features, forward = True, floating = False, scoring = metrics_names[eval_metric]) if eval_metric == 'mse': x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 1 - percent_train, random_state = 1, shuffle = False) sfsl = reg.fit(X, Y) x_sfs = sfsl.transform(X) x_train_sfs = x_sfs[:length_train] x_test_sfs = x_sfs[length_train:] estimator.fit(x_train_sfs, y_train) if args['model'] == xgb: if booster == "gbtree": y_pred = estimator.predict(x_test_sfs) elif booster == "dart": y_pred = estimator.predict(x_test_sfs, ntree_limit = num_round) else: y_pred = estimator.predict(x_test_sfs) if args['preprocessing'] != 'NoTransform': predictions = y_pred.reshape(-1, 1) for i in range(predictions.shape[1]): if args['preprocessing'] != 'PCA': tmp = np.zeros((predictions.shape[0], n_features)) else: tmp = np.zeros((predictions.shape[0], X.shape[1])) tmp[:, 0] = predictions[:, i] predictions[:, i] = scaler.inverse_transform(tmp)[:, 0] mse = mean_squared_error(dataset[target][length_train:], predictions) print('mse value: {}, model: {}'.format(mse, args['model'])) return mse else: mse = mean_squared_error(dataset[target][length_train:], y_pred) print('mse value: {}, model: {}'.format(mse, args['model'])) return mse else: reg.fit(X, Y) print('Model: {}, r2 value: {}, Selected variables {}'.format(args['model'], reg.k_score_, reg.k_feature_names_)) loss_function = 1 - reg.k_score_ return loss_function
def training_model(self, param):
    model = ElasticNet(**param)
    return model
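# A hedged usage sketch (not part of the original source): training_model above just
# unpacks a parameter dict into the ElasticNet constructor, so the dict keys must be
# ordinary ElasticNet keyword arguments. The values below are illustrative placeholders.
from sklearn.linear_model import ElasticNet

param = {'alpha': 0.01, 'l1_ratio': 0.7, 'max_iter': 10000}
model = ElasticNet(**param)   # equivalent to what training_model(param) returns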
def simple_experiment(file_path): # Read data dta = pd.read_csv(file_path) dta_clean = dta # remove the null values, that is fill NaN with there - FIXME: Rihards, naive implementation dta_clean = dta_clean.fillna(value=0, axis=1) dta_clean = dta_clean.dropna() dta_clean = dta_clean.drop('Unnamed: 0', axis=1) ######################### ####### Models ########## ######################### models_class = [ AdaBoostClassifier(), BaggingClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(), RandomForestClassifier(), PassiveAggressiveClassifier(), LogisticRegression(), RidgeClassifier(), SGDClassifier(), GaussianNB(), MultinomialNB(), KNeighborsClassifier(), RadiusNeighborsClassifier(), NearestCentroid(), MLPClassifier(), SVC(), LinearSVC(), NuSVC(), DecisionTreeClassifier(), ExtraTreeClassifier() ] models_reg = [ AdaBoostRegressor(), BaggingRegressor(), ExtraTreesRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), ElasticNet(), HuberRegressor(), Lasso(), LassoLars(), LinearRegression(), PassiveAggressiveRegressor(), Ridge(), SGDRegressor(), OrthogonalMatchingPursuit(), RANSACRegressor(), KNeighborsRegressor(), RadiusNeighborsRegressor(), MLPRegressor(), SVR(), LinearSVR(), NuSVR(), DecisionTreeRegressor(), ExtraTreeRegressor() ] models_cfg = {} models_cfg[AdaBoostClassifier.__name__] = {} models_cfg[BaggingClassifier.__name__] = {} models_cfg[ExtraTreesClassifier.__name__] = {} models_cfg[GradientBoostingClassifier.__name__] = {} models_cfg[RandomForestClassifier.__name__] = {} models_cfg[PassiveAggressiveClassifier.__name__] = {} models_cfg[LogisticRegression.__name__] = {} models_cfg[RidgeClassifier.__name__] = {} models_cfg[SGDClassifier.__name__] = {} models_cfg[GaussianNB.__name__] = {} models_cfg[MultinomialNB.__name__] = {} models_cfg[KNeighborsClassifier.__name__] = {} models_cfg[RadiusNeighborsClassifier.__name__] = {} models_cfg[NearestCentroid.__name__] = {} models_cfg[MLPClassifier.__name__] = {} models_cfg[SVC.__name__] = {} models_cfg[LinearSVC.__name__] = {} models_cfg[NuSVC.__name__] = {} models_cfg[DecisionTreeClassifier.__name__] = {} models_cfg[ExtraTreeClassifier.__name__] = {} models_cfg[AdaBoostRegressor.__name__] = {} models_cfg[BaggingRegressor.__name__] = {} models_cfg[ExtraTreesRegressor.__name__] = {} models_cfg[GradientBoostingRegressor.__name__] = {} models_cfg[RandomForestRegressor.__name__] = {} models_cfg[BayesianRidge.__name__] = {} models_cfg[ElasticNet.__name__] = {} models_cfg[HuberRegressor.__name__] = {} models_cfg[Lars.__name__] = {} models_cfg[Lasso.__name__] = {} models_cfg[LassoLars.__name__] = {} models_cfg[LinearRegression.__name__] = {} models_cfg[PassiveAggressiveRegressor.__name__] = {} models_cfg[Ridge.__name__] = {} models_cfg[SGDRegressor.__name__] = {} models_cfg[OrthogonalMatchingPursuit.__name__] = {} models_cfg[RANSACRegressor.__name__] = {} models_cfg[TheilSenRegressor.__name__] = {} models_cfg[KNeighborsRegressor.__name__] = {} models_cfg[RadiusNeighborsRegressor.__name__] = {} models_cfg[MLPRegressor.__name__] = {} models_cfg[SVR.__name__] = {} models_cfg[LinearSVR.__name__] = {} models_cfg[NuSVR.__name__] = {} models_cfg[DecisionTreeRegressor.__name__] = {} models_cfg[ExtraTreeRegressor.__name__] = {} ##to run for multiple classes of data, add the tuples of x and y to the tuples array of data and decsription for the purposes logging. For now it is set to run for all the samples there are. 
For instance tuples_of_data = [(X,y, "all samples"), (X_1,y_1, "samples class1") , (X_2,y_2", "samples class2")] # for each tupple extracted from the array a new log file is going to be generated, so that each run is in a different log file. X_all = dta_clean.drop('worldwide_gross', axis=1) y_all = dta_clean['worldwide_gross'] desc = "quickReg" + file_path.replace('.', '').replace('/', '').replace( 'dataset', '').replace('csv', '') tuples_of_data = [(X_all, y_all, desc)] ######################### ### Start Regress######## ######################### orig_stdout = sys.stdout # save orig datetime and save orign stdout time = datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S") for ind, tupl in enumerate(tuples_of_data): with warnings.catch_warnings(): warnings.simplefilter("ignore") # restart the current itterator for each run global itter_current itter_current = 0 x_crr, y_crr, dsc = tupl trg = "regressRes_" + dsc + "_" + time + ".log" new_file = open(trg, "w") sys.stdout = new_file # set the itterator run to start from global itter_start itter_start = 0 run_for_many(x_crr, y_crr, dsc, models_reg, models_cfg) new_file.close() desc = "quickClass" + file_path.replace('.', '').replace('/', '').replace( 'dataset', '').replace('csv', '') labels = [label_gross_3, label_gross_2] #save orig datetime and save orign stdout orig_stdout = sys.stdout time = datetime.datetime.now().strftime("%Y_%m_%d_%H%M%S") for ind, cb in enumerate(labels): with warnings.catch_warnings(): warnings.simplefilter("ignore") #restart the current itterator for each run global itter_current itter_current = 0 trg = "classifyRes_" + desc + "_" + cb.__name__ + "_" + time + ".log" new_file = open(trg, "w") sys.stdout = new_file #set the itterator run to start from global itter_start itter_start = 0 x_crr = dta_clean.drop('worldwide_gross', axis=1) y_crr = dta_clean.worldwide_gross.apply(lambda gross: cb(gross)) dsc = desc + "_" + cb.__name__ run_for_many(x_crr, y_crr, dsc, models_class, models_cfg) new_file.close() # reassign the org stdout for some reason sys.stdout = orig_stdout
#from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet from sklearn.model_selection import GridSearchCV, cross_val_score #from sklearn.neighbors import KNeighborsRegressor #from sklearn.svm import SVR #from sklearn.tree import DecisionTreeRegressor #from xgboost import XGBRegressor # In[85]: models = [('LR', LinearRegression()), ("Ridge", Ridge()), ("Lasso", Lasso()), ("ElasticNet", ElasticNet())] # In[86]: for name, regressor in models: rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error"))) print(f"RMSE: {round(rmse, 4)} ({name}) ") # In[88]: model = Ridge(alpha=1.0) model.fit(X, y)
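# GridSearchCV is imported above but not used in this excerpt; a minimal sketch
# (added, with illustrative alpha values) of how it could tune the Ridge model on
# the same X and y:
ridge_params = {"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = GridSearchCV(Ridge(), ridge_params, cv=5, scoring="neg_mean_squared_error")
ridge_cv.fit(X, y)
print(f"best alpha: {ridge_cv.best_params_['alpha']}, "
      f"CV RMSE: {round(np.sqrt(-ridge_cv.best_score_), 4)}")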
test_rmse = np.sqrt(test_mse) return test_rmse def get_test_r2(model, test_x, test_y): """Return the r-squared of the trained model based on the test dataset.""" test_y_hat = model.predict(test_x) test_r2 = r2_score(test_y, test_y_hat) return test_r2 # %% dict_models_explore = { 'ridge': Ridge(), 'lasso': Lasso(), 'elastic_net': ElasticNet(), 'svr': SVR(), 'decision_tree': DecisionTreeRegressor(), 'random_forest': RandomForestRegressor(), 'extra_trees': ExtraTreesRegressor(), 'gradient_boosting': GradientBoostingRegressor() } dict_models = { 'lasso': Lasso(), 'elastic_net': ElasticNet(), 'random_forest': RandomForestRegressor(), 'gradient_boosting': GradientBoostingRegressor() } # %% train_set, test_set = train_test_split(df_Austin_all_data_adj,
def train_linear_model(X, y, random_state=1, test_size=0.2, regularization_type='elasticnet', k_fold=5, max_iter=1000000, tol=0.0001, l1_ratio=None): """ Function to train linear model with regularization and cross-validation. Args: X (pandas.DataFrame): dataframe of descriptors. y (pandas.DataFrame): dataframe of cycle lifetimes. random_state (int): seed for train/test split. test_size (float): proportion of the dataset reserved for model evaluation. regularization_type (str): lasso or ridge or elastic-net (with cv). k_fold (int): k in k-fold cross-validation. max_iter (int): maximum number of iterations for model fitting. tol (float): tolerance for optimization. l1_ratio ([float]): list of lasso to ridge ratios for elasticnet. Returns: sklearn.linear_model.LinearModel: fitted model. mu (float): Mean value of descriptors used in training. s (float): Std dev of descriptors used in training. """ if l1_ratio is None: l1_ratio = [.1, .5, .7, .9, .95, 1] X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=test_size, random_state=random_state) # Standardize (training) data after train/test split mu = np.mean(X_train, axis=0) s = np.std(X_train, axis=0) X_scaled = (X_train - mu) / s hyperparameters = { 'random_state': random_state, 'test_size': test_size, 'k_fold': k_fold, 'tol': tol, 'max_iter': max_iter } if regularization_type == 'lasso' and y.shape[1] == 1: lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol, cv=k_fold, max_iter=max_iter) lassocv.fit(X_scaled, y_train.values.ravel()) # Set optimal alpha and refit model alpha_opt = lassocv.alpha_ linear_model = Lasso(fit_intercept=True, alpha=alpha_opt, max_iter=max_iter) linear_model.fit(X_scaled, y_train.values) hyperparameters['l1_ratio'] = 1 elif regularization_type == 'ridge' and y.shape[1] == 1: ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold) ridgecv.fit(X_scaled, y_train.values.ravel()) # Set optimal alpha and refit model alpha_opt = ridgecv.alpha_ linear_model = Ridge(fit_intercept=True, alpha=alpha_opt) linear_model.fit(X_scaled, y_train) hyperparameters['l1_ratio'] = 0 elif regularization_type == 'elasticnet' and y.shape[1] == 1: elasticnetcv = ElasticNetCV(fit_intercept=True, normalize=False, alphas=None, cv=k_fold, l1_ratio=l1_ratio, max_iter=max_iter) elasticnetcv.fit(X_scaled, y_train.values.ravel()) # Set optimal alpha and l1_ratio. Refit model alpha_opt = elasticnetcv.alpha_ l1_ratio_opt = elasticnetcv.l1_ratio_ linear_model = ElasticNet(fit_intercept=True, normalize=False, l1_ratio=l1_ratio_opt, alpha=alpha_opt, max_iter=max_iter) linear_model.fit(X_scaled, y_train) hyperparameters['l1_ratio'] = l1_ratio_opt # If more than 1 outcome present, perform multitask regression elif regularization_type == 'elasticnet' and y.shape[1] > 1: multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True, cv=k_fold, normalize=False, l1_ratio=l1_ratio, max_iter=max_iter) multi_elasticnet_CV.fit(X_scaled, y_train) # Set optimal alpha and l1_ratio. 
Refit model alpha_opt = multi_elasticnet_CV.alpha_ l1_ratio_opt = multi_elasticnet_CV.l1_ratio_ linear_model = MultiTaskElasticNet(fit_intercept=True, normalize=False, max_iter=max_iter) linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt) linear_model.fit(X_scaled, y_train) hyperparameters['l1_ratio'] = l1_ratio_opt else: raise NotImplementedError y_pred = linear_model.predict((X_test - mu) / s) Rsq = linear_model.score((X_test - mu) / s, y_test) # Compute 95% confidence interval # Multioutput = 'raw_values' provides prediction error per output pred_actual_ratio = [x / y for x, y in zip(y_pred, np.array(y_test))] relative_prediction_error = 1.96 * np.sqrt( mean_squared_error(np.ones(y_pred.shape), pred_actual_ratio, multioutput='raw_values') / y_pred.shape[0]) hyperparameters['alpha'] = alpha_opt return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
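# Hedged usage sketch (not from the original source): train_linear_model expects
# descriptor and outcome DataFrames, so a call on synthetic data might look like the
# following. Column names, sizes, and values are made up for illustration only.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.randn(100, 5), columns=["d%d" % i for i in range(5)])
y_demo = pd.DataFrame({"cycle_life": 2.0 * X_demo["d0"] + 0.1 * rng.randn(100)})

model, mu, s, rel_err, Rsq, hyperparams = train_linear_model(
    X_demo, y_demo, regularization_type="elasticnet", k_fold=5)
print(Rsq, hyperparams["alpha"], hyperparams["l1_ratio"])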
print(score) print(score.mean()) # In[ ]: X_train[f_selected].shape # In[ ]: from sklearn.linear_model import ElasticNet elasticnet_reg = ElasticNet(l1_ratio=0.8, random_state=0) score = cross_val_score(elasticnet_reg, X_train[f_selected], y_train, cv=10) print(score) print(score.mean()) # In[ ]: from sklearn.ensemble import RandomForestRegressor randomfor_reg = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42) score = cross_val_score(randomfor_reg, X_train[f_selected], y_train, cv=10) print(score) print(score.mean())
def chooseAlgorithm(problemType, features, targets): if 'Classification' in problemType: models = { 'RFC': RandomForestClassifier(), 'ETC': ExtraTreesClassifier(), 'GNB': GaussianNB(), 'MNB': MultinomialNB(), 'KNC': KNeighborsClassifier(n_neighbors=round(sqrt(len(features.index)))), 'SVC': SVC(), 'LSVC': LinearSVC(), 'LGR': LogisticRegression(), 'LDA': LinearDiscriminantAnalysis(), 'SDGC': SGDClassifier() } elif 'Regression' in problemType: models = { 'RFR': RandomForestRegressor(), 'ETR': ExtraTreesRegressor(), 'LNR': LinearRegression(), 'SDGR': SGDRegressor(), 'KNR': KNeighborsRegressor(n_neighbors=round(sqrt(len(features.index)))), 'SVR': SVR(), 'LSVR': LinearSVR(), 'Lasso': Lasso(), 'ENET': ElasticNet(), 'Ridge': Ridge() } else: raise TypeError([ 'expected either \'classification\' or \'regression\' as problem type' ]) results = {} X_train, X_test, y_train, y_test = train_test_split( features, targets.values.ravel()) for name, model in models.items(): model.fit(X_train, y_train) score = model.score(X_test, y_test) results[name] = score bestModelScore = sorted(results.items(), key=lambda x: x[1], reverse=True)[0] model = models[bestModelScore[0]].fit(features, targets) return model
# print("The ALL data size before: {} ".format(all_data.shape)) all_data = process_data(all_data) # print("The ALL data size after: {} \n".format(all_data.shape)) df_train = all_data[:n_train] X_train = df_train.values df_test = all_data[n_train:] regressor = RandomForestRegressor(n_estimators=300, random_state=0) score = rmsle_cv(regressor, X_train, y_train) print("\nRandomForestRegressor score: {:.4f} ({:.4f})\n".format( score.mean(), score.std())) lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1)) ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) KRR = KernelRidge(alpha=0.6, kernel="polynomial", degree=2, coef0=2.5) GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features="sqrt", min_samples_leaf=15, min_samples_split=10, loss="huber", random_state=5) stacked_averaged_models = StackingAveragedModels(base_models=(ENet, GBoost, KRR), meta_model=lasso) score = rmsle_cv(lasso, X_train, y_train) print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
lasso.fit(x_train,y_train) RLasso=['RL-Lasso', "%.4f" % lasso.score(x_train,y_train), "%.4f" % lasso.score(x_test,y_test)] ### Elastic Net Regression ### #ENet = GridSearchCV(ElasticNet(random_state=3),param_grid={'alpha':np.logspace(-6,-2,10), # 'l1_ratio':np.linspace(0.5,0.9,5)}, cv=5) ENet = ElasticNet(alpha=0.00046,l1_ratio=0.9,random_state=3) ENet.fit(x_train,y_train) ElasticN=['RL-ENet', "%.4f" % ENet.score(x_train,y_train), "%.4f" % ENet.score(x_test,y_test)] ### Kernel Ridge Regression ### #KRidge = GridSearchCV(KernelRidge(kernel='polynomial',degree=2,coef0=2.5), # param_grid={'alpha':np.linspace(0.7,0.9,10), # 'gamma': np.logspace(-5,-3,10)}, cv=5) KRidge = KernelRidge(kernel='polynomial', degree=2, coef0=2.5, alpha=0.9 ,gamma=1e-04)
data = pd.read_csv("wine-quality.csv") # Split the data into training and test sets. (0.75, 0.25) split. train, test = train_test_split(data) # The predicted column is "quality" which is a scalar from [3, 9] train_x = train.drop(["quality"], axis=1) test_x = test.drop(["quality"], axis=1) train_y = train[["quality"]] test_y = test[["quality"]] alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5 l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5 with mlflow.start_run(): lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) lr.fit(train_x, train_y) predicted_qualities = lr.predict(test_x) (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities) print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio)) print(" RMSE: %s" % rmse) print(" MAE: %s" % mae) print(" R2: %s" % r2) mlflow.log_param("alpha", alpha) mlflow.log_param("l1_ratio", l1_ratio) mlflow.log_metric("rmse", rmse) mlflow.log_metric("r2", r2)
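# The eval_metrics helper called above is not shown in this excerpt; a sketch
# consistent with the (rmse, mae, r2) tuple it returns might look like this
# (assumed, not taken from the original file):
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2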
reg_features_2['intercept'] = 1.0

# =============================================================================
# from sklearn import linear_model
# reg = linear_model.Lasso(alpha=0.1, fit_intercept=True, normalize=True, precompute=True,
#                          copy_X=True, max_iter=1000, tol=0.0001, warm_start=True,
#                          positive=True, random_state=42, selection='cyclic')
# =============================================================================
from sklearn.linear_model import ElasticNet

reg_1 = ElasticNet(alpha=0.8, l1_ratio=0.8, fit_intercept=True, normalize=False,
                   precompute=True, copy_X=True, max_iter=1000, tol=0.001,
                   warm_start=True, positive=True, random_state=42, selection='cyclic')
reg_2 = ElasticNet(alpha=0.7, l1_ratio=1, fit_intercept=True, normalize=False,
                   precompute=True, copy_X=True, max_iter=1000, tol=0.001,
                   warm_start=True, positive=True,
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

__author__ = 'lebaishi'

X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1)

elastic_net = ElasticNet(alpha=0.0001, l1_ratio=0.15)
elastic_net.fit(X, y)
print(elastic_net.predict([[1.5]]))   # predict expects a 2D array of shape (n_samples, n_features)

sgd_reg = SGDRegressor(penalty='elasticnet', max_iter=1000)
sgd_reg.fit(X, y.ravel())
print(sgd_reg.predict([[1.5]]))
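# Added note: the two fits above are meant to be comparable. SGDRegressor's defaults
# are alpha=0.0001 and l1_ratio=0.15, the same values passed to ElasticNet here;
# ElasticNet solves the penalized least-squares problem by coordinate descent while
# SGDRegressor fits the same elastic-net penalty by stochastic gradient descent, so
# the two coefficient estimates are typically close but not identical on this toy data.
print(elastic_net.coef_, elastic_net.intercept_)
print(sgd_reg.coef_, sgd_reg.intercept_)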
def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100): assert l1_ratio >= 0 and l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format( self.RECOMMENDER_NAME, l1_ratio) self.l1_ratio = l1_ratio self.positive_only = positive_only self.topK = topK # Display ConvergenceWarning only once and not for every item it occurs warnings.simplefilter("once", category=ConvergenceWarning) # initialize the ElasticNet model self.model = ElasticNet(alpha=alpha, l1_ratio=self.l1_ratio, positive=self.positive_only, fit_intercept=False, copy_X=False, precompute=True, selection='random', max_iter=100, tol=1e-4) URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32) n_items = URM_train.shape[1] # Use array as it reduces memory requirements compared to lists dataBlock = 10000000 rows = np.zeros(dataBlock, dtype=np.int32) cols = np.zeros(dataBlock, dtype=np.int32) values = np.zeros(dataBlock, dtype=np.float32) numCells = 0 start_time = time.time() start_time_printBatch = start_time # fit each item's factors sequentially (not in parallel) for currentItem in range(n_items): # get the target column y = URM_train[:, currentItem].toarray() # set the j-th column of X to zero start_pos = URM_train.indptr[currentItem] end_pos = URM_train.indptr[currentItem + 1] current_item_data_backup = URM_train.data[start_pos:end_pos].copy() URM_train.data[start_pos:end_pos] = 0.0 # fit one ElasticNet model per column self.model.fit(URM_train, y) # self.model.coef_ contains the coefficient of the ElasticNet model # let's keep only the non-zero values # Select topK values # Sorting is done in three steps. Faster then plain np.argsort for higher number of items # - Partition the data to extract the set of relevant items # - Sort only the relevant items # - Get the original item index nonzero_model_coef_index = self.model.sparse_coef_.indices nonzero_model_coef_value = self.model.sparse_coef_.data local_topK = min(len(nonzero_model_coef_value) - 1, self.topK) relevant_items_partition = ( -nonzero_model_coef_value ).argpartition(local_topK)[0:local_topK] relevant_items_partition_sorting = np.argsort( -nonzero_model_coef_value[relevant_items_partition]) ranking = relevant_items_partition[ relevant_items_partition_sorting] for index in range(len(ranking)): if numCells == len(rows): rows = np.concatenate( (rows, np.zeros(dataBlock, dtype=np.int32))) cols = np.concatenate( (cols, np.zeros(dataBlock, dtype=np.int32))) values = np.concatenate( (values, np.zeros(dataBlock, dtype=np.float32))) rows[numCells] = nonzero_model_coef_index[ranking[index]] cols[numCells] = currentItem values[numCells] = nonzero_model_coef_value[ranking[index]] numCells += 1 # finally, replace the original values of the j-th column URM_train.data[start_pos:end_pos] = current_item_data_backup elapsed_time = time.time() - start_time new_time_value, new_time_unit = seconds_to_biggest_unit( elapsed_time) if time.time( ) - start_time_printBatch > 300 or currentItem == n_items - 1: self._print( "Processed {} ( {:.2f}% ) in {:.2f} {}. Items per second: {:.2f}" .format(currentItem + 1, 100.0 * float(currentItem + 1) / n_items, new_time_value, new_time_unit, float(currentItem) / elapsed_time)) sys.stdout.flush() sys.stderr.flush() start_time_printBatch = time.time() # generate the sparse weight matrix self.W_sparse = sps.csr_matrix( (values[:numCells], (rows[:numCells], cols[:numCells])), shape=(n_items, n_items), dtype=np.float32)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(1335)  # for reproducibility

g_models = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), KNeighborsRegressor(),
            DecisionTreeRegressor(), SVR(), RandomForestRegressor(), AdaBoostRegressor(),
            GradientBoostingRegressor()]
#g_models = [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()]
g_idx = 0

NUM_LONG_POSITIONS = 20
NUM_SHORT_POSITIONS = 20

#start = '2016-8-10'  # must be a trading day on the domestic (China A-share) market
#end = '2017-8-11'    # must be a trading day on the domestic (China A-share) market

c, _ = get_sector_class()
ONEHOTCLASS = tuple(c)
hs300 = ts.get_hs300s()['code']
# score = numpy.sqrt(-numpy.mean(cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv))) score = numpy.sum(-cross_val_score( model, X, y, scoring='neg_mean_squared_error', cv=cv)) # v. 14 scores.append(score) results.append({ "model": "ridge", "fold": fold, "alpha": alpha, "rmse": score, "coefs": model.coef_ }) # print("ridge alpha:", alpha, "fold:", fold, "score:", score) # print("coefs:", model.coef_, "\n") model = ElasticNet(alpha=alpha).fit(X, y) # score = numpy.sqrt(-numpy.mean(cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv))) score = numpy.sum(-cross_val_score( model, X, y, scoring='neg_mean_squared_error', cv=cv)) # v. 14 scores.append(score) results.append({ "model": "elastic", "fold": fold, "alpha": alpha, "rmse": score, "coefs": model.coef_ }) # print("elastic alpha:", alpha, "fold:", fold, "score:", score) # print("coefs:", model.coef_, "\n")
# Lasso from sklearn.linear_model import Lasso alpha = 0.1 lasso = Lasso(alpha=alpha) y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test) r2_score_lasso = r2_score(y_test, y_pred_lasso) print(lasso) print("r^2 on test data : %f" % r2_score_lasso) # ############################################################################# # ElasticNet from sklearn.linear_model import ElasticNet enet = ElasticNet(alpha=alpha, l1_ratio=0.7) y_pred_enet = enet.fit(X_train, y_train).predict(X_test) r2_score_enet = r2_score(y_test, y_pred_enet) print(enet) print("r^2 on test data : %f" % r2_score_enet) m, s, _ = plt.stem(np.where(enet.coef_)[0], enet.coef_[enet.coef_ != 0], markerfmt='x', label='Elastic net coefficients') plt.setp([m, s], color="#2ca02c") m, s, _ = plt.stem(np.where(lasso.coef_)[0], lasso.coef_[lasso.coef_ != 0], markerfmt='x', label='Lasso coefficients')
ax = fig.add_subplot(111) cax = ax.matshow(dataframe.corr(), vmin=-1, vmax=1) fig.colorbar(cax) ticks = numpy.arange(0,31,1) ax.set_xticks(ticks) ax.set_yticks(ticks) ax.set_xticklabels(dataframe.columns) ax.set_yticklabels(dataframe.columns) num_instances = len(X) models = [] models.append(('LiR', LinearRegression())) models.append(('Ridge', Ridge())) models.append(('Lasso', Lasso())) models.append(('ElasticNet', ElasticNet())) models.append(('Bag_Re', BaggingRegressor())) models.append(('RandomForest', RandomForestRegressor())) models.append(('ExtraTreesRegressor', ExtraTreesRegressor())) models.append(('KNN', KNeighborsRegressor())) models.append(('CART', DecisionTreeRegressor())) models.append(('SVM', SVR())) # Evaluations results = [] names = [] scoring = [] for name, model in models: # Fit the model model.fit(X, Y)
def get_model_from_name(model_name, training_params=None, is_hp_search=False): global keras_imported # For Keras epochs = 1000 # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning': # print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy') # epochs = 100 all_model_params = { 'LogisticRegression': {}, 'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30}, 'ExtraTreesClassifier': {'n_jobs': -1}, 'AdaBoostClassifier': {}, 'SGDClassifier': {'n_jobs': -1}, 'Perceptron': {'n_jobs': -1}, 'LinearSVC': {'dual': False}, 'LinearRegression': {'n_jobs': -2}, 'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30}, 'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'}, 'ExtraTreesRegressor': {'n_jobs': -1}, 'MiniBatchKMeans': {'n_clusters': 8}, 'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.1, 'warm_start': True}, 'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.1, 'warm_start': True}, 'SGDRegressor': {'shuffle': False}, 'PassiveAggressiveRegressor': {'shuffle': False}, 'AdaBoostRegressor': {}, 'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8, 'lambda_l2': 0.001, 'histogram_pool_size': 16384}, 'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8, 'lambda_l2': 0.001, 'histogram_pool_size': 16384}, 'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2}, 'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2}, 'CatBoostRegressor': {}, 'CatBoostClassifier': {} } # if os.environ.get('is_test_suite', 0) == 'True': # all_model_params model_params = all_model_params.get(model_name, None) if model_params is None: model_params = {} if is_hp_search == True: if model_name[:12] == 'DeepLearning': model_params['epochs'] = 50 if model_name[:4] == 'LGBM': model_params['n_estimators'] = 500 if training_params is not None: print('Now using the model training_params that you passed in:') print(training_params) # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it) model_params.update(training_params) print('After overwriting our defaults with your values, here are the final params that will be used to initialize the model:') print(model_params) model_map = { # Classifiers 'LogisticRegression': LogisticRegression(), 'RandomForestClassifier': RandomForestClassifier(), 'RidgeClassifier': RidgeClassifier(), 'GradientBoostingClassifier': GradientBoostingClassifier(), 'ExtraTreesClassifier': ExtraTreesClassifier(), 'AdaBoostClassifier': AdaBoostClassifier(), 'LinearSVC': LinearSVC(), # Regressors 'LinearRegression': LinearRegression(), 'RandomForestRegressor': RandomForestRegressor(), 'Ridge': Ridge(), 'LinearSVR': LinearSVR(), 'ExtraTreesRegressor': ExtraTreesRegressor(), 'AdaBoostRegressor': AdaBoostRegressor(), 'RANSACRegressor': RANSACRegressor(), 'GradientBoostingRegressor': GradientBoostingRegressor(), 'Lasso': Lasso(), 'ElasticNet': ElasticNet(), 'LassoLars': LassoLars(), 'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(), 'BayesianRidge': BayesianRidge(), 'ARDRegression': ARDRegression(), # Clustering 'MiniBatchKMeans': MiniBatchKMeans(), } try: model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001) model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001) model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(max_iter=1000, 
tol=0.001) model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001) model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(max_iter=1000, tol=0.001) except TypeError: model_map['SGDClassifier'] = SGDClassifier() model_map['Perceptron'] = Perceptron() model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier() model_map['SGDRegressor'] = SGDRegressor() model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor() if xgb_installed: model_map['XGBClassifier'] = XGBClassifier() model_map['XGBRegressor'] = XGBRegressor() if lgb_installed: model_map['LGBMRegressor'] = LGBMRegressor() model_map['LGBMClassifier'] = LGBMClassifier() if catboost_installed: model_map['CatBoostRegressor'] = CatBoostRegressor(calc_feature_importance=True) model_map['CatBoostClassifier'] = CatBoostClassifier(calc_feature_importance=True) if model_name[:12] == 'DeepLearning': if keras_imported == False: # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead) try: os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3' os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' from tensorflow import logging logging.set_verbosity(logging.INFO) except: pass global maxnorm global Dense, Dropout global LeakyReLU, PReLU, ThresholdedReLU, ELU global Sequential global keras_load_model global regularizers, optimizers global Activation global KerasRegressor, KerasClassifier from keras.constraints import maxnorm from keras.layers import Activation, Dense, Dropout from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU from keras.models import Sequential from keras.models import load_model as keras_load_model from keras import regularizers, optimizers from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier keras_imported = True model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier) model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model) try: model_without_params = model_map[model_name] except KeyError as e: print('It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize') raise(e) if os.environ.get('is_test_suite', False) == 'True': if 'n_jobs' in model_params: model_params['n_jobs'] = 1 model_with_params = model_without_params.set_params(**model_params) return model_with_params
def enet_solve(c, b):
    regr = ElasticNet(random_state=0, max_iter=10000)
    regr.fit(c, b)
    return regr.coef_
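# Hedged usage sketch (not in the original source): enet_solve returns only the fitted
# coefficient vector, so it can serve as a simple penalized least-squares solver for a
# system c @ x ≈ b. The data below is synthetic; with the default alpha=1.0 the
# recovered coefficients are noticeably shrunk toward zero.
import numpy as np

rng = np.random.RandomState(0)
c = rng.randn(50, 10)
x_true = np.zeros(10)
x_true[:3] = [1.0, -2.0, 0.5]
b = c @ x_true + 0.01 * rng.randn(50)
print(np.round(enet_solve(c, b), 2))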
# binary log loss helper (scipy is assumed imported as sp)
def logloss(act, pred):
    ll = sum(act * sp.log(pred) + sp.subtract(1, act) * sp.log(sp.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    return ll

# add two columns for hour and weekday
def dayhour(timestr):
    d = datetime.strptime(str(timestr), "%y%m%d%H")
    return [float(d.weekday()), float(d.hour)]

fh = FeatureHasher(n_features=2**20, input_type="string")

# Train the model on the hashed categorical features, chunk by chunk.
# Note: ElasticNet is a regressor (used here to score the binary "click" target) and has
# no partial_fit, so each chunk's fit() overwrites the previous one; `cols` is assumed
# to be defined earlier.
clf = ElasticNet()
train = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(
        pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.fit(Xcat, y_train)

# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(
# Lasso regression model
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train)
lasso_y_pred = reg.predict(X_test)
reg.coef_
reg.intercept_
print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, lasso_y_pred))
print('Variance score: %.2f' % r2_score(diabetes_y_test, lasso_y_pred))

# Elastic net
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
X, y = make_regression(n_features=2, random_state=0)  # note: this synthetic data is not used below; the fit is on X_train
regr = ElasticNet(random_state=0)
regr.fit(X_train, y_train)
print(regr.coef_)
print(regr.intercept_)
y_pred_elas = regr.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, y_pred_elas))
print('Variance score: %.2f' % r2_score(diabetes_y_test, y_pred_elas))

# Ridge regression
from sklearn import linear_model
reg = linear_model.Ridge(alpha=5)
reg.fit(X_train, y_train)
reg.coef_
reg.intercept_
def __init__(self, mod_name, mod_params={}, param_grid={}, outdir='./'): self.mod_name = mod_name self.metric = None self.mod_params = mod_params self.outdir = outdir self.param_grid = param_grid # regression if self.mod_name == 'lm': self.model = LinearRegression() # linear regression model self.metric = 'R2' elif self.mod_name == 'elasticNet': alpha = mod_params['alpha'] if 'alpha' in mod_params else 0.03 l1_ratio = mod_params[ 'l1_ratio'] if 'l1_ratio' in mod_params else 0.5 self.model = ElasticNet( alpha=alpha, l1_ratio=l1_ratio) #l1=lasso; l2=ridge; l1_ratio=1 means l1 self.metric = 'R2' elif self.mod_name == 'rf': n_estimators = mod_params[ 'n_estimators'] if 'n_estimators' in mod_params else 100 max_depth = mod_params[ 'max_depth'] if 'max_depth' in mod_params else 10 min_samples_leaf = mod_params[ 'min_samples_leaf'] if 'min_samples_leaf' in mod_params else 5 max_features = mod_params[ 'max_features'] if 'max_features' in mod_params else 'sqrt' self.model = RandomForestRegressor( n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, max_features=max_features, oob_score=True, random_state=42) #random forest self.metric = 'R2' elif self.mod_name == 'svr': self.model = SVR(kernel=mod_params['kernel'], C=mod_params['C']) #SVR linear kernel self.metric = 'R2' elif self.mod_name == 'dummy_reg': self.model = DummyRegressor(quantile=0.5) self.metric = 'R2' elif self.mod_name == 'mlp': # TODO note neural-net based models were tested separately, some of the code are integrated here, but \ # MLPs are not fully functional yet with the libraries written here K.clear_session() input_data = Input(shape=(mod_params['feat_size'], )) for n in range(mod_params['num_layers']): if (n == 0): x = input_data #dense_name = 'Dense_%d' % n x = Dense(64 * (2**(2 * (mod_params['num_layers'] - n - 1))), kernel_regularizer=regularizers.l1(0.01))(x) x = BatchNormalization()(x) x = Activation('relu')(x) x = Dropout(0.5)(x) encoded = Dense(1)(x) model = Model(input_data, encoded) model.compile(optimizer='adam', loss='mean_squared_error') self.model = model self.metric = 'R2' # classification elif self.mod_name == 'mlp_classify': K.clear_session() input_data = Input(shape=(mod_params['feat_size'], )) for n in range(mod_params['num_layers']): if (n == 0): x = input_data # dense_name = 'Dense_%d' % n x = Dense(64 * (2**(2 * (mod_params['num_layers'] - n - 1))), kernel_regularizer=regularizers.l1(0.01))(x) x = BatchNormalization()(x) x = Activation('relu')(x) x = Dropout(0.5)(x) encoded = Dense(1, activation='sigmoid')(x) model = Model(input_data, encoded) model.compile(optimizer='adam', loss='binary_crossentropy') self.model = model self.metric = 'AUC' elif self.mod_name == 'logit': self.model = LogisticRegression(penalty='l2', class_weight='balanced') self.metric = 'AUC' elif self.mod_name == 'rfc': n_estimators = mod_params[ 'n_estimators'] if 'n_estimators' in mod_params else 100 max_depth = mod_params[ 'max_depth'] if 'max_depth' in mod_params else 10 min_samples_leaf = mod_params[ 'min_samples_leaf'] if 'min_samples_leaf' in mod_params else 5 max_features = mod_params[ 'max_features'] if 'max_features' in mod_params else 'sqrt' self.model = RandomForestClassifier( n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, max_features=max_features, oob_score=True, class_weight='balanced', random_state=42) self.metric = 'AUC' elif self.mod_name == 'xgboost': # Not in use - XGBClassifier has additional requirements that need to be configured # n_estimators = 
mod_params['n_estimators'] if 'n_estimators' in mod_params else 100 # self.model = XGBClassifier(n_estimators=n_estimators) # self.metric = 'AUC' self.model = None self.metric = 'AUC'
regressors = [ LinearRegression(fit_intercept=SET_FIT_INTERCEPT), Ridge(alpha=1, solver='cholesky', fit_intercept=SET_FIT_INTERCEPT, normalize=False, random_state=RANDOM_SEED), Lasso(alpha=0.1, max_iter=10000, tol=0.01, fit_intercept=SET_FIT_INTERCEPT, random_state=RANDOM_SEED), ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000, tol=0.01, fit_intercept=SET_FIT_INTERCEPT, normalize=False, random_state=RANDOM_SEED) ] from sklearn.model_selection import KFold N_FOLDS = 506 # leave one out cross validation N = number of observations cv_results = np.zeros( (N_FOLDS, len(names) )) # array with as many rows as folds and as many columns for names dv_results = np.zeros((N_FOLDS, len(names)))
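# A hedged sketch (not in the original) of the evaluation loop that the arrays above
# are set up for; `names`, `X`, and `y` are assumed to be defined earlier (one display
# name per regressor, plus the design matrix and target array).
kf = KFold(n_splits=N_FOLDS, shuffle=False)
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    for j, reg in enumerate(regressors):
        reg.fit(X[train_idx], y[train_idx])
        pred = reg.predict(X[test_idx])
        cv_results[fold, j] = np.sqrt(np.mean((y[test_idx] - pred) ** 2))  # per-fold RMSE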