def test_multitask_enet_and_lasso_cv():
    """Check selected alphas and fitted-attribute shapes of the multi-task CV models."""
    # Default alpha grid on a 50-feature / 3-target problem: the selected
    # alpha is compared against reference values to 3 decimals.
    X, y, _, _ = build_dataset(n_features=50, n_targets=3)
    enet_cv = MultiTaskElasticNetCV(cv=3)
    enet_cv.fit(X, y)
    assert_almost_equal(enet_cv.alpha_, 0.00556, 3)
    lasso_cv = MultiTaskLassoCV(cv=3)
    lasso_cv.fit(X, y)
    assert_almost_equal(lasso_cv.alpha_, 0.00278, 3)

    # Elastic net with two candidate l1_ratio values: path attributes carry
    # one leading entry per l1_ratio.
    n_targets, n_alphas, n_folds = 3, 10, 3
    l1_ratios = [0.3, 0.5]
    X, y, _, _ = build_dataset(n_targets=n_targets)
    n_features = X.shape[1]
    enet_cv = MultiTaskElasticNetCV(n_alphas=n_alphas, eps=1e-3,
                                    max_iter=100, l1_ratio=l1_ratios,
                                    tol=1e-3, cv=n_folds)
    enet_cv.fit(X, y)
    assert enet_cv.l1_ratio_ == 0.5
    assert enet_cv.coef_.shape == (n_targets, n_features)
    assert enet_cv.intercept_.shape == (n_targets, )
    assert enet_cv.mse_path_.shape == (len(l1_ratios), n_alphas, n_folds)
    assert enet_cv.alphas_.shape == (len(l1_ratios), n_alphas)

    # Lasso has no l1_ratio axis, so the path attributes lose one dimension.
    X, y, _, _ = build_dataset(n_targets=n_targets)
    n_features = X.shape[1]
    lasso_cv = MultiTaskLassoCV(n_alphas=n_alphas, eps=1e-3, max_iter=100,
                                tol=1e-3, cv=n_folds)
    lasso_cv.fit(X, y)
    assert lasso_cv.coef_.shape == (n_targets, n_features)
    assert lasso_cv.intercept_.shape == (n_targets, )
    assert lasso_cv.mse_path_.shape == (n_alphas, n_folds)
    assert len(lasso_cv.alphas_) == n_alphas
def test_enet_l1_ratio():
    """Check l1_ratio=0 handling for the single-task and multi-task CV estimators.

    Without an explicit ``alphas`` grid, fitting with ``l1_ratio=0`` must
    raise ``ValueError`` with a fixed message; with an explicit grid it must
    give (almost) the same coefficients as a tiny positive ``l1_ratio``.
    """
    msg = ("Automatic alpha grid generation is not supported for l1_ratio=0. "
           "Please supply a grid by providing your estimator with the "
           "appropriate `alphas=` argument.")
    X = np.array([[1, 2, 4, 5, 8], [3, 5, 7, 7, 8]]).T
    y = np.array([12, 10, 11, 21, 5])

    # (estimator class, target in the shape that estimator expects)
    cases = ((ElasticNetCV, y), (MultiTaskElasticNetCV, y[:, None]))

    # An estimator relying on the automatic alpha grid rejects l1_ratio=0.
    for estimator_cls, target in cases:
        unfitted = estimator_cls(l1_ratio=0, random_state=42)
        assert_raise_message(ValueError, msg, unfitted.fit, X, target)

    # l1_ratio=0 is allowed if we supply a grid manually.
    estkwds = {'alphas': [0.1, 10], 'random_state': 42}
    for estimator_cls, target in cases:
        est_desired = estimator_cls(l1_ratio=0.00001, **estkwds)
        est = estimator_cls(l1_ratio=0, **estkwds)
        with ignore_warnings():
            est_desired.fit(X, target)
            est.fit(X, target)
        assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)
def test_enet_path():
    """Check ElasticNetCV / MultiTaskElasticNetCV model selection on dense problems."""
    # Many samples and many informative features, so that the selected
    # l1_ratio leans toward ridge rather than lasso.
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150
    # Deliberately few iterations (the solver might not converge): this keeps
    # the test fast, and the resulting warnings are silenced at fit time.
    shared_kwargs = dict(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                         l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter)

    plain = ElasticNetCV(**shared_kwargs)
    ignore_warnings(plain.fit)(X, y)
    # Well-conditioned settings: the smallest penalty should win ...
    assert_almost_equal(plain.alpha_, min(plain.alphas_))
    # ... and, with a non-sparse ground truth, so should the l1_ratio that is
    # closest to ridge.
    assert plain.l1_ratio_ == min(plain.l1_ratio)

    precomputed = ElasticNetCV(precompute=True, **shared_kwargs)
    ignore_warnings(precomputed.fit)(X, y)
    assert_almost_equal(precomputed.alpha_, min(precomputed.alphas_))
    assert precomputed.l1_ratio_ == min(precomputed.l1_ratio)
    # Low noise, well-conditioned: test-set performance should be good.
    assert precomputed.score(X_test, y_test) > 0.99

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    multi = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7],
                                  cv=3, max_iter=max_iter)
    ignore_warnings(multi.fit)(X, y)
    assert multi.score(X_test, y_test) > 0.99
    assert multi.coef_.shape == (3, 10)

    # A single target must lead to the same cross-validated alpha_ and
    # l1_ratio_ whether fitted as single-task or as one-column multi-task.
    X, y, _, _ = build_dataset(n_features=10)
    single = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    single.fit(X, y)
    one_column = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3,
                                       l1_ratio=[0.5, 0.7])
    one_column.fit(X, y[:, np.newaxis])
    assert_almost_equal(single.l1_ratio_, one_column.l1_ratio_)
    assert_almost_equal(single.alpha_, one_column.alpha_)
def test_1d_multioutput_enet_and_multitask_enet_cv():
    """A one-column multi-task fit must match the equivalent single-task fit."""
    X, y_flat, _, _ = build_dataset(n_features=10)
    y_column = y_flat[:, np.newaxis]
    cv_kwargs = dict(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])

    single = ElasticNetCV(**cv_kwargs)
    single.fit(X, y_column[:, 0])
    multi = MultiTaskElasticNetCV(**cv_kwargs)
    multi.fit(X, y_column)

    # Same selected hyper-parameters ...
    assert_almost_equal(single.l1_ratio_, multi.l1_ratio_)
    assert_almost_equal(single.alpha_, multi.alpha_)
    # ... and same model, once the multi-task target axis is indexed away.
    assert_almost_equal(single.coef_, multi.coef_[0])
    assert_almost_equal(single.intercept_, multi.intercept_[0])
def elastic_net(X, Y):
    """Select features of ``X`` with a cross-validated multi-task elastic net.

    Fits ``MultiTaskElasticNetCV`` on ``(X, Y)`` and wraps the fitted model in
    ``SelectFromModel(prefit=True)`` to decide which columns of ``X`` to keep.

    Parameters
    ----------
    X : array-like
        Feature matrix; its ``shape`` is printed before fitting.
    Y : array-like
        Targets passed to ``MultiTaskElasticNetCV.fit``.

    Returns
    -------
    new_features
        ``X`` reduced to the selected columns (``SelectFromModel.transform``).
    values
        Integer indices of the selected columns
        (``get_support(indices=True)``).
    """
    print(X.shape)
    # `normalize` is intentionally not passed: it was removed from the linear
    # models in scikit-learn 1.2, and False (the value used before) was the
    # default, so dropping it keeps the old behaviour on every version.
    clf = MultiTaskElasticNetCV(l1_ratio=0.5,
                                eps=0.001,
                                n_alphas=100,
                                alphas=None,
                                fit_intercept=True,
                                max_iter=1000,
                                tol=0.0001,
                                cv=None,
                                copy_X=True,
                                verbose=0,
                                n_jobs=1,
                                random_state=None,
                                selection='cyclic')
    fitted = clf.fit(X, Y)
    sfm = SelectFromModel(fitted, prefit=True)
    # Call through the instance rather than the unbound class method.
    values = sfm.get_support(indices=True)
    new_features = sfm.transform(X)
    return new_features, values
class _MultiTaskElasticNetCVImpl:
    """Thin wrapper that builds an ``Op`` from keyword hyper-parameters.

    ``Op`` is resolved from the enclosing module (presumably an alias of
    scikit-learn's MultiTaskElasticNetCV -- confirm at the import site).
    """

    def __init__(self, **hyperparams):
        # Keep the raw hyper-parameters next to the model they configured.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when it is given."""
        fit_args = (X, ) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)
def test_uniform_targets():
    """Constant targets must be predicted exactly, with a degenerate alpha grid."""
    single_task_models = (ElasticNetCV(n_alphas=3), LassoCV(n_alphas=3))
    multi_task_models = (MultiTaskElasticNetCV(n_alphas=3),
                         MultiTaskLassoCV(n_alphas=3))

    rng = np.random.RandomState(0)
    # Draw order matters for reproducibility: train first, then test.
    X_train = rng.random_sample(size=(10, 3))
    X_test = rng.random_sample(size=(10, 3))

    # With a constant target every alpha of the grid collapses to the float
    # resolution.
    expected_alphas = [np.finfo(float).resolution] * 3

    for model in single_task_models:
        for constant in (0, 5):
            target = np.empty(10)
            target[:] = constant
            predictions = model.fit(X_train, target).predict(X_test)
            assert_array_equal(predictions, target)
            assert_array_equal(model.alphas_, expected_alphas)

    for model in multi_task_models:
        for constant in (0, 5):
            # Second column is always twice the first one.
            targets = np.empty((10, 2))
            targets[:] = (constant, 2 * constant)
            predictions = model.fit(X_train, targets).predict(X_test)
            assert_array_equal(predictions, targets)
            assert_array_equal(model.alphas_, expected_alphas)
def train_glm_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
) -> BaseEstimator:
    """Train a basic Generalized Linear Model (GLM)

    The model is a ``MultiTaskElasticNetCV`` with an automatic alpha grid,
    3-fold cross-validation, random coordinate selection and a fixed seed.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    verbose : int, default=0
        option to print out training messages; also forwarded to the
        estimator's own ``verbose`` setting

    Returns
    -------
    gl_model : BaseEstimator
        the trained model
    """
    # Initialize GLM.
    # `normalize` is not passed any more: the keyword was removed in
    # scikit-learn 1.2 and False (the value used before) was its default.
    gl_model = MultiTaskElasticNetCV(
        alphas=None,
        cv=3,
        random_state=123,
        n_jobs=-1,
        selection="random",
        verbose=verbose,
    )

    # train GLM; perf_counter is monotonic, so the elapsed time cannot be
    # distorted by wall-clock adjustments
    t0 = time.perf_counter()
    gl_model.fit(xtrain, ytrain)
    t1 = time.perf_counter() - t0

    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gl_model
def test_model_multi_task_elasticnet_cv(self):
    """Convert a fitted two-target MultiTaskElasticNetCV to ONNX and dump it."""
    estimator = MultiTaskElasticNetCV()
    model, X = fit_regression_model(estimator, n_targets=2)

    # Single float input with a free batch dimension and one column per feature.
    n_features = X.shape[1]
    initial_types = [("input", FloatTensorType([None, n_features]))]
    model_onnx = convert_sklearn(model, "multi-task elasticnet cv",
                                 initial_types)

    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X,
                        model,
                        model_onnx,
                        verbose=False,
                        basename="SklearnMultiTaskElasticNetCV-Dec4")
def fit_enet(X, flavors):
    """Derive the flavor profiles by fitting a multi-task elastic net.

    The targets are ``logit(flavors)``; rows whose transformed targets are not
    all finite are dropped before fitting. The fitted coefficients are mapped
    back through ``inv_logit``.

    Note: ``flavors`` is modified in place -- entries equal to 0 become 0.01
    and entries equal to 1 become 0.99.

    Parameters
    ----------
    X : array-like
        Feature matrix, indexed by row with a boolean mask.
    flavors : numpy.ndarray
        Two-dimensional target array (rows are reduced with ``axis=1``).

    Returns
    -------
    weights
        ``inv_logit`` of the transposed coefficient matrix of the fitted
        ``MultiTaskElasticNetCV``.
    """
    # logit(0) and logit(1) are not finite
    flavors[flavors == 0] = 0.01
    flavors[flavors == 1] = 0.99
    y = logit(flavors)
    # Keep only the samples for which every target is finite.
    idx = np.all(np.isfinite(y), axis=1)
    # print() with a single argument behaves the same on Python 2 and 3; the
    # former print statement was a SyntaxError on Python 3.
    print('Performing multi-task elastic net...')
    enet = MultiTaskElasticNetCV(cv=7,
                                 n_jobs=7,
                                 fit_intercept=False,
                                 verbose=1).fit(X[idx], y[idx])
    weights = inv_logit(enet.coef_.T)  # transform to 0 to 1 scale
    return weights
def select_mtelastic(self, X, y):
    """Pick the best alpha for multi-task elastic-net regression.

    Fits scikit-learn's ``MultiTaskElasticNetCV`` over a fixed, hand-written
    grid of candidate alphas, prints the selected value and returns it.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.

    Returns
    -------
    The ``alpha_`` attribute of the fitted cross-validated estimator
    (previously the method only printed it and returned ``None``).
    """
    # NOTE(review): .099 sits between .009 and .01 and looks like a typo for
    # a smaller value; it is kept unchanged because the grid is data, not
    # logic -- confirm with the author.
    candidate_alphas = [
        0.00001, .0001, .001, .002, .003, .004, .005, .006, .007, .008, .009,
        .099, .01, .011, .012, .013, .014, .015, .016, .017, .018, .019, .02,
        .025, .026, .027, .028, .029, .03, .031, .032, .033, .034, .035, .036,
        .037, .038, .039, .04, .041, .042, .043, .044, .045, .05, .06, .07,
        .071, .072, .073, .074, .075, .076, .077, .078, .079, .08, .1, .2,
        .225, .23, .24, .245, .246, .247, .248, .249, .25, .251, .252, .253,
        .254, .255, .26, .27, .275, .3, .35, .4, .45, .46, .47, .48, .481,
        .482, .483, .484, .485, .486, .487, .488, .489, .49, .491, .492, .493,
        .494, .495, .496, .497, .498, .499, .5, .51, .511, .512, .513, .514,
        .515, .516, .517, .518, .519, .52, .525, .53, .54, .55, .6, .75, .752,
        .7527, .7528, .7529, .753, .7531, .754, .7545, .755, .756, .76, .765,
        .77, .78, .79, .8, .9, 1.0, 1.2, 1.25, 1.5, 1.75, 2.0
    ]
    mt_enet_cv = MultiTaskElasticNetCV(alphas=candidate_alphas)
    fitted = mt_enet_cv.fit(X, y)
    best_alpha = fitted.alpha_
    print(best_alpha)
    return best_alpha
def regtsls(data, opts):
    """Regularized two-stage least squares with polynomial treatment features.

    Stage one regresses the polynomial expansion of ``T`` on (a polynomial
    expansion of) ``Z`` with ``MultiTaskElasticNetCV``; stage two regresses
    ``Y`` on the stage-one predictions with ``ElasticNetCV``. The fitted
    second stage is then evaluated on the polynomial expansion of ``T_test``.

    Parameters
    ----------
    data : tuple
        ``(T_test, Z, T, Y)``.
    opts : mapping-like
        Options read through the module helper ``_get``; only
        ``'lin_degree'`` (default 1) is used.

    Returns
    -------
    numpy.ndarray
        Predictions reshaped to ``T_test.shape[:1] + Y.shape[1:]``.
    """
    T_test, Z, T, Y = data
    degree = _get(opts, 'lin_degree', 1)

    trans = PolynomialFeatures(degree=degree, include_bias=False)
    polyT = trans.fit_transform(T)

    # First stage: instruments -> expanded treatment.
    first = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                      ('elasticnet', MultiTaskElasticNetCV(cv=3))])
    first.fit(Z, polyT)

    # Second stage: predicted treatment features -> outcome.
    second = ElasticNetCV(cv=3)
    second.fit(first.predict(Z), Y.ravel())

    # Re-use the transformer fitted on T instead of refitting it on the test
    # data; a column mismatch between T and T_test now fails here, loudly.
    polyT_test = trans.transform(T_test)
    return second.predict(polyT_test).reshape(T_test.shape[:1] + Y.shape[1:])
def _fit(self):
    """
    Fit regression model with training dataset, update self._regressor and self._param.

    The model is a pipeline of ``MinMaxScaler`` followed by a
    ``MultiTaskElasticNetCV`` searched over fixed ``alphas`` / ``l1_ratio``
    grids with 5-fold cross-validation.
    """
    # Convergence warnings are silenced process-wide from here on.
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Cross-validated elastic net over explicit grids.
    alpha_grid = [0, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    l1_ratio_grid = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    elastic_net_cv = MultiTaskElasticNetCV(alphas=alpha_grid,
                                           l1_ratio=l1_ratio_grid,
                                           cv=5,
                                           n_jobs=-1)

    steps = [
        ("scaler", MinMaxScaler()),
        ("regressor", elastic_net_cv),
    ]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(self._X_train, self._y_train)
    fitted = pipeline.named_steps["regressor"]

    # Only publish the pipeline once it has been fitted successfully.
    self._regressor = pipeline

    # One row per target, one column per feature, plus a leading
    # "Intercept" column.
    coef_table = pd.DataFrame(fitted.coef_,
                              index=self._y_train.columns,
                              columns=self._X_train.columns)
    coef_table.insert(0, "Intercept", None)
    coef_table["Intercept"] = fitted.intercept_

    # Step name -> step class, followed by the selected hyper-parameters.
    # "intercept" and "coef" both refer to the same combined table.
    summary = {name: type(step) for name, step in steps}
    summary["alpha"] = fitted.alpha_
    summary["l1_ratio"] = fitted.l1_ratio_
    summary["intercept"] = coef_table
    summary["coef"] = coef_table
    self._param.update(summary)
def train_multi_elasticnet(train_features, train_labels, num_alphas,
                           skip_cross_validation, alpha, l1_ratio, num_jobs):
    """
    Performs the cross validation of multi elastic net model, and returns the
    trained model with best params.

    Assume features are scaled/normalized.
    Assumes train_labels has more than one column.

    Parameters
    ----------
    train_features, train_labels
        Training data passed to ``fit``.
    num_alphas : int
        Size of the automatic alpha grid used during cross-validation.
    skip_cross_validation : bool
        When true, ``alpha`` and ``l1_ratio`` are used as given and no
        cross-validation is run.
    alpha, l1_ratio : float
        Hyper-parameters used when cross-validation is skipped.
    num_jobs : int
        ``n_jobs`` for the cross-validated estimator.

    Returns
    -------
    tuple
        ``(model, {'alpha': ..., 'l1_ratio': ...})`` where ``model`` is a
        fitted ``MultiTaskElasticNet``.
    """
    best_alpha = alpha
    best_l1_ratio = l1_ratio
    max_iter = 10000
    tol = 0.0005

    # `normalize` is no longer passed to either estimator: the keyword was
    # removed in scikit-learn 1.2 and False (the value used before) was its
    # default, so behaviour is unchanged on older versions.
    if not skip_cross_validation:
        # use 5 fold cross validation
        model = MultiTaskElasticNetCV(l1_ratio=[
            0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99, 0.999,
            0.9999
        ],
                                      max_iter=max_iter,
                                      cv=5,
                                      n_alphas=num_alphas,
                                      n_jobs=num_jobs,
                                      tol=tol)
        model.fit(train_features, train_labels)
        best_alpha = model.alpha_
        best_l1_ratio = model.l1_ratio_

    # Final model: a plain MultiTaskElasticNet with the chosen parameters.
    model = MultiTaskElasticNet(alpha=best_alpha,
                                l1_ratio=best_l1_ratio,
                                max_iter=max_iter,
                                tol=tol)
    model.fit(train_features, train_labels)
    return (model, {'alpha': best_alpha, 'l1_ratio': best_l1_ratio})
def train_linear_model(X,
                       y,
                       random_state=1,
                       test_size=0.2,
                       regularization_type='elasticnet',
                       k_fold=5,
                       max_iter=1000000,
                       tol=0.0001,
                       l1_ratio=None):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.
            Defaults to [.1, .5, .7, .9, .95, 1].

    Returns:
        sklearn.linear_model.LinearModel: fitted model.
        mu (float): Mean value of descriptors used in training.
        s (float): Std dev of descriptors used in training.
        relative_prediction_error: 1.96 * sqrt(MSE / n_test) of the
            predicted/actual ratio against 1, per output.
        Rsq (float): ``score`` of the fitted model on the held-out split.
        hyperparameters (dict): split/CV settings plus the selected
            ``alpha`` and ``l1_ratio``.

    Raises:
        NotImplementedError: for an unsupported combination of
            ``regularization_type`` and number of outcome columns
            (lasso and ridge only handle a single outcome).
    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {'random_state': random_state,
                       'test_size': test_size,
                       'k_fold': k_fold,
                       'tol': tol,
                       'max_iter': max_iter
                       }

    # NOTE: `normalize=False` is no longer passed to the elastic-net
    # estimators below; the keyword was removed in scikit-learn 1.2 and False
    # was its default, so results are unchanged on older versions.
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol,
                          cv=k_fold, max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True, alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        # RidgeCV has no automatic alpha grid: `alphas=None` is not a valid
        # grid and made this branch fail at fit time, so the library default
        # grid is used instead.
        ridgecv = RidgeCV(fit_intercept=True, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True,
                                    alphas=None, cv=k_fold,
                                    l1_ratio=l1_ratio, max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True,
                                  l1_ratio=l1_ratio_opt, alpha=alpha_opt,
                                  max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True,
                                                    cv=k_fold,
                                                    l1_ratio=l1_ratio,
                                                    max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    # Scale the held-out split once, with the training statistics.
    X_test_scaled = (X_test - mu) / s
    y_pred = linear_model.predict(X_test_scaled)
    Rsq = linear_model.score(X_test_scaled, y_test)

    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output.
    # The ratio is deliberately built row by row: y_pred may be 1-D while
    # y_test is a column, and a plain array division would broadcast them.
    pred_actual_ratio = [pred / actual
                         for pred, actual in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96*np.sqrt(
        mean_squared_error(np.ones(y_pred.shape), pred_actual_ratio,
                           multioutput='raw_values')/y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
def main(X, Y, Params, print_info=False, is_regression=True, Y_other=None):
    """Build the estimator named in ``Params['Algorithm']`` and fit it.

    ``Params['Algorithm']`` is indexed as ``(name, parameters)``: ``name``
    selects the branch below and ``parameters`` supplies extra keyword
    arguments for most estimators. ``Params['is_multitarget']`` is read by
    the "ElasticNet" branch and by the final fit.

    Several branches wrap the estimator in ``GridSearchCV`` and set
    ``is_cv_run`` so the best grid parameters are printed after fitting.
    The "MLP_KERAS" branch builds, fits and returns a Keras model on its own,
    without reaching the common fit code at the bottom.

    Parameters
    ----------
    X : array-like
        Training inputs; ``X.shape`` is read.
    Y : numpy.ndarray
        Training targets; flattened before fitting unless the multi-target
        path is taken.
    Params : dict
        Configuration, see above.
    print_info : bool
        Print which model is being fitted.
    is_regression : bool
        Select the regression or classification variant where both exist;
        several branches raise for classification.
    Y_other : numpy.ndarray or None
        Extra target columns, concatenated to ``Y`` along axis 1 when
        ``Params['is_multitarget']`` is truthy.

    Returns
    -------
    The fitted model (possibly a ``GridSearchCV`` wrapper).

    Raises
    ------
    Exception
        For an unknown algorithm name, or for classification requests in
        branches that only support regression.

    NOTE(review): the source of this function was recovered from text with
    collapsed line breaks. Where nesting was ambiguous (the grid-search tails
    of the 'XGBoost' and 'SVM' branches, the second Dropout in 'MLP_KERAS')
    the grid search was placed in the regression branch, because it scores
    with a mean-squared-error scorer -- confirm against the original file.
    """
    parameters = Params['Algorithm'][1]
    is_cv_run = False
    # Only referenced by the commented-out timing code at the very end.
    starttime = time.time()
    if print_info:
        print('Fitting model \'%s\' for %s' %
              (Params['Algorithm'][0],
               'regression' if is_regression else 'classification'))
    if Params['Algorithm'][0] == 'BayesianRidge':
        # Both branches build exactly the same estimator.
        if not is_regression:
            model = BayesianRidge(n_iter=300,
                                  tol=0.001,
                                  compute_score=False,
                                  fit_intercept=True,
                                  normalize=False,
                                  copy_X=True,
                                  verbose=False,
                                  **parameters)
            #parameters = {'alpha_1': [1e-6,1e-5,1e-4],'alpha_2': [1e-6,1e-5,1e-4], 'lambda_1': [1e-6,1e-5,1e-4], 'lambda_2': [1e-6,1e-5,1e-4]}
        else:
            model = BayesianRidge(n_iter=300,
                                  tol=0.001,
                                  compute_score=False,
                                  fit_intercept=True,
                                  normalize=False,
                                  copy_X=True,
                                  verbose=False,
                                  **parameters)
    elif Params['Algorithm'][0] == 'StringKernel':
        if not is_regression:
            raise (Exception('not implemented'))
        else:
            # we create an instance of SVM and fit out data.
            #
            # model = KernelRidge(alpha=parameters['alpha'], kernel='precomputed')
            # The SVR and its 'C' grid are overwritten right away by the
            # NuSVR / 'nu' grid defined just below.
            model = SVR(kernel='precomputed',
                        gamma='auto',
                        coef0=0.0,
                        shrinking=True,
                        tol=0.001,
                        cache_size=400,
                        verbose=False,
                        max_iter=-1)
            param_grid = {
                'C': np.logspace(np.log10(0.0001), np.log10(500), 25)
            }
            model = NuSVR(
                kernel='precomputed'
            )  #cache_size=400, coef0=0.0, gamma='auto', max_iter=-1, shrinking=True, tol=0.001, verbose=False,**parameters)
            param_grid = {'nu': (0.50, )}
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True
    elif Params['Algorithm'][0] == 'XGBoost':
        # max_depth = 3, learning_rate = 0.1, n_estimators = 100, silent = True, objective = 'reg:linear',
        # booster = 'gbtree', n_jobs = 1, nthread = None, gamma = 0, min_child_weight = 1,
        # max_delta_step = 0, subsample = 1, colsample_bytree = 1, colsample_bylevel = 1, reg_alpha = 0,
        # reg_lambda = 1, scale_pos_weight = 1, base_score = 0.5, random_state = 0, seed = None,
        # missing = None
        if not is_regression:
            model = xgboost.XGBClassifier(
                missing=None,
                silent=True,
                learning_rate=0.10,
                objective='rank:pairwise',
                booster='gbtree',
                n_jobs=1,
                max_delta_step=0,
                colsample_bylevel=1,
                scale_pos_weight=1,
                base_score=0.5,
                random_state=666,
                colsample_bytree=0.75,  # default 1
                subsample=0.75,
                gamma=0,
                reg_alpha=0.01,  # default 0
                min_child_weight=6,
                **parameters)
        else:
            # Previous, fully specified regressor kept for reference:
            # model=xgboost.XGBRegressor(missing=None, silent=True,
            #                            learning_rate=0.10,
            #                            objective='reg:linear',#'rank:pairwise' booster='gbtree'
            #                            n_jobs=1,
            #                            booster='gbtree',
            #                            max_delta_step=0,
            #                            colsample_bylevel=1,
            #                            scale_pos_weight=1,
            #                            base_score=0.5,
            #                            random_state=666,
            #                            colsample_bytree=0.75,  # default 1
            #                            subsample=0.75,
            #                            gamma=0,
            #                            reg_alpha=0.01,  # default 0
            #                            reg_lambda=1.0,
            #                            min_child_weight=6,
            #                            **parameters)
            model = xgboost.XGBRegressor(
                missing=None,
                silent=True,
                learning_rate=0.10,
                objective='reg:linear',  #'rank:pairwise' booster='gbtree'
                n_jobs=1,
                booster='gbtree',
                random_state=666,
                **parameters)
            # The sampling / regularisation settings that used to be fixed
            # (see the commented-out call) are searched over instead.
            param_grid = {
                'colsample_bytree': (0.75, 1.0),
                'subsample': (0.75, 1.0),
                'min_child_weight': (3, 6, 9),
                'reg_lambda': (0.80, 1.0, 1.20),
                'reg_alpha': (0.001, 0.01)
            }
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True
    elif Params['Algorithm'][0] == "Keras_ElasticNet":
        #use_keras_CPU()
        if not is_regression:
            raise (Exception('ElasticNet is only for regression!'))
        else:
            # l1_ratio is fixed to the configured value; only alpha is searched.
            param_grid = {
                'l1_ratio': (Params['Algorithm'][1]['l1_ratio'], ),
                'alpha': np.logspace(-3, 1, 15)
            }
            model = GridSearchCV(KerasENet(),
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=5,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            # first_output = Dense(1,activation='sigmoid')(first_output)
            is_cv_run = True
    elif Params['Algorithm'][0] == "Ridge":
        if not is_regression:
            raise (Exception('Ridge is only for regression!'))
        else:
            # Log-spaced alpha grid from 0.1 to 700 with a configurable size.
            model = RidgeCV(alphas=np.logspace(-1, np.log10(700),
                                               parameters['n_alphas']),
                            fit_intercept=True,
                            normalize=False,
                            scoring=None,
                            cv=8,
                            gcv_mode=None,
                            store_cv_values=False)
    elif Params['Algorithm'][0] == "ElasticNet":
        tol = 0.0001
        selection = 'cyclic'
        n_alphas = 90
        max_iter = 1300
        # With more than 4000 features, switch to looser / cheaper settings.
        if X.shape[1] > 4000:
            tol = 0.001
            selection = 'random'
            n_alphas = 60
            max_iter = 1000
        if not is_regression:
            raise (Exception('ElasticNet is only for regression!'))
        else:
            if Params['is_multitarget']:
                model = MultiTaskElasticNetCV(eps=0.001,
                                              alphas=None,
                                              fit_intercept=True,
                                              normalize=False,
                                              max_iter=max_iter,
                                              tol=tol,
                                              cv=7,
                                              copy_X=True,
                                              verbose=0,
                                              n_alphas=n_alphas,
                                              n_jobs=1,
                                              random_state=666,
                                              selection=selection,
                                              **parameters)
            else:
                model = ElasticNetCV(eps=0.001,
                                     alphas=None,
                                     fit_intercept=True,
                                     normalize=False,
                                     max_iter=max_iter,
                                     tol=tol,
                                     cv=7,
                                     copy_X=True,
                                     verbose=0,
                                     n_alphas=n_alphas,
                                     n_jobs=1,
                                     random_state=666,
                                     selection=selection,
                                     **parameters)
    elif Params['Algorithm'][0] == "RandomForest":
        if not is_regression:
            raise (Exception('not set up (lazy)'))
        else:
            model = RandomForestRegressor(criterion='mse',
                                          min_samples_leaf=1,
                                          min_weight_fraction_leaf=0.0,
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          bootstrap=True,
                                          oob_score=False,
                                          n_jobs=1,
                                          random_state=None,
                                          verbose=0,
                                          warm_start=False,
                                          **parameters)
            param_grid = {
                'max_features': ('auto', 'sqrt'),
                'min_samples_split': (
                    2,
                    4,
                ),
            }
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True
    elif Params['Algorithm'][0] == 'SVM':
        # 0.001, 0.005, 0.01, 0.05, 0.1, 0.5,1.0,1.5,2.0,3.0,4.0,5.0,10.0
        if not is_regression:
            model = SVC(cache_size=400,
                        coef0=0.0,
                        gamma='auto',
                        max_iter=-1,
                        shrinking=True,
                        tol=0.001,
                        verbose=False,
                        **parameters)
            #parameters = {'reg__C':[0.5],'reg__epsilon':[0.1]}
        else:
            model = SVR(cache_size=400,
                        coef0=0.0,
                        gamma='auto',
                        max_iter=-1,
                        shrinking=True,
                        tol=0.001,
                        verbose=False,
                        **parameters)
            param_grid = {'C': np.logspace(np.log10(0.0005), np.log10(10), 30)}
            #param_grid = {'nu':(0.1,0.3,0.5,0.7,0.9)}
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=8,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True
    elif Params['Algorithm'][0] == 'GradientBoosting':
        if not is_regression:
            model = GradientBoostingClassifier(random_state=1, **parameters)
            #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6],'learning_rate':[0.01,0.03,0.1],'min_samples_leaf':[2,3,4]}
        else:
            model = GradientBoostingRegressor(random_state=1, **parameters)
            #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6]}
    elif Params['Algorithm'][0] == 'MLP':
        #parameters['hidden_layer_sizes']=[parameters['hidden_layer_sizes']]
        #model = MLPRegressorCV(hidden_layer_sizes=parameters['hidden_layer_sizes'])
        # Only hidden_layer_sizes is taken from the configuration; the L2
        # penalty `alpha` is chosen by grid search. `is_regression` is not
        # consulted in this branch.
        model = MLPRegressor(
            activation="relu",
            solver="lbfgs",
            learning_rate="constant",
            learning_rate_init=0.0011,
            max_iter=450,
            random_state=None,
            tol=0.00013,
            epsilon=1e-08,
            hidden_layer_sizes=parameters['hidden_layer_sizes'])
        param_grid = {'alpha': np.logspace(0, np.log10(350), 20)}
        model = GridSearchCV(model,
                             param_grid,
                             n_jobs=1,
                             iid=True,
                             refit=True,
                             cv=7,
                             verbose=0,
                             scoring=neg_mean_squared_error_scorer)
        is_cv_run = True
        #model = MLPRegressor(activation="relu", solver ="lbfgs",learning_rate ="constant",
        #                     learning_rate_init = 0.001, power_t = 0.5, max_iter = 500, shuffle = True, random_state = None,
        #                     tol = 0.0001, verbose = False, warm_start = False, momentum = 0.9, epsilon = 1e-08,**parameters)
    elif Params['Algorithm'][0] == 'MLP_KERAS':
        # Keras / TensorFlow are imported lazily, only when this branch runs.
        from keras.models import Sequential
        from keras import regularizers
        from keras.layers import Dense, Dropout
        from keras.callbacks import EarlyStopping
        from sklearn.preprocessing import LabelEncoder
        from keras.utils import np_utils
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        session = tf.Session(config=config)
        # Created but not passed to fit() (see the commented-out callbacks).
        early_stopping = EarlyStopping(monitor='val_loss', patience=5)
        model = Sequential()
        # First hidden layer: sized by layers_and_nodes[0], fed by the inputs.
        model.add(
            Dense(
                parameters['layers_and_nodes'][0],
                activation='tanh',
                input_shape=(X.shape[1], ),
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(
                    parameters['l2_regularization']),
            ))
        model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))
        # Remaining hidden layers, each followed by dropout.
        for layer in range(1, len(parameters['layers_and_nodes'])):
            model.add(
                Dense(parameters['layers_and_nodes'][layer],
                      activation='relu',
                      input_shape=(parameters['layers_and_nodes'][layer -
                                                                  1], ),
                      kernel_initializer='glorot_normal',
                      kernel_regularizer=regularizers.l2(
                          parameters['l2_regularization'])))
            model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))
        if not is_regression:
            # NOTE(review): this reads parameters['nodes'] while every other
            # line uses parameters['layers_and_nodes'] -- confirm the key.
            model.add(
                Dense(1,
                      activation='softmax',
                      input_shape=(parameters['nodes'][-1], )))
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['f1'])
            encoder = LabelEncoder()
            encoder.fit(Y)
            encoded_Y = encoder.transform(Y)
            # convert integers to dummy variables (i.e. one hot encoded)
            Y = np_utils.to_categorical(encoded_Y)
        else:
            model.add(
                Dense(1,
                      activation='linear',
                      input_shape=(parameters['layers_and_nodes'][-1], )))
            model.compile(loss='mean_squared_error',
                          optimizer='adam',
                          metrics=['mse'])
        # Full-batch training for a fixed 100 epochs, no validation split.
        model.fit(X,
                  Y,
                  batch_size=X.shape[0],
                  epochs=100,
                  validation_split=0,
                  verbose=0)  #,callbacks=[early_stopping])
        # Early exit: the Keras model is already fitted.
        return model
    else:
        raise (Exception('unknown model'))
    #decomposer = LatentDirichletAllocation(n_topics=10, max_iter=10,learning_method='online',learning_offset=50.,random_state=1)
    #decomposer = TruncatedSVD(n_components=100,random_state=666)
    # The two bare string literals below are inert leftovers (old plotting
    # code and a section marker); they are evaluated and discarded.
    """ X = data.iloc[:]['text'].values y = data.iloc[:]['mylabel'].values.astype(str) dat = vect.fit_transform(X) dat = tfidf.fit_transform(dat) dat = decomposer.fit_transform(dat) for a in numpy.unique(y): plt.scatter(dat[y==a,0],dat[y==a,1]) """
    """ START LOOP """
    #t0 = time()
    # if get_set_count(parameters)>1:
    #     grid_search = GridSearchCV(model, parameters, n_jobs=6,verbose=1,cv=10,refit=True)
    #     grid_search.fit(X=X,y=Y)
    #     best_parameters = grid_search.best_estimator_.get_params()
    #     print('--> best parameters: %s' % best_parameters)
    #     return grid_search
    # else:
    # Progress output is unconditional here (`if 1`), independent of print_info.
    if 1:
        start_time = time.time()
        print('... training model (X.shape=%s)' % str(X.shape), end='')
    # Silences all warnings process-wide from this point on.
    warnings.filterwarnings("ignore")
    if Y_other is not None and Params['is_multitarget']:
        # Multi-target fit: Y becomes the first column, Y_other the rest.
        Y = np.expand_dims(Y, axis=1)
        model.fit(X=X, y=np.concatenate((Y, Y_other), axis=1))
    else:
        Y = Y.flatten()
        model.fit(X=X, y=Y)
    if is_cv_run:
        print(' [best gridsearch params: %s] ' % model.best_params_, end='')
    if 1:
        end_time = time.time()
        print(' ... done (%1.1f min)' % ((end_time - start_time) / 60.0))
    #elapsedtime = (time.time() - starttime) / 60.0
    #print('fit done (took %f minutes)' % elapsedtime)
    return model
evolved_freq.append(f2) #get the average metabolite usage from the evolved population used_mets=[] for mm in g2: reacs=[react_dict[z] for z in mm] m=Model(reacs) used_mets.append(m.ex_reactants) used_mets = list(chain.from_iterable(used_mets)) mf=[] for mm in dm: mf.append(used_mets.count(mm)/len(g2)) true_used_env.append(mf) from sklearn.linear_model import MultiTaskElasticNetCV as EN enet = EN(cv=50, max_iter=100000) x = full_freq_m.T[m_diff_freq_m>.005].T y = used_environment.T[m_diff_used_env>0.005].T mod=enet.fit(x, y) p = mod.predict(f2[m_diff_freq_m>.005].reshape(1,-1)) p=p.flatten() p = p+abs(min(p)) p=p/max(p) c = [sts.pearsonr(mf,used_environment[ee][m_diff_used_env>0.005])[0] for ee in range(len(used_environment))] predicted.append(sts.pearsonr(p, mf)[0])
def train(self,
          vectors_path,
          bound_morphemes_path=None,
          word_segmentations_path=None,
          graphemes_to_phonemes_path=None,
          n_jobs=1,
          l1_ratio=0.5):
    """Fit a MultiTaskElasticNetCV from n-gram features to word vectors.

    Steps: record the call arguments in ``self.config``; read the vectors
    file; shuffle the vocabulary with a fixed seed; optionally read the
    grapheme-to-phoneme table; optionally replace the targets with the output
    of ``self._get_morpheme_residuals``; build n-gram features; fit the
    regressor and set ``self.is_trained``.

    Parameters
    ----------
    vectors_path : str
        Text file with one entry per line: a key followed by whitespace
        separated float values.
    bound_morphemes_path : str or None
        When not None, morpheme data is loaded (together with
        ``word_segmentations_path``) and used to adjust the targets.
    word_segmentations_path : str or None
        Passed to ``self._load_morpheme_data``.
    graphemes_to_phonemes_path : str or None
        When given, vector keys are split on "," into phoneme tuples and a
        tab-separated word/phonemes file is read from this path.
    n_jobs : int
        Forwarded to the residual computation and to the regressor.
    l1_ratio : float
        Forwarded to ``MultiTaskElasticNetCV``.
    """
    # Must stay the first statement: at this point locals() holds only the
    # call arguments, which is exactly what gets stored as the train config.
    train_config = locals()
    train_config.pop("self")
    train_config.pop("__class__", None)
    self.config["train_config"] = train_config
    logger.info("Train config: ")
    pprint.pprint(train_config)

    # Load vectors, where the keys can be words represented as
    # sequences of characters (normal word vectors) or words represented
    # as sequences of phonemes (phonemicized vectors).
    logger.info("Reading vectors from {}".format(vectors_path))
    self.vectors = OrderedDict()
    with open(vectors_path) as vectors_file:
        for line in tqdm(vectors_file, total=get_line_number(vectors_path)):
            split_line = line.rstrip("\n").split()
            word = split_line[0]
            # If we have phonemicized vectors, the keys to the dict are
            # tuples of comma-separated phonemes representing a word.
            if graphemes_to_phonemes_path is not None:
                word = tuple(word.split(","))
            embedding = np.array([float(val) for val in split_line[1:]])
            self.vectors[word] = embedding

    # Randomly shuffle the OrderedDict
    # (seeds the global `random` module state as a side effect)
    random_seed = 0
    logger.info(
        "Shuffling vectors with random seed {}".format(random_seed))
    random.seed(random_seed)
    vector_items = list(self.vectors.items())
    # random.shuffle is in-place
    random.shuffle(vector_items)
    self.vectors = OrderedDict(vector_items)

    # Keys and values are taken in the same (shuffled) order, so row i of
    # `targets` belongs to vocabulary[i].
    vocabulary = list(self.vectors.keys())
    targets = np.asarray(list(self.vectors.values()))

    # Load phonemes to graphemes if we were given g2p data
    if graphemes_to_phonemes_path:
        logger.info("Reading graphemes to phonemes data "
                    "from {}".format(graphemes_to_phonemes_path))
        self.phonemes_to_graphemes = {}
        # Load the graphemes to phonemes data
        # (each line: word <TAB> space-separated phonemes)
        with open(
                graphemes_to_phonemes_path) as graphemes_to_phonemes_file:
            for line in tqdm(
                    graphemes_to_phonemes_file,
                    total=get_line_number(graphemes_to_phonemes_path)):
                split_line = line.rstrip("\n").split("\t")
                word = split_line[0]
                phonemes = tuple(split_line[1].split(" "))
                self.phonemes_to_graphemes[phonemes] = word

    if bound_morphemes_path is not None:
        # Load morpheme data if we were given bound morphemes
        word_segmentations, bound_morphemes = self._load_morpheme_data(
            word_segmentations_path, bound_morphemes_path)
        # Update targets with predictions of the morpheme model. This is equivalent
        # to using the model residuals as the new targets.
        targets = self._get_morpheme_residuals(vocabulary,
                                               targets,
                                               bound_morphemes,
                                               graphemes_to_phonemes_path,
                                               word_segmentations,
                                               n_jobs=n_jobs)

    # Get the ngram features for the vocabulary.
    self.X_ngram, self.ngram_to_idx = build_ngram_features(
        vocabulary=vocabulary,
        one_hot=self.one_hot,
        ngram_range=self.ngrams,
        mode=self.mode,
        freq_thres=self.min_count)
    logger.info("Shape of ElasticNet input (number of words, "
                "number of candidate phonesthemes): {}".format(
                    self.X_ngram.shape))
    logger.info("Shape of ElasticNet targets (number of words, "
                "vector dimension): {}".format(targets.shape))

    # Fit a MultiTaskElasticNetCV model to extract phonesthemes.
    logger.info("Fitting MultiTaskElasticNetCV")
    self.phonesthemes_reg = MultiTaskElasticNetCV(l1_ratio=l1_ratio,
                                                  n_jobs=n_jobs,
                                                  random_state=0,
                                                  cv=5)
    self.phonesthemes_reg.fit(self.X_ngram, targets)
    logger.info("Done fitting MultiTaskElasticNetCV")
    self.is_trained = True
def scorer(pipe, X, y):
    """Return the F1 score of ``pipe``'s predictions on ``X`` against ``y``."""
    pred = pipe.predict(X)
    return metrics.f1_score(y, pred)


# Script fragment: univariate feature selection shared across all label
# columns, a multi-task elastic net fit, and per-column thresholding of the
# continuous predictions into 0/1 labels.
# NOTE(review): X, Y, X_test, Y_test, selectedFeaureNum, ps, rs and teFs come
# from outside this fragment -- confirm the enclosing scope.

# Sum the per-feature p-values over every label column.
accum = np.zeros((X.shape[1],))
for y in np.transpose(Y):
    # NOTE(review): k is passed positionally; recent scikit-learn releases
    # may require `k=` as a keyword -- confirm against the pinned version.
    selector = SelectKBest(f_classif, selectedFeaureNum)
    selector = selector.fit(X, y)
    accum += selector.pvalues_
# Keep the features with the smallest summed p-value.
selectedIndices = accum.argsort()[:selectedFeaureNum]


def transform(X):
    """Restrict ``X`` to the selected feature columns."""
    return X[:, selectedIndices]


X_filtered, X_test_filtered = transform(X), transform(X_test)
# NOTE(review): `normalize` was removed from scikit-learn's linear models in
# 1.2; normalize=True changes the fit, so it cannot simply be dropped.
clf = MultiTaskElasticNetCV(normalize=True)
#clf = MultiTaskLasso(normalize=True)
clf.fit(X_filtered, Y)
predTrain = np.array(clf.predict(X_filtered))
# One decision threshold per label column, chosen on the training
# predictions; the returned bestF1 is not used.
splits = []
for col in range(predTrain.shape[1]):
    bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col], Y[:, col])
    splits.append(bestSplit)
pred = np.array(clf.predict(X_test_filtered))
# Binarise test and training predictions with the per-column thresholds.
for col in range(pred.shape[1]):
    pred[:, col] = [1 if e>=splits[col] else 0 for e in pred[:, col]]
    predTrain[:, col] = [1 if e>=splits[col] else 0 for e in predTrain[:, col]]
# Record test-set precision, recall and F1.
ps.append(metrics.precision_score(Y_test, pred))
rs.append(metrics.recall_score(Y_test, pred))
teF = metrics.f1_score(Y_test, pred)
teFs.append(teF)
logging.info("Starting outer CV, N = {}".format(N_outer)) #Outer loop over N splits split_index = 0 for train_idx, test_idx in cvsplitter_outer.split(X, Y, groups): groups_train = groups[train_idx] X_train = X[train_idx] Y_train = Y[train_idx] groups_test = groups[test_idx] X_test = X[test_idx] Y_test = Y[test_idx] regressor=MultiTaskElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1], n_jobs = args.cores, cv = list(cvsplitter_inner.split(X_train, Y_train, groups_train))) estimator = make_pipeline(imputer, regressor) logging.info("Training...") estimator.fit(X_train, Y_train) logging.info('Training: {:1.3} Testing: {:1.3}'.format(estimator.score(X_train, Y_train), estimator.score(X_test, Y_test))) out_dict={"score_train" : estimator.score(X_train, Y_train), "score_test" : estimator.score(X_test, Y_test), "intercept" : estimator.named_steps['multitaskelasticnetcv'].intercept_ , "coef" : estimator.named_steps['multitaskelasticnetcv'].coef_ , "alpha" : estimator.named_steps['multitaskelasticnetcv'].alpha_ , "alphas" : estimator.named_steps['multitaskelasticnetcv'].alphas_ , "mse_path" : estimator.named_steps['multitaskelasticnetcv'].mse_path_ , "l1_ratio" : estimator.named_steps['multitaskelasticnetcv'].l1_ratio_ ,
from sklearn.linear_model import MultiTaskElasticNet, MultiTaskElasticNetCV

# Names of the five regression targets, in the order the model predicts them.
target_columns = [
    'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'
]

# Cross-validate over a grid of l1_ratio values to pick alpha / l1_ratio.
cv_model = MultiTaskElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                 verbose=1)
cv_model.fit(X_train, y_train)

# Refit a plain MultiTaskElasticNet with the selected hyperparameters.
model = MultiTaskElasticNet(alpha=cv_model.alpha_,
                            l1_ratio=cv_model.l1_ratio_,
                            random_state=0)
model.fit(X_train, y_train)

# Write the predictions back into the test frame, one column per target.
preds = model.predict(X_test)
test_df[target_columns] = preds
test_df.drop(columns=["is_train"], inplace=True)
test_df.head()

# Reshape to the long submission format: one row per (Id, target) pair,
# keyed as "<Id>_<target>".
sub_df = cudf.melt(test_df[["Id"] + target_columns],
                   id_vars=["Id"],
                   value_name="Predicted")
id_as_str = sub_df["Id"].astype("str")
variable_as_str = sub_df["variable"].astype("str")
sub_df["Id"] = id_as_str + "_" + variable_as_str
sub_df = sub_df.drop("variable", axis=1).sort_values("Id")
assert sub_df.shape[0] == test_df.shape[0] * 5
# Labels of the original train data, stacked as (test labels, train labels).
train_labels = np.vstack((import_test_labels["Ytest"],
                          import_train["Ytrain"]))

## Standardization
# The scaler is fitted on the training features only and reused for test.
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

## PCA and Feature Selection
pca = PCA(n_components=800)
selection = SelectKBest(k=850)
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
combined_features.fit(X_train_scaled, train_labels.ravel())
# print(pca.explained_variance_ratio_)
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)

## ElasticNet CV for parameter optimization
t1 = time.time()
# NOTE(review): ``alps`` is built but never passed to the estimator below,
# so the CV uses its automatic alpha grid - confirm whether ``alphas=alps``
# was intended.
alps = np.linspace(0.1, 0.625, 15)
model = MultiTaskElasticNetCV(cv=3, n_jobs=-1,
                              max_iter=25).fit(X_train_reduced, Y_train_raw)
t_lasso_cv = time.time() - t1
# Single-argument print calls behave the same on Python 2 and Python 3.
print("time to train %s" % t_lasso_cv)
print("alpha %s" % model.alpha_)
# The fitted attribute is ``l1_ratio_`` (letter L); ``i1_ratio_`` does not
# exist and raised AttributeError.
print("i1 ration %s" % model.l1_ratio_)
Y_predicted = model.predict(X_test_reduced)

## Save results to csv
np.savetxt("prediction.csv", Y_predicted, fmt="%.5f", delimiter=",")
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 21 23:51:12 2016

@author: patanjali
"""

from sklearn.linear_model import MultiTaskElasticNetCV
from utils2 import load_dataset
import pandas

train, validate, test = load_dataset()

# Column 0 holds the class label; the remaining columns are the features.
no_classes = train[:, 0].max() + 1
# One-hot encode the labels of the FULL training set so every class gets a
# column, even if it is absent from the truncated subset used below.
train_y = pandas.get_dummies(train[:, 0])

print("%s %s" % (no_classes, train.shape))

# Work on a small subset only.
train = train[:201]
# Keep the targets aligned with the truncated features; previously train_y
# kept every row and fit() received inconsistent sample counts.
train_y = train_y.iloc[:201]
validate = validate[:201]
test = test[:201]

for l1_ratio in [.1, .5, .7, .9, .95, .99, 1]:
    model = MultiTaskElasticNetCV(l1_ratio=l1_ratio, normalize=True,
                                  verbose=True, n_jobs=3)
    model.fit(train[:, 1:], train_y)

    # The predicted class is the one-hot column with the largest response.
    predicted_classes = (model.predict(validate[:, 1:])).argmax(1)
    correct = sum(predicted_classes == validate[:, 0])
    print("%s %s %s" % (l1_ratio, correct,
                        correct * 1.0 / validate.shape[0]))
######## Elastic Net ###################################
#### Fit the model ####
ElNet = ElasticNet(alpha=0.5, random_state=0)
ElNet.fit(x, y)
ElNet.score(x, y)
# -1.1142739679728243e-16

# Try with cross-validated prediction
y_pred_ElNet = cross_val_predict(ElNet, x, y, cv=3)
r2_score(y, y_pred_ElNet)
# -0.0002686650433182912
mean_squared_error(y, y_pred_ElNet)
# 7.85883e-05  # the best value is 0.0
mean_absolute_error(y, y_pred_ElNet)
# 0.0005987262  # the best value is 0.0

# Multi Task Elastic Net with CV
ElNetCV = MultiTaskElasticNetCV(random_state=0, verbose=1)
ElNetCV.fit(x, y)
# UserWarning: Objective did not converge. You might want to increase the
# number of iterations

# Plot the log of a 1000-row window of the inputs and of the targets.
start = 10000
for spectrogram in (x, y):
    plt.figure()
    plt.pcolormesh(np.log(spectrogram[start:start + 1000, :]))
    plt.ylabel('time')
    plt.xlabel('freq')

##### S_clean #################################################################
SVC(kernel='poly', probability=True, degree=3), SVC(kernel='poly', probability=True, degree=4), SVC(kernel='poly', probability=True, degree=5), DecisionTreeClassifier(), KNeighborsClassifier(), GaussianNB(), RandomForestClassifier(), AdaBoostClassifier(), QuadraticDiscriminantAnalysis(), LinearDiscriminantAnalysis(), ElasticNetCV(max_iter=10000), LarsCV(), LassoCV(max_iter=10000), LassoLarsCV(), LogisticRegressionCV(scoring=multi_class_log_loss), MultiTaskElasticNetCV(), MultiTaskLassoCV(), OrthogonalMatchingPursuitCV(), RidgeClassifierCV() ] algorithm = 17 if len(sys.argv) > 1: algorithm = int(sys.argv[1]) name = names[algorithm] clf = classifiers[algorithm] output_file_name = output_file_names[algorithm] + file_identifier t = time.time() random_state = np.random.RandomState(0) print "Fitting classifier " + name
def GetAllModelsForComparison(X_train, Y_train):
    """Return a dict mapping class names to default-constructed instances.

    Every value is built by calling the class with no arguments.

    The original literal listed several keys more than once
    (``BaseEstimator``, ``ClassifierMixin``, ``RegressorMixin``,
    ``TransformerMixin``, ``MetaEstimatorMixin``, ``LabelBinarizer``,
    ``Parallel``, ``SGDClassifier``, ``SGDRegressor``). In a dict literal
    only one entry per key survives, so the repeats are removed here; the
    resulting keys and their order are unchanged.

    Parameters
    ----------
    X_train, Y_train
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{class name: instance}``.
    """
    # NOTE(review): several entries (the *Mixin classes, BaseEstimator,
    # Parallel, Hinge/Log/ModifiedHuber/SquaredLoss, StandardScaler,
    # LabelBinarizer) do not look like fit/predict models - confirm they
    # belong in a model comparison.
    #
    # Entries that were commented out in the original and remain excluded:
    # Huber, RandomizedLasso, RandomizedLogisticRegression, BallTree,
    # DistanceMetric, KDTree, LSHForest, GaussianProcess, ABCMeta,
    # BaseDiscreteNB, BaseNB, BaseEnsemble, VotingClassifier,
    # OneVsOneClassifier, OneVsRestClassifier, OutputCodeClassifier,
    # ClassifierChain, MultiOutputClassifier, MultiOutputEstimator,
    # MultiOutputRegressor.
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        'Parallel': Parallel(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor(),
    }
    return models
Apply sparse linear regression (ElasticNet) for easier analysis Force the coefficients to be non-negative as none drug should increase the presence of the bacterias ''' X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1) folds = 5 alphas = np.logspace(1, 5, 3) l1_ratios = np.linspace(0, 1, 2, endpoint=True) models = MultiTaskElasticNetCV(l1_ratio=l1_ratios, alphas=alphas, verbose=1, cv=folds, n_jobs=-1) models.fit(X_train, Y_train) models.score(X_test, Y_test) print "Alpha: ", models.alpha_ print "L1 ratio: ", models.l1_ratio_ print "Score of Elastic-net on test data: ", models.score(X_test, Y_test) model_EN = ElasticNet(l1_ratio=models.l1_ratio_, alpha=models.alpha_) model_EN.fit(np.concatenate((X_train, X_test)), np.concatenate((Y_train, Y_test))) test = np.rint(models.predict(X_test)).astype('int16') coeff = model_EN.coef_.T
class PhonesthemesModel(object):
    """
    Fits a MultiTaskElasticNetCV on n-gram features of a vocabulary to
    predict the words' vectors (or, when bound morphemes are supplied, the
    residuals of a linear regression on morpheme features).

    Attributes
    ----------
    self.config: Dict
        A dictionary of the arguments passed into the object. After
        ``train`` it also holds the train arguments under "train_config".

    self.ngrams: List[int]
        A list of integers that refer to the ngram sizes to use.

    self.mode: List[str]
        List of str indicating the positions in the word to use as
        candidate phonesthemes. Possible elements are "start", "end",
        and "all".

    self.min_count: int
        Minimum number of ngram occurrences in order to be included as
        a feature.

    self.one_hot: bool
        Whether or not to use one-hot features instead of counts for
        the phonestheme ngram features.

    self.vectors
        Dictionary of word to vector, where word is either a string
        or a tuple of strings (phoneme representation).

    self.phonesthemes_reg
        The MultiTaskElasticNetCV model fit on the phonestheme feature
        vectors to predict the phonestheme targets.

    self.X_ngram
        The input feature vectors used to fit the Elastic Net.

    self.ngram_to_idx
        A mapping from ngram to feature index of X_ngram.

    self.phonemes_to_graphemes
        Mapping from a tuple of phonemes to the word it spells; only
        filled in when ``train`` is given graphemes-to-phonemes data.

    self.is_trained
        A boolean describing whether this model has been trained or not.
    """

    def __init__(self, ngrams, mode, min_count, one_hot):
        # Snapshot the constructor arguments; "self" (and "__class__", if
        # present) are not configuration and are dropped.
        self.config = locals()
        self.config.pop("self")
        self.config.pop("__class__", None)
        logger.info("Config: ")
        pprint.pprint(self.config)

        self.ngrams = ngrams
        self.mode = mode
        self.min_count = min_count
        self.one_hot = one_hot

        # Placeholder values, these get set when we call train
        self.vectors = None
        self.phonesthemes_reg = None
        self.X_ngram = None
        self.ngram_to_idx = None
        self.phonemes_to_graphemes = None
        self.is_trained = False

    def get_phonesthemes(self):
        """Return the result of ``get_phonesthemes_from_model`` for this model."""
        return get_phonesthemes_from_model(self)

    def train(self, vectors_path, bound_morphemes_path=None,
              word_segmentations_path=None,
              graphemes_to_phonemes_path=None,
              n_jobs=1, l1_ratio=0.5):
        """
        Load vectors, build n-gram features and fit the elastic net.

        Parameters
        ----------
        vectors_path
            Path to a whitespace-separated text file with one word per
            line: the word followed by its vector components.
        bound_morphemes_path
            Optional path to a file with one bound morpheme per line. When
            given, the targets are replaced by the residuals of a linear
            regression on morpheme features.
        word_segmentations_path
            Optional path to a tab-separated file of ``word<TAB>morphemes``
            (morphemes separated by spaces).
        graphemes_to_phonemes_path
            Optional path to a tab-separated file of ``word<TAB>phonemes``
            (phonemes separated by spaces). When given, the keys in the
            vectors file are read as comma-separated phoneme sequences.
        n_jobs
            Passed to the regressors that are fitted.
        l1_ratio
            Passed to MultiTaskElasticNetCV.
        """
        train_config = locals()
        train_config.pop("self")
        train_config.pop("__class__", None)
        self.config["train_config"] = train_config
        logger.info("Train config: ")
        pprint.pprint(train_config)

        # Load vectors, where the keys can be words represented as
        # sequences of characters (normal word vectors) or words represented
        # as sequences of phonemes (phonemicized vectors).
        logger.info("Reading vectors from {}".format(vectors_path))
        self.vectors = OrderedDict()
        with open(vectors_path) as vectors_file:
            for line in tqdm(vectors_file,
                             total=get_line_number(vectors_path)):
                split_line = line.rstrip("\n").split()
                word = split_line[0]
                # If we have phonemicized vectors, the keys to the dict are
                # tuples of comma-separated phonemes representing a word.
                if graphemes_to_phonemes_path is not None:
                    word = tuple(word.split(","))
                embedding = np.array([float(val) for val in split_line[1:]])
                self.vectors[word] = embedding

        # Randomly shuffle the OrderedDict
        random_seed = 0
        logger.info(
            "Shuffling vectors with random seed {}".format(random_seed))
        random.seed(random_seed)
        vector_items = list(self.vectors.items())
        # random.shuffle is in-place
        random.shuffle(vector_items)
        self.vectors = OrderedDict(vector_items)

        vocabulary = list(self.vectors.keys())
        targets = np.asarray(list(self.vectors.values()))

        # Load phonemes to graphemes if we were given g2p data
        if graphemes_to_phonemes_path:
            logger.info("Reading graphemes to phonemes data "
                        "from {}".format(graphemes_to_phonemes_path))
            self.phonemes_to_graphemes = {}
            # Load the graphemes to phonemes data
            with open(
                    graphemes_to_phonemes_path) as graphemes_to_phonemes_file:
                for line in tqdm(
                        graphemes_to_phonemes_file,
                        total=get_line_number(graphemes_to_phonemes_path)):
                    split_line = line.rstrip("\n").split("\t")
                    word = split_line[0]
                    phonemes = tuple(split_line[1].split(" "))
                    self.phonemes_to_graphemes[phonemes] = word

        if bound_morphemes_path is not None:
            # Load morpheme data if we were given bound morphemes
            word_segmentations, bound_morphemes = self._load_morpheme_data(
                word_segmentations_path, bound_morphemes_path)

            # Update targets with predictions of the morpheme model. This is
            # equivalent to using the model residuals as the new targets.
            targets = self._get_morpheme_residuals(
                vocabulary, targets, bound_morphemes,
                graphemes_to_phonemes_path, word_segmentations,
                n_jobs=n_jobs)

        # Get the ngram features for the vocabulary.
        self.X_ngram, self.ngram_to_idx = build_ngram_features(
            vocabulary=vocabulary,
            one_hot=self.one_hot,
            ngram_range=self.ngrams,
            mode=self.mode,
            freq_thres=self.min_count)

        logger.info("Shape of ElasticNet input (number of words, "
                    "number of candidate phonesthemes): {}".format(
                        self.X_ngram.shape))
        logger.info("Shape of ElasticNet targets (number of words, "
                    "vector dimension): {}".format(targets.shape))

        # Fit a MultiTaskElasticNetCV model to extract phonesthemes.
        logger.info("Fitting MultiTaskElasticNetCV")
        self.phonesthemes_reg = MultiTaskElasticNetCV(l1_ratio=l1_ratio,
                                                      n_jobs=n_jobs,
                                                      random_state=0,
                                                      cv=5)
        self.phonesthemes_reg.fit(self.X_ngram, targets)
        logger.info("Done fitting MultiTaskElasticNetCV")
        self.is_trained = True

    def _load_morpheme_data(self, word_segmentations_path,
                            bound_morphemes_path):
        """
        Read the optional word segmentations and the bound morphemes.

        Returns
        -------
        (word_segmentations, bound_morphemes)
            ``word_segmentations`` maps a word to its list of morphemes
            (empty when no segmentation path is given);
            ``bound_morphemes`` is the list of lines of the morpheme file.
        """
        # Load word segmentations
        word_segmentations = {}
        if word_segmentations_path:
            logger.info("Loading word segmentations from {}".format(
                word_segmentations_path))
            with open(word_segmentations_path) as word_segmentations_file:
                for line in tqdm(
                        word_segmentations_file,
                        total=get_line_number(word_segmentations_path)):
                    split_line = line.rstrip("\n").split("\t")
                    assert len(split_line) == 2
                    word = split_line[0]
                    morphemes = split_line[1].split(" ")
                    word_segmentations[word] = morphemes
            logger.info("Loaded {} word segmentations".format(
                len(word_segmentations)))

        # Load the list of bound morphemes
        logger.info(
            "Loading bound morphemes from {}".format(bound_morphemes_path))
        bound_morphemes = []
        with open(bound_morphemes_path) as bound_morphemes_file:
            for line in tqdm(bound_morphemes_file,
                             total=get_line_number(bound_morphemes_path)):
                bound_morphemes.append(line.rstrip("\n"))
        logger.info("Loaded {} bound morphemes".format(len(bound_morphemes)))
        return (word_segmentations, bound_morphemes)

    def _get_morpheme_residuals(self, vocabulary, targets, bound_morphemes,
                                graphemes_to_phonemes_path,
                                word_segmentations=None, n_jobs=1):
        """
        Regress ``targets`` on morpheme features and return the residuals
        (``targets`` minus the regression's predictions).
        """
        # Get the vectors vocabulary, and convert to string if we are using
        # phonemicized vectors.
        if graphemes_to_phonemes_path is None:
            string_vectors_vocab = vocabulary
        else:
            # The vocab of the phonemicized vectors converted to graphemes.
            string_vectors_vocab = [
                self.phonemes_to_graphemes[phonemes]
                for phonemes in vocabulary
            ]

        # Build the morpheme feature vectors.
        morpheme_features = build_morpheme_features(string_vectors_vocab,
                                                    bound_morphemes,
                                                    word_segmentations)

        logger.info("Input shape for morpheme pretraining linear regression "
                    "(number of words, number of morphemes): {}".format(
                        morpheme_features.shape))
        logger.info("Target shape for morpheme pretraining linear regression "
                    "(number of words, vector dimension): {}".format(
                        targets.shape))
        morph_reg = LinearRegression(n_jobs=n_jobs)
        logger.info("Pretraining on morpheme features.")
        morph_reg = morph_reg.fit(morpheme_features, targets)

        logger.info("Calculating residuals of of linear regression done "
                    "on morpheme features and using that as the train "
                    "vectors for the ngram feature model.")
        # Get the residuals of the model for use in the second model.
        morph_reg_pred_y = morph_reg.predict(morpheme_features)
        morph_reg_residuals = np.subtract(targets, morph_reg_pred_y)
        return morph_reg_residuals

    @staticmethod
    def _arrays_close(first, second):
        """
        Return True when both values are None, or when both have the same
        shape and are element-wise close according to ``np.allclose``.
        """
        if first is None or second is None:
            return first is None and second is None
        # Comparing shapes first avoids np.allclose silently broadcasting
        # (or raising on) arrays of different shapes.
        if np.shape(first) != np.shape(second):
            return False
        return bool(np.allclose(first, second))

    def __eq__(self, other):
        """
        Two PhonesthemesModel objects are equal if their members are equal.

        Returns NotImplemented for objects that are not PhonesthemesModel
        instances so Python can fall back to its default comparison.
        Untrained models (``vectors`` / ``X_ngram`` still None) compare
        without raising.
        """
        if not isinstance(other, PhonesthemesModel):
            return NotImplemented

        # Compare their ngrams
        if self.ngrams != other.ngrams:
            return False
        # Compare their mode
        if self.mode != other.mode:
            return False
        # Compare their min count
        if self.min_count != other.min_count:
            return False
        # Compare whether they use one-hot or frequency features
        if self.one_hot != other.one_hot:
            return False

        # Compare that they have the same set of vectors in the same order
        if self.vectors is None or other.vectors is None:
            if self.vectors is not other.vectors:
                return False
        else:
            if len(self.vectors) != len(other.vectors):
                return False
            for this_word, other_word in zip(self.vectors, other.vectors):
                if this_word != other_word:
                    return False
                if not self._arrays_close(self.vectors[this_word],
                                          other.vectors[other_word]):
                    return False

        # Check that they were trained on the same features
        if not self._arrays_close(self.X_ngram, other.X_ngram):
            return False
        # Check that they have the same mapping of ngram to feature idx
        if self.ngram_to_idx != other.ngram_to_idx:
            return False
        return True

    def __ne__(self, other):
        # Python 2 does not derive __ne__ from __eq__. On Python 3 this is
        # exactly what the default __ne__ does, so it is defined
        # unconditionally rather than behind a six.PY2 guard.
        equal = self.__eq__(other)
        return equal if equal is NotImplemented else not equal
# Concatenate the continuous and the discrete feature blocks column-wise.
x_vec = np.concatenate((x_vec_con, x_vec_dis), axis=1)

# Prediction targets: registered and casual counts, one column each.
y_registered = bike_rel['registered'].values.astype(float)
y_casual = bike_rel['casual'].values.astype(float)
y = np.stack((y_registered, y_casual), axis=1)

# Build the models and evaluate them.
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import MultiTaskElasticNetCV

x1, x2, y1, y2 = train_test_split(x_vec, y, test_size=0.2, random_state=20)

############ Lasso
mtl = MultiTaskLassoCV(alphas=np.logspace(-3, -1, 3), cv=8, verbose=3)
mtl.fit(x1, y1)
mtl.score(x1, y1)
mtl.score(x2, y2)

############ ElasticNetCV
mte = MultiTaskElasticNetCV(l1_ratio=np.logspace(-3, -1, 3),
                            alphas=np.logspace(-3, -1, 3),
                            cv=8, verbose=3)
mte.fit(x1, y1)
# Score the elastic-net model that was just fitted; these two lines used to
# score the Lasso model (``mtl``) a second time.
mte.score(x1, y1)
mte.score(x2, y2)
# scikit-learn metrics are documented as metric(y_true, y_pred); the calls
# below pass the ground truth first. MAE and MSE are symmetric in their two
# arguments, so the reported numbers are unchanged.
p(mean_squared_error(Y_test, lasso_predict))

# ## Ridge
#
# In[25]:

ridge_model = Ridge(alpha=0.01)
ridge_model = ridge_model.fit(X=X_train, y=Y_train)
ridge_predict = ridge_model.predict(X_test)
p(mean_absolute_error(Y_test, ridge_predict))
p(mean_squared_error(Y_test, ridge_predict))

# ## Elastic Net
# In[27]:

# Only one alpha is supplied, so the cross-validation has a single alpha
# candidate to evaluate.
enet_params = {
    'alpha': [1e-7],
}
enet_model = MultiTaskElasticNetCV(alphas=enet_params['alpha'])
enet_model = enet_model.fit(X=X_train, y=Y_train)
enet_predict = enet_model.predict(X_test)
p(mean_absolute_error(Y_test, enet_predict))
p(mean_squared_error(Y_test, enet_predict))
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
# Linear Models
from sklearn.linear_model import LassoCV
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import MultiTaskElasticNetCV
from sklearn.linear_model import RidgeCV
# SVM
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.svm import NuSVC

clf = MultiTaskElasticNetCV()

# Keep only the features whose importance reaches the 0.23 threshold.
# NOTE(review): the previous comment said 0.25 while the code used 0.23;
# the code value is kept - confirm which one was intended.
sfm = SelectFromModel(clf, threshold=0.23)
sfm.fit(trX, trY)
new_trX = sfm.transform(trX)
n_features = new_trX.shape[1]
# Indices of the selected features (single-argument print call behaves the
# same on Python 2 and Python 3).
print(sfm.get_support(indices=True))

# ``pd.tools.plotting`` was removed from pandas; ``pd.plotting`` is the
# supported location of scatter_matrix.
pd.plotting.scatter_matrix(ttl_X, diagonal="kde")
plt.tight_layout()
plt.show()
def __init__(self, **hyperparams):
    """Store the keyword hyperparameters and build the wrapped ``Op`` from them."""
    self._hyperparams = hyperparams
    # Same dict that was just stored, expanded into Op's keyword arguments.
    self._wrapped_model = Op(**hyperparams)
# NOTE(review): the original indentation was lost; everything after the
# ``for`` line is assumed to belong to the loop body, since each pass
# appends one entry to testFs / trainFs - confirm against the source file.
netTrainFs = []
lastX = np.zeros((X_raw.shape[0], hiddenSize))
# Train in chunks of ``quanta`` epochs. Floor division keeps range() valid
# on Python 3 and matches Python 2 integer division.
for i in range(epochs // quanta):
    # Single-argument print call works on Python 2 and 3; the double space
    # reproduces the separator the print statement inserted.
    print('Epoch:  %s' % (i * quanta))
    an.trainSupervised(quanta, trndata,
                       initialLearningrate=learningrate,
                       decay=1,  # 0.999,
                       myWeightdecay=weightDecay,
                       momentum=momentum)
    netTrainFs.append(an.scoreOnDS(trndata))

    X, X_test = an.transform(X_raw), an.transform(X_test_raw)
    # Stop when the transformed features are identical to the previous
    # pass. Raising a bare string is itself a TypeError, so a real
    # exception is raised with the same message.
    if (lastX == X).all():
        raise RuntimeError('problem')
    lastX = copy.deepcopy(X)

    clf = MultiTaskElasticNetCV()
    clf.fit(X, Y)

    # Pick one decision threshold per output column from the training
    # predictions.
    predTrain = np.array(clf.predict(X))
    splits = []
    for col in range(predTrain.shape[1]):
        bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col],
                                                        Y[:, col])
        splits.append(bestSplit)

    # Binarise the continuous predictions with the per-column thresholds.
    pred = np.array(clf.predict(X_test))
    for col in range(pred.shape[1]):
        pred[:, col] = [1 if e >= splits[col] else 0 for e in pred[:, col]]
        predTrain[:, col] = [1 if e >= splits[col] else 0
                             for e in predTrain[:, col]]

    testFs.append(metrics.f1_score(Y_test, pred))
    trainFs.append(metrics.f1_score(Y, predTrain))
    #des+='\n EN test f1: '+ str(testF)
    #des+=' , EN train f1: '+ str(trainF)