def ridge_optimized_fit(m_structure_list, a_min, a_max, step):
    """Scan ridge alphas over [a_min, a_max) and return the coefficients
    refit at the alpha minimizing the leave-one-out CV error."""
    ref = m_structure_list[0]
    n_beg = ref.num_beg_rules
    n_cluster = ref.num_cluster_rules
    n_j = ref.num_j_rules
    x_colm = n_beg + n_cluster + n_j

    # Collect energies and correlation rows for the non-paramagnetic
    # structures. These do not depend on alpha, so build them once.
    # (The original rebuilt them every iteration, and also indexed the
    # structure list with the filtered counter, which misaligned rows
    # whenever a structure was skipped.)
    energies = []
    rows = []
    for mat in m_structure_list:
        if mat.mag_phase != "pera" and mat.phase_name != "pm":
            energies.append(mat.enrg)
            row = np.zeros(x_colm)
            row[:n_beg] = mat.BEG_sums[:n_beg]
            row[n_beg:n_beg + n_cluster] = mat.Cluster_sums[:n_cluster]
            row[n_beg + n_cluster:] = mat.J_sums[:n_j]
            rows.append(row)
    x_matrix = np.vstack(rows)
    energies = np.asarray(energies)

    alpha = a_min
    old_CV2_min = np.inf
    alpha_opt = a_min
    while alpha < a_max:
        # Leave-one-out cross-validation at this alpha.
        CV2 = 0.0
        for i in range(len(energies)):
            mask = np.arange(len(energies)) != i
            r_fit = Ridge(alpha=alpha)
            r_fit.fit(x_matrix[mask], energies[mask])
            Ei = r_fit.predict(x_matrix[i:i + 1])[0]
            CV2 += (energies[i] - Ei) ** 2
        CV2 /= len(energies)  # average over the structures actually fit
        if CV2 < old_CV2_min:
            old_CV2_min = CV2
            alpha_opt = alpha
        alpha += step  # the original loop never advanced alpha

    # Refit on the full data set at the optimal alpha. get_params() returns
    # hyperparameters, not fitted coefficients, so use coef_ instead.
    r_fit = Ridge(alpha=alpha_opt)
    r_fit.fit(x_matrix, energies)
    Js = r_fit.coef_
    print(alpha_opt)
    print(old_CV2_min)
    return Js
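# NOTE: For reference, scikit-learn's RidgeCV performs the same leave-one-out
# alpha scan in closed form. A minimal sketch with stand-in data; x_matrix and
# energies here are random placeholders for the arrays built above.
import numpy as np
from sklearn.linear_model import RidgeCV

x_matrix = np.random.rand(30, 8)   # stand-in for the correlation matrix
energies = np.random.rand(30)      # stand-in for the structure energies

# alphas plays the role of the a_min/a_max/step scan; cv=None (the default)
# uses an efficient leave-one-out scheme.
model = RidgeCV(alphas=np.arange(0.01, 1.01, 0.01))
model.fit(x_matrix, energies)
print(model.alpha_, model.coef_)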
import numpy as np
import sklearn.metrics as sm
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV


def ridge_model(xy):  # ridge model
    x = xy[:, 0].reshape(-1, 1)
    y = xy[:, 1]
    model = Ridge()
    # alpha must be non-negative for Ridge; the original linspace(-1, 10, 30)
    # included invalid negative values.
    alpha_can = np.linspace(0.01, 10, 30)
    model = GridSearchCV(model, param_grid={'alpha': alpha_can}, cv=5)
    model.fit(x, y)
    print(model.best_params_)
    pred_y = model.predict(x)
    print(model.get_params())
    ridge_r2 = sm.r2_score(y, pred_y)
    ridge_absolute = sm.mean_absolute_error(y, pred_y)
    ridge_squared = sm.mean_squared_error(y, pred_y)
    ridge_median = sm.median_absolute_error(y, pred_y)
    drawing_ridge(xy, x, pred_y, model.best_params_)  # project-local plotting helper
    return {
        'ridge_score': {
            'ridge_r2': round(ridge_r2, 5),
            'ridge_absolute': round(ridge_absolute, 5),
            'ridge_squared': round(ridge_squared, 5),
            'ridge_median': round(ridge_median, 5)
        }
    }
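# NOTE: The metrics above are computed on the same data the grid search was
# fit on, so they are optimistic. A hedged sketch of out-of-fold scoring with
# stand-in data of the same shape:
import numpy as np
import sklearn.metrics as sm
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

xy = np.random.rand(50, 2)  # placeholder for the real input
x, y = xy[:, 0].reshape(-1, 1), xy[:, 1]

# Each sample is predicted by a model that never saw it during fitting.
pred_oof = cross_val_predict(Ridge(alpha=1.0), x, y, cv=5)
print(sm.r2_score(y, pred_oof))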
def test_regressor_modifications(self):
    regressor = Ridge(alpha=1e-8)
    pcovr = self.model(mixing=0.5, regressor=regressor)

    # The PCovR regressor matches the original
    self.assertEqual(regressor.get_params(), pcovr.regressor.get_params())

    # The PCovR regressor updates its parameters
    # to match the original regressor
    regressor.set_params(alpha=1e-6)
    self.assertEqual(regressor.get_params(), pcovr.regressor.get_params())

    # Fitting the regressor outside PCovR fits the PCovR regressor
    regressor.fit(self.X, self.Y)
    self.assertTrue(hasattr(pcovr.regressor, "coef_"))

    # The fitted PCovR regressor (regressor_) doesn't change after fitting,
    # even if the original regressor's parameters do
    pcovr.fit(self.X, self.Y)
    regressor.set_params(alpha=1e-4)
    self.assertTrue(hasattr(pcovr.regressor_, "coef_"))
    self.assertNotEqual(regressor.get_params(), pcovr.regressor_.get_params())
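# NOTE: The behavior this test pins down mirrors scikit-learn's clone
# semantics: hyperparameters are copied, fitted state is not. A standalone
# sketch with made-up data:
import numpy as np
from sklearn.base import clone
from sklearn.linear_model import Ridge

X, y = np.random.rand(10, 3), np.random.rand(10)

original = Ridge(alpha=1e-8).fit(X, y)
copy = clone(original)                       # same hyperparameters...
assert copy.get_params() == original.get_params()
assert not hasattr(copy, "coef_")            # ...but the fitted state is dropped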
with open('valid_sets.json', 'w') as fp:
    dump(valid_sets, fp)

# # Load the model

# In[ ]:

model_dir = abspath('')  # renamed from `dir` to avoid shadowing the builtin
enc_categ = load(join(model_dir, 'enc_categ.pkl'))
enc_text = load(join(model_dir, 'enc_text.pkl'))
clf = load(join(model_dir, 'model.pkl'))
clf

# # Prediction on a random example

# In[ ]:

X_rand = random(
    m=1,      # number of samples
    n=24627,  # number of features
    random_state=clf.get_params()['random_state']  # optional
)
X_rand

# In[ ]:

pred = clf.predict(X=X_rand)
pred
ridge_score = cvs(ridge_reg, X_train, Y_train,
                  scoring="neg_mean_squared_error", cv=10)
ridge_score = np.sqrt(-ridge_score)
display_scores(reg_score)
display_scores(ridge_score)

#%%
# Grid search to find good hyperparameters
from sklearn.model_selection import GridSearchCV

# get parameters for hyperparameter tuning
#print(regressor.get_params().keys())
print(ridge_reg.get_params().keys())

#param_grid_reg = [{'fit_intercept': [True, False], 'normalize': [True, False]}]
param_grid_ridge = [{
    'alpha': [1e-3, 1e-2, 1e-1, 1],
    'fit_intercept': [False],
    'normalize': [True, False],
    'solver': ['cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}, {
    'alpha': [1e-3, 1e-2, 1e-1, 1],
    'fit_intercept': [True],
    'normalize': [True, False],
    'solver': ['sparse_cg', 'sag']
}]
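# NOTE: Ridge's `normalize` parameter was deprecated and later removed in
# scikit-learn 1.x; on current versions the equivalent grid scales features
# explicitly via a Pipeline. A sketch reusing X_train/Y_train from above:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()), ('ridge', Ridge())])
param_grid = {
    'ridge__alpha': [1e-3, 1e-2, 1e-1, 1],
    'ridge__solver': ['cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}
search = GridSearchCV(pipe, param_grid,
                      scoring='neg_mean_squared_error', cv=10)
search.fit(X_train, Y_train)
print(search.best_params_)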
X = clf.transform(X)
print("pca X: ", X, len(X), len(X[0]))
X = PolynomialFeatures(degree=1).fit_transform(X)
print("poly features: ", X, len(X), len(X[0]))

# normalize expects a bool, not the string "True" (and is removed in
# scikit-learn >= 1.2, where features should be scaled explicitly).
clf = Ridge(alpha=0.001, max_iter=100000, normalize=True, fit_intercept=True)
# params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 50, 100]}
# clf = GridSearchCV(Ridge(normalize=True, max_iter=100000), params, cv=5, verbose=True)
X = X[:, 1:len(X[0])]  # drop the constant column added by PolynomialFeatures
clf.fit(X, y)
# y_pred = clf.predict(X)
# print(y_pred, y)
# print(np.sum((y_pred - y) ** 2) / len(X))
# print(clf.score(X, y))
# print(clf.best_estimator_)
# print(clf.best_score_)
# print(clf.best_params_)
# print(clf.cv_results_)
print("coef:", clf.coef_)
print(clf.n_iter_)
y_pred = clf.predict(X)
print("y:", y)
print("y_pred:", y_pred)
print(np.sum((y_pred - y) ** 2) / len(X))
print(clf.score(X, y))
print(clf.get_params())
joblib.dump(clf, "poly_Ridge_20k.pkl")
# clf = joblib.load('filename.pkl')
def main():
    # load data
    dnn = VGG16
    #dnn = ResNet50
    #dnn = VGG19
    X_train, y_train, X_test, y_test = load_data(hdf5_path)

    # if features are not obtained yet
    #get_features(hdf5_path, dnn, 'block2_pool')

    print("Loading feature from HDF5 files...")
    hdf5_manip = MyHDF5()
    hdf5_manip.hfile = os.path.join(hdf5_path, dnn.__name__, "train_features.hdf5")
    train_features = hdf5_manip.load()
    hdf5_manip.hfile = os.path.join(hdf5_path, dnn.__name__, "test_features.hdf5")
    test_features = hdf5_manip.load()
    hdf5_manip.hfile = os.path.join(hdf5_path, "y_train.hdf5")
    y_train = hdf5_manip.load()
    hdf5_manip.hfile = os.path.join(hdf5_path, "y_test.hdf5")
    y_test = hdf5_manip.load()

    train_model = False
    alpha = 15.0
    if train_model:
        print("Training linear regression model...")
        clf = Ridge(alpha=alpha)
        tic = time.time()
        clf.fit(train_features, y_train)
        toc = time.time()
        print("Time elapsed: {0} seconds".format(toc - tic))
        print("Predicting...")
        predictions = clf.predict(test_features)
        mse = metrics.mean_squared_error(y_test, predictions)
        print("MSE: {0}".format(mse))

    pickle_manip = MyPickle()
    pickle_manip.pfile = os.path.join(pickle_path, dnn.__name__,
                                      "clf_all_" + str(alpha) + ".sav")
    if train_model:
        # Store the model in a pickle file
        pickle_manip.dump(clf)
    clf = pickle_manip.load()

    print("\nRidge model: ")
    print(clf.get_params())
    print(clf.score(train_features, y_train))

    print("\nModel on Train data:")
    pred = clf.predict(train_features)
    print(pred)
    print(metrics.mean_squared_error(y_train, pred))

    print("\nModel on Test data:")
    predictions = clf.predict(test_features)
    print(predictions)
    mse = metrics.mean_squared_error(y_test, predictions)
    print("\nMSE: {0}".format(mse))

    #### Test results
    extract_features = False
    if extract_features:
        # image parameters
        width = 50
        height = 50
        channels = 3

        # grab all filenames
        extensions = [".jpg"]
        file_names = [fn for fn in os.listdir(test_img_path)
                      if any(fn.endswith(ext) for ext in extensions)]

        # initialize image array which holds frames
        num_imgs = len(file_names)
        X = np.empty((num_imgs, height, width, channels), dtype='float32')

        print("\nConverting jpegs to numpy arrays...\n")
        img_size = (height, width)
        for idx in range(num_imgs):
            if idx % 1000 == 0:
                print("Converting image {0}".format(file_names[idx]))
            file_path = os.path.join(test_img_path, file_names[idx])
            img_manip = MyImage(file_path, img_size)
            img = img_manip.conv_jpg2array()
            X[idx] = img

        print("\nWriting X to HDF5...")
        hdf5_manip = MyHDF5()
        hdf5_manip.hfile = os.path.join(test_out_path, "X.hdf5")
        hdf5_manip.write(X)

        base_model = dnn(weights='imagenet', include_top=False)
        # Keras >= 2 uses `inputs`/`outputs`; the old `input`/`output`
        # keyword arguments were removed.
        model = Model(inputs=base_model.input,
                      outputs=base_model.get_layer('block2_pool').output)

        print("Computing X features...")
        X_features = model.predict(X)
        print("X features shape, before reshape: {0}".format(X_features.shape))
        X_features = np.reshape(X_features, (X_features.shape[0], -1))
        print("X features shape, after reshape: {0}".format(X_features.shape))

        print("Writing X feature to HDF5 files...")
        hdf5_manip = MyHDF5()
        hdf5_manip.hfile = os.path.join(test_out_path, dnn.__name__, "X_features.hdf5")
        hdf5_manip.write(X_features)
    else:
        print("Loading X feature from HDF5 files...")
        hdf5_manip = MyHDF5()
        hdf5_manip.hfile = os.path.join(test_out_path, dnn.__name__, "X_features.hdf5")
        X_features = hdf5_manip.load()

    print(X_features.shape)
    predictions = clf.predict(X_features)
    print(predictions)
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import Ridge

data = pd.read_csv("../resources/data.csv")

r = Ridge()
r.set_params(alpha=10)
print(r.get_params()['alpha'])
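# NOTE: The snippet above imports LeaveOneOut but never uses it; presumably
# the intent was leave-one-out CV over the CSV data. A minimal sketch under
# that assumption (the "target" column name is made up):
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneOut, cross_val_score

data = pd.read_csv("../resources/data.csv")
X = data.drop(columns=["target"])   # hypothetical target column
y = data["target"]

scores = cross_val_score(Ridge(alpha=10), X, y,
                         cv=LeaveOneOut(),
                         scoring="neg_mean_squared_error")
print(np.sqrt(-scores.mean()))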
class Regressor(object):
    """Wraps scikit-learn regressors."""

    def __init__(self, modelname='Linear', num_bagged_est=None,
                 random_state=None, **kwargs):
        """Construct a regressor

        Parameters
        ----------
        modelname : str, model name to be used as regressor
            Available models:
            - "XGBoost",
            - "LightGBM",
            - "Keras",
            - "RandomForest",
            - "ExtraTrees",
            - "Tree",
            - "Bagging",
            - "AdaBoost",
            - "Linear"

        num_bagged_est : int or None
            Number of estimators to be averaged after bagged fitting.
            If None then bagged fitting is not performed.

        random_state : int, RandomState instance or None, optional, default=None
            If int, random_state is the seed used by the random number
            generator; if RandomState instance, random_state is the random
            number generator; if None, the random number generator is the
            RandomState instance used by the models.

        **kwargs : default = None
            Parameters of the corresponding regressor.
            Examples : n_estimators, max_depth, ...
        """
        if not _IS_SKLEARN_INSTALLED:
            raise ValueError('Scikit-learn is required for this module')

        self.__modelname = modelname
        if self.__modelname == "XGBoost" and not _IS_XGBOOST_INSTALLED:
            raise ValueError('Package XGBoost is not installed.')
        elif self.__modelname == "LightGBM" and not _IS_LIGHTGBM_INSTALLED:
            raise ValueError('Package LightGBM is not installed.')
        elif self.__modelname == "Keras" and not _IS_KERAS_INSTALLED:
            raise ValueError('Package Keras is not installed.')

        self.__regressor = None
        self.__set_regressor(self.__modelname)
        self.set_params(**kwargs)

        self.__num_bagged_est = num_bagged_est
        if type(self.__num_bagged_est) != int and self.__num_bagged_est is not None:
            raise ValueError("num_bagged_est must be either None or an integer.")

        self.__random_state = random_state
        if type(self.__random_state) != int and self.__random_state is not None:
            raise ValueError("random_state must be either None or an integer.")
        self.set_params(random_state=self.__random_state)

        self.__fitOK = False
        self.__bagged_est = None

    def get_params(self, deep=True):
        params = {}
        params.update({"modelname": self.__modelname,
                       "num_bagged_est": self.__num_bagged_est,
                       "random_state": self.__random_state})
        params.update(self.__regressor.get_params())
        return params

    def set_params(self, **params):
        self.__fitOK = False
        self.__bagged_est = None

        if 'modelname' in params.keys():
            self.__set_regressor(params['modelname'])
            del params['modelname']
            if self.__modelname == "XGBoost" and not _IS_XGBOOST_INSTALLED:
                raise ValueError('Package XGBoost is not installed.')
            elif self.__modelname == "LightGBM" and not _IS_LIGHTGBM_INSTALLED:
                raise ValueError('Package LightGBM is not installed.')
            elif self.__modelname == "Keras" and not _IS_KERAS_INSTALLED:
                raise ValueError('Package Keras is not installed.')

        if 'num_bagged_est' in params.keys():
            self.__num_bagged_est = params['num_bagged_est']
            del params['num_bagged_est']
            if type(self.__num_bagged_est) != int and self.__num_bagged_est is not None:
                raise ValueError("num_bagged_est must be either None or an integer.")

        if 'random_state' in params.keys():
            self.__random_state = params['random_state']
            if 'random_state' not in self.__regressor.get_params().keys():
                del params['random_state']
            if type(self.__random_state) != int and self.__random_state is not None:
                raise ValueError("random_state must be either None or an integer.")

        if 'build_fn' in params.keys() and self.get_estimator_name == 'Keras':
            setattr(self.__regressor, 'build_fn', params['build_fn'])
            del params['build_fn']

        self.__regressor.set_params(**params)

    def __set_regressor(self, modelname):
        self.__modelname = modelname
        if modelname == 'XGBoost':
            self.__regressor = XGBRegressor()
        elif modelname == "LightGBM":
            self.__regressor = LGBMRegressor()
        elif modelname == "Keras":
            self.__regressor = KerasRegressor(build_fn=Sequential())
        elif modelname == 'RandomForest':
            self.__regressor = RandomForestRegressor()
        elif modelname == 'ExtraTrees':
            self.__regressor = ExtraTreesRegressor()
        elif modelname == 'Tree':
            self.__regressor = DecisionTreeRegressor()
        elif modelname == "Bagging":
            self.__regressor = BaggingRegressor()
        elif modelname == "AdaBoost":
            self.__regressor = AdaBoostRegressor()
        elif modelname == "Linear":
            self.__regressor = Ridge()
        else:
            raise ValueError(
                "Model name invalid. Please choose between LightGBM "
                "(if installed), XGBoost (if installed), Keras (if installed), "
                "RandomForest, ExtraTrees, Tree, Bagging, AdaBoost or Linear.")

    def fit(self, X, y, **kwargs):
        """Fit model. In case num_bagged_est is not None, additionally perform
        a type of bagging ensemble: an ensemble of the same model fit with
        different seed values / reshuffled data, which aims to decrease the
        variance of the predictions.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        **kwargs : default = None
            Additional fitting arguments accepted by the model. Not tested.

        Returns
        -------
        object
            self
        """
        y = self.__process_target(y)

        if self.__num_bagged_est is None:
            self.__regressor.fit(X, y, **kwargs)
        else:
            if not hasattr(self.__regressor, 'random_state'):
                warnings.warn("The regressor " + str(self.__modelname) +
                              " has no random_state attribute and only"
                              " random shuffling will be used.")
            # Fall back to 0 when random_state is None so the per-estimator
            # seed arithmetic below does not fail on None + i.
            base_seed = self.__random_state if self.__random_state is not None else 0
            self.__bagged_est = []
            for i in range(0, self.__num_bagged_est):
                X_shuff, y_shuff = shuffle(X, y, random_state=base_seed + i)
                # Fit a fresh copy per member; the original reused the shared
                # estimator object, so every list entry aliased the same fit.
                est = self.get_estimator_copy()
                if hasattr(est, 'random_state'):
                    est.set_params(random_state=base_seed + i)
                est.fit(X_shuff, y_shuff, **kwargs)
                self.__bagged_est.append(est)

        self.__fitOK = True
        return self

    def predict(self, X):
        """Predicts the target.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        array of shape = [n_samples, ]
            The predicted target.
        """
        try:
            if not callable(getattr(self.__regressor, "predict")):
                raise ValueError("predict attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:
            if self.__num_bagged_est is None:
                return self.__regressor.predict(X)
            bagged_pred = np.zeros(X.shape[0])
            for est in self.__bagged_est:
                bagged_pred += est.predict(X) / self.__num_bagged_est
            return bagged_pred
        raise ValueError("You must call the fit function before !")

    def transform(self, X):
        """Transforms X.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Returns
        -------
        array-like or sparse matrix of shape = [n_samples, n_features]
            The transformed X.
        """
        try:
            if not callable(getattr(self.__regressor, "transform")):
                raise ValueError("transform attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:
            return self.__regressor.transform(X)
        raise ValueError("You must call the fit function before !")

    def score(self, X, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training and cv.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        Returns
        -------
        float
            R^2 of self.predict(X) wrt. y.
        """
        try:
            if not callable(getattr(self.__regressor, "score")):
                raise ValueError("score attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:
            return self.__regressor.score(X, y, sample_weight)
        raise ValueError("You must call the fit function before !")

    def cross_val_predict(self, X, y, cv=None, scoring=None, **kwargs):
        """Performs cross-validated hold-out predictions for stacking.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training and cv.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy.
            Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a StratifiedKFold,
            - an object to be used as a cross-validation generator,
            - an iterable yielding train, test splits.

        scoring : callable, default: None
            A callable to evaluate the predictions on the cv set. If None,
            an accuracy scorer is created (note this is a classification
            metric; the per-fold scoring below is currently commented out).

        **kwargs : default = None
            Additional fitting arguments accepted by the model. Not tested.

        Returns
        -------
        array of shape = [n_samples, ]
            The hold-out target predictions.
        """
        y = self.__process_target(y)

        y_pred = np.zeros(X.shape[0])
        cv = check_cv(cv, y, classifier=False)
        n_splits = cv.get_n_splits(X, y)
        if scoring is None:
            scoring = make_scorer(accuracy_score)
        i = 0
        score_mean = 0.0
        print("Starting hold out prediction with {} splits.".format(n_splits))
        for train_index, cv_index in cv.split(X, y):
            X_train = X[train_index]
            y_train = y[train_index]
            X_cv = X[cv_index]
            y_cv = y[cv_index]
            # Fit a fresh copy per fold so fitted state does not leak
            # across folds.
            est = self.get_estimator_copy()
            est.fit(X_train, y_train, **kwargs)
            y_pred_cv = est.predict(X_cv)
            # score = scoring(y_cv, y_pred_cv)
            # print("Train size: {} ::: cv size: {} score (fold {}/{}): {:.4f}"
            #       .format(len(train_index), len(cv_index), i + 1, n_splits, score))
            # score_mean += score / float(n_splits)
            y_pred[cv_index] = y_pred_cv
            i += 1
        # print("Mean score: {:.4f}".format(score_mean))
        return y_pred

    def cross_validate(self, X, y, cv=None, scoring=None, **kwargs):
        """Performs cross-validation.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training.

        y : array-like of shape = [n_samples, ]
            The numerical encoded target for regression tasks.

        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy.
            Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a StratifiedKFold,
            - an object to be used as a cross-validation generator,
            - an iterable yielding train, test splits.

        scoring :
            For scikit-learn models: string, callable, list/tuple, dict or
            None, default: None. A single string or a callable to evaluate
            the predictions on the test set. If None, the estimator's
            default scorer (if available) is used.
            For LightGBM: callable or None, optional (default=None).
            Customized evaluation function; should return
            (eval_name, eval_result, is_higher_better) or a list of such
            tuples.
            For XGBoost: callable or None, optional (default=None).
            Customized evaluation function.

        **kwargs : default = None
            Additional fitting arguments.

        Returns
        -------
        The cross-validation results returned by the underlying backend.
        """
        y = self.__process_target(y)

        if self.get_estimator_name == 'LightGBM':
            params = self.__regressor.get_params()
            data = lgb.Dataset(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = lgb.cv(params, data, feval=scoring, folds=cv, **kwargs)
        elif self.get_estimator_name == 'XGBoost':
            params = self.__regressor.get_xgb_params()
            data = xgb.DMatrix(X, label=y)
            cv = check_cv(cv, y, classifier=False)
            ret = xgb.cv(params, data, feval=scoring, folds=cv, **kwargs)
        else:
            ret = cross_validate(self.__regressor, X, y, cv=cv, scoring=scoring)
        return ret

    def __process_target(self, y):
        y = np.array(y, dtype='float')
        return y

    def get_estimator(self):
        # The original returned self.__classifier, an attribute that does not
        # exist on this class (copy-paste from a classifier wrapper).
        return self.__regressor

    def get_estimator_copy(self):
        return make_copy(self.__regressor)

    @property
    def feature_importances_(self):
        if self.__fitOK:
            if hasattr(self.__regressor, 'feature_importances_'):
                return self.__regressor.feature_importances_
            raise ValueError('The regressor ' + self.get_estimator_name +
                             ' does not have a feature_importances_ attribute.')
        raise ValueError("You must call the fit function before !")

    @property
    def get_estimator_name(self):
        return self.__modelname
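# NOTE: A minimal usage sketch of this wrapper, assuming the module-level
# flags and imports above are satisfied; the data here is made up.
import numpy as np

X = np.random.rand(100, 5)
y = np.random.rand(100)

reg = Regressor(modelname='Linear', alpha=1.0)  # wraps sklearn Ridge
reg.fit(X, y)
print(reg.get_params()['alpha'])
print(reg.score(X, y))

# Bagged variant: three Ridge fits on reshuffled data, predictions averaged.
bagged = Regressor(modelname='Linear', num_bagged_est=3, random_state=0)
bagged.fit(X, y)
print(bagged.predict(X)[:5])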
# In[20]:

mean_absolute_error(y_valid, ridge_regr.predict(X_valid_sparse))

# #### Try RidgeCV

# In[37]:

kf = KFold(n_splits=3, shuffle=True, random_state=42)
alphas = np.logspace(-2, 6, 10)
ridge = Ridge(random_state=42)

# In[15]:

ridge.get_params()

# In[41]:

params = {'alpha': np.logspace(-2, 2, 30)}

# In[42]:

get_ipython().run_cell_magic(
    'time', '',
    "grid = GridSearchCV(ridge, param_grid=params,\n"
    "                    scoring='neg_mean_absolute_error',\n"
    "                    cv=kf, n_jobs=-1, verbose=1)\n"
    "grid.fit(X_train_part_sparse, y_train_part)")

# In[44]:

mean_absolute_error(y_valid, grid.predict(X_valid_sparse))
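# NOTE: The heading above says "Try RidgeCV" but the cells end up using
# GridSearchCV. A minimal sketch of the RidgeCV equivalent, reusing the same
# names (kf, X_train_part_sparse, y_train_part, X_valid_sparse, y_valid):
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas=np.logspace(-2, 2, 30),
                   scoring='neg_mean_absolute_error',
                   cv=kf)
ridge_cv.fit(X_train_part_sparse, y_train_part)
print(ridge_cv.alpha_)
mean_absolute_error(y_valid, ridge_cv.predict(X_valid_sparse))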
def ridge(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid
    alpha = np.linspace(1, 100, 100)
    random_grid = {'alpha': alpha}
    ridge = Ridge(random_state=42)

    # Look at parameters used by our current model
    print('Parameters currently in use:\n')
    pprint(ridge.get_params())

    # Use the random grid to search for best hyperparameters.
    # Random search of parameters, using 3-fold cross-validation,
    # searching across 100 different combinations on all available cores.
    # Pass the splitter itself, not ps2.split(): a generator would be
    # exhausted after the first pass over the folds.
    ridge_random = RandomizedSearchCV(estimator=ridge,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2, verbose=2, random_state=42,
                                      n_jobs=-1)
    # Fit the random search model
    ridge_random.fit(train_X, train_y)
    pprint(ridge_random.best_params_)
    cv_result_rd = ridge_random.cv_results_
    BestPara_random = ridge_random.best_params_
    print(BestPara_random)

    ## Grid search of parameters, using 3-fold cross-validation,
    ## refined around the random-search optimum
    from sklearn.model_selection import GridSearchCV
    alpha = np.linspace(BestPara_random["alpha"] - 1,
                        BestPara_random["alpha"] + 1, 10)
    grid_grid = {'alpha': alpha}
    ridge_grid = GridSearchCV(estimator=ridge, param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2, verbose=2, n_jobs=-1)
    # Fit the grid search model
    ridge_grid.fit(train_X, train_y)
    BestPara_grid = ridge_grid.best_params_
    pprint(ridge_grid.best_params_)
    cv_results_grid = ridge_grid.cv_results_

    # Fit the baseline model
    ridge.fit(train_X, train_y)

    # prediction
    predict_y = ridge_random.predict(test_X)
    predict_y_grid = ridge_grid.predict(test_X)
    predict_y_base = ridge.predict(test_X)

    # Performance metrics
    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = mean_squared_error(predict_y_grid, test_y)    #, squared=False)
    errors_Random_CV = mean_squared_error(predict_y, test_y)       #, squared=False)
    errors_baseline = mean_squared_error(predict_y_base, test_y)   #, squared=False)

    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    print('ridge results:', results)

    if True:
        fig = plt.figure(figsize=(15, 8))
        x_axis = range(3)
        plt.bar(x_axis, results)
        plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
        #plt.show()
        plt.savefig('ridge_error_compare.png')

    print('min index:', results.index(min(results)))
    # Use ==, not `is`, when comparing integer values.
    if results.index(min(results)) == 0:
        model = ridge_grid
        pred_y = predict_y_grid
    elif results.index(min(results)) == 1:
        model = ridge_random
        pred_y = predict_y          # prediction from the random search
    else:
        model = ridge               # was `mode = ridge` (typo)
        pred_y = predict_y_base

    #feature importance
    #predictors = x_train.columns
    #coef = Series(lreg.coef_, predictors).sort_values()
    #coef.plot(kind='bar', title='Model Coefficients, kfold:' + str(kfold))
    #plt.show()
    #plt.savefig('ridge_feature_importance.png')

    fig = plt.figure(figsize=(20, 8))
    ax = fig.gca()
    x_label = range(0, len(pred_y))
    plt.title("kfold=" + str(kfold))
    ax.plot(x_label, pred_y, 'r--', label="predict")
    ax.plot(x_label, test_y, label="ground_truth")
    ax.set_ylim(0, 200)
    ax.legend()
    #plt.show()
    plt.savefig('ridge_prediction.png')

    return ridge_grid.predict, ridge_grid.best_estimator_
y = dataMat[:, 0]  # response variable y

# ======== Ridge regression ========
alphas = 10 ** np.linspace(-3, 3, 100)
# RidgeCV takes the whole grid of candidate alphas and selects the best one
# by cross-validation (a plain Ridge would need a single scalar alpha, so
# the original `Ridge(alpha=alphas)` line was dropped).
model = RidgeCV(alphas=alphas, store_cv_values=True)
model.fit(X, y)  # fit the linear model
print(model.alpha_)            # chosen ridge alpha
print(model.cv_values_.shape)  # per-sample CV loss values

plt.plot(alphas, model.cv_values_.mean(axis=0))
plt.plot(model.alpha_, min(model.cv_values_.mean(axis=0)), "bo")

print(model.coef_)         # coefficients
print(model.intercept_)    # intercept
print(model.score(X, y))   # R^2, goodness of fit
print(model.get_params())  # get parameter info
print(model.set_params(fit_intercept=False))  # reset parameters
# print('best alpha from cross-validation:', model.alpha_)  # RidgeCV only

# Predict with the model
predicted = model.predict(X)

# Scatter plot; arguments: x axis, y axis
plt.scatter(X, y, marker='x')
plt.plot(X, predicted, c='r')
# Axis labels
plt.xlabel("x")
plt.ylabel("y")
# Show the figure
plt.show()
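# NOTE: Newer scikit-learn releases rename RidgeCV's store_cv_values /
# cv_values_ to store_cv_results / cv_results_. If your version has the new
# spelling, the equivalent of the snippet above is (same X, y, alphas):
model = RidgeCV(alphas=alphas, store_cv_results=True)
model.fit(X, y)
print(model.alpha_)
print(model.cv_results_.shape)  # (n_samples, n_alphas)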
def ridge(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # Create the random grid
    alpha = np.linspace(0, 1, 10)
    random_grid = {'alpha': alpha}
    ridge = Ridge(random_state=42)

    from pprint import pprint
    # Look at parameters used by our current model
    print('Parameters currently in use:\n')
    pprint(ridge.get_params())

    # Use the random grid to search for best hyperparameters.
    # Random search of parameters, using 3-fold cross-validation.
    # Pass the splitter itself rather than ps2.split(); the generator
    # would be exhausted after a single pass over the folds.
    ridge_random = RandomizedSearchCV(estimator=ridge,
                                      param_distributions=random_grid,
                                      scoring='neg_mean_squared_error',
                                      cv=ps2, verbose=2, random_state=42,
                                      n_jobs=-1)
    # Fit the random search model
    ridge_random.fit(train_X, train_y)
    pprint(ridge_random.best_params_)
    cv_result_rd = ridge_random.cv_results_
    BestPara_random = ridge_random.best_params_

    ## Grid search of parameters, using 3-fold cross-validation,
    ## refined around the random-search optimum
    from sklearn.model_selection import GridSearchCV
    # The original built this grid with range() over floats, which raises a
    # TypeError; search a narrow linspace around the optimum instead.
    alpha = np.linspace(max(BestPara_random["alpha"] - 0.2, 0.0),
                        BestPara_random["alpha"] + 0.2, 10)
    grid_grid = {'alpha': alpha}
    ridge_grid = GridSearchCV(estimator=ridge, param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2, verbose=2, n_jobs=-1)
    # Fit the grid search model
    ridge_grid.fit(train_X, train_y)
    BestPara_grid = ridge_grid.best_params_
    pprint(ridge_grid.best_params_)
    cv_results_grid = ridge_grid.cv_results_

    # Fit the baseline model
    ridge.fit(train_X, train_y)

    # prediction
    predict_y = ridge_random.predict(test_X)
    predict_y_grid = ridge_grid.predict(test_X)
    predict_y_base = ridge.predict(test_X)

    # Performance metrics
    from sklearn.metrics import mean_squared_log_error
    from sklearn.metrics import mean_squared_error

    def RMLSE(predict_y_grid, predict_y, predict_y_base, test_y):
        errors_Grid_CV = np.sqrt(mean_squared_log_error(predict_y_grid, test_y))
        errors_Random_CV = np.sqrt(mean_squared_log_error(predict_y, test_y))
        errors_baseline = np.sqrt(mean_squared_log_error(predict_y_base, test_y))
        return errors_Grid_CV, errors_Random_CV, errors_baseline

    errors_Grid_CV = mean_squared_error(predict_y_grid, test_y, squared=False)
    errors_Random_CV = mean_squared_error(predict_y, test_y, squared=False)
    errors_baseline = mean_squared_error(predict_y_base, test_y, squared=False)

    x_axis = range(3)
    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    plt.bar(x_axis, results)
    plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
    plt.show()

    # =============================================================================
    # #feature importance
    # num_feature = len(rf_random.best_estimator_.feature_importances_)
    # plt.figure(figsize=(12,6))
    # plt.bar(range(0,num_feature*4,4),rf_random.best_estimator_.feature_importances_)
    # label_name = X.keys()
    # plt.xticks(range(0,num_feature*4,4), label_name)
    # plt.title("Feature Importances")
    # plt.show()
    # =============================================================================

    # feature importance: Ridge has no feature_importances_ attribute (that
    # call would raise AttributeError), so plot the absolute values of the
    # fitted coefficients instead.
    importances = np.abs(ridge_grid.best_estimator_.coef_)
    num_feature = len(importances)
    plt.figure(figsize=(24, 6))
    plt.bar(range(0, num_feature * 4, 4), importances)
    label_name = X.keys()
    plt.xticks(range(0, num_feature * 4, 4), label_name)
    plt.title("Feature Importances" + ", kfold=" + str(kfold))
    plt.show()

    fig = plt.figure(figsize=(20, 8))
    ax = fig.gca()
    x_label = range(0, len(predict_y_grid))
    plt.title("kfold=" + str(kfold))
    ax.plot(x_label, predict_y_grid, 'r--', label="predict")
    ax.plot(x_label, test_y, label="ground_truth")
    ax.set_ylim(0, 200)
    ax.legend()
    plt.show()

    return ridge_grid.predict, ridge_grid.best_estimator_
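# NOTE: Where a model-agnostic measure is preferred over raw coefficient
# magnitudes, scikit-learn's permutation importance gives comparable bars.
# A sketch reusing ridge_grid, test_X, test_y, and X from the function above:
from sklearn.inspection import permutation_importance

result = permutation_importance(ridge_grid.best_estimator_, test_X, test_y,
                                n_repeats=10, random_state=42)
for name, mean_drop in zip(X.keys(), result.importances_mean):
    print(name, round(mean_drop, 4))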
def single_model(args):
    import h5py
    import pandas as pd
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, prepare_output_file, read_hdf5_single
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_squared_error
    from tqdm import tqdm

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    #phenotypes = pd.read_table(args.phenotype_file)
    phenotypes = read_hdf5_dataset(args.phenotype_file)
    logger.info('read genotypes from file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_x:
        logger.info('transpose X')
        X = X.T
    y = phenotypes
    if args.feature_indices_file:
        logger.info('read feature indices from: ' + args.feature_indices_file)
        feature_indices = read_hdf5_dataset(args.feature_indices_file)
        X = np.take(X, feature_indices, axis=1)
    if args.normalize_x:
        logger.info('normalize X')
        X = StandardScaler().fit_transform(X)
    if args.sample_indices_file:
        logger.info('read sample indices from: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(~np.isnan(phenotypes))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]

    logger.info('read parent table from file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)

    logger.info('use model ' + args.model_name)
    logger.info('X.shape = %s, y.shape = %s' % (repr(X.shape), repr(y.shape)))
    if args.model_name == 'ridge':
        from sklearn.linear_model import Ridge
        model = Ridge(alpha=10000)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'ridge_cv':
        from sklearn.linear_model import Ridge
        alphas = 10.0 ** np.arange(1, 6)
        train_masks, test_masks = generate_cv_masks(sample_indices,
                                                    parent_table,
                                                    k_female=5, k_male=5)
        cv_metrics = {}
        cv_metrics['mse'] = np.zeros((len(alphas), train_masks.shape[0]))
        cv_metrics['r2'] = np.zeros((len(alphas), train_masks.shape[0]))
        pbar = tqdm(total=len(alphas) * train_masks.shape[0])
        for i, alpha in enumerate(alphas):
            for j in range(train_masks.shape[0]):
                model = Ridge(alpha=alpha)
                model.fit(X[train_masks[j]], y[train_masks[j]])
                y_pred = model.predict(X[test_masks[j]])
                cv_metrics['mse'][i, j] = mean_squared_error(y[test_masks[j]], y_pred)
                cv_metrics['r2'][i, j] = r2_score(y[test_masks[j]], y_pred)
                pbar.update(1)
        pbar.close()
        best_alpha = alphas[cv_metrics['r2'].mean(axis=1).argmax()]
        logger.info('optimized alpha = %f' % best_alpha)
        model = Ridge(alpha=best_alpha)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'gpr':
        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF
        kernel = RBF() + WhiteKernel()
        model = GaussianProcessRegressor(kernel=kernel)
        model.fit(X_train, y_train)
        logger.info('kernel params: %s' % repr(model.get_params()))
        y_pred_train = np.ravel(model.predict(X_train))
        y_pred = np.ravel(model.predict(X))
    elif args.model_name == 'gpy':
        from GPy.kern import Linear
        from GPy.models import GPRegression
        kernel = Linear(input_dim=2, name='linear')
        model = GPRegression(X_train, y_train, kernel=kernel)
        model.optimize()
        # The original branch never computed predictions, so the r2_score
        # call below would fail with a NameError; GPy's predict returns a
        # (mean, variance) tuple.
        y_pred = np.ravel(model.predict(X)[0])
        y_pred_train = y_pred[sample_indices]
    else:
        raise ValueError('unknown model name: ' + args.model_name)
    logger.info('r2 score = %f' % r2_score(y_train, y_pred_train))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: ' + model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: ' + pred_file)
    with h5py.File(pred_file, 'w') as f:
        if args.output_residuals:
            f.create_dataset('residual', data=(y - y_pred))
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'ridge_cv':
            f.create_dataset('alpha', data=alphas)
            g = f.create_group('cv_metrics')
            for key in cv_metrics.keys():
                g.create_dataset(key, data=cv_metrics[key])
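# NOTE: For completeness, a short sketch of reading the saved predictions
# back, assuming the h5py layout written by single_model above (the file
# path is relative to args.output_dir):
import h5py

with h5py.File('predictions', 'r') as f:
    y_true = f['y_true'][...]
    y_pred = f['y_pred'][...]
    if 'cv_metrics' in f:
        r2_per_fold = f['cv_metrics/r2'][...]
print(y_true.shape, y_pred.shape)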
scores_dict = {}
mse_dict = {}
mse_dict_raw = {}
mae_dict = {}
mae_dict_raw = {}
mape_dict = {}
scores_dict_f3 = {}
mse_dict_f3 = {}
mse_dict_f3_raw = {}
mae_dict_f3 = {}
mae_dict_f3_raw = {}
mape_dict_f3 = {}

# time.clock() was removed in Python 3.8; perf_counter() is the replacement.
test_start_time = time.perf_counter()

getparams_dict = aux_reg_regressor.get_params(deep=True)
print("getparams_dict: ", getparams_dict)
getparams_df = pd.DataFrame.from_dict(data=getparams_dict, orient='index')
getparams_df.to_csv(analysis_path + model_id + str(model)[:-4] + "getparams.csv")

model_as_pkl_filename = analysis_path + model_id + str(model)[:-4] + ".pkl"
joblib.dump(aux_reg_regressor, filename=model_as_pkl_filename)

#np.savetxt(analysis_path + "rf5getparams.txt", fmt='%s', X=str(aux_reg_regressor.get_params(deep=True)))
#np.savetxt(analysis_path + "rf5estimatorparams.txt", fmt='%s', X=aux_reg_regressor.estimator_params)  # USELESS
#np.savetxt(analysis_path + "rf5classes.txt", fmt='%s', X=aux_reg_regressor.classes_)
#np.savetxt(analysis_path + "rf5baseestim.txt", fmt='%s', X=aux_reg_regressor.base_estimator_)

#TODO: CHANGE THIS BACK IF CUT SHORT!!
for files in combined_filenames:
def ridge(X, Y, kfold=3, feature_set=None):
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    # base
    ridge = Ridge(random_state=42)
    ridge.fit(train_X, train_y)
    print('Base Parameters in use:')
    print(ridge.get_params())

    # grid search over a log-spaced alpha grid
    alpha_log = np.linspace(-8, 5, 14)
    alpha = [math.pow(10, i) for i in alpha_log]
    grid_grid = {'alpha': alpha}
    # Pass the splitter itself, not ps2.split(); the generator would be
    # exhausted after one pass over the folds.
    ridge_grid = GridSearchCV(estimator=ridge, param_grid=grid_grid,
                              scoring='neg_mean_squared_error',
                              cv=ps2, verbose=2, n_jobs=-1)
    # Fit the grid search model
    ridge_grid.fit(train_X, train_y)
    BestPara_grid = ridge_grid.best_params_
    print("grid search, best parameter:", ridge_grid.best_params_)
    cv_results_grid = ridge_grid.cv_results_

    # Refine around the first optimum. The refinement grid assumes the best
    # alpha is at least ~11; guard against an empty range() otherwise.
    new_measure = int(ridge_grid.best_params_['alpha'])
    new_alpha = [x for x in range(100, max(new_measure * 10, 110), 10)]
    print(new_alpha)
    grid_grid = {'alpha': new_alpha}
    ridge_grid2 = GridSearchCV(estimator=ridge, param_grid=grid_grid,
                               scoring='neg_mean_squared_error',
                               cv=ps2, verbose=2, n_jobs=-1)
    # Fit the grid search model
    ridge_grid2.fit(train_X, train_y)
    BestPara_grid = ridge_grid2.best_params_
    print("grid search, best parameter:", ridge_grid2.best_params_)
    cv_results_grid2 = ridge_grid2.cv_results_

    # prediction
    predict_y_grid = ridge_grid.predict(test_X)
    predict_y_base = ridge.predict(test_X)
    predict_y_grid2 = ridge_grid2.predict(test_X)

    # Performance metrics (RMSE)
    errors_Grid_CV = np.sqrt(mean_squared_error(predict_y_grid, test_y))
    errors_Grid2_CV = np.sqrt(mean_squared_error(predict_y_grid2, test_y))
    errors_baseline = np.sqrt(mean_squared_error(predict_y_base, test_y))

    results = [errors_Grid2_CV, errors_Grid_CV, errors_baseline]
    print('ridge results:', results)
    return ridge_grid2
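# NOTE: When the winning alpha lands on the edge of a log grid, the search
# range was probably too narrow. A small sketch of that check, reusing the
# fitted ridge_grid from the function above:
best = ridge_grid.best_params_['alpha']
grid = sorted(ridge_grid.param_grid['alpha'])
if best in (grid[0], grid[-1]):
    print("best alpha {:g} is on the grid boundary; widen the search".format(best))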
data = data.dropna()

# Train test split:
data.dropna(axis=1, how='any', inplace=True)
a = len(data.T) - 1  # The last column is the label
X = data.iloc[:, range(0, a)]
Y = data.iloc[:, a]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.34)

# fit ridge regression model (the original comment said "Random forest"):
model = Ridge(alpha=1).fit(X_train, Y_train)
pred = model.predict(X_test)

# Get scores and analysis
score_r = model.score(X_test, Y_test)
print("Model = {}, R^2 = {}".format(model.get_params(), score_r))
print(score_r)

# X_test1.to_csv('xtest.csv', na_rep='NA', index=False)
# X_test1['dataset'].to_csv('ytest.csv', na_rep='NA', index=False)
#print(model.coef_)
#print(X_test.columns)
#print(zip(model.coef_, X_test.columns))
#print(pred)
#print(Y_test)

# Nonstandard NRMSD variant: squared errors scaled by the product of the
# prediction and target means, then averaged.
NRMSD = (pred - Y_test) ** 2 / (pred.mean() * Y_test.mean())
NRMSD = NRMSD.mean()
print("NRMSD:")
print(NRMSD)
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()

clf = Ridge(alpha=2.0)
clf.fit(X_train, Y_train)
clf.score(X_cv, Y_cv)
clf.get_params(deep=True)

arreglo = clf.predict(X_train)
readerWriter = ReaderAndWriter()
readerWriter.write_file([arreglo], 'answer1.txt')


def date_substraction(date1, date2):
    # Approximate difference in days between two "YYYY-MM-DD" strings,
    # treating every month as 30 days and every year as 365.
    d1 = date1.split("-")
    d2 = date2.split("-")
    yearDiff = int(d1[0]) - int(d2[0])
    monthDiff = int(d1[1]) - int(d2[1])
    dayDiff = int(d1[2]) - int(d2[2])
    return dayDiff + 30 * monthDiff + 365 * yearDiff


def days_between(d1, d2):
    # The body was cut off in the original; an assumed completion that
    # computes the exact day difference via datetime.
    from datetime import datetime
    d1 = datetime.strptime(d1, "%Y-%m-%d")
    d2 = datetime.strptime(d2, "%Y-%m-%d")
    return abs((d2 - d1).days)