def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)
    ps_mask = cval.PredefinedSplit([1, 1, 2, 2], indices=False)
    ps_ind = cval.PredefinedSplit([1, 1, 2, 2], indices=True)

    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind),
                            (ps_mask, ps_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind)
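# NOTE: the indices flag and the sklearn.cross_validation module (cval above)
# were removed in later scikit-learn releases; splitters now always yield
# integer indices. A minimal sketch of the PredefinedSplit([1, 1, 2, 2]) case
# with the modern sklearn.model_selection API (assumed available):
import numpy as np
from sklearn.model_selection import PredefinedSplit

# each entry of test_fold names the fold in which that sample is used for
# testing, so [1, 1, 2, 2] defines two folds of two samples each
ps = PredefinedSplit(test_fold=[1, 1, 2, 2])
for train, test in ps.split():
    print(train, test)
# -> [2 3] [0 1]
#    [0 1] [2 3]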
def train_gbr(X_input_train, Y_input_train, X_input_test, Y_input_test, name):
    """Train a Gradient Boosting regression model with the Scikit-Learn API.

    This function trains the model and saves it as a pickle file in
    ./silicat/saved/

    INPUTS:
        X_input_train: Numpy array
            The training input values
        Y_input_train: Numpy array
            The training target values
        X_input_test: Numpy array
            The testing input values
        Y_input_test: Numpy array
            The testing target values
        name: String
            The name of the file for saving the network

    OUTPUTS:
        A saved network in ./silicat/saved/ with filename = name.
    """
    # For cross-validation we put the test data at the end of the dataset
    # and record which rows belong to the predefined test fold.
    X_dataset = np.concatenate([X_input_train, X_input_test])
    Y_dataset = np.concatenate([Y_input_train, Y_input_test])

    # PredefinedSplit expects one fold label per sample, not a list of
    # indices: -1 keeps the training rows in every training set, and 0
    # marks the test rows as the single validation fold.
    test_fold = np.concatenate([-np.ones(len(X_input_train)),
                                np.zeros(len(X_input_test))])

    # set up the cross-validation here
    ps = cross_validation.PredefinedSplit(test_fold=test_fold)

    # perform the grid search, fit the model, get the best model and save it
    # with joblib (grid search currently disabled):
    # mg = GridSearchCV(ensemble.GradientBoostingRegressor(), cv=ps,
    #                   param_grid={"max_depth": [4, 5, 6],
    #                               "learning_rate": [0.01, 0.1, 1.0, 10.0]},
    #                   n_jobs=1)

    # Fit regression model
    params = {'n_estimators': 1000, 'max_depth': 2,
              'min_samples_split': 2,  # must be >= 2 in recent scikit-learn
              'learning_rate': 0.1, 'loss': 'ls'}
    mg = ensemble.GradientBoostingRegressor(**params)
    mg.fit(X_input_train, Y_input_train.ravel())

    joblib.dump(mg, os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'saved', name + '.pkl'))
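# If re-enabled inside train_gbr, the disabled grid search can be wired to
# the predefined split built above. A hedged sketch only: the grid values
# come from the commented-out line, GridSearchCV is assumed to be imported
# from sklearn.grid_search in this old-API codebase, and fitting on the
# concatenated dataset is an assumption about the intended use.
mg = GridSearchCV(ensemble.GradientBoostingRegressor(), cv=ps,
                  param_grid={"max_depth": [4, 5, 6],
                              "learning_rate": [0.01, 0.1, 1.0, 10.0]},
                  n_jobs=1, refit=False)
mg.fit(X_dataset, Y_dataset.ravel())  # ps indexes into the concatenated data
print(mg.best_params_)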
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = cval.PredefinedSplit(folds)
    for train_ind, test_ind in ps:
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
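# The same round-trip as a sketch against the modern sklearn.model_selection
# API (assumptions: KFold now takes n_splits and split() is called on the
# data; a fixed random_state is added so the two iterations over kf.split()
# agree; fold order matches because PredefinedSplit iterates fold labels in
# sorted order):
import numpy as np
from sklearn.model_selection import KFold, PredefinedSplit

X = np.arange(10).reshape(-1, 1)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

folds = -1 * np.ones(10, dtype=int)
for i, (_, test_ind) in enumerate(kf.split(X)):
    folds[test_ind] = i

ps = PredefinedSplit(folds)
for (kf_tr, kf_te), (ps_tr, ps_te) in zip(kf.split(X), ps.split()):
    assert np.array_equal(kf_tr, ps_tr) and np.array_equal(kf_te, ps_te)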
def train_svm(X_input_train, Y_input_train, X_input_test, Y_input_test, name):
    """Train a support vector machine model with the Scikit-Learn API.

    This function trains the model and saves it as a pickle file in
    ./silicat/saved/

    INPUTS:
        X_input_train: Numpy array
            The training input values
        Y_input_train: Numpy array
            The training target values
        X_input_test: Numpy array
            The testing input values
        Y_input_test: Numpy array
            The testing target values
        name: String
            The name of the file for saving the network

    OUTPUTS:
        A saved network in ./silicat/saved/ with filename = name.
    """
    # For cross-validation we put the test data at the end of the dataset
    # and record which rows belong to the predefined test fold.
    X_dataset = np.concatenate([X_input_train, X_input_test])
    Y_dataset = np.concatenate([Y_input_train, Y_input_test])

    # As in train_gbr: -1 keeps the training rows in every training set,
    # and 0 marks the test rows as the single validation fold.
    test_fold = np.concatenate([-np.ones(len(X_input_train)),
                                np.zeros(len(X_input_test))])

    # set up the cross-validation here
    ps = cross_validation.PredefinedSplit(test_fold=test_fold)

    # perform the grid search, fit the model, get the best model and save it
    # with joblib (grid search currently disabled):
    # mg = GridSearchCV(SVR(kernel='rbf', epsilon=0.1), cv=ps,
    #                   param_grid={"C": [0.01, 0.1, 1.0, 10.0, 100.0],
    #                               "epsilon": [0]},
    #                   n_jobs=1)

    mg = SVR(kernel='rbf', C=1.5, epsilon=0.01)
    mg.fit(X_input_train, Y_input_train.ravel())

    joblib.dump(mg, os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'saved', name + '.pkl'))
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # neither train nor test should be a boolean mask by default
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
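# In current scikit-learn these generators live in sklearn.model_selection
# and always return integer indices, so the dtype check above holds by
# construction. A minimal sketch of the same assertion against the modern
# API (only KFold and PredefinedSplit shown):
import numpy as np
from sklearn.model_selection import KFold, PredefinedSplit

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 1, 2, 2])
for cv in [KFold(2), PredefinedSplit([1, 1, 2, 2])]:
    for train, test in cv.split(X, y):
        assert np.asarray(train).dtype.kind != 'b'
        assert np.asarray(test).dtype.kind != 'b'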
# try to keep n_estimators at 500 or less for performance reasons,
# over 2000 may exceed 8 GB RAM
if do_optimize:
    # cv_gs = cv.ShuffleSplit(len(train_y), n_iter=4, test_size=0.35)
    # 4 folds without shuffling gives better cross validation scores
    # cv_gs = cv.KFold(len(train_y), n_folds=4, shuffle=False)

    # custom split based on predicting the first/last 5 days from the
    # remaining of each month (split_arr has one entry per day 1-19
    # present in the training data)
    split_arr = np.array(
        [0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1])
    train_split_f = lambda x: split_arr[x.index.day - 1]
    train_split = train_set_df.apply(train_split_f)["atemp"].values
    cv_gs = cv.PredefinedSplit(train_split)

    regist_grid_search = sklearn.grid_search.GridSearchCV(
        ens.BaggingRegressor(base_estimator=ens.AdaBoostRegressor(
            base_estimator=tree.DecisionTreeRegressor())),
        search_space_regist, cv=cv_gs, scoring="mean_squared_error",
        refit=False, n_jobs=2, pre_dispatch="2*n_jobs", verbose=3)
    casual_grid_search = sklearn.grid_search.GridSearchCV(
        ens.BaggingRegressor(base_estimator=ens.AdaBoostRegressor(
            base_estimator=tree.DecisionTreeRegressor())),
"max_depth": [10], "max_leaf_nodes": [None], "n_estimators": [500] } # cv_gs = cv.ShuffleSplit(len(train_y), n_iter=4, test_size=0.35) # 4 folds without shuffling gives better cross validation scores # cv_gs = cv.KFold(len(train_y), n_folds=4, shuffle=False) # custom split based on predicting the first/last 5 days from the remaining of each month split_arr = np.array( [0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1]) split_f = lambda x: split_arr[x.index.day - 1] test_split = train_set_df.apply(split_f)["atemp"].values cv_gs = cv.PredefinedSplit(test_split) # split_f = lambda x: (x.index.day - 1)// 15 grid_search_gs = sklearn.grid_search.GridSearchCV( ens.RandomForestRegressor(), search_space, cv=cv_gs, scoring="mean_squared_error", refit=False, n_jobs=1, pre_dispatch="2*n_jobs", verbose=3) print("running grid search") grid_search_gs.fit(train_X, train_y)
# Convert 'Day' to 'datetime' and set as index
hires.Day = pd.to_datetime(hires.Day, unit='D')
hires.set_index('Day', inplace=True)

# Extract first column (daily hires) and convert to 'float'
hires = hires.iloc[:, 0].astype('float').to_frame('Hires')

# Apply logarithmic transformation

# Create 7 new variables representing the lagged time series at lag = 1, ..., 7

# Create 2 new variables representing the smoothed time series
# (rolling averages at 7 and 30 days)

# Drop missing values

# Define cross-validation split by leaving out 2016 as test set
split = cv.PredefinedSplit(test_fold=(hires.index.year == 2016) - 1)

# Create a pipeline that scales the data and trains a support vector
# regression model

# Fit the model

# Compute MSE for split

# Determine 'optimal' kernel and value of C by cross-validation

# Plot original time series and prediction from January 2015 onwards
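# A hedged sketch filling in the commented steps above, assuming 'hires' is
# the single-column frame built earlier; the rolling windows are shifted by
# one day so only past values enter each feature, and the SVR settings and
# parameter grid are illustrative assumptions, not the notebook's actual
# choices.
import numpy as np
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

hires['Hires'] = np.log(hires['Hires'])               # log transformation
for lag in range(1, 8):                               # lagged series, lag 1..7
    hires['Lag%d' % lag] = hires['Hires'].shift(lag)
hires['Roll7'] = hires['Hires'].rolling(7).mean().shift(1)
hires['Roll30'] = hires['Hires'].rolling(30).mean().shift(1)
hires = hires.dropna()

# rebuild the split after dropna so test_fold matches the row count
split = PredefinedSplit(test_fold=(hires.index.year == 2016) - 1)

X = hires.drop(columns='Hires').values
y = hires['Hires'].values

pipe = make_pipeline(StandardScaler(), SVR())
grid = GridSearchCV(pipe,
                    {'svr__kernel': ['rbf', 'linear'],
                     'svr__C': [0.1, 1.0, 10.0]},
                    cv=split, scoring='neg_mean_squared_error')
grid.fit(X, y)
print(grid.best_params_, -grid.best_score_)           # MSE on the 2016 fold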