Example 1
def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)
    ps_mask = cval.PredefinedSplit([1, 1, 2, 2], indices=False)
    ps_ind = cval.PredefinedSplit([1, 1, 2, 2], indices=True)
    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind),
                            (ps_mask, ps_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind)
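These generators come from the long-deprecated sklearn.cross_validation module, where indices=True/False toggled between integer indices and boolean masks. In sklearn.model_selection (scikit-learn 0.18+) the mask option is gone and splitters are consumed through a split() method rather than iterated directly. A minimal sketch of the modern PredefinedSplit equivalent, on made-up data:

import numpy as np
from sklearn.model_selection import PredefinedSplit

# split() always yields integer index arrays; the boolean-mask mode no longer exists
ps = PredefinedSplit(test_fold=np.array([1, 1, 2, 2]))
for train_ind, test_ind in ps.split():
    print(train_ind, test_ind)
# [2 3] [0 1]
# [0 1] [2 3]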
Example 2
def train_gbr(X_input_train, Y_input_train, X_input_test, Y_input_test, name):
    """
        Function to train a Gradient Boosting regression model with the Scikit Learn API
        
        This functions trains and saves the network as a pickle file in ./silicat/saved/
         
        INPUTS:
        
        X_input_train: Numpy array 
            The training input values
            
        X_input_train: Numpy array 
            The training input values

        X_input_train: Numpy array 
            The training input values
        
        X_input_train: Numpy array 
            The training input values

        name: String
            The name of the file for saving the network

        OUTPUTS:
        
        A saved network in ./silicat/saved/ with filename = name.
        
    """
    # for the cross-validation we append the test data at the end of the
    # dataset, so we know exactly which rows were held out
    X_dataset = np.concatenate([X_input_train, X_input_test])
    Y_dataset = np.concatenate([Y_input_train, Y_input_test])

    # PredefinedSplit expects one fold label per sample: -1 marks samples that
    # always stay in the training set, 0 marks the single test fold
    test_fold = -np.ones(len(X_dataset), dtype=int)
    test_fold[len(X_input_train):] = 0

    # and we set up the cross-validation here
    ps = cross_validation.PredefinedSplit(test_fold=test_fold)

    # the grid search below was left commented out; we fit a model with fixed
    # parameters instead, and save it with joblib at the end
    #mg = GridSearchCV(ensemble.GradientBoostingRegressor(), cv=ps,param_grid={"max_depth": [4,5,6],"learning_rate": [0.01,0.1,1.0,10.0]},n_jobs = 1)
    # Fit regression model
    params = {
        'n_estimators': 1000,
        'max_depth': 2,
        'min_samples_split': 2,  # scikit-learn requires a value >= 2
        'learning_rate': 0.1,
        'loss': 'ls'  # renamed 'squared_error' in scikit-learn >= 1.0
    }
    mg = ensemble.GradientBoostingRegressor(**params)
    mg.fit(X_input_train, Y_input_train.ravel())
    joblib.dump(
        mg,
        os.path.dirname(os.path.abspath(__file__)) + '/saved/' + name + '.pkl')
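Note that ps is constructed above but never used: the grid search that would consume it is commented out and the model is fit with fixed parameters on the training data alone. A hedged sketch of how the split could be wired in, following the commented-out GridSearchCV line; the search has to be fit on the concatenated X_dataset / Y_dataset, since those are what the fold labels index into:

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in 0.18+

# hypothetical: enable the commented-out grid search with the predefined split
mg = GridSearchCV(ensemble.GradientBoostingRegressor(),
                  param_grid={"max_depth": [4, 5, 6],
                              "learning_rate": [0.01, 0.1, 1.0, 10.0]},
                  cv=ps, n_jobs=1)
mg.fit(X_dataset, Y_dataset.ravel())
# caveat: the default refit=True retrains best_estimator_ on the full
# dataset, including the held-out rows
best_model = mg.best_estimator_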
Example 3
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = cval.PredefinedSplit(folds)
    for train_ind, test_ind in ps:
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
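PredefinedSplit accepts arbitrary integer labels, not just fold numbers produced by KFold: entry i of test_fold names the fold whose test set contains sample i, and -1 keeps a sample in every training set, which is exactly what the silicat functions above rely on. A small sketch of the -1 behaviour (same modern API as the sketch under Example 1):

import numpy as np
from sklearn.model_selection import PredefinedSplit

# -1 entries never enter a test set; 0 and 1 define the two test folds
test_fold = np.array([0, 0, -1, -1, 1, 1])
for train_ind, test_ind in PredefinedSplit(test_fold).split():
    print(train_ind, test_ind)
# [2 3 4 5] [0 1]
# [0 1 2 3] [4 5]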
Example 4
def train_svm(X_input_train, Y_input_train, X_input_test, Y_input_test, name):
    """
        Function to train a support vector machine model with the Scikit Learn API
        
        This functions trains and saves the network as a pickle file in ./silicat/saved/
         
        INPUTS:
        
        X_input_train: Numpy array 
            The training input values
            
        X_input_train: Numpy array 
            The training input values

        X_input_train: Numpy array 
            The training input values
        
        X_input_train: Numpy array 
            The training input values

        name: String
            The name of the file for saving the network

        OUTPUTS:
        
        A saved network in ./silicat/saved/ with filename = name.
        
    """
    # for the cross-validation we append the test data at the end of the
    # dataset, so we know exactly which rows were held out
    X_dataset = np.concatenate([X_input_train, X_input_test])
    Y_dataset = np.concatenate([Y_input_train, Y_input_test])

    # PredefinedSplit expects one fold label per sample: -1 marks samples that
    # always stay in the training set, 0 marks the single test fold
    test_fold = -np.ones(len(X_dataset), dtype=int)
    test_fold[len(X_input_train):] = 0

    # and we set up the cross-validation here
    ps = cross_validation.PredefinedSplit(test_fold=test_fold)

    # the grid search below was left commented out; we fit a model with fixed
    # parameters instead, and save it with joblib at the end
    #mg = GridSearchCV(SVR(kernel='rbf',epsilon=0.1), cv=10,param_grid={"C":[0.01,0.1,1.0,10.0,100.0], "epsilon":[0]},n_jobs = 1)
    mg = SVR(kernel='rbf', C=1.5, epsilon=0.01)
    mg.fit(X_input_train, Y_input_train.ravel())
    joblib.dump(
        mg,
        os.path.dirname(os.path.abspath(__file__)) + '/saved/' + name + '.pkl')
Example 5
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
Example 6
    # try to keep n_estimators at 500 or less for performance reasons; over 2000 may exceed 8 GB of RAM

    if do_optimize:
        # cv_gs = cv.ShuffleSplit(len(train_y), n_iter=4, test_size=0.35)

        # 4 folds without shuffling gives better cross validation scores
        # cv_gs = cv.KFold(len(train_y), n_folds=4, shuffle=False)

        # custom split: predict the first/last 5 days of each month from the remaining days
        split_arr = np.array(
            [0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1])
        train_split_f = lambda x: split_arr[x.index.day - 1]

        train_split = train_set_df.apply(train_split_f)["atemp"].values
        cv_gs = cv.PredefinedSplit(train_split)

        regist_grid_search = sklearn.grid_search.GridSearchCV(
            ens.BaggingRegressor(base_estimator=ens.AdaBoostRegressor(
                base_estimator=tree.DecisionTreeRegressor())),
            search_space_regist,
            cv=cv_gs,
            scoring="mean_squared_error",
            refit=False,
            n_jobs=2,
            pre_dispatch="2*n_jobs",
            verbose=3)

        casual_grid_search = sklearn.grid_search.GridSearchCV(
            ens.BaggingRegressor(base_estimator=ens.AdaBoostRegressor(
                base_estimator=tree.DecisionTreeRegressor())),
Example 7
            "max_depth": [10],
            "max_leaf_nodes": [None],
            "n_estimators": [500]
        }

        # cv_gs = cv.ShuffleSplit(len(train_y), n_iter=4, test_size=0.35)

        # 4 folds without shuffling gives better cross validation scores
        # cv_gs = cv.KFold(len(train_y), n_folds=4, shuffle=False)

        # custom split: predict the first/last 5 days of each month from the remaining days
        split_arr = np.array(
            [0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1])
        split_f = lambda x: split_arr[x.index.day - 1]
        test_split = train_set_df.apply(split_f)["atemp"].values
        cv_gs = cv.PredefinedSplit(test_split)
        # split_f = lambda x: (x.index.day  - 1)// 15

        grid_search_gs = sklearn.grid_search.GridSearchCV(
            ens.RandomForestRegressor(),
            search_space,
            cv=cv_gs,
            scoring="mean_squared_error",
            refit=False,
            n_jobs=1,
            pre_dispatch="2*n_jobs",
            verbose=3)

        print("running grid search")
        grid_search_gs.fit(train_X, train_y)
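Because the searches in these snippets pass refit=False, no final model is trained and best_estimator_ is unavailable; the winning parameters have to be read off and refit by hand. Also, the old "mean_squared_error" scorer negates the error so that greater is better. A hedged sketch of what could follow the fit call, reusing the names in scope above:

# refit=False leaves no best_estimator_: report the winner and refit manually
print(grid_search_gs.best_params_)
print(-grid_search_gs.best_score_)  # the scorer negates MSE; flip the sign back

best_model = ens.RandomForestRegressor(**grid_search_gs.best_params_)
best_model.fit(train_X, train_y)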
Example 8
# Convert 'Day' to 'datetime' and set as index
hires.Day = pd.to_datetime(hires.Day, unit='D')
hires.set_index('Day', inplace=True)

# Extract first column (daily hires) and convert to 'float'
hires = hires.iloc[:,0].astype('float').to_frame('Hires')

# Apply logarithmic transformation

# Create 7 new variables representing the lagged time series at lag = 1, ..., 7

# Create 2 new variables representing the smoothed time series
# (rolling averages at 7 and 30 days)

# Drop missing values

# Define cross-validation split by leaving out 2016 as test set
split = cv.PredefinedSplit(test_fold=(hires.index.year == 2016) - 1)

# Create a pipeline that scales the data and trains a support vector regression
# model

# Fit the model

# Compute MSE for split

# Determine ‘optimal’ kernel and value of C by cross-validation

# Plot original time series and prediction from January 2015 onwards
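A minimal sketch of how the empty steps above could be filled in; the feature names, the hyperparameter grid, and the choice to shift the rolling averages by one day are assumptions, not the original author's code:

import numpy as np
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in 0.18+
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Apply logarithmic transformation
hires['Hires'] = np.log(hires['Hires'])

# Create 7 new variables representing the lagged time series at lag = 1, ..., 7
for lag in range(1, 8):
    hires['Lag%d' % lag] = hires['Hires'].shift(lag)

# Create 2 new variables representing the smoothed time series
# (rolling averages at 7 and 30 days, shifted so only past values are used)
hires['Roll7'] = hires['Hires'].shift(1).rolling(7).mean()
hires['Roll30'] = hires['Hires'].shift(1).rolling(30).mean()

# Drop missing values introduced by the lags and rolling windows
hires = hires.dropna()
X = hires.drop('Hires', axis=1).values
y = hires['Hires'].values

# Define cross-validation split by leaving out 2016 as test set
split = cv.PredefinedSplit(test_fold=(hires.index.year == 2016) - 1)

# Pipeline that scales the data and trains a support vector regression model
model = make_pipeline(StandardScaler(), SVR())

# Determine the kernel and value of C by cross-validation on the split
grid = GridSearchCV(model,
                    param_grid={'svr__kernel': ['rbf', 'linear'],
                                'svr__C': [0.1, 1.0, 10.0]},
                    cv=split)
grid.fit(X, y)

# Compute MSE on the held-out 2016 fold
train_ind, test_ind = next(iter(split))
pred = grid.best_estimator_.fit(X[train_ind], y[train_ind]).predict(X[test_ind])
print(mean_squared_error(y[test_ind], pred))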