def pred_pH(train, val, test, all_vars, loop):
    """Fit the pH base models and hand their predictions to ``write_preds``.

    A random forest is fit on the selected variables, and a positive Lasso
    and a single-alpha ridge are fit on all variables, always using
    ``train``. Each model's predictions are stored as new columns on
    ``val``, ``test`` and ``train`` (the frames are modified in place).

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames indexable by the labels in ``all_vars``; ``train`` must also
        have a ``'pH'`` column.
    all_vars : sequence
        Column labels of the candidate predictors.
    loop : object
        Appended (via ``str``) to ``'pH_prds'`` to name the combined output.
    """
    data = (val, test, train)
    # variable selection: keep a variable when either the lasso-based
    # selector or the univariate F-test selector flags it
    # (assumes lass_varselect returns one flag per entry of all_vars,
    # in the same order -- TODO confirm)
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = [var for x, var in enumerate(all_vars)
              if pH_lassoed_vars[x] | pvals[x]]
    # random forest on the selected variables
    # (.loc replaces the removed DataFrame.ix indexer)
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    # NOTE(review): 'pH_for_prds' is listed twice -- presumably to give the
    # forest double weight inside write_preds; confirm against that helper.
    models = ['pH_rdg_prds', 'pH_las_prds',
              'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
Ejemplo n.º 2
0
def test_ridgecv_store_cv_values():
    """Check ``cv_values_`` shapes and the cv / store_cv_values conflict."""
    n_samples, n_features = 8, 5
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    rng = np.random.RandomState(42)
    x = rng.randn(n_samples, n_features)

    ridge = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)

    # one-dimensional target
    y = rng.randn(n_samples)
    ridge.fit(x, y)
    assert ridge.cv_values_.shape == (n_samples, n_alphas)

    # two-dimensional target
    n_targets = 3
    y = rng.randn(n_samples, n_targets)
    ridge.fit(x, y)
    assert ridge.cv_values_.shape == (n_samples, n_targets, n_alphas)

    # an explicit cv together with store_cv_values must be rejected at fit
    ridge = RidgeCV(cv=3, store_cv_values=True)
    assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
                        ridge.fit, x, y)
Ejemplo n.º 3
0
def test_ridge_gcv_vs_ridge_loo_cv(
        gcv_mode, X_constructor, X_shape, y_shape,
        fit_intercept, normalize, noise):
    """Compare RidgeCV's GCV path with an explicit ``cv=n_samples`` fit.

    Both estimators share the same alpha grid; the chosen alpha, the
    coefficients and the intercept are asserted to agree.
    """
    n_samples, n_features = X_shape
    if len(y_shape) == 2:
        n_targets = y_shape[-1]
    else:
        n_targets = 1
    X, y = _make_sparse_offset_regression(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise, n_informative=5
    )
    y = y.reshape(y_shape)

    # settings common to both estimators
    shared = dict(alphas=[1e-3, .1, 1., 10., 1e3],
                  fit_intercept=fit_intercept,
                  normalize=normalize)
    reference = RidgeCV(cv=n_samples, scoring='neg_mean_squared_error',
                        **shared)
    candidate = RidgeCV(gcv_mode=gcv_mode, **shared)

    reference.fit(X, y)
    candidate.fit(X_constructor(X), y)

    assert candidate.alpha_ == pytest.approx(reference.alpha_)
    assert_allclose(candidate.coef_, reference.coef_, rtol=1e-3)
    assert_allclose(candidate.intercept_, reference.intercept_, rtol=1e-3)
Ejemplo n.º 4
0
def test_check_gcv_mode_error(mode):
    """An unknown ``gcv_mode`` must raise from both fit and the checker."""
    expected_message = "Unknown value for 'gcv_mode'"
    X, y = make_regression(n_samples=5, n_features=2)
    estimator = RidgeCV(gcv_mode=mode)
    with pytest.raises(ValueError, match=expected_message):
        estimator.fit(X, y)
    with pytest.raises(ValueError, match=expected_message):
        _check_gcv_mode(X, mode)
Ejemplo n.º 5
0
def _test_ridge_cv_normalize(filter_):
    """RidgeCV(normalize=True) picks the same alpha as a Ridge grid search."""
    n_folds = 3
    ridge_cv = RidgeCV(normalize=True, cv=n_folds)
    ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)

    # search the very same alpha grid with a plain normalized Ridge
    grid = {'alpha': ridge_cv.alphas}
    search = GridSearchCV(Ridge(normalize=True), cv=n_folds, param_grid=grid)
    search.fit(filter_(10. * X_diabetes), y_diabetes)
    assert_equal(search.best_estimator_.alpha, ridge_cv.alpha_)
Ejemplo n.º 6
0
def test_ridgecv_int_alphas():
    """Fitting RidgeCV with integer alphas must not raise."""
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                         [1.0, 1.0], [1.0, 0.0]])
    targets = [1, 1, 1, -1, -1]

    integer_alphas = (1, 10, 100)
    estimator = RidgeCV(alphas=integer_alphas)
    estimator.fit(features, targets)
def pred_sand(train, val, test, all_vars, loop):
    """Fit the Sand base models and hand their predictions to ``write_preds``.

    A random forest is fit on the selected variables, and an SVR and a
    single-alpha ridge are fit on all variables, always using ``train``.
    Each model's predictions are stored as new columns on ``val``, ``test``
    and ``train`` (the frames are modified in place).

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames indexable by the labels in ``all_vars``; ``train`` must also
        have a ``'Sand'`` column.
    all_vars : sequence
        Column labels of the candidate predictors.
    loop : object
        Appended (via ``str``) to ``'sand_prds'`` to name the combined output.
    """
    data = (val, test, train)
    # variable selection: keep a variable when either the lasso-based
    # selector or the univariate F-test selector flags it
    # (assumes lass_varselect returns one flag per entry of all_vars,
    # in the same order -- TODO confirm)
    sand_lassoed_vars = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    pvals = univ_selector.get_support()
    chosen = [var for x, var in enumerate(all_vars)
              if sand_lassoed_vars[x] | pvals[x]]

    # random forest on the selected variables
    # (.loc replaces the removed DataFrame.ix indexer)
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.loc[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.loc[:, chosen])

    # SVM on all variables
    svr = svm.SVR(C=23000)
    svr.fit(train.loc[:, all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.loc[:, all_vars])

    # ridge
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])
    # combination
    # NOTE(review): 'sand_svr_prds' is listed twice -- presumably to give the
    # SVR double weight inside write_preds; confirm against that helper.
    models = ['sand_rdg_prds', 'sand_svr_prds',
              'sand_for_prds', 'sand_svr_prds']
    name = 'sand_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_SOC(train, val, test, all_vars, loop):
    """Fit the SOC base models and hand their predictions to ``write_preds``.

    Two variable subsets are built (lasso flags OR-ed with a k=4500 and a
    k=200 univariate selection). A random forest and an SVR use the larger
    subset, a gradient-boosting model uses the smaller one, and a
    single-alpha ridge uses all variables. Predictions are stored as new
    columns on ``val``, ``test`` and ``train`` (modified in place).

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames indexable by the labels in ``all_vars``; ``train`` must also
        have a ``'SOC'`` column.
    all_vars : sequence
        Column labels of the candidate predictors.
    loop : object
        Appended (via ``str``) to ``'SOC_prds'`` to name the combined output.
    """
    data = (val, test, train)
    # variable selection
    # (assumes lass_varselect returns one flag per entry of all_vars,
    # in the same order -- TODO confirm)
    SOC_lassoed_vars = lass_varselect(train, all_vars, 'SOC', .000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=4500)
    univ_selector.fit(train[all_vars], train['SOC'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['SOC'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen = [var for x, var in enumerate(all_vars)
              if SOC_lassoed_vars[x] | pvals[x]]
    chosen2 = [var for x, var in enumerate(all_vars)
               if SOC_lassoed_vars[x] | pvals2[x]]
    # random forest on the larger subset
    # (.loc replaces the removed DataFrame.ix indexer)
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset.loc[:, chosen])
    # gradient boosting on the smaller subset
    gbr = GradientBoostingRegressor(n_estimators=900,
                                    learning_rate=.0785, max_depth=1,
                                    random_state=42, verbose=0,
                                    min_samples_leaf=4, subsample=.4)
    gbr.fit(train[chosen2], train['SOC'])
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset.loc[:, chosen2])

    # ridge
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], train['SOC'])
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])
    # SVR
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train.loc[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset.loc[:, chosen])
    # combination
    # NOTE(review): 'SOC_svr_prds' is listed twice -- presumably to give the
    # SVR double weight inside write_preds; confirm against that helper.
    models = ['SOC_rdg_prds', 'SOC_svr_prds',
              'SOC_gbr_prds', 'SOC_for_prds', 'SOC_svr_prds']
    name = 'SOC_prds' + str(loop)
    write_preds(models, name, train, val, test, 'SOC')
Ejemplo n.º 9
0
def _test_ridge_loo(filter_):
    """Check that ``_RidgeGCV`` and ``RidgeCV`` agree on the best alpha.

    ``filter_`` converts the diabetes design matrix (dense or sparse) before
    every fit/predict call. Returns a one-element list holding the alpha
    selected by ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]
    is_dense = filter_ == DENSE_FILTER

    def design():
        # a fresh conversion for every call, exactly as callers expect
        return filter_(X_diabetes)

    gcv = _RidgeGCV(fit_intercept=is_dense)

    # reference alpha
    gcv.fit(design(), y_diabetes)
    best_alpha = gcv.alpha_
    ret = [best_alpha]

    # same best alpha with a custom loss function
    quiet = ignore_warnings
    loss_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    by_loss = RidgeCV(fit_intercept=False, scoring=loss_scorer)
    quiet(by_loss.fit)(design(), y_diabetes)
    assert by_loss.alpha_ == pytest.approx(best_alpha)

    # same best alpha with a custom score function
    def negated_mse(x, y):
        return -mean_squared_error(x, y)

    score_scorer = make_scorer(negated_mse)
    by_score = RidgeCV(fit_intercept=False, scoring=score_scorer)
    quiet(by_score.fit)(design(), y_diabetes)
    assert by_score.alpha_ == pytest.approx(best_alpha)

    # same best alpha with a ready-made scorer
    named_scorer = get_scorer('neg_mean_squared_error')
    by_name = RidgeCV(fit_intercept=False, scoring=named_scorer)
    by_name.fit(design(), y_diabetes)
    assert by_name.alpha_ == pytest.approx(best_alpha)

    # same best alpha with unit sample weights (dense input only)
    if is_dense:
        gcv.fit(design(), y_diabetes,
                sample_weight=np.ones(n_samples))
        assert gcv.alpha_ == pytest.approx(best_alpha)

    # two identical responses must give two identical prediction columns
    stacked_targets = np.vstack((y_diabetes, y_diabetes)).T

    gcv.fit(design(), stacked_targets)
    multi_pred = gcv.predict(design())
    gcv.fit(design(), y_diabetes)
    single_pred = gcv.predict(design())

    assert_allclose(np.vstack((single_pred, single_pred)).T,
                    multi_pred, rtol=1e-5)

    return ret
def bag_of_words_ridge(variable):
    """Add ridge predictions built from a TF-IDF bag of words of ``variable``.

    Reads the module-level ``train_and_validation`` and ``test`` frames and
    writes a new column ``"b_of_wds_prds_" + variable`` on both of them.

    Parameters
    ----------
    variable : str
        Name of the text column to vectorize.
    """
    # parse the posting dates once instead of once per mask
    posted = pd.to_datetime(train_and_validation.date_posted)
    vocab_rows = posted > pd.to_datetime('2013-11-1')
    fit_rows = posted > pd.to_datetime('2013-11-8')

    # use a vectorizer to count word usage instances and create sparse matrix
    vectorizer = TfidfVectorizer(min_df=.1, max_df=.9)
    # the vocabulary / idf weights are learned from the recent rows only;
    # fit() returns the vectorizer itself, so its result is not kept
    vectorizer.fit(train_and_validation[variable][vocab_rows])
    bag_of_words_X = vectorizer.transform(train_and_validation[variable])
    test_bag_of_words = vectorizer.transform(test[variable])

    ridge = RidgeCV(array([18]), store_cv_values=True, normalize=True)
    # using data range to guarantee recency and also run time
    ridge.fit(bag_of_words_X[fit_rows],
              train_and_validation.is_exciting[fit_rows])
    var_nm = "b_of_wds_prds_" + variable
    # put predictions into samples for use later as base classifiers in ada boost
    train_and_validation[var_nm] = ridge.predict(bag_of_words_X)
    test[var_nm] = ridge.predict(test_bag_of_words)
Ejemplo n.º 11
0
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    """Compare weighted GCV ridge with grouped K-fold on repeated rows.

    Integer sample weights are emulated by repeating row ``i`` of ``X``
    ``sample_weight[i]`` times. A ``GroupKFold`` keyed on the original row
    index is then used for the reference fit. The selected alpha, the
    per-row errors, the coefficients and the intercept of both approaches
    are asserted to agree.
    """
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    # shift so the smallest weight is 1, then truncate to integers so the
    # weights can be used as repeat counts
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    # indices[j] is the original row that tiled row j was copied from
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    # one group per original row
    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    # the previous splits iterator was handed to kfold above, so a fresh one
    # is created for cross_val_predict
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    # sum the squared errors of all copies of each original row
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0) for
        i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    # pick the cv_values_ slice that belongs to the alpha chosen by kfold
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
Ejemplo n.º 12
0
def _test_ridge_cv(filter_):
    """RidgeCV yields a 1-D coef_ and float64 intercept, with and without cv."""
    ridge_cv = RidgeCV()

    def fit_predict_and_check():
        # fit, make sure predict runs, then check the fitted attributes
        ridge_cv.fit(filter_(X_diabetes), y_diabetes)
        ridge_cv.predict(filter_(X_diabetes))

        assert_equal(len(ridge_cv.coef_.shape), 1)
        assert_equal(type(ridge_cv.intercept_), np.float64)

    # default cross-validation
    fit_predict_and_check()

    # explicit 5-fold cross-validation
    folds = KFold(5)
    ridge_cv.set_params(cv=folds)
    fit_predict_and_check()
def pred_Ca(train, val, test, all_vars, loop):
    """Fit the Ca base models and hand their predictions to ``write_preds``.

    Two variable subsets are built (lasso flags OR-ed with a k=5000 and a
    k=200 univariate selection). A random forest and an SVR use the larger
    subset, a gradient-boosting model uses the smaller one, and a
    single-alpha ridge uses all variables. Predictions are stored as new
    columns on ``val``, ``test`` and ``train`` (modified in place).

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames indexable by the labels in ``all_vars``; ``train`` must also
        have a ``'Ca'`` column.
    all_vars : sequence
        Column labels of the candidate predictors.
    loop : object
        Appended (via ``str``) to ``'Ca_prds'`` to name the combined output.
    """
    data = (val, test, train)
    # variable selection
    # (assumes lass_varselect returns one flag per entry of all_vars,
    # in the same order -- TODO confirm)
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen = [var for x, var in enumerate(all_vars)
              if Ca_lassoed_vars[x] | pvals[x]]
    chosen2 = [var for x, var in enumerate(all_vars)
               if Ca_lassoed_vars[x] | pvals2[x]]
    # gradient boosting on the smaller subset
    # (.loc replaces the removed DataFrame.ix indexer)
    gbr = GradientBoostingRegressor(n_estimators=1000,
                                    learning_rate=.1695, max_depth=1,
                                    random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
        dset['Ca_gbr_prds'] = gbr.predict(dset.loc[:, chosen2])
    # random forest on the larger subset
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])

    # ridge
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # SVR model
    svr = svm.SVR(C=9500)
    svr.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset.loc[:, chosen])

    # combination
    # NOTE(review): 'Ca_svr_prds' is listed twice -- presumably to give the
    # SVR double weight inside write_preds; confirm against that helper.
    models = ['Ca_rdg_prds', 'Ca_gbr_prds',
              'Ca_for_prds', 'Ca_svr_prds', 'Ca_svr_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
def pred_P(train, val, test, all_vars, loop):
    """Fit the P base models and hand their predictions to ``write_preds``.

    An SVR and a single-alpha ridge are fit on all variables. A
    gradient-boosting model is fit on the selected variables plus the four
    columns ``'sand_prds'``, ``'pH_prds'``, ``'SOC_prds'`` and ``'Ca_prds'``
    (each suffixed with ``str(loop)``). Predictions are stored as new
    columns on ``val``, ``test`` and ``train`` (modified in place).

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames indexable by the labels in ``all_vars``; ``train`` must also
        have a ``'P'`` column.
    all_vars : sequence
        Column labels of the candidate predictors.
    loop : object
        Appended (via ``str``) to the prediction column names.
    """
    data = (val, test, train)
    # variable selection: keep a variable when either the lasso-based
    # selector or the univariate F-test selector flags it
    # (assumes lass_varselect returns one flag per entry of all_vars,
    # in the same order -- TODO confirm)
    P_lassoed_vars = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1600)
    univ_selector.fit(train[all_vars], train['P'])
    pvals = univ_selector.get_support()
    chosen = [var for x, var in enumerate(all_vars)
              if P_lassoed_vars[x] | pvals[x]]
    # also feed the other targets' combined predictions to the booster
    # (presumably these columns were written by the other pred_* functions
    # for the same loop -- they must already exist on all three frames)
    suffix = str(loop)
    chosen.append('sand_prds' + suffix)
    chosen.append('pH_prds' + suffix)
    chosen.append('SOC_prds' + suffix)
    chosen.append('Ca_prds' + suffix)
    # SVM on all variables
    # (.loc replaces the removed DataFrame.ix indexer)
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train.loc[:, all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset.loc[:, all_vars])

    # gradient boosting on the selected variables plus stacked predictions
    gbr = GradientBoostingRegressor(n_estimators=60,
                                    learning_rate=0.1, max_depth=5,
                                    random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train.loc[:, chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset.loc[:, chosen])
    # ridge
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])
    # combination
    models = ['P_rdg_prds',
              'P_svr_prds', 'P_gbr_prds']
    name = 'P_prds' + suffix
    write_preds(models, name, train, val, test, 'P')
Ejemplo n.º 15
0
def test_ridgecv_store_cv_values():
    """``cv_values_`` has one entry per sample (and target) and alpha."""
    n_samples, n_features = 8, 5
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    rng = np.random.RandomState(42)
    x = rng.randn(n_samples, n_features)

    estimator = RidgeCV(alphas=alphas, store_cv_values=True)

    # one-dimensional target
    y = rng.randn(n_samples)
    estimator.fit(x, y)
    assert estimator.cv_values_.shape == (n_samples, n_alphas)

    # two-dimensional target
    n_targets = 3
    y = rng.randn(n_samples, n_targets)
    estimator.fit(x, y)
    assert estimator.cv_values_.shape == (n_samples, n_targets, n_alphas)
Ejemplo n.º 16
0
def test_ridgecv_store_cv_values():
    """Test the shape of RidgeCV's stored ``cv_values_`` attribute."""
    n_samples, n_features = 8, 5
    n_responses = 3
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    rng = np.random.RandomState(42)
    x = rng.randn(n_samples, n_features)

    r = RidgeCV(alphas=alphas, store_cv_values=True)

    # first a 1-D target (no extra axis), then a 2-D target with
    # n_responses columns; the stored values gain the same extra axis
    for extra_axes in ((), (n_responses,)):
        y = rng.randn(n_samples, *extra_axes)
        r.fit(x, y)
        expected_shape = (n_samples,) + extra_axes + (n_alphas,)
        assert_equal(r.cv_values_.shape, expected_shape)
Ejemplo n.º 17
0
def test_ridgecv_sample_weight():
    """RidgeCV with sample weights matches a GridSearchCV over Ridge."""
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    # There are different algorithms for n_samples > n_features
    # and the opposite, so both shapes are exercised.
    shapes = ((6, 5), (5, 10))
    for n_samples, n_features in shapes:
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        folds = KFold(5)
        direct = RidgeCV(alphas=alphas, cv=folds)
        direct.fit(X, y, sample_weight=sample_weight)

        # the same search spelled out with GridSearchCV
        searched = GridSearchCV(Ridge(), {'alpha': alphas}, cv=folds)
        searched.fit(X, y, sample_weight=sample_weight)
        best = searched.best_estimator_

        assert_equal(direct.alpha_, best.alpha)
        assert_array_almost_equal(direct.coef_, best.coef_)
def pred_Ca(train, val, test, all_vars, loop):
    """Fit the Ca base models and hand their predictions to ``write_preds``.

    A random forest is fit on the selected variables, and a positive Lasso
    and a single-alpha ridge are fit on all variables, always using
    ``train``. Each model's predictions are stored as new columns on
    ``val``, ``test`` and ``train`` (the frames are modified in place).

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames indexable by the labels in ``all_vars``; ``train`` must also
        have a ``'Ca'`` column.
    all_vars : sequence
        Column labels of the candidate predictors.
    loop : object
        Appended (via ``str``) to ``'Ca_prds'`` to name the combined output.
    """
    data = (val, test, train)
    # variable selection: keep a variable when either the lasso-based
    # selector or the univariate F-test selector flags it
    # (assumes lass_varselect returns one flag per entry of all_vars,
    # in the same order -- TODO confirm)
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    # every variable is considered, including the first one: the previous
    # loop started at index 1 and silently dropped all_vars[0]
    chosen = [var for x, var in enumerate(all_vars)
              if Ca_lassoed_vars[x] | pvals[x]]
    # random forest on the selected variables
    # (.loc replaces the removed DataFrame.ix indexer)
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])

    # lasso
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # combination
    # NOTE(review): 'Ca_for_prds' is listed twice -- presumably to give the
    # forest double weight inside write_preds; confirm against that helper.
    models = ['Ca_las_prds', 'Ca_rdg_prds',
              'Ca_for_prds', 'Ca_for_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
   
   
# we run an optimizer to find the penalty that minimizes rmse of ridge
init_guess = array([35])
# init_guess initializes the optimization with a guess of the optimal penalty

t0 = time.time()
# NOTE(review): newer SciPy releases name the Nelder-Mead tolerance option
# 'xatol' instead of 'xtol' -- confirm against the installed version.
optimizer = minimize(pc_ridge, init_guess, method='nelder-mead',
                     options={'xtol': 1e-2, 'disp': True})
# print() with a single argument behaves the same on Python 2 and Python 3
print("It took {time} minutes to optimize".format(time=(time.time()-t0)/60))

# run ridge with optimal penalization

t0 = time.time()
ridge = RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
# optimizer.x is the ridge penalty that minimized rmse
ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
print("It took {time} minutes to run the optimized ridge".format(time=(time.time()-t0)/60))

# create an OLS regression for word count
ols = sm.regression.linear_model.OLS(train.is_exciting, train.word_count)
results = ols.fit()

# add ols and ridge predictions to train and test data

train['ridge_predictions'] = ridge.predict(train_tokens)
train['length_predictions'] = train.word_count*results.params[0]
test['ridge_predictions'] = ridge.predict(test_tokens)
test['length_predictions'] = test.word_count*results.params[0]

# two-column frame of base-model predictions used to fit the ensemble
data_for_ensemble = pd.DataFrame({"length_predictions":train.length_predictions,"ridge_predictions":train.ridge_predictions})
Ejemplo n.º 20
0
def _test_ridge_loo(filter_):
    """Check ``_RidgeGCV`` leave-one-out results on the diabetes data.

    The efficient leave-one-out errors and values are compared with a
    brute-force loop and with the SVD variant, then the best alpha is
    compared across several scorer specifications, unit sample weights and
    a duplicated (two-column) target.

    ``filter_`` is applied to ``X_diabetes`` before each fit/predict call in
    the second half of the test. Returns a one-element list holding the
    alpha selected by ``_RidgeGCV``.
    """
    # test that can work with both dense or sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        # boolean mask selecting every sample except i
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value) ** 2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # check that we get same best alpha with a scorer
    # NOTE(review): later scikit-learn releases name this scorer
    # 'neg_mean_squared_error' -- confirm against the targeted version.
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    # fit on the duplicated target first, then refit on the single target;
    # both prediction columns must match the single-target prediction
    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return ret
def ensemble_ridge(penalty):
    """Return the in-sample RMSE of a ridge fit on ``data_for_ensemble``.

    ``penalty`` is passed straight to ``RidgeCV`` as its ``alphas``. The
    model is fit on, and evaluated against, ``train.is_exciting``.
    """
    target = train.is_exciting
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(data_for_ensemble, target)
    residuals = train.is_exciting - model.predict(data_for_ensemble)
    return np.sqrt(np.mean(residuals**2))
                        dum_dict['dummy_dummies'], dum_dict['var8_dummies'],
                        dum_dict['var1_dummies'], dum_dict['var2_dummies'],
                        dum_dict['var3_dummies'], dum_dict['var4_dummies'],
                        dum_dict['var5_dummies'], dum_dict['var6_dummies'],
                        dum_dict['var9_dummies'])
 # flatten the groups of SVC variable names into the single svc_feats list
 for var_type in var_types_for_svc:
     for cols in var_type:
         svc_feats.append(cols)

 ##################### Run classifiers ############################
 # NOTE(review): weights is not used by any fit in the lines shown here
 weights = np.array([fire_train_TRAIN_smp['var11']]).squeeze()

 # Ridge
 ## using weights crashes this, don't know why
 ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True)
 ridge.fit(fire_train_TRAIN_smp[ridge_feats], fire_train_TRAIN_smp.target)
 write_preds_allsamps(ridge, "fin_rdg_preds", ridge_feats)

 # claim size conditional on claim ridge: fit only on rows with target > 0
 ones_only = fire_train_TRAIN_smp['target']>0
 # .loc replaces the removed DataFrame.ix indexer (boolean row mask)
 size_train = fire_train_TRAIN_smp.loc[ones_only, :]
 size_ridge = RidgeCV(np.array([1.5]), store_cv_values=True, normalize=True)
 size_ridge.fit(size_train[ridge_feats], size_train.target)
 write_preds_allsamps(size_ridge, "size_rdg_preds", ridge_feats)

 # Lasso
 lass = Lasso(alpha=.0000001, positive=True, max_iter=100000 ,
              tol=.001, normalize=True)
 lass.fit(np.array(fire_train_TRAIN_smp[ridge_feats]),
          np.array(fire_train_TRAIN_smp.target))
 write_preds_allsamps(lass, "fin_lass_preds", ridge_feats)
Ejemplo n.º 23
0
# run logistic
logit = LogisticRegression()
logit.fit(train_features, train_outcome)
logit_feats = len(train_features.columns)

# validation AUC of the logistic model (probability of the positive class)
validation['predictions'] = logit.predict_proba(validation_for_p)[:, 1]
fpr, tpr, thresholds = roc_curve(validation.is_exciting,
                                 validation.predictions)
auc_score = auc(fpr, tpr)
auc_score  # bare expression: only displays in an interactive session

# run ridge

full_ridge = RidgeCV(np.array([7]), store_cv_values=True, normalize=True)
full_ridge.fit(train_features, train_outcome)
# score the validation set with the ridge model itself; this line used to
# repeat the logistic predictions, so the AUC below was the logistic AUC
validation['predictions'] = full_ridge.predict(validation_for_p)
fpr, tpr, thresholds = roc_curve(validation.is_exciting,
                                 validation.predictions)
auc_score = auc(fpr, tpr)
auc_score  # bare expression: only displays in an interactive session

# add predictions to train features
# NOTE(review): assigning a DataFrame to a column aligns on the index --
# confirm these frames use a default 0..n-1 index.
ens_train_features['Adaboost'] = pd.DataFrame(
    clf.predict_proba(train_features.iloc[:, 0:30])[:, 1])
validation_for_p['Adaboost'] = pd.DataFrame(
    clf.predict_proba(validation_for_p.iloc[:, 0:30])[:, 1])
test_X['Adaboost'] = pd.DataFrame(
    clf.predict_proba(test_X.iloc[:, 0:30])[:, 1])

ens_train_features['Forest'] = rndm_forest_clf.predict_proba(
Ejemplo n.º 24
0
def _test_ridge_loo(filter_):
    """Check ``_RidgeGCV`` leave-one-out results on the diabetes data.

    The efficient leave-one-out errors and values are compared with a
    brute-force loop and with the SVD variant, then the best alpha is
    compared across several scorer specifications, unit sample weights and
    a duplicated (two-column) target.

    ``filter_`` is applied to ``X_diabetes`` before each fit/predict call in
    the second half of the test. Returns a one-element list holding the
    alpha selected by ``_RidgeGCV``.
    """
    # test that can work with both dense or sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        # boolean mask selecting every sample except i
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value)**2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # check that we get same best alpha with a scorer
    # NOTE(review): later scikit-learn releases name this scorer
    # 'neg_mean_squared_error' -- confirm against the targeted version.
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes),
                  y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    # fit on the duplicated target first, then refit on the single target;
    # both prediction columns must match the single-target prediction
    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=5)

    return ret
def pc_ridge(penalty):
    """Return the RMSE of a ridge fit with the given complexity penalty.

    The model is fit on the first ``documents`` rows of ``train_tokens``
    and scored against ``train.is_exciting`` over all of ``train_tokens``.
    """
    fit_rows = slice(0, documents)
    model = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    model.fit(train_tokens[fit_rows], train.is_exciting[fit_rows])
    residuals = train.is_exciting - model.predict(train_tokens)
    return np.sqrt(np.mean(residuals**2))
# run logistic
logit = LogisticRegression()
logit.fit(train_features, train_outcome)
logit_feats = len(train_features.columns)

# validation AUC of the logistic model (probability of the positive class)
validation['predictions'] = logit.predict_proba(validation_for_p)[:, 1]
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr, tpr)
auc_score  # bare expression: only displays in an interactive session

# run ridge

full_ridge = RidgeCV(np.array([7]), store_cv_values=True, normalize=True)
full_ridge.fit(train_features, train_outcome)
# score the validation set with the ridge model itself; this line used to
# repeat the logistic predictions, so the AUC below was the logistic AUC
validation['predictions'] = full_ridge.predict(validation_for_p)
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr, tpr)
auc_score  # bare expression: only displays in an interactive session


# add predictions to train features
# NOTE(review): assigning a DataFrame to a column aligns on the index --
# confirm these frames use a default 0..n-1 index.
ens_train_features['Adaboost'] = pd.DataFrame(clf.predict_proba(train_features.iloc[:, 0:30])[:, 1])
validation_for_p['Adaboost'] = pd.DataFrame(clf.predict_proba(validation_for_p.iloc[:, 0:30])[:, 1])
test_X['Adaboost'] = pd.DataFrame(clf.predict_proba(test_X.iloc[:, 0:30])[:, 1])

ens_train_features['Forest'] = rndm_forest_clf.predict_proba(train_features.iloc[:, 0:forest_features])[:, 1]
validation_for_p['Forest'] = rndm_forest_clf.predict_proba(validation_for_p.iloc[:, 0:forest_features])[:, 1]
test_X['Forest'] = rndm_forest_clf.predict_proba(test_X.iloc[:, 0:forest_features])[:, 1]