def bag_of_words_ridge(variable):
    # TF-IDF vectorizer turns the text column into a sparse bag-of-words matrix
    vectorizer = TfidfVectorizer(min_df=.1, max_df=.9)
    # vocabulary and idf normalization are fit on recent training rows only
    vectorizer.fit(train_and_validation[variable][pd.to_datetime(train_and_validation.date_posted) > pd.to_datetime('2013-11-1')])
    bag_of_words_X = vectorizer.transform(train_and_validation[variable])
    test_bag_of_words = vectorizer.transform(test[variable])
    ridge = RidgeCV(array([18]), store_cv_values=True, normalize=True)
    # restrict to a recent date range to guarantee recency and keep the run time down
    recent = pd.to_datetime(train_and_validation.date_posted) > pd.to_datetime('2013-11-8')
    ridge.fit(bag_of_words_X[recent], train_and_validation.is_exciting[recent])
    var_nm = "b_of_wds_prds_" + variable
    # store the predictions for later use as base-classifier inputs to AdaBoost
    train_and_validation[var_nm] = ridge.predict(bag_of_words_X)
    test[var_nm] = ridge.predict(test_bag_of_words)
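A minimal sketch of how this function might be invoked, assuming the module-level objects it relies on (the train_and_validation and test DataFrames and the imports below) are already defined; the 'essay' column name is only an illustration, not taken from the original script.

import pandas as pd
from numpy import array
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeCV

# train_and_validation and test are assumed to be DataFrames already loaded at
# module level, each with the text column, a date_posted column, and is_exciting.
bag_of_words_ridge('essay')  # adds a b_of_wds_prds_essay column to both DataFrames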
Example 2
def _test_ridge_cv(filter_):
    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert len(ridge_cv.coef_.shape) == 1
    assert type(ridge_cv.intercept_) == np.float64

    cv = KFold(5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert len(ridge_cv.coef_.shape) == 1
    assert type(ridge_cv.intercept_) == np.float64
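The test above relies on module-level fixtures that are not shown; a rough sketch of what they typically look like in scikit-learn's ridge tests (the filter helpers and fixture names here are assumptions):

import scipy.sparse as sp
from sklearn import datasets
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

# Shared fixtures: the diabetes regression data and a pair of "filters" that
# either leave the design matrix dense or convert it to a sparse format.
diabetes = datasets.load_diabetes()
X_diabetes, y_diabetes = diabetes.data, diabetes.target

DENSE_FILTER = lambda X: X
SPARSE_FILTER = lambda X: sp.csr_matrix(X)

_test_ridge_cv(DENSE_FILTER)
_test_ridge_cv(SPARSE_FILTER)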
Example 3
def _test_ridge_cv(filter_):
    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)

    cv = KFold(5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)
def pred_SOC(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    SOC_lassoed_vars = lass_varselect(train, all_vars, 'SOC', .000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=4500)
    univ_selector.fit(train[all_vars], train['SOC'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['SOC'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    chosen2 = []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x] | pvals2[x]:
            chosen2.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if SOC_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset.ix[:, chosen])
    gbr = GradientBoostingRegressor(n_estimators=900,
                                    learning_rate=.0785,
                                    max_depth=1,
                                    random_state=42,
                                    verbose=0,
                                    min_samples_leaf=4,
                                    subsample=.4)
    gbr.fit(train[chosen2], train['SOC'])
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset.ix[:, chosen2])
    # lasso
    #lass = Lasso(alpha=.00000025, positive=True)
    #lass.fit(train[all_vars], train['SOC'])
    #for dset in data:
    #    dset['SOC_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], train['SOC'])
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])
    # SVR
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train.ix[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset.ix[:, chosen])
    # combination
    models = [
        'SOC_rdg_prds', 'SOC_svr_prds', 'SOC_gbr_prds', 'SOC_for_prds',
        'SOC_svr_prds'
    ]
    name = 'SOC_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'SOC')
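These soil-property functions all depend on two helpers, lass_varselect and write_preds, which are not included in these snippets. A minimal sketch of what they plausibly do, inferred only from how they are called here; the actual implementations may differ:

from sklearn.linear_model import Lasso

def lass_varselect(train, all_vars, target, alpha):
    # Sketch: run a Lasso at the given penalty and return a boolean mask,
    # aligned with all_vars, marking the variables it kept (nonzero coefficients).
    lass = Lasso(alpha=alpha, positive=True)
    lass.fit(train[all_vars], train[target])
    return lass.coef_ != 0

def write_preds(models, name, train, val, test, target):
    # Sketch: combine the listed prediction columns (a simple average here)
    # into a single stacked-prediction column called `name` on each data set.
    for dset in (train, val, test):
        dset[name] = dset[models].mean(axis=1)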
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
Example 7
class RidgeCVImpl():
    def __init__(self,
                 alphas=[0.1, 1.0, 10.0],
                 fit_intercept=True,
                 normalize=False,
                 scoring=None,
                 cv=None,
                 gcv_mode=None,
                 store_cv_values=False):
        self._hyperparams = {
            'alphas': alphas,
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'scoring': scoring,
            'cv': cv,
            'gcv_mode': gcv_mode,
            'store_cv_values': store_cv_values
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
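A small usage sketch for the wrapper above. It assumes SKLModel is an alias for sklearn.linear_model.RidgeCV (implied by the constructor arguments) and an older scikit-learn release that still accepts the normalize keyword:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV as SKLModel  # assumed alias

X, y = load_diabetes(return_X_y=True)
model = RidgeCVImpl(alphas=[0.01, 0.1, 1.0, 10.0], store_cv_values=True)
model.fit(X, y)
preds = model.predict(X)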
def pred_sand(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    sand_lassoed_vars = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if sand_lassoed_vars[x]:
            lass_only.append(all_vars[x])

    # nearest neighbors
    #neigh = KNeighborsRegressor(n_neighbors=2)
    #neigh.fit(train.ix[:, chosen], train['Sand'])
    #for dset in data:
    #  dset['sand_ngh_prds'] = neigh.predict(dset.ix[:, chosen])

    # SVM
    #svr = svm.SVR()
    #svr.fit(train.ix[:, lass_only], train['Sand'])
    #for dset in data:
    #dset['sand_svr_prds'] = svr.predict(dset.ix[:, lass_only])
    # randomforest
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.ix[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.ix[:, chosen])

    # SVM
    svr = svm.SVR(C=23000)
    svr.fit(train.ix[:, all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.ix[:, all_vars])

    # lasso
    #lass = Lasso(alpha=.0000001, positive=True)
    #lass.fit(train[all_vars], train['Sand'])
    #for dset in data:
    #    dset['sand_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])
    # combination
    models = [
        'sand_rdg_prds', 'sand_svr_prds', 'sand_for_prds', 'sand_svr_prds'
    ]
    #print train.ix[0:20, models]
    name = 'sand_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    chosen2 = []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals2[x]:
            chosen2.append(all_vars[x])
    gbr = GradientBoostingRegressor(n_estimators=1000,
                                    learning_rate=.1695,
                                    max_depth=1,
                                    random_state=42,
                                    verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
        dset['Ca_gbr_prds'] = gbr.predict(dset.ix[:, chosen2])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.ix[:, chosen])

    # ridge
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # SVR model
    svr = svm.SVR(C=9500)
    svr.fit(train.ix[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset.ix[:, chosen])

    # combination
    models = [
        'Ca_rdg_prds', 'Ca_gbr_prds', 'Ca_for_prds', 'Ca_svr_prds',
        'Ca_svr_prds'
    ]
    name = 'Ca_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Ca')
def pred_P(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    P_lassoed_vars = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1600)
    univ_selector.fit(train[all_vars], train['P'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if P_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if P_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    chosen.append('sand_prds' + str(object=loop))
    chosen.append('pH_prds' + str(object=loop))
    chosen.append('SOC_prds' + str(object=loop))
    chosen.append('Ca_prds' + str(object=loop))
    # SVM
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train.ix[:, all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset.ix[:, all_vars])

    gbr = GradientBoostingRegressor(n_estimators=60,
                                    learning_rate=0.1,
                                    max_depth=5,
                                    random_state=42,
                                    verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train.ix[:, chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset.ix[:, chosen])
    # ridge
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])
    # combination
    models = ['P_rdg_prds', 'P_svr_prds',
              'P_gbr_prds']  #, 'P_las_prds' , 'P_gbr_prds'
    name = 'P_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'P')
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(1, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['Ca'])
    #print forst.feature_importances_
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.ix[:, chosen])

    # lasso
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # combination
    models = [
        'Ca_las_prds',
        'Ca_rdg_prds',
        'Ca_for_prds',
        'Ca_for_prds',
    ]
    name = 'Ca_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Ca')
# run ridge with optimal penalization

t0= time.time()
ridge= RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
# optimizer.x is the ridge penalty that minimized rmse
ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
print "It took {time} minutes to run the optimized ridge".format(time=(time.time()-t0)/60)

# create an OLS regression for word count
ols= sm.regression.linear_model.OLS(train.is_exciting, train.word_count)
results= ols.fit()

# add ols and ridge predictions to train and test data 

train['ridge_predictions']=ridge.predict(train_tokens) 
train['length_predictions'] = train.word_count*results.params[0]
test['ridge_predictions']=ridge.predict(test_tokens) 
test['length_predictions'] = test.word_count*results.params[0]

data_for_ensemble = pd.DataFrame({"length_predictions":train.length_predictions,"ridge_predictions":train.ridge_predictions})

# create a ridge regression that incorporates the bag of words and essay length
init_guess_ens = array([.0125])   
t0= time.time()
ensemble_optimizer = minimize(ensemble_ridge, init_guess_ens, method='nelder-mead', options= {'xtol':1e-2, 'disp':True})
print "It took {time} minutes to optimize".format(time=(time.time()-t0)/60)

ridge= RidgeCV(alphas=array([ensemble_optimizer.x]), store_cv_values=True, normalize=True)
ensemble = ridge.fit(data_for_ensemble, train.is_exciting)
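A sketch of how the fitted ensemble might then be applied to the test set, mirroring the training-side DataFrame built above; the ensemble_predictions column name is illustrative:

test_for_ensemble = pd.DataFrame({
    "length_predictions": test.length_predictions,
    "ridge_predictions": test.ridge_predictions
})
test['ensemble_predictions'] = ensemble.predict(test_for_ensemble)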
def ensemble_ridge(penalty):
    
    ridge= RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(data_for_ensemble, train.is_exciting)
    predictions = ridge.predict(data_for_ensemble)
    return np.sqrt(np.mean((train.is_exciting-predictions)**2))
Example 22
def pc_ridge(penalty):
    # this function takes a complexity penalty as input and returns the training RMSE
    ridge = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
    predictions = ridge.predict(train_tokens)
    return np.sqrt(np.mean((train.is_exciting - predictions)**2))
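A sketch of how pc_ridge could be handed to scipy.optimize.minimize to produce the optimizer object whose optimizer.x is used in the block above; it mirrors the nelder-mead call used for ensemble_ridge and assumes the same module-level train_tokens, documents, and train objects. The starting penalty is illustrative.

from numpy import array
from scipy.optimize import minimize

init_guess = array([1.0])  # hypothetical starting penalty
optimizer = minimize(pc_ridge, init_guess, method='nelder-mead',
                     options={'xtol': 1e-2, 'disp': True})
# optimizer.x then holds the penalty that minimized the training RMSE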
Example 24
# add predictions to train features
ens_train_features['Adaboost'] = pd.DataFrame(clf.predict_proba(train_features.iloc[:,0:30])[:,1])
validation_for_p['Adaboost'] = pd.DataFrame(clf.predict_proba(validation_for_p.iloc[:,0:30])[:,1])
test_X['Adaboost'] = pd.DataFrame(clf.predict_proba(test_X.iloc[:,0:30])[:,1])

ens_train_features['Forest'] = rndm_forest_clf.predict_proba(train_features.iloc[:,0:forest_features])[:,1]
validation_for_p['Forest'] = rndm_forest_clf.predict_proba(validation_for_p.iloc[:,0:forest_features])[:,1]
test_X['Forest'] = rndm_forest_clf.predict_proba(test_X.iloc[:,0:forest_features])[:,1]


ens_train_features['Logit'] = logit.predict_proba(train_features.iloc[:,0:logit_feats])[:,1]
validation_for_p['Logit'] = logit.predict_proba(validation_for_p.iloc[:,0:logit_feats])[:,1]
test_X['Logit'] = logit.predict_proba(test_X.iloc[:,0:logit_feats])[:,1]

ens_train_features['Ridge'] = full_ridge.predict(train_features.iloc[:,0:logit_feats])[:,1]
validation_for_p['Ridge'] = full_ridge.predict(validation_for_p.iloc[:,0:logit_feats])[:,1]
test_X['Ridge'] = full_ridge.predict(test_X.iloc[:,0:logit_feats])[:,1]

# final tree

ens_forest_clf = RandomForestClassifier(n_estimators=600, min_samples_split=6, min_samples_leaf=2)
ens_forest_clf.fit(ens_train_features, train_outcome)


validation['predictions']=ens_forest_clf.predict_proba(validation_for_p.iloc[:,167:171])[:,1]
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr,tpr)
auc_score 

# submission