# consolidated imports for the code excerpts below (reconstructed; each snippet
# was taken from a larger script that imported these at the top). Note that the
# snippets use the legacy RidgeCV(normalize=...) flag, removed in scikit-learn 1.2.
import time

import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.optimize import minimize
from sklearn import svm
from sklearn.ensemble import (GradientBoostingRegressor, RandomForestClassifier,
                              RandomForestRegressor)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Lasso, RidgeCV
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import KFold


def bag_of_words_ridge(variable):
    # use a TF-IDF vectorizer to count word usage and build a sparse matrix
    vectorizer = TfidfVectorizer(min_df=.1, max_df=.9)
    # fit the vectorizer on recent training rows only, so its vocabulary and
    # normalization never see the test set
    vectorizer.fit(train_and_validation[variable][
        pd.to_datetime(train_and_validation.date_posted) > pd.to_datetime('2013-11-1')])
    bag_of_words_X = vectorizer.transform(train_and_validation[variable])
    test_bag_of_words = vectorizer.transform(test[variable])
    ridge = RidgeCV(np.array([18]), store_cv_values=True, normalize=True)
    # restrict to a recent date range to guarantee recency and limit run time
    recent = pd.to_datetime(train_and_validation.date_posted) > pd.to_datetime('2013-11-8')
    ridge.fit(bag_of_words_X[recent.values], train_and_validation.is_exciting[recent])
    # store the predictions for later use as base classifiers in AdaBoost
    var_nm = "b_of_wds_prds_" + variable
    train_and_validation[var_nm] = ridge.predict(bag_of_words_X)
    test[var_nm] = ridge.predict(test_bag_of_words)
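# Hypothetical usage sketch for bag_of_words_ridge, assuming train_and_validation
# and test are module-level DataFrames with 'date_posted', 'is_exciting', and
# free-text columns; the column names looped over below are illustrative, not
# taken from the original source.
for text_col in ['essay', 'short_description', 'need_statement']:
    bag_of_words_ridge(text_col)
# each frame now carries a new 'b_of_wds_prds_<column>' prediction column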
def _test_ridge_cv(filter_):
    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))
    assert len(ridge_cv.coef_.shape) == 1
    assert type(ridge_cv.intercept_) == np.float64

    cv = KFold(5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))
    assert len(ridge_cv.coef_.shape) == 1
    assert type(ridge_cv.intercept_) == np.float64
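# In scikit-learn's test suite a helper like this is driven with dense and sparse
# views of the same data; a minimal sketch of that pattern, assuming the
# DENSE_FILTER/SPARSE_FILTER convention from the sklearn ridge tests:
import scipy.sparse as sp

DENSE_FILTER = lambda X: X
SPARSE_FILTER = lambda X: sp.csr_matrix(X)

for filter_ in (DENSE_FILTER, SPARSE_FILTER):
    _test_ridge_cv(filter_)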
def pred_SOC(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection: union of lasso-selected and univariate-selected features
    SOC_lassoed_vars = lass_varselect(train, all_vars, 'SOC', .000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=4500)
    univ_selector.fit(train[all_vars], train['SOC'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['SOC'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen = [all_vars[x] for x in range(len(all_vars))
              if SOC_lassoed_vars[x] | pvals[x]]
    chosen2 = [all_vars[x] for x in range(len(all_vars))
               if SOC_lassoed_vars[x] | pvals2[x]]
    lass_only = [all_vars[x] for x in range(len(all_vars)) if SOC_lassoed_vars[x]]
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_for_prds'] = forst.predict(dset.loc[:, chosen])
    # gradient boosting on the smaller feature set
    gbr = GradientBoostingRegressor(n_estimators=900, learning_rate=.0785,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4, subsample=.4)
    gbr.fit(train[chosen2], train['SOC'])
    for dset in data:
        dset['SOC_gbr_prds'] = gbr.predict(dset.loc[:, chosen2])
    # lasso (disabled)
    # lass = Lasso(alpha=.00000025, positive=True)
    # lass.fit(train[all_vars], train['SOC'])
    # for dset in data:
    #     dset['SOC_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    SOC_ridge = RidgeCV(np.array([.315]), normalize=True)
    SOC_ridge.fit(train[all_vars], train['SOC'])
    for dset in data:
        dset['SOC_rdg_prds'] = SOC_ridge.predict(dset[all_vars])
    # SVR
    svr = svm.SVR(C=9000, epsilon=.1)
    svr.fit(train.loc[:, chosen], train['SOC'])
    for dset in data:
        dset['SOC_svr_prds'] = svr.predict(dset.loc[:, chosen])
    # combination: 'SOC_svr_prds' is listed twice, doubling its weight in the blend
    models = ['SOC_rdg_prds', 'SOC_svr_prds', 'SOC_gbr_prds',
              'SOC_for_prds', 'SOC_svr_prds']
    name = 'SOC_prds' + str(loop)
    write_preds(models, name, train, val, test, 'SOC')
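# lass_varselect and write_preds are used throughout the pred_* functions but are
# not defined in these excerpts. A plausible reconstruction, assuming lass_varselect
# returns a boolean support mask from a Lasso fit and write_preds blends the listed
# prediction columns (both are sketches, not the original implementations):
def lass_varselect(train, all_vars, target, alpha):
    # True where the Lasso kept a nonzero coefficient
    lass = Lasso(alpha=alpha)
    lass.fit(train[all_vars], train[target])
    return lass.coef_ != 0

def write_preds(models, name, train, val, test, target):
    # equal-weight blend; a column listed twice gets double weight
    for dset in (train, val, test):
        dset[name] = dset[models].mean(axis=1)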
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection: union of lasso-selected and univariate-selected features
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = [all_vars[x] for x in range(len(all_vars))
              if pH_lassoed_vars[x] | pvals[x]]
    lass_only = [all_vars[x] for x in range(len(all_vars)) if pH_lassoed_vars[x]]
    # random forest (the name 'neigh' is a leftover from an earlier KNN version)
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination: 'pH_for_prds' is listed twice, doubling its weight in the blend
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
class RidgeCVImpl():
    def __init__(self, alphas=[0.1, 1.0, 10.0], fit_intercept=True,
                 normalize=False, scoring=None, cv=None, gcv_mode=None,
                 store_cv_values=False):
        self._hyperparams = {
            'alphas': alphas,
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'scoring': scoring,
            'cv': cv,
            'gcv_mode': gcv_mode,
            'store_cv_values': store_cv_values}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
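# Minimal usage sketch for the RidgeCVImpl wrapper. It assumes SKLModel is bound
# to sklearn.linear_model.RidgeCV and that the installed scikit-learn still
# accepts the normalize parameter (removed in 1.2); the toy data is illustrative.
from sklearn.linear_model import RidgeCV as SKLModel

X = np.random.rand(50, 3)
y = X @ np.array([1.0, -2.0, 0.5])
model = RidgeCVImpl(alphas=[0.01, 0.1, 1.0]).fit(X, y)
preds = model.predict(X)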
def pred_sand(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    sand_lassoed_vars = lass_varselect(train, all_vars, 'Sand', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['Sand'])
    pvals = univ_selector.get_support()
    chosen = [all_vars[x] for x in range(len(all_vars))
              if sand_lassoed_vars[x] | pvals[x]]
    lass_only = [all_vars[x] for x in range(len(all_vars)) if sand_lassoed_vars[x]]
    # nearest neighbors (disabled)
    # neigh = KNeighborsRegressor(n_neighbors=2)
    # neigh.fit(train.loc[:, chosen], train['Sand'])
    # for dset in data:
    #     dset['sand_ngh_prds'] = neigh.predict(dset.loc[:, chosen])
    # SVR on the lasso-only features (disabled)
    # svr = svm.SVR()
    # svr.fit(train.loc[:, lass_only], train['Sand'])
    # for dset in data:
    #     dset['sand_svr_prds'] = svr.predict(dset.loc[:, lass_only])
    # random forest
    forst = RandomForestRegressor(n_estimators=200)
    forst.fit(train.loc[:, chosen], train['Sand'])
    for dset in data:
        dset['sand_for_prds'] = forst.predict(dset.loc[:, chosen])
    # SVR
    svr = svm.SVR(C=23000)
    svr.fit(train.loc[:, all_vars], train['Sand'])
    for dset in data:
        dset['sand_svr_prds'] = svr.predict(dset.loc[:, all_vars])
    # lasso (disabled)
    # lass = Lasso(alpha=.0000001, positive=True)
    # lass.fit(train[all_vars], train['Sand'])
    # for dset in data:
    #     dset['sand_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    sand_ridge = RidgeCV(np.array([1.135]), normalize=True)
    sand_ridge.fit(train[all_vars], train['Sand'])
    for dset in data:
        dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars])
    # combination: 'sand_svr_prds' is listed twice, doubling its weight in the blend
    models = ['sand_rdg_prds', 'sand_svr_prds', 'sand_for_prds', 'sand_svr_prds']
    # print(train.loc[0:20, models])
    name = 'sand_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Sand')
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=5000)
    univ_selector.fit(train[all_vars], train['Ca'])
    univ_selector2 = SelectKBest(score_func=f_regression, k=200)
    univ_selector2.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    pvals2 = univ_selector2.get_support()
    chosen = [all_vars[x] for x in range(len(all_vars))
              if Ca_lassoed_vars[x] | pvals[x]]
    chosen2 = [all_vars[x] for x in range(len(all_vars))
               if Ca_lassoed_vars[x] | pvals2[x]]
    # gradient boosting on the smaller feature set
    gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=.1695,
                                    max_depth=1, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train[chosen2], train['Ca'])
    for dset in data:
        dset['Ca_gbr_prds'] = gbr.predict(dset.loc[:, chosen2])
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])
    # ridge
    Ca_ridge = RidgeCV(np.array([4.925]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # SVR
    svr = svm.SVR(C=9500)
    svr.fit(train.loc[:, chosen], train['Ca'])
    for dset in data:
        dset['Ca_svr_prds'] = svr.predict(dset.loc[:, chosen])
    # combination: 'Ca_svr_prds' is listed twice, doubling its weight in the blend
    models = ['Ca_rdg_prds', 'Ca_gbr_prds', 'Ca_for_prds',
              'Ca_svr_prds', 'Ca_svr_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
def pred_P(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    P_lassoed_vars = lass_varselect(train, all_vars, 'P', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1600)
    univ_selector.fit(train[all_vars], train['P'])
    pvals = univ_selector.get_support()
    chosen = [all_vars[x] for x in range(len(all_vars))
              if P_lassoed_vars[x] | pvals[x]]
    lass_only = [all_vars[x] for x in range(len(all_vars)) if P_lassoed_vars[x]]
    # stack in the predictions already made for the other four targets
    chosen.append('sand_prds' + str(loop))
    chosen.append('pH_prds' + str(loop))
    chosen.append('SOC_prds' + str(loop))
    chosen.append('Ca_prds' + str(loop))
    # SVR
    svr = svm.SVR(C=10000, epsilon=.1)
    svr.fit(train.loc[:, all_vars], train['P'])
    for dset in data:
        dset['P_svr_prds'] = svr.predict(dset.loc[:, all_vars])
    # gradient boosting
    gbr = GradientBoostingRegressor(n_estimators=60, learning_rate=0.1,
                                    max_depth=5, random_state=42, verbose=0,
                                    min_samples_leaf=4)
    gbr.fit(train.loc[:, chosen], train['P'])
    for dset in data:
        dset['P_gbr_prds'] = gbr.predict(dset.loc[:, chosen])
    # ridge
    P_ridge = RidgeCV(np.array([.55]), normalize=True)
    P_ridge.fit(train[all_vars], train['P'])
    for dset in data:
        dset['P_rdg_prds'] = P_ridge.predict(dset[all_vars])
    # combination
    models = ['P_rdg_prds', 'P_svr_prds', 'P_gbr_prds']  # 'P_las_prds' dropped
    name = 'P_prds' + str(loop)
    write_preds(models, name, train, val, test, 'P')
def pred_Ca(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    # start at 0 (the original started at 1, silently skipping the first variable)
    chosen = [all_vars[x] for x in range(len(all_vars))
              if Ca_lassoed_vars[x] | pvals[x]]
    lass_only = [all_vars[x] for x in range(len(all_vars)) if Ca_lassoed_vars[x]]
    # random forest
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.loc[:, chosen], train['Ca'])
    # print(forst.feature_importances_)
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # combination: 'Ca_for_prds' is listed twice, doubling its weight in the blend
    models = ['Ca_las_prds', 'Ca_rdg_prds', 'Ca_for_prds', 'Ca_for_prds']
    name = 'Ca_prds' + str(loop)
    write_preds(models, name, train, val, test, 'Ca')
# run ridge with the optimal penalization
t0 = time.time()
# optimizer.x is the ridge penalty that minimized RMSE
ridge = RidgeCV(alphas=optimizer.x, store_cv_values=True, normalize=True)
ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
print("It took {time} minutes to run the optimized ridge".format(
    time=(time.time() - t0) / 60))

# create an OLS regression for word count
ols = sm.OLS(train.is_exciting, train.word_count)
results = ols.fit()

# add OLS and ridge predictions to the train and test data
train['ridge_predictions'] = ridge.predict(train_tokens)
train['length_predictions'] = train.word_count * results.params[0]
test['ridge_predictions'] = ridge.predict(test_tokens)
test['length_predictions'] = test.word_count * results.params[0]
data_for_ensemble = pd.DataFrame({
    "length_predictions": train.length_predictions,
    "ridge_predictions": train.ridge_predictions})

# fit a ridge regression that combines the bag of words and essay length
init_guess_ens = np.array([.0125])
t0 = time.time()
ensemble_optimizer = minimize(ensemble_ridge, init_guess_ens,
                              method='nelder-mead',
                              options={'xtol': 1e-2, 'disp': True})
print("It took {time} minutes to optimize".format(time=(time.time() - t0) / 60))
ridge = RidgeCV(alphas=np.array([ensemble_optimizer.x]),
                store_cv_values=True, normalize=True)
ensemble = ridge.fit(data_for_ensemble, train.is_exciting)
def ensemble_ridge(penalty):
    # optimizer objective: fit a ridge with the given penalty and return RMSE
    ridge = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(data_for_ensemble, train.is_exciting)
    predictions = ridge.predict(data_for_ensemble)
    return np.sqrt(np.mean((train.is_exciting - predictions)**2))
def pc_ridge(penalty):
    # takes a complexity penalty as input and returns the training RMSE
    ridge = RidgeCV(alphas=penalty, store_cv_values=True, normalize=True)
    ridge.fit(train_tokens[0:documents], train.is_exciting[0:documents])
    predictions = ridge.predict(train_tokens)
    return np.sqrt(np.mean((train.is_exciting - predictions)**2))
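# pc_ridge is presumably the objective that produced optimizer.x in the script
# above; a sketch of that call, assuming the same nelder-mead setup used for the
# ensemble (the initial guess is illustrative):
init_guess = np.array([1.0])
optimizer = minimize(pc_ridge, init_guess, method='nelder-mead',
                     options={'xtol': 1e-2, 'disp': True})
# optimizer.x then feeds RidgeCV(alphas=optimizer.x, ...)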
# add predictions to train features
ens_train_features['Adaboost'] = pd.DataFrame(
    clf.predict_proba(train_features.iloc[:, 0:30])[:, 1])
validation_for_p['Adaboost'] = pd.DataFrame(
    clf.predict_proba(validation_for_p.iloc[:, 0:30])[:, 1])
test_X['Adaboost'] = pd.DataFrame(clf.predict_proba(test_X.iloc[:, 0:30])[:, 1])
ens_train_features['Forest'] = rndm_forest_clf.predict_proba(
    train_features.iloc[:, 0:forest_features])[:, 1]
validation_for_p['Forest'] = rndm_forest_clf.predict_proba(
    validation_for_p.iloc[:, 0:forest_features])[:, 1]
test_X['Forest'] = rndm_forest_clf.predict_proba(
    test_X.iloc[:, 0:forest_features])[:, 1]
ens_train_features['Logit'] = logit.predict_proba(
    train_features.iloc[:, 0:logit_feats])[:, 1]
validation_for_p['Logit'] = logit.predict_proba(
    validation_for_p.iloc[:, 0:logit_feats])[:, 1]
test_X['Logit'] = logit.predict_proba(test_X.iloc[:, 0:logit_feats])[:, 1]
# RidgeCV.predict returns a 1-D array for a single target, so no [:, 1] slice
# (the original sliced it like predict_proba output, which would raise)
ens_train_features['Ridge'] = full_ridge.predict(
    train_features.iloc[:, 0:logit_feats])
validation_for_p['Ridge'] = full_ridge.predict(
    validation_for_p.iloc[:, 0:logit_feats])
test_X['Ridge'] = full_ridge.predict(test_X.iloc[:, 0:logit_feats])

# final tree: a random forest stacked on the four base-model prediction columns
ens_forest_clf = RandomForestClassifier(n_estimators=600, min_samples_split=6,
                                        min_samples_leaf=2)
ens_forest_clf.fit(ens_train_features, train_outcome)
validation['predictions'] = ens_forest_clf.predict_proba(
    validation_for_p.iloc[:, 167:171])[:, 1]
fpr, tpr, thresholds = roc_curve(validation.is_exciting, validation.predictions)
auc_score = auc(fpr, tpr)
auc_score

# submission
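# The trailing '# submission' marker suggests the test-set probabilities were
# written out next; a minimal sketch, assuming a 'projectid' id column survives
# in test_X and a two-column CSV format (both assumptions):
test_X['is_exciting'] = ens_forest_clf.predict_proba(
    test_X.iloc[:, 167:171])[:, 1]
test_X[['projectid', 'is_exciting']].to_csv('submission.csv', index=False)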