def main():
    # Read arguments
    parser = argparse.ArgumentParser(
        description="Build model for regression/classification")
    parser.add_argument("--config_file", type=str, required=True)
    parser.add_argument(
        "--max_events",
        type=int,
        default=-1,
        help="maximum number of events for training",
    )
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--wave",
        dest="mode",
        action="store_const",
        const="wave",
        default="tail",
        help="if set, use wavelet cleaning",
    )
    mode_group.add_argument(
        "--tail",
        dest="mode",
        action="store_const",
        const="tail",
        help="if set, use tail cleaning, otherwise wavelets",
    )
    args = parser.parse_args()

    # Read configuration file
    cfg = load_config(args.config_file)

    # Type of model (regression or classification)
    model_type = cfg["General"]["model_type"]

    # Import parameters
    data_dir = cfg["General"]["data_dir"]
    outdir = cfg["General"]["outdir"]
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    cam_ids = cfg["General"]["cam_id_list"]
    table_name_template = cfg["General"]["table_name_template"]
    table_name = [table_name_template + cam_id for cam_id in cam_ids]

    # List of features
    feature_list = cfg["FeatureList"]

    # Optimisation parameters
    method_name = cfg["Method"]["name"]
    tuned_parameters = [cfg["Method"]["tuned_parameters"]]
    scoring = "explained_variance"
    cv = cfg["Method"]["cv"]

    # Split fraction
    train_fraction = cfg["Split"]["train_fraction"]

    # Equality tests below replace the original `model_type in "regressor"`
    # idiom, which was a substring check and would also match e.g. "gress".
    if model_type == "regressor":
        data_file = cfg["General"]["data_file"].format(args.mode)
        filename = path.join(data_dir, data_file)

        # List of cuts
        cuts = make_cut_list(cfg["SigFiducialCuts"])
        init_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None))

        # Name of target
        target_name = cfg["Method"]["target_name"]
    elif model_type == "classifier":
        data_sig_file = cfg["General"]["data_sig_file"].format(args.mode)
        data_bkg_file = cfg["General"]["data_bkg_file"].format(args.mode)
        filename_sig = path.join(data_dir, data_sig_file)
        filename_bkg = path.join(data_dir, data_bkg_file)

        # List of cuts
        sig_cuts = make_cut_list(cfg["SigFiducialCuts"])
        bkg_cuts = make_cut_list(cfg["BkgFiducialCuts"])

        # Model
        if method_name == "AdaBoostClassifier":
            init_model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4))
        elif method_name == "RandomForestClassifier":
            init_model = RandomForestClassifier(
                n_estimators=500,
                max_depth=None,
                min_samples_split=0.05,
                max_features="sqrt",
                bootstrap=True,
                random_state=None,
                criterion="gini",
                class_weight="balanced_subsample",  # Reweight events for each tree
            )

        use_same_number_of_sig_and_bkg_for_training = cfg["Split"][
            "use_same_number_of_sig_and_bkg_for_training"]

    print("### Using {} for model construction".format(method_name))

    models = dict()
    for idx, cam_id in enumerate(cam_ids):
        print("### Building model for {}".format(cam_id))

        if model_type == "regressor":
            # Load data
            data = pd.read_hdf(filename, table_name[idx], mode="r")
            data = prepare_data(ds=data, cuts=cuts)[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name=target_name,
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(data_sig=data, train_fraction=train_fraction)
            print("Training sample: sig {}".format(len(factory.data_train)))
            print("Test sample: sig {}".format(len(factory.data_test)))
        elif model_type == "classifier":
            # Load data
            data_sig = pd.read_hdf(filename_sig, table_name[idx], mode="r")
            data_bkg = pd.read_hdf(filename_bkg, table_name[idx], mode="r")

            # Add label
            data_sig = prepare_data(ds=data_sig, label=1, cuts=sig_cuts)
            data_bkg = prepare_data(ds=data_bkg, label=0, cuts=bkg_cuts)
            data_sig = data_sig[0:args.max_events]
            data_bkg = data_bkg[0:args.max_events]

            # Init model factory
            factory = TrainModel(case=model_type,
                                 target_name="label",
                                 feature_name_list=feature_list)

            # Split data
            factory.split_data(
                data_sig=data_sig,
                data_bkg=data_bkg,
                train_fraction=train_fraction,
                force_same_nsig_nbkg=use_same_number_of_sig_and_bkg_for_training,
            )
            print("Training sample: sig {} and bkg {}".format(
                len(factory.data_train.query("label==1")),
                len(factory.data_train.query("label==0")),
            ))
            print("Test sample: sig {} and bkg {}".format(
                len(factory.data_test.query("label==1")),
                len(factory.data_test.query("label==0")),
            ))

        # Build model
        best_model = factory.get_optimal_model(init_model,
                                               tuned_parameters,
                                               scoring=scoring,
                                               cv=cv)

        if model_type == "classifier":
            # Print report
            print(
                classification_report(
                    factory.data_scikit["y_test"],
                    best_model.predict(factory.data_scikit["X_test"]),
                ))

            # Calibrate model if necessary on test data
            if cfg["Method"]["calibrate_output"] is True:
                print("==> Calibrate classifier...")
                best_model = CalibratedClassifierCV(best_model,
                                                    method="sigmoid",
                                                    cv="prefit")
                best_model.fit(factory.data_scikit["X_test"],
                               factory.data_scikit["y_test"])

        # Save model
        models[cam_id] = best_model
        outname = "{}_{}_{}_{}.pkl.gz".format(model_type, args.mode, cam_id,
                                              method_name)
        joblib.dump(best_model, path.join(outdir, outname))

        # Save data
        save_obj(
            factory.data_scikit,
            path.join(
                outdir,
                "data_scikit_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id),
            ),
        )
        factory.data_train.to_pickle(
            path.join(
                outdir,
                "data_train_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id),
            ))
        factory.data_test.to_pickle(
            path.join(
                outdir,
                "data_test_{}_{}_{}_{}.pkl.gz".format(
                    model_type, method_name, args.mode, cam_id),
            ))
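
# For reference, a minimal configuration sketch matching the keys read above.
# The real schema lives in the project's config file; every value below is an
# illustrative placeholder, not a setting from the original project:
#
# cfg = {
#     "General": {"model_type": "regressor", "data_dir": "data/",
#                 "outdir": "models/", "cam_id_list": ["LSTCam"],
#                 "table_name_template": "feature_events_",
#                 "data_file": "dl1_{}_gamma.h5"},
#     "FeatureList": ["hillas_width", "hillas_length"],
#     "Method": {"name": "AdaBoostRegressor", "target_name": "mc_energy",
#                "tuned_parameters": {"n_estimators": [100, 200]}, "cv": 3,
#                "calibrate_output": False},
#     "Split": {"train_fraction": 0.8},
#     "SigFiducialCuts": ["good_image == 1"],
# }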
# for metric in results.keys():
#     print("%s: %.3f" % (metric, np.average(results[metric])))

# print("KnnClassifier")
# parameters = [3, 5, 15, 121]
# for K in parameters:
#     print("n_estimators")
#     print(K)
#     reg = KNeighborsClassifier(n_neighbors=K, n_jobs=-1,
#                                algorithm='kd_tree', leaf_size=500)
#     cross_val_score(reg, X_new, y, scoring=scorer3, cv=KFold(n_splits=3))
#     results = scorer3.get_results()
#     for metric in results.keys():
#         print("%s: %.3f" % (metric, np.average(results[metric])))

'''-----------------------TEST-4--------------------------------------------'''
# NOTE: GaussianNB is a classifier; it is kept here exactly as in the
# original experiment alongside the two regressors.
Regs = {
    "linear_model": linear_model.Ridge(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0, n_estimators=10),
    "GaussianNB": GaussianNB()
}
for reg in Regs:
    print(reg)
    cross_val_score(Regs.get(reg), X_new, y, scoring=scorer,
                    cv=KFold(n_splits=5))
    results = scorer.get_results()
    for metric in results.keys():
        print("%s: %.3f" % (metric, np.average(results[metric])))
'''--------------------------------------------------------------------------'''

'''-------------------------------PART2---TEST1-----------------------------'''
'''test class_weights'''
test_id = test['id']
train_id = train['id']
train = train.drop(['id', 'labels'], axis=1)
test = test.drop('id', axis=1)
shift = 200

print("Script name:", sys.argv[0])
args = dict([arg.split('=', maxsplit=1) for arg in sys.argv[1:]])
print(args)

ESTIMATORS = {
    "encv": ElasticNetCV(),
    "rfr": RandomForestRegressor(n_estimators=250),
    "svr": SVR(C=1.0, epsilon=0.2),
    "gbr": GradientBoostingRegressor(n_estimators=250),
    "adb": AdaBoostRegressor(n_estimators=250),
    "knn4": KNeighborsRegressor(n_neighbors=4)
}

test_predictions = pd.DataFrame({'id': test_id, 'loss': np.nan})

name = args['classifier']
output = args.get("output", name + '_predictions.csv')
if name in ESTIMATORS.keys():
    estimator = ESTIMATORS[name]
    estimator.fit(train, train_labels)
    # Undo the log transform applied to the target during training
    test_labels = np.exp(estimator.predict(test)) - shift
    test_predictions = test_predictions.assign(loss=test_labels)
    test_predictions.to_csv(output, index=False)
# Mean absolute error, in dollars
mae_value = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae_value)

# Checking R^2
from sklearn.metrics import r2_score
print("r_Score:", r2_score(y_test, y_pred))

# Bagging
bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

# AdaBoost
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

# Decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)
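
# The same comparison can be run in one loop so every model is scored
# identically on the held-out split (a small sketch; it reuses the fitted
# estimators and the train/test arrays defined above):
for label, model in [("Bagging", bg), ("AdaBoost", regr),
                     ("DecisionTree", dt), ("GradientBoosting", gb)]:
    print("{}: train R^2 = {:.3f}, test R^2 = {:.3f}".format(
        label, model.score(X_train, y_train), model.score(X_test, y_test)))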
a3 = rfReg.predict(testX[varsUsed])
print(mean_squared_error(testY['score'], est.predict(testX[varsUsed])))

rfreg_tuned_parameters = [{'max_depth': [1, 2, 3],
                           'n_estimators': [50, 100, 150, 200]}]
rfregGS = ms.GridSearchCV(RandomForestRegressor(), rfreg_tuned_parameters,
                          cv=5, scoring='neg_mean_squared_error')
rfregGS.fit(trainX[varsUsed], trainY['score'])

a1 = ms.ParameterGrid(rfreg_tuned_parameters)
scoresave = np.zeros(len(a1))
for i in range(len(a1)):
    rfregmgs = RandomForestRegressor(**a1[i])
    rfregmgs.fit(trainX[varsUsed], trainY['score'])
    y_pred = rfregmgs.predict(testX[varsUsed])
    ndcg = util.ndcg.ndcg(testX[['srch_id', 'prop_id']], testY['score'],
                          y_pred)
    scoresave[i] = ndcg

adaReg = AdaBoostRegressor()
adaReg.fit(trainX[varsUsed], trainY['score'])
print(mean_squared_error(testY['score'], adaReg.predict(testX[varsUsed])))

ada_tuned_parameters = [{'loss': ['linear', 'square'],
                         'learning_rate': [0.5, 1, 2],
                         'n_estimators': [25, 50, 100, 150]}]
adaGS = ms.GridSearchCV(AdaBoostRegressor(), ada_tuned_parameters,
                        cv=5, scoring='neg_mean_squared_error')
adaGS.fit(trainX[varsUsed], trainY['score'])
# score() expects (X, y); the original passed the arguments reversed
print(adaGS.score(testX[varsUsed], testY['score']))
print(adaGS.best_params_)
print(adaGS.best_score_)

# predMat = pd.DataFrame()
rfReg = RandomForestRegressor(n_estimators=100)
rfReg.fit(trainX[varsUsed], trainY['score'])
est = GradientBoostingRegressor(n_estimators=100,
                                learning_rate=0.1,
                                max_depth=1,
                                random_state=0,
                                loss='ls').fit(trainX[varsUsed],
                                               trainY['score'])
# step 4. score
print('prediction score: ', end="")
print_score(y_test, y_pred)
print('{:.2f} seconds '.format(time() - start))

from sklearn.ensemble import AdaBoostRegressor

# Create the dataset
rng = np.random.RandomState(1)
# X = np.linspace(0, 6, 100)[:, np.newaxis]
# y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

# Fit regression models: a single tree versus an AdaBoost ensemble of trees
# (sklearn demo defaults, superseded by the dtr-based versions below)
# regr_1 = DecisionTreeRegressor(max_depth=4)
# regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
#                            n_estimators=300, random_state=rng)
regr_1 = dtr
regr_2 = AdaBoostRegressor(dtr, n_estimators=300, random_state=rng)
regr_1.fit(X_train, y_train)
regr_2.fit(X_train, y_train)

# Predict
y_1 = regr_1.predict(X_train)
y_2 = regr_2.predict(X_train)

# Plot the results
plt.figure()
plt.scatter(X_train, y_train, c="k", label="training samples")
plt.plot(X_train, y_1, c="g", label="n_estimators=1", linewidth=2)
X_train = train.drop(columns=['Cases', 'Date'])
y_train = train[target]
X_test = test.drop(columns=['Cases', 'Date'])
y_test = test[target]

import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    AdaBoostRegressor(n_estimators=200, random_state=42)
)
pipeline.fit(X_train, y_train)

y_test = y_test.fillna(y_test.mean())
y_pred = pipeline.predict(X_test)
print(r2_score(y_test, y_pred))

test['predicted_cases'] = pd.DataFrame(y_pred)
test_california = test[test['Province_State'].str.contains('California')
                       & (test['Case_Type'].str.contains('Confirmed'))]
    if isinstance(clf, (DecisionTreeClassifier, OneVsRestClassifier)):
        if _graphviz.is_supported():
            assert '<svg' in expl_html
        else:
            assert '<svg' not in expl_html

    assert res == get_res()


@pytest.mark.parametrize(['reg'], [
    [DecisionTreeRegressor(random_state=42)],
    [ExtraTreesRegressor(random_state=42)],
    [GradientBoostingRegressor(learning_rate=0.075, random_state=42)],
    [RandomForestRegressor(random_state=42)],
    [AdaBoostRegressor(random_state=42)],
])
def test_explain_tree_regressor(reg, boston_train):
    X, y, feature_names = boston_train
    reg.fit(X, y)
    res = explain_weights(reg, feature_names=feature_names)
    expl_text, expl_html = format_as_all(res, reg)
    for expl in [expl_text, expl_html]:
        assert 'BIAS' not in expl
        assert 'LSTAT' in expl
    if isinstance(reg, DecisionTreeRegressor):
        assert '---> 50' in expl_text


@pytest.mark.parametrize(['clf'], [
#     RandomForestClassifier(),
#     AdaBoostClassifier(),
#     GradientBoostingClassifier()
# ]
# for classifier in classifiers:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('classifier', classifier)])
#     pipe.fit(X_train, y_train)
#     print(classifier)
#     print("model score: %.3f" % pipe.score(X_test, y_test))
# ```

classifiers = [
    SVR(),
    DecisionTreeRegressor(random_state=random_seed),
    RandomForestRegressor(random_state=random_seed),
    AdaBoostRegressor(random_state=random_seed),
    GaussianProcessRegressor(random_state=random_seed),
    LinearRegression(),
    MLPRegressor(random_state=random_seed)
]

grid_params = {
    'SVR': {
        'SVR__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'SVR__C': list(np.logspace(-5, 15, num=11, base=2)),
        'SVR__gamma': list(np.logspace(-15, 3, num=10, base=2)),
    },
    'DecisionTreeRegressor': {
        # 'mse'/'mae' were renamed 'squared_error'/'absolute_error' in
        # scikit-learn 1.0; the old names are kept here as in the original.
        'DecisionTreeRegressor__criterion': ['mse', 'friedman_mse', 'mae'],
        'DecisionTreeRegressor__max_depth':
            list(np.linspace(1, 32, 32, endpoint=True)),
    def set_models(self, modelos=None):
        rs = 1
        models = []
        if self.problem_type == "Classification":
            # Ensemble methods
            if 'AdaBoostClassifier' in modelos:
                models.append(('AdaBoostClassifier',
                               AdaBoostClassifier(random_state=rs)))
            if 'GradientBoostingClassifier' in modelos:
                models.append(('GradientBoostingClassifier',
                               GradientBoostingClassifier(random_state=rs)))
            if 'BaggingClassifier' in modelos:
                models.append(('BaggingClassifier',
                               BaggingClassifier(random_state=rs)))
            if 'RandomForestClassifier' in modelos:
                models.append(('RandomForestClassifier',
                               RandomForestClassifier(random_state=rs)))
            if 'ExtraTreesClassifier' in modelos:
                models.append(('ExtraTreesClassifier',
                               ExtraTreesClassifier(random_state=rs)))
            # Non-linear methods
            if 'KNeighborsClassifier' in modelos:
                models.append(('KNeighborsClassifier', KNeighborsClassifier()))
            if 'DecisionTreeClassifier' in modelos:
                models.append(('DecisionTreeClassifier',
                               DecisionTreeClassifier(random_state=rs)))
            if 'MLPClassifier' in modelos:
                models.append(('MLPClassifier',
                               MLPClassifier(max_iter=1000, random_state=rs)))
            if 'SVC' in modelos:
                models.append(('SVC', SVC(random_state=rs)))
            # Linear methods
            if 'LinearDiscriminantAnalysis' in modelos:
                models.append(('LinearDiscriminantAnalysis',
                               LinearDiscriminantAnalysis()))
            if 'GaussianNB' in modelos:
                models.append(('GaussianNB', GaussianNB()))
            if 'LogisticRegression' in modelos:
                models.append(('LogisticRegression', LogisticRegression()))
            # Voting
            # estimators = []
            # estimators.append(("Voting_GradientBoostingClassifier",
            #                    GradientBoostingClassifier(random_state=rs)))
            # estimators.append(("Voting_ExtraTreesClassifier",
            #                    ExtraTreesClassifier(random_state=rs)))
            # voting = VotingClassifier(estimators)
            # if 'VotingClassifier' in modelos:
            #     models.append(('VotingClassifier', voting))
        elif self.problem_type == "Regression":
            # Ensemble methods
            if 'AdaBoostRegressor' in modelos:
                models.append(('AdaBoostRegressor',
                               AdaBoostRegressor(random_state=rs)))
            if 'GradientBoostingRegressor' in modelos:
                models.append(('GradientBoostingRegressor',
                               GradientBoostingRegressor(random_state=rs)))
            if 'BaggingRegressor' in modelos:
                models.append(('BaggingRegressor',
                               BaggingRegressor(random_state=rs)))
            if 'RandomForestRegressor' in modelos:
                models.append(('RandomForestRegressor',
                               RandomForestRegressor(random_state=rs)))
            if 'ExtraTreesRegressor' in modelos:
                models.append(('ExtraTreesRegressor',
                               ExtraTreesRegressor(random_state=rs)))
            # Non-linear methods
            if 'KNeighborsRegressor' in modelos:
                models.append(('KNeighborsRegressor', KNeighborsRegressor()))
            if 'DecisionTreeRegressor' in modelos:
                models.append(('DecisionTreeRegressor',
                               DecisionTreeRegressor(random_state=rs)))
            if 'MLPRegressor' in modelos:
                models.append(('MLPRegressor',
                               MLPRegressor(max_iter=1000, random_state=rs)))
            if 'SVR' in modelos:
                models.append(('SVR', SVR()))
            # Linear methods
            if 'LinearRegression' in modelos:
                models.append(('LinearRegression', LinearRegression()))
            if 'BayesianRidge' in modelos:
                models.append(('BayesianRidge', BayesianRidge()))
        return models
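
# Example usage (sketch; `builder` stands in for whatever instance of this
# class the surrounding project creates). Names absent from `modelos` are
# simply skipped by the checks above:
# builder.problem_type = "Regression"
# models = builder.set_models(modelos=['AdaBoostRegressor',
#                                      'RandomForestRegressor'])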
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import OneHotEncoder, StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was: -805.6529764814633
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=7, min_samples_leaf=4, min_samples_split=8)),
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=9, min_samples_leaf=11, min_samples_split=19)),
    OneHotEncoder(minimum_fraction=0.2, sparse=False, threshold=10),
    StackingEstimator(estimator=AdaBoostRegressor(
        learning_rate=0.5, loss="square", n_estimators=100)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.8, tol=0.001)),
    KNeighborsRegressor(n_neighbors=3, p=2, weights="distance"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
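
# A quick check of the exported pipeline (sketch; TPOT's exported template
# stops at `predict`, so this evaluation step is an addition):
from sklearn.metrics import mean_squared_error
print(mean_squared_error(testing_target, results))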
y = np.array(y)

# Define regressors to try: (model, name) pairs
classifiers = [
    (LinearRegression(n_jobs=-1), 'LinearRegression'),
    (RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
     "RandomForest"),
    (GradientBoostingRegressor(n_estimators=100, random_state=0),
     "GradientBoost"),
    (ExtraTreesRegressor(n_estimators=100, random_state=0), "ExtraTrees"),
    (DecisionTreeRegressor(random_state=0), "DecisionTrees"),
    (BaggingRegressor(n_estimators=100, n_jobs=-1, random_state=0),
     "Bagging"),
    (AdaBoostRegressor(n_estimators=100, random_state=0), "AdaBoost")
    # ,
    # (XGBRegressor(n_estimators=100, n_jobs=-1, random_state=0), "XGBoost")
]

######## SQUID Prediction
# Store all (name, MAE, R^2) results here:
squid_rocs = []
for clf, name in classifiers:
    print("Evaluating %s classifier (squid)" % name)
    mae, r2 = cross_validate_and_plot(clf, X, y, cols, name + "_squid",
                                      splits)
    # append, so earlier models' scores are not overwritten
    squid_rocs.append([name, mae, r2])
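
# The project-specific helper cross_validate_and_plot is defined elsewhere.
# A minimal stand-in with the same call signature (a hypothetical sketch,
# plotting omitted) could compute the two metrics it returns:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_error, r2_score

def cross_validate_and_plot_sketch(model, X, y, cols, tag, splits):
    # Out-of-fold predictions over `splits` folds, then the two summary scores
    preds = cross_val_predict(model, X, y, cv=splits)
    return mean_absolute_error(y, preds), r2_score(y, preds)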
showPredictionValidation(y_train, y_test, X_test, X_valid, df_result)
print(mae(y_test, y_pred))
print(mse(y_test, y_pred))
print(r2(y_test, y_pred))

pipelines = []
# =============================================================================
pipelines.append(('DSTR', DecisionTreeRegressor()))
pipelines.append(('GBM', GradientBoostingRegressor()))
pipelines.append(('RDMF', RandomForestRegressor()))
pipelines.append(('ADAB', AdaBoostRegressor()))
pipelines.append(('ETR', ExtraTreesRegressor()))
pipelines.append(('BAGR', BaggingRegressor()))
pipelines.append(('KNNR', KNeighborsRegressor(n_neighbors=7)))
# pipelines.append(('LR', LinearRegression()))
# pipelines.append(('Ridge', Ridge()))
# pipelines.append(('Lasso', Lasso()))
# pipelines.append(('SVR', SVR()))
## =============================================================================


def apply_loocv(X_train, y_train, X_test, y_test):
    dict = {}
    results = []
    names = []
    return (x_train, y, x_test)


x_train, y, x_test = get_train_test(tr, ts)

# In[13]:

from sklearn.ensemble import AdaBoostRegressor
# GridSearchCV moved out of sklearn.grid_search (removed in scikit-learn 0.20)
from sklearn.model_selection import GridSearchCV

# ### ADB

# In[14]:

params_adb = [{'learning_rate': [1, 1.2, 1.5, 1.7, 2],
               'n_estimators': [300, 400, 500]}]

# In[15]:

# 'mean_squared_error' scoring was renamed 'neg_mean_squared_error'
gsearch = GridSearchCV(estimator=AdaBoostRegressor(),
                       param_grid=params_adb,
                       scoring='neg_mean_squared_error',
                       n_jobs=50,
                       cv=5,
                       verbose=10)

# In[ ]:

gsearch.fit(x_train, y)
print(gsearch.best_params_)
print(gsearch.cv_results_)  # grid_scores_ was replaced by cv_results_
def price_predictions(ticker, start, end, forecast_out):
    file_path = symbol_to_path(ticker)
    df = pd.read_csv(file_path,
                     index_col="<DTYYYYMMDD>",
                     parse_dates=True,
                     usecols=[
                         "<DTYYYYMMDD>", "<OpenFixed>", "<HighFixed>",
                         "<LowFixed>", "<CloseFixed>", "<Volume>"
                     ],
                     na_values="nan")
    df = df.rename(
        columns={
            '<DTYYYYMMDD>': 'Date',
            "<OpenFixed>": 'Open',
            '<HighFixed>': 'High',
            '<LowFixed>': 'Low',
            '<CloseFixed>': 'Close',
            '<Volume>': 'Volume'
        })

    # columns order for backtrader type
    columnsOrder = ["Open", "High", "Low", "Close", "Volume", "OpenInterest"]
    # change the index by new index
    df = df.reindex(columns=columnsOrder)
    # change date index to increasing order
    df = df.sort_index()
    # take a part of dataframe
    df = df.loc[start:end]

    df['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    bbwindow = 25
    vlwindow = 10
    mmtum = 10
    df['BB_Value'] = compute_indicator_bb(df, window=bbwindow)
    df['Volatility'] = compute_indicator_volatility(df, timeperiod=vlwindow)
    df['Momentum'] = talib.MOM(df['Close'].values, timeperiod=mmtum)
    df['OBV'] = talib.OBV(df['Close'].values,
                          df['Volume'].values.astype(np.float64))
    df['MACD'], _, _ = talib.MACD(df['Close'].values,
                                  fastperiod=12,
                                  slowperiod=26,
                                  signalperiod=9)
    _, df['STOCH'] = talib.STOCH(df['High'].values,
                                 df['Low'].values,
                                 df['Close'].values,
                                 fastk_period=14,
                                 slowk_period=1,
                                 slowd_period=5)
    df['MFI'] = talib.MFI(df['High'].values,
                          df['Low'].values,
                          df['Close'].values,
                          df['Volume'].values.astype(np.float64),
                          timeperiod=14)
    # df['EMA3'] = pd.Series(pd.Series.ewm(df['Close'], span=3, min_periods=3 - 1).mean())
    # df['EMA6'] = pd.Series(pd.Series.ewm(df['Close'], span=6, min_periods=6 - 1).mean())
    # df['EMA18'] = pd.Series(pd.Series.ewm(df['Close'], span=18, min_periods=18 - 1).mean())
    df['PDI'] = talib.PLUS_DI(df['High'].values,
                              df['Low'].values,
                              df['Close'].values,
                              timeperiod=14)
    df['NDI'] = talib.MINUS_DI(df['High'].values,
                               df['Low'].values,
                               df['Close'].values,
                               timeperiod=14)
    # df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume', 'BB_Value',
    #          'Volatility', 'Momentum', 'MACD', 'STOCH', 'MFI', 'OBV']]
    # df = df[['Close', 'HL_PCT', 'PCT_change', 'Volume', 'BB_Value']]
    df.fillna(method="ffill", inplace=True)
    df.fillna(method="backfill", inplace=True)

    forecast_col = 'Close'
    # inplace : boolean, default False
    #     If True, fill in place. Note: this will modify any other views on
    #     this object (e.g. a no-copy slice for a column in a DataFrame).

    # Forecast 1% of the data
    # Copy the data from the Close column into a new column,
    # shifted back by forecast_out rows
    df['Target'] = df[forecast_col].shift(-forecast_out)
    # Drop removes the label
    # axis : int or axis name: column
    #     Whether to drop labels from the index (0 / 'index') or
    #     columns (1 / 'columns').
    X = np.array(df.drop(['Target'], axis=1))
    y_true = df[forecast_col][-forecast_out:]

    # Preprocessing input data
    X = preprocessing.scale(X)
    # from sklearn.preprocessing import MinMaxScaler
    # scaler = MinMaxScaler()
    # X = scaler.fit_transform(X)

    # Split the X and X_lately values out of the series
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Remove the NA values
    # df.dropna(inplace=True)

    # Target is the y vector taken from the Target column
    y = np.array(df['Target'].dropna())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # X_train, X_test, y_train, y_test = train_test_split(X, y)
    # from sklearn.preprocessing import MinMaxScaler
    # from sklearn.preprocessing import StandardScaler
    # scaler = MinMaxScaler()
    # scaler = StandardScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)
    # X_lately = scaler.transform(X_lately)

    n_neighbors = 5
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')
    knn.fit(X_train, y_train)
    print('Train score KNN: ', knn.score(X_train, y_train),
          'Test score KNN : ', knn.score(X_test, y_test))
    forecast_set = knn.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    bagging = BaggingRegressor(DecisionTreeRegressor(),
                               n_estimators=50,
                               random_state=50)
    bagging.fit(X_train, y_train)
    print('Train score BAG: ', bagging.score(X_train, y_train),
          'Test score BAG : ', bagging.score(X_test, y_test))
    forecast_set = bagging.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    rf = RandomForestRegressor(n_estimators=50, random_state=50)
    rf.fit(X_train, y_train)
    print('Train score RF: ', rf.score(X_train, y_train),
          'Test score RF : ', rf.score(X_test, y_test))
    forecast_set = rf.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)

    adaboost = AdaBoostRegressor(neighbors.KNeighborsRegressor(n_neighbors=5),
                                 n_estimators=30,
                                 random_state=0)
    # adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
    #                              n_estimators=30, random_state=0)
    adaboost.fit(X_train, y_train)
    print('Train score Ada: ', adaboost.score(X_train, y_train),
          'Test score Ada : ', adaboost.score(X_test, y_test))
    forecast_set = adaboost.predict(X_lately)
    print('Price for next {} days'.format(forecast_out), forecast_set)
    def read_constructor_json(self):
        for i in self.pipeline_constructor_json['estimators']:
            model_not_found = False
            if self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'RandomForestRegressor':
                model = RandomForestRegressor(n_jobs=-1)
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'Lasso':
                model = Lasso()
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'LinearRegression':
                model = LinearRegression()
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'KNeighborsRegressor':
                model = KNeighborsRegressor()
            elif self.pipeline_constructor_json['estimators'][i][
                    'model'] == 'AdaBoostRegressor':
                model = AdaBoostRegressor()
            else:
                model_not_found = True

            if model_not_found:
                print('Unidentified estimator: ' +
                      self.pipeline_constructor_json['estimators'][i]['model'])
            else:
                self.estimators.append({
                    'model': model,
                    'parameters': self.pipeline_constructor_json['estimators']
                    [i]['parameters']
                })

        for i in self.pipeline_constructor_json['pre-estimators']:
            model_not_found = False
            if self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'VarianceThreshold':
                model = VarianceThreshold()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'SelectKBest':
                model = CustomSelectKBest()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'MinMaxScaler':
                model = MinMaxScaler()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'StandardScaler':
                model = StandardScaler()
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'RFE':
                model = RFE(estimator=DecisionTreeRegressor())
            elif self.pipeline_constructor_json['pre-estimators'][i][
                    'model'] == 'SimpleImputer':
                model = SimpleImputer()
            else:
                model_not_found = True

            if model_not_found:
                print('Unidentified pre-estimator: ' +
                      self.pipeline_constructor_json['pre-estimators'][i]
                      ['model'])
            else:
                self.pre_estimators.append({
                    'model': model,
                    'parameters':
                    self.pipeline_constructor_json['pre-estimators'][i]
                    ['parameters']
                })
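
    # The constructor JSON consumed above has this shape (an illustrative
    # sketch inferred from the lookups in read_constructor_json; key names
    # other than 'estimators', 'pre-estimators', 'model', and 'parameters'
    # are placeholders):
    #
    # {
    #   "estimators": {
    #     "est1": {"model": "AdaBoostRegressor",
    #              "parameters": {"n_estimators": [50, 100]}}
    #   },
    #   "pre-estimators": {
    #     "pre1": {"model": "StandardScaler", "parameters": {}}
    #   }
    # }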
predictors = studies_no_encoding.drop('aadb', axis=1)
rf = h2o.estimators.H2ORandomForestEstimator(ntrees=50,
                                             max_depth=20,
                                             nfolds=10)
# H2O expects the predictor columns as a list of names
rf.train(x=predictors.columns.tolist(),
         y=response,
         training_frame=h2o.H2OFrame(studies_no_encoding))
h2o.H2OFrame(studies_no_encoding)

X = studies.drop('aadb', axis=1)
y = studies['aadb']

scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

# A single 'clf' step whose estimator is swapped out by the grid below
pipeline = Pipeline([
    ('clf', linear_model.Lasso()),
])

parameters = [
    {
        'clf': (linear_model.Lasso(),)
    },
    {
        'clf': (AdaBoostRegressor(),),
        'clf__n_estimators': (1, 5, 25, 100)
    },
]

grid_search = GridSearchCV(pipeline, parameters, cv=10,
                           scoring='neg_mean_squared_error')

len(studies)

## Don't expect linear models to do well
                                                        test_size=0.2,
                                                        random_state=1)
    return (X_train, X_test, y_train, y_test), column_names


'''Question 3: 3 classes of algorithm'''
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=1)
gdbr = GradientBoostingRegressor(learning_rate=0.1,
                                 loss='ls',
                                 n_estimators=100,
                                 random_state=1)
abr = AdaBoostRegressor(DecisionTreeRegressor(),
                        learning_rate=0.1,
                        loss='linear',
                        n_estimators=100,
                        random_state=1)

k_fold = KFold(n_splits=5, shuffle=True)


def cv_mse_r2(model):
    '''
    Takes an instantiated model (estimator) and returns the average mean
    square error (mse) and coefficient of determination (r2) from kfold
    cross-validation.

    Parameters:
      estimator: model object
      X_train: 2d numpy array
      y_train: 1d numpy array
      nfolds: the number of folds in the kfold cross-validation

    Returns:
      mse: average mean_square_error of model over number of folds
    '''
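    # Completion sketch: the original body is cut off here. Based on the
    # docstring, average the MSE and R^2 over the k_fold splits; since the
    # function only receives the model, X_train / y_train are assumed to be
    # the module-level arrays.
    from sklearn.metrics import mean_squared_error, r2_score
    mse_list, r2_list = [], []
    for train_idx, test_idx in k_fold.split(X_train):
        model.fit(X_train[train_idx], y_train[train_idx])
        y_hat = model.predict(X_train[test_idx])
        mse_list.append(mean_squared_error(y_train[test_idx], y_hat))
        r2_list.append(r2_score(y_train[test_idx], y_hat))
    return np.mean(mse_list), np.mean(r2_list)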
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase '
    #           'training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        # NOTE: 'presort' was deprecated in scikit-learn 0.22 and removed in
        # 0.24; these defaults target older versions.
        'GradientBoostingRegressor': {'presort': False,
                                      'learning_rate': 0.1,
                                      'warm_start': True},
        'GradientBoostingClassifier': {'presort': False,
                                       'learning_rate': 0.1,
                                       'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.15,
                          'num_leaves': 8, 'lambda_l2': 0.001,
                          'histogram_pool_size': 16384},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.15,
                           'num_leaves': 8, 'lambda_l2': 0.001,
                           'histogram_pool_size': 16384},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50,
                                  'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50,
                                   'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search is True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if
        # the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the '
              'final params that will be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor()
        model_map['CatBoostClassifier'] = CatBoostClassifier()

    if model_name[:12] == 'DeepLearning':
        if keras_installed is False:
            # Suppress some level of logs if TF is installed (but allow it to
            # not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except Exception:
                pass

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not '
              'available when we try to import it, or using a value for '
              'model_names that we do not recognize.')
        raise e

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1

    model_with_params = model_without_params.set_params(**model_params)
    return model_with_params
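
# Example usage (sketch): request a stock AdaBoostRegressor while overriding
# one default; the override goes through the training_params.update branch
# shown above.
# model = get_model_from_name('AdaBoostRegressor',
#                             training_params={'n_estimators': 100})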
    'MSZoning', 'CentralAir', 'KitchenQual', 'Neighborhood', 'Condition1',
    'Heating'
]

numerical_transformer = SimpleImputer(strategy="mean")
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, numerical_features),
                  ('cat', categorical_transformer, categorical_features)])

# NOTE: base_estimator was renamed to estimator in scikit-learn 1.2
regressor = AdaBoostRegressor(
    n_estimators=26,
    base_estimator=DecisionTreeRegressor(max_depth=20),
    learning_rate=1.36)
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', regressor)])

# test_X.fillna({'KitchenQual': 'TA', 'MSZoning': 'RL'}, inplace=True)
# train_X, test_X = prepare_categorical_features(train_X, test_X, categorical_features, numerical_features)

print("Fitting regressor...")
# regressor.fit(train_X, train_y)
my_pipeline.fit(train_X, train_y)

print("Predicting labels...")
# test_predictions = regressor.predict(test_X)
test_predictions = my_pipeline.predict(test_X)
                    param_grid=param_grid,
                    scoring=scoring,
                    cv=kfold)
grid_result = grid.fit(X=rescaledX, y=y_train)

print('Best: {} using {}'.format(grid_result.best_score_,
                                 grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('{} ({}) with {}'.format(mean, std, param))

# b) Ensemble algorithms
ensembles = {}
# models
ensembles['ScalerAB'] = Pipeline([('Scaler', StandardScaler()),
                                  ('AB', AdaBoostRegressor())])
ensembles['ScalerAB-KNN'] = Pipeline([
    ('Scaler', StandardScaler()),
    ('ABKNN',
     AdaBoostRegressor(base_estimator=KNeighborsRegressor(n_neighbors=3)))
])
ensembles['ScalerAB-LR'] = Pipeline([
    ('Scaler', StandardScaler()),
    ('ABLR', AdaBoostRegressor(base_estimator=LinearRegression()))
])
ensembles['ScalerRFR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('RFR', RandomForestRegressor())])
ensembles['ScalerETR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('ETR', ExtraTreesRegressor())])
ensembles['ScalerGBR'] = Pipeline([('Scaler', StandardScaler()),
                                   ('GBR', GradientBoostingRegressor())])
# In[334]:

# Random Forest
RF = RandomForestRegressor(n_estimators=20, random_state=36)
RF.fit(X_train, y_train)
y_pred_RF = RF.predict(X_test)
print("Random Forest Regression R^2 value: " +
      str(r2_score(y_test, y_pred_RF)))
print("Random Forest Regression MSE value: " +
      str(mean_squared_error(y_test, y_pred_RF)))

# In[337]:

# AdaBoost regressor
ADA = AdaBoostRegressor()
ADA.fit(X_train, y_train)
y_pred_ADA = ADA.predict(X_test)
print("Ada Boost Regression R^2 value: " + str(r2_score(y_test, y_pred_ADA)))
print("Ada Boost Regression MSE value: " +
      str(mean_squared_error(y_test, y_pred_ADA)))

# Random forest seems like the best regression model. As a result, I am going
# to validate the results of the Random Forest model for comparison with
# classification techniques.

# In[338]:

r2_scores_random_forest = []
MSE_scores_random_forest = []
kf = KFold(n_splits=len(df_columns_list))
    GradientBoostingRegressor(),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
    SVR(),  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR
    LinearSVR(),  # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html
    ElasticNet(alpha=0.001, max_iter=10000),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
    SGDRegressor(max_iter=10000, tol=1e-3),  # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
    BayesianRidge(),
    KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),  # https://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html
    ExtraTreesRegressor(),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
    XGBRegressor(),
    AdaBoostRegressor(n_estimators=50),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
    BaggingRegressor(),  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
    DecisionTreeRegressor(),  # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
    KNeighborsRegressor()  # https://scikit-learn.org/0.18/modules/generated/sklearn.neighbors.KNeighborsRegressor.html
]

for m in models:
    print("- {}".format(m.__class__.__name__))
files.download('SVR.csv')

from sklearn import metrics

# MAE
print(metrics.mean_absolute_error(y_test, prediction))
# MSE
print(metrics.mean_squared_error(y_test, prediction))
# RMSE
print(np.sqrt(metrics.mean_squared_error(y_test, prediction)))

from sklearn.ensemble import AdaBoostRegressor

model_ada = AdaBoostRegressor(n_estimators=100)
fit = model_ada.fit(X_train, y_train)
prediction_ada = model_ada.predict(X_test)

from matplotlib import pyplot as plt

plt.plot(t, y_test, 'bs', t, prediction_ada, 'g^')
plt.xlabel('Samples')
plt.ylabel('prediction')
plt.title('adaBoost')

Data = [prediction_ada, y_test]
Data = pd.DataFrame(Data)
Data = Data.T
Data
             max_iter=500,
             n_jobs=-1)

# Fit the scaler on the training split, then apply it to the test split
# (the original fit it on the test data first)
stda = StandardScaler(with_mean=False)
processTrain = pd.DataFrame(stda.fit_transform(x_train))
processTest = pd.DataFrame(stda.transform(x_test))
lr_oof_train, lr_oof_test = sm.get_oof_tree(lr, processTrain, y_train,
                                            processTest, ntrain, ntest)
# lr_oof_train, lr_oof_test = sm.get_oof_tree(lr, x_train, y_train, x_test, ntrain, ntest)

# get_oof_regressor(clf, x_train, y_train, x_test, ntrain, ntest, NFOLDS=5)
rf = RandomForestRegressor(n_estimators=600,
                           max_depth=8,
                           n_jobs=-1,
                           random_state=SEED)
ada = AdaBoostRegressor(n_estimators=60,
                        learning_rate=0.01,
                        loss='square',
                        random_state=SEED)
gb = GradientBoostingRegressor(learning_rate=0.02,
                               n_estimators=80,
                               subsample=0.75,
                               max_depth=6,
                               random_state=SEED)
et = ExtraTreesRegressor(n_estimators=150,
                         max_depth=8,
                         max_features='sqrt',
                         n_jobs=-1,
                         random_state=SEED)

rf_reg_train, rf_reg_test = sm.get_oof_regressor(rf, x_train, y_train, x_test,
                                                 ntrain, ntest)
ada_reg_train, ada_reg_test = sm.get_oof_regressor(ada, x_train, y_train,
                                                   x_test, ntrain, ntest)
# more neighbours + weighting according to distance
train_display(KNeighborsRegressor(n_neighbors=5, weights='distance'), img)
train_display(KNeighborsRegressor(n_neighbors=25, weights='distance'), img)

# KNN
train_display(KNeighborsRegressor(n_neighbors=2, metric='canberra'), img)

# # gradient boosting
# train_display(XGBoostRegressor(max_depth=5,
#                                n_estimators=100,
#                                subsample=0.5, nthreads=4), img)

# # Gradient Boosting with deep trees
# train_display(XGBoostRegressor(max_depth=12, n_estimators=100,
#                                subsample=0.5, nthreads=4, eta=0.1), img)

# # NN
# train_display(TheanetsRegressor(layers=[20, 20], hidden_activation='tanh',
#                                 trainers=[{'algo': 'adadelta', 'learning_rate': 0.01}]), img)

# AdaBoost over decision trees using random projections
base = make_pipeline(GaussianRandomProjection(n_components=10),
                     DecisionTreeRegressor(max_depth=10, max_features=5))
train_display(AdaBoostRegressor(base, n_estimators=50, learning_rate=0.05),
              img)

# Bagging over decision trees using random projections, sometimes referred to
# as a Random Forest
base = make_pipeline(GaussianRandomProjection(n_components=15),
                     DecisionTreeRegressor(max_depth=12, max_features=5))
train_display(BaggingRegressor(base, n_estimators=100), img)
regressorDT = DecisionTreeRegressor(random_state=0)
regressorDT.fit(x_train, y_train)
y_predDT = regressorDT.predict(x_test)
y_trainpredDT = regressorDT.predict(x_train)
print(np.sqrt(metrics.mean_squared_error(y_test, y_predDT)))
print(np.sqrt(metrics.mean_squared_error(y_train, y_trainpredDT)))
print('Variance score: %.2f' % metrics.r2_score(y_test, y_predDT))

# AdaBoost
from sklearn.ensemble import AdaBoostRegressor

# NOTE: base_estimator was renamed to estimator in scikit-learn 1.2
ada = AdaBoostRegressor(base_estimator=regressorDT,
                        learning_rate=1.0,
                        loss='linear',
                        n_estimators=50,
                        random_state=None)
ada.fit(x_train, y_train)
y_predada = ada.predict(x_test)
y_trainpredada = ada.predict(x_train)
print(np.sqrt(metrics.mean_squared_error(y_test, y_predada)))
print(np.sqrt(metrics.mean_squared_error(y_train, y_trainpredada)))
print('Variance score: %.2f' % metrics.r2_score(y_test, y_predada))

# ExtraTrees regressor
from sklearn.ensemble import ExtraTreesRegressor
extra = ExtraTreesRegressor(n_estimators=10,
                            criterion='mse',
pipelines.append(('KNN',
                  Pipeline([('Scaler', StandardScaler()),
                            ('KNN', KNeighborsRegressor())])))
pipelines.append(('DTR',
                  Pipeline([('Scaler', StandardScaler()),
                            ('DTR', DecisionTreeRegressor())])))
pipelines.append(('RF',
                  Pipeline([('Scaler', StandardScaler()),
                            ('RF', RandomForestRegressor())])))
pipelines.append(('ADA',
                  Pipeline([('Scaler', StandardScaler()),
                            ('ADA', AdaBoostRegressor())])))
pipelines.append(('SVR',
                  Pipeline([('Scaler', StandardScaler()), ('SVR', SVR())])))
pipelines.append(('SVR-RBF',
                  Pipeline([('Scaler', StandardScaler()),
                            ('SVR', SVR(kernel='rbf', C=100, gamma=0.1,
                                        epsilon=.1))])))
pipelines.append(('SVR-Linear',
                  Pipeline([('Scaler', StandardScaler()),
                            ('SVR', SVR(kernel='linear', C=100,
                                        gamma='auto'))])))
pipelines.append(('SVR-Poly',
                  Pipeline([('Scaler', StandardScaler()),
                            ('SVR', SVR(kernel='poly',
def train(classifier, X, Y, is_classf, outcome, fs_method, imp_method,
          data_dir, results_dir, cv=10, verbose=0):
    results_path = os.path.join(
        results_dir,
        'score_{}-{}-{}-{}.json'.format(classifier, outcome, fs_method,
                                        imp_method))
    if os.path.exists(results_path):
        if verbose:
            print("Model already trained. See {}".format(results_path))
        return

    if classifier == 'Linear':
        if is_classf:
            model = LogisticRegression()
        else:
            model = LinearRegression()
    elif classifier == 'Ridge':
        if is_classf:
            model = RidgeClassifierCV(alphas=(1e-3, 1e-2, 1e-1, 1, 10, 100))
        else:
            model = RidgeCV(alphas=(1e-3, 1e-2, 1e-1, 1, 10, 100))
        print("Finding best alpha...")
        model.fit(X, Y)
        best_alpha = model.alpha_
        print("Best alpha: {}".format(model.alpha_))
        if is_classf:
            model = RidgeClassifier(alpha=best_alpha)
        else:
            model = Ridge(alpha=best_alpha)
    elif classifier == 'AdaBoost':
        if is_classf:
            estimator = DecisionTreeClassifier(max_depth=1)
            model = AdaBoostClassifier(estimator, n_estimators=100)
        else:
            estimator = DecisionTreeRegressor(max_depth=1)
            model = AdaBoostRegressor(estimator, n_estimators=100)
    elif classifier == 'RandomForest':
        if is_classf:
            model = RandomForestClassifier(n_estimators=50)
        else:
            model = RandomForestRegressor(n_estimators=50)
    elif classifier == 'SVM':
        if is_classf:
            model = SVC(kernel='linear', probability=True)
        else:
            model = SVR(kernel='linear')
    else:
        raise ValueError("model {} not available".format(classifier))

    scores = {}
    metric = brier_score_loss if is_classf else mean_squared_error
    metric_str = 'brier_loss' if is_classf else 'mse'

    if verbose:
        print("Training {}...".format(classifier))
        print("10-Fold cross validation...")
    start = time.time()
    kf = KFold(n_splits=cv)
    kf.get_n_splits(X)
    losses = []
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        if verbose:
            sys.stdout.write("\rFold {}/{}".format(i + 1, cv))
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]
        model.fit(X_train, Y_train)
        if classifier == 'Ridge' and is_classf:
            d = model.decision_function(X_test)
            Y_pred = np.exp(d) / (1 + np.exp(d))
        else:
            Y_pred = (model.predict_proba(X_test)[:, 1]
                      if is_classf else model.predict(X_test))
        losses.append(metric(Y_test, Y_pred))
    mean_loss = np.mean(losses)
    scores['cv_{}'.format(metric_str)] = mean_loss
    total = int(time.time() - start)
    if verbose:
        print("\nTraining took {}m{}s.".format(total // 60, total % 60))
        print("cv mean {}: {:.4f}".format(metric_str, mean_loss))
        print("Bootstrapping...")

    bs_losses = []
    for i in range(cv):
        if verbose:
            sys.stdout.write("\rSample {}/{}".format(i + 1, cv))
        data = np.hstack((np.arange(len(X)).reshape(len(X), 1), X,
                          Y.reshape(len(Y), 1)))
        train = resample(data, n_samples=int(0.7 * len(X)))
        train_ids = set(train[:, 0].astype(np.int64))
        train = train[:, 1:]
        test = np.array(
            [sample[1:] for sample in data if sample[0] not in train_ids])
        X_train, Y_train = train[:, :-1], train[:, -1]
        X_test, Y_test = test[:, :-1], test[:, -1]
        model.fit(X_train, Y_train)
        if classifier == 'Ridge' and is_classf:
            # from https://stackoverflow.com/questions/22538080/scikit-learn-ridge-classifier-extracting-class-probabilities
            d = model.decision_function(X_test)
            Y_pred = np.exp(d) / (1 + np.exp(d))
        else:
            Y_pred = (model.predict_proba(X_test)[:, 1]
                      if is_classf else model.predict(X_test))
        bs_losses.append(metric(Y_test, Y_pred))
    mean_loss = np.mean(bs_losses)
    n = len(bs_losses)
    lower = mean_loss - 1.96 * np.std(bs_losses) / np.sqrt(n)
    upper = mean_loss + 1.96 * np.std(bs_losses) / np.sqrt(n)
    scores['bootstrap_{}'.format(metric_str)] = mean_loss
    scores['bootstrap_95_lower'] = lower
    scores['bootstrap_95_upper'] = upper
    if verbose:
        print("\nbootstrap mean {}: {:.4f}".format(metric_str, mean_loss))
        print("95% confidence interval: [{:.4f}, {:.4f}]".format(lower, upper))

    with open(results_path, 'w') as f:
        json.dump(scores, f)
    if verbose:
        print("Successfully saved scores.")
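
# Example invocation (sketch; every argument value below is a placeholder,
# not taken from the original project):
# train('AdaBoost', X, Y, is_classf=False, outcome='outcome',
#       fs_method='none', imp_method='mean',
#       data_dir='data/', results_dir='results/', cv=10, verbose=1)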
def evaluateIndividualregressors(x, y, train_size_pct):
    """
    evaluateIndividualregressors

    x : the features of the dataset to be used for predictions
    y : the target class for each row in "x"
    train_size_pct : float in the range (0.0, 1.0); the fraction of the
        dataset that should be used for training
    """
    max_depth_x2 = MAX_DEPTH * 2
    n_neighbors_x2 = N_NEIGHBORS * 2

    lr1 = LinearRegression()
    rf_x2 = RandomForestRegressor(max_depth=max_depth_x2, random_state=SEED)
    et = ExtraTreesRegressor(max_depth=MAX_DEPTH, random_state=SEED)
    dectree = DecisionTreeRegressor(max_depth=MAX_DEPTH, random_state=SEED)
    knn = KNeighborsRegressor(n_neighbors=N_NEIGHBORS)
    knn_x2 = KNeighborsRegressor(n_neighbors=n_neighbors_x2)
    knn3 = KNeighborsRegressor(n_neighbors=20, metric='euclidean')
    dumm = DummyRegressor()
    knb = neighbors.KNeighborsRegressor()
    SVR1 = MultiOutputRegressor(NuSVR())
    ada1 = MultiOutputRegressor(AdaBoostRegressor())
    gpc1 = GaussianProcessRegressor()
    bag = BaggingRegressor(base_estimator=ExtraTreesRegressor(),
                           n_estimators=10,
                           random_state=0)
    svr1 = MultiOutputRegressor(SVR())
    r1 = Ridge()
    r2 = RidgeCV()
    xgbrf = MultiOutputRegressor(XGBRFRegressor())
    xgb = MultiOutputRegressor(XGBRegressor())
    gbr = MultiOutputRegressor(
        GradientBoostingRegressor(n_estimators=100,
                                  learning_rate=0.1,
                                  max_depth=1,
                                  random_state=0,
                                  loss='squared_error'))
    lasso = MultiTaskLassoCV(random_state=42)
    Bay = MultiOutputRegressor(linear_model.BayesianRidge())
    # NOTE: the 'normalize' parameter was removed from LassoLars in
    # scikit-learn 1.2; this call targets older versions.
    lassolars = linear_model.LassoLars(alpha=.1, normalize=False)
    linsvr = MultiOutputRegressor(LinearSVR())

    regressor_mapping = {
        '1-linear regression': lr1,
        f'2-RandomForest case2-{max_depth_x2}': rf_x2,
        f'3-ExtraTrees-{MAX_DEPTH}': et,
        f'4-DecisionTree-{MAX_DEPTH}': dectree,
        f'5-KNeighbors case1-{N_NEIGHBORS}': knn,
        f'5-KNeighbors case2-{n_neighbors_x2}': knn_x2,
        '6-knn case 3': knn3,
        '7-dummy-': dumm,
        '8-neighbors.KNeighbors-': knb,
        '9-NuSVR-': SVR1,
        '10- adaboost-': ada1,
        '11- GaussianProcessRegressor': gpc1,
        '12- bagging': bag,
        '13- svr1': svr1,
        '14- ridge': r1,
        '15- ridgecv': r2,
        '16- xgbrf': xgbrf,
        '17- xgboost': xgb,
        '18- GradientBoosting': gbr,
        '19- lasso': lasso,
        '20- BayesianRidge': Bay,
        '21- lassolars': lassolars,
        '22- linsvr': linsvr
    }

    for model_name, model in regressor_mapping.items():
        train_test_model(model_name, model, x, y, train_size_pct)
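
# train_test_model is defined elsewhere in this project. A minimal stand-in
# with the same call signature (a hypothetical sketch, not the author's
# implementation) would split, fit, and report the test R^2:
from sklearn.model_selection import train_test_split

def train_test_model_sketch(model_name, model, x, y, train_size_pct):
    # Hold out (1 - train_size_pct) of the data for testing
    x_tr, x_te, y_tr, y_te = train_test_split(x, y,
                                              train_size=train_size_pct,
                                              random_state=SEED)
    model.fit(x_tr, y_tr)
    print("{}: test R^2 = {:.3f}".format(model_name, model.score(x_te, y_te)))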