def train_and_test_sensor(idx_sensor, id_sensor, n_sensors, use_lat=False): X_tr1, y_tr1, X_te1, y_te1 = to_array(X_tr_ord, y_tr_ord, X_te_ord, y_te_ord, id_sensor=id_sensor) if use_lat: X_tr2, y_tr2, X_te2, y_te2 = to_array(X_tr_lat, y_tr_lat, X_te_lat, y_te_lat, id_sensor=id_sensor) # Validation using TS split (just to obtain different MAE estimations, no hyperoptimization for the moment) cv_loss = [] for tr_idx, va_idx in TimeSeriesSplit(n_splits=5).split(X_tr1): if not use_lat: train_data = np.atleast_3d(X_tr1[tr_idx]) validation_data = np.atleast_3d(X_tr1[va_idx]) model = conv1D_lon(idx_sensor, n_sensors=n_sensors) else: train_data = [ np.atleast_3d(X_tr1[tr_idx]), np.atleast_3d(X_tr2[tr_idx]) ] validation_data = [ np.atleast_3d(X_tr1[va_idx]), np.atleast_3d(X_tr2[va_idx]) ] model = conv1D_lon_lat(idx_sensor, n_sensors=n_sensors) model.compile(opt, loss='mean_absolute_error') model.fit(train_data, y_tr1[tr_idx], batch_size=batch_size, epochs=epochs, validation_data=(validation_data, y_tr1[va_idx]), callbacks=[c2, c3], verbose=0) cv_loss.append(c3.history['val_loss'][-1]) # Testing if not use_lat: train_data = np.atleast_3d(X_tr1) validation_data = np.atleast_3d(X_te1) model = conv1D_lon(idx_sensor, n_sensors=n_sensors) else: train_data = [np.atleast_3d(X_tr1), np.atleast_3d(X_tr2)] validation_data = [np.atleast_3d(X_te1), np.atleast_3d(X_te2)] model = conv1D_lon_lat(idx_sensor, n_sensors=n_sensors) model.compile(opt, loss='mean_absolute_error') model.fit(train_data, y_tr1, batch_size=batch_size, epochs=epochs, validation_data=(validation_data, y_te1), callbacks=[c2, c3], verbose=0) test_loss = c3.history['val_loss'][-1] #model.save('../models/conv1D_{}_{:1d}.h5'.format(id_sensor, use_lat)) print('MAE_val ', cv_loss) print('MAE_test ', test_loss) return test_loss, cv_loss
def modelsearch():
    # get the data
    _, train_df_field2, _, _, _, humidity_field2, _, _ = getdata()
    humidity_field2 = humidity_field2.values.reshape(-1)
    utils.logger.info(train_df_field2.shape)
    utils.logger.info(humidity_field2.shape)

    #rmtree(cachedir)
    cachedir = mkdtemp()  # creates a temporary directory
    pipe = createpipeline(cachedir)
    utils.logger.info(pipe)

    # Evaluate different algorithms using cross-validation (cv)
    methods = []
    #methods.append(('LR', LinearRegression()))  #no-good
    #methods.append(('RIDGE', Ridge(random_state=42)))  #no-good
    #methods.append(('LASSO', Lasso(random_state=42)))  #no-good
    #methods.append(('SGR', SGDRegressor(random_state=42)))  #no-good
    methods.append(('SVR', SVR(gamma='auto')))
    methods.append(('KNN', KNeighborsRegressor()))
    methods.append(('MLP', MLPRegressor(random_state=42, max_iter=2000, activation="tanh", shuffle=False)))
    methods.append(('GBR', GradientBoostingRegressor(random_state=42)))
    #methods.append(('CART', DecisionTreeRegressor(random_state=42)))
    #methods.append(('RFR', RandomForestRegressor(random_state=42, n_estimators=200)))
    #methods.append(('ETR', ExtraTreesRegressor(n_estimators=200, random_state=42)))
    #methods.append(('ABR', AdaBoostRegressor(n_estimators=200, random_state=42, base_estimator=RandomForestRegressor(random_state=42, max_depth=3))))
    #methods.append(('ABR.', AdaBoostRegressor(n_estimators=50, random_state=42, base_estimator=LinearRegression())))
    #methods.append(('ABR_', AdaBoostRegressor(n_estimators=50, random_state=42, base_estimator=DecisionTreeRegressor(random_state=42, max_depth=1))))
    #methods.append(('ABR__', AdaBoostRegressor(n_estimators=50, random_state=42, base_estimator=ExtraTreesClassifier(n_estimators=7, max_depth=2, random_state=42))))
    #methods.append(('BR', BaggingRegressor(n_estimators=200, random_state=42, base_estimator=RandomForestRegressor(random_state=42, max_depth=3))))
    #methods.append(('BR.', BaggingRegressor(n_estimators=50, random_state=42, base_estimator=LinearRegression())))
    #methods.append(('BR_', BaggingRegressor(n_estimators=50, random_state=42, base_estimator=DecisionTreeRegressor(random_state=42, max_depth=1))))
    #methods.append(('BR__', BaggingRegressor(n_estimators=50, random_state=42, base_estimator=ExtraTreesClassifier(n_estimators=7, max_depth=2, random_state=42))))
    #base_estimator=LogisticRegression(solver='lbfgs', random_state=42, class_weight=class_weights)
    #base_estimator=DecisionTreeClassifier(random_state=42, max_depth=5, class_weight=class_weights)
    #base_estimator=ExtraTreesClassifier(n_estimators=200, max_depth=5, random_state=42, class_weight=class_weights)

    results = []
    names = []
    for name, method in methods:
        #sKfold = model_selection.StratifiedKFold(n_splits=2, random_state=42)
        # cross-validation
        ts_cv = TimeSeriesSplit(5)  # 5-fold forward chaining
        cv_results = cross_val_score(method, pipe.fit_transform(train_df_field2), humidity_field2,
                                     cv=ts_cv, scoring='neg_mean_squared_error', verbose=1)
        results.append(cv_results)
        names.append(name)
        # cv_results is an ndarray, so convert it before concatenating with a string
        utils.logger.info(name + " : " + str(cv_results))

    for i in range(len(names)):
        result = results[i]
        name = names[i]
        msg = "%s: %f mean (+/- %f) std" % (name, result.mean(), result.std())
        utils.logger.info(msg)  # performance of methods
def main(): r = Reader() seasons = [10, 11, 12, 13, 14, 15, 16, 17] x_train = {} y_train = {} x_test = {} y_test = {} vec = DictVectorizer(sparse=False) for season in seasons[:-1]: x, y = r.read("data/"+str(season)+".csv") x_train[season], y_train[season] = x, y x, y = r.read("data/"+str(seasons[-1])+".csv") x_test[seasons[-1]], y_test[seasons[-1]] = x, y # read pred_data x, y = r.read("data/18.csv", interactive=True) x_pred = {} y_pred = {} x_pred[18], y_pred[18] = x, y #x_test.update(x_train) #y_test.update(y_train) #x_train, y_train = transform_to_lstm(x_train, y_train) #x_test, y_test = transform_to_lstm(x_test, y_test) x_train, y_train = dict_list_transform(x_train, y_train) x_test, y_test = dict_list_transform(x_test, y_test) pred_data, y_pred = dict_list_transform(x_pred, y_pred) #print(len(x_test[0])) #print(x_data['Marco Reus'], y_data['Marco Reus']) x_all = pandas.DataFrame(x_test+x_train+pred_data, columns=['name', 'position', 'age', 'club']) x_train = pandas.DataFrame(x_train, columns=['name', 'position', 'age', 'club']) x_test = pandas.DataFrame(x_test, columns=['name', 'position', 'age', 'club']) pred_data = pandas.DataFrame(pred_data, columns=['name', 'position', 'age', 'club']) vec.fit(x_all.to_dict('records')) #print(x_test.to_dict('records')) #train = pandas.DataFrame(x_train.assign(pts=y_train), columns=['name', 'position', 'age', 'club', 'pts']) X_train, X_test = vec.transform(x_train.to_dict('records')), vec.transform(x_test.to_dict('records')) pred_data = vec.transform(pred_data.to_dict('records')) #DEEP NETWORK #print(X_train, y_train) #none = vec.vocabulary_['club=None'] #for p in X_train: # if p[none] == 1: # p = np.full(p.shape, 2) X_train = pandas.DataFrame(X_train).values X_test = pandas.DataFrame(X_test).values y_train = pandas.DataFrame(y_train).values y_test = pandas.DataFrame(y_test).values # lstm reshape X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])) X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1])) pred_data = pred_data.reshape((pred_data.shape[0], 1, pred_data.shape[1])) # init other kf = KFold(shuffle=True) tscv = TimeSeriesSplit(n_splits=3) scaler = StandardScaler() #init model #dnn = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0) #lstm = KerasRegressor(build_fn=lstm_model, epochs=100, batch_size=24, verbose=1) lstm = KerasRegressor(build_fn=lstm_model, epochs=200, batch_size=1, verbose=0) svr = SVR() lgbm = lgb.LGBMRegressor(boosting_type='dart', num_leaves=40, learning_rate=0.1) #get interactive data p = Parser() #p_int = p.parse_interactive() run_model("LSTM", lstm, [], X_train, X_test, y_train, y_test, tscv, vec, cv=False, out=False, pred_data=None, price_data=None, hyper=True)
def RunRF(Abs_train, Abs_test, X_train, Y_train, X_test, Y_test, name):
    model = RandomForestClassifier(n_estimators=301, criterion='gini', max_depth=40,
                                   min_samples_split=2, min_samples_leaf=10,
                                   min_weight_fraction_leaf=0.0, max_features='auto',
                                   max_leaf_nodes=None, min_impurity_decrease=0.00,
                                   min_impurity_split=None, bootstrap=True, oob_score=False,
                                   n_jobs=1, random_state=None, verbose=0, warm_start=False,
                                   class_weight={1: 1, -1: 1.15})
    relevant_features = FeatureSelection(X_train, Y_train, model, N_FEATURES)

    cv = TimeSeriesSplit(n_splits=10)  # cv = 3
    min_samples_leaf = [50 * i for i in range(1, 7)]
    min_samples_split = [i * 2 for i in range(1, 10)]
    param_grid = [{'min_samples_leaf': min_samples_leaf,
                   'min_samples_split': min_samples_split}]
    clf = model_selection.GridSearchCV(model, param_grid, scoring=None, fit_params=None,
                                       n_jobs=4, iid=True, refit='best_score_', cv=cv,
                                       verbose=0, pre_dispatch='2*n_jobs',
                                       error_score='raise', return_train_score='warn')
    clf.fit(X_train, Y_train)
    x = clf.best_params_
    print(x)
    # exit()
    model.set_params(**x)
    model.fit(X_train, Y_train)
    # model = RandomForestClassifier(n_estimators=300, criterion='entropy', random_state=0)

    X_train_ = X_train[:, relevant_features]
    X_test_ = X_test[:, relevant_features]
    model.fit(X_train_, Y_train)
    pred_test = model.predict(X_test_)
    pred_train = model.predict(X_train_)
    cnf_mat_test = GenerateCnfMatrix(pred_test, Y_test)
    cnf_mat_train = GenerateCnfMatrix(pred_train, Y_train)
    actual_dist = ComputeDistribution(Y_train, Y_test)
    accuracy = ComputeAccuracy(cnf_mat_test, cnf_mat_train, name, actual_dist)
    # print(np.mean(cross_val_score(model, X_train, Y_train, cv=100)))
    if CALCULATE_RETURNS == 'y':
        returns = ComputeReturns(Abs_test, Abs_train, pred_test, pred_train, Y_test, Y_train, name)
    print('------------------------------------------')
    return accuracy[2][0], pred_test
generation_df = get_data(weeks)

# prepare data
pre_pipeline = Pipeline([
    ('date_worker', mytransformers.DateTransformer()),
    ('shifter', mytransformers.Shifter())
])
processed_data = pre_pipeline.fit_transform(generation_df, shifter__hours=hours)
features = processed_data[0]
labels = processed_data[1]

# start mlflow run
with mlflow.start_run():
    # cross validation
    tscv = TimeSeriesSplit(5)
    for train_index, test_index in tscv.split(labels):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = Lasso(alpha).fit(X_train, y_train)
        preds = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, preds)

    # note: only the metrics from the last fold are logged
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("weeks", weeks)
    mlflow.log_param("hours", hours)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight

# X, y
dfDate = dfClean['Date']
dfY = dfClean['Target']
dfX = dfClean.copy().drop(columns=['Target', 'Date'])

# TS Split
nSplit = 3
cvSplit = TimeSeriesSplit(n_splits=nSplit)

# Class Weights (recent scikit-learn releases require keyword arguments here)
classWeights = compute_class_weight(class_weight='balanced', classes=np.unique(dfY), y=dfY.values)
print('The weights are: %s' % classWeights + ' respectively for the classes: %s' % np.unique(dfY))

# Define Models
nMinSample = 1250
modelRF = RandomForestClassifier(n_estimators=200, class_weight='balanced_subsample')
modelXGB = xgb.XGBClassifier(learning_rate=0.1, sample_weight=classWeights, nthread=-2)
#modelLogit_L1 = LogisticRegression(penalty='l1', class_weight='balanced', solver='liblinear')
modelLogit_L2 = LogisticRegression(penalty='l2',
            'Alcohol_full_bar', 'Alcohol_none', 'Caters_True', 'WiFi_free', 'WiFi_no', 'WiFi_paid',
            'BikeParking_True', 'NoiseLevel_average', 'NoiseLevel_loud', 'NoiseLevel_quiet',
            'NoiseLevel_very_loud', 'HasTV_True', 'OutdoorSeating_True', 'RestaurantsTakeOut_True',
            'RestaurantsReservations_True', 'GoodForKids_True', 'RestaurantsPriceRange2_1',
            'RestaurantsPriceRange2_2', 'RestaurantsPriceRange2_3', 'RestaurantsPriceRange2_4',
            'RestaurantsGoodForGroups_True', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday',
            'Tuesday', 'Wednesday']

train_data.dropna(inplace=True)
labels = ['running_average']
train_data_X = train_data[features]
train_data_y = train_data[labels]

alphas = [10**a for a in range(-5, 2)]
n_splits = 5
ts_CV = TimeSeriesSplit(n_splits=n_splits)

regList, rmse_list_test, rmse_list_train, r2_list_test, r2_list_train = regressionCV_new(
    train_data, features, labels, alphas, n_splits)
best_regMod = regList[np.argmin(rmse_list_test)]
min_rmse = np.amin(rmse_list_test)

features += ['running_average_past_bin']
regList_pr, rmse_list_test_pr, rmse_list_train_pr, r2_list_test_pr, r2_list_train_pr = regressionCV_new(
    train_data, features, labels, alphas, n_splits)
best_regMod_pr = regList_pr[np.argmin(rmse_list_test_pr)]
min_rmse_pr = np.amin(rmse_list_test_pr)
X_ext = X_ext.drop([
    'aaa', 'baa', 'wti_mom', 'incl103', 'm1_mom', 'm2_mom', 'usd_mom',
    'ism_inv_mom', 'ism_man_mom', 'ism_prices_mom', 'jobl_claims_mom',
    'gold_mom', 'spx_mom', 'wti_vol', 'spx_vol'
], axis=1)
X_ext = X_ext.dropna()

X_test = X.loc[:datetime(1999, 12, 1), :]
y_test = y.align(X_test, join='inner')[0]

reg1 = make_pipeline(
    StandardScaler(),
    RidgeClassifierCV(fit_intercept=False, normalize=False, cv=TimeSeriesSplit(12)))
reg2 = make_pipeline(StandardScaler(), Perceptron(fit_intercept=False))
reg3 = make_pipeline(StandardScaler(), PassiveAggressiveClassifier(fit_intercept=False))
reg4 = make_pipeline(
    StandardScaler(),
    SGDClassifier(loss='log', penalty='elasticnet', fit_intercept=False))
reg5 = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(cv=TimeSeriesSplit(12), penalty='l2', fit_intercept=False))
reg6 = make_pipeline(StandardScaler(), SVC(probability=True))
reg7 = make_pipeline(StandardScaler(), GaussianNB())
reg8 = make_pipeline(StandardScaler(), RandomForestClassifier())
from pandas import read_csv
from sklearn.model_selection import TimeSeriesSplit
from matplotlib import pyplot

# load data (forward slashes avoid the escape-sequence problem of 'data\E...')
series = read_csv('data/Elec_daily_Dmd_2D.csv', header=0, index_col=0)
# dmd2D = ['data','demand']
# series2D = series['dmd2D']
X = series.values
splits = TimeSeriesSplit(n_splits=3)
pyplot.figure(1)
index = 1
for train_index, test_index in splits.split(X):
    train = X[train_index]
    test = X[test_index]
    #train_size = int(len(X)*0.66)
    #train, test = X[0:train_size], X[train_size:len(X)]
    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))
    pyplot.subplot(310 + index)
    pyplot.plot(train)
    # pad with None so the test segment is drawn after the training segment;
    # use a lowercase loop variable so the data matrix X is not shadowed
    pyplot.plot([None for i in train] + [x for x in test])
    #pyplot(xlabel('date'))
    index += 1
pyplot.show()
# #print(series.head())
# # series.plot()
def random_forest_randomforward(X_train, y_train, X_test, y_test, n_selected_features=1000, scoring='accuracy', n_iter=1000): from sklearn.model_selection import TimeSeriesSplit from datetime import datetime as dt import random import warnings warnings.filterwarnings("ignore") st_t = dt.now() n_samples, n_features = X_train.shape n_estimators = [5, 10, 50, 100, 150, 200, 250, 300] max_depth = [5, 10, 25, 50, 75, 100] min_samples_leaf = [1, 2, 4, 8, 10] min_samples_split = [2, 4, 6, 8, 10] max_features = ["auto", "sqrt", "log2", None] hyperparameter = { 'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features': max_features } cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train) base_model_rf = RandomForestClassifier(criterion='gini', random_state=42) n_iter_search = 30 scoring = scoring # selected feature set, initialized to be empty count = 0 ddict = {} all_F = [] all_c = [] all_acc = [] all_model = [] while count < n_selected_features: #F = [] max_acc = 0 for i in range(n_iter): col_train = random.sample(list(X_train.columns), count + 1) col_train = np.array(col_train) X_train_tmp = X_train[col_train] acc = 0 rsearch_cv = RandomizedSearchCV( estimator=base_model_rf, random_state=42, param_distributions=hyperparameter, n_iter=n_iter_search, #cv=cv_timeSeries, cv=2, scoring=scoring, n_jobs=-1) rsearch_cv.fit(X_train_tmp, y_train) best_estimator = rsearch_cv.best_estimator_ y_pred = best_estimator.predict(X_test[col_train]) acc = metrics.accuracy_score(y_test, y_pred) if acc > max_acc: max_acc = acc idx = col_train best_model = best_estimator #F.append(idx) count += 1 print("The current number of features: {} - Accuracy: {}%".format( count, round(max_acc * 100, 2))) all_F.append(idx) all_c.append(count) all_acc.append(max_acc) all_model.append(best_model) c = pd.DataFrame(all_c) a = pd.DataFrame(all_acc) f = pd.DataFrame(all_F) f["All"] = f[f.columns[0:]].apply( lambda x: ', '.join(x.dropna().astype(str)), axis=1) all_info = pd.concat([c, a, f["All"]], axis=1) all_info.columns = ['Num_features', 'Accuracy', 'Features'] all_info = all_info.sort_values(by='Accuracy', ascending=False).reset_index(drop=True) print("The total time for searching subset: {}".format(dt.now() - st_t)) return all_info, all_model, f
def xgboost_forward(X_train, y_train, X_test, y_test, n_selected_features=1000, scoring='accuracy'): from sklearn.model_selection import TimeSeriesSplit from datetime import datetime as dt import random import warnings warnings.filterwarnings("ignore") st_t = dt.now() n_samples, n_features = X_train.shape n_estimators = [5, 10, 50, 100, 150, 200, 250, 300] max_depth = [5, 10, 25, 50, 75, 100] min_child_weight = [5, 10, 25, 50, 75, 100] gamma = [0.5, 1, 1.5, 2, 5] subsample = [0.2, 0.4, 0.6, 0.8, 1] colsample_bytree = [0.2, 0.4, 0.6, 0.8, 1] hyperparameter = { 'n_estimators': n_estimators, 'max_depth': max_depth, 'min_child_weight': min_child_weight, 'gamma': gamma, 'subsample': subsample, 'colsample_bytree': colsample_bytree } cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train) xgb = XGBClassifier(learning_rate=0.02, objective='multi:softmax', silent=True, nthread=20) n_iter_search = 30 scoring = scoring # selected feature set, initialized to be empty F = [] count = 0 ddict = {} all_F = [] all_c = [] all_acc = [] all_model = [] while count < n_selected_features: max_acc = 0 for i in X_train.columns: if i not in F: F.append(i) X_train_tmp = X_train[F] acc = 0 rsearch_cv = RandomizedSearchCV( estimator=xgb, random_state=42, param_distributions=hyperparameter, n_iter=n_iter_search, #cv=cv_timeSeries, cv=2, scoring=scoring, n_jobs=-1) rsearch_cv.fit(X_train_tmp, y_train) best_estimator = rsearch_cv.best_estimator_ y_pred = best_estimator.predict(X_test[F]) acc = metrics.accuracy_score(y_test, y_pred) F.pop() if acc > max_acc: max_acc = acc idx = i best_model = best_estimator F.append(idx) count += 1 print("The current number of features: {} - Accuracy: {}%".format( count, round(max_acc * 100, 2))) all_F.append(np.array(F)) all_c.append(count) all_acc.append(max_acc) all_model.append(best_model) c = pd.DataFrame(all_c) a = pd.DataFrame(all_acc) f = pd.DataFrame(all_F) f["All"] = f[f.columns[0:]].apply( lambda x: ', '.join(x.dropna().astype(str)), axis=1) all_info = pd.concat([c, a, f["All"]], axis=1) all_info.columns = ['Num_feature', 'Accuracy', 'Feature'] all_info = all_info.sort_values(by='Accuracy', ascending=False).reset_index(drop=True) print("The total time for searching subset: {}".format(dt.now() - st_t)) return all_info, all_model, f
def get_RandSearchCV(X_train, y_train, X_test, y_test, scoring, type_search, output_file):
    from sklearn.model_selection import TimeSeriesSplit
    from datetime import datetime as dt
    st_t = dt.now()

    # Number of trees used
    n_estimators = [5, 10, 50, 100, 150, 200, 250, 300]
    #n_estimators = list(np.arange(100, 1000, 50))
    #n_estimators = [1000]
    # Maximum depth of each tree
    max_depth = [5, 10, 25, 50, 75, 100]
    # Minimum number of samples per leaf
    min_samples_leaf = [1, 2, 4, 8, 10]
    # Minimum number of samples to split a node
    min_samples_split = [2, 4, 6, 8, 10]
    # Maximum number of features to consider when making splits
    max_features = ["auto", "sqrt", "log2", None]

    hyperparameter = {'n_estimators': n_estimators,
                      'max_depth': max_depth,
                      'min_samples_leaf': min_samples_leaf,
                      'min_samples_split': min_samples_split,
                      'max_features': max_features}

    cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train)
    base_model_rf = RandomForestClassifier(criterion="gini", random_state=42)
    base_model_gb = GradientBoostingClassifier(criterion="friedman_mse", random_state=42)

    # Run randomized search
    n_iter_search = 30
    if type_search == "RandomSearchCV-RandomForest":
        rsearch_cv = RandomizedSearchCV(estimator=base_model_rf,
                                        random_state=42,
                                        param_distributions=hyperparameter,
                                        n_iter=n_iter_search,
                                        cv=cv_timeSeries,
                                        scoring=scoring,
                                        n_jobs=-1)
    else:
        rsearch_cv = RandomizedSearchCV(estimator=base_model_gb,
                                        random_state=42,
                                        param_distributions=hyperparameter,
                                        n_iter=n_iter_search,
                                        cv=cv_timeSeries,
                                        scoring=scoring,
                                        n_jobs=-1)
    rsearch_cv.fit(X_train, y_train)
    #f = open("output.txt", "a")
    print("Best estimator obtained from CV data: \n", rsearch_cv.best_estimator_, file=output_file)
    print("Best Score: ", rsearch_cv.best_score_, file=output_file)
    return rsearch_cv
if __name__ == '__main__':
    # load dataset
    data = import_training_set(fast_pc=True)
    data.dropna(inplace=True)

    # set up classifier and pipeline
    bagging = BaggingClassifier(base_estimator=GaussianNB(), n_estimators=25,
                                bootstrap=True, max_samples=0.25, n_jobs=1)
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('reduce_dim', 'passthrough'),
                     ('clf', bagging)])

    # set up param grid
    param_grid = [
        {'reduce_dim': ['passthrough']},
        {'reduce_dim': [PCA()],
         'reduce_dim__n_components': [0.91, 0.93, 0.95, 0.97],
         'clf__max_features': [0.33, 0.66, 1.0]},
        {'reduce_dim': [SelectKBest()],
         'reduce_dim__k': [20, 30, 40, 50],
         'clf__max_features': [0.33, 0.66, 1.0]}]

    cv = TimeSeriesSplit(n_splits=10, test_size=100000, gap=100000)

    # initiate hyperparameter search
    results = search(data, pipe, param_grid, filepath='Results/bagging_naive_25_4.csv', cv=cv)
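# Note: the `test_size` and `gap` keyword arguments used in the snippet above are
# available only in scikit-learn >= 0.24. A minimal, self-contained sketch on
# synthetic data (not part of the snippet above) showing their effect: the rows
# immediately before each test window are dropped from training, which helps when
# neighbouring rows would otherwise leak information across the split boundary.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

_X = np.arange(20).reshape(-1, 1)
for _train_idx, _test_idx in TimeSeriesSplit(n_splits=3, test_size=4, gap=2).split(_X):
    print("train:", _train_idx, "test:", _test_idx)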
X = preprocessing.MinMaxScaler().fit_transform(X)
#X = preprocessing.StandardScaler().fit_transform(X)
y = df.loc[:, 'Good sell Point?']

# Split train set and test set
xtrain, ytrain = X[:testduration], y[:testduration]
xtest, ytest = X[testduration:], y[testduration:]
# Good sell Point Ratio in market is manually set to nearly 0.5
Market_GoodRatio = sum(df['Good sell Point?'].iloc[:testduration, ] == 1) / len(df['Good sell Point?'].iloc[:testduration, ])
ResultTable = ResultTable.append({'Stock': stock, 'Method': 'Market Good sell Ratio',
                                  'AvgScores': Market_GoodRatio, 'StdScores': 0},
                                 ignore_index=True)

# Compare and plot the precision rate of each algorithm
index = 0
for method in method_list.loc[0, :]:
    clf = method
    cv = TimeSeriesSplit(n_splits=4)  # time-series split
    # pass the TimeSeriesSplit object rather than cv=4, so the splits respect time order
    scores = cross_val_score(clf, xtrain, ytrain, cv=cv, scoring='precision')
    print(scores[scores > 0])
    series = {'Stock': stock, 'Method': method_list.columns[index],
              'AvgScores': scores[scores > 0].mean(), 'StdScores': scores[scores > 0].std()}
    index = index + 1
    ResultTable = ResultTable.append(series, ignore_index=True)

name_list = ['Market Good sell Ratio']
name_list = np.append(name_list, method_list.columns)
num_list = ResultTable.loc[ResultTable['Stock'] == stock]['AvgScores']
plt.barh(range(len(num_list)), num_list, tick_label=name_list)
plt.title(stock + '\nPrecision Rate')
plt.show()

# Plot precision rate of each method
index = 0
X_test = df_covid_encoded['2020-10':]
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.transform(X_test)
y_train = df_econ_encoded.loc[:'2020-09', column]
y_test = df_econ_encoded.loc['2020-10':, column]

## Fit a Random Forest Regressor and find the best parameters
model = RandomForestRegressor()
param_search = {
    'n_estimators': [20, 50, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [i for i in range(5, 15)]
}
tscv = TimeSeriesSplit(n_splits=3)
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring='r2')
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_
y_true = y_test.values
y_pred = best_model.predict(X_test)
print('Regression results using Random-Forest-Regressor on', column, 'are:')
regression_results(y_true, y_pred)
sys.stdout = orig_stdout
f.close()

## Preprocess the data using StandardScaler
def train_evaluate(parameterization, validation, data_path, n_known_outlier_classes, ratio_known_normal, ratio_known_outlier, ratio_pollution, cfg, n_jobs_dataloader, n_splits=3): device = 'cpu' period = np.array( ['2019-11-08', '2019-11-09', '2019-11-11', '2019-11-12', '2019-11-13']) if (validation == 'kfold'): split = KFold(n_splits=n_splits) elif (validation == 'time_series'): split = TimeSeriesSplit(n_splits=n_splits) else: # Dummy object with split method that return indexes of train/test split 0.8/0.2. Similar to train_test_split without shuffle split = type( 'obj', (object, ), { 'split': lambda p: [([x for x in range(int(len(p) * 0.8))], [x for x in range(int(len(p) * 0.8), len(p))])] }) test_aucs = [] for train, test in split.split(period): dataset = CICFlowADDataset( root=os.path.abspath(data_path), n_known_outlier_classes=n_known_outlier_classes, ratio_known_normal=ratio_known_normal, ratio_known_outlier=ratio_known_outlier, train_dates=period[train], test_dates=period[test], ratio_pollution=ratio_pollution) # Initialize DeepSAD model and set neural network phi # Log random sample of known anomaly classes if more than 1 class if n_known_outlier_classes > 1: logger.info('Known anomaly classes: %s' % (dataset.known_outlier_classes, )) # Initialize Isolation Forest model Isoforest = IsoForest(hybrid=False, n_estimators=int( parameterization['n_estimators']), max_samples=parameterization['max_samples'], contamination=parameterization['contamination'], n_jobs=4, seed=cfg.settings['seed']) # Train model on dataset Isoforest.train(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) # Test model Isoforest.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) test_auc = Isoforest.results['auc_roc'] test_aucs.append(test_auc) reporter(mean_auc=evaluate_aucs(test_aucs=test_aucs))
def main(argv): np.random.seed(1234) if len(argv) != 3: print("Must be in format: python featurize.py <TICKER> <FORWARD_LAG>") exit(0) elif not int(argv[2]): print("Must be in format: python featurize.py <TICKER> <FORWARD_LAG>") exit(0) # set relevant vars ticker = argv[1] forward_lag = int(argv[2]) # display ticker info print("Ticker = ",ticker) print(f"Prediction Window = {forward_lag} days") print() # read data print("Reading data ... ") PREFIX = config.gen_prefix(ticker,forward_lag) # relevant filenames """ FEATURES_TRAIN_FILENAME = f'../data/processed/{PREFIX}_train_features.csv' FEATURES_TEST_FILENAME = f'../data/processed/{PREFIX}_test_features.csv' LABELS_TRAIN_FILENAME = f'../data/processed/{PREFIX}_train_labels.csv' LABELS_TEST_FILENAME = f'../data/processed/{PREFIX}_test_labels.csv' """ FEATURES_FILENAME = f'../data/processed/{PREFIX}_features.csv' LABELS_FILENAME = f'../data/processed/{PREFIX}_labels.csv' # load data X = pd.read_csv(FEATURES_FILENAME).set_index('date') y = pd.read_csv(LABELS_FILENAME).set_index('date') """ X_train = pd.read_csv(FEATURES_TRAIN_FILENAME).set_index('date') X_test = pd.read_csv(FEATURES_TEST_FILENAME).set_index('date') y_train = pd.read_csv(LABELS_TRAIN_FILENAME).set_index('date') y_test = pd.read_csv(LABELS_TEST_FILENAME).set_index('date') """ # Split into train and test data X_train,X_test,y_train,y_test = split_train_test(X,y,train_percent=0.7) # relevant dates print(f"Training data range:\n\t {str(X_train.index[0])[:10]} to {str(X_train.index[-1])[:10]}") print(f"Test data range:\n\t {str(X_test.index[0])[:10]} to {str(X_test.index[-1])[:10]}") print() # reduce number of features # NOTE: want to add this step to pipeline X_train,X_test = config.reduce_features(X_train,X_test,y_train) y_train,y_test = y_train.target.ravel(),y_test.target.ravel() # get necessary sizes n,d = X_train.shape # define cv cv_inner = TimeSeriesSplit(n_splits=5) #cv_outer = TimeSeriesSplit(n_splits=5) # get classifier names model_strs = models.MODELS # scoring dct to track performance scores = {} # dislays subplot legend plot_traces = [] legend=True # best model best_model = None # run all models on dataset for model_str in model_strs: print("\nmodel = ",model_str) dct = {} # define pipeline model = getattr(models, model_str)(n,d) steps = [('Scaler',preprocessors.Scaler()),(model_str,model)] pipe,param_grid = make_pipeline_and_grid(steps) # determine which CV to be used if model_str in ['linearSVR','Dummy','LinearRegressor','KNN','lr_boost','LSTMRegressor']: search = GridSearchCV(pipe,param_grid, cv=cv_inner, refit='mean_absolute_error', iid=False, scoring=METRICS, return_train_score=True, n_jobs=-1) else: search = RandomizedSearchCV(pipe,param_grid, cv=cv_inner, n_iter=20, iid=False, random_state=0, refit='mean_absolute_error', scoring=METRICS, return_train_score=True, n_jobs=-1) # training print("Training Model ... ") search.fit(X_train,y_train) results = search.cv_results_ print("Best Parameters: ",search.best_params_) # make predictions print("Making Predictions ... ") y_pred = search.predict(X_test) # generate subplot traces subplot_traces = gen_subplot( X_train.index, X_test.index, y_train, y_test, y_pred, legend ) # append subplot to plot legend=False plot_traces.append(subplot_traces) # record metrics print("Recording Metrics ... 
\n") for met in METRICS: dct[f'train_{met}'] = results[f'mean_train_{met}'].mean() test_scores = gen_metrics(y_test,y_pred) dct.update(test_scores) # update scores dict scores[model_str] = dct # Display results scores_df = pd.DataFrame.from_dict(scores,orient='index') print(scores_df) print("\nBest Model: ",scores_df.test_mean_squared_error.idxmin()) print("Mean Squared Error: ",scores_df.test_mean_squared_error.min()) print() # show predictions # NOTE: this is hacky, need to fix later print(f'{forward_lag} day predictions') RAW_PREFIX = PREFIX.replace(f'_{forward_lag}','') raw_df = pd.read_csv(f'../data/raw/{RAW_PREFIX}_hist.csv').set_index('date') prediction_dates = raw_df.index[-forward_lag:] for i,date in enumerate(prediction_dates): print(f'{date}: {y_pred[-forward_lag+i]:.2f}') print() # plot results print("Plotting Results ... ") fig = plot_results(plot_traces,model_strs,ticker) fig.show() print() # log results print("Saving Results ... ") RESULTS_FILENAME = f'../data/results/{PREFIX}.csv' scores_df.to_csv(RESULTS_FILENAME) print(f'\{RESULTS_FILENAME}')
# Divide features and labels
y = data.pop("offers")
X = data

# define polynomial regression degrees
poly_reg = PolynomialFeatures(degree=2)
X = pd.DataFrame(poly_reg.fit_transform(X))

# create pipeline with regressor and scaler
pipeline = Pipeline([("scaler", RobustScaler()), ("regressor", LinearRegression())])

# nested cross validation
tscv = TimeSeriesSplit(n_splits=6, max_train_size=365 * 48, test_size=48 * 30)

# perform nested cross validation and get results
y_test, y_pred = utils.my_cross_val_predict(pipeline, X, y, tscv)

# calculate results
results = utils.get_results(y_test, y_pred)

# save results
with open("results/results_polynomial_regression.json", "w") as f:
    json.dump(results, f)

utils.plot_results(
    y_test,
    y_pred,
def main(): # Load data print("Reading file...") calendar = pd.read_csv('Data/calendar.csv') sell_prices = pd.read_csv('Data/sell_prices.csv') sales_train_validation = pd.read_csv('Data/sales_train_validation.csv') submission = pd.read_csv('Data/sample_submission.csv') # Reduce memory size print("Reducing memory size...") calendar = reduce_mem(calendar) sell_prices = reduce_mem(sell_prices) sales_train_validation = reduce_mem(sales_train_validation) submission = reduce_mem(submission) # Combine all data into one dataset print("Combining data...") data = combine_data(calendar, sell_prices, sales_train_validation, submission, nrows = 27500000, merge = True) gc.collect() # Encoding data print("Encoding data...") data = data_encoding(data) gc.collect() # Create new feature print("Creating new feature...") data = feature_create(data) data = reduce_mem(data) gc.collect() # Train Test split x = data[data['date'] <= '2016-04-24'] y = x.sort_values('date')['demand'] test = data[(data['date'] > '2016-04-24')] x = x.sort_values('date') test = test.sort_values('date') del data # Model parameters setting ## k-fold using TimeSeriesSplit n_fold = 3 folds = TimeSeriesSplit(n_splits=n_fold) ## lgb model parameters default_params = {"metric": 'rmse', "verbosity": -1, } params = {'num_leaves': 555, 'min_child_weight': 0.034, 'feature_fraction': 0.379, 'bagging_fraction': 0.418, 'min_data_in_leaf': 106, 'objective': 'regression', #default 'max_depth': -1, 'learning_rate': 0.005, "boosting_type": "gbdt", #defaul "bagging_seed": 11, "metric": 'rmse', "verbosity": -1, 'reg_alpha': 0.3899, 'reg_lambda': 0.648, 'random_state': 222, } # Model training columns = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'week', 'day', 'dayofweek', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90', 'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30'] splits = folds.split(x, y) y_preds = np.zeros(test.shape[0]) y_oof = np.zeros(x.shape[0]) feature_importances = pd.DataFrame() feature_importances['feature'] = columns mean_score = [] print("Start to train...") for fold_n, (train_index, valid_index) in enumerate(splits): print("-" * 20 +"LGB Fold:"+str(fold_n)+ "-" * 20) X_train, X_valid = x[columns].iloc[train_index], x[columns].iloc[valid_index] y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] dtrain = lgb.Dataset(X_train, label=y_train) dvalid = lgb.Dataset(X_valid, label=y_valid) clf = lgb.train(params, dtrain, 2500, valid_sets = [dtrain, dvalid], early_stopping_rounds = 50, verbose_eval=100) feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance() y_pred_valid = clf.predict(X_valid, num_iteration=clf.best_iteration) y_oof[valid_index] = y_pred_valid val_score = np.sqrt(metrics.mean_squared_error(y_pred_valid, y_valid)) print(f'val rmse score is {val_score}') mean_score.append(val_score) y_preds += clf.predict(test[columns], num_iteration=clf.best_iteration)/n_fold del X_train, X_valid, y_train, y_valid gc.collect() print('mean rmse score over folds is', np.mean(mean_score)) test['demand'] = y_preds # Submission format subs = submit_format(test, submission) subs.to_csv('submission.csv',index = False) # Plot feature importance feature_importances['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in 
range(folds.n_splits)]].mean(axis=1) feature_importances.to_csv('feature_importances.csv') plt.figure(figsize=(16, 12)) sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(20), x='average', y='feature'); plt.title('20 TOP feature importance over {} folds average'.format(folds.n_splits));
def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int] = None) -> object:
    """
    This builds a VAR model given a multivariate time series data frame with time as the Index.

    :param ts_df: The time series data to be used for fitting the model. Note that the input
        can be a data frame with one column or multiple cols or a multivariate array. However,
        the first column must be the target variable. You must include only Time Series data
        in it. DO NOT include "Non-Stationary" or "Trendy" data. Make sure your Time Series is
        "Stationary" before you send it in!! If not, this will give spurious results.
    :type ts_df: pd.DataFrame

    :param target_col: The column name of the target time series that needs to be modeled.
        All other columns will be considered as exogenous variables (if applicable to method).
    :type target_col: str

    :param cv: Number of folds to use for cross validation.
        Number of observations in the Validation set for each fold = forecast period.
        If None, a single fold is used.
    :type cv: Optional[int]

    :rtype: object
    """
    self.original_target_col = target_col
    self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]]
    ts_df = ts_df[[self.original_target_col] + self.original_preds]

    self.find_best_parameters(data=ts_df)

    #######################################
    #### Cross Validation across Folds ####
    #######################################
    rmse_folds = []
    norm_rmse_folds = []
    forecast_df_folds = []

    NFOLDS = self.get_num_folds_from_cv(cv)
    #cv = GapWalkForward(n_splits=NFOLDS, gap_size=0, test_size=self.forecast_period)
    #cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period)  ### sklearn >= 0.24
    max_trainsize = len(ts_df) - self.forecast_period
    try:
        cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period)  ### works only in sklearn >= 0.24
    except Exception:
        cv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size=max_trainsize)

    if type(ts_df) == dask.dataframe.core.DataFrame:
        ts_df = ts_df.head(len(ts_df))  ### this converts dask into a pandas dataframe

    for fold_number, (train_index, test_index) in enumerate(cv.split(ts_df)):
        dftx = ts_df.head(len(train_index) + len(test_index))
        ts_train = dftx.head(len(train_index))  ## now train will be the first segment of dftx
        ts_test = dftx.tail(len(test_index))    ### now test will be right after train in dftx

        print(f"\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape[0]} Test Shape: {ts_test.shape[0]}")

        #########################################
        #### Define the model with fold data ####
        #########################################
        y_train = ts_train.iloc[:, [0, self.best_d]]
        bestmodel = self.get_best_model(y_train)

        ######################################
        #### Fit the model with fold data ####
        ######################################
        if self.verbose >= 1:
            print(f'Fitting best VAR model on Fold: {fold_number+1}')
        try:
            self.model = bestmodel.fit(disp=False)
        except Exception as e:
            print(e)
            print(f'Error: VAR Fit on Fold: {fold_number+1} unsuccessful.')
            return bestmodel, None, np.inf, np.inf

        forecast_df = self.predict(ts_test.shape[0], simple=False)
        forecast_df_folds.append(forecast_df['yhat'].values)

        rmse, norm_rmse = print_dynamic_rmse(ts_test.iloc[:, 0].values,
                                             forecast_df['yhat'].values,
                                             ts_train.iloc[:, 0].values)
        rmse_folds.append(rmse)
        norm_rmse_folds.append(norm_rmse)

    # Same normalization as in print_dynamic_rmse(); rmse_folds is a list, so convert before dividing
    norm_rmse_folds2 = np.array(rmse_folds) / ts_df[self.original_target_col].values.std()

    self.model.plot_diagnostics(figsize=(16, 12))
    axis = self.model.impulse_responses(12, orthogonalized=True).plot(figsize=(12, 4))
    axis.set(xlabel='Time Steps', title='VAR model Impulse Response Functions')

    ###############################################
    #### Refit the model on the entire dataset ####
    ###############################################
    y_train = ts_df.iloc[:, [0, self.best_d]]
    self.refit(ts_df=y_train)

    # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds
    return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2
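# The try/except above exists because TimeSeriesSplit only accepts `test_size`
# from scikit-learn 0.24 onward. A minimal sketch on synthetic data (not part of
# the library code above) showing what the two constructions yield: with
# test_size set, every validation fold has exactly `forecast_period` rows; with
# only max_train_size, fold sizes are chosen automatically by scikit-learn.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

n_obs, forecast_period, n_folds = 30, 5, 3
_data = np.arange(n_obs).reshape(-1, 1)

new_style = TimeSeriesSplit(n_splits=n_folds, test_size=forecast_period)            # sklearn >= 0.24
old_style = TimeSeriesSplit(n_splits=n_folds, max_train_size=n_obs - forecast_period)

for label, splitter in [("test_size", new_style), ("max_train_size", old_style)]:
    sizes = [(len(tr), len(te)) for tr, te in splitter.split(_data)]
    print(label, sizes)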
def RunSVM(Abs_train, Abs_test, X_train, Y_train, X_test, Y_test, name): model = SVC(C=1000, kernel='rbf', gamma=1) gs_ = [1.7**i for i in range(1, 20)] gs = [1.0 / i for i in gs_] cs = [i * 1000 for i in range(1, 30)] + [i * 10 for i in range(1, 100)] #cs = [10**i for i in range(2,4)] param_grid = [{'C': cs, 'gamma': gs}] clf = model_selection.GridSearchCV(model, param_grid, scoring=None, fit_params=None, n_jobs=-1, iid=True, refit='best_score_', cv=TimeSeriesSplit(n_splits=2), verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score='warn') clf.fit(X_train, Y_train) x = (clf.best_params_) print x # print model.get_params() model.set_params(**x) model.fit(X_train, Y_train) actual_dist = ComputeDistribution(Y_train, Y_test) pred_train = model.predict(X_train) pred_test = model.predict(X_test) cnf_mat_test = GenerateCnfMatrix(pred_test, Y_test) cnf_mat_train = GenerateCnfMatrix(pred_train, Y_train) accuracy = ComputeAccuracy(cnf_mat_test, cnf_mat_train, name, actual_dist) if CALCULATE_RETURNS == 'y': returns = ComputeReturns(Abs_test, Abs_train, pred_test, pred_train, Y_test, Y_train, name) '''gs_ = [0.001,0.005,0.01,0.05,0.1,0.15,0.28,0.75,1]+range(10,140) gs = [1.0/i for i in gs_] cs = [10,15,50,100,150,500,700,1000,2500,10000] c_array = [] g_array = [] actual_dist_array = [] predicted_test_array =[] predicted_train_array =[] predicted_train_acc_array =[] predicted_test_acc_array = [] ret_pt_tot_train =[] ret_pt_cor_inc_train=[] ret_pt_tot_test=[] ret_pt_cor_inc_test=[] for c in cs: for g in gs: c_array.append(c) g_array.append(g) print 'c ' +str(c) print 'g ' +str(g) #param_grid = [{ 'C': C_range,'kernel': ['rbf']}] #clf = model_selection.GridSearchCV(model, param_grid, cv = TimeSeriesSplit(n_splits = 5)) #clf.fit(X_train, Y_train) #x = (clf.best_params_ ) #print x #model.set_params(**x) model = SVC(C = c, kernel = 'rbf', gamma = g) actual_dist_array.append(list(actual_dist)) predicted_test_array.append(list(accuracy[0])) predicted_train_array.append(list(accuracy[1])) predicted_test_acc_array.append(list(accuracy[2])) predicted_train_acc_array.append(list(accuracy[3])) print(' ') ret_pt_tot_train.append(list(returns[0])) ret_pt_cor_inc_train.append(list(returns[1])) ret_pt_tot_test.append(list(returns[2])) ret_pt_cor_inc_test.append(list(returns[3])) print ('------------------------------------------') c_array = np.asarray(c_array).T g_array = np.asarray(g_array).T predicted_train_array = np.asarray(predicted_train_array).T predicted_train_acc_array = np.asarray(predicted_train_acc_array).T predicted_test_acc_array = np.asarray(predicted_test_acc_array).T predicted_test_array = np.asarray(predicted_test_array).T actual_dist_array = np.asarray(actual_dist_array).T out = np.vstack((c_array,g_array,actual_dist_array,predicted_train_array,predicted_test_array,predicted_train_acc_array,predicted_test_acc_array,ret_pt_tot_train,ret_pt_cor_inc_train,ret_pt_tot_test,ret_pt_cor_inc_test)) #out = out.T #header = ['c','gamma','dist_plus_actual','dist_minus_act','pred_plus_train','pred_minus_train','pred_plus_test','pred_minus_test','pred_tain_accuracy_tot','pred_train_acc_plus','pred_train_acc_minus','pre_test_acc_tot','pred_test_acc_plus','pred_test_acc_minus','ret_pt_tot_train','ret_pt_tot_plus','ret_pt_train_minus','ret_pt_cor_train','ret_pt_inc_train','rt_pt_tot_test','rt_pt_plus_test','rt_pt_minus_test','rt_pt_cor_test','ret_pt_inc_test'] #header = np.asarray(header) #out = np.vstack((header,out)) np.savetxt("c_gaama.csv", out.T, delimiter=",")''' return accuracy[2][0], 
pred_test
def function(self):
    self.out_1.val = TimeSeriesSplit()
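# The node above simply emits a splitter built with scikit-learn's defaults
# (n_splits=5). A minimal sketch, independent of the node framework, of what that
# default splitter produces: five expanding-window splits where the training
# window grows and the test window size stays fixed.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

_X = np.arange(12).reshape(-1, 1)
for _train_idx, _test_idx in TimeSeriesSplit().split(_X):
    print("train:", _train_idx, "test:", _test_idx)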
def build_model( name: str, model_config: dict, data_config: Union[GordoBaseDataset, dict], metadata: dict, ): """ Build a model and serialize to a directory for later serving. Parameters ---------- name: str Name of model to be built model_config: dict Mapping of Model to initialize and any additional kwargs which are to be used in it's initialization. Example:: {'type': 'KerasAutoEncoder', 'kind': 'feedforward_hourglass'} data_config: dict Mapping of the Dataset to initialize, following the same logic as model_config. metadata: dict Mapping of arbitrary metadata data. Returns ------- Tuple[sklearn.base.BaseEstimator, dict] """ # Get the dataset from config logger.debug(f"Initializing Dataset with config {data_config}") dataset = (data_config if isinstance(data_config, GordoBaseDataset) else _get_dataset(data_config)) logger.debug("Fetching training data") start = time.time() X, y = dataset.get_data() time_elapsed_data = time.time() - start # Get the model and dataset logger.debug(f"Initializing Model with config: {model_config}") model = serializer.pipeline_from_definition(model_config) # Cross validate logger.debug(f"Starting to do cross validation") start = time.time() scores: Dict[str, Any] if hasattr(model, "score"): cv_scores = cross_val_score(model, X, y, cv=TimeSeriesSplit(n_splits=3)) scores = { "explained-variance": { "mean": cv_scores.mean(), "std": cv_scores.std(), "max": cv_scores.max(), "min": cv_scores.min(), "raw-scores": cv_scores.tolist(), } } else: logger.debug("Unable to score model, has no attribute 'score'.") scores = dict() cv_duration_sec = time.time() - start # Train logger.debug("Starting to train model.") start = time.time() model.fit(X, y) time_elapsed_model = time.time() - start metadata = {"user-defined": metadata} metadata["name"] = name metadata["dataset"] = dataset.get_metadata() utc_dt = datetime.datetime.now(datetime.timezone.utc) metadata["model"] = { "model-creation-date": str(utc_dt.astimezone()), "model-builder-version": __version__, "model-config": model_config, "data-query-duration-sec": time_elapsed_data, "model-training-duration-sec": time_elapsed_model, "cross-validation": { "cv-duration-sec": cv_duration_sec, "scores": scores }, } gordobase_final_step = _get_final_gordo_base_step(model) if gordobase_final_step: metadata["model"].update(gordobase_final_step.get_metadata()) return model, metadata
def __call__(self, trial):
    models = [classe.__name__ for classe in Models.__subclasses__()]
    classifier_name = trial.suggest_categorical('classifier', models)

    #n_in = trial.suggest_int('window_neg', -90, -1)
    n_in = trial.suggest_int('window_neg', -7, -7)
    window = WindowProcessor()
    X_lag, y_lag = window.transform(X=self.X, y=self.y, n_in=n_in, n_out=self.n_outs)

    # Sequential split because this is a time series
    X_lag, X_test_lag, y_lag, y_test_lag = train_test_split(X_lag, y_lag,
                                                            test_size=0.20,
                                                            stratify=None,
                                                            shuffle=False)

    if classifier_name == 'MultiLayerPerceptron':
        regressor = MultiLayerPerceptron()
        layers = list()

        # Choose the number of hidden layers
        n_layers = trial.suggest_int(
            'n_layers',
            regressor.search_space['num_layer'][0],
            regressor.search_space['num_layer'][1])

        # Choose the number of neurons per layer
        for layer in range(n_layers):
            layers.append(
                trial.suggest_int(
                    'layer_{:}'.format(layer),
                    regressor.search_space['hidden_layer_sizes'][0],
                    regressor.search_space['hidden_layer_sizes'][1]))

        # Choose the L2 penalty (alpha)
        alpha = trial.suggest_loguniform(
            'alpha',
            regressor.search_space['alpha'][0],
            regressor.search_space['alpha'][1])

        learning_rate_init = trial.suggest_loguniform(
            'learning_rate',
            regressor.search_space['learning_rate_init'][0],
            regressor.search_space['learning_rate_init'][1])

        # Choose the random state
        random_state = trial.suggest_int(
            'random_state',
            regressor.search_space['random_state'][0],
            regressor.search_space['random_state'][1])

        param_grid = {
            'Mlp__hidden_layer_sizes': [tuple(layers)],
            'Mlp__alpha': [alpha],
            'Mlp__random_state': [random_state],
            'Mlp__learning_rate_init': [learning_rate_init]
        }

        print('Window Neg: {:}'.format(n_in))
        print('Window Forecast: {:}'.format(self.n_outs))
        [print(k, v) for k, v in param_grid.items()]

        with parallel_backend('threading'):
            grid = GridSearchCV(verbose=1,
                                scoring='neg_mean_absolute_error',
                                estimator=regressor.pipeline,
                                param_grid=param_grid,
                                cv=TimeSeriesSplit(n_splits=5),
                                n_jobs=-1,
                                refit='neg_mean_absolute_error')
            grid.fit(X=X_lag, y=y_lag)

        y_hat_test = grid.predict(X_test_lag)
        y_hat_test = self.y_scaler.inverse_transform(y_hat_test)
        y_test_lag = self.y_scaler.inverse_transform(y_test_lag)

        print('Score cross-val: {:}'.format(grid.best_score_))
        print('Score Test - MAE: {:}'.format(
            mean_absolute_error(y_true=y_test_lag, y_pred=y_hat_test)))
        print('R2 test: {:}'.format(
            r2_score(y_true=y_test_lag, y_pred=y_hat_test, multioutput='uniform_average')))
        print('-' * 100)
        print('\n')

        return mean_absolute_error(y_true=y_test_lag, y_pred=y_hat_test)
def run_board_ensemble(X, y, dic_params_board, time_serie=False, n_splits=5, nbr_train_test_split=3, nbr_to_filter=12, performance=accuracy_score): performance_sk = make_scorer(performance) start = datetime.now() dic_params_board["time_serie"] = time_serie dic_params_board["nbr_train_test_split"] = nbr_train_test_split dic_params_board["scoring"] = performance_sk #Data if (time_serie): splits = TimeSeriesSplit(n_splits=n_splits) else: splits = StratifiedKFold(n_splits=n_splits) ensemble_acc_dic = {} clfs_preds = [] best_clf = [] mgs_preds = {'cds': [], 'ruler': [], 'perf': []} maj_vote_preds = {'cds': [], 'ruler': [], 'perf': []} rf_preds = {'cds': [], 'ruler': [], 'perf': []} clf2_preds = {'cds': [], 'ruler': [], 'perf': []} clfs_acc = [] y_of_preds = [] #print(datetime.now()) for train, test in splits.split(X, y): X_train, y_train = X[train], y[train] X_validation, y_validation = X[test], y[test] y_of_preds = y_of_preds + list(y_validation) #Models # run block of code and catch warnings with warnings.catch_warnings(): # ignore all caught warnings warnings.filterwarnings("ignore") board = bo(**dic_params_board) train_preds, y_trained = board.fit(X_train, y_train, predict_training_probas=True) preds = board.predict_probas(X_validation) if (clfs_preds == []): clfs_preds = np.argmax(preds, axis=-1).tolist() else: for ind, pred in enumerate(preds): clfs_preds[ind].extend(np.argmax(pred, axis=-1).tolist()) clf_time = datetime.now() #print("CLF time",clf_time - start, preds.shape[0]) filters_dico = {} #Testing cds filter filt = cds_filter(performance) filt = filt.selection(y_trained, train_preds) filters_dico["cds"] = filt cds_time = datetime.now() #print("cds time", cds_time - clf_time) #Testing cds/perf ruling filter filt = ruler_filter(performance) filt = filt.selection(y_trained, train_preds) filters_dico["ruler"] = filt ruler_time = datetime.now() #print("ruler time", ruler_time - cds_time) #Filter filt = perf_filter(performance) filt = filt.selection(y_trained, train_preds) filters_dico["perf"] = filt #Select the best clf at training and record it's testing scores best_pred = filt.filter(preds, nbr_to_filter=1) best_clf.extend(np.argmax(best_pred[0], axis=1)) best_time = datetime.now() #print("best clf time", best_time - ruler_time) #For each filter for filter_name in filters_dico: #Do train_preds_filtered = filt.filter(train_preds, nbr_to_filter=nbr_to_filter) preds_filtered = filt.filter(preds, nbr_to_filter=nbr_to_filter) #MGS #print("MGS") mgs = MGS(score_function=performance, n_jobs=-1) mgs = mgs.fit(train_preds_filtered, y_trained) pred = mgs.predict_proba(preds_filtered) mgs_preds[filter_name].extend(np.argmax(pred, axis=1).tolist()) mgs_time = datetime.now() #print("mgs",mgs_time - clf_time) #MAJ VOTING #print("Maj_voting") pred = voting_booth().vote(copy.deepcopy(preds_filtered)) maj_vote_preds[filter_name].extend(pred.tolist()) maj_vote_time = datetime.now() #print("vote", maj_vote_time - mgs_time) #Reshaping of train_preds_filtered and preds filtered from (a,b,c) to (a*c,b). 
#a: number classifiers #b: number of events #c: value for each target class train_preds_filtered = np.array( [x.T for x in train_preds_filtered]) preds_filtered = np.array([x.T for x in preds_filtered]) train_preds_filtered = train_preds_filtered.reshape( -1, train_preds_filtered.shape[-1]) preds_filtered = preds_filtered.reshape(-1, preds_filtered.shape[-1]) #RANDOM FOREST #print("Random Forest") rf = RandomForestClassifier(n_estimators=100, n_jobs=-1) rf = rf.fit(train_preds_filtered.T, y_trained) pred = rf.predict(preds_filtered.T) rf_preds[filter_name].extend(pred.tolist()) rf_time = datetime.now() #New set of classifiers: with warnings.catch_warnings(): # ignore all caught warnings warnings.filterwarnings("ignore") board = bo(**dic_params_board) train_preds2, y_trained2 = board.fit( train_preds_filtered.T, y_trained, predict_training_probas=True) pred = board.predict_probas(preds_filtered.T) #Filter the training to get best one filt2 = perf_filter(performance) filt2 = filt2.selection(y_trained2, train_preds2) #Select the best clf at training pred = np.argmax(filt2.filter(pred, nbr_to_filter=1)[0], axis=1) clf2_preds[filter_name].extend(pred.tolist()) ensemble_acc_dic["BestClf"] = performance(y_of_preds, best_clf) for filter_name in mgs_preds: ensemble_acc_dic["MGS_" + filter_name] = performance( y_of_preds, mgs_preds[filter_name]) ensemble_acc_dic["MajVoting_" + filter_name] = performance( y_of_preds, maj_vote_preds[filter_name]) ensemble_acc_dic["RF_" + filter_name] = performance( y_of_preds, rf_preds[filter_name]) ensemble_acc_dic["BestClf2_" + filter_name] = performance( y_of_preds, clf2_preds[filter_name]) clfs_acc = np.array([performance(y_of_preds, pred) for pred in clfs_preds]) return clfs_acc, ensemble_acc_dic
def get_model(data, target, use_ensemble=True):
    params1 = {
        'el__alpha': np.logspace(-5, 2, 30),
        'el__l1_ratio': np.linspace(0, 1, 3),
        'pca__n_components': [2, 5, 10]
    }
    params2 = {
        'rf__n_estimators': range(10, 101, 30),
        'rf__max_depth': [2, 5, 9],
        'pca__n_components': [2, 5, 10]
    }
    params3 = {
        'lgb__learning_rate': np.logspace(-6, 0, 5),
        'lgb__n_estimators': range(10, 101, 30),
        'lgb__max_depth': [6, 9, 12],
        'pca__n_components': [2, 5, 10],
        'lgb__num_leaves': [100]
    }

    rf = Pipeline([('scale', StandardScaler()), ('pca', PCA()), ('rf', RandomForestRegressor())])
    el = Pipeline([('scale', StandardScaler()), ('pca', PCA()), ('el', ElasticNet(max_iter=5000))])
    lgb = Pipeline([('scale', StandardScaler()), ('pca', PCA()), ('lgb', LGBMRegressor())])

    gr_lgb = GridSearchCV(lgb, params3, cv=TimeSeriesSplit(),
                          scoring='neg_mean_squared_error', refit=True)
    gr_lgb.fit(data, target)
    logger.info('Booster params discovered')

    gr_el = GridSearchCV(el, params1, cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error', refit=True)
    gr_el.fit(data, target)
    logger.info('ElasticNet params discovered')

    gr_rf = GridSearchCV(rf, params2, cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error', refit=True)
    gr_rf.fit(data, target)
    logger.info('RandomForest params discovered')

    res_scores = {
        'elastic': gr_el.best_score_,
        'random_forest': gr_rf.best_score_,
        'lgbm': gr_lgb.best_score_
    }
    res_est = {
        'elastic': gr_el.best_estimator_,
        'random_forest': gr_rf.best_estimator_,
        'lgbm': gr_lgb.best_estimator_
    }

    if use_ensemble:
        estimators = [('elastic', gr_el.best_estimator_),
                      ('random_forest', gr_rf.best_estimator_),
                      ('lgbm', gr_lgb.best_estimator_)]
        stacked = StackingRegressor(estimators=estimators,
                                    final_estimator=RandomForestRegressor(n_estimators=100, max_depth=3),
                                    passthrough=True)
        stacked.fit(data, target)
        logger.info('Ensemble fitted')
        return stacked

    return res_est[sorted(res_scores, key=lambda x: (-res_scores[x], x))[0]]
params = [{
    'Switcher__estimator': [RandomForestRegressor()],
    'preprocess__p_text__TFIDF__ngram_range': [(1, 1)],
    'preprocess__p_text__tSVD__n_components': [8, 9, 10],
    'Switcher__estimator__n_estimators': [30, 40],
    'Switcher__estimator__max_depth': [8, 9]
}, {
    'Switcher__estimator': [GradientBoostingRegressor()],
    'preprocess__p_text__TFIDF__ngram_range': [(1, 1)],
    'preprocess__p_text__tSVD__n_components': [8, 9, 10],
    'Switcher__estimator__n_estimators': [30, 40],
    'Switcher__estimator__max_depth': [8, 9],
    'Switcher__estimator__learning_rate': np.logspace(-2, 1, 4)
}]

tscv = TimeSeriesSplit(n_splits=5)
regr = GridSearchCV(p_tot, param_grid=params, cv=tscv, scoring='r2')

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25, shuffle=False)
regr.fit(Xtrain, Ytrain)
print('best params: ', regr.best_params_)
print('best score: ', regr.best_score_)

with open('./pickled_models/RF_all_property_sold_price.pkl', 'wb') as f:
    pickle.dump(regr, f)

Ypred = regr.predict(Xtest)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

X = np.loadtxt('../datasets/X.csv', delimiter=',')
y = np.loadtxt('../datasets/y.csv', delimiter=',')

'''
INSTRUCTIONS

* Import TimeSeriesSplit from sklearn.model_selection.
* Instantiate a time series cross-validation iterator with 10 splits.
* Iterate through CV splits. On each iteration, visualize the values of the input data
  that would be used to train the model for that iteration.
'''

# Import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit

# Create time-series cross-validation object
cv = TimeSeriesSplit(n_splits=10)

# Iterate through CV splits
fig, ax = plt.subplots()
for ii, (tr, tt) in enumerate(cv.split(X, y)):
    # Plot the training data on each iteration, to see the behavior of the CV
    ax.plot(tr, ii + y[tr])

ax.set(title='Training data on each CV iteration', ylabel='CV iteration')
plt.show()
    return pywt.waverec(coeff, wavelet, mode='per')


# Get training, testing datasets
df_reg = df.drop("GWP", axis=1)
X = df_reg.drop("Discount_off", axis=1)
X = df_reg[["date_delta"]]
Y = df_reg['Discount_off']

# denoise discount using wavelet transform
#Y = pd.Series(denoise_signal(Y))

#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=1 - 365/len(df), shuffle=False)
X_train = load('Data/regression_train_X.npy', allow_pickle=True)
X_test = load('Data/regression_test_X.npy', allow_pickle=True)
Y_train = load('Data/regression_train_y.npy', allow_pickle=True)
Y_test = load('Data/regression_test_y.npy', allow_pickle=True)

time_split = TimeSeriesSplit(n_splits=10)

## train SVM
regressors = [
    svm.SVR(),
    # linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression()
]
name = [
    'svm.SVR',
def cross_cwcf(self, gamma_u, C_u, gamma_l, C_l, d_u=None, d_l=None, cv=3,
               mu=0.6, eta=10, isplt=False):
    """Compute the cross-validated CWC (coverage-width criterion),
    PICP (prediction-interval coverage probability) and PINEW (interval width).

    cv: number of folds
    mu: nominal confidence level of the prediction interval
    eta: penalty factor expressing the algorithm's preference;
         increasing eta favours PICP, decreasing eta favours PINEW
    isplt: whether to plot each fold
    """
    tscv = TimeSeriesSplit(n_splits=cv)
    cwc_l = []
    picp_l = []
    pinew_l = []
    i_cv = 0
    plot_num = cv * 100 + 3 * 10
    for tridx, teidx in tscv.split(self.cv_t):
        cv_tr_u, cv_te_u = self.cv_u[tridx], self.cv_u[teidx]
        cv_tr_l, cv_te_l = self.cv_l[tridx], self.cv_l[teidx]
        cv_tr_t, cv_te_t = self.cv_t[tridx], self.cv_t[teidx]
        svr_u = svm.SVR(gamma=gamma_u, C=C_u)
        svr_l = svm.SVR(gamma=gamma_l, C=C_l)
        pu, pl, d = self.svm_predict(cv_tr_u, cv_te_u, cv_tr_l, cv_te_l,
                                     svr_u, svr_l, d_u, d_l)
        if isplt:
            plt.figure(figsize=(15, 6))
            plt.subplot(plot_num + 1)
            plt.title('cv train %d' % (i_cv))
            plt.plot(cv_tr_u, color='blue', marker='o')
            plt.plot(cv_tr_l, color='blue', marker='o')
            plt.plot(cv_tr_t, color='gray', marker='x')
            plt.subplot(plot_num + 2)
            plt.title('cv test %d' % (i_cv))
            plt.plot(cv_te_u, color='blue', marker='o')
            plt.plot(cv_te_l, color='blue', marker='o')
            plt.plot(cv_te_t, color='gray', marker='x')
            plt.subplot(plot_num + 3)
            plt.title('cv predict %d' % (i_cv))
            plt.plot(pu, color='blue', marker='o')
            plt.plot(pl, color='blue', marker='o')
            plt.plot(cv_te_t[d:], color='gray', marker='x')
        i_cv += 1
        plot_num += 3
        picp = picpf(cv_te_t[d:], pu, pl)
        picp_l.append(picp)
        pinew = pinewf(pu, pl)
        pinew_l.append(pinew)
        cwc = cwcf(picp, pinew, mu, eta)
        cwc_l.append(cwc)
    return np.array(cwc_l).mean(), np.array(picp_l).mean(), np.array(pinew_l).mean()
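# The method above assumes helper functions picpf, pinewf and cwcf defined elsewhere
# in the project. A hedged sketch of what such helpers typically look like, following
# the commonly used coverage-width-based criterion; the project's actual helpers may
# differ in detail (e.g. they may normalise the width by the target range).
import numpy as np

def picpf(y_true, upper, lower):
    """Prediction-interval coverage probability: fraction of points inside [lower, upper]."""
    y_true = np.asarray(y_true).ravel()
    return np.mean((y_true >= np.asarray(lower)) & (y_true <= np.asarray(upper)))

def pinewf(upper, lower):
    """Mean prediction-interval width."""
    return np.mean(np.asarray(upper) - np.asarray(lower))

def cwcf(picp, pinew, mu, eta):
    """Coverage-width criterion: width penalised exponentially when coverage falls below mu."""
    penalty = 1.0 if picp < mu else 0.0
    return pinew * (1.0 + penalty * np.exp(-eta * (picp - mu)))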