# Excerpt: nested target function for Bayesian optimization (see bayesian_tuning
# below). It closes over train_val_df, train_val_ratio, tree_depths, val_repeat,
# random_state, norm_cols, threads and the all_results_df / all_aggregated_df
# accumulators of the enclosing scope.
def oct_target(alpha):
    print('Solving ILP for hyperparameter tuning...')
    all_results = []
    tree_depth = tree_depths[0]
    for r in range(val_repeat):
        train_df, val_df = preprocessing.train_test_split(
            train_val_df, split=train_val_ratio, random_state=random_state)
        preprocessing.normalize(train_df, norm_cols=norm_cols)
        preprocessing.normalize(val_df, norm_cols=norm_cols)
        all_results.append(
            get_results(train_df=train_df,
                        test_df=val_df,
                        alpha=alpha,
                        tree_depth=tree_depth,
                        max_time_per_run=max_time_per_run,
                        threads=threads,
                        print_status=print_status,
                        warm_start=warm_start))
    results_df = pd.concat(all_results)
    all_results_df.append(results_df)
    aggregated = calc_mean_accuracy_per_alpha(results_df)
    all_aggregated_df.append(aggregated)
    best_alpha_acc = aggregated.max()['testing_accuracy']
    return best_alpha_acc
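# Several snippets in this file call a project-local preprocessing.train_test_split
# with a `split` keyword (the train fraction) and an optional `random_state`,
# returning two DataFrames. Its source is not shown; a minimal sketch consistent
# with those call sites -- an assumption, not the project's actual implementation:
import pandas as pd

def train_test_split(df, split=0.8, random_state=None):
    """Randomly split df into (train, test) with `split` fraction in train."""
    train = df.sample(frac=split, random_state=random_state)
    test = df.drop(train.index)
    return train, test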
def _load_raw_data(args, seed=42):
    """Load raw data from the data directory (data_path) into pandas
    DataFrames, split train into train/val, create the word_to_id dictionary,
    and convert DataFrame words to ids.
    """
    # create dictionary mapping words to numeric ids
    word_to_id = _build_vocab(SEINFELD_FILE, args)
    train_raw_data, valid_raw_data, test_raw_data = [], [], []
    for speaker in args.speakers:
        train_path = os.path.join(args.data_path, speaker + '_train.txt')
        test_path = os.path.join(args.data_path, speaker + '_test.txt')
        # read text files into pandas DataFrames
        df = pd.read_csv(train_path, header=None)
        # split train into train/val DataFrames
        df_train, df_val = pp.train_test_split(df, train_pct=0.85, seed=seed)
        df_test = pd.read_csv(test_path, header=None)  # header=None, matching the train file
        # append each character's text to the raw data lists
        train_raw_data.append(_df_to_word_ids(df_train, word_to_id))
        valid_raw_data.append(_df_to_word_ids(df_val, word_to_id))
        test_raw_data.append(_df_to_word_ids(df_test, word_to_id))
    return train_raw_data, valid_raw_data, test_raw_data, word_to_id
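# _build_vocab and _df_to_word_ids are project helpers whose source is not
# shown. A minimal sketch of _df_to_word_ids, assuming each DataFrame row holds
# one line of text in column 0 (hypothetical, for illustration only):
def _df_to_word_ids(df, word_to_id, unk_id=0):
    """Flatten a one-column DataFrame of text into a flat list of word ids."""
    ids = []
    for line in df[0].astype(str):
        ids.extend(word_to_id.get(word, unk_id) for word in line.split())
    return ids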
def plot_results(predicted_data, full_dataset, fraction):
    true_data = pp.train_test_split(full_dataset, fraction)
    xs = range(len(full_dataset))
    fig = plt.figure(facecolor='white', figsize=(8, 5))
    ax = fig.add_subplot(111)
    ax.plot(xs[:len(true_data[2])], true_data[2][:, 0], label='Train Data')
    ax.plot(xs[len(true_data[2]):], true_data[3][:, 0], label='Test Data')
    plt.plot(xs[len(true_data[2]):], predicted_data, label='Predicted')
    plt.legend()
    plt.show()
def setup_plot(full_dataset, fraction):
    plt.ion()
    true_data = pp.train_test_split(full_dataset, fraction)
    xs = range(len(full_dataset))
    fig = plt.figure(facecolor='white', figsize=(8, 5))
    ax = fig.add_subplot(111)
    ax.plot(xs[:len(true_data[2])], true_data[2][:, 0], label='Train Data')
    ax.plot(xs[len(true_data[2]):], true_data[3][:, 0], label='Test Data')
    plt.scatter(0, 0, label='Prediction', c='g', s=1)
    plt.legend()
    plt.pause(0.01)
    return xs, ax, len(true_data[2])
def main():
    starter_time = timer(None)
    df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
    df_total = df_total.drop(['ID_code'], axis=1)
    df_total = df_total.sample(1000)
    df_total.index = range(len(df_total))
    frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
    frame_train = pp.normalization(frame_train, 'target')
    frame_test = pp.normalization(frame_test, 'target')
    X = frame_train.drop(['target'], axis=1)
    y = frame_train['target']
    X_pred = frame_test.drop(['target'], axis=1)
    y_truth = frame_test['target']
    print('Data loading complete')
    base_learners = constant.base_learners
    print('Base learners loaded; starting to train base learners')
    df_single_output, P = single_model_test(base_learners, X, y, X_pred, y_truth)
    plot_roc_curve(y_truth, P.values, list(P.columns))
    print('Base learner training complete; starting hyperparameter tuning')
    base_param_dicts = constant.base_param_dicts
    df_params_base = base_hyperparam_tuning(X, y, base_learners,
                                            base_param_dicts, n_iterations=50)
    df_params_base.to_csv('params_base.csv')
    print('Hyperparameter tuning complete; starting to train the intermediate layer')
    layer1_learners = constant.layer1_learners
    layer1_param_dicts = constant.layer1_param_dicts
    print('Starting hyperparameter tuning for the intermediate layer')
    #in_layer_1, df_params_1 = layer_hyperparam_tuning(
    #    X, y, pre_layer_learners=base_learners,
    #    local_layer_learners=layer1_learners,
    #    param_dicts_layer=layer1_param_dicts,
    #    n_iterations=50, pre_params='params_base.csv')
    #df_params_1.to_csv('params1.csv')
    # set learner parameters and pick the meta learner
    print('Starting to train the meta learner')
    meta_learner = constant.meta_learner
    meta_param_dicts = constant.meta_param_dicts
    meta_layer_model, df_params_meta = layer_hyperparam_tuning(
        X, y, pre_layer_learners=layer1_learners,
        local_layer_learners=meta_learner,
        param_dicts_layer=meta_param_dicts,
        n_iterations=50, pre_params='params_base.csv')
    df_params_meta.to_csv('paramsMeta.csv')
    params_pre = pd.read_csv('paramsMeta.csv')
    params_pre.set_index(['Unnamed: 0'], inplace=True)
    for case_name, params in params_pre["params"].items():
        case_est = case_name
        params = eval(params)  # params are stored as dict literals in the CSV
        for est_name, est in meta_learner:
            if est_name == case_est:
                est.set_params(**params)
    layer_list = constant.layer_list
    pred_proba_1, stacking_model = stacking_training(
        X, y, X_pred, layer_list=layer_list, meta_learner=meta_learner)
    print(roc_auc_score(y_truth, pred_proba_1[:, 1]))
    timer(starter_time)
    return pred_proba_1, stacking_model
def bayesian_tuning(train_val_df, train_val_ratio, tree_depths, target_col_name,
                    val_repeat=8, print_status=True, max_time_per_run=300,
                    threads=None, warm_start=False):
    from bayes_opt import BayesianOptimization
    print('Starting bayesian optimization...')
    norm_cols = [col for col in train_val_df.columns if not col == target_col_name]
    # the target function for bayesian optimization needs to be defined here
    # (strange scoping: it closes over the surrounding variables)
    all_results_df = []
    all_aggregated_df = []

    def oct_target(alpha):
        print('Solving ILP for hyperparameter tuning...')
        all_results = []
        tree_depth = tree_depths[0]
        for r in range(val_repeat):
            train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                              split=train_val_ratio)
            preprocessing.normalize(train_df, norm_cols=norm_cols)
            preprocessing.normalize(val_df, norm_cols=norm_cols)
            all_results.append(get_results(train_df=train_df,
                                           test_df=val_df,
                                           alpha=alpha,
                                           tree_depth=tree_depth,
                                           max_time_per_run=max_time_per_run,
                                           threads=threads,
                                           print_status=print_status,
                                           warm_start=warm_start))
        results_df = pd.concat(all_results)
        all_results_df.append(results_df)
        aggregated = calc_mean_accuracy_per_alpha(results_df)
        all_aggregated_df.append(aggregated)
        best_alpha_acc = aggregated.max()['testing_accuracy']
        return best_alpha_acc

    alpha_min = 0
    # need one split for an initial guess on the maximum alpha
    train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                      split=train_val_ratio)
    l_hat, mis_points = baseline_accuracy(train_df, target_col_name)
    alpha_max = mis_points / l_hat
    bo = BayesianOptimization(oct_target, {'alpha': (alpha_min, alpha_max)})
    n_iter = int(alpha_max * (2 / 15)) + 1
    if n_iter < 5:
        n_iter = 10
    bo.maximize(init_points=2, n_iter=n_iter, kappa=2)
    return (pd.concat(all_results_df), pd.concat(all_aggregated_df),
            bo.res['max']['max_params']['alpha'])
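# baseline_accuracy is not shown either. Given how its outputs are used
# (alpha_max = mis_points / l_hat), a plausible reading is a majority-class
# baseline: l_hat is the size of the largest class and mis_points the number of
# points that baseline misclassifies. A hedged sketch under that assumption:
def baseline_accuracy(train_df, target_col_name):
    counts = train_df[target_col_name].value_counts()
    l_hat = counts.max()                 # points in the most common class
    mis_points = len(train_df) - l_hat   # points the majority vote gets wrong
    return l_hat, mis_points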
def plot_event_predictions(  # pylint: disable=too-many-arguments, too-many-locals
        model, data, start_time, stop_time, x_fields, y_fields,
        n_points_behind=15, n_points_ahead=3):
    """Show predictions alongside real data.

    Args:
        model (keras.Sequential|models.ModelWrapper): The trained model.
        data (pd.DataFrame): The dataframe with the actual data.
        start_time (pd.Timestamp): The time that the event starts.
        stop_time (pd.Timestamp): The time that the event ends.
        x_fields (list): List of input column names used to train the model.
        y_fields (list): List of target column names used to train the model.
        n_points_behind (int): Input vector size.
        n_points_ahead (int): Output vector size.
    """
    event = data[(data.index > start_time) & (data.index < stop_time)]
    x, _, _, _ = preprocessing.train_test_split(  # pylint: disable=invalid-name
        event, x_fields, y_fields, percent=1,
        n_points_behind=n_points_behind, n_points_ahead=n_points_ahead)
    times = event.index.values
    n_samples = len(event) - (n_points_behind + n_points_ahead)
    times = [
        times[i + n_points_behind:i + n_points_behind + n_points_ahead]
        for i in range(n_samples)
    ]
    results = model.predict(x)
    event[y_fields].plot()
    for time, result in zip(times, results):
        index = pd.DatetimeIndex(time)
        series = pd.Series(result, index=index)
        series.plot(c='green')
import pandas as pd
from sklearn import svm
from sklearn import metrics

import preprocessing as pre
import joblib

# Load dataset
data = pd.read_csv("data.csv")
# print(data.head())
print("Initial shape:", data.shape)
data, labels = pre.preprocess(data)
print("Shape after preprocessing:", data.shape, "and length of target array:", len(labels))
train, train_labels, test, test_labels = pre.train_test_split(data, labels)
print("Train data:", train.shape, len(train_labels))
print("Test data:", test.shape, len(test_labels))

# SVM classifier with an RBF kernel
clf = svm.SVC(kernel="rbf", C=1.0)
# Train the model on the training set
clf.fit(train, train_labels)
# Predict the response for the test set
y_pred = clf.predict(test)
print("Accuracy:", metrics.accuracy_score(test_labels, y_pred))
prec = metrics.precision_score(test_labels, y_pred)
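# Loose ends in the script above: `prec` is computed but never reported, and
# joblib is imported but never used -- presumably to persist the trained
# classifier. One way to finish both (the file name is illustrative):
print("Precision:", prec)
joblib.dump(clf, "svm_model.pkl")       # persist the fitted classifier
# clf_restored = joblib.load("svm_model.pkl")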
import ast

import lightgbm as lgb
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, Trials

import feature_selection as fs
import preprocessing as pp

MAX_EVALS = 500
N_FOLDS = 10

df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
df_total = df_total.drop(['ID_code'], axis=1)
#df_total = df_total.sample(1000)
#df_total.index = range(len(df_total))
frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
frame_train = pp.normalization(frame_train, 'target')
frame_test = pp.normalization(frame_test, 'target')
X = frame_train.drop(['target'], axis=1)
y = frame_train['target']
X_pred = frame_test.drop(['target'], axis=1)
y_truth = frame_test['target']
X = np.array(X)
X_pred = np.array(X_pred)
train_set = lgb.Dataset(X, label=y)

def objective(params, n_folds=N_FOLDS):
    """Objective function for Gradient Boosting Machine
    hyperparameter optimization."""
    # Keep track of evals
#---------------------------------------------------------------------------
print('\n' + '-------------------------Starting PREPROCESSING-------------------------------' + '\n')
start_timer_preprocess = timeit.default_timer()

print('Step 1 : Loading data')
df = preprocessing.load_data(path=code_path + '/Data/')

print('Step 2 : Dropping redundant features')
df = preprocessing.drop_features(df)

print('Step 3 : Cleaning up missing value tags : -999.0')
df = preprocessing.drop_missing_values(df)

train, test = preprocessing.train_test_split(df, perc=0.80)
elapsed = timer(start_timer_preprocess)
print('\n' + '---------------------Finished PREPROCESSING stage, took ' +
      str(elapsed) + ' seconds----------' + '\n')
#----------------------------------------------------------------------------

#----------------------SAMPLING : Training with 2 types of samples-----------
# 1) Uniform sample
# 2) Choice sample
#----------------------------------------------------------------------------
print('--------------------------Starting SAMPLING-----------------------------------' + '\n')
start_timer_sampling = timeit.default_timer()
def uci_experiment(loc, target_col, hot_encode_cols, tree_depths, alphas_tuning,
                   repeat, val_repeat=3, train_test_ratio=0.8,
                   train_val_ratio=0.66, header=None, max_time_per_run=300,
                   threads=None, save_to_file=True, print_status=False,
                   f_name=None, character_encoding='utf-8', warm_start=False):
    """
    TODO: currently only numerical datasets are supported (preprocessing needs
          to be adjusted); input checks need to be added

    loc: location of dataset (string), either a URL or a local path
    target_col: number of the target column (to predict)
    tree_depths: list of tree depths to run experiments with
    alphas_tuning: method used to tune the tree complexity parameter alpha
    repeat: integer indicating how often the experiment should be repeated
    train_test_ratio: fraction (between zero and one) indicating how much of
        the data is used for training and validation (rest for testing)
    train_val_ratio: fraction indicating how much of the training and
        validation data is used for training (rest for validation)
    header: whether or not the data under the url has a header to load;
        if no header: set to None, if header: integer indicating the row number
    max_time_per_run: how much time is spent on one optimization run
    threads: number of threads in gurobi optimization (None falls back to the
        gurobi default)
    save_to_file: boolean indicating whether results are saved to a file
    f_name: filename to save results in (if None and save_to_file: the time
        will be used as the filename)
    character_encoding: (string) how to decode characters
    """
    df = None
    if is_url(loc):
        # read dataframe from url
        html = requests.get(loc).content
        s = io.StringIO(html.decode(character_encoding))
        df = pd.read_csv(s, header=header)
    else:
        df = pd.read_csv(loc)
    # hot encode if needed
    if not hot_encode_cols is None:
        df, target_col = preprocessing.hot_encode(df, target_col, hot_encode_cols)
    # split into training (+validation) and testing;
    # the test set remains untouched until alpha is chosen
    train_val_df, test_df = preprocessing.train_test_split(df, split=train_test_ratio)
    target_col_name = df.columns[target_col]
    norm_cols = [col for col in df.columns if not col == target_col_name]
    # all (repeat) experimental results for different values of alpha and tree depths
    results_df, aggregated, best_alpha = hyperparameter_tuning(
        method=alphas_tuning,
        train_val_df=train_val_df,
        train_val_ratio=train_val_ratio,
        tree_depths=tree_depths,
        target_col_name=target_col_name,
        val_repeat=val_repeat,
        warm_start=warm_start)
    print('Validation done. Best alpha: {0}'.format(best_alpha))
    # get the final result/accuracy
    final_results = []
    for tree_depth in tree_depths:
        for r in range(1):
            train_df, val_df = preprocessing.train_test_split(train_val_df, split=0.66)
            preprocessing.normalize(train_df, norm_cols=norm_cols)
            preprocessing.normalize(test_df, norm_cols=norm_cols)
            final_results.append(get_results(train_df=train_df,
                                             test_df=test_df,
                                             alpha=best_alpha,
                                             tree_depth=tree_depth,
                                             max_time_per_run=max_time_per_run,
                                             threads=threads,
                                             print_status=print_status,
                                             warm_start=warm_start))  # list of dataframes
    final_results_df = pd.concat(final_results)
    aggregated_final = calc_mean_accuracy_per_alpha(final_results_df)
    if save_to_file:
        dir_name = 'experiments'
        persist_results(dir_name=dir_name, f_name=f_name + '_validation_',
                        results_df=results_df, aggregated=aggregated)
        persist_results(dir_name=dir_name, f_name=f_name + '_final_',
                        results_df=final_results_df, aggregated=aggregated_final)
    return results_df
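# Illustrative call of uci_experiment on the UCI iris data. The CSV layout
# matches the iris file used elsewhere in this file (target in column 4);
# 'bayesian' for alphas_tuning is an assumption, since the accepted values of
# that argument are not shown here:
results = uci_experiment(
    loc='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    target_col=4, hot_encode_cols=None, tree_depths=[2],
    alphas_tuning='bayesian', repeat=1, f_name='iris')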
def gd_tuning(train_val_df, train_val_ratio, tree_depths, target_col_name,
              val_repeat=8, decrease_threshold=0.05, p=0.02, print_status=True,
              max_time_per_run=300, threads=None, warm_start=False):
    """
    Stop if accuracy is worse than best_accuracy - decrease_threshold.
    p: after running the algorithm, choose alpha by taking the mean of all
       alphas that achieved accuracy within p of the best accuracy.
    """
    print('Starting parameter tuning.')
    train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                      split=train_val_ratio)
    l_hat, mis_points = baseline_accuracy(train_df, target_col_name)
    alpha_max = mis_points / l_hat
    alpha_min = 0
    #alpha_min = 9.92419825072886
    #alpha_max = 9.92419825072886
    test_n_alphas = 50
    #test_n_alphas = 1
    print('Testing maximum of {0} values for alpha between {1} and {2}.'.format(
        test_n_alphas, alpha_min, alpha_max))
    alphas = np.linspace(alpha_min, alpha_max, test_n_alphas)
    all_results = []
    norm_cols = [col for col in train_val_df.columns if not col == target_col_name]
    for no, alpha in enumerate(alphas):
        print('Testing alpha={0}'.format(alpha))
        for tree_depth in tree_depths:
            for r in range(val_repeat):
                # create a new train/val split
                train_df, val_df = preprocessing.train_test_split(
                    train_val_df, split=train_val_ratio)
                # preprocessing: normalize
                preprocessing.normalize(train_df, norm_cols=norm_cols)
                preprocessing.normalize(val_df, norm_cols=norm_cols)
                all_results.append(get_results(train_df=train_df,
                                               test_df=val_df,
                                               alpha=alpha,
                                               tree_depth=tree_depth,
                                               max_time_per_run=max_time_per_run,
                                               threads=threads,
                                               print_status=print_status,
                                               warm_start=warm_start))  # list of dataframes
        if not alpha == 0:
            results_df = pd.concat(all_results)
            aggregated = calc_mean_accuracy_per_alpha(results_df)
            best_alpha = aggregated.idxmax()['testing_accuracy']  # df is indexed by alpha
            best_alpha_acc = aggregated.max()['testing_accuracy']
            # check whether the last tested alpha decreased significantly
            alpha_acc = aggregated['testing_accuracy'][alpha]  # accuracy for current alpha
            if alpha_acc < best_alpha_acc - decrease_threshold:
                print('Accuracy for alpha={0}: {1} is worse than best accuracy '
                      'for alpha={2}: {3}.\nStopping criterion is met...'.format(
                          alpha, alpha_acc, best_alpha, best_alpha_acc))
                break
    #take mean of top_n best alphas:
    #best_alpha = np.mean(aggregated.sort_values(by='testing_accuracy', ascending=False).index[:top_n])
    # take the mean of all alphas that achieved accuracy within p of the best
    best_alpha = np.mean(
        aggregated[aggregated['testing_accuracy'] > best_alpha_acc - p].index)
    return results_df, aggregated, best_alpha
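# calc_mean_accuracy_per_alpha is not shown. The way `aggregated` is used above
# (idxmax()['testing_accuracy'], indexing by alpha) implies a frame of per-alpha
# mean accuracies indexed by alpha; a minimal sketch under that assumption:
def calc_mean_accuracy_per_alpha(results_df):
    """Mean accuracy per alpha; the result is indexed by alpha."""
    return results_df.groupby('alpha').mean()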
def bayesian_tuning(train_val_df, train_val_ratio, tree_depths, target_col_name,
                    val_repeat=8, print_status=True, max_time_per_run=300,
                    threads=None, warm_start=False):
    from bayes_opt import BayesianOptimization
    print('Starting bayesian optimization...')
    norm_cols = [col for col in train_val_df.columns if not col == target_col_name]
    # the target function for bayesian optimization needs to be defined here
    # (strange scoping: it closes over the surrounding variables)
    all_results_df = []
    all_aggregated_df = []
    train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                      split=train_val_ratio)
    if val_repeat == 1:
        random_state = np.random.randint(low=0, high=100)
    else:
        random_state = None

    def oct_target(alpha):
        print('Solving ILP for hyperparameter tuning...')
        all_results = []
        tree_depth = tree_depths[0]
        for r in range(val_repeat):
            train_df, val_df = preprocessing.train_test_split(
                train_val_df, split=train_val_ratio, random_state=random_state)
            preprocessing.normalize(train_df, norm_cols=norm_cols)
            preprocessing.normalize(val_df, norm_cols=norm_cols)
            all_results.append(
                get_results(train_df=train_df,
                            test_df=val_df,
                            alpha=alpha,
                            tree_depth=tree_depth,
                            max_time_per_run=max_time_per_run,
                            threads=threads,
                            print_status=print_status,
                            warm_start=warm_start))
        results_df = pd.concat(all_results)
        all_results_df.append(results_df)
        aggregated = calc_mean_accuracy_per_alpha(results_df)
        all_aggregated_df.append(aggregated)
        best_alpha_acc = aggregated.max()['testing_accuracy']
        return best_alpha_acc

    # need one split for an initial guess on the maximum alpha
    train_df, val_df = preprocessing.train_test_split(train_val_df,
                                                      split=train_val_ratio)
    l_hat, mis_points = baseline_accuracy(train_df, target_col_name)
    alpha_max = mis_points / l_hat
    max_splits = np.power(2, tree_depths[0]) - 1  # internal nodes of a full binary tree
    alpha_max = alpha_max * ((max_splits - 1) / max_splits)
    alpha_min = alpha_max * (1.0 / max_splits)
    bo = BayesianOptimization(oct_target, {'alpha': (alpha_min, alpha_max)})
    n_iter = int((alpha_max - alpha_min) * (1 / 15))
    #n_iter = 5
    if n_iter < 4:
        n_iter = 4
    if tree_depths[0] > 2:
        if n_iter > max_splits:
            n_iter = max_splits
    print('***\nAlpha min: {0}\nAlpha max: {1}\nTesting {2} values for alpha\n***'
          .format(alpha_min, alpha_max, n_iter))
    bo.maximize(init_points=2, n_iter=n_iter, kappa=3)
    return (pd.concat(all_results_df), pd.concat(all_aggregated_df),
            bo.res['max']['max_params']['alpha'])
            # tail of the warm-start routine: seed gurobi start values from a
            # previously found tree (ln); excerpt, indentation relative to the
            # enclosing i/j loops
            self.N_k_t[i][j].start = ln.n_k_t[j]
            self.c_k_t[i][j].start = ln.c_k_t[j]
        self.L_t[i].start = ln.l_t


if __name__ == '__main__':
    # data
    target = 4
    df = pd.read_csv('../data/iris/iris.data')
    target_name = df.columns[target]
    norm_cols = [col for col in df.columns if not col == target_name]
    preprocessing.normalize(df, norm_cols=norm_cols)
    # parameters
    tree_complexity = 1
    tree_depth = 2
    df_train, df_test = preprocessing.train_test_split(df, split=0.8)
    print('Training samples: {0}'.format(len(df_train)))
    print('Testing samples: {0}'.format(len(df_test)))
    o = OCTH(df_train, target, tree_complexity, tree_depth, warm_start=True)
    o.fit()
    # print('*' * 10)
    # print('SOLUTION')
    # print('*' * 10)
    # print(o.tree)
    preds = o.predict(df, feat_cols=norm_cols)
    print('Training accuracy: {0}'.format(o.training_accuracy()))
    print('Testing accuracy: {0}'.format(o.accuracy_on_test(df_test, target)))
    # o.model.MIPGap
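# preprocessing.normalize is called throughout this file for its side effect on
# the frame. A minimal in-place min-max scaler consistent with that calling
# convention -- an assumption, the real helper may scale differently:
def normalize(df, norm_cols):
    for col in norm_cols:
        lo, hi = df[col].min(), df[col].max()
        if hi > lo:  # leave constant columns untouched
            df[col] = (df[col] - lo) / (hi - lo)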
def main(*args):
    import logging

    import pandas as pd

    ##################################################################
    # Preparations: define argument variables and configure logging
    # logging levels: DEBUG, INFO, WARNING, ERROR, CRITICAL
    data_address = args[0]
    log_address = args[1]
    model_output_address = args[2]
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("log")
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_address + "/log.log")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logging.basicConfig()
    logger.info("Program Start..")

    ##################################################################
    # First: read data from the directory -> rawData
    import read_data
    logger.info("read data start..")
    rawData = read_data.read_csv(data_address)
    # widen the column display in the console
    pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)
    #print(rawData)
    logger.info("read data end..")

    ##################################################################
    # Second: preprocess data from rawData into train & test sets
    # Steps: 1. remove outliers  2. feature scaling  3. train/test split & shuffle
    import preprocessing
    logger.info("preprocessing start..")
    # Preprocess rawData according to the data characteristics
    # For regression (e.g. demand/time forecasting) --> preprocessing_Data
    # step 1: remove outliers (from EDA or domain knowledge)
    preprocessing_Data = rawData
    # step 2: feature scaling
    # For classification (e.g. images, natural language) --> preprocessing_data
    preprocessing_data = rawData
    # split into X (independent variables) and y (dependent variable)
    independent_var = preprocessing_data.loc[:, [
        "Temperature", "Humidity", "Light", "CO2", "HumidityRatio"
    ]]
    dependent_var = preprocessing_data.loc[:, ["Occupancy"]]
    # train & test split (independent vars, dependent vars, shuffle flag, test set size)
    x_train, x_test, y_train, y_test = preprocessing.train_test_split(
        independent_var, dependent_var, True, 0.2)
    logger.info("preprocessing end..")

    ##################################################################
    # Third: build models
    logger.info("build Model start..")
    import gradientboosting
    import logistic_regression
    import randomforest
    x_train = x_train.values
    y_train = y_train.values.ravel()
    logistic_model = logistic_regression.regression(x_train, y_train)
    randomforest_model = randomforest.randomforest(x_train, y_train, 20, 0)
    xgboost_model = gradientboosting.xgb(x_train, y_train, 0.02, 20)
    logger.info("build Model end..")

    ##################################################################
    # Fourth: test & tune the models
    logger.info("test start..")
    from sklearn import metrics
    y_pred_logistic_model = logistic_model.predict(x_test)
    print('logistic_model accuracy:',
          metrics.accuracy_score(y_test, y_pred_logistic_model))
    y_pred_randomforest_model = randomforest_model.predict(x_test)
    print('randomforest_model accuracy:',
          metrics.accuracy_score(y_test, y_pred_randomforest_model))
    y_pred_xgboost_model = xgboost_model.predict(x_test)
    print('xgboost_model accuracy:',
          metrics.accuracy_score(y_test.values, y_pred_xgboost_model))
    confusion = metrics.confusion_matrix(y_test.values, y_pred_xgboost_model)
    print(confusion)
    logger.info("test end..")

    ##################################################################
    # Fifth: clear memory & save output
    logger.info("save start..")
    import joblib
    joblib.dump(logistic_model, model_output_address + "/logistic_model.pkl")
    logger.info("save end..")
    logger.info("Program End..")
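# This project's preprocessing.train_test_split takes (X, y, shuffle, test_size)
# and returns x_train, x_test, y_train, y_test. A hedged sketch of that
# signature (illustrative only, not the project's implementation):
import numpy as np

def train_test_split(X, y, shuffle, test_size):
    idx = np.arange(len(X))
    if shuffle:
        np.random.shuffle(idx)
    cut = int(len(X) * (1 - test_size))
    return (X.iloc[idx[:cut]], X.iloc[idx[cut:]],
            y.iloc[idx[:cut]], y.iloc[idx[cut:]])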
OUTPUT = 'Close'
INPUTS = ['Close']

# open data
data = preprocessing.read_data(FILE_PATH)
# select the time scope
data = preprocessing.select_data(data, "2015-01-01", "")
# reformat the time step; does nothing if already in the right time format (to check)
data = preprocessing.format_time_step(data, "H")
# set the timestamp as index
data = data.set_index("Timestamp")
data = data[INPUTS]

# train/test split
train, test = preprocessing.train_test_split(data, 0.7)
train = train[0]
test = test[0]
time_indices = [1420680240, 1421120280]
truth = train.loc[time_indices, OUTPUT]

# dummy echo model
model = dummy_echo.Model(1)
model.train(train)
prediction = model.predict([1420680240])

train_gen = KerasGenerator(train[INPUTS].values, train[OUTPUT].values,
                           10, 1, 10, 1, 32)
test_gen = KerasGenerator(test[INPUTS].values, test[OUTPUT].values,
                          10, 1, 10, 1, 32)
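# In this last snippet train_test_split(data, 0.7) is followed by
# train = train[0], which suggests the splitter returns *lists* of contiguous
# frames (e.g. one per gap-free segment of the time series). A minimal
# chronological sketch consistent with that reading (an assumption):
def train_test_split(df, train_fraction):
    cut = int(len(df) * train_fraction)
    return [df.iloc[:cut]], [df.iloc[cut:]]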