def imp_data(normalize):
    # Extract the "important" packets and optionally min-max normalize selected
    # features before running DBSCAN with a fixed eps.
    imp_packets = prep.get_imp(packets)
    if normalize == 'y':
        normalize_features = [
            'srcip1', 'srcip2', 'sport', 'dstip1', 'dstip2', 'dsport',
            'dur', 'sbytes', 'Stime', 'Ltime'
        ]
        prep.normalization(imp_packets, normalize_features)
    Dbscan_fixed_eps_info(0.5)
def main():
    starter_time = timer(None)

    # Load the data, subsample, split, and normalize both frames.
    df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
    df_total = df_total.drop(['ID_code'], axis=1)
    df_total = df_total.sample(1000)
    df_total.index = range(len(df_total))
    frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
    frame_train = pp.normalization(frame_train, 'target')
    frame_test = pp.normalization(frame_test, 'target')
    X = frame_train.drop(['target'], axis=1)
    y = frame_train['target']
    X_pred = frame_test.drop(['target'], axis=1)
    y_truth = frame_test['target']
    print('Data loading complete')

    # Train the base learners and plot their ROC curves.
    base_learners = constant.base_learners
    print('Base learners loaded; training base learners')
    df_single_output, P = single_model_test(base_learners, X, y, X_pred, y_truth)
    plot_roc_curve(y_truth, P.values, list(P.columns))

    # Hyperparameter tuning for the base learners.
    print('Base learner training finished; tuning hyperparameters')
    base_param_dicts = constant.base_param_dicts
    df_params_base = base_hyperparam_tuning(X, y, base_learners, base_param_dicts, n_iterations=50)
    df_params_base.to_csv('params_base.csv')

    # Intermediate (first stacking) layer.
    print('Hyperparameter tuning finished; training the intermediate layer')
    layer1_learners = constant.layer1_learners
    layer1_param_dicts = constant.layer1_param_dicts
    print('Tuning the intermediate layer')
    # in_layer_1, df_params_1 = layer_hyperparam_tuning(
    #     X, y, pre_layer_learners=base_learners,
    #     local_layer_learners=layer1_learners,
    #     param_dicts_layer=layer1_param_dicts,
    #     n_iterations=50, pre_params='params_base.csv')
    # df_params_1.to_csv('params1.csv')

    # Set learner parameters and fit the meta learner.
    print('Training the meta learner')
    meta_learner = constant.meta_learner
    meta_param_dicts = constant.meta_param_dicts
    meta_layer_model, df_params_meta = layer_hyperparam_tuning(
        X, y, pre_layer_learners=layer1_learners,
        local_layer_learners=meta_learner,
        param_dicts_layer=meta_param_dicts,
        n_iterations=50, pre_params='params_base.csv')
    df_params_meta.to_csv('paramsMeta.csv')

    # Apply the tuned parameters to the matching meta-learner estimators.
    params_pre = pd.read_csv('paramsMeta.csv')
    params_pre.set_index(['Unnamed: 0'], inplace=True)
    for case_name, params in params_pre['params'].items():
        case_est = case_name
        params = eval(params)
        for est_name, est in meta_learner:
            if est_name == case_est:
                est.set_params(**params)

    # Train the full stacking model and evaluate on the held-out frame.
    layer_list = constant.layer_list
    pred_proba_1, stacking_model = stacking_training(X, y, X_pred, layer_list=layer_list, meta_learner=meta_learner)
    print(roc_auc_score(y_truth, pred_proba_1[:, 1]))
    timer(starter_time)
    return pred_proba_1, stacking_model
def ml_pipeline(response):
    output_preprocessing = preprocessing.preprocess()
    if output_preprocessing:
        # Update any related flows already in the dataset with the latest data.
        preprocessing.update_related()
        # Data normalization into a [0, 1] scale.
        preprocessing.normalization()
        if config.df.shape[0] >= 10:
            if config.args.kmeans:
                kmeans.kmeans()
            if config.args.dbscan:
                dbscan.dbscan()
            postprocessing.postprocess()
        eval_counter.counter()
    return response
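# Illustrative sketch only: preprocessing.normalization() is not shown in this snippet,
# so the assumption here is a plain column-wise min-max rescaling into [0, 1]; the
# helper name minmax_normalize is hypothetical.
import pandas as pd

def minmax_normalize(df):
    """Rescale every numeric column of a DataFrame to the [0, 1] range."""
    result = df.copy()
    for col in result.select_dtypes(include='number').columns:
        col_min = result[col].min()
        col_max = result[col].max()
        span = col_max - col_min
        # Constant columns would divide by zero; map them to 0.
        result[col] = 0.0 if span == 0 else (result[col] - col_min) / span
    return result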
def main():
    csv_data = pd.read_csv(
        'https://firebasestorage.googleapis.com/v0/b/fir-crud-36cbe.appspot.com/o/Iris.csv?alt=media&token=71bdac3f-96e5-4aae-9b60-78025c1d3330'
    )
    csv_data = preprocessing.normalization(csv_data)
    dfTraining, dfTesting = preprocessing.splitData(csv_data)
    biases = (0.0001, 0.0001)
    weights = learn.learning(dfTraining, 0.5, biases)
    test.testing(dfTesting, weights, biases)
def raw_data(evalu, del_tcp, normalize):
    if evalu == 'km_e':
        method_km.Elbow(packets)
    elif evalu == 'km_s':
        if del_tcp == 'y':
            prep.del_tcp_features(packets)
        if normalize == 'y':
            normalize_features = ['Stime', 'Ltime', 'Sload', 'Dload']
            prep.normalization(packets, normalize_features)
        Kmeans_fixed_k_info()
    elif evalu == 'db':
        if del_tcp == 'y':
            prep.del_tcp_features(packets)
            if normalize == 'y':
                # Grab every key to use as normalize_all.
                normalize_features = normalize_all
                prep.normalization(packets, normalize_features)
        elif del_tcp == 'n':
            if normalize == 'y':
                normalize_features = normalize_tcp
                prep.normalization(packets, normalize_features)
        """
        del packets['Ltime']
        del packets['Stime']
        del packets['Sintpkt']
        del packets['Dintpkt']
        del packets['Sjit']
        del packets['Djit']
        """
        # print(packets.loc[0])
        Dbscan_fixed_eps_info(0.5)
def separate_and_prepare_data(self, data, labels, train_index, test_index, num_features):
    trn_x, tst_x = data[data.index.isin(train_index)], data[data.index.isin(test_index)]
    trn_y, tst_y = labels[labels.index.isin(train_index)], labels[labels.index.isin(test_index)]

    # PREPARATION FOR TRAINING PART
    trn_x, tst_x = prep.normalization(trn_x, tst_x)
    trn_x, trn_y = prep.balance_oversampling(trn_x, trn_y)
    trn_x, tst_x = prep.correlation_removal_kcross(trn_x, tst_x)
    trn_x, tst_x = prep.select_features(trn_x, trn_y, tst_x, num_features)

    trn_x_values = trn_x.values
    tst_x_values = tst_x.values
    trn_y_values = trn_y.values
    tst_y_values = tst_y.values
    return trn_x_values, tst_x_values, trn_y_values, tst_y_values
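# Sketch under an assumption: prep.normalization(trn_x, tst_x) above appears to scale
# both splits using statistics taken from the training split only (to avoid leakage).
# Its real body is not shown, so this stand-in helper is hypothetical.
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def normalize_train_test(trn_x: pd.DataFrame, tst_x: pd.DataFrame):
    """Fit a min-max scaler on the training frame and apply it to both frames."""
    scaler = MinMaxScaler().fit(trn_x)
    trn_scaled = pd.DataFrame(scaler.transform(trn_x), columns=trn_x.columns, index=trn_x.index)
    tst_scaled = pd.DataFrame(scaler.transform(tst_x), columns=tst_x.columns, index=tst_x.index)
    return trn_scaled, tst_scaled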
y_train = train.pop('class')
y_train = y_train.astype('int')
x_train = train
y_test = test.pop('class')
y_test = y_test.astype('int')
x_test = test

#################### NORMALIZE DATA #############################
target_count_dataset = y.replace({0: 'Healthy', 1: 'Sick'}).value_counts()
target_count_test = y_test.replace({0: 'Healthy', 1: 'Sick'}).value_counts()
# print_balance(target_count_dataset, target_count_test, title1='balance of whole set', title2='balance of test set')
x_train, x_test = prep.normalization(x_train, x_test)

#################### DATA BALANCE #############################
# before sampling
target_count = y_train.replace({0: 'Healthy', 1: 'Sick'}).value_counts()
min_class = target_count.idxmin()
def main():
    os.makedirs(fold_name)

    # fit model
    file_path = fold_name + '/' + fold_name + ".h5"
    log_file_path = fold_name + '/' + fold_name + ".log"
    log = open(log_file_path, 'w')

    # model setting
    seq = CovLSTM2D_model()
    with redirect_stdout(log):
        seq.summary()
    # TODO
    # seq = STResNet_model()

    train_X_raw, train_Y_raw, sst_grid_raw = np.load(DATA_PATH)  # from .npy file

    # Normalization; data for the ConvLSTM model, n steps ahead, 5-dimensional tensors.
    train_X = np.zeros((len_seq, len_frame, 10, 50, 1), dtype=np.float)
    train_Y = np.zeros((len_seq, len_frame, 10, 50, 1), dtype=np.float)
    sst_grid = np.zeros((len_seq + len_frame, len_frame, 10, 50, 1), dtype=np.float)
    for i in range(len_seq):
        for k in range(len_frame):
            train_X[i, k, ::, ::, 0] = pp.normalization(sst_grid_raw[i, k, ::, ::, 0])
            train_Y[i, k, ::, ::, 0] = pp.normalization(sst_grid_raw[i + len_frame, k, ::, ::, 0])
            sst_grid[i, k, ::, ::, 0] = pp.normalization(sst_grid_raw[i, k, ::, ::, 0])

    seq = multi_gpu_model(seq, gpus=2)
    # sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    # rmsprop = optimizers.RMSprop(lr=0.1)
    seq.compile(loss="mse", optimizer='adam')

    if not os.path.exists(file_path):
        # ConvLSTM Model
        history = seq.fit(train_X[:train_length], train_Y[:train_length],
                          batch_size=batch_size, epochs=epochs,
                          validation_split=validation_split)
        seq.save(file_path)
        pyplot.plot(history.history['loss'])
        log.write("\n train_loss=========")
        log.write("\n %s" % history.history['loss'])
        pyplot.plot(history.history['val_loss'])
        log.write("\n\n\n val_loss=========")
        log.write("\n %s" % history.history['val_loss'])
        pyplot.title('model loss')
        pyplot.ylabel('loss')
        pyplot.xlabel('epoch')
        pyplot.legend(['train', 'validation'], loc='upper left')
        pyplot.savefig(fold_name + '/%i_epoch_loss.png' % epochs)
    else:
        seq = load_model(file_path)

    model_sum_rmse = 0
    base_sum_rmse = 0
    model_sum_mae = 0
    base_sum_mae = 0
    model_sum_mape = 0
    base_sum_mape = 0
    single_point_model_sum_rmse = 0
    single_point_base_sum_rmse = 0

    for k in range(start_seq, end_seq):
        # direct prediction with -n steps
        model_sum_rmse_current = 0
        base_sum_rmse_current = 0
        model_sum_mae_current = 0
        base_sum_mae_current = 0
        model_sum_mape_current = 0
        base_sum_mape_current = 0

        pred_sequence_raw = sst_grid[k][::, ::, ::, ::]
        act_sequence = sst_grid[k + len_frame][::, ::, ::, ::]
        pred_sequence = seq.predict(pred_sequence_raw[np.newaxis, ::, ::, ::, ::])
        pred_sequence = pred_sequence[0, ::, ::, ::, ::]

        for j in range(len_frame):
            baseline_frame = pp.inverse_normalization(pred_sequence_raw[j, ::, ::, 0])
            pred_toplot = pp.inverse_normalization(pred_sequence[j, ::, ::, 0])
            act_toplot = pp.inverse_normalization(act_sequence[j, ::, ::, 0])

            model_rmse = mean_squared_error(act_toplot, pred_toplot)
            baseline_rmse = mean_squared_error(act_toplot, baseline_frame)
            model_mae = mean_absolute_error(act_toplot, pred_toplot)
            baseline_mae = mean_absolute_error(act_toplot, baseline_frame)
            model_mape = pp.mean_absolute_percentage_error(act_toplot, pred_toplot)
            baseline_mape = pp.mean_absolute_percentage_error(act_toplot, baseline_frame)

            model_sum_rmse, base_sum_rmse = model_sum_rmse + model_rmse, base_sum_rmse + baseline_rmse
            model_sum_mae, base_sum_mae = model_sum_mae + model_mae, base_sum_mae + baseline_mae
            model_sum_mape, base_sum_mape = model_sum_mape + model_mape, base_sum_mape + baseline_mape

            model_sum_rmse_current, base_sum_rmse_current = model_sum_rmse_current + model_rmse, base_sum_rmse_current + baseline_rmse
            model_sum_mae_current, base_sum_mae_current = model_sum_mae_current + model_mae, base_sum_mae_current + baseline_mae
            model_sum_mape_current, base_sum_mape_current = model_sum_mape_current + model_mape, base_sum_mape_current + baseline_mape

            single_model_rmse = (act_toplot[point_x, point_y] - pred_toplot[point_x, point_y]) ** 2
            single_base_rmse = (act_toplot[point_x, point_y] - baseline_frame[point_x, point_y]) ** 2
            single_point_model_sum_rmse = single_point_model_sum_rmse + single_model_rmse
            single_point_base_sum_rmse = single_point_base_sum_rmse + single_base_rmse

        log.write("\n\n ============")
        log.write("\n Round: %s" % str(k + 1))
        log.write("\nTotal Model RMSE: %s" % (sqrt(model_sum_rmse_current / len_frame)))
        log.write("\nTotal Baseline RMSE: %s" % (sqrt(base_sum_rmse_current / len_frame)))
        log.write("\nTotal Model MAE: %s" % (model_sum_mae_current / len_frame))
        log.write("\nTotal Baseline MAE: %s" % (base_sum_mae_current / len_frame))
        log.write("\nModel MAPE: %s" % (model_sum_mape_current / len_frame))
        log.write("\nBaseline MAPE: %s" % (base_sum_mape_current / len_frame))

        print("============")
        print("Round: %s" % str(k + 1))
        print("Total Model RMSE: %s" % (sqrt(model_sum_rmse_current / len_frame)))
        print("Total Baseline RMSE: %s" % (sqrt(base_sum_rmse_current / len_frame)))
        print("Total Model MAE: %s" % (model_sum_mae_current / len_frame))
        print("Total Baseline MAE: %s" % (base_sum_mae_current / len_frame))
        print("Model MAPE: %s" % (model_sum_mape_current / len_frame))
        print("Baseline MAPE: %s" % (base_sum_mape_current / len_frame))

    print("=" * 10)
    print("Total Model RMSE: %s" % (sqrt(model_sum_rmse / (len_frame * (end_seq - start_seq)))))
    print("Total Baseline RMSE: %s" % (sqrt(base_sum_rmse / (len_frame * (end_seq - start_seq)))))
    print("Total Model MAE: %s" % (model_sum_mae / (len_frame * (end_seq - start_seq))))
    print("Total Baseline MAE: %s" % (base_sum_mae / (len_frame * (end_seq - start_seq))))
    print("Model MAPE: %s" % (model_sum_mape / (len_frame * (end_seq - start_seq))))
    print("Baseline MAPE: %s" % (base_sum_mape / (len_frame * (end_seq - start_seq))))
    print("Single Model RMSE: %s" % (sqrt(single_point_model_sum_rmse / (len_frame * (end_seq - start_seq)))))
    print("Single Baseline RMSE: %s" % (sqrt(single_point_base_sum_rmse / (len_frame * (end_seq - start_seq)))))

    log.write("\n\n Total:")
    log.write("\nTotal Model RMSE: %s" % (sqrt(model_sum_rmse / (len_frame * (end_seq - start_seq)))))
    log.write("\nTotal Baseline RMSE: %s" % (sqrt(base_sum_rmse / (len_frame * (end_seq - start_seq)))))
    log.write("\nTotal Model MAE: %s" % (model_sum_mae / (len_frame * (end_seq - start_seq))))
    log.write("\nTotal Baseline MAE: %s" % (base_sum_mae / (len_frame * (end_seq - start_seq))))
    log.write("\nModel MAPE: %s" % (model_sum_mape / (len_frame * (end_seq - start_seq))))
    log.write("\nBaseline MAPE: %s" % (base_sum_mape / (len_frame * (end_seq - start_seq))))
    log.close()

    for k in range(start_seq, end_seq, 80):
        pred_sequence_raw = sst_grid[k][::, ::, ::, ::]
        new_frame = seq.predict(pred_sequence_raw[np.newaxis, ::, ::, ::, ::])
        pred_sequence = new_frame[0]
        act_sequence = sst_grid[k + len_frame][::, ::, ::, ::]

        for i in range(len_frame):
            fig = plt.figure(figsize=(16, 8))

            ax = fig.add_subplot(321)
            ax.text(1, 3, 'Prediction', fontsize=12)
            pred_toplot = pp.inverse_normalization(pred_sequence[i, ::, ::, 0])
            plt.imshow(pred_toplot)
            cbar = plt.colorbar(plt.imshow(pred_toplot), orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            # Use the input sequence (seq-12) as the baseline prediction.
            baseline_frame = pp.inverse_normalization(pred_sequence_raw[i, ::, ::, 0])
            ax = fig.add_subplot(322)
            plt.text(1, 3, 'Baseline', fontsize=12)
            plt.imshow(baseline_frame)
            cbar = plt.colorbar(plt.imshow(baseline_frame), orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(323)
            plt.text(1, 3, 'Ground truth', fontsize=12)
            act_toplot = pp.inverse_normalization(act_sequence[i, ::, ::, 0])
            plt.imshow(act_toplot)
            cbar = plt.colorbar(plt.imshow(act_toplot), orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(324)
            plt.text(1, 3, 'Ground truth', fontsize=12)
            act_toplot = pp.inverse_normalization(act_sequence[i, ::, ::, 0])
            plt.imshow(act_toplot)
            cbar = plt.colorbar(plt.imshow(act_toplot), orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(325)
            plt.text(1, 3, 'Diff_Pred', fontsize=12)
            diff_toplot = act_toplot - pred_toplot
            plt.imshow(diff_toplot)
            cbar = plt.colorbar(plt.imshow(diff_toplot), orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(326)
            plt.text(1, 3, 'Diff_Base', fontsize=12)
            diff_toplot = act_toplot - baseline_frame
            plt.imshow(diff_toplot)
            cbar = plt.colorbar(plt.imshow(diff_toplot), orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            plt.savefig(fold_name + '/%s_%s_animate.png' % (str(k + 1), str(i + 1)))
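# Sketch under an assumption: pp.normalization and pp.inverse_normalization above look
# like a paired rescaling of each SST frame into [0, 1] and back using fixed global
# bounds. The real bounds and implementation are not shown; SST_MIN / SST_MAX are
# hypothetical placeholder values.
import numpy as np

SST_MIN, SST_MAX = -2.0, 35.0  # assumed global sea-surface-temperature range in °C

def normalization(frame):
    """Map a 2-D SST frame from [SST_MIN, SST_MAX] into [0, 1]."""
    return (np.asarray(frame, dtype=np.float64) - SST_MIN) / (SST_MAX - SST_MIN)

def inverse_normalization(frame):
    """Map a normalized frame back to degrees Celsius."""
    return np.asarray(frame, dtype=np.float64) * (SST_MAX - SST_MIN) + SST_MIN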
from hyperopt import fmin
from hyperopt import Trials
import ast
import numpy as np
import pandas as pd
import lightgbm as lgb
import preprocessing as pp
import feature_selection as fs

MAX_EVALS = 500
N_FOLDS = 10

# Load the data, split, and normalize before building the LightGBM dataset.
df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
df_total = df_total.drop(['ID_code'], axis=1)
# df_total = df_total.sample(1000)
# df_total.index = range(len(df_total))
frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
frame_train = pp.normalization(frame_train, 'target')
frame_test = pp.normalization(frame_test, 'target')
X = frame_train.drop(['target'], axis=1)
y = frame_train['target']
X_pred = frame_test.drop(['target'], axis=1)
y_truth = frame_test['target']
X = np.array(X)
X_pred = np.array(X_pred)
train_set = lgb.Dataset(X, label=y)


def objective(params, n_folds=N_FOLDS):
    """Objective function for Gradient Boosting Machine hyperparameter optimization."""
    # Keep track of evals
    global ITERATION
args = parser.parse_args()
train = bool(args.train_model)
predict = bool(args.predict)
use_word2vec = bool(args.word2vec)

df_train, df_test = pp.load_dataset()

# clean words
dataset_train = df_train[0].apply(lambda x: pp.clean_word(x))
dataset_test = df_test[0].apply(lambda x: pp.clean_word(x))

# remove punctuation
dataset_train = dataset_train.apply(lambda x: pp.clean_punct(x))
dataset_test = dataset_test.apply(lambda x: pp.clean_punct(x))

# normalization
dataset_train = dataset_train.apply(lambda x: pp.normalization(x))
dataset_test = dataset_test.apply(lambda x: pp.normalization(x))

# build the word index
tokenizer = Tokenizer(num_words=2000, oov_token='<OOV>')
tokenizer.fit_on_texts(list(dataset_train))
word_index = tokenizer.word_index

if train:
    # prepare and pad sequences
    print(dataset_train)
    X_train = tokenizer.texts_to_sequences(dataset_train)
    X_train = sequence.pad_sequences(X_train, padding='post',
test_image, test_label = (test_data[2000:, 0], test_data[2000:, 1])

print('train_image shape: ', train_image.shape)
print('valid_image shape: ', valid_image.shape)
print('test_image shape: ', test_image.shape)
print('train_label shape: ', train_label.shape)
print('valid_label shape: ', valid_label.shape)
print('test_label shape: ', test_label.shape)

# clean memory
# ps.release_memory(array_of_img, test_of_img, train_cat, train_dog, test_cat, test_dog)

train_image, train_label = ps.processing(train_image, train_label)
valid_image, valid_label = ps.processing(valid_image, valid_label)
test_image, test_label = ps.processing(test_image, test_label)

train_image = ps.normalization(train_image)
valid_image = ps.normalization(valid_image)
test_image = ps.normalization(test_image)

print('train_image shape: ', train_image.shape)
print('train_label shape: ', train_label.shape)
print('train_label the first shape: ', train_label[0].shape)
print('train_label the first type: ', type(train_label[0]))
print('train_label the first content: ', train_label[0])

## 4. initialize variables
weights = None
biases = None
n_hidden1 = 3000
# n_hidden2 = 1800
alpha = 0.3
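# Sketch under an assumption: ps.normalization() above is not shown; a common choice for
# image data is to cast the pixel values to float and rescale them from [0, 255] to
# [0, 1]. The helper name normalize_images is hypothetical.
import numpy as np

def normalize_images(images):
    """Rescale uint8 pixel intensities to the [0, 1] range."""
    return np.asarray(images, dtype=np.float32) / 255.0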
]

# data = list of tuples (sim_name, has_damage, sim)
load_dataset()
sims, sims_labels = preprocessing.clean(data, min(orders))
if conf.separated_test:
    test_sims, _ = preprocessing.clean(test_data, min(orders))
del orders, data, test_data

results = []
n_fold = 1
if conf.separated_test:
    train, train_lengths = numpy.concatenate(sims, axis=0), [len(sim) for sim in sims]
    test, test_lengths = numpy.concatenate(test_sims, axis=0), [len(sim) for sim in test_sims]
    train[:, :-1], test[:, :-1] = preprocessing.normalization(train[:, :-1], test[:, :-1])
    run()
else:
    skf = StratifiedKFold(n_splits=5)  # number of folds
    for train_indexes, test_indexes in skf.split(sims, sims_labels):
        train, test, train_lengths, test_lengths = [], [], [], []
        for i in train_indexes:
            train.extend(spectrum for spectrum in sims[i])
            train_lengths.append(len(sims[i]))
        for i in test_indexes:
            test.extend(spectrum for spectrum in sims[i])
            test_lengths.append(len(sims[i]))
        train = numpy.array(train)
        test = numpy.array(test)
        train[:, :-1], test[:, :-1] = preprocessing.normalization(train[:, :-1], test[:, :-1])