def imp_data(normalize):
    imp_packets = prep.get_imp(packets)
    if normalize == 'y':
        # 'srcip1' appeared twice in the original list; 'srcip2' is the likely
        # intended second entry, mirroring the dstip1/dstip2 pair below
        normalize_features = [
            'srcip1', 'srcip2', 'sport', 'dstip1', 'dstip2', 'dsport', 'dur',
            'sbytes', 'Stime', 'Ltime'
        ]
        prep.normalization(imp_packets, normalize_features)

    Dbscan_fixed_eps_info(0.5)
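
prep.normalization is not shown in these examples; a minimal sketch, assuming it min-max scales the named DataFrame columns in place, is:

def normalization(df, features):
    # scale each listed column into [0, 1] in place
    for col in features:
        lo, hi = df[col].min(), df[col].max()
        if hi > lo:  # skip constant columns to avoid division by zero
            df[col] = (df[col] - lo) / (hi - lo)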
Example #2
def main():
    starter_time = timer(None)
    df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
    df_total = df_total.drop(['ID_code'], axis=1)
    df_total = df_total.sample(1000)
    df_total.index = range(len(df_total))
    frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
    frame_train = pp.normalization(frame_train, 'target')
    frame_test = pp.normalization(frame_test, 'target')
    X = frame_train.drop(['target'], axis=1)
    y = frame_train['target']
    X_pred = frame_test.drop(['target'], axis=1)
    y_truth = frame_test['target']
    print('Data loading complete')
    base_learners = constant.base_learners
    print('Base learners loaded; starting to train the base learners')
    df_single_output, P = single_model_test(base_learners, X, y, X_pred, y_truth)
    plot_roc_curve(y_truth, P.values, list(P.columns))
    print('Base learner training complete; starting hyperparameter tuning')
    base_param_dicts = constant.base_param_dicts
    df_params_base = base_hyperparam_tuning(X, y, base_learners, base_param_dicts, n_iterations=50)
    df_params_base.to_csv('params_base.csv')
    print('Hyperparameter tuning complete; starting to train the intermediate layer')
    layer1_learners = constant.layer1_learners
    
    layer1_param_dicts = constant.layer1_param_dicts
    print('Tuning hyperparameters for the intermediate layer')
    
    #in_layer_1, df_params_1 = layer_hyperparam_tuning(X,y,pre_layer_learners=base_learners, local_layer_learners = layer1_learners, param_dicts_layer = layer1_param_dicts, n_iterations = 50, pre_params = 'params_base.csv')
    #df_params_1.to_csv('params1.csv')
    
    # set learner parameters and pick the meta learner
    print('Training the meta learner')
    meta_learner = constant.meta_learner
    meta_param_dicts = constant.meta_param_dicts
    meta_layer_model, df_params_meta = layer_hyperparam_tuning(X, y, pre_layer_learners=layer1_learners, local_layer_learners=meta_learner, param_dicts_layer=meta_param_dicts, n_iterations=50, pre_params='params_base.csv')
    df_params_meta.to_csv('paramsMeta.csv')
    params_pre = pd.read_csv('paramsMeta.csv')
    params_pre.set_index(['Unnamed: 0'], inplace = True)
    for case_name, params in params_pre["params"].items():
        case_est = case_name
        params = eval(params)  # params is stored as a dict literal; ast.literal_eval would be safer
        for est_name, est in meta_learner:
            if est_name == case_est:
                est.set_params(**params)
    layer_list = constant.layer_list
    pred_proba_1, stacking_model = stacking_training(X, y, X_pred, layer_list=layer_list, meta_learner=meta_learner)
    print(roc_auc_score(y_truth, pred_proba_1[:, 1]))
    timer(starter_time)
    return pred_proba_1, stacking_model
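
The timer helper used above is not shown; a minimal sketch of the common start/report pattern it appears to follow (timer(None) returns a start marker, timer(start) prints the elapsed time) is:

from datetime import datetime

def timer(start_time=None):
    # no argument: return the current time as the start marker
    if start_time is None:
        return datetime.now()
    # with the marker: print the elapsed wall-clock time
    hours, rem = divmod((datetime.now() - start_time).total_seconds(), 3600)
    minutes, seconds = divmod(rem, 60)
    print('Time taken: %i h %i min %.2f s' % (hours, minutes, seconds))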
Example #3
def ml_pipeline(response):
    output_preprocessing = preprocessing.preprocess()
    if output_preprocessing:
        # Update any related flows already in the dataset with the latest data.
        preprocessing.update_related()
        # Data normalization into a [0,1] scale.
        preprocessing.normalization()
    if config.df.shape[0] >= 10:
        if config.args.kmeans:
            kmeans.kmeans()
        if config.args.dbscan:
            dbscan.dbscan()
        postprocessing.postprocess()
        eval_counter.counter()
    return response
Example #4
def main():
    csv_data = pd.read_csv(
        'https://firebasestorage.googleapis.com/v0/b/fir-crud-36cbe.appspot.com/o/Iris.csv?alt=media&token=71bdac3f-96e5-4aae-9b60-78025c1d3330'
    )
    csv_data = preprocessing.normalization(csv_data)
    dfTraining, dfTesting = preprocessing.splitData(csv_data)
    biases = (0.0001, 0.0001)
    weights = learn.learning(dfTraining, 0.5, biases)
    test.testing(dfTesting, weights, biases)
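Example #5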
def raw_data(evalu, del_tcp, normalize):
    if evalu == 'km_e':
        method_km.Elbow(packets)
    elif evalu == 'km_s':
        if del_tcp == 'y':
            prep.del_tcp_features(packets)
            if normalize == 'y':
                normalize_features = ['Stime', 'Ltime', 'Sload', 'Dload']
                prep.normalization(packets, normalize_features)

        Kmeans_fixed_k_info()

    elif evalu == 'db':
        if del_tcp == 'y':
            prep.del_tcp_features(packets)
            if normalize == 'y':
                # grab every column key to use as normalize_all
                normalize_features = normalize_all
                prep.normalization(packets, normalize_features)
        elif del_tcp == 'n':
            if normalize == 'y':
                normalize_features = normalize_tcp
                prep.normalization(packets, normalize_features)
                """ del packets['Ltime']
                del packets['Stime']
                del packets['Sintpkt']
                del packets['Dintpkt']
                del packets['Sjit']
                del packets['Djit'] """
                #print(packets.loc[0])
        Dbscan_fixed_eps_info(0.5)
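
Kmeans_fixed_k_info and Dbscan_fixed_eps_info are defined elsewhere; a hedged sketch of what Dbscan_fixed_eps_info(eps) might report, assuming packets holds only numeric features by this point, is:

from sklearn.cluster import DBSCAN

def Dbscan_fixed_eps_info(eps):
    # cluster with a fixed eps and summarize the result
    labels = DBSCAN(eps=eps, min_samples=5).fit_predict(packets)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    print('eps=%.2f -> %d clusters, %d noise points' % (eps, n_clusters, n_noise))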
Example #6
    def separate_and_prepare_data(self, data, labels, train_index, test_index,
                                  num_features):
        trn_x = data[data.index.isin(train_index)]
        tst_x = data[data.index.isin(test_index)]
        trn_y = labels[labels.index.isin(train_index)]
        tst_y = labels[labels.index.isin(test_index)]

        # PREPARATION FOR TRAINING PART
        trn_x, tst_x = prep.normalization(trn_x, tst_x)
        trn_x, trn_y = prep.balance_oversampling(trn_x, trn_y)
        trn_x, tst_x = prep.correlation_removal_kcross(trn_x, tst_x)
        trn_x, tst_x = prep.select_features(trn_x, trn_y, tst_x, num_features)

        trn_x_values = trn_x.values
        tst_x_values = tst_x.values
        trn_y_values = trn_y.values
        tst_y_values = tst_y.values

        return trn_x_values, tst_x_values, trn_y_values, tst_y_values
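
A hedged usage sketch for separate_and_prepare_data, assuming data and labels are index-aligned pandas objects; the experiment instance name and num_features=20 are illustrative assumptions:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_pos, test_pos in skf.split(data, labels):
    # split() yields positional indices; map them back to the frame's index,
    # since the method filters with data.index.isin(...)
    train_index = data.index[train_pos]
    test_index = data.index[test_pos]
    trn_x, tst_x, trn_y, tst_y = experiment.separate_and_prepare_data(
        data, labels, train_index, test_index, num_features=20)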
Example #7
y_train = train.pop('class')
y_train = y_train.astype('int')
x_train = train

y_test = test.pop('class')
y_test = y_test.astype('int')
x_test = test

#################### NORMALIZE DATA #############################

target_count_dataset = y.replace({0: 'Healthy', 1: 'Sick'}).value_counts()
target_count_test = y_test.replace({0: 'Healthy', 1: 'Sick'}).value_counts()

# print_balance(target_count_dataset, target_count_test, title1='balance of whole set', title2='balance of test set')

x_train, x_test = prep.normalization(x_train, x_test)

#################### DATA BALANCE #############################

# before sampling
target_count = y_train.replace({0: 'Healthy', 1: 'Sick'}).value_counts()
min_class = target_count.idxmin()
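
The fragment stops after identifying the minority class; a minimal sketch of random oversampling with pandas, continuing from min_class (the balance_oversampling helper from Example #6 is not available here), could be:

import pandas as pd

# rebuild a single frame so rows and labels stay aligned while resampling
df_train = pd.concat([x_train, y_train], axis=1)
named = y_train.replace({0: 'Healthy', 1: 'Sick'})
minority = df_train[named == min_class]
majority = df_train[named != min_class]
minority_up = minority.sample(len(majority), replace=True, random_state=42)
df_balanced = pd.concat([majority, minority_up]).sample(frac=1, random_state=42)
x_train, y_train = df_balanced.drop(columns=['class']), df_balanced['class']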
Example #8
def main():
    os.makedirs(fold_name)
    # fit model
    file_path = fold_name + '/' + fold_name + ".h5"
    log_file_path = fold_name + '/' + fold_name + ".log"
    log = open(log_file_path, 'w')

    # model setting
    seq = CovLSTM2D_model()
    with redirect_stdout(log):
        seq.summary()

    # TODO
    # seq = STResNet_model()

    # the .npy file apparently packs three arrays; only sst_grid_raw is used below
    train_X_raw, train_Y_raw, sst_grid_raw = np.load(DATA_PATH)

    # normalize the data for the ConvLSTM model: n-step-ahead frames, 5-D tensors
    # (np.float was removed in NumPy 1.24; plain float is equivalent)
    train_X = np.zeros((len_seq, len_frame, 10, 50, 1), dtype=float)
    train_Y = np.zeros((len_seq, len_frame, 10, 50, 1), dtype=float)
    sst_grid = np.zeros((len_seq + len_frame, len_frame, 10, 50, 1), dtype=float)
    for i in range(len_seq):
        for k in range(len_frame):
            train_X[i, k, :, :, 0] = pp.normalization(sst_grid_raw[i, k, :, :, 0])
            train_Y[i, k, :, :, 0] = pp.normalization(
                sst_grid_raw[i + len_frame, k, :, :, 0])
            sst_grid[i, k, :, :, 0] = pp.normalization(sst_grid_raw[i, k, :, :, 0])

    seq = multi_gpu_model(seq, gpus=2)
    # sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    # rmsprop = optimizers.RMSprop(lr=0.1)
    seq.compile(loss="mse", optimizer='adam')

    if not os.path.exists(file_path):
        # ConvLSTM Model
        history = seq.fit(train_X[:train_length],
                          train_Y[:train_length],
                          batch_size=batch_size,
                          epochs=epochs,
                          validation_split=validation_split)
        seq.save(file_path)
        pyplot.plot(history.history['loss'])
        log.write("\n train_loss=========")
        log.write("\n %s" % history.history['loss'])
        pyplot.plot(history.history['val_loss'])
        log.write("\n\n\n val_loss=========")
        log.write("\n %s" % history.history['val_loss'])
        pyplot.title('model loss')
        pyplot.ylabel('loss')
        pyplot.xlabel('epoch')
        pyplot.legend(['train', 'validation'], loc='upper left')
        pyplot.savefig(fold_name + '/%i_epoch_loss.png' % epochs)
    else:
        seq = load_model(file_path)

    model_sum_rmse = 0
    base_sum_rmse = 0
    model_sum_mae = 0
    base_sum_mae = 0
    model_sum_mape = 0
    base_sum_mape = 0

    single_point_model_sum_rmse = 0
    single_point_base_sum_rmse = 0

    for k in range(start_seq, end_seq):
        # direct with -n steps
        model_sum_rmse_current = 0
        base_sum_rmse_current = 0
        model_sum_mae_current = 0
        base_sum_mae_current = 0
        model_sum_mape_current = 0
        base_sum_mape_current = 0

        pred_sequence_raw = sst_grid[k]
        act_sequence = sst_grid[k + len_frame]
        pred_sequence = seq.predict(pred_sequence_raw[np.newaxis])
        pred_sequence = pred_sequence[0]
        for j in range(len_frame):
            baseline_frame = pp.inverse_normalization(pred_sequence_raw[j, :, :, 0])
            pred_toplot = pp.inverse_normalization(pred_sequence[j, :, :, 0])
            act_toplot = pp.inverse_normalization(act_sequence[j, :, :, 0])

            # mean_squared_error returns MSE; the square root is taken on the
            # running averages below to report RMSE
            model_rmse = mean_squared_error(act_toplot, pred_toplot)
            baseline_rmse = mean_squared_error(act_toplot, baseline_frame)

            model_mae = mean_absolute_error(act_toplot, pred_toplot)
            baseline_mae = mean_absolute_error(act_toplot, baseline_frame)

            model_mape = pp.mean_absolute_percentage_error(
                act_toplot, pred_toplot)
            baseline_mape = pp.mean_absolute_percentage_error(
                act_toplot, baseline_frame)

            model_sum_rmse += model_rmse
            base_sum_rmse += baseline_rmse
            model_sum_mae += model_mae
            base_sum_mae += baseline_mae
            model_sum_mape += model_mape
            base_sum_mape += baseline_mape

            model_sum_rmse_current += model_rmse
            base_sum_rmse_current += baseline_rmse
            model_sum_mae_current += model_mae
            base_sum_mae_current += baseline_mae
            model_sum_mape_current += model_mape
            base_sum_mape_current += baseline_mape

            single_model_rmse = (act_toplot[point_x, point_y] -
                                 pred_toplot[point_x, point_y])**2
            single_base_rmse = (act_toplot[point_x, point_y] -
                                baseline_frame[point_x, point_y])**2

            single_point_model_sum_rmse += single_model_rmse
            single_point_base_sum_rmse += single_base_rmse

        log.write("\n\n ============")
        log.write("\n Round: %s" % str(k + 1))
        log.write("\nTotal Model RMSE: %s" %
                  (sqrt(model_sum_rmse_current / len_frame)))
        log.write("\nTotal Baseline RMSE: %s" %
                  (sqrt(base_sum_rmse_current / len_frame)))
        log.write("\nTotal Model MAE: %s" %
                  (model_sum_mae_current / len_frame))
        log.write("\nTotal Baseline MAE: %s" %
                  (base_sum_mae_current / len_frame))
        log.write("\nModel MAPE: %s" % (model_sum_mape_current / len_frame))
        log.write("\nBaseline MAPE: %s" % (base_sum_mape_current / len_frame))

        print("============")
        print("Round: %s" % str(k + 1))
        print("Total Model RMSE: %s" %
              (sqrt(model_sum_rmse_current / len_frame)))
        print("Total Baseline RMSE: %s" %
              (sqrt(base_sum_rmse_current / len_frame)))
        print("Total Model MAE: %s" % (model_sum_mae_current / len_frame))
        print("Total Baseline MAE: %s" % (base_sum_mae_current / len_frame))
        print("Model MAPE: %s" % (model_sum_mape_current / len_frame))
        print("Baseline MAPE: %s" % (base_sum_mape_current / len_frame))

    print("=" * 10)
    print("Total Model RMSE: %s" % (sqrt(model_sum_rmse /
                                         (len_frame * (end_seq - start_seq)))))
    print("Total Baseline RMSE: %s" %
          (sqrt(base_sum_rmse / (len_frame * (end_seq - start_seq)))))
    print("Total Model MAE: %s" % (model_sum_mae / (len_frame *
                                                    (end_seq - start_seq))))
    print("Total Baseline MAE: %s" % (base_sum_mae / (len_frame *
                                                      (end_seq - start_seq))))
    print("Model MAPE: %s" % (model_sum_mape / (len_frame *
                                                (end_seq - start_seq))))
    print("Baseline MAPE: %s" % (base_sum_mape / (len_frame *
                                                  (end_seq - start_seq))))

    print("Single Model RMSE: %s" % (sqrt(single_point_model_sum_rmse /
                                          (len_frame *
                                           (end_seq - start_seq)))))
    print("Single Baseline RMSE: %s" % (sqrt(single_point_base_sum_rmse /
                                             (len_frame *
                                              (end_seq - start_seq)))))

    log.write("\n\n Total:")
    log.write("\nTotal Model RMSE: %s" %
              (sqrt(model_sum_rmse / (len_frame * (end_seq - start_seq)))))
    log.write("\nTotal Baseline RMSE: %s" %
              (sqrt(base_sum_rmse / (len_frame * (end_seq - start_seq)))))
    log.write("\nTotal Model MAE: %s" % (model_sum_mae /
                                         (len_frame * (end_seq - start_seq))))
    log.write("\nTotal Baseline MAE: %s" %
              (base_sum_mae / (len_frame * (end_seq - start_seq))))
    log.write("\nModel MAPE: %s" % (model_sum_mape / (len_frame *
                                                      (end_seq - start_seq))))
    log.write("\nBaseline MAPE: %s" % (single_point_base_sum_rmse /
                                       (len_frame * (end_seq - start_seq))))
    log.close()

    for k in range(start_seq, end_seq, 80):
        pred_sequence_raw = sst_grid[k]
        new_frame = seq.predict(pred_sequence_raw[np.newaxis])
        pred_sequence = new_frame[0]
        act_sequence = sst_grid[k + len_frame]
        for i in range(len_frame):
            fig = plt.figure(figsize=(16, 8))

            ax = fig.add_subplot(321)
            ax.text(1, 3, 'Prediction', fontsize=12)
            pred_toplot = pp.inverse_normalization(pred_sequence[i, :, :, 0])
            im = plt.imshow(pred_toplot)
            cbar = plt.colorbar(im, orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            # use the seq-12 input frames as the baseline prediction
            baseline_frame = pp.inverse_normalization(pred_sequence_raw[i, :, :, 0])
            ax = fig.add_subplot(322)
            plt.text(1, 3, 'Baseline', fontsize=12)
            im = plt.imshow(baseline_frame)
            cbar = plt.colorbar(im, orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(323)
            plt.text(1, 3, 'Ground truth', fontsize=12)
            act_toplot = pp.inverse_normalization(act_sequence[i, :, :, 0])
            im = plt.imshow(act_toplot)
            cbar = plt.colorbar(im, orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(324)
            plt.text(1, 3, 'Ground truth', fontsize=12)
            im = plt.imshow(act_toplot)
            cbar = plt.colorbar(im, orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(325)
            plt.text(1, 3, 'Diff_Pred', fontsize=12)
            diff_toplot = act_toplot - pred_toplot
            im = plt.imshow(diff_toplot)
            cbar = plt.colorbar(im, orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            ax = fig.add_subplot(326)
            plt.text(1, 3, 'Diff_Base', fontsize=12)
            diff_toplot = act_toplot - baseline_frame
            im = plt.imshow(diff_toplot)
            cbar = plt.colorbar(im, orientation='horizontal')
            cbar.set_label('°C', fontsize=12)

            plt.savefig(fold_name + '/%s_%s_animate.png' % (str(k + 1), str(i + 1)))
            plt.close(fig)  # avoid accumulating open figures across iterations
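
pp.normalization and pp.inverse_normalization are not shown here; a minimal sketch, assuming a global min-max scaling of the SST grids with module-level bounds, is:

# hypothetical module-level bounds, e.g. taken once from sst_grid_raw
SST_MIN, SST_MAX = 0.0, 35.0

def normalization(grid):
    # scale a 2-D temperature grid into [0, 1]
    return (grid - SST_MIN) / (SST_MAX - SST_MIN)

def inverse_normalization(grid):
    # map a [0, 1] grid back to degrees Celsius
    return grid * (SST_MAX - SST_MIN) + SST_MIN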
Example #9
import ast
import numpy as np
import pandas as pd
import lightgbm as lgb
from hyperopt import fmin
from hyperopt import Trials
import preprocessing as pp
import feature_selection as fs

MAX_EVALS = 500
N_FOLDS = 10

df_total = pd.read_csv('train.csv', index_col=None, header=0, memory_map=True)
df_total = df_total.drop(['ID_code'], axis=1)
#df_total = df_total.sample(1000)
#df_total.index = range(len(df_total))
frame_train, frame_test = pp.train_test_split(df_total, 'target', 0.3)
frame_train = pp.normalization(frame_train, 'target')
frame_test = pp.normalization(frame_test, 'target')
X = frame_train.drop(['target'], axis=1)
y = frame_train['target']
X_pred = frame_test.drop(['target'], axis=1)
y_truth = frame_test['target']
X = np.array(X)
X_pred = np.array(X_pred)
train_set = lgb.Dataset(X, label=y)
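
The objective function below is truncated by the page extraction (its body runs into an unrelated text-classification snippet). A minimal, self-contained sketch of a LightGBM objective for hyperopt's fmin, assuming the train_set and N_FOLDS defined above and an illustrative search space, might look like:

from hyperopt import STATUS_OK, hp, tpe

# illustrative search space; the parameter names and ranges are assumptions
space = {
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
}

def objective_sketch(params, n_folds=N_FOLDS):
    # hyperopt samples quniform values as floats; LightGBM expects an int here
    params['num_leaves'] = int(params['num_leaves'])
    params['objective'] = 'binary'
    cv_results = lgb.cv(params, train_set, num_boost_round=200,
                        nfold=n_folds, seed=50, metrics='auc')
    # the result key is 'auc-mean' in LightGBM 3.x and 'valid auc-mean' in 4.x
    key = 'auc-mean' if 'auc-mean' in cv_results else 'valid auc-mean'
    best_auc = max(cv_results[key])
    # fmin minimizes, so return 1 - AUC as the loss
    return {'loss': 1 - best_auc, 'params': params, 'status': STATUS_OK}

# usage sketch:
# best = fmin(fn=objective_sketch, space=space, algo=tpe.suggest,
#             max_evals=MAX_EVALS, trials=Trials())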


def objective(params, n_folds=N_FOLDS):
    """Objective function for Gradient Boosting Machine Hyperparameter Optimization"""

    # Keep track of evals
    global ITERATION
    args = parser.parse_args()

    train = bool(args.train_model)
    predict = bool(args.predict)
    use_word2vec = bool(args.word2vec)

    df_train, df_test = pp.load_dataset()
    # clean words
    dataset_train = df_train[0].apply(pp.clean_word)
    dataset_test = df_test[0].apply(pp.clean_word)
    # remove punctuation
    dataset_train = dataset_train.apply(pp.clean_punct)
    dataset_test = dataset_test.apply(pp.clean_punct)
    # normalization
    dataset_train = dataset_train.apply(pp.normalization)
    dataset_test = dataset_test.apply(pp.normalization)

    # get the word-index mapping
    tokenizer = Tokenizer(num_words=2000, oov_token='<OOV>')
    tokenizer.fit_on_texts(list(dataset_train))
    word_index = tokenizer.word_index

    if train:

        # prepare and pad the sequences
        print(dataset_train)
        X_train = tokenizer.texts_to_sequences(dataset_train)
        X_train = sequence.pad_sequences(X_train,
                                         padding='post',
test_image, test_label = (test_data[2000:, 0], test_data[2000:, 1])
print('train_image shape: ', train_image.shape)
print('valid_image shape: ', valid_image.shape)
print('test_image shape: ', test_image.shape)
print('train_label shape: ', train_label.shape)
print('valid_label shape: ', valid_label.shape)
print('test_label shape: ', test_label.shape)

#clean memory
#ps.release_memory(array_of_img, test_of_img, train_cat, train_dog, test_cat, test_dog)

train_image, train_label = ps.processing(train_image, train_label)
valid_image, valid_label = ps.processing(valid_image, valid_label)
test_image, test_label = ps.processing(test_image, test_label)

train_image = ps.normalization(train_image)
valid_image = ps.normalization(valid_image)
test_image = ps.normalization(test_image)

print('train_image shape: ', train_image.shape)
print('train_label shape: ', train_label.shape)
print('train_label the first shape: ', train_label[0].shape)
print('train_label the first type: ', type(train_label[0]))
print('train_label the first content: ', train_label[0])

## 4. initialize variables
weights = None
biases = None
n_hidden1 = 3000
#n_hidden2 = 1800
alpha = 0.3
Example #12
]  # data = list of tuples (sim_name, has_damage, sim)
load_dataset()

sims, sims_labels = preprocessing.clean(data, min(orders))
if conf.separated_test:
    test_sims, _ = preprocessing.clean(test_data, min(orders))
del orders, data, test_data

results = []
n_fold = 1
if conf.separated_test:
    train = numpy.concatenate(sims, axis=0)
    train_lengths = [len(sim) for sim in sims]
    test = numpy.concatenate(test_sims, axis=0)
    test_lengths = [len(sim) for sim in test_sims]
    train[:, :-1], test[:, :-1] = preprocessing.normalization(train[:, :-1], test[:, :-1])
    run()
else:
    skf = StratifiedKFold(n_splits=5)  # number of folds
    for train_indexes, test_indexes in skf.split(sims, sims_labels):
        train, test, train_lengths, test_lengths = [], [], [], []
        for i in train_indexes:
            train.extend(spectrum for spectrum in sims[i])
            train_lengths.append(len(sims[i]))
        for i in test_indexes:
            test.extend(spectrum for spectrum in sims[i])
            test_lengths.append(len(sims[i]))
        train = numpy.array(train)
        test = numpy.array(test)
        train[:, :-1], test[:, :-1] = preprocessing.normalization(train[:, :-1], test[:, :-1])
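
The two-argument preprocessing.normalization(train, test) used here (and in Examples #6 and #7) is not shown; a minimal sketch, assuming a min-max scaler fitted on the training split only, is:

from sklearn.preprocessing import MinMaxScaler

def normalization(train, test):
    # fit the scaler on the training split only, then apply it to both,
    # so no information from the test split leaks into the scaling
    scaler = MinMaxScaler().fit(train)
    return scaler.transform(train), scaler.transform(test)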