Ejemplo n.º 1
0
def get_feature_names():
    """Return the training feature column labels with readable names.

    Loads the ``training_is_0.csv`` split via ``data_util.get_poor_god``
    (``sub_class='ischemic'``), renames a fixed set of raw column codes to
    display labels in place, and returns the resulting column labels.
    Columns that are not listed in the mapping keep their original names.

    Returns:
        ``columns.values`` of the renamed feature frame.
    """
    # Raw column code -> label used for display.
    display_labels = {
        'MRS_TX_1': '30-day mRS',
        'discharged_mrs': 'Discharge mRS',
        'Toilet_use': 'Toilet use',
        'Bowel_control': 'Bowel control',
        'Bladder_control': 'Bladder control',
        'TRMNG_FL': 'Nasogastric tube',
        'TRMRE_FL': 'Rehab',
        'OFFDT_ID_1': 'Discharge to Home',
        'NIHS_6aL_out': 'Discharge NIHSS 6aL',
        'NIHS_6aL_in': 'Admission NIHSS 6aL',
        'NIHS_6bR_out': 'Discharge NIHSS 6bR',
        'NIHS_10_out': 'Discharge NIHSS 10',
        'NIHS_5aL_out': 'Discharge NIHSS 5aL',
        'NIHS_5bR_out': 'Discharge NIHSS 5bR',
        'NIHS_1b_out': 'Discharge NIHSS 1b',
        'NIHS_9_out': 'Discharge NIHSS 9',
        'NIHS_5aL_in': 'Admission NIHSS 5aL',
        'NIHS_1b_in': 'Admission NIHSS 1b',
    }
    # Only the feature frame is needed; ids and labels are discarded.
    _, features, _ = data_util.get_poor_god('training_is_0.csv',
                                            sub_class='ischemic')
    features.rename(columns=display_labels, inplace=True)
    return features.columns.values
Ejemplo n.º 2
0
    # Per-fold accumulators.  NOTE(review): the enclosing function header and
    # the definitions of `kfold`, `parameter` and `history_array` are outside
    # this excerpt.
    test_loss_array = []
    predict_array = []
    # ====== Multi-classes
    # x_data, y_data = data_util.get_individual('wholeset_Jim_nomissing_validated.csv')
    # for index, (train, test) in enumerate(kfold.split(x_data, y_data)):
    #     history, model = mlp_multi(data_util.scale(x_data.iloc[train]),
    #                                to_categorical(y_data.iloc[train]),
    #                                parameter)
    #     history_array.append(history)
    #     loss, acc = model.evaluate(data_util.scale(x_data.iloc[test]),
    #                                to_categorical(y_data.iloc[test]),
    #                                verbose=0)
    #     test_acc_array.append(acc)
    #     test_loss_array.append(loss)
    # ====== Binary
    id_data, x_data, y_data = data_util.get_poor_god(
        'wholeset_Jim_nomissing_validated.csv')
    # One model is trained and evaluated per fold produced by `kfold`.
    for index, (train, test) in enumerate(kfold.split(x_data, y_data)):
        x_train_cnn, x_train_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[train])
        # x_train = data_util.kera_feature(x_data.iloc[train])
        # Only the "cnn" half of the split is fed to the model here;
        # x_train_mlp is not used in the visible lines.
        history, model = mlp_binary(data_util.scale(x_train_cnn),
                                    to_categorical(y_data.iloc[train]),
                                    parameter)
        history_array.append(history)

        x_test_cnn, x_test_mlp = data_util.split_cnn_mlp_input(
            x_data.iloc[test])
        # x_test = data_util.kera_feature(x_data.iloc[test])
        # Evaluate on the held-out fold, again using only the "cnn" half.
        loss, acc = model.evaluate(data_util.scale(x_test_cnn),
                                   to_categorical(y_data.iloc[test]),
                                   verbose=0)
Ejemplo n.º 3
0
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from my_utils import data_util

if __name__ == '__main__':
    # Prints an ExtraTrees feature-importance ranking for every CV fold of
    # every hold-out round of the chosen stroke subtype.
    subtype = 'he'
    # The subtype never changes between rounds, so resolve its class once.
    sub_class = 'ischemic' if subtype == 'is' else 'hemorrhagic'
    # hold_out_round = 1
    for hold_out_round in range(10):
        training_file = 'training_' + subtype + '_' + str(hold_out_round) + '.csv'
        id_train_all, x_train_all, y_train_all = data_util.get_poor_god(
            training_file, sub_class=sub_class)
        feature_names = x_train_all.columns.values
        forest = ExtraTreesClassifier()
        kfold = StratifiedKFold(n_splits=10,
                                shuffle=True,
                                random_state=hold_out_round)
        for train, _ in kfold.split(x_train_all, y_train_all):
            # Fit on the training part of the fold only.
            x_train = data_util.scale(x_train_all.iloc[train])
            y_train = y_train_all.iloc[train]
            forest.fit(x_train, y_train)
            importances = forest.feature_importances_
            # Spread of each feature's importance across the individual trees.
            per_tree = [tree.feature_importances_ for tree in forest.estimators_]
            std = np.std(per_tree, axis=0)
            # Feature positions ordered from most to least important.
            indices = np.argsort(importances)[::-1]
            # Print the feature ranking
            imp = []
            print("Feature ranking:")
            for rank, feature_pos in enumerate(indices, start=1):
                print("%d. feature %d (%f) %s" % (rank, feature_pos, importances[feature_pos], feature_names[feature_pos]))
Ejemplo n.º 4
0
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt  #Data visualisation libraries
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from my_utils import data_util, performance_util
from sklearn.metrics import roc_auc_score

# Single-feature baseline: logistic regression on 'MRS_TX_1' alone, fitted on
# the 'training_is_9.csv' split and scored on the 'hold_is_9.csv' split.
id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
    'training_is_9.csv', sub_class='ischemic')
id_hold, x_hold, y_hold = data_util.get_poor_god('hold_is_9.csv',
                                                 sub_class='ischemic')
lm = LogisticRegression()
x = x_train_all[['MRS_TX_1']]
y = y_train_all
lm.fit(x, y)
test_x = x_hold[['MRS_TX_1']]
predictions = lm.predict(test_x)
# ROC AUC must be computed from continuous scores.  Passing hard class labels
# (lm.predict) collapses the ROC curve to a single operating point and
# misreports the model's ranking quality, so use the probability of the
# positive class (second column, matching lm.classes_[1]).
# NOTE(review): assumes y_hold is binary — confirm against data_util.
positive_scores = lm.predict_proba(test_x)[:, 1]
logit_roc_auc = roc_auc_score(y_hold, positive_scores)
print(logit_roc_auc)
Ejemplo n.º 5
0
# Columns removed for the "nf" experiment variants.
# NOTE(review): what "nf" stands for is not visible in this file — confirm.
_SVM_NF_DROP_COLUMNS = [
    'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
    'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
]

# experiment id -> (result sub-directory, model-name tag,
#                   apply feature selection, drop the nf columns)
_SVM_EXPERIMENTS = {
    0: ('all', 'svm_', False, False),
    1: ('fs', 'svm_fs_', True, False),
    2: ('all_nf', 'svm_nf_', False, True),
}
# Any other experiment id gets feature selection plus the nf column drop.
_SVM_DEFAULT_EXPERIMENT = ('fs_nf', 'svm_fs_nf_', True, True)


def _svm_select_features(features, sub_class, use_fs, drop_nf):
    """Apply the experiment's feature selection and/or column drop to a frame."""
    if use_fs:
        features = data_util.feature_selection(features, sub_class)
    if drop_nf:
        features = features.drop(_SVM_NF_DROP_COLUMNS,
                                 errors='ignore',
                                 axis=1)
    return features


def _svm_save_predictions(ids, labels, probas, csv_path):
    """Write ids, true labels and both probability columns to ``csv_path``."""
    # Copy first: ``ids`` is a slice of (or the very same object as) the
    # caller's id frame, and adding columns to it directly is a chained
    # assignment that can also leak the new columns into the caller's data.
    result = ids.copy()
    result['label'] = labels
    result['0'] = probas[:, 0]
    result['1'] = probas[:, 1]
    result.to_csv(csv_path, sep=',', encoding='utf-8')


def do_svm(hold_out_round, sub_class, experiment):
    """Run 10-fold CV of a linear SVM for one hold-out round, then score the hold-out set.

    For every fold the classifier is fitted on the training part, the
    predicted probabilities for the training and test parts are written to
    CSV, the test accuracy is recorded and the fitted model is saved through
    ``performance_util.save_model``.  The model of the fold with the highest
    test accuracy is then reloaded and its probabilities on the hold-out set
    are written to ``<model_name>_hold.csv``.

    Args:
        hold_out_round: Round number; selects the input CSV files and is used
            as the seed for NumPy, the fold splitter and the classifier.
        sub_class: ``'ischemic'`` reads the ``*_is_*`` files, any other value
            the ``*_he_*`` files.  Also passed on to ``data_util``.
        experiment: ``0`` all features, ``1`` feature selection, ``2`` all
            features minus the nf columns, anything else feature selection
            minus the nf columns.

    Returns:
        None.  Results are written below ``../result/svm/<variant>/``.
    """
    np.random.seed(hold_out_round)
    round_tag = str(hold_out_round)
    file_tag = 'is' if sub_class == 'ischemic' else 'he'
    id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
        'training_' + file_tag + '_' + round_tag + '.csv',
        sub_class=sub_class)
    id_hold, x_hold, y_hold = data_util.get_poor_god(
        'hold_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)

    subdir, name_tag, use_fs, drop_nf = _SVM_EXPERIMENTS.get(
        experiment, _SVM_DEFAULT_EXPERIMENT)
    x_train_all = _svm_select_features(x_train_all, sub_class, use_fs,
                                       drop_nf)
    x_hold = _svm_select_features(x_hold, sub_class, use_fs, drop_nf)
    # The trailing empty component keeps the trailing separator.
    save_path = os.sep.join(('..', 'result', 'svm', subdir, ''))
    model_name = name_tag + sub_class + '_h_' + round_tag
    # Make sure the CSV writes below cannot fail on a missing directory.
    os.makedirs(save_path, exist_ok=True)

    test_acc_array = []
    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=hold_out_round)
    classifier = SVC(kernel='linear',
                     probability=True,
                     random_state=hold_out_round,
                     verbose=True)
    for index, (train, test) in enumerate(kfold.split(x_train_all,
                                                      y_train_all)):
        fold_tag = str(index)
        # Training
        x_train = data_util.scale(x_train_all.iloc[train])
        y_train = y_train_all.iloc[train]
        # Testing
        x_test = data_util.scale(x_train_all.iloc[test])
        y_test = y_train_all.iloc[test]
        # train on 90% training
        classifier.fit(x_train, y_train)
        _svm_save_predictions(
            id_train_all.iloc[train], y_train,
            classifier.predict_proba(x_train),
            save_path + model_name + '_train_cv' + fold_tag + '.csv')
        # Evaluation on 10% training
        _svm_save_predictions(
            id_train_all.iloc[test], y_test,
            classifier.predict_proba(x_test),
            save_path + model_name + '_test_cv' + fold_tag + '.csv')
        test_acc = accuracy_score(y_test, classifier.predict(x_test))
        test_acc_array.append(test_acc)
        # Persist this fold's model so the best one can be reloaded later.
        performance_util.save_model(classifier, model_name + '_' + fold_tag)
    print('10-CV Done')
    # --
    # First fold with the highest test accuracy wins.
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_ml_model(model_name, best_model_inx)
    x_hold = data_util.scale(x_hold)
    _svm_save_predictions(id_hold, y_hold, hold_model.predict_proba(x_hold),
                          save_path + model_name + '_hold.csv')
    print('hold-out Done')
Ejemplo n.º 6
0
# Columns removed for the "nf" experiment variants.
# NOTE(review): what "nf" stands for is not visible in this file — confirm.
_MLP_CNN_NF_DROP_COLUMNS = [
    'FLU_ID_1_1.0', 'FLU_ID_1_2.0', 'FLU_ID_1_3.0', 'FLU_ID_1_4.0',
    'FLU_ID_1_5.0', 'FLU_ID_1_6.0', 'VERS_1', 'VEIHD_1', 'MRS_TX_1'
]

# experiment id -> (result sub-directory, model-name tag,
#                   load the selected-feature names)
_MLP_CNN_EXPERIMENTS = {
    0: ('all', 'mlp_cnn_', False),
    1: ('fs', 'mlp_cnn_fs_', True),
    2: ('all_nf', 'mlp_cnn_nf_', False),
}
# Any other experiment id is named and stored as the "fs_nf" variant.
_MLP_CNN_DEFAULT_EXPERIMENT = ('fs_nf', 'mlp_cnn_fs_nf_', True)


def _mlp_cnn_drop_nf(cnn_part, mlp_part):
    """Drop the nf columns (where present) from both input frames."""
    return (cnn_part.drop(_MLP_CNN_NF_DROP_COLUMNS, errors='ignore', axis=1),
            mlp_part.drop(_MLP_CNN_NF_DROP_COLUMNS, errors='ignore', axis=1))


def _mlp_cnn_prepare_inputs(features, experiment, selected_features):
    """Split a feature frame into CNN/MLP inputs, filter columns, scale both."""
    cnn_part, mlp_part = data_util.split_cnn_mlp_input(features)
    if experiment == 2:
        cnn_part, mlp_part = _mlp_cnn_drop_nf(cnn_part, mlp_part)
    # NOTE: filtering is keyed on experiment 1/3 exactly, while naming treats
    # every id other than 0/1/2 as "fs_nf"; this mirrors the original logic.
    if experiment == 1 or experiment == 3:
        cnn_part, mlp_part = data_util.selected_cnn_mlp_input(
            cnn_part, mlp_part, selected_features)
        if experiment == 3:
            cnn_part, mlp_part = _mlp_cnn_drop_nf(cnn_part, mlp_part)
    return data_util.scale(cnn_part), data_util.scale(mlp_part)


def _mlp_cnn_save_predictions(ids, labels, probas, csv_path):
    """Write ids, true labels and both probability columns to ``csv_path``."""
    # Copy first: ``ids`` is a slice of (or the very same object as) the
    # caller's id frame, and adding columns to it directly is a chained
    # assignment that can also leak the new columns into the caller's data.
    result = ids.copy()
    result['label'] = labels
    result['0'] = probas[:, 0]
    result['1'] = probas[:, 1]
    result.to_csv(csv_path, sep=',', encoding='utf-8')


def do_mlp_cnn(hold_out_round, sub_class, experiment):
    """Run 10-fold CV of the combined MLP/CNN model, then score the hold-out set.

    For every fold a model is trained with ``mlp_cnn_binary``, its training
    history is saved, the predicted probabilities for the training and test
    parts are written to CSV and the test loss/accuracy are recorded.  After
    all folds the per-fold metrics are saved, the model of the fold with the
    highest test accuracy is reloaded through
    ``performance_util.load_nn_model`` and its probabilities on the hold-out
    set are written to ``<model_name>_hold.csv``.

    Args:
        hold_out_round: Round number; selects the input CSV files and is used
            as the seed for NumPy and the fold splitter.
        sub_class: ``'ischemic'`` reads the ``*_is_*`` files, any other value
            the ``*_he_*`` files.  Also passed on to ``data_util``.
        experiment: ``0`` all features, ``1`` selected features, ``2`` all
            features minus the nf columns, ``3`` selected features minus the
            nf columns.

    Returns:
        None.  Results are written below ``../result/mlp_cnn/<variant>/``.
    """
    np.random.seed(hold_out_round)
    round_tag = str(hold_out_round)
    file_tag = 'is' if sub_class == 'ischemic' else 'he'
    id_train_all, x_train_all, y_train_all = data_util.get_poor_god_downsample(
        'training_' + file_tag + '_' + round_tag + '.csv',
        sub_class=sub_class)
    id_hold, x_hold, y_hold = data_util.get_poor_god(
        'hold_' + file_tag + '_' + round_tag + '.csv', sub_class=sub_class)

    subdir, name_tag, needs_selection = _MLP_CNN_EXPERIMENTS.get(
        experiment, _MLP_CNN_DEFAULT_EXPERIMENT)
    # The trailing empty component keeps the trailing separator.
    save_path = os.sep.join(('..', 'result', 'mlp_cnn', subdir, ''))
    selected_features = None
    if needs_selection:
        selected_features = data_util.get_selected_feature_name(sub_class)
    parameter = {
        'model_name': name_tag + sub_class + '_h_' + round_tag,
        'size_of_batch': 56,
        'nb_epoch': 150,
        'drop_rate': 0.5
    }
    # Make sure the CSV writes below cannot fail on a missing directory.
    os.makedirs(save_path, exist_ok=True)
    output_prefix = save_path + parameter['model_name']

    test_acc_array = []
    test_loss_array = []
    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=hold_out_round)
    for index, (train, test) in enumerate(kfold.split(x_train_all,
                                                      y_train_all)):
        fold_tag = str(index)
        # training
        x_train_cnn, x_train_mlp = _mlp_cnn_prepare_inputs(
            x_train_all.iloc[train], experiment, selected_features)
        y_train = y_train_all.iloc[train]

        # Testing
        x_test_cnn, x_test_mlp = _mlp_cnn_prepare_inputs(
            x_train_all.iloc[test], experiment, selected_features)
        x_test_cnn = np.expand_dims(x_test_cnn, 2)
        y_test = y_train_all.iloc[test]

        # train on 90% training
        # The CNN input is handed to mlp_cnn_binary without the extra axis;
        # it is only expanded afterwards for model.predict, as in the
        # original flow.
        history, model = mlp_cnn_binary(x_train_cnn, x_train_mlp,
                                        to_categorical(y_train), parameter,
                                        index)
        performance_util.save_train_validation(output_prefix, history, 'acc',
                                               fold_tag)
        x_train_cnn = np.expand_dims(x_train_cnn, 2)
        _mlp_cnn_save_predictions(
            id_train_all.iloc[train], y_train,
            model.predict([x_train_cnn, x_train_mlp]),
            output_prefix + '_train_cv' + fold_tag + '.csv')
        # Evaluation on 10% training
        _mlp_cnn_save_predictions(
            id_train_all.iloc[test], y_test,
            model.predict([x_test_cnn, x_test_mlp]),
            output_prefix + '_test_cv' + fold_tag + '.csv')

        loss, acc = model.evaluate([x_test_cnn, x_test_mlp],
                                   to_categorical(y_test),
                                   verbose=0)
        test_acc_array.append(acc)
        test_loss_array.append(loss)
        # plot_fig.plot_acc_loss(history, 'acc')
    performance_util.save_test(output_prefix, test_acc_array,
                               test_loss_array)
    print('10-CV Done')
    # --
    # First fold with the highest test accuracy wins.
    best_model_inx = test_acc_array.index(max(test_acc_array))
    hold_model = performance_util.load_nn_model(parameter['model_name'],
                                                best_model_inx)
    x_hold_cnn, x_hold_mlp = _mlp_cnn_prepare_inputs(x_hold, experiment,
                                                     selected_features)
    x_hold_cnn = np.expand_dims(x_hold_cnn, 2)
    _mlp_cnn_save_predictions(id_hold, y_hold,
                              hold_model.predict([x_hold_cnn, x_hold_mlp]),
                              output_prefix + '_hold.csv')
    print('hold-out Done')
Ejemplo n.º 7
0
        else:
            all_selected_features = np.append(all_selected_features,
                                              selected_f_names)
    feature_dict = Counter(all_selected_features)
    # use to draw heatmap
    return list(feature_dict.keys())


if __name__ == '__main__':
    # Collects, for each of the ten hold-out rounds, the mean importance of
    # every feature into one frame (`robust_f_df`), one column per round.
    # Just get the feature names
    subtype = 'he'
    if subtype == 'is':
        sub_class = 'ischemic'
    else:
        sub_class = 'hemorrhagic'
    id_train_all, x_train_all, y_train_all = data_util.get_poor_god(
        'training_' + subtype + '_0.csv', sub_class=sub_class)
    # NOTE(review): feature_names is not used in the lines visible here.
    feature_names = x_train_all.columns.values
    #
    for hold_out_round in range(0, 10, 1):
        # Same value as computed above; subtype does not change in the loop.
        if subtype == 'is':
            sub_class = 'ischemic'
        else:
            sub_class = 'hemorrhagic'
        # Per-round importance table, read from the current working directory.
        df = pd.read_csv('f_' + subtype + '_' + str(hold_out_round) + '.csv',
                         encoding='utf8')
        # Row-wise mean over every column except 'f_index'.
        mean_importance = df.drop(['f_index'], axis=1).mean(axis=1)
        if hold_out_round == 0:
            # First round: create the result frame keyed by the feature index.
            robust_f_df = pd.DataFrame(data={'f_index': df['f_index']})
            robust_f_df['rf' + str(hold_out_round)] = mean_importance
        else:
            # Later rounds: append one more 'rf<round>' column.
            robust_f_df['rf' + str(hold_out_round)] = mean_importance