Ejemplo n.º 1
0
def prepare_data(df_static, df_dynamic, static_feature, args):
    """Build the static feature matrix and patient-level labels.

    Both data frames are imputed, patient labels are loaded from the pickle
    cache (or assigned and cached when the cache is missing), patients are
    filtered by ICD code, and the matching rows of ``static_feature`` and
    ``static_label`` are returned.

    Args:
        df_static: Static (per-patient) data frame; passed to
            ``DataImputation.impute_static_dataframe``.
        df_dynamic: Dynamic (time-series) data frame; passed to
            ``DataImputation.impute_dynamic_dataframe``.
        static_feature: Pre-computed static feature frame. The columns
            ``AnesthesiaDuration``, ``Airway_1``, ``Airway_1_Time``,
            ``Airway_2``, ``Airway_2_Time``, ``EBL`` and ``Urine_Output``
            are dropped and the first remaining column is excluded from X.
        args: Namespace providing ``hypoxemia_thresh``, ``hypoxemia_window``
            and ``prediction_window``.

    Returns:
        Tuple ``(X, y, pos_rate)`` where ``pos_rate`` is the fraction of
        subgroup patients that are positive.

    Raises:
        ZeroDivisionError: If the ICD filter leaves no patients.
    """
    # label assignment (according to imputed SpO2)
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
    path_sta_label = 'data/label/static_label.pkl'
    path_dyn_label = 'data/label/dynamic_label.pkl'
    label_assign = LabelAssignment(hypoxemia_thresh=args.hypoxemia_thresh,
                                   hypoxemia_window=args.hypoxemia_window,
                                   prediction_window=args.prediction_window)
    if os.path.exists(path_sta_label) and os.path.exists(path_dyn_label):
        # NOTE(review): the cache path does not encode the label parameters,
        # so a cache produced with different ``args`` is reused silently.
        # Pickles are only safe to load from a trusted local cache.
        static_label = pd.read_pickle(path_sta_label)
    else:
        print('Assigning labels...')
        static_label, dynamic_label = label_assign.assign_label(
            df_static, df_dynamic)
        # Make sure the cache directory exists so the (expensive) label
        # assignment is not lost to a missing-directory error.
        os.makedirs(os.path.dirname(path_sta_label), exist_ok=True)
        os.makedirs(os.path.dirname(path_dyn_label), exist_ok=True)
        static_label.to_pickle(path_sta_label)
        dynamic_label.to_pickle(path_dyn_label)
        print('Done.')

    # Derive the positive pids on BOTH paths; previously this happened only on
    # a cache hit, so a cache miss failed with NameError further down.
    positive_pids = label_assign.get_positive_pids(static_label)

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode='exclude',
        include_icd=None,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25',
                       'Q26']).filter_by_icd()

    # The intersection is needed for both the report and the positive rate.
    num_positive_in_subgroup = len(set(subgroup_pids) & set(positive_pids))

    print('Positive Patient:', num_positive_in_subgroup,
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))
    pos_rate = num_positive_in_subgroup / len(subgroup_pids)
    print('Positive rate:', pos_rate)

    # select features with pid in subgroup as data matrix
    selected_idx = subgroup_pids
    static_feature = static_feature.drop(columns=[
        'AnesthesiaDuration', 'Airway_1', 'Airway_1_Time', 'Airway_2',
        'Airway_2_Time', 'EBL', 'Urine_Output'
    ])
    # NOTE(review): X is selected by position (.iloc) while y is selected by
    # index label (.loc); this only lines up if pids equal row positions and
    # static_label's index — confirm against the data loaders.
    X = static_feature.iloc[selected_idx, 1:]
    y = static_label.loc[selected_idx, 'label']

    return X, y, pos_rate
def prepare_data(df_static, df_dynamic):
    """Prepare per-patient time series and labels for training and testing.

    Imputes both frames, loads pre-computed label pickles, min-max scales the
    dynamic columns from position 3 onwards, filters patients by ICD code and
    splits the remaining pids 80/20 (stratified on the static label).

    Reads the module-level ``args`` for ``hypoxemia_window``.

    Returns:
        Tuple ``(timeSeriesTr, labelsTr, timeSeriesTe, labelsTe, pos_rate,
        dynamic_label)``.
    """
    # Labels are defined on imputed SpO2, so impute before anything else.
    data_imputer = DataImputation()
    df_static = data_imputer.impute_static_dataframe(df_static)
    df_dynamic = data_imputer.impute_dynamic_dataframe(df_dynamic)

    window_tag = str(args.hypoxemia_window)
    static_label_path = 'data/label/static_label_lstm_' + window_tag + '.pkl'
    dynamic_label_path = 'data/label/dynamic_label_lstm_' + window_tag + '.pkl'
    labeler = LabelAssignment(hypoxemia_thresh=90,
                              hypoxemia_window=args.hypoxemia_window,
                              prediction_window=5)

    # The label pickles must already exist: they are only read here, never
    # regenerated (that would be labeler.assign_label(df_static, df_dynamic)
    # followed by to_pickle on both results).
    static_label = pd.read_pickle(static_label_path)
    dynamic_label = pd.read_pickle(dynamic_label_path)
    positive_pids = labeler.get_positive_pids(static_label)
    print('Done.')

    # Min-max scale every dynamic column from position 3 onwards, in place.
    # NOTE(review): the scaler is fitted on all patients, before the split.
    scaler = preprocessing.MinMaxScaler()
    raw_values = df_dynamic.iloc[:, 3:].values
    df_dynamic.iloc[:, 3:] = scaler.fit_transform(raw_values)

    # Patients remaining after the ICD-based filter.
    high_risk_icd = ['J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
                     'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
                     'P27.9', 'J44', 'V46.1', 'Z99.1']  # High-risk group
    icd_filter = PatientFilter(df_static=df_static,
                               mode='exclude',
                               include_icd=high_risk_icd,
                               exclude_icd9=['745', '746', '747'],
                               exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'])
    subgroup_pids = icd_filter.filter_by_icd()

    # Stratified 80/20 split of the subgroup pids.
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_targets = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(subgroup_rows['pid'].values,
                                                 subgroup_targets,
                                                 test_size=0.2,
                                                 random_state=0,
                                                 stratify=subgroup_targets)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    num_subgroup = len(subgroup_pids)
    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)), '/', num_subgroup)
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - num_subgroup)

    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')

    # Row positions in dynamic_label that belong to each pid set.
    train_mask = dynamic_label['pid'].isin(pid_train).values
    test_mask = dynamic_label['pid'].isin(pid_test).values
    train_rows = list(np.where(train_mask)[0])
    test_rows = list(np.where(test_mask)[0])

    # First 21 columns of the dynamic frame, plus pid/label per patient.
    # NOTE(review): static_label is indexed by position (.iloc) here but by
    # label (.loc) for the split above — confirm pids equal row positions.
    timeSeriesTr = df_dynamic.iloc[train_rows, 0:21]
    timeSeriesTe = df_dynamic.iloc[test_rows, 0:21]
    labelsTr = static_label.iloc[pid_train][['pid', 'label']]
    labelsTe = static_label.iloc[pid_test][['pid', 'label']]

    num_pos = np.sum(labelsTr['label'].values) + np.sum(labelsTe['label'].values)
    num_all = len(labelsTr) + len(labelsTe)
    pos_rate = num_pos / num_all

    return timeSeriesTr, labelsTr, timeSeriesTe, labelsTe, pos_rate, dynamic_label
Ejemplo n.º 3
0
    print("Anesthesia Duration: {:.0f} ({:.0f}, {:.0f})".format(des_sta.loc['AnesthesiaDuration', '50%'],
                                                                des_sta.loc['AnesthesiaDuration', '25%'],
                                                                des_sta.loc['AnesthesiaDuration', '75%']))


# Load the raw static export plus the processed static/dynamic data frames.
raw = pd.read_csv('../data/raw_data/static_updated.csv')
static = pd.read_csv('../data/data_frame/static_dataframe.csv')
dynamic = pd.read_csv('../data/data_frame/dynamic_dataframe.csv')
# BMI = (WEIGHT / 1000) / (HEIGHT / 100)^2
# presumably WEIGHT is in grams and HEIGHT in centimetres — TODO confirm
static['BMI'] = (static['WEIGHT'] / 1000) / (static['HEIGHT'] / 100) / (static['HEIGHT'] / 100)

# Impute both frames and load the pre-computed label pickles; the
# LabelAssignment instance is only used here to derive the positive pids.
print('Assigning labels...')
imputer = DataImputation()
df_static = imputer.impute_static_dataframe(static)
df_dynamic = imputer.impute_dynamic_dataframe(dynamic)
label_assign = LabelAssignment(hypoxemia_thresh=90,
                               hypoxemia_window=10,
                               prediction_window=5)
static_label = pd.read_pickle('../data/label/static_label.pkl')
dynamic_label = pd.read_pickle('../data/label/dynamic_label.pkl')
positive_pids = label_assign.get_positive_pids(static_label)
print('Done.')

# get subgroup pids
# (mode='exclude' with the ICD-9 / ICD-10 code lists below; no include list)
subgroup_pids = PatientFilter(df_static=df_static,
                              mode='exclude',
                              include_icd=None,
                              exclude_icd9=['745', '746', '747'],
                              exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26']).filter_by_icd()
# Reload the processed frames from disk and recompute BMI.
# NOTE(review): presumably this restores the un-imputed values in case the
# imputer modified `static` / `dynamic` in place — confirm.
static = pd.read_csv('../data/data_frame/static_dataframe.csv')
static['BMI'] = (static['WEIGHT'] / 1000) / (static['HEIGHT'] / 100) / (static['HEIGHT'] / 100)
dynamic = pd.read_csv('../data/data_frame/dynamic_dataframe.csv')
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Build train/test matrices of dynamic features with per-row labels.

    Imputes both frames, loads the pre-computed label pickles, filters
    patients by ICD code, splits the remaining pids 90/10 (stratified on the
    static label) and selects the rows of ``dynamic_feature`` /
    ``dynamic_label`` that belong to each split and are not flagged
    ``if_to_drop``.

    Args:
        df_static: Static (per-patient) data frame.
        df_dynamic: Dynamic (time-series) data frame.
        dynamic_feature: Pre-computed per-row feature frame. The columns
            ``AnesthesiaDuration``, ``EBL`` and ``Urine_Output`` are dropped
            and the first two remaining columns are excluded from X.
            Rows are matched to ``dynamic_label`` by position.
        args: Namespace providing ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window``, ``filter_mode`` and ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, pos_rate)``; the training
        pair is shuffled with a fixed seed.
    """
    # label assignment (according to imputed SpO2)
    print('Assigning labels...')
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
    path_sta_label = 'data/result/static_label.pkl'
    path_dyn_label = 'data/result/dynamic_label.pkl'
    label_assign = LabelAssignment(hypoxemia_thresh=args.hypoxemia_thresh,
                                   hypoxemia_window=args.hypoxemia_window,
                                   prediction_window=args.prediction_window)

    # Labels are only read from the cache here; the pickles must have been
    # produced beforehand with label_assign.assign_label(...).to_pickle(...).
    static_label = pd.read_pickle(path_sta_label)
    dynamic_label = pd.read_pickle(path_dyn_label)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=[
            'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
            'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
            'P27.9', 'J44', 'V46.1', 'Z99.1'
        ],  # High-risk group
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25',
                       'Q26']).filter_by_icd()

    # split subgroup pids into training and test pid set
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_targets = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(
        subgroup_rows['pid'].values,
        subgroup_targets,
        test_size=0.1,
        random_state=args.random_state,
        stratify=subgroup_targets)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)),
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # Drop the local references to the imputed frames; they are not needed
    # below.
    del df_static, df_dynamic

    # select feature rows with pid in subgroup as data matrix
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    to_keep = (dynamic_label['if_to_drop'] == 0).values
    is_in_train = dynamic_label['pid'].isin(pid_train).values
    is_in_test = dynamic_label['pid'].isin(pid_test).values
    # Row POSITIONS (not index labels) of the rows kept for each split.
    selected_idx_train = list(np.where(to_keep & is_in_train)[0])
    selected_idx_test = list(np.where(to_keep & is_in_test)[0])

    # adjust features used (drop returns a new frame; the caller's is untouched)
    dynamic_feature = dynamic_feature.drop(
        columns=['AnesthesiaDuration', 'EBL', 'Urine_Output'])

    # split into training and test set
    X_train = dynamic_feature.iloc[selected_idx_train, 2:]
    X_test = dynamic_feature.iloc[selected_idx_test, 2:]
    # The indices are positional, so the labels must be taken with .iloc as
    # well; .loc would pick the wrong rows whenever dynamic_label's index is
    # not the default 0..n-1 range.
    y_train = dynamic_label['label'].iloc[selected_idx_train]
    y_test = dynamic_label['label'].iloc[selected_idx_test]

    # shuffle X and y
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # positive number
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    pos_rate = num_pos / num_all
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (num_pos / num_all * 100), '%')

    return X_train, X_test, y_train, y_test, pos_rate
Ejemplo n.º 5
0
import matplotlib.pyplot as plt
import argparse
import pickle
import sys

# Load the processed static/dynamic frames from the paths in the
# 'processed' section of the config.
df_static = pd.read_csv(config.get('processed', 'df_static_file'))
df_dynamic = pd.read_csv(config.get('processed', 'df_dynamic_file'))

# For every hypoxemia window 1..10, assign labels and record the positions
# of the dynamic rows labelled 1.
INDs = []
for hypoxemia_window in np.linspace(1, 10, 10).astype(int):
    print(hypoxemia_window)
    # NOTE(review): df_static / df_dynamic are rebound to the imputed result,
    # so from the second iteration on already-imputed frames are imputed
    # again. Hoisting this out of the loop is only safe if imputation is
    # idempotent — confirm before changing.
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
    label_assign = LabelAssignment(hypoxemia_thresh=90,
                                   hypoxemia_window=hypoxemia_window,
                                   prediction_window=5)
    static_label, dynamic_label = label_assign.assign_label(
        df_static, df_dynamic)

    labels = dynamic_label['label'].values
    # np.where with a single argument returns a tuple of index arrays, so
    # each entry of INDs is a one-element list holding one index array.
    INDs.append(list(np.where(labels == 1)))

# initialize arguments
parser = argparse.ArgumentParser(description='hypoxemia prediction')
parser.add_argument('--hypoxemia_thresh', type=int, default=90)
parser.add_argument('--hypoxemia_window', type=int, default=1)
parser.add_argument('--prediction_window', type=int, default=5)
parser.add_argument('--filter_mode', type=str, default='exclude')
parser.add_argument('--dynamic_feature_file',
                    type=str,
Ejemplo n.º 6
0
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Build train/test arrays of dynamic features with multi-label targets.

    Imputes both frames, assigns labels with
    ``LabelAssignment.assign_multi_label``, filters patients by ICD code,
    splits the remaining pids 90/10 (stratified on the static label) and
    selects the rows of ``dynamic_feature`` / ``dynamic_label`` that belong
    to each split and are not flagged ``if_to_drop``.

    Args:
        df_static: Static (per-patient) data frame.
        df_dynamic: Dynamic (time-series) data frame.
        dynamic_feature: Pre-computed per-row feature frame; its first two
            columns are excluded from X. Rows are matched to
            ``dynamic_label`` by position.
        args: Namespace providing ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window`` and ``filter_mode``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays; the
        training pair is shuffled without a fixed seed.
    """
    # label assignment (according to imputed SpO2)
    print('Assigning labels...')
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
    label_assign = LabelAssignment(hypoxemia_thresh=args.hypoxemia_thresh,
                                   hypoxemia_window=args.hypoxemia_window,
                                   prediction_window=args.prediction_window)
    static_label, dynamic_label = label_assign.assign_multi_label(
        df_static, df_dynamic)
    positive_pids = label_assign.get_positive_pids(static_label)
    print('Done.')

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=[
            'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
            'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
            'P27.9', 'J44', 'V46.1', 'Z99.1'
        ],  # High-risk group
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25',
                       'Q26']).filter_by_icd()

    # split subgroup pids into training and test pid set
    subgroup_rows = static_label.loc[subgroup_pids]
    subgroup_targets = subgroup_rows['label'].values
    pid_train, pid_test, _, _ = train_test_split(
        subgroup_rows['pid'].values,
        subgroup_targets,
        test_size=0.1,
        random_state=0,
        stratify=subgroup_targets)
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)),
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # Drop the local references to the imputed frames; they are not needed
    # below.
    del df_static, df_dynamic

    # select feature rows with pid in subgroup as data matrix
    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    to_keep = (dynamic_label['if_to_drop'] == 0).values
    is_in_train = dynamic_label['pid'].isin(pid_train).values
    is_in_test = dynamic_label['pid'].isin(pid_test).values
    # Row POSITIONS (not index labels) of the rows kept for each split.
    selected_idx_train = list(np.where(to_keep & is_in_train)[0])
    selected_idx_test = list(np.where(to_keep & is_in_test)[0])

    # split into training and test set
    X_train = dynamic_feature.iloc[selected_idx_train, 2:].values
    X_test = dynamic_feature.iloc[selected_idx_test, 2:].values
    # The indices are positional, so the labels must be taken with .iloc as
    # well; .loc would pick the wrong rows whenever dynamic_label's index is
    # not the default 0..n-1 range.
    y_train = dynamic_label['label'].iloc[selected_idx_train].values
    y_test = dynamic_label['label'].iloc[selected_idx_test].values

    # shuffle X and y
    # NOTE(review): no random_state is passed, so the training order differs
    # between runs.
    X_train, y_train = shuffle(
        X_train,
        y_train,
    )

    # positive number
    # NOTE(review): this sums label VALUES; it equals the number of positive
    # rows only if the multi-label scheme uses 0/1 values — confirm.
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (num_pos / num_all * 100), '%')

    return X_train, X_test, y_train, y_test
# Collect up to the first 10 rows of df_raw for every airway event type.
# The first entry of airway_list is always taken; later entries equal to it
# are skipped so its rows are not sampled twice. A single pd.concat replaces
# the repeated DataFrame.append calls (removed in pandas 2.0, and each call
# copied the whole frame).
_first_airway = airway_list[0]
_sampled_airways = [_first_airway] + [
    airway for airway in airway_list if airway != _first_airway
]
df = pd.concat(
    [df_raw[df_raw['Airway_Event1'] == airway][0:10]
     for airway in _sampled_airways])
index = df.index
# NOTE(review): 'Airway_Event1' is also still present in df.columns, so the
# reindexed frame lists it twice (first and at its original position) —
# confirm whether the duplicate column in the CSV is intended.
df = df.reindex(columns=['Airway_Event1'] + df.columns.to_list())
df.to_csv('../data/results/airway_samples.csv')

# label assignment (according to imputed SpO2)
# Both frames are imputed first, then labels are assigned with a threshold of
# 90, a hypoxemia window of 10 and a prediction window of 5.
print('Assigning labels...')
imputer = DataImputation()
df_static = imputer.impute_static_dataframe(df_static)
df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
label_assign = LabelAssignment(hypoxemia_thresh=90,
                               hypoxemia_window=10,
                               prediction_window=5)
static_label, dynamic_label = label_assign.assign_label(df_static, df_dynamic)
positive_pids = label_assign.get_positive_pids(static_label)
print('Done.')

# get subgroup pids
# (mode='exclude' with the ICD-9 / ICD-10 code lists below; no include list)
subgroup_pids = PatientFilter(df_static=df_static,
                              mode='exclude',
                              include_icd=None,
                              exclude_icd9=['745', '746', '747'],
                              exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26']).filter_by_icd()

# Report how many subgroup patients are positive, the overall positive count
# before filtering, and how many patients the filter removed.
print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)), '/', len(subgroup_pids))
print('Before trimming:', len(positive_pids), '/', len(df_static))
print('Trimmed cases:', len(df_static) - len(subgroup_pids))