Example #1
0
def prepare_data(df_static, df_dynamic, static_feature, args):
    """Build the static feature matrix, labels and positive rate for the cohort.

    The static and dynamic frames are imputed, labels are loaded from the
    pickle cache (or assigned and cached when either cache file is missing),
    patients are filtered by ICD code, and the static features of the
    remaining patients are returned together with their labels.

    Args:
        df_static: static dataframe, passed to the imputer, the label
            assignment and the patient filter.
        df_dynamic: dynamic dataframe, passed to the imputer and the label
            assignment.
        static_feature: dataframe of static features; the columns listed in
            the ``drop`` call below and the first remaining column are
            removed from the returned matrix.
        args: object providing ``hypoxemia_thresh``, ``hypoxemia_window`` and
            ``prediction_window``.

    Returns:
        Tuple ``(X, y, pos_rate)``: the selected feature rows, the matching
        ``'label'`` values and the fraction of subgroup pids that are
        positive.
    """
    # label assignment (according to imputed SpO2)
    imputer = DataImputation()
    df_static = imputer.impute_static_dataframe(df_static)
    df_dynamic = imputer.impute_dynamic_dataframe(df_dynamic)
    path_sta_label = 'data/label/static_label.pkl'
    path_dyn_label = 'data/label/dynamic_label.pkl'
    label_assign = LabelAssignment(
        hypoxemia_thresh=args.hypoxemia_thresh,
        hypoxemia_window=args.hypoxemia_window,
        prediction_window=args.prediction_window)
    # NOTE(review): the cache file names do not encode the thresholds taken
    # from args, so labels cached under other settings would be reused here.
    if os.path.exists(path_sta_label) and os.path.exists(path_dyn_label):
        static_label = pd.read_pickle(path_sta_label)
    else:
        print('Assigning labels...')
        static_label, dynamic_label = label_assign.assign_label(
            df_static, df_dynamic)
        static_label.to_pickle(path_sta_label)
        dynamic_label.to_pickle(path_dyn_label)
        print('Done.')
    # Derive the positive pids on BOTH paths: previously this was done only
    # when the cache existed, so a first (uncached) run hit a NameError below.
    positive_pids = label_assign.get_positive_pids(static_label)

    # get subgroup pids
    subgroup_pids = PatientFilter(
        df_static=df_static,
        mode='exclude',
        include_icd=None,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25',
                       'Q26']).filter_by_icd()

    # positives that survive the ICD filter (computed once, reused below)
    num_subgroup_pos = len(set(subgroup_pids) & set(positive_pids))
    print('Positive Patient:', num_subgroup_pos, '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))
    pos_rate = num_subgroup_pos / len(subgroup_pids)
    print('Positive rate:', pos_rate)

    # select features with pid in subgroup as data matrix
    selected_idx = subgroup_pids
    static_feature = static_feature.drop(columns=[
        'AnesthesiaDuration', 'Airway_1', 'Airway_1_Time', 'Airway_2',
        'Airway_2_Time', 'EBL', 'Urine_Output'
    ])
    # NOTE(review): features are selected by position (.iloc) while labels
    # are selected by index label (.loc); this assumes pids coincide with
    # both -- TODO confirm.
    X = static_feature.iloc[selected_idx, 1:]
    y = static_label.loc[selected_idx, 'label']

    return X, y, pos_rate
def prepare_data(df_static, df_dynamic):
    '''Assemble train/test time series and patient labels.

    Imputes both frames, loads the cached label pickles for the configured
    hypoxemia window, min-max scales the dynamic columns from position 3
    onwards, filters patients by ICD code and makes a stratified 80/20 split
    of the remaining pids.

    Note: ``args`` is not a parameter; it is resolved from the enclosing
    (module) scope and must provide ``hypoxemia_window``.

    Returns:
        ``(timeSeriesTr, labelsTr, timeSeriesTe, labelsTe, pos_rate,
        dynamic_label)``.
    '''
    # labels are defined on imputed SpO2, so impute first
    imputation = DataImputation()
    df_static = imputation.impute_static_dataframe(df_static)
    df_dynamic = imputation.impute_dynamic_dataframe(df_dynamic)

    # cached label files are keyed by the hypoxemia window
    static_label_file = 'data/label/static_label_lstm_' + str(args.hypoxemia_window) + '.pkl'
    dynamic_label_file = 'data/label/dynamic_label_lstm_' + str(args.hypoxemia_window) + '.pkl'
    labeler = LabelAssignment(hypoxemia_thresh=90,
                              hypoxemia_window=args.hypoxemia_window,
                              prediction_window=5)

    static_label = pd.read_pickle(static_label_file)
    dynamic_label = pd.read_pickle(dynamic_label_file)
    positive_pids = labeler.get_positive_pids(static_label)
    print('Done.')

    # scale every dynamic column from position 3 onwards into [0, 1]
    scaler = preprocessing.MinMaxScaler()
    df_dynamic.iloc[:, 3:] = scaler.fit_transform(df_dynamic.iloc[:, 3:].values)

    # ICD-based cohort selection
    high_risk_icd = [
        'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
        'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
        'P27.9', 'J44', 'V46.1', 'Z99.1',
    ]
    patient_filter = PatientFilter(
        df_static=df_static,
        mode='exclude',
        include_icd=high_risk_icd,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'],
    )
    subgroup_pids = patient_filter.filter_by_icd()

    # stratified split of the cohort pids; only the pid halves are needed
    cohort = static_label.loc[subgroup_pids]
    cohort_labels = cohort['label'].values
    pid_train, pid_test = train_test_split(
        cohort['pid'].values,
        cohort_labels,
        test_size=0.2,
        random_state=0,
        stratify=cohort_labels,
    )[:2]
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)), '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    # row positions in dynamic_label whose pid falls in each split
    train_mask = dynamic_label['pid'].isin(pid_train).values
    test_mask = dynamic_label['pid'].isin(pid_test).values
    train_rows = list(np.flatnonzero(train_mask))
    test_rows = list(np.flatnonzero(test_mask))

    # first 21 columns of the dynamic frame; labels looked up by pid position
    label_columns = ['pid', 'label']
    timeSeriesTr = df_dynamic.iloc[train_rows, 0:21]
    labelsTr = static_label.iloc[pid_train][label_columns]
    timeSeriesTe = df_dynamic.iloc[test_rows, 0:21]
    labelsTe = static_label.iloc[pid_test][label_columns]

    # share of positive patients over both splits
    num_pos = np.sum(labelsTr['label'].values) + np.sum(labelsTe['label'].values)
    num_all = len(labelsTr) + len(labelsTe)
    pos_rate = num_pos / num_all

    return timeSeriesTr, labelsTr, timeSeriesTe, labelsTe, pos_rate, dynamic_label
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Build shuffled training and held-out test sets from dynamic features.

    Imputes both frames, loads the cached label pickles, filters patients by
    ICD code, makes a stratified 90/10 split of the remaining pids and picks
    the dynamic rows that belong to each split and are not flagged
    ``if_to_drop``.

    Args:
        df_static: static dataframe (imputed, then used for filtering).
        df_dynamic: dynamic dataframe (imputed only).
        dynamic_feature: per-row feature frame; three columns are dropped by
            name and the first two remaining columns are excluded.
        args: provides ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window``, ``filter_mode`` and ``random_state``.

    Returns:
        ``(X_train, X_test, y_train, y_test, pos_rate)``.
    """
    # label assignment (according to imputed SpO2)
    print('Assigning labels...')
    imputation = DataImputation()
    df_static = imputation.impute_static_dataframe(df_static)
    df_dynamic = imputation.impute_dynamic_dataframe(df_dynamic)
    static_label_file = 'data/result/static_label.pkl'
    dynamic_label_file = 'data/result/dynamic_label.pkl'
    labeler = LabelAssignment(
        hypoxemia_thresh=args.hypoxemia_thresh,
        hypoxemia_window=args.hypoxemia_window,
        prediction_window=args.prediction_window,
    )

    # labels come from the pickle cache rather than being recomputed
    static_label = pd.read_pickle(static_label_file)
    dynamic_label = pd.read_pickle(dynamic_label_file)
    positive_pids = labeler.get_positive_pids(static_label)
    print('Done.')

    # ICD-based cohort selection
    high_risk_icd = [
        'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
        'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
        'P27.9', 'J44', 'V46.1', 'Z99.1',
    ]
    patient_filter = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=high_risk_icd,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'],
    )
    subgroup_pids = patient_filter.filter_by_icd()

    # stratified split of the cohort pids; only the pid halves are needed
    cohort = static_label.loc[subgroup_pids]
    cohort_labels = cohort['label'].values
    pid_train, pid_test = train_test_split(
        cohort['pid'].values,
        cohort_labels,
        test_size=0.1,
        random_state=args.random_state,
        stratify=cohort_labels,
    )[:2]
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)),
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # the input frames are not referenced again below
    del df_static, df_dynamic

    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    # keep only rows not flagged for dropping, then assign them by pid
    keep_mask = dynamic_label['if_to_drop'].eq(0).values
    train_mask = dynamic_label['pid'].isin(pid_train).values
    test_mask = dynamic_label['pid'].isin(pid_test).values
    selected_idx_train = list(np.flatnonzero(keep_mask & train_mask))
    selected_idx_test = list(np.flatnonzero(keep_mask & test_mask))

    # adjust features used
    unused_columns = ['AnesthesiaDuration', 'EBL', 'Urine_Output']
    dynamic_feature = dynamic_feature.drop(columns=unused_columns)

    # features by row position (first two columns excluded), labels by index
    X_train = dynamic_feature.iloc[selected_idx_train, 2:]
    y_train = dynamic_label.loc[selected_idx_train, 'label']
    X_test = dynamic_feature.iloc[selected_idx_test, 2:]
    y_test = dynamic_label.loc[selected_idx_test, 'label']

    # only the training set is shuffled, with a fixed seed
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    # positive share over both splits
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    pos_rate = num_pos / num_all
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (pos_rate * 100), '%')

    return X_train, X_test, y_train, y_test, pos_rate
Example #4
0
                                                  des_sta.loc['Urine_Output', '75%']))

    # Anesthesia Duration
    print("Anesthesia Duration: {:.0f} ({:.0f}, {:.0f})".format(des_sta.loc['AnesthesiaDuration', '50%'],
                                                                des_sta.loc['AnesthesiaDuration', '25%'],
                                                                des_sta.loc['AnesthesiaDuration', '75%']))


# Load the raw static table and the prepared static/dynamic data frames.
# `raw` is not referenced again in the statements visible below.
raw = pd.read_csv('../data/raw_data/static_updated.csv')
static = pd.read_csv('../data/data_frame/static_dataframe.csv')
dynamic = pd.read_csv('../data/data_frame/dynamic_dataframe.csv')
# BMI = weight / height^2, with WEIGHT scaled by 1/1000 and HEIGHT by 1/100
# (presumably grams -> kg and cm -> m; confirm the units of the source data).
static['BMI'] = (static['WEIGHT'] / 1000) / (static['HEIGHT'] / 100) / (static['HEIGHT'] / 100)

# Impute both frames, then load the label tables from the pickle cache.
# Despite the message printed here, no labels are assigned in this section:
# `label_assign` is only used to extract the positive pids.
print('Assigning labels...')
imputer = DataImputation()
df_static = imputer.impute_static_dataframe(static)
df_dynamic = imputer.impute_dynamic_dataframe(dynamic)
label_assign = LabelAssignment(hypoxemia_thresh=90,
                               hypoxemia_window=10,
                               prediction_window=5)
static_label = pd.read_pickle('../data/label/static_label.pkl')
dynamic_label = pd.read_pickle('../data/label/dynamic_label.pkl')
positive_pids = label_assign.get_positive_pids(static_label)
print('Done.')

# Get the subgroup pids: filter in 'exclude' mode with the ICD-9 / ICD-10
# code lists below and no include list.
subgroup_pids = PatientFilter(df_static=df_static,
                              mode='exclude',
                              include_icd=None,
                              exclude_icd9=['745', '746', '747'],
                              exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26']).filter_by_icd()
Example #5
0
def prepare_data(df_static, df_dynamic, dynamic_feature, args):
    """Build training and test arrays using freshly assigned multi-labels.

    Imputes both frames, assigns labels with ``assign_multi_label``, filters
    patients by ICD code, makes a stratified 90/10 split of the remaining
    pids and picks the dynamic rows that belong to each split and are not
    flagged ``if_to_drop``.

    Args:
        df_static: static dataframe (imputed, labelled, filtered).
        df_dynamic: dynamic dataframe (imputed, labelled).
        dynamic_feature: per-row feature frame; its first two columns are
            excluded from the returned arrays.
        args: provides ``hypoxemia_thresh``, ``hypoxemia_window``,
            ``prediction_window`` and ``filter_mode``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as array values; the training
        pair is shuffled without a fixed seed.
    """
    # label assignment (according to imputed SpO2)
    print('Assigning labels...')
    imputation = DataImputation()
    df_static = imputation.impute_static_dataframe(df_static)
    df_dynamic = imputation.impute_dynamic_dataframe(df_dynamic)
    labeler = LabelAssignment(
        hypoxemia_thresh=args.hypoxemia_thresh,
        hypoxemia_window=args.hypoxemia_window,
        prediction_window=args.prediction_window,
    )
    static_label, dynamic_label = labeler.assign_multi_label(
        df_static, df_dynamic)
    positive_pids = labeler.get_positive_pids(static_label)
    print('Done.')

    # ICD-based cohort selection
    high_risk_icd = [
        'J96.', 'J98.', '519.', '518.', '277.0', 'E84', 'Q31.5', '770.7',
        'P27.1', '490', '491', '492', '493', '494', '495', '496', 'P27.8',
        'P27.9', 'J44', 'V46.1', 'Z99.1',
    ]
    patient_filter = PatientFilter(
        df_static=df_static,
        mode=args.filter_mode,
        include_icd=high_risk_icd,
        exclude_icd9=['745', '746', '747'],
        exclude_icd10=['Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26'],
    )
    subgroup_pids = patient_filter.filter_by_icd()

    # stratified split of the cohort pids; only the pid halves are needed
    cohort = static_label.loc[subgroup_pids]
    cohort_labels = cohort['label'].values
    pid_train, pid_test = train_test_split(
        cohort['pid'].values,
        cohort_labels,
        test_size=0.1,
        random_state=0,
        stratify=cohort_labels,
    )[:2]
    pid_train = sorted(pid_train)
    pid_test = sorted(pid_test)

    print('Positive Patient:', len(set(subgroup_pids) & set(positive_pids)),
          '/', len(subgroup_pids))
    print('Before trimming:', len(positive_pids), '/', len(df_static))
    print('Trimmed cases:', len(df_static) - len(subgroup_pids))

    # the input frames are not referenced again below
    del df_static, df_dynamic

    print('Training/testing split:', len(pid_train), '/', len(pid_test))
    print('Split into training and test set...')
    # keep only rows not flagged for dropping, then assign them by pid
    keep_mask = dynamic_label['if_to_drop'].eq(0).values
    train_mask = dynamic_label['pid'].isin(pid_train).values
    test_mask = dynamic_label['pid'].isin(pid_test).values
    selected_idx_train = list(np.flatnonzero(keep_mask & train_mask))
    selected_idx_test = list(np.flatnonzero(keep_mask & test_mask))

    # features by row position (first two columns excluded), labels by index
    X_train = dynamic_feature.iloc[selected_idx_train, 2:].values
    y_train = dynamic_label.loc[selected_idx_train, 'label'].values
    X_test = dynamic_feature.iloc[selected_idx_test, 2:].values
    y_test = dynamic_label.loc[selected_idx_test, 'label'].values

    # only the training set is shuffled; no seed is fixed here
    X_train, y_train = shuffle(X_train, y_train)

    # sum of label values over both splits, reported against the row count
    num_pos = np.sum(y_train) + np.sum(y_test)
    num_all = len(selected_idx_train) + len(selected_idx_test)
    print('Positive samples:', num_pos, '/', num_all)
    print('Ratio:', '%0.2f' % (num_pos / num_all * 100), '%')

    return X_train, X_test, y_train, y_test