Example #1
def get_all_features_data(labs_df,
                          labs_data_dict,
                          vitals_df,
                          vitals_data_dict,
                          demographics_df,
                          demographics_data_dict,
                          medications_df,
                          medications_data_dict,
                          include_medications=True):
    '''Merge the labs, vitals, demographics and (optionally) medications features into a single table; return the merged table together with the merged data dict'''

    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    # merge the labs, vitals and medications

    if include_medications:
        highfreq_df = pd.merge(pd.merge(vitals_df,
                                        labs_df,
                                        on=id_cols + [time_col],
                                        how='outer'),
                               medications_df,
                               on=id_cols + [time_col],
                               how='outer')

        # forward fill medications because the patient is/is not on medication on new time points created by outer join
        medication_features = parse_feature_cols(medications_data_dict)
        highfreq_df[medication_features] = highfreq_df.groupby(
            id_cols)[medication_features].ffill()

        # before a stay's first medication record, assume the patient is not on medication
        highfreq_df[medication_features] = highfreq_df[medication_features].fillna(0)
        highfreq_data_dict = merge_data_dicts(
            [labs_data_dict, vitals_data_dict, medications_data_dict])

    else:
        highfreq_df = pd.merge(vitals_df,
                               labs_df,
                               on=id_cols + [time_col],
                               how='outer')
        highfreq_data_dict = merge_data_dicts(
            [labs_data_dict, vitals_data_dict])

    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']
    cols_to_keep = parse_id_cols(highfreq_data_dict) + [
        parse_time_col(highfreq_data_dict)
    ] + parse_feature_cols(highfreq_data_dict)
    highfreq_df = highfreq_df[cols_to_keep].copy()

    # merge the highfrequency features with the static features
    features_df = pd.merge(highfreq_df,
                           demographics_df,
                           on=id_cols,
                           how='inner')
    features_data_dict = merge_data_dicts(
        [highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
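
The medication branch above chains three pandas steps: an outer join that creates rows wherever either table has a timestamp, a per-stay forward fill so the last known medication status carries onto those new rows, and a final fillna(0) meaning "not on medication before the first record". A minimal self-contained sketch of the same pattern, on toy data with hypothetical column names:

import pandas as pd

vitals = pd.DataFrame({'stay_id': [1, 1, 2], 'hours': [0.0, 2.0, 0.0],
                       'heart_rate': [80.0, 85.0, 90.0]})
meds = pd.DataFrame({'stay_id': [1], 'hours': [1.0],
                     'medication_vasopressor': [1.0]})

# outer join: stay 1 now has rows at hours 0, 1 and 2, with NaN wherever one table had no entry
merged = pd.merge(vitals, meds, on=['stay_id', 'hours'],
                  how='outer').sort_values(['stay_id', 'hours'])

# carry the last known medication status forward within each stay, then treat
# anything before a stay's first medication record as "not on medication"
merged['medication_vasopressor'] = (
    merged.groupby('stay_id')['medication_vasopressor'].ffill().fillna(0))
print(merged)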
Example #2
def get_all_features_data(labs_df, labs_data_dict, vitals_df, vitals_data_dict, demographics_df, demographics_data_dict):
    '''Merge the labs, vitals and demographics features into a single table; return the merged table together with the merged data dict'''

    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    # merge the labs and vitals
    highfreq_df = pd.merge(vitals_df, labs_df, on=id_cols + [time_col], how='outer')
    highfreq_data_dict = merge_data_dicts([labs_data_dict, vitals_data_dict])
    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']
    cols_to_keep = parse_id_cols(highfreq_data_dict) + [parse_time_col(highfreq_data_dict)] + parse_feature_cols(highfreq_data_dict)
    highfreq_df = highfreq_df[cols_to_keep].copy()


    # merge the highfrequency features with the static features
    features_df = pd.merge(highfreq_df, demographics_df, on=id_cols, how='inner')
    features_data_dict = merge_data_dicts([highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
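
All of these examples lean on project-specific helpers (parse_id_cols, parse_feature_cols, parse_time_col, merge_data_dicts) whose definitions are not shown. As a rough illustration only, the parse_* helpers might look like the sketch below, assuming each entry of data_dict['fields'] carries 'name' and 'role' keys; this schema is an assumption, not the project's confirmed format.

# Hypothetical sketch only: assumes a data dict whose 'fields' entries
# carry 'name' and 'role' keys (an assumption, not the project's actual schema)
def parse_id_cols(data_dict):
    return [f['name'] for f in data_dict['fields'] if f.get('role') == 'id']

def parse_feature_cols(data_dict):
    return [f['name'] for f in data_dict['fields'] if f.get('role') == 'feature']

def parse_time_col(data_dict):
    # assumes exactly one field is marked as the time axis
    return [f['name'] for f in data_dict['fields'] if f.get('role') == 'time'][0]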
Example #3
def compute_mews(ts_df, args, mews_df):
    id_cols = parse_id_cols(args.data_dict)
    id_cols = remove_col_names_from_list_if_not_in_df(id_cols, ts_df)
    feature_cols = ['systolic_blood_pressure', 'heart_rate', 'respiratory_rate', 'body_temperature']
    time_col = parse_time_col(args.data_dict)

    # Obtain fenceposts based on where any key differs
    # Be sure keys are converted to a numerical datatype (so fencepost detection is possible)
    keys_df = ts_df[id_cols].copy()
    for col in id_cols:
        if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
            keys_df[col] = keys_df[col].astype('category')
            keys_df[col] = keys_df[col].cat.codes
    fp = np.hstack([0, 1 + np.flatnonzero(np.diff(keys_df.values, axis=0).any(axis=1)), keys_df.shape[0]])
    nrows = len(fp) - 1

    timestamp_arr = np.asarray(ts_df[time_col].values.copy(), dtype=np.float64)
    mews_scores = np.zeros(nrows)
    
    # impute missing values per feature to population median for that feature
    ts_df_imputed = ts_df.groupby(id_cols).ffill()
    ts_df_imputed.fillna(ts_df_imputed.median(), inplace=True)
    mews_features_df = ts_df_imputed[feature_cols].copy()
    
    #print('Computing mews score in first %s hours of data'%(args.max_time_step))
    pbar = ProgressBar()
    for p in pbar(range(nrows)):
        # get the data for the current fencepost
        fp_start = fp[p]
        fp_end = fp[p+1]

        cur_timestamp_arr = timestamp_arr[fp_start:fp_end]
        cur_features_df = mews_features_df.iloc[fp_start:fp_end,:].reset_index(drop=True)
        
        cur_mews_scores = np.zeros(len(cur_timestamp_arr))
        for feature in feature_cols:
            feature_vals_np = cur_features_df[feature].astype(float)
            mews_df_cur_feature = mews_df[mews_df['vital']==feature].reset_index(drop=True)
            feature_maxrange_np = mews_df_cur_feature['range_max'].to_numpy().astype(float)
            scores_idx = np.searchsorted(feature_maxrange_np, feature_vals_np)
            cur_mews_scores += mews_df_cur_feature.loc[scores_idx, 'score'].to_numpy().astype(float)
        
        # set mews score as last observed mews score over all timesteps
        #mews_scores[p]=np.median(cur_mews_scores)
        mews_scores[p] = cur_mews_scores[-1]
    mews_scores_df = pd.DataFrame(data=mews_scores, columns=['mews_score'])

    for col_name in id_cols[::-1]:
        mews_scores_df.insert(0, col_name, ts_df[col_name].values[fp[:-1]].copy())
    return mews_scores_df   
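
Two NumPy tricks do the heavy lifting in compute_mews: fencepost detection slices the flat table into contiguous per-stay segments, and np.searchsorted turns a sorted column of band upper bounds into a value-to-score lookup. A self-contained sketch on toy data, with illustrative thresholds (not the clinical MEWS bands):

import numpy as np
import pandas as pd

ts = pd.DataFrame({'stay_id': [7, 7, 7, 9, 9],
                   'heart_rate': [65.0, 112.0, 135.0, 50.0, 38.0]})

# (1) fenceposts: indices where the id changes, so rows fp[p]:fp[p+1] form one stay
keys = ts[['stay_id']].values
fp = np.hstack([0, 1 + np.flatnonzero(np.diff(keys, axis=0).any(axis=1)), len(ts)])
print(fp)  # [0 3 5] -> rows 0..2 are stay 7, rows 3..4 are stay 9

# (2) band lookup: a value v falls in the first band with v <= range_max
range_max = np.array([40., 50., 100., 110., 129., np.inf])  # sorted upper bounds
score = np.array([2., 1., 0., 1., 2., 3.])                  # score per band

band_idx = np.searchsorted(range_max, ts['heart_rate'].to_numpy())
print(score[band_idx])  # [0. 2. 3. 1. 2.] heart-rate score contribution per row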
Example #4
    # get the train test features
    x_train_csv = os.path.join(args.train_test_split_dir, 'x_train.csv.gz')
    x_valid_csv = os.path.join(args.train_test_split_dir, 'x_valid.csv.gz')
    x_test_csv = os.path.join(args.train_test_split_dir, 'x_test.csv.gz')
    x_dict_json = os.path.join(args.train_test_split_dir, 'x_dict.json')

    # impute values by carry-forward, then by the population mean, on the train and test sets separately
    x_data_dict = load_data_dict_json(x_dict_json)
    x_train_df = pd.read_csv(x_train_csv)
    x_valid_df = pd.read_csv(x_valid_csv)
    x_test_df = pd.read_csv(x_test_csv)

    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    time_col = parse_time_col(x_data_dict)

    # add mask features
    non_medication_feature_cols = [
        feature_col for feature_col in feature_cols
        if 'medication' not in feature_col
    ]
    medication_feature_cols = [
        feature_col for feature_col in feature_cols
        if 'medication' in feature_col
    ]

    print('Adding missing values mask as features...')
    for feature_col in non_medication_feature_cols:
        x_train_df.loc[:, 'mask_' +
                       feature_col] = (~x_train_df[feature_col].isna()) * 1.0
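
These mask columns turn missingness itself into a model input: mask_<feature> is 1.0 where the raw value was observed and 0.0 where it will later be imputed, so a downstream classifier can tell measured values from filled-in ones. A toy illustration (hypothetical column name):

import pandas as pd

x = pd.DataFrame({'heart_rate': [88.0, None, 92.0]})
x['mask_heart_rate'] = (~x['heart_rate'].isna()) * 1.0
print(x)
#    heart_rate  mask_heart_rate
# 0        88.0              1.0
# 1         NaN              0.0
# 2        92.0              1.0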
Example #5
    with open(vitals_data_dict_json, 'r') as f:
        vitals_data_dict = json.load(f)
        try:
            vitals_data_dict['fields'] = vitals_data_dict['schema']['fields']
        except KeyError:
            pass

    with open(labs_data_dict_json, 'r') as f:
        labs_data_dict = json.load(f)
        try:
            labs_data_dict['fields'] = labs_data_dict['schema']['fields']
        except KeyError:
            pass

    id_cols = parse_id_cols(vitals_data_dict)
    vitals = parse_feature_cols(vitals_data_dict)
    labs = parse_feature_cols(labs_data_dict)
    time_col = parse_time_col(vitals_data_dict)

    # compute missingness per stay
    vital_counts_per_stay_df = df_vitals.groupby(id_cols).count()
    print('#######################################')
    print('MISSINGNESS OF VITALS OVER FULL STAYS : ')
    vitals_missing_rate_entire_stay_dict = dict()
    for vital in vitals:
        vitals_missing_rate_entire_stay_dict[vital] = (
            (vital_counts_per_stay_df[vital]
             == 0).sum()) / vital_counts_per_stay_df.shape[0]
    vitals_missing_rate_entire_stay_series = pd.Series(
        vitals_missing_rate_entire_stay_dict)
    print(vitals_missing_rate_entire_stay_series)

    lab_counts_per_stay_df = df_labs.groupby(id_cols).count()
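
The per-stay missingness rate computed above is the fraction of stays in which a signal was never measured at all: groupby(...).count() returns non-null counts per stay, and (counts == 0) flags the all-missing stays. A self-contained toy sketch of the same computation:

import pandas as pd

df_vitals_toy = pd.DataFrame({
    'stay_id': [1, 1, 2, 2, 3],
    'heart_rate': [88.0, 92.0, None, None, 75.0],
    'body_temperature': [None, 37.0, 36.5, None, None],
})

counts = df_vitals_toy.groupby('stay_id').count()   # non-null count per stay
missing_rate = (counts == 0).sum() / counts.shape[0]
print(missing_rate)  # heart_rate 1/3 (stay 2), body_temperature 1/3 (stay 3)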