Code Example #1
def get_all_features_data(labs_df,
                          labs_data_dict,
                          vitals_df,
                          vitals_data_dict,
                          demographics_df,
                          demographics_data_dict,
                          medications_df,
                          medications_data_dict,
                          include_medications=True):
    '''Merge labs, vitals (and optionally medications) with demographics into a single features table; returns the features DataFrame and its merged data dict'''

    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    # merge the labs, vitals and medications

    if include_medications:
        highfreq_df = pd.merge(pd.merge(vitals_df,
                                        labs_df,
                                        on=id_cols + [time_col],
                                        how='outer'),
                               medications_df,
                               on=id_cols + [time_col],
                               how='outer')

        # forward fill medications: the outer join creates new time points where the on/off medication status is unobserved
        medication_features = parse_feature_cols(medications_data_dict)
        highfreq_df[id_cols + medication_features] = highfreq_df[
            id_cols + medication_features].groupby(id_cols).apply(
                lambda x: x.fillna(method='pad')).copy()

        highfreq_df[id_cols + medication_features] = highfreq_df[
            id_cols + medication_features].fillna(0)
        highfreq_data_dict = merge_data_dicts(
            [labs_data_dict, vitals_data_dict, medications_data_dict])

    else:
        highfreq_df = pd.merge(vitals_df,
                               labs_df,
                               on=id_cols + [time_col],
                               how='outer')
        highfreq_data_dict = merge_data_dicts(
            [labs_data_dict, vitals_data_dict])

    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']
    cols_to_keep = parse_id_cols(highfreq_data_dict) + [
        parse_time_col(highfreq_data_dict)
    ] + parse_feature_cols(highfreq_data_dict)
    highfreq_df = highfreq_df[cols_to_keep].copy()

    # merge the highfrequency features with the static features
    features_df = pd.merge(highfreq_df,
                           demographics_df,
                           on=id_cols,
                           how='inner')
    features_data_dict = merge_data_dicts(
        [highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
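
The forward-fill step above exists because the outer join creates rows at lab/vital timestamps where the medication on/off status was never recorded. A minimal, self-contained sketch of that step on toy data (stay_id, hours, and on_vasopressor are invented names for illustration):

import pandas as pd

# toy vitals and medication tables keyed by (stay_id, hours)
vitals = pd.DataFrame({'stay_id': [1, 1, 1], 'hours': [0, 1, 2], 'heart_rate': [80, 82, 85]})
meds = pd.DataFrame({'stay_id': [1, 1], 'hours': [0, 2], 'on_vasopressor': [0, 1]})

merged = pd.merge(vitals, meds, on=['stay_id', 'hours'], how='outer')
# hours == 1 now has NaN for on_vasopressor; carry the last known status forward per stay,
# then treat anything still missing (before the first record) as "not on medication"
merged['on_vasopressor'] = merged.groupby('stay_id')['on_vasopressor'].ffill().fillna(0)
print(merged)
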
Code Example #2
def update_data_dict_mews(data_dict):
    id_cols = parse_id_cols(data_dict)

    new_fields = []
    for name in id_cols:
        for col in data_dict['fields']:
            if col['name'] == name:
                new_fields.append(col)

    new_fields.append({
        'name': 'mews_score',
        'role': 'feature',
        'type': 'numeric',
        'description': 'Modified Early Warning Score',
        'units': 'NONE',
        'constraints': {
            'required': 'FALSE',
            'minimum': '0',
            'maximum': 'INF'
        }
    })

    new_data_dict = copy.deepcopy(data_dict)
    if 'schema' in new_data_dict:
        new_data_dict['schema']['fields'] = new_fields
        del new_data_dict['fields']
    else:
        new_data_dict['fields'] = new_fields

    return new_data_dict
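
For reference, here is a self-contained sketch of the transformation this helper performs, using a made-up minimal data dictionary (it assumes, as elsewhere in these scripts, that parse_id_cols simply returns the names of fields whose role is 'id'):

import copy
import json

data_dict = {
    'schema': {'fields': [
        {'name': 'patient_id', 'role': 'id', 'type': 'numeric'},
        {'name': 'heart_rate', 'role': 'feature', 'type': 'numeric'},
    ]},
}
data_dict['fields'] = data_dict['schema']['fields']   # flattened view, as the callers create it

# keep only the id fields, then append the mews_score spec exactly as above
new_fields = [f for f in data_dict['fields'] if f['role'] == 'id']
new_fields.append({'name': 'mews_score', 'role': 'feature', 'type': 'numeric',
                   'description': 'Modified Early Warning Score', 'units': 'NONE',
                   'constraints': {'required': 'FALSE', 'minimum': '0', 'maximum': 'INF'}})

new_data_dict = copy.deepcopy(data_dict)
new_data_dict['schema']['fields'] = new_fields
del new_data_dict['fields']
print(json.dumps(new_data_dict, indent=2))
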
Code Example #3
def get_all_features_data(labs_df, labs_data_dict, vitals_df, vitals_data_dict, demographics_df, demographics_data_dict):
    '''Merge labs and vitals with demographics into a single features table; returns the features DataFrame and its merged data dict'''

    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    # merge the labs and vitals
    highfreq_df = pd.merge(vitals_df, labs_df, on=id_cols + [time_col], how='outer')
    highfreq_data_dict = merge_data_dicts([labs_data_dict, vitals_data_dict])
    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']
    cols_to_keep = parse_id_cols(highfreq_data_dict) + [parse_time_col(highfreq_data_dict)] + parse_feature_cols(highfreq_data_dict)
    highfreq_df = highfreq_df[cols_to_keep].copy()


    # merge the highfrequency features with the static features
    features_df = pd.merge(highfreq_df, demographics_df, on=id_cols, how='inner')
    features_data_dict = merge_data_dicts([highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
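
Labs and vitals are usually charted at different times, so the outer join above yields one row per (stay, timestamp) with NaNs wherever one stream was not measured; downstream code handles the gaps. A toy illustration with invented column names:

import pandas as pd

vitals_df = pd.DataFrame({'stay_id': [1, 1], 'hours': [0.0, 4.0], 'heart_rate': [80, 95]})
labs_df = pd.DataFrame({'stay_id': [1], 'hours': [2.0], 'lactate': [1.8]})

# align the two irregularly sampled streams on (id, time)
highfreq_df = pd.merge(vitals_df, labs_df, on=['stay_id', 'hours'], how='outer')
print(highfreq_df.sort_values(['stay_id', 'hours']))
# one row each at hours 0, 2 and 4; heart_rate is NaN at 2, lactate is NaN at 0 and 4
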
Code Example #4
def compute_mews(ts_df, args, mews_df):
    id_cols = parse_id_cols(args.data_dict)
    id_cols = remove_col_names_from_list_if_not_in_df(id_cols, ts_df)
    feature_cols = ['systolic_blood_pressure', 'heart_rate', 'respiratory_rate', 'body_temperature']
    time_col = parse_time_col(args.data_dict)

    # Obtain fenceposts based on where any key differs
    # Be sure keys are converted to a numerical datatype (so fencepost detection is possible)
    keys_df = ts_df[id_cols].copy()
    for col in id_cols:
        if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
            keys_df[col] = keys_df[col].astype('category')
            keys_df[col] = keys_df[col].cat.codes
    fp = np.hstack([0, 1 + np.flatnonzero(np.diff(keys_df.values, axis=0).any(axis=1)), keys_df.shape[0]])
    nrows = len(fp) - 1

    timestamp_arr = np.asarray(ts_df[time_col].values.copy(), dtype=np.float64)
    mews_scores = np.zeros(nrows)
    
    # impute missing values: forward fill within each stay, then fill any remaining gaps with the population median per feature
    ts_df_imputed = ts_df.groupby(id_cols).apply(lambda x: x.fillna(method='pad'))
    ts_df_imputed.fillna(ts_df_imputed.median(), inplace=True)
    mews_features_df = ts_df_imputed[feature_cols].copy()
    
    #print('Computing mews score in first %s hours of data'%(args.max_time_step))
    pbar = ProgressBar()
    for p in pbar(range(nrows)):
        # get the data for the current fencepost
        fp_start = fp[p]
        fp_end = fp[p+1]

        cur_timestamp_arr = timestamp_arr[fp_start:fp_end]
        cur_features_df = mews_features_df.iloc[fp_start:fp_end,:].reset_index(drop=True)
        
        cur_mews_scores = np.zeros(len(cur_timestamp_arr))
        for feature in feature_cols:
            feature_vals_np = cur_features_df[feature].astype(float)
            mews_df_cur_feature = mews_df[mews_df['vital']==feature].reset_index(drop=True)
            feature_maxrange_np = mews_df_cur_feature['range_max'].to_numpy().astype(float)
            scores_idx = np.searchsorted(feature_maxrange_np, feature_vals_np)
            cur_mews_scores += mews_df_cur_feature.loc[scores_idx, 'score'].to_numpy().astype(float)
        
        # set mews score as last observed mews score over all timesteps
        #mews_scores[p]=np.median(cur_mews_scores)
        mews_scores[p] = cur_mews_scores[-1]
    mews_scores_df = pd.DataFrame(data=mews_scores, columns=['mews_score'])

    for col_name in id_cols[::-1]:
        mews_scores_df.insert(0, col_name, ts_df[col_name].values[fp[:-1]].copy())
    return mews_scores_df   
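
The per-vital scoring inside the loop uses np.searchsorted against the 'range_max' column of the MEWS lookup table: each measured value is mapped to the first band whose upper bound it does not exceed. A self-contained sketch with a made-up band table for heart rate (not the published MEWS cut-offs; the real thresholds come from mews_df, loaded elsewhere):

import numpy as np
import pandas as pd

# hypothetical bands: heart rate <= 100 scores 0, <= 129 scores 2, anything above scores 3
band_df = pd.DataFrame({'vital': ['heart_rate'] * 3,
                        'range_max': [100.0, 129.0, np.inf],
                        'score': [0, 2, 3]})

heart_rates = np.array([72.0, 115.0, 140.0])
cur = band_df[band_df['vital'] == 'heart_rate'].reset_index(drop=True)
idx = np.searchsorted(cur['range_max'].to_numpy(), heart_rates)
print(cur.loc[idx, 'score'].to_numpy())   # [0 2 3]
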
Code Example #5
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=lambda x: (str(x).lower() == 'true'),
        default=False,
        help='flag indicating whether the data is simulated or from MIMIC (pass True/False)')
    parser.add_argument(
        '--simulated_data_dir',
        type=str,
        default='simulated_data/2-state/',
        help=
        'directory in which the simulated data is saved. Must be provided if is_data_simulated = True'
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    print('number of time points : %s\n number of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # scale features
    #     X_train = standard_scaler_3d(X_train)
    #     X_test = standard_scaler_3d(X_test)

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)
    # #     from IPython import embed; embed()
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid',
                          patience=20,
                          threshold=0.002,
                          threshold_mode='rel',
                          lower_is_better=False),
            LRScheduler(policy=ReduceLROnPlateau,
                        mode='max',
                        monitor='aucroc_score_valid',
                        patience=10),
            compute_grad_norm,
            GradientNormClipping(gradient_clip_value=0.3,
                                 gradient_clip_norm_type=2),
            Checkpoint(monitor='aucroc_score_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(args.validation_size),
        module__rnn_type='LSTM',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_train)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_train_final = roc_auc_score(y_train, y_pred_proba_pos)
    print('AUROC with LSTM (Train) : %.2f' % auroc_train_final)

    y_pred_proba = clf.predict_proba(X_test)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_test_final = roc_auc_score(y_test, y_pred_proba_pos)
    print('AUROC with LSTM (Test) : %.2f' % auroc_test_final)
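
The class_weights tensor above implements inverse-frequency weighting, which skorch passes to CrossEntropyLoss via criterion__weight. A minimal sketch of the same idea in plain PyTorch, with toy labels and random logits standing in for model outputs:

import torch

y_train = torch.tensor([0, 0, 0, 0, 1])          # imbalanced toy labels
class_weights = torch.tensor([1.0 / (y_train == 0).sum().item(),
                              1.0 / (y_train == 1).sum().item()])

criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
logits = torch.randn(5, 2)                       # stand-in for classifier outputs
loss = criterion(logits, y_train)
print(class_weights, loss.item())
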
Code Example #6
def compute_mews_dynamic(ts_df, data_dict, mews_df, outcomes_df):
    id_cols = parse_id_cols(data_dict)
    id_cols = remove_col_names_from_list_if_not_in_df(id_cols, ts_df)

    feature_cols = [
        'systolic_blood_pressure', 'heart_rate', 'respiratory_rate',
        'body_temperature'
    ]

    time_cols = parse_time_cols(data_dict)
    time_cols = remove_col_names_from_list_if_not_in_df(time_cols, ts_df)

    if len(time_cols) == 0:
        raise ValueError("Expected at least one variable with role='time'")
    elif len(time_cols) > 1:
        #         raise ValueError("More than one time variable found. Expected exactly one.")
        print("More than one time variable found. Choosing %s" % time_cols[-1])
    time_col = time_cols[-1]

    # Obtain fenceposts based on where any key differs
    # Be sure keys are converted to a numerical datatype (so fencepost detection is possible)
    keys_df = ts_df[id_cols].copy()
    for col in id_cols:
        if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
            keys_df[col] = keys_df[col].astype('category')
            keys_df[col] = keys_df[col].cat.codes
    fp = np.hstack([
        0, 1 + np.flatnonzero(np.diff(keys_df.values, axis=0).any(axis=1)),
        keys_df.shape[0]
    ])
    nrows = len(fp) - 1

    timestamp_arr = np.asarray(ts_df[time_col].values.copy(), dtype=np.float32)
    features_arr = ts_df[feature_cols].values
    ids_arr = ts_df[id_cols].values
    prediction_window = 12
    prediction_horizon = 24
    max_hrs_data_observed = 504
    t_start = -24  # start time
    dynamic_mews_id_list = list()
    dynamic_outcomes_list = list()
    dynamic_window_list = list()
    dynamic_stay_lengths_list = list()

    # define outcome column (TODO : Avoid hardcoding by loading from config.json)
    outcome_col = 'clinical_deterioration_outcome'

    # impute missing values: forward fill within each stay, then fill any remaining gaps with the population median per feature
    print('Imputing missing values with forward fill for MEWS computation...')
    ts_df_imputed = ts_df.groupby(id_cols).apply(
        lambda x: x.fillna(method='pad'))
    ts_df_imputed.fillna(ts_df_imputed.median(), inplace=True)
    mews_features_df = ts_df_imputed[feature_cols].copy()

    print('Computing mews scores dynamically...')
    pbar = ProgressBar()
    dynamic_mews_scores_list = list()

    for p in pbar(range(nrows)):
        # get the data for the current fencepost
        fp_start = fp[p]
        fp_end = fp[p + 1]

        cur_timestamp_arr = timestamp_arr[fp_start:fp_end]
        cur_mews_features_df = mews_features_df.iloc[
            fp_start:fp_end, :].reset_index(drop=True)

        # get the current stay id (Do this outside the loop)
        cur_id_df = ts_df[id_cols].iloc[fp[p]:fp[p + 1]].drop_duplicates(
            subset=id_cols)

        # get the stay length and final outcome of the current stay
        cur_outcomes_df = pd.merge(outcomes_df,
                                   cur_id_df,
                                   on=id_cols,
                                   how='inner')
        cur_stay_length = cur_outcomes_df['stay_length'].values[0]
        cur_final_outcome = int(cur_outcomes_df[outcome_col].values[0])

        # create expanding windows from t_start up to the capped length of stay, with endpoints every prediction_window hours
        t_end = min(cur_stay_length, max_hrs_data_observed)
        window_ends = np.arange(t_start + prediction_window,
                                t_end + prediction_window, prediction_window)

        cur_dynamic_mews_scores = np.zeros([len(window_ends), 1],
                                           dtype=np.float32)

        for q, window_end in enumerate(window_ends):
            cur_dynamic_idx = (cur_timestamp_arr >
                               t_start) & (cur_timestamp_arr <= window_end)
            cur_dynamic_timestamp_arr = cur_timestamp_arr[cur_dynamic_idx]
            cur_dynamic_mews_features_df = cur_mews_features_df[
                cur_dynamic_idx]

            cur_mews_scores = np.zeros(len(cur_dynamic_timestamp_arr))

            if len(cur_dynamic_timestamp_arr) > 0:
                for feature in feature_cols:
                    feature_vals_np = cur_dynamic_mews_features_df[
                        feature].astype(float)
                    mews_df_cur_feature = mews_df[
                        mews_df['vital'] == feature].reset_index(drop=True)
                    feature_maxrange_np = mews_df_cur_feature[
                        'range_max'].to_numpy().astype(float)
                    scores_idx = np.searchsorted(feature_maxrange_np,
                                                 feature_vals_np)
                    cur_mews_scores += mews_df_cur_feature.loc[
                        scores_idx, 'score'].to_numpy().astype(float)

                cur_dynamic_mews_scores[q] = cur_mews_scores[-1]

            # set mews score as last observed mews score over all timesteps

            # keep track of stay ids
            dynamic_mews_id_list.append(cur_id_df.values[0])

            # keep track of windows
            dynamic_window_list.append(np.array([t_start, window_end]))

            # keep track of the stay lengths
            dynamic_stay_lengths_list.append(cur_stay_length)

            # if the window end falls within prediction_horizon hours of the end of stay, label it with the final clinical deterioration outcome, else label 0
            if window_end >= cur_stay_length - prediction_horizon:
                dynamic_outcomes_list.append(cur_final_outcome)
            else:
                dynamic_outcomes_list.append(0)

        dynamic_mews_scores_list.append(cur_dynamic_mews_scores)

    # vertically stack the per-stay score arrays into a single column
    dynamic_mews_df = pd.DataFrame(np.vstack(dynamic_mews_scores_list),
                                   columns=['mews_score'])

    # add the ids back to the collapsed features
    ids_df = pd.DataFrame(dynamic_mews_id_list, columns=id_cols)

    # add the window start and ends
    dynamic_window_df = pd.DataFrame(np.vstack(dynamic_window_list),
                                     columns=['window_start', 'window_end'])
    dynamic_stay_lengths_df = pd.DataFrame(
        np.vstack(dynamic_stay_lengths_list), columns=['stay_length'])

    dynamic_mews_df = pd.concat([ids_df, dynamic_mews_df, dynamic_window_df],
                                axis=1)

    dynamic_outcomes_df = pd.DataFrame(np.array(dynamic_outcomes_list),
                                       columns=[outcome_col])
    dynamic_outcomes_df = pd.concat([
        ids_df, dynamic_outcomes_df, dynamic_window_df, dynamic_stay_lengths_df
    ],
                                    axis=1)

    return dynamic_mews_df, dynamic_outcomes_df
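
Both MEWS routines locate per-stay segments with the same 'fencepost' trick: encode every id column numerically, then mark a boundary wherever any key changes between consecutive rows. A self-contained illustration with made-up ids:

import numpy as np
import pandas as pd

keys_df = pd.DataFrame({'subject_id': ['a', 'a', 'a', 'b', 'b', 'c'],
                        'hadm_id': [1, 1, 1, 2, 2, 3]})

# convert non-numeric keys to category codes so row-wise np.diff is possible
for col in keys_df.columns:
    if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
        keys_df[col] = keys_df[col].astype('category').cat.codes

fp = np.hstack([0,
                1 + np.flatnonzero(np.diff(keys_df.values, axis=0).any(axis=1)),
                keys_df.shape[0]])
print(fp)   # [0 3 5 6] -> the three stays occupy rows [0:3), [3:5), [5:6)
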
Code Example #7
                        default=True,
                        type=lambda x: (str(x).lower() == 'true'),
                        required=False)

    args = parser.parse_args()

    # read the data dictionaries
    print('Reading train-test data...')

    # read the data dict JSONs and parse the feature and outcome columns
    x_data_dict_file, y_data_dict_file = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_data_dict_file)
    y_data_dict = load_data_dict_json(y_data_dict_file)

    feature_cols = parse_feature_cols(x_data_dict)
    key_cols = parse_id_cols(x_data_dict)

    df_by_split = dict()
    for split_name, csv_files in [('train', args.train_csv_files.split(',')),
                                  ('test', args.test_csv_files.split(','))]:
        cur_df = None
        for csv_file in csv_files:

            # TODO use json data dict to load specific columns as desired types
            more_df = pd.read_csv(csv_file)
            if cur_df is None:
                cur_df = more_df
            else:
                if args.merge_x_y:
                    cur_df = cur_df.merge(more_df, on=key_cols)
                else:
Code Example #8
    args = parser.parse_args()

    # get all the collapsed labs, collapsed vitals, demographics and outcomes data dicts
    with open(
            os.path.join(args.static_data_dict_dir, 'Spec-Demographics.json'),
            'r') as f1:
        demographics_data_dict = json.load(f1)
    demographics_data_dict['fields'] = demographics_data_dict['schema'][
        'fields']

    with open(
            os.path.join(args.static_data_dict_dir,
                         'Spec-Outcomes_TransferToICU.json'), 'r') as f2:
        outcomes_data_dict = json.load(f2)

    id_cols = parse_id_cols(demographics_data_dict)
    # get all the collapsed labs, collapsed vitals, demographics and outcomes in all the tslice folders

    if args.include_medications == 'True':
        print(
            'Merging collapsed vitals, collapsed labs, collapsed medications, demographics and outcomes in all the tslice folders = %s into a single features table and a single outcomes table...'
            % args.tslice_list)
    else:
        print(
            'Merging collapsed vitals, collapsed labs, demographics and outcomes in all the tslice folders = %s into a single features table and a single outcomes table...'
            % args.tslice_list)

    features_df_all_slices_list = list()
    outcomes_df_all_slices_list = list()
    mews_df_all_slices_list = list()
    for tslice in args.tslice_list.split(' '):
Code Example #9
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'MewsDynamic.csv.gz'))

    demographics_df = pd.read_csv(
        os.path.join(args.static_data_dict_dir,
                     'demographics_before_icu.csv.gz'))

    # get data dicts of collapsed features
    demographics_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Demographics.json'))
    outcomes_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir,
                     'Spec-Outcomes_TransferToICU.json'))

    # merge vitals, labs and medications
    id_cols = parse_id_cols(demographics_dd)

    print('Merging demographics...')
    # merge demographics
    dynamic_mews_df = pd.merge(dynamic_mews_df,
                               demographics_df,
                               on=id_cols,
                               how='left')

    # Set the dynamic outputs to be the same as the vitals dynamic outputs because all stays contain at least 1 vital
    dynamic_outputs_df = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'OutputsDynamicMews.csv.gz'))

    # add admission timestamp as a column for outputs for creating train-test splits based on timestamps later
    dynamic_outputs_df = pd.merge(
Code Example #10
        '--tslice',
        type=str,
        default='2',
        help=
        '''Slice of data to be extracted. If tslice is given with a % sign (e.g. 20%),
        the script extracts the first tslice% of the data in the stay. If tslice is an int (e.g. 5),
        the script extracts the first tslice hours of data. If tslice is negative (e.g. -5),
        the script extracts the data up until tslice hours before deterioration/discharge.'''
    )
    parser.add_argument('--output_dir', type=str)

    args = parser.parse_args()
    labs_df, labs_data_dict, vitals_df, vitals_data_dict, \
    demographics_df, demographics_data_dict, outcomes_df, outcomes_data_dict = get_preprocessed_data(args.preproc_data_dir)

    id_cols = parse_id_cols(vitals_data_dict)
    labs_feature_cols = parse_feature_cols(labs_data_dict)
    vitals_feature_cols = parse_feature_cols(vitals_data_dict)

    # get lengths of stay for each admission
    vitals_df_with_stay_lengths = pd.merge(vitals_df,
                                           outcomes_df[id_cols +
                                                       ['stay_length']],
                                           on=id_cols,
                                           how='inner')
    labs_df_with_stay_lengths = pd.merge(labs_df,
                                         outcomes_df[id_cols +
                                                     ['stay_length']],
                                         on=id_cols,
                                         how='inner')
    #demographics_df_with_stay_lengths = pd.merge(demographics_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
Code Example #11
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--n_filters',
                        type=int,
                        default=32,
                        help='Number of filters')
    parser.add_argument('--kernel_size',
                        type=int,
                        default=1,
                        help='size of each kernel')
    parser.add_argument('--n_conv_layers',
                        type=int,
                        default=1,
                        help='number of convolutional layers')
    parser.add_argument('--stride', type=int, default=1, help='stride')
    parser.add_argument('--pool_size',
                        type=int,
                        default=4,
                        help='max pool size')
    parser.add_argument('--dense_units',
                        type=int,
                        default=128,
                        help='number of units in fully connected layer')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # add class weights (inverse class frequency via sklearn's 'balanced' heuristic);
    # keras expects class_weight as a dict mapping class index -> weight
    class_weights = class_weight.compute_class_weight('balanced',
                                                      np.unique(y_train),
                                                      y_train)
    class_weights = dict(zip(range(len(class_weights)), class_weights))

    # convert y_train to categorical
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=args.validation_size, random_state=213)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    set_random_seed(args.seed)
    model = keras.Sequential()
    for i in range(args.n_conv_layers):
        model.add(
            keras.layers.Conv1D(filters=args.n_filters,
                                kernel_size=args.kernel_size,
                                activation='relu',
                                strides=args.stride))
    model.add(keras.layers.Dropout(args.dropout))
    model.add(keras.layers.MaxPooling1D(pool_size=args.pool_size))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(args.dense_units, activation='relu'))
    model.add(keras.layers.Dense(2, activation='softmax'))

    # set optimizer
    opt = keras.optimizers.Adam(learning_rate=args.lr)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy', keras.metrics.AUC()])

    # set early stopping
    early_stopping = EarlyStopping(monitor='val_auc',
                                   patience=20,
                                   mode='max',
                                   verbose=1)

    model.fit(X_train,
              y_train,
              epochs=100,
              validation_data=(X_val, y_val),
              callbacks=[early_stopping],
              class_weight=class_weights,
              batch_size=args.batch_size)

    y_score_val = model.predict_proba(X_val)
    val_auc = roc_auc_score(y_val, y_score_val)
    print('AUC on val set : %.4f' % val_auc)

    y_score_test = model.predict_proba(X_test)
    test_auc = roc_auc_score(y_test, y_score_test)
    print('AUC on test set : %.4f' % test_auc)

    # save the model history
    training_hist_df = pd.DataFrame(model.history.history)
    training_hist_df.loc[:, 'test_auc'] = test_auc
    training_hist_csv = os.path.join(args.output_dir,
                                     args.output_filename_prefix + '.csv')
    training_hist_df.to_csv(training_hist_csv, index=False)

    # save the model
    model_file = os.path.join(args.output_dir,
                              args.output_filename_prefix + '.model')
    model.save(model_file)
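
Because the labels are one-hot encoded for the softmax head, the AUC calls above hand two-column arrays to roc_auc_score. For binary one-hot labels with softmax scores, the resulting macro average equals the AUC computed on just the positive-class column, as this toy check shows:

import numpy as np
from sklearn.metrics import roc_auc_score

y_true_onehot = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])    # two negatives, two positives
y_score = np.array([[0.8, 0.2], [0.4, 0.6], [0.3, 0.7], [0.1, 0.9]])

print(roc_auc_score(y_true_onehot, y_score))                  # macro average over both columns
print(roc_auc_score(y_true_onehot[:, 1], y_score[:, 1]))      # positive-class AUC, same value
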
Code Example #12
                     'Spec_CollapsedLabsDynamic.json'))
    demographics_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir, 'Spec-Demographics.json'))
    outcomes_dd = load_data_dict_json(
        os.path.join(args.static_data_dict_dir,
                     'Spec-Outcomes_TransferToICU.json'))

    # get dynamic outputs
    vitals_output = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'OutputsDynamicVitals.csv.gz'))
    labs_output = pd.read_csv(
        os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                     'OutputsDynamicLabs.csv.gz'))

    id_cols = parse_id_cols(vitals_dd)

    if args.include_medications == 'true':
        print(
            'Merging labs, vitals, medications into a single table of dynamic collapsed features...'
        )

        dynamic_collapsed_medications_df = pd.read_csv(
            os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                         'CollapsedMedicationsDynamic.csv.gz'))
        medications_dd = load_data_dict_json(
            os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
                         'Spec_CollapsedMedicationsDynamic.json'))

        medications_output = pd.read_csv(
            os.path.join(DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
Code Example #13
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=lambda x: (str(x).lower() == 'true'),
        default=False,
        help='flag indicating whether the data is simulated or from MIMIC (pass True/False)')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    valid_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_valid_csv_filename,
        y_csv_path=y_valid_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_tstep')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    #     from IPython import embed; embed()
    #     X_train = (X_train - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_valid = (X_valid - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_test = (X_test - np.min(X_train))/(np.max(X_train)-np.min(X_train))

    valid_ds = Dataset(X_valid, y_valid)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).float()

    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' %
          ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' %
          ((y_test == 1).sum() / len(y_test)))

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss',
                                           patience=15,
                                           threshold=0.002,
                                           threshold_mode='rel',
                                           lower_is_better=True)

    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=True,
                         name='auprc_train'),
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=False,
                         name='auprc_valid'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=True,
                         name='auroc_train'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=False,
                         name='auroc_valid'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=True, name='precision_train'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=False, name='precision_valid'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=True, name='recall_train'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=False, name='recall_valid'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=True, name='aucroc_score_train'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=False, name='aucroc_score_valid'),
            #                   EarlyStopping(monitor='auprc_valid', patience=5, threshold=0.002, threshold_mode='rel',
            #                                                  lower_is_better=False),
            #               LRScheduler(policy=ReduceLROnPlateau, mode='max', monitor='aucroc_score_valid', patience=10),
            #                   compute_grad_norm,
            #               GradientNormClipping(gradient_clip_value=0.5, gradient_clip_norm_type=2),
            loss_early_stopping_cp,
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        #               criterion=torch.nn.CrossEntropyLoss,
        #               criterion__weight=class_weights,
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    #     N=len(X_train)
    #     X_train = X_train[:N]
    #     y_train = y_train[:N]

    clf = rnn.fit(X_train, y_train)

    # get threshold with max recall at fixed precision
    fixed_precision = 0.1

    # get predict probas for y=1 on validation set
    keep_inds_va = torch.logical_not(
        torch.all(torch.isnan(torch.FloatTensor(X_valid)), dim=-1))
    y_va_pred_proba = clf.predict_proba(
        X_valid)[keep_inds_va][:, 1].detach().numpy()

    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1),
                             max(unique_probas), 100)

    precision_scores_G, recall_scores_G = [
        np.zeros(thr_grid_G.size),
        np.zeros(thr_grid_G.size)
    ]
    for gg, thr in enumerate(thr_grid_G):
        #             logistic_clf.module_.linear_transform_layer.bias.data = torch.tensor(thr_grid[gg]).double()
        curr_thr_y_preds = clf.predict_proba(
            torch.FloatTensor(X_valid))[keep_inds_va][:, 1] >= thr_grid_G[gg]
        precision_scores_G[gg] = precision_score(y_valid[keep_inds_va],
                                                 curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_valid[keep_inds_va],
                                           curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision

    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print(
            'Could not find threshold with precision >= %.3f \n Choosing threshold to maximize recall at precision %.3f'
            % (fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]
    thr_perf_df = pd.DataFrame(
        np.vstack([
            thr_grid_G[np.newaxis, :], precision_scores_G[np.newaxis, :],
            recall_scores_G[np.newaxis, :]
        ]).T,
        columns=['thr', 'precision_score', 'recall_score'])

    print(thr_perf_df)
    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    splits = ['train', 'valid', 'test']
    #     data_splits = ((x_tr, y_tr), (x_va, y_va), (X_test, y_test))
    auroc_per_split, auprc_per_split, precisions_per_split, recalls_per_split = [
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits))
    ]

    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid),
                                 (X_test, y_test)]):
        keep_inds = torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:,
                                                           1].detach().numpy()
        #         y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
        auroc_per_split[ii] = roc_auc_score(y[keep_inds], y_pred_proba_pos)
        #         y_pred_proba_pos = np.asarray(y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y[keep_inds],
                                                      y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y[keep_inds], y_pred)
        recalls_per_split[ii] = recall_score(y[keep_inds], y_pred)

    auroc_train, auroc_valid, auroc_test = auroc_per_split
    auprc_train, auprc_valid, auprc_test = auprc_per_split
    precision_train, precision_valid, precision_test = precisions_per_split
    recall_train, recall_valid, recall_test = recalls_per_split

    # save performance
    perf_dict = {
        'auroc_train': auroc_train,
        'auroc_valid': auroc_valid,
        'auroc_test': auroc_test,
        'auprc_train': auprc_train,
        'auprc_valid': auprc_valid,
        'auprc_test': auprc_test,
        'precision_train': precision_train,
        'precision_valid': precision_valid,
        'precision_test': precision_test,
        'recall_train': recall_train,
        'recall_valid': recall_valid,
        'recall_test': recall_test,
        'threshold': best_thr
    }

    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)

    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)
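
The threshold search above sweeps a grid of probability cut-offs on the validation set, keeps only those whose precision clears the floor (fixed_precision = 0.1 here), and picks the cut-off with the best recall. A compact, self-contained sketch of that selection rule on toy scores:

import numpy as np
from sklearn.metrics import precision_score, recall_score

y_valid = np.array([0, 0, 0, 1, 0, 1, 1, 0, 1, 0])
proba = np.array([0.10, 0.20, 0.35, 0.40, 0.45, 0.50, 0.60, 0.65, 0.80, 0.90])

thr_grid = np.linspace(proba.min(), proba.max(), 20)
precisions = np.array([precision_score(y_valid, (proba >= t).astype(int), zero_division=0)
                       for t in thr_grid])
recalls = np.array([recall_score(y_valid, (proba >= t).astype(int)) for t in thr_grid])

fixed_precision = 0.5
keep = precisions >= fixed_precision                 # enforce the precision floor
best_thr = thr_grid[keep][np.argmax(recalls[keep])]  # then maximize recall
print('chosen threshold : %.3f' % best_thr)
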
Code Example #14
    if len(args.group_cols) == 0 or args.group_cols[0] is not None:
        group_cols = args.group_cols
    elif args.group_cols[0] is None:
        try:
            fields = data_dict['fields']
        except KeyError:
            fields = data_dict['schema']['fields']
        group_cols = [c['name'] for c in fields
                      if c['role'] in ('id', 'key') and c['name'] in df.columns]
    '''
    # sort the dataframe by timestamp
    if 'window_start' in df.columns:
        df_timesorted = df.sort_values(
            by=['admission_timestamp', 'window_start', 'window_end'])
    else:
        id_cols = parse_id_cols(data_dict)
        df_timesorted = df.sort_values(by=['admission_timestamp'] + id_cols +
                                       ['hours_since_admission'])

    # set the first 3 years of admissions for training
    train_admission_ts_start = df_timesorted['admission_timestamp'].min()

    # set the 4th year as validation
    valid_admission_ts_start = str(
        pd.to_datetime(train_admission_ts_start) +
        datetime.timedelta(hours=24 * 365 * 3))

    # set the 5th year for test
    test_admission_ts_start = str(
        pd.to_datetime(train_admission_ts_start) +
        datetime.timedelta(hours=24 * 365 * 4))
def featurize_stack_of_many_time_series(
    ts_df=None,
    ts_data_dict=None,
    outcomes_df=None,
    outcomes_data_dict=None,
    summary_ops=['mean', 'min', 'max'],
    percentile_slices_to_featurize=[(0, 100)],
    outcome_col='clinical_deterioration_outcome',
    outcome_seq_duration_col='stay_length',
    start_time_of_each_sequence=-24.0,
    max_time_of_each_sequence=504,
    start_time_of_endpoints=0.0,
    time_between_endpoints=12,
    prediction_horizon=24,
    verbose=True,
):
    ''' Featurize many patient stays slices and extract outcome for each slice

    Args
    ----
    ts_df : pandas DataFrame
        Each row provides all measurements at one time of a single patient-stay
        Must contain one column already converted to numerical time
    ts_data_dict : dict
        Provides specification for every column of ts_df
    outcomes_df : pandas DataFrame
        Each row provides outcome of a single patient stay
    outcomes_data_dict : dict
        Provides specification for each column of outcomes_df
    summary_ops : list of strings
        Identifies the summary functions we wish to apply to each variable's ts
    percentile_slices_to_featurize : list of tuples
        Indicates percentile range of all subwindows we will featurize
        Example: [(0, 100), (0, 50)])

    Returns
    -------
    all_feat_df : DataFrame
        One row per featurized window of any patient-stay slice
        Key columns: ids + ['start', 'stop']
        Value columns: one per extracted feature
    all_outcomes_df : DataFrame
        One row per featurized window of any patient-stay slice
        Key columns: ids + ['start', 'stop']
        Value columns: just one, the outcome column

    Examples
    --------
    >>> args = make_fake_input_data(n_seqs=25, n_features=10, max_duration=50.0)
    >>> feat_df, outcome_df = featurize_stack_of_many_time_series(*args,
    ...     summary_ops=['mean', 'slope'],
    ...     start_time_of_each_sequence=0,
    ...     start_time_of_endpoints=0.0,
    ...     time_between_endpoints=12.0,
    ...     verbose=False,
    ...     );
    >>> feat_df.shape
    (95, 24)

    '''
    # Parse desired slices to featurize at each window
    # This allows command-line specification of ranges as a string
    if isinstance(percentile_slices_to_featurize, str):
        percentile_slices_to_featurize = ast.literal_eval(
            percentile_slices_to_featurize)
    if isinstance(summary_ops, str):
        summary_ops = summary_ops.split(' ')

    # Parse provided data dictionary
    # Recover specific columns for each of the different roles:
    id_cols = parse_id_cols(ts_data_dict)
    id_cols = remove_col_names_from_list_if_not_in_df(id_cols, ts_df)
    feature_cols = parse_feature_cols(ts_data_dict)
    feature_cols = remove_col_names_from_list_if_not_in_df(feature_cols, ts_df)
    time_cols = parse_time_cols(ts_data_dict)
    time_cols = remove_col_names_from_list_if_not_in_df(time_cols, ts_df)
    if len(time_cols) == 0:
        raise ValueError("Expected at least one variable with role='time'")
    elif len(time_cols) > 1:
        # more than one time variable found; fall back to the last one (matches compute_mews_dynamic)
        print("More than one time variable found. Choosing %s" % time_cols[-1])
    time_col = time_cols[-1]

    # Obtain fenceposts delineating each individual sequence within big stack
    # We assume that sequences changeover when *any* key differs
    # We convert all keys to a numerical datatype to make this possible
    keys_df = ts_df[id_cols].copy()
    for col in id_cols:
        if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
            keys_df[col] = keys_df[col].astype('category')
            keys_df[col] = keys_df[col].cat.codes
    middle_fence_posts = 1 + np.flatnonzero(
        np.diff(keys_df.values, axis=0).any(axis=1))
    fp = np.hstack([0, middle_fence_posts, keys_df.shape[0]])

    feat_arr_per_seq = list()
    windows_per_seq = list()
    outcomes_per_seq = list()
    durations_per_seq = list()
    missingness_density_per_seq = list()
    ids_per_seq = list()

    # Total number of features we'll compute in each feature vector
    F = len(percentile_slices_to_featurize) * len(feature_cols) * len(
        summary_ops)

    # Loop over each sequence in the tall tidy-format dataset
    start_time_sec = time.time()
    n_seqs = len(fp) - 1
    pbar = ProgressBar()
    for p in pbar(range(n_seqs)):

        # Get features and times for the current fencepost
        fp_start = fp[p]
        fp_end = fp[p + 1]

        # Get the current stay keys
        cur_id_df = ts_df[id_cols].iloc[fp_start:fp_end].drop_duplicates(
            subset=id_cols)

        if outcomes_df is not None:
            # Get the total duration of the current sequence
            cur_outcomes_df = pd.merge(outcomes_df,
                                       cur_id_df,
                                       on=id_cols,
                                       how='inner')

            # Get the current sequence's final outcome
            cur_final_outcome = int(cur_outcomes_df[outcome_col].values[0])
            cur_seq_duration = float(
                cur_outcomes_df[outcome_seq_duration_col].values[0])
        else:
            cur_seq_duration = float(
                ts_df[time_col].iloc[fp_start:fp_end].values[-1])

        # Create windows at desired spacing
        stop_time_of_cur_sequence = min(cur_seq_duration,
                                        max_time_of_each_sequence)
        window_ends = np.arange(
            start_time_of_endpoints,
            stop_time_of_cur_sequence + 0.01 * time_between_endpoints,
            time_between_endpoints)

        # Create a dictionary of times and values for each feature
        time_arr_by_var = dict()
        val_arr_by_var = dict()
        times_U = ts_df[time_col].values[fp_start:fp_end]
        for feature_col in feature_cols:
            vals_U = ts_df[feature_col].values[fp_start:fp_end]
            keep_mask_U = np.isfinite(vals_U)
            if np.sum(keep_mask_U) > 0:
                time_arr_by_var[feature_col] = times_U[keep_mask_U]
                val_arr_by_var[feature_col] = vals_U[keep_mask_U]

        cur_seq_missing_density = (
            1.0 - len(val_arr_by_var.keys()) / float(len(feature_cols)))

        # Deprecated code from preetish.... MCH couldn't get this to work.
        '''
        v = cur_fp_df.set_index(time_col).agg(lambda x: x.dropna().to_dict()) 
        res = v[v.str.len() > 0].to_dict()
        for feature_col in feature_cols:
            if feature_col in res.keys():
                time_arr_by_var[feature_col] = np.array(
                    list(res[feature_col].keys()), dtype=np.float64)
                val_arr_by_var[feature_col] = np.array(
                    list(res[feature_col].values()), dtype=np.float64)
        '''

        W = len(window_ends)
        window_features_WF = np.zeros([W, F], dtype=np.float32)
        window_starts_stops_W2 = np.zeros([W, 2], dtype=np.float32)
        if outcomes_df is not None:
            window_outcomes_W1 = np.zeros([W, 1], dtype=np.int64)

        for ww, window_end in enumerate(window_ends):
            window_starts_stops_W2[ww, 0] = start_time_of_each_sequence
            window_starts_stops_W2[ww, 1] = window_end

            window_features_WF[ww, :], feat_names = featurize_ts(
                time_arr_by_var,
                val_arr_by_var,
                var_cols=feature_cols,
                var_spec_dict=ts_data_dict,
                start_numerictime=start_time_of_each_sequence,
                stop_numerictime=window_end,
                summary_ops=summary_ops,
                percentile_slices_to_featurize=percentile_slices_to_featurize)

            if outcomes_df is not None:
                # Determine the outcome for this window
                # Set outcome as final outcome if within the provided horizon
                # Otherwise, set to zero
                if window_end >= cur_seq_duration - prediction_horizon:
                    window_outcomes_W1[ww] = cur_final_outcome
                else:
                    window_outcomes_W1[ww] = 0

        # Append all windows from this sequence to the big lists
        feat_arr_per_seq.append(window_features_WF)
        windows_per_seq.append(window_starts_stops_W2)
        ids_per_seq.append(np.tile(cur_id_df.values[0], (W, 1)))

        durations_per_seq.append(np.tile(cur_seq_duration, (W, 1)))
        missingness_density_per_seq.append(cur_seq_missing_density)
        if outcomes_df is not None:
            outcomes_per_seq.append(window_outcomes_W1)

    # Produce final data frames
    features_df = pd.DataFrame(np.vstack(feat_arr_per_seq), columns=feat_names)
    ids_df = pd.DataFrame(np.vstack(ids_per_seq), columns=id_cols)
    windows_df = pd.DataFrame(np.vstack(windows_per_seq),
                              columns=['start', 'stop'])
    all_features_df = pd.concat([ids_df, windows_df, features_df], axis=1)

    if outcomes_df is not None:
        durations_df = pd.DataFrame(np.vstack(durations_per_seq),
                                    columns=[outcome_seq_duration_col])
        outcomes_df = pd.DataFrame(np.vstack(outcomes_per_seq),
                                   columns=[outcome_col])
        all_outcomes_df = pd.concat(
            [ids_df, windows_df, durations_df, outcomes_df], axis=1)
    else:
        durations_df = pd.DataFrame(np.vstack(durations_per_seq),
                                    columns=[outcome_seq_duration_col])
        all_outcomes_df = pd.concat([ids_df, windows_df, durations_df], axis=1)

    seq_lengths = np.vstack([a[0] for a in durations_per_seq])
    elapsed_time_sec = time.time() - start_time_sec

    if verbose:
        print('-----------------------------------------')
        print('Processed %d sequences of duration %.1f-%.1f in %.1f sec' % (
            n_seqs,
            np.percentile(seq_lengths, 5),
            np.percentile(seq_lengths, 95),
            elapsed_time_sec,
        ))
        print('    Total number of measured features: %d' % len(feature_cols))
        print(
            '    Fraction of possible features NEVER seen in a seq. : %.2f-%.2f '
            % (
                np.percentile(missingness_density_per_seq, 5),
                np.percentile(missingness_density_per_seq, 95),
            ))
        print('-----------------------------------------')
    return all_features_df, all_outcomes_df
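
Both compute_mews_dynamic and featurize_stack_of_many_time_series share the same windowing and labeling rule: endpoints are spaced time_between_endpoints hours apart up to the (capped) stay length, and a window inherits the final outcome only if its endpoint falls within prediction_horizon hours of the end of the stay. A small standalone sketch with invented numbers:

import numpy as np

stay_length = 60.0              # hours, from the outcomes table
final_outcome = 1               # e.g. clinical deterioration at end of stay
start_time_of_endpoints = 0.0
time_between_endpoints = 12.0
prediction_horizon = 24.0
max_time_observed = 504.0

stop_time = min(stay_length, max_time_observed)
window_ends = np.arange(start_time_of_endpoints,
                        stop_time + 0.01 * time_between_endpoints,
                        time_between_endpoints)

labels = [final_outcome if end >= stay_length - prediction_horizon else 0
          for end in window_ends]
print(list(zip(window_ends.tolist(), labels)))
# [(0.0, 0), (12.0, 0), (24.0, 0), (36.0, 1), (48.0, 1), (60.0, 1)]
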