Beispiel #1
0
def get_all_features_data(labs_df,
                          labs_data_dict,
                          vitals_df,
                          vitals_data_dict,
                          demographics_df,
                          demographics_data_dict,
                          medications_df,
                          medications_data_dict,
                          include_medications=True):
    '''Merge labs, vitals, demographics (and optionally medications) into one table.

    Parameters
    ----------
    labs_df, vitals_df, medications_df : pandas.DataFrame
        Time-varying measurements keyed by the id columns plus a time column.
    demographics_df : pandas.DataFrame
        Static per-admission features keyed by the id columns only.
    labs_data_dict, vitals_data_dict, demographics_data_dict, medications_data_dict : dict
        Data dictionaries describing each table's columns.
    include_medications : bool, default True
        If True, medications are outer-joined into the high-frequency table.

    Returns
    -------
    features_df : pandas.DataFrame
        Single merged feature table.
    features_data_dict : dict
        Merged data dictionary describing ``features_df``.
    '''
    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    if include_medications:
        # outer-join so no timestamp from any of the three tables is dropped
        highfreq_df = pd.merge(pd.merge(vitals_df,
                                        labs_df,
                                        on=id_cols + [time_col],
                                        how='outer'),
                               medications_df,
                               on=id_cols + [time_col],
                               how='outer')

        # Forward fill medications because the patient is/is not on medication
        # on new time points created by the outer join.
        # NOTE(review): ffill assumes rows are already time-ordered within each
        # admission -- confirm upstream sort order.
        medication_features = parse_feature_cols(medications_data_dict)
        # .ffill() replaces the deprecated fillna(method='pad') (removed in
        # pandas 3.0); grouping on id_cols prevents fills from leaking across
        # admissions.
        highfreq_df[medication_features] = highfreq_df.groupby(
            id_cols)[medication_features].ffill()

        # before the first recorded medication event the patient is off meds
        highfreq_df[medication_features] = highfreq_df[
            medication_features].fillna(0)
        highfreq_data_dict = merge_data_dicts(
            [labs_data_dict, vitals_data_dict, medications_data_dict])

    else:
        highfreq_df = pd.merge(vitals_df,
                               labs_df,
                               on=id_cols + [time_col],
                               how='outer')
        highfreq_data_dict = merge_data_dicts(
            [labs_data_dict, vitals_data_dict])

    highfreq_data_dict['fields'] = highfreq_data_dict['schema']['fields']
    # keep only the id, time and feature columns declared in the data dict
    cols_to_keep = parse_id_cols(highfreq_data_dict) + [
        parse_time_col(highfreq_data_dict)
    ] + parse_feature_cols(highfreq_data_dict)
    highfreq_df = highfreq_df[cols_to_keep].copy()

    # inner-join the static demographics onto the high-frequency table
    features_df = pd.merge(highfreq_df,
                           demographics_df,
                           on=id_cols,
                           how='inner')
    features_data_dict = merge_data_dicts(
        [highfreq_data_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
def update_data_dict_mews(args):
    """Return a copy of the data dict whose fields are the id columns plus a
    single 'mews_score' feature definition.

    Parameters
    ----------
    args : argparse.Namespace
        Must carry the input data dictionary as ``args.data_dict`` (a dict
        with a 'fields' list and optionally a 'schema' sub-dict).

    Returns
    -------
    dict
        Deep copy of ``args.data_dict`` with the field list replaced by the
        id fields followed by the MEWS score field.
    """
    data_dict = args.data_dict

    id_cols = parse_id_cols(data_dict)

    # keep the id fields, in the order parse_id_cols reports them
    new_fields = [
        col for name in id_cols for col in data_dict['fields']
        if col['name'] == name
    ]

    # the single derived feature produced by the MEWS computation
    new_fields.append({
        'name': 'mews_score',
        'role': 'feature',
        'type': 'numeric',
        'description': 'Modified Early Warning Score',
        'units': 'NONE',
        'constraints': {
            'required': 'FALSE',
            'minimum': '0',
            'maximum': 'INF'
        }
    })

    new_data_dict = copy.deepcopy(data_dict)
    # mirror the input layout: prefer storing fields under 'schema' if present
    if 'schema' in new_data_dict:
        new_data_dict['schema']['fields'] = new_fields
        del new_data_dict['fields']
    else:
        new_data_dict['fields'] = new_fields

    return new_data_dict
def main():
    """Train an LSTM binary classifier on variable-length vitals sequences.

    Reads train/test CSV pairs and a data dictionary from the command line,
    fits a skorch ``RNNBinaryClassifier`` with early stopping and
    checkpointing on validation AUROC, then prints final train/test AUROC.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--simulated_data_dir',
        type=str,
        default='simulated_data/2-state/',
        help=
        'dir in which to simulated data is saved.Must be provide if is_data_simulated = True'
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    # each *_files argument is a comma-separated "x-file,y-file" pair
    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data: one outcome label per whole sequence
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    print('number of time points : %s\n number of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to
    # handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # derive a descriptive output prefix from the hyperparameters unless the
    # caller supplied one explicitly
    if args.output_filename_prefix is None:  # was `== None`; identity test is the correct None check
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,  # was hard-coded 50; honor --epochs (same default)
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            # track AUROC on both the train fold and the validation fold
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid',
                          patience=20,
                          threshold=0.002,
                          threshold_mode='rel',
                          lower_is_better=False),
            LRScheduler(policy=ReduceLROnPlateau,
                        mode='max',
                        monitor='aucroc_score_valid',
                        patience=10),
            compute_grad_norm,
            GradientNormClipping(gradient_clip_value=0.3,
                                 gradient_clip_norm_type=2),
            Checkpoint(monitor='aucroc_score_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(args.validation_size),
        module__rnn_type='LSTM',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    # report final AUROC using the predicted probability of the positive class
    y_pred_proba = clf.predict_proba(X_train)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_train_final = roc_auc_score(y_train, y_pred_proba_pos)
    print('AUROC with LSTM (Train) : %.2f' % auroc_train_final)

    y_pred_proba = clf.predict_proba(X_test)
    y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
    auroc_test_final = roc_auc_score(y_test, y_pred_proba_pos)
    print('AUROC with LSTM (Test) : %.2f' % auroc_test_final)
Beispiel #4
0
    # NOTE(review): this excerpt starts mid-function -- the argparse parser
    # and the other arguments are defined above this excerpt.
    parser.add_argument('--merge_x_y',
                        default=True,
                        type=lambda x: (str(x).lower() == 'true'),
                        required=False)

    args = parser.parse_args()

    # read the data dictionaries
    print('Reading train-test data...')

    # read the data dict JSONs and parse the feature and outcome columns
    x_data_dict_file, y_data_dict_file = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_data_dict_file)
    y_data_dict = load_data_dict_json(y_data_dict_file)

    # feature columns vs. the key columns used to join the x and y tables
    feature_cols = parse_feature_cols(x_data_dict)
    key_cols = parse_id_cols(x_data_dict)

    # load each split's CSV shards and merge them into one frame per split
    df_by_split = dict()
    for split_name, csv_files in [('train', args.train_csv_files.split(',')),
                                  ('test', args.test_csv_files.split(','))]:
        cur_df = None
        for csv_file in csv_files:

            # TODO use json data dict to load specific columns as desired types
            more_df = pd.read_csv(csv_file)
            if cur_df is None:
                cur_df = more_df
            else:
                # join x and y shards on the id columns when merging is enabled
                # NOTE(review): the non-merge branch is truncated by the
                # excerpt boundary below.
                if args.merge_x_y:
                    cur_df = cur_df.merge(more_df, on=key_cols)
Beispiel #5
0
    # NOTE(review): this excerpt starts mid-conditional -- the matching `if`
    # branch (and the variables it defines, e.g. feat_names,
    # features_data_dict, features_df_all_slices) are above this excerpt.
    else:
        print(
            'Merging collapsed vitals, collapsed labs, demographics and outcomes data dicts into a single features data dict and a single outcomes data dict...'
        )

        # concatenate the three schemas' field-definition lists
        features_dict_merged = collapsed_labs_data_dict['schema'][
            'fields'] + collapsed_vitals_data_dict['schema'][
                'fields'] + demographics_data_dict['schema']['fields']

    # append each field once, de-duplicating by field name
    for feat_dict in features_dict_merged:
        if feat_dict['name'] not in feat_names:
            features_data_dict['schema']['fields'].append(feat_dict)
            feat_names.append(feat_dict['name'])

    # convert the features to numpy float 32 to avoid memory issues
    feature_cols = parse_feature_cols(features_data_dict['schema'])
    feature_type_dict = dict.fromkeys(feature_cols)
    for k in feature_type_dict.keys():
        feature_type_dict[k] = np.float32
    features_df_all_slices = features_df_all_slices.astype(feature_type_dict)

    # output paths: gzipped CSVs for data, Spec_*.json for the data dicts
    features_csv = os.path.join(args.output_dir, 'features.csv.gz')
    outcomes_csv = os.path.join(args.output_dir, 'outcomes.csv.gz')
    mews_csv = os.path.join(args.output_dir, 'mews.csv.gz')
    features_json = os.path.join(args.output_dir, 'Spec_features.json')
    outcomes_json = os.path.join(args.output_dir, 'Spec_outcomes.json')
    mews_json = os.path.join(args.output_dir, 'Spec_mews.json')

    print('saving features and outcomes to :\n%s\n%s\n%s' %
          (features_csv, outcomes_csv, mews_csv))
Beispiel #6
0
        # NOTE(review): this excerpt starts inside a parser.add_argument(...)
        # call (presumably for a --tslice option) whose opening line is above
        # this excerpt.
        type=str,
        default=2,
        help=
        '''Slice of data to be extracted. If tslice is provided with a % sign (for eg. 20%), 
        then the script extracts the first tslice% data from the stay. If tslice is an int (for eg. 5),
        the the script extracts the first  tslice hrs of data. If tslice is negative (for eg. -5), then 
        the script extracts the data until tslice hours before deterioration/discharge.'''
    )
    parser.add_argument('--output_dir', type=str)

    args = parser.parse_args()
    # unpack all preprocessed tables and their data dictionaries in one call
    labs_df, labs_data_dict, vitals_df, vitals_data_dict, \
    demographics_df, demographics_data_dict, outcomes_df, outcomes_data_dict = get_preprocessed_data(args.preproc_data_dir)

    id_cols = parse_id_cols(vitals_data_dict)
    labs_feature_cols = parse_feature_cols(labs_data_dict)
    vitals_feature_cols = parse_feature_cols(vitals_data_dict)

    # get lengths of stay for each admission by inner-joining stay_length
    # from the outcomes table onto vitals and labs
    vitals_df_with_stay_lengths = pd.merge(vitals_df,
                                           outcomes_df[id_cols +
                                                       ['stay_length']],
                                           on=id_cols,
                                           how='inner')
    labs_df_with_stay_lengths = pd.merge(labs_df,
                                         outcomes_df[id_cols +
                                                     ['stay_length']],
                                         on=id_cols,
                                         how='inner')
    #demographics_df_with_stay_lengths = pd.merge(demographics_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
def main():
    """Train a 1-D CNN binary classifier on variable-length numeric sequences.

    Reads train/test CSV pairs and a data dictionary from the command line,
    fits a Keras Conv1D network with balanced class weights and early
    stopping on validation AUC, prints validation/test AUROC, and saves the
    training history CSV and the trained model under ``--output_dir``.
    """
    # NOTE(review): the description string below looks copy-pasted from the
    # RNN script -- this main trains a Keras CNN. Left unchanged because it is
    # user-visible --help output.
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--n_filters',
                        type=int,
                        default=32,
                        help='Number of filters')
    parser.add_argument('--kernel_size',
                        type=int,
                        default=1,
                        help='size of eack kernel')
    parser.add_argument('--n_conv_layers',
                        type=int,
                        default=1,
                        help='number of convolutional layers')
    parser.add_argument('--stride', type=int, default=1, help='stride')
    parser.add_argument('--pool_size',
                        type=int,
                        default=4,
                        help='max pool size')
    parser.add_argument('--dense_units',
                        type=int,
                        default=128,
                        help='number of units in fully connected layer')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # each *_files argument is a comma-separated "x-file,y-file" pair
    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data: one outcome label per whole sequence
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # balanced class weights to handle class imbalance; keyword arguments are
    # required by recent scikit-learn releases
    class_weights = class_weight.compute_class_weight('balanced',
                                                      classes=np.unique(y_train),
                                                      y=y_train)
    # Keras expects class_weight as a {class_index: weight} dict, not an array
    class_weights = dict(zip(range(len(class_weights)), class_weights))

    # convert labels to one-hot for the 2-unit softmax output
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    # carve a fixed validation split out of the training data
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=args.validation_size, random_state=213)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    set_random_seed(args.seed)
    # stack of Conv1D blocks followed by dropout, max-pooling and a dense head
    model = keras.Sequential()
    for _ in range(args.n_conv_layers):
        model.add(
            keras.layers.Conv1D(filters=args.n_filters,
                                kernel_size=args.kernel_size,
                                activation='relu',
                                strides=args.stride))
    model.add(keras.layers.Dropout(args.dropout))
    model.add(keras.layers.MaxPooling1D(pool_size=args.pool_size))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(args.dense_units, activation='relu'))
    model.add(keras.layers.Dense(2, activation='softmax'))

    # set optimizer
    opt = keras.optimizers.Adam(learning_rate=args.lr)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy', keras.metrics.AUC()])

    # stop when validation AUC plateaus
    early_stopping = EarlyStopping(monitor='val_auc',
                                   patience=20,
                                   mode='max',
                                   verbose=1)

    model.fit(X_train,
              y_train,
              epochs=args.epochs,  # was hard-coded 100; honor --epochs
              validation_data=(X_val, y_val),
              callbacks=[early_stopping],
              class_weight=class_weights,
              batch_size=args.batch_size)

    # predict() returns the softmax probabilities; predict_proba was removed
    # from Keras models in TF2 and was equivalent for Sequential models
    y_score_val = model.predict(X_val)
    val_auc = roc_auc_score(y_val, y_score_val)
    print('AUC on val set : %.4f' % val_auc)

    y_score_test = model.predict(X_test)
    test_auc = roc_auc_score(y_test, y_score_test)
    print('AUC on test set : %.4f' % test_auc)  # fixed label: was 'val set'

    # save the model history
    training_hist_df = pd.DataFrame(model.history.history)
    training_hist_df.loc[:, 'test_auc'] = test_auc
    training_hist_csv = os.path.join(args.output_dir,
                                     args.output_filename_prefix + '.csv')
    training_hist_df.to_csv(training_hist_csv, index=False)

    # save the model
    model_file = os.path.join(args.output_dir,
                              args.output_filename_prefix + '.model')
    model.save(model_file)
Beispiel #8
0
    # NOTE(review): this excerpt starts mid-function -- `parser` is defined
    # above this excerpt.
    args = parser.parse_args()

    # paths to the pre-split feature tables and their data dictionary
    x_train_csv = os.path.join(args.train_test_split_dir, 'x_train.csv.gz')
    x_valid_csv = os.path.join(args.train_test_split_dir, 'x_valid.csv.gz')
    x_test_csv = os.path.join(args.train_test_split_dir, 'x_test.csv.gz')
    x_dict_json = os.path.join(args.train_test_split_dir, 'x_dict.json')

    # impute values by carry forward and then pop mean on train and test sets separately
    x_data_dict = load_data_dict_json(x_dict_json)
    x_train_df = pd.read_csv(x_train_csv)
    x_valid_df = pd.read_csv(x_valid_csv)
    x_test_df = pd.read_csv(x_test_csv)

    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    time_col = parse_time_col(x_data_dict)

    # split features into medication vs. non-medication columns by name;
    # only non-medication measurements get missingness masks below
    non_medication_feature_cols = [
        feature_col for feature_col in feature_cols
        if 'medication' not in feature_col
    ]
    medication_feature_cols = [
        feature_col for feature_col in feature_cols
        if 'medication' in feature_col
    ]

    print('Adding missing values mask as features...')
    # NOTE(review): the statement below is truncated by the excerpt boundary.
    for feature_col in non_medication_feature_cols:
        x_train_df.loc[:, 'mask_' +
Beispiel #9
0
def get_all_features_data(labs_df, labs_data_dict, vitals_df, vitals_data_dict, demographics_df, demographics_data_dict):
    '''Returns the merged labs, vitals and demographics features into a single table and the data dict'''

    time_col = parse_time_col(vitals_data_dict)
    id_cols = parse_id_cols(vitals_data_dict)

    # outer-join labs onto vitals so no timestamp from either table is lost
    join_keys = id_cols + [time_col]
    hf_df = vitals_df.merge(labs_df, on=join_keys, how='outer')

    # combine the two data dicts and expose the schema fields at the top level
    hf_dict = merge_data_dicts([labs_data_dict, vitals_data_dict])
    hf_dict['fields'] = hf_dict['schema']['fields']

    # restrict to the id/time/feature columns declared in the merged data dict
    keep_cols = (parse_id_cols(hf_dict)
                 + [parse_time_col(hf_dict)]
                 + parse_feature_cols(hf_dict))
    hf_df = hf_df[keep_cols].copy()

    # attach the static demographics via an inner join on the id columns
    features_df = hf_df.merge(demographics_df, on=id_cols, how='inner')
    features_data_dict = merge_data_dicts([hf_dict, demographics_data_dict])
    features_data_dict['fields'] = features_data_dict['schema']['fields']

    return features_df, features_data_dict
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = 'cpu'

    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    x_dict, y_dict = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)
    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    valid_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_valid_csv_filename,
        y_csv_path=y_valid_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(x_csv_path=x_test_csv_filename,
                                              y_csv_path=y_test_csv_filename,
                                              x_col_names=feature_cols,
                                              idx_col_names=id_cols,
                                              y_col_name=args.outcome_col_name,
                                              y_label_type='per_tstep')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    #     from IPython import embed; embed()
    #     X_train = (X_train - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_valid = (X_valid - np.min(X_train))/(np.max(X_train)-np.min(X_train))
    #     X_test = (X_test - np.min(X_train))/(np.max(X_train)-np.min(X_train))

    valid_ds = Dataset(X_valid, y_valid)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).float()

    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' %
          ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' %
          ((y_test == 1).sum() / len(y_test)))

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # LSTM
    if args.output_filename_prefix == None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    # Stop training once validation loss has failed to improve by at least
    # 0.2% (relative) for 15 consecutive epochs.
    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss',
                                           patience=15,
                                           threshold=0.002,
                                           threshold_mode='rel',
                                           lower_is_better=True)

    # Per-timestep binary GRU classifier (skorch-style API: callbacks,
    # train_split, module__* hyperparameters — confirm against the
    # RNNPerTStepBinaryClassifier definition).
    # Callbacks: AUPRC/AUROC scoring on both train and validation each epoch,
    # early stopping on validation loss, checkpoint on best validation AUPRC,
    # and a final checkpoint when training ends.
    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=True,
                         name='auprc_train'),
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=False,
                         name='auprc_valid'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=True,
                         name='auroc_train'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=False,
                         name='auroc_valid'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=True, name='precision_train'),
            #               EpochScoring(calc_precision, lower_is_better=False, on_train=False, name='precision_valid'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=True, name='recall_train'),
            #               EpochScoring(calc_recall, lower_is_better=False, on_train=False, name='recall_valid'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=True, name='aucroc_score_train'),
            #               EpochScoring('roc_auc', lower_is_better=False, on_train=False, name='aucroc_score_valid'),
            #                   EarlyStopping(monitor='auprc_valid', patience=5, threshold=0.002, threshold_mode='rel',
            #                                                  lower_is_better=False),
            #               LRScheduler(policy=ReduceLROnPlateau, mode='max', monitor='aucroc_score_valid', patience=10),
            #                   compute_grad_norm,
            #               GradientNormClipping(gradient_clip_value=0.5, gradient_clip_norm_type=2),
            loss_early_stopping_cp,
            # Persist the training history whenever validation AUPRC improves.
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        #               criterion=torch.nn.CrossEntropyLoss,
        #               criterion__weight=class_weights,
        # Use the caller-provided validation set instead of an internal split.
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    #     N=len(X_train)
    #     X_train = X_train[:N]
    #     y_train = y_train[:N]

    clf = rnn.fit(X_train, y_train)

    # get threshold with max recall at fixed precision
    fixed_precision = 0.1

    # get predict probas for y=1 on validation set
    # Keep only timesteps whose feature vector is not entirely NaN
    # (presumably all-NaN rows are padding added to equalize sequence
    # lengths — confirm against the data loader).
    keep_inds_va = torch.logical_not(
        torch.all(torch.isnan(torch.FloatTensor(X_valid)), dim=-1))
    y_va_pred_proba = clf.predict_proba(
        X_valid)[keep_inds_va][:, 1].detach().numpy()

    # Candidate thresholds: 100 evenly spaced points from the 1st percentile
    # of the unique predicted probabilities up to their maximum.
    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1),
                             max(unique_probas), 100)

    # Precision and recall on the validation set for every candidate threshold.
    precision_scores_G, recall_scores_G = [
        np.zeros(thr_grid_G.size),
        np.zeros(thr_grid_G.size)
    ]
    for gg, thr in enumerate(thr_grid_G):
        #             logistic_clf.module_.linear_transform_layer.bias.data = torch.tensor(thr_grid[gg]).double()
        curr_thr_y_preds = clf.predict_proba(
            torch.FloatTensor(X_valid))[keep_inds_va][:, 1] >= thr_grid_G[gg]
        precision_scores_G[gg] = precision_score(y_valid[keep_inds_va],
                                                 curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_valid[keep_inds_va],
                                           curr_thr_y_preds)

    # Restrict to thresholds that reach the target precision; if none do,
    # relax the target to the 99th percentile of achieved precisions.
    keep_inds = precision_scores_G >= fixed_precision

    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print(
            'Could not find threshold with precision >= %.3f \n Choosing threshold to maximize recall at precision %.3f'
            % (fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]
    thr_perf_df = pd.DataFrame(
        np.vstack([
            thr_grid_G[np.newaxis, :], precision_scores_G[np.newaxis, :],
            recall_scores_G[np.newaxis, :]
        ]).T,
        columns=['thr', 'precision_score', 'recall_score'])

    print(thr_perf_df)
    # Among the surviving thresholds, pick the one maximizing recall.
    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    # Evaluate AUROC, AUPRC, precision and recall (at the chosen threshold)
    # on each of the train / valid / test splits.
    splits = ['train', 'valid', 'test']
    #     data_splits = ((x_tr, y_tr), (x_va, y_va), (X_test, y_test))
    auroc_per_split, auprc_per_split, precisions_per_split, recalls_per_split = [
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits)),
        np.zeros(len(splits))
    ]

    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid),
                                 (X_test, y_test)]):
        # Same all-NaN masking as above, per split.
        keep_inds = torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:,
                                                           1].detach().numpy()
        #         y_pred_proba_neg, y_pred_proba_pos = zip(*y_pred_proba)
        auroc_per_split[ii] = roc_auc_score(y[keep_inds], y_pred_proba_pos)
        #         y_pred_proba_pos = np.asarray(y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y[keep_inds],
                                                      y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y[keep_inds], y_pred)
        recalls_per_split[ii] = recall_score(y[keep_inds], y_pred)

    auroc_train, auroc_valid, auroc_test = auroc_per_split
    auprc_train, auprc_valid, auprc_test = auprc_per_split
    precision_train, precision_valid, precision_test = precisions_per_split
    recall_train, recall_valid, recall_test = recalls_per_split

    # save performance
    perf_dict = {
        'auroc_train': auroc_train,
        'auroc_valid': auroc_valid,
        'auroc_test': auroc_test,
        'auprc_train': auprc_train,
        'auprc_valid': auprc_valid,
        'auprc_test': auprc_test,
        'precision_train': precision_train,
        'precision_valid': precision_valid,
        'precision_test': precision_test,
        'recall_train': recall_train,
        'recall_valid': recall_valid,
        'recall_test': recall_test,
        'threshold': best_thr
    }

    # Write the single-row performance summary as CSV next to the checkpoints.
    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)

    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--preproc_data_dir',
        help='directory where the labs, vitals, demographics and outcomes are stored')
    # BUGFIX: the default must be the *string* '2', not the int 2. argparse
    # does not pass non-string defaults through `type`, so the later
    # membership test "'%' in args.tslice" raised TypeError whenever the
    # flag was omitted.
    # BUGFIX: literal % signs in argparse help must be escaped as %% —
    # argparse applies %-style formatting when rendering help, so unescaped
    # % made `--help` crash.
    parser.add_argument('--tslice', type=str, default='2',
        help='''Slice of data to be extracted. If tslice is provided with a %% sign (for eg. 20%%), 
        then the script extracts the first tslice%% data from the stay. If tslice is an int (for eg. 5),
        the the script extracts the first  tslice hrs of data. If tslice is negative (for eg. -5), then 
        the script extracts the data until tslice hours before deterioration/discharge.''')
    parser.add_argument('--output_dir', type=str)

    args = parser.parse_args()
    # Load every preprocessed table plus its data dictionary in one call.
    labs_df, labs_data_dict, vitals_df, vitals_data_dict, \
    demographics_df, demographics_data_dict, medications_df, medications_data_dict, outcomes_df, outcomes_data_dict = get_preprocessed_data(args.preproc_data_dir)

    id_cols = parse_id_cols(vitals_data_dict)
    labs_feature_cols = parse_feature_cols(labs_data_dict)
    vitals_feature_cols = parse_feature_cols(vitals_data_dict)
    medications_feature_cols = parse_feature_cols(medications_data_dict)

    # get lengths of stay for each admission
    # (inner join attaches outcomes_df's 'stay_length' to each measurement row)
    vitals_df_with_stay_lengths = pd.merge(vitals_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    labs_df_with_stay_lengths = pd.merge(labs_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    medications_df_with_stay_lengths = pd.merge(medications_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')
    #demographics_df_with_stay_lengths = pd.merge(demographics_df, outcomes_df[id_cols + ['stay_length']], on=id_cols, how='inner')

    # find stays that satisfy minimum stay length
    censor_start = 504  # hours; presumably a censoring horizon — TODO confirm
    tstops_df = outcomes_df[id_cols].copy()
    if ('%' in args.tslice):
        min_stay_length = 0
        print('Including EHR measured in first %s percent of patient stays having atleast %s hours of data' % (args.tslice, min_stay_length))
def featurize_stack_of_many_time_series(
    ts_df=None,
    ts_data_dict=None,
    outcomes_df=None,
    outcomes_data_dict=None,
    summary_ops=None,
    percentile_slices_to_featurize=None,
    outcome_col='clinical_deterioration_outcome',
    outcome_seq_duration_col='stay_length',
    start_time_of_each_sequence=-24.0,
    max_time_of_each_sequence=504,
    start_time_of_endpoints=0.0,
    time_between_endpoints=12,
    prediction_horizon=24,
    verbose=True,
):
    ''' Featurize many patient stays slices and extract outcome for each slice

    Args
    ----
    ts_df : pandas DataFrame
        Each row provides all measurements at one time of a single patient-stay
        Must contain one column already converted to numerical time
    ts_data_dict : dict
        Provides specification for every column of ts_df
    outcomes_df : pandas DataFrame
        Each row provides outcome of a single patient stay
    outcomes_data_dict : dict
        Provides specification for each column of outcomes_df
    summary_ops : list of strings, or None
        Identifies the summary functions we wish to apply to each variable's ts
        Defaults to ['mean', 'min', 'max'] when None
    percentile_slices_to_featurize : list of tuples, or None
        Indicates percentile range of all subwindows we will featurize
        Defaults to [(0, 100)] when None
        Example: [(0, 100), (0, 50)])

    Returns
    -------
    all_feat_df : DataFrame
        One row per featurized window of any patient-stay slice
        Key columns: ids + ['start', 'stop']
        Value columns: one per extracted feature
    all_outcomes_df : DataFrame
        One row per featurized window of any patient-stay slice
        Key columns: ids + ['start', 'stop']
        Value columns: just one, the outcome column

    Examples
    --------
    >>> args = make_fake_input_data(n_seqs=25, n_features=10, max_duration=50.0)
    >>> feat_df, outcome_df = featurize_stack_of_many_time_series(*args,
    ...     summary_ops=['mean', 'slope'],
    ...     start_time_of_each_sequence=0,
    ...     start_time_of_endpoints=0.0,
    ...     time_between_endpoints=12.0,
    ...     verbose=False,
    ...     );
    >>> feat_df.shape
    (95, 24)

    '''
    # Local import keeps this fix self-contained (file header not edited here).
    import warnings

    # Materialize defaults per call instead of using mutable default
    # arguments (shared-list pitfall); effective defaults are unchanged.
    if summary_ops is None:
        summary_ops = ['mean', 'min', 'max']
    if percentile_slices_to_featurize is None:
        percentile_slices_to_featurize = [(0, 100)]

    # Parse desired slices to featurize at each window
    # This allows command-line specification of ranges as a string
    if isinstance(percentile_slices_to_featurize, str):
        percentile_slices_to_featurize = ast.literal_eval(
            percentile_slices_to_featurize)
    if isinstance(summary_ops, str):
        summary_ops = summary_ops.split(' ')

    # Parse provided data dictionary
    # Recover specific columns for each of the different roles:
    id_cols = parse_id_cols(ts_data_dict)
    id_cols = remove_col_names_from_list_if_not_in_df(id_cols, ts_df)
    feature_cols = parse_feature_cols(ts_data_dict)
    feature_cols = remove_col_names_from_list_if_not_in_df(feature_cols, ts_df)
    time_cols = parse_time_cols(ts_data_dict)
    time_cols = remove_col_names_from_list_if_not_in_df(time_cols, ts_df)
    if len(time_cols) == 0:
        raise ValueError("Expected at least one variable with role='time'")
    elif len(time_cols) > 1:
        # BUGFIX: the original `raise Warning(...)` aborted execution, making
        # the fallback below unreachable despite the "Choosing ..." message.
        # Emit a real (non-fatal) warning and continue with the last time col.
        warnings.warn("More than one time variable found. Choosing %s" %
                      time_cols[-1])
    time_col = time_cols[-1]

    # Obtain fenceposts delineating each individual sequence within big stack
    # We assume that sequences changeover when *any* key differs
    # We convert all keys to a numerical datatype to make this possible
    keys_df = ts_df[id_cols].copy()
    for col in id_cols:
        if not pd.api.types.is_numeric_dtype(keys_df[col].dtype):
            keys_df[col] = keys_df[col].astype('category')
            keys_df[col] = keys_df[col].cat.codes
    middle_fence_posts = 1 + np.flatnonzero(
        np.diff(keys_df.values, axis=0).any(axis=1))
    fp = np.hstack([0, middle_fence_posts, keys_df.shape[0]])

    feat_arr_per_seq = list()
    windows_per_seq = list()
    outcomes_per_seq = list()
    durations_per_seq = list()
    missingness_density_per_seq = list()
    ids_per_seq = list()

    # Total number of features we'll compute in each feature vector
    F = len(percentile_slices_to_featurize) * len(feature_cols) * len(
        summary_ops)

    # Loop over each sequence in the tall tidy-format dataset
    start_time_sec = time.time()
    n_seqs = len(fp) - 1
    pbar = ProgressBar()
    for p in pbar(range(n_seqs)):

        # Get features and times for the current fencepost
        fp_start = fp[p]
        fp_end = fp[p + 1]

        # Get the current stay keys
        cur_id_df = ts_df[id_cols].iloc[fp_start:fp_end].drop_duplicates(
            subset=id_cols)

        if outcomes_df is not None:
            # Get the total duration of the current sequence
            cur_outcomes_df = pd.merge(outcomes_df,
                                       cur_id_df,
                                       on=id_cols,
                                       how='inner')

            # Get the current sequence's finale outcome
            cur_final_outcome = int(cur_outcomes_df[outcome_col].values[0])
            cur_seq_duration = float(
                cur_outcomes_df[outcome_seq_duration_col].values[0])
        else:
            # Without outcomes, use the last observed timestamp as duration
            cur_seq_duration = float(
                ts_df[time_col].iloc[fp_start:fp_end].values[-1])

        # Create windows at desired spacing
        stop_time_of_cur_sequence = min(cur_seq_duration,
                                        max_time_of_each_sequence)
        window_ends = np.arange(
            start_time_of_endpoints,
            stop_time_of_cur_sequence + 0.01 * time_between_endpoints,
            time_between_endpoints)

        # Create a dictionary of times and values for each feature,
        # dropping non-finite (missing) measurements per variable
        time_arr_by_var = dict()
        val_arr_by_var = dict()
        times_U = ts_df[time_col].values[fp_start:fp_end]
        for feature_col in feature_cols:
            vals_U = ts_df[feature_col].values[fp_start:fp_end]
            keep_mask_U = np.isfinite(vals_U)
            if np.sum(keep_mask_U) > 0:
                time_arr_by_var[feature_col] = times_U[keep_mask_U]
                val_arr_by_var[feature_col] = vals_U[keep_mask_U]

        # Fraction of variables with zero observed values in this sequence
        cur_seq_missing_density = (
            1.0 - len(val_arr_by_var.keys()) / float(len(feature_cols)))

        W = len(window_ends)
        window_features_WF = np.zeros([W, F], dtype=np.float32)
        window_starts_stops_W2 = np.zeros([W, 2], dtype=np.float32)
        if outcomes_df is not None:
            window_outcomes_W1 = np.zeros([W, 1], dtype=np.int64)

        for ww, window_end in enumerate(window_ends):
            window_starts_stops_W2[ww, 0] = start_time_of_each_sequence
            window_starts_stops_W2[ww, 1] = window_end

            window_features_WF[ww, :], feat_names = featurize_ts(
                time_arr_by_var,
                val_arr_by_var,
                var_cols=feature_cols,
                var_spec_dict=ts_data_dict,
                start_numerictime=start_time_of_each_sequence,
                stop_numerictime=window_end,
                summary_ops=summary_ops,
                percentile_slices_to_featurize=percentile_slices_to_featurize)

            if outcomes_df is not None:
                # Determine the outcome for this window
                # Set outcome as final outcome if within the provided horizon
                # Otherwise, set to zero
                if window_end >= cur_seq_duration - prediction_horizon:
                    window_outcomes_W1[ww] = cur_final_outcome
                else:
                    window_outcomes_W1[ww] = 0

        # Append all windows from this sequence to the big lists
        feat_arr_per_seq.append(window_features_WF)
        windows_per_seq.append(window_starts_stops_W2)
        ids_per_seq.append(np.tile(cur_id_df.values[0], (W, 1)))

        durations_per_seq.append(np.tile(cur_seq_duration, (W, 1)))
        missingness_density_per_seq.append(cur_seq_missing_density)
        if outcomes_df is not None:
            outcomes_per_seq.append(window_outcomes_W1)

    # Produce final data frames
    features_df = pd.DataFrame(np.vstack(feat_arr_per_seq), columns=feat_names)
    ids_df = pd.DataFrame(np.vstack(ids_per_seq), columns=id_cols)
    windows_df = pd.DataFrame(np.vstack(windows_per_seq),
                              columns=['start', 'stop'])
    all_features_df = pd.concat([ids_df, windows_df, features_df], axis=1)

    if outcomes_df is not None:
        durations_df = pd.DataFrame(np.vstack(durations_per_seq),
                                    columns=[outcome_seq_duration_col])
        outcomes_df = pd.DataFrame(np.vstack(outcomes_per_seq),
                                   columns=[outcome_col])
        all_outcomes_df = pd.concat(
            [ids_df, windows_df, durations_df, outcomes_df], axis=1)
    else:
        durations_df = pd.DataFrame(np.vstack(durations_per_seq),
                                    columns=[outcome_seq_duration_col])
        all_outcomes_df = pd.concat([ids_df, windows_df, durations_df], axis=1)

    seq_lengths = np.vstack([a[0] for a in durations_per_seq])
    elapsed_time_sec = time.time() - start_time_sec

    if verbose:
        print('-----------------------------------------')
        print('Processed %d sequences of duration %.1f-%.1f in %.1f sec' % (
            n_seqs,
            np.percentile(seq_lengths, 5),
            np.percentile(seq_lengths, 95),
            elapsed_time_sec,
        ))
        print('    Total number of measured features: %d' % len(feature_cols))
        print(
            '    Fraction of possible features NEVER seen in a seq. : %.2f-%.2f '
            % (
                np.percentile(missingness_density_per_seq, 5),
                np.percentile(missingness_density_per_seq, 95),
            ))
        print('-----------------------------------------')
    return all_features_df, all_outcomes_df