Example #1
0
                                                                                      id_cols, time_col, sequence_feature_cols)
                    
                    
                    # impute missing values in the test features
                    curr_sequence_features_df = curr_sequence_features_df.groupby(id_cols).apply(lambda x: x.fillna(method='pad')).copy()

                    for feature_col in sequence_feature_cols:
                        curr_sequence_features_df[feature_col].fillna(curr_sequence_features_df[feature_col].mean(), inplace=True)  
                    
                    for feature_col in sequence_feature_cols:
                        curr_sequence_features_df[feature_col].fillna(x_train_df[feature_col].mean(), inplace=True) 
                    # load test data with TidySequentialDataLoader
                    test_vitals = TidySequentialDataCSVLoader(
                        x_csv_path=curr_sequence_features_df,
                        y_csv_path=chosen_stay_outcomes_df,
                        x_col_names=feature_cols_with_mask_features,
                        idx_col_names=id_cols,
                        y_col_name=args.outcome_column_name,
                        y_label_type='per_sequence'
                    )    

                    # predict on test data
                    x_test, y_test = test_vitals.get_batch_data(batch_id=0)


                    per_feature_scaling = np.load(os.path.join(args.rnn_models_dir, 'per_feature_scaling.npy'))
                    for f in range(x_test.shape[2]):
                        x_test[:,:,f] = x_test[:,:,f]/per_feature_scaling[f]
                    
                    mask_feature_cols = [i for i in feature_cols_with_mask_features if 'mask' in i]
                    
                    total_missing_features_over_time[q]=(curr_sequence_features_df[mask_feature_cols]==0).sum().sum()
def _str_to_bool(value):
    """Convert a command-line token such as 'true' or '0' into a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    every non-empty token (including 'False') becomes True.  Parsing the
    token explicitly avoids that.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got %r' % value)


def main():
    """Train an LSTM binary classifier on tidy sequence CSVs and print AUROC.

    The command line supplies comma-separated ``x,y`` CSV path pairs for the
    train and test splits, the matching pair of data-dictionary JSON files,
    and the model/optimizer hyperparameters.  The classifier is fit with
    inverse-frequency class weights, and the AUROC on the train and test
    sets is printed.  The training history JSON and the end-of-training
    checkpoint are written under ``--output_dir``.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=_str_to_bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--simulated_data_dir',
        type=str,
        default='simulated_data/2-state/',
        help=
        'dir in which to simulated data is saved.Must be provide if is_data_simulated = True'
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    # The checkpoint callbacks below build paths from output_dir; report a
    # missing value now instead of failing with a TypeError after the data
    # has already been loaded.
    if args.output_dir is None:
        parser.error('--output_dir must be provided so that the training '
                     'history and checkpoints can be saved')

    torch.manual_seed(args.seed)
    device = 'cpu'

    # Each *_files argument is an "x_path,y_path" pair.
    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    # Only the x data dictionary is used; the unpacking still enforces that
    # exactly two paths were given.
    x_dict, _ = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    print('number of time points : %s\n number of features : %s\n' % (T, F))

    # set class weights as 1/(number of samples in class) for each class to
    # handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # callback to compute gradient norm
    compute_grad_norm = ComputeGradientNorm(norm_type=2)

    # Default the output prefix to a summary of the hyperparameters.
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    # LSTM
    rnn = RNNBinaryClassifier(
        # honour --epochs (its default equals the value that used to be
        # hard-coded here)
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=True,
                         name='aucroc_score_train'),
            EpochScoring('roc_auc',
                         lower_is_better=False,
                         on_train=False,
                         name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid',
                          patience=20,
                          threshold=0.002,
                          threshold_mode='rel',
                          lower_is_better=False),
            LRScheduler(policy=ReduceLROnPlateau,
                        mode='max',
                        monitor='aucroc_score_valid',
                        patience=10),
            compute_grad_norm,
            GradientNormClipping(gradient_clip_value=0.3,
                                 gradient_clip_norm_type=2),
            # NOTE(review): the monitored key holds the score itself rather
            # than a "..._best" flag, so this presumably checkpoints on every
            # epoch instead of only on improvement -- confirm the intent
            # against the skorch Checkpoint docs before changing it.
            Checkpoint(monitor='aucroc_score_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(args.validation_size),
        module__rnn_type='LSTM',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    def positive_class_auroc(X, y):
        # predict_proba yields (negative, positive) pairs; score the positive
        # column against the labels.
        _, proba_pos = zip(*clf.predict_proba(X))
        return roc_auc_score(y, proba_pos)

    auroc_train_final = positive_class_auroc(X_train, y_train)
    print('AUROC with LSTM (Train) : %.2f' % auroc_train_final)

    auroc_test_final = positive_class_auroc(X_test, y_test)
    print('AUROC with LSTM (Test) : %.2f' % auroc_test_final)
Example #3
0
def main():
    """Train a 1-D convolutional binary classifier with Keras and save it.

    The command line supplies comma-separated ``x,y`` CSV path pairs for the
    train and test splits, the matching pair of data-dictionary JSON files,
    and the network/optimizer hyperparameters.  A validation split is carved
    out of the training data, the network is trained with balanced class
    weights and early stopping on validation AUC, and the validation and test
    AUC are printed.  The per-epoch history (with the test AUC appended as a
    column) and the trained model are written to
    ``<output_dir>/<output_filename_prefix>.csv`` and ``.model``.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    # The default matches the epoch count that used to be hard-coded in the
    # call to model.fit, so a default run trains exactly as before while an
    # explicit --epochs is now respected.
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='Number of epochs')
    parser.add_argument('--n_filters',
                        type=int,
                        default=32,
                        help='Number of filters')
    parser.add_argument('--kernel_size',
                        type=int,
                        default=1,
                        help='size of eack kernel')
    parser.add_argument('--n_conv_layers',
                        type=int,
                        default=1,
                        help='number of convolutional layers')
    parser.add_argument('--stride', type=int, default=1, help='stride')
    parser.add_argument('--pool_size',
                        type=int,
                        default=4,
                        help='max pool size')
    parser.add_argument('--dense_units',
                        type=int,
                        default=128,
                        help='number of units in fully connected layer')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    # Both values are needed to build the output paths at the very end; check
    # them up front so a missing flag does not discard a fully trained model.
    if args.output_dir is None or args.output_filename_prefix is None:
        parser.error('--output_dir and --output_filename_prefix must be '
                     'provided so that the training history and model can '
                     'be saved')

    torch.manual_seed(args.seed)

    # Each *_files argument is an "x_path,y_path" pair.
    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    # Only the x data dictionary is used; the unpacking still enforces that
    # exactly two paths were given.
    x_dict, _ = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_train_csv_filename,
        y_csv_path=y_train_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    test_vitals = TidySequentialDataCSVLoader(
        x_csv_path=x_test_csv_filename,
        y_csv_path=y_test_csv_filename,
        x_col_names=feature_cols,
        idx_col_names=id_cols,
        y_col_name=args.outcome_col_name,
        y_label_type='per_sequence')

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, F = X_train.shape

    # Balanced class weights, keyed by class label.  Keras expects a
    # {class_index: weight} dict, and to_categorical below places label v in
    # column v, so the label itself is the right key.
    classes = np.unique(y_train)
    balanced_weights = class_weight.compute_class_weight('balanced',
                                                         classes=classes,
                                                         y=y_train)
    class_weights = {
        int(label): weight
        for label, weight in zip(classes, balanced_weights)
    }

    # convert labels to one-hot vectors for the 2-unit softmax output
    y_train = keras.utils.to_categorical(y_train)
    y_test = keras.utils.to_categorical(y_test)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=args.validation_size, random_state=213)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    set_random_seed(args.seed)
    model = keras.Sequential()
    for _ in range(args.n_conv_layers):
        model.add(
            keras.layers.Conv1D(filters=args.n_filters,
                                kernel_size=args.kernel_size,
                                activation='relu',
                                strides=args.stride))
    model.add(keras.layers.Dropout(args.dropout))
    model.add(keras.layers.MaxPooling1D(pool_size=args.pool_size))
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(args.dense_units, activation='relu'))
    model.add(keras.layers.Dense(2, activation='softmax'))

    # set optimizer
    opt = keras.optimizers.Adam(learning_rate=args.lr)
    # Name the AUC metric explicitly so the history key monitored by early
    # stopping is always 'val_auc'.
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy', keras.metrics.AUC(name='auc')])

    # set early stopping
    early_stopping = EarlyStopping(monitor='val_auc',
                                   patience=20,
                                   mode='max',
                                   verbose=1)

    history = model.fit(X_train,
                        y_train,
                        epochs=args.epochs,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping],
                        class_weight=class_weights,
                        batch_size=args.batch_size)

    # The final layer is a softmax, so predict() already returns class
    # probabilities.
    y_score_val = model.predict(X_val)
    val_auc = roc_auc_score(y_val, y_score_val)
    print('AUC on val set : %.4f' % val_auc)

    y_score_test = model.predict(X_test)
    test_auc = roc_auc_score(y_test, y_score_test)
    print('AUC on test set : %.4f' % test_auc)

    # save the model history
    training_hist_df = pd.DataFrame(history.history)
    training_hist_df.loc[:, 'test_auc'] = test_auc
    training_hist_csv = os.path.join(args.output_dir,
                                     args.output_filename_prefix + '.csv')
    training_hist_df.to_csv(training_hist_csv, index=False)

    # save the model
    model_file = os.path.join(args.output_dir,
                              args.output_filename_prefix + '.model')
    model.save(model_file)
Example #4
0
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--save',
                        type=str,
                        default='model.pt',
                        help='path to save the final model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    device = 'cpu'

    dataset = TidySequentialDataCSVLoader('my_dataset.csv')
    X, y = dataset.get_batch_data(batch_id=0)

    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        callbacks=[
            #skorch.callbacks.Checkpoint(),
            skorch.callbacks.ProgressBar(),
        ],
        module__rnn_type='ELMAN+relu',
        module__n_inputs=X.shape[-1],
        module__n_hiddens=10,
        module__n_layers=1,
        optimizer=torch.optim.SGD,
Example #5
0
                        metavar='N',
                        help='number of epochs')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--save',
                        type=str,
                        default='model.pt',
                        help='path to save the final model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    device = 'cpu'

    dataset = TidySequentialDataCSVLoader(
        per_tstep_csv_path='eeg_rnn_data/eeg_train_balanced.csv',
        idx_col_names='chunk_id',
        x_col_names=['eeg_signal'],
        y_col_name='seizure_binary_label',
        y_label_type='per_tstep')
    X, y = dataset.get_batch_data(batch_id=0)

    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        callbacks=[
            #skorch.callbacks.Checkpoint(),
            skorch.callbacks.ProgressBar(),
        ],
        module__rnn_type='ELMAN+relu',
        #module__rnn_type='LSTM',
        module__n_inputs=X.shape[-1],
def _str_to_bool(value):
    """Convert a command-line token such as 'true' or '0' into a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so
    every non-empty token (including 'False') becomes True.  Parsing the
    token explicitly avoids that.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got %r' % value)


def main():
    """Train an LSTM classifier on vitals sequences and write a report.

    Loads train/test vitals CSVs together with a metadata CSV, fits an
    ``RNNBinaryClassifier`` with inverse-frequency class weights, and writes
    the following into ``--report_dir`` (all prefixed by
    ``--output_filename_prefix``): gradient-norm and training histories, a
    plot and CSV of the per-epoch losses, the pickled classifier, and an HTML
    report.  Summary metrics on the test set are printed to stdout.

    Raises:
        NotImplementedError: if ``--is_data_simulated`` is true; this
            function contains no loader for simulated data.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )

    parser.add_argument('--train_vitals_csv',
                        type=str,
                        help='Location of vitals data for training')
    parser.add_argument('--test_vitals_csv',
                        type=str,
                        help='Location of vitals data for testing')
    parser.add_argument('--metadata_csv',
                        type=str,
                        help='Location of metadata for testing and training')
    parser.add_argument('--data_dict', type=str)
    parser.add_argument('--batch_size',
                        type=int,
                        default=256,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=100000,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=10,
                        help='Number of hidden units')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-2,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.3,
                        help='dropout for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--save',
                        type=str,
                        default='RNNmodel.pt',
                        help='path to save the final model')
    parser.add_argument('--report_dir',
                        type=str,
                        default='html',
                        help='dir in which to save results report')
    parser.add_argument('--simulated_data_dir',
                        type=str,
                        default='simulated_data/2-state/',
                        help='dir in which to simulated data is saved')
    parser.add_argument(
        '--is_data_simulated',
        type=_str_to_bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default='current_config',
        help='file to save the loss and validation over epochs')
    args = parser.parse_args()

    # Only the CSV-backed path is implemented below; say so explicitly rather
    # than failing later on variables that were never assigned.
    if args.is_data_simulated:
        raise NotImplementedError(
            'loading simulated data is not implemented in this script; '
            'run without --is_data_simulated')

    torch.manual_seed(args.seed)
    device = 'cpu'

    def report_path(suffix):
        # Every artifact lives in report_dir and shares the filename prefix.
        return args.report_dir + '/%s%s' % (args.output_filename_prefix,
                                            suffix)

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        per_tstep_csv_path=args.train_vitals_csv,
        per_seq_csv_path=args.metadata_csv,
        idx_col_names=['subject_id', 'episode_id'],
        x_col_names='__all__',
        y_col_name='inhospital_mortality',
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(
        per_tstep_csv_path=args.test_vitals_csv,
        per_seq_csv_path=args.metadata_csv,
        idx_col_names=['subject_id', 'episode_id'],
        x_col_names='__all__',
        y_col_name='inhospital_mortality',
        y_label_type='per_tstep')

    X_train_with_time_appended, y_train = train_vitals.get_batch_data(
        batch_id=0)
    X_test_with_time_appended, y_test = test_vitals.get_batch_data(
        batch_id=0)
    _, T, _ = X_train_with_time_appended.shape

    if T > 1:
        # removing hours column
        X_train = X_train_with_time_appended[:, :, 1:]
        X_test = X_test_with_time_appended[:, :, 1:]
    else:  # account for collapsed features across time
        X_train = X_train_with_time_appended
        X_test = X_test_with_time_appended

    # set class weights as 1/(number of samples in class) for each class to
    # handle class imbalance
    class_weights = torch.tensor(
        [1 / (y_train == 0).sum(), 1 / (y_train == 1).sum()]).double()

    # define a auc scorer function and pass it as callback of skorch to track
    # training and validation AUROC
    roc_auc_scorer = make_scorer(roc_auc_score,
                                 greater_is_better=True,
                                 needs_threshold=True)

    # instantiate RNN
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=args.batch_size,
        device=device,
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(4),
        callbacks=[
            skorch.callbacks.GradientNormClipping(gradient_clip_value=0.4,
                                                  gradient_clip_norm_type=2),
            skorch.callbacks.EpochScoring(roc_auc_scorer,
                                          lower_is_better=False,
                                          on_train=True,
                                          name='aucroc_score_train'),
            skorch.callbacks.EpochScoring(roc_auc_scorer,
                                          lower_is_better=False,
                                          on_train=False,
                                          name='aucroc_score_valid'),
            ComputeGradientNorm(
                norm_type=2,
                f_history=report_path(
                    '_running_rnn_classifer_gradient_norm_history.csv')),
            skorch.callbacks.EarlyStopping(monitor='aucroc_score_valid',
                                           patience=1000,
                                           threshold=1e-10,
                                           threshold_mode='rel',
                                           lower_is_better=False),
            skorch.callbacks.Checkpoint(
                monitor='train_loss',
                f_history=report_path('_running_rnn_classifer_history.json')),
            skorch.callbacks.PrintLog(floatfmt='.2f')
        ],
        module__rnn_type='LSTM',
        module__n_inputs=X_train.shape[-1],
        module__n_hiddens=args.hidden_units,
        module__n_layers=1,
        optimizer=torch.optim.SGD,
        optimizer__weight_decay=1e-2,
        lr=args.lr)

    # scale input features
    # NOTE(review): train and test are scaled by independent calls; confirm
    # standard_scaler_3d is meant to use each split's own statistics.
    X_train = standard_scaler_3d(X_train)
    X_test = standard_scaler_3d(X_test)
    rnn.fit(X_train, y_train)

    # get the training history
    (epochs, train_loss, validation_loss, aucroc_score_train,
     aucroc_score_valid) = get_loss_plots_from_training_history(rnn.history)

    # plot the validation and training error plots and save
    f = plt.figure()
    plt.plot(epochs, train_loss, 'r-.', label='Train Loss')
    plt.plot(epochs, validation_loss, 'b-.', label='Validation Loss')
    plt.plot(epochs, aucroc_score_train, 'g-.', label='AUCROC score (Train)')
    plt.plot(epochs, aucroc_score_valid, 'm-.', label='AUCROC score (Valid)')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training Performance (learning rate : %s, hidden units : %s)' %
              (str(args.lr), str(args.hidden_units)))
    f.savefig(report_path('_training_performance_plots.png'))
    plt.close()

    # save the training and validation loss in a csv
    train_perf_df = pd.DataFrame(
        data=np.stack([epochs, train_loss, validation_loss]).T,
        columns=['epochs', 'train_loss', 'validation_loss'])
    train_perf_df.to_csv(report_path('_perf_metrics.csv'))

    # save classifier history to later evaluate early stopping for this model
    dump(rnn, report_path('_rnn_classifer.pkl'))

    y_pred_proba = rnn.predict_proba(X_test)
    y_pred = convert_proba_to_binary(y_pred_proba)

    # predict_proba yields (negative, positive) pairs; AUROC is scored on the
    # positive-class column.
    _, y_pred_proba_pos = zip(*y_pred_proba)

    # Brief Summary
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Balanced Accuracy:', balanced_accuracy_score(y_test, y_pred))
    print('Log Loss:', log_loss(y_test, y_pred_proba))
    print('AUC ROC:', roc_auc_score(y_test, y_pred_proba_pos))

    # Rows of the confusion matrix are true labels, columns are predictions.
    conf_matrix = confusion_matrix(y_test, y_pred)
    true_neg = conf_matrix[0][0]
    true_pos = conf_matrix[1][1]
    false_neg = conf_matrix[1][0]
    false_pos = conf_matrix[0][1]
    print('True Positive Rate:', float(true_pos) / (true_pos + false_neg))
    print('True Negative Rate:', float(true_neg) / (true_neg + false_pos))
    print('Positive Predictive Value:',
          float(true_pos) / (true_pos + false_pos))
    # NPV is the share of negative predictions that are correct, so the
    # denominator is every predicted negative (TN + FN).
    print('Negative Predictive Value',
          float(true_neg) / (true_neg + false_neg))

    create_html_report(args.report_dir, args.output_filename_prefix, y_test,
                       y_pred, y_pred_proba, args.lr)
def main():
    """Train a per-timestep GRU classifier, choose a threshold and save metrics.

    The function
      1. parses command-line arguments (comma-separated ``x,y`` CSV pairs for
         the train/valid/test splits and an ``x,y`` pair of data dictionaries),
      2. loads each split with ``TidySequentialDataCSVLoader`` using
         ``y_label_type='per_tstep'``,
      3. fits an ``RNNPerTStepBinaryClassifier`` (GRU) with early stopping on
         ``valid_loss`` and checkpointing on ``auprc_valid``,
      4. scans 100 candidate thresholds on the validation predictions and
         keeps the one with maximal recall among those whose precision is at
         least ``fixed_precision`` (falling back to the 99th percentile of the
         observed precisions when none qualifies),
      5. writes AUROC / AUPRC / precision / recall for every split, plus the
         chosen threshold, to ``<output_dir>/<output_filename_prefix>.csv``.
    """

    def _str2bool(value):
        # argparse's ``type=bool`` calls bool() on the raw string, so any
        # non-empty value (even "False") would be parsed as True.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    def _observed_tsteps(X):
        # Boolean mask that is True wherever at least one feature of the
        # time step is not NaN (all-NaN time steps are excluded from scoring).
        return torch.logical_not(
            torch.all(torch.isnan(torch.FloatTensor(X)), dim=-1))

    parser = argparse.ArgumentParser(
        description='PyTorch RNN with variable-length numeric sequences wrapper'
    )
    parser.add_argument('--outcome_col_name', type=str, required=True)
    parser.add_argument('--train_csv_files', type=str, required=True)
    parser.add_argument('--valid_csv_files', type=str, required=True)
    parser.add_argument('--test_csv_files', type=str, required=True)
    parser.add_argument('--data_dict_files', type=str, required=True)
    parser.add_argument('--batch_size',
                        type=int,
                        default=1024,
                        help='Number of sequences per minibatch')
    parser.add_argument('--epochs',
                        type=int,
                        default=50,
                        help='Number of epochs')
    parser.add_argument('--hidden_units',
                        type=int,
                        default=32,
                        help='Number of hidden units')
    parser.add_argument('--hidden_layers',
                        type=int,
                        default=1,
                        help='Number of hidden layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='Learning rate for the optimizer')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout for optimizer')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0001,
                        help='weight decay for optimizer')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--validation_size',
                        type=float,
                        default=0.15,
                        help='validation split size')
    parser.add_argument(
        '--is_data_simulated',
        type=_str2bool,
        default=False,
        help='boolean to check if data is simulated or from mimic')
    parser.add_argument(
        '--output_dir',
        type=str,
        default=None,
        help=
        'directory where trained model and loss curves over epochs are saved')
    parser.add_argument(
        '--output_filename_prefix',
        type=str,
        default=None,
        help='prefix for the training history jsons and trained classifier')
    args = parser.parse_args()

    # Every output path below is built with os.path.join(args.output_dir, ...),
    # which raises TypeError on None. Fail before any data is loaded instead.
    if args.output_dir is None:
        parser.error('--output_dir must be provided')

    torch.manual_seed(args.seed)
    device = 'cpu'

    # Each *_csv_files argument is a single "x_path,y_path" string.
    x_train_csv_filename, y_train_csv_filename = args.train_csv_files.split(
        ',')
    x_valid_csv_filename, y_valid_csv_filename = args.valid_csv_files.split(
        ',')
    x_test_csv_filename, y_test_csv_filename = args.test_csv_files.split(',')
    # Only the feature data dictionary is used; the outcome one is ignored.
    x_dict, _ = args.data_dict_files.split(',')
    x_data_dict = load_data_dict_json(x_dict)

    # get the id and feature columns
    id_cols = parse_id_cols(x_data_dict)
    feature_cols = parse_feature_cols(x_data_dict)

    def _make_loader(x_csv_filename, y_csv_filename):
        # All three splits are loaded with identical column settings.
        return TidySequentialDataCSVLoader(
            x_csv_path=x_csv_filename,
            y_csv_path=y_csv_filename,
            x_col_names=feature_cols,
            idx_col_names=id_cols,
            y_col_name=args.outcome_col_name,
            y_label_type='per_tstep')

    # extract data
    train_vitals = _make_loader(x_train_csv_filename, y_train_csv_filename)
    valid_vitals = _make_loader(x_valid_csv_filename, y_valid_csv_filename)
    test_vitals = _make_loader(x_test_csv_filename, y_test_csv_filename)

    X_train, y_train = train_vitals.get_batch_data(batch_id=0)
    X_valid, y_valid = valid_vitals.get_batch_data(batch_id=0)
    X_test, y_test = test_vitals.get_batch_data(batch_id=0)
    N, T, F = X_train.shape

    # The validation split is handed to skorch explicitly (predefined_split)
    # rather than being carved out of the training data.
    valid_ds = Dataset(X_valid, y_valid)

    print('number of time points : %s\nnumber of features : %s\n' % (T, F))

    print('Number of training sequences : %s' % N)
    print('Number of test sequences : %s' % X_test.shape[0])
    print('Ratio positive in train : %.2f' %
          ((y_train == 1).sum() / len(y_train)))
    print('Ratio positive in test : %.2f' %
          ((y_test == 1).sum() / len(y_test)))

    # Name used for the history json, checkpoints and the final metrics csv.
    if args.output_filename_prefix is None:
        output_filename_prefix = (
            'hiddens=%s-layers=%s-lr=%s-dropout=%s-weight_decay=%s' %
            (args.hidden_units, args.hidden_layers, args.lr, args.dropout,
             args.weight_decay))
    else:
        output_filename_prefix = args.output_filename_prefix

    print('RNN parameters : ' + output_filename_prefix)

    loss_early_stopping_cp = EarlyStopping(monitor='valid_loss',
                                           patience=15,
                                           threshold=0.002,
                                           threshold_mode='rel',
                                           lower_is_better=True)

    # NOTE(review): max_epochs is fixed at 250 and --epochs is not consulted
    # here; confirm whether that is intentional before wiring the flag in.
    # No criterion override is passed, so the classifier's own default is used.
    rnn = RNNPerTStepBinaryClassifier(
        max_epochs=250,
        batch_size=args.batch_size,
        device=device,
        lr=args.lr,
        callbacks=[
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=True,
                         name='auprc_train'),
            EpochScoring(calc_auprc,
                         lower_is_better=False,
                         on_train=False,
                         name='auprc_valid'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=True,
                         name='auroc_train'),
            EpochScoring(calc_auroc,
                         lower_is_better=False,
                         on_train=False,
                         name='auroc_valid'),
            loss_early_stopping_cp,
            Checkpoint(monitor='auprc_valid',
                       f_history=os.path.join(
                           args.output_dir, output_filename_prefix + '.json')),
            TrainEndCheckpoint(dirname=args.output_dir,
                               fn_prefix=output_filename_prefix),
        ],
        train_split=predefined_split(valid_ds),
        module__rnn_type='GRU',
        module__n_layers=args.hidden_layers,
        module__n_hiddens=args.hidden_units,
        module__n_inputs=X_train.shape[-1],
        module__dropout_proba=args.dropout,
        optimizer=torch.optim.Adam,
        optimizer__weight_decay=args.weight_decay)

    clf = rnn.fit(X_train, y_train)

    # get threshold with max recall at fixed precision
    fixed_precision = 0.1

    # get predict probas for y=1 on validation set
    keep_inds_va = _observed_tsteps(X_valid)
    y_va_pred_proba = clf.predict_proba(
        X_valid)[keep_inds_va][:, 1].detach().numpy()
    y_va_true = y_valid[keep_inds_va]

    # Candidate thresholds span the 1st percentile to the maximum of the
    # distinct predicted probabilities.
    unique_probas = np.unique(y_va_pred_proba)
    thr_grid_G = np.linspace(np.percentile(unique_probas, 1),
                             max(unique_probas), 100)

    precision_scores_G = np.zeros(thr_grid_G.size)
    recall_scores_G = np.zeros(thr_grid_G.size)
    # The validation probabilities do not depend on the threshold, so they are
    # computed once above instead of re-running the model for every candidate.
    for gg, thr in enumerate(thr_grid_G):
        curr_thr_y_preds = y_va_pred_proba >= thr
        precision_scores_G[gg] = precision_score(y_va_true, curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_va_true, curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision

    if keep_inds.sum() > 0:
        print('Choosing threshold with precision >= %.3f' % fixed_precision)
    else:
        # No candidate reaches the requested precision: relax the target to
        # the 99th percentile of the precisions that were actually achieved.
        fixed_precision_old = fixed_precision
        fixed_precision = np.percentile(precision_scores_G, 99)
        keep_inds = precision_scores_G >= fixed_precision
        print(
            'Could not find threshold with precision >= %.3f \n Choosing threshold to maximize recall at precision %.3f'
            % (fixed_precision_old, fixed_precision))

    thr_grid_G = thr_grid_G[keep_inds]
    precision_scores_G = precision_scores_G[keep_inds]
    recall_scores_G = recall_scores_G[keep_inds]
    thr_perf_df = pd.DataFrame(
        np.vstack([
            thr_grid_G[np.newaxis, :], precision_scores_G[np.newaxis, :],
            recall_scores_G[np.newaxis, :]
        ]).T,
        columns=['thr', 'precision_score', 'recall_score'])

    print(thr_perf_df)
    # Among the thresholds that satisfy the precision constraint, take the one
    # with the highest recall (first one on ties, per np.argmax).
    best_ind = np.argmax(recall_scores_G)
    best_thr = thr_grid_G[best_ind]
    print('chosen threshold : %.3f' % best_thr)

    splits = ['train', 'valid', 'test']
    auroc_per_split = np.zeros(len(splits))
    auprc_per_split = np.zeros(len(splits))
    precisions_per_split = np.zeros(len(splits))
    recalls_per_split = np.zeros(len(splits))

    for ii, (X, y) in enumerate([(X_train, y_train), (X_valid, y_valid),
                                 (X_test, y_test)]):
        keep_inds = _observed_tsteps(X)
        y_true = y[keep_inds]
        y_pred_proba_pos = clf.predict_proba(X)[keep_inds][:,
                                                           1].detach().numpy()
        auroc_per_split[ii] = roc_auc_score(y_true, y_pred_proba_pos)
        auprc_per_split[ii] = average_precision_score(y_true,
                                                      y_pred_proba_pos)
        y_pred = y_pred_proba_pos >= best_thr
        precisions_per_split[ii] = precision_score(y_true, y_pred)
        recalls_per_split[ii] = recall_score(y_true, y_pred)

    # save performance: one column per "<metric>_<split>" plus the threshold
    perf_dict = {}
    for metric_name, per_split_values in [('auroc', auroc_per_split),
                                          ('auprc', auprc_per_split),
                                          ('precision', precisions_per_split),
                                          ('recall', recalls_per_split)]:
        for split_name, value in zip(splits, per_split_values):
            perf_dict['%s_%s' % (metric_name, split_name)] = value
    perf_dict['threshold'] = best_thr

    perf_df = pd.DataFrame([perf_dict])
    perf_csv = os.path.join(args.output_dir, output_filename_prefix + '.csv')
    print('Final performance on train, valid and test :\n')
    print(perf_df)

    print('Final performance saved to %s' % perf_csv)
    perf_df.to_csv(perf_csv, index=False)
Example #8
0
        # Get the tstops_df for each patient-stay-slice
        tstops_df = pd.read_csv(os.path.join(args.tstops_dir, 'TSLICE={tslice}', 
                                             'tstops_filtered_{tslice}_hours.csv.gz').format(tslice=tslice))
        x_train_curr_tslice, y_train_curr_tslice = get_tslice_x_y(x_train, y_train, tstops_df, id_cols, time_col)
        x_test_curr_tslice, y_test_curr_tslice = get_tslice_x_y(x_test, y_test, tstops_df, id_cols, time_col)
        
        # limit sequence length
        reduced_T = 200
        
        print('Getting train and test sets for all patient stay slices...')
        # Pass each of the 3 dataframes through dataset_loader and 3 different tensors
        train_vitals = TidySequentialDataCSVLoader(
            x_csv_path=x_train_curr_tslice,
            y_csv_path=y_train_curr_tslice,
            x_col_names=feature_cols,
            idx_col_names=id_cols,
            y_col_name='clinical_deterioration_outcome',
            y_label_type='per_sequence',
            batch_size=45000,
            max_seq_len=reduced_T
        )

        test_vitals = TidySequentialDataCSVLoader(
            x_csv_path=x_test_curr_tslice,
            y_csv_path=y_test_curr_tslice,
            x_col_names=feature_cols,
            idx_col_names=id_cols,
            y_col_name='clinical_deterioration_outcome',
            y_label_type='per_sequence', 
            batch_size=10,
            max_seq_len=reduced_T
        )
def main():
    """Grid-search two RNNBinaryClassifier variants and plot their histories.

    After parsing the command line and loading the train/test vitals with
    ``TidySequentialDataCSVLoader``, the function runs ``GridSearchCV`` over
    learning rate and weight decay for
      * an LSTM with ``module__convert_to_log_reg=True`` (reported as
        "logistic regression"), and
      * a plain LSTM,
    prints train/test AUROC for each and calls ``plot_training_history`` with
    the results.

    Raises:
        NotImplementedError: if ``--is_data_simulated`` is true; this function
            contains no code path that loads simulated data.
    """

    def _str2bool(value):
        # argparse's ``type=bool`` calls bool() on the raw string, so any
        # non-empty value (even "False") would be parsed as True.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % value)

    def _auroc(estimator, X, y):
        # predict_proba rows are unpacked as (negative, positive) pairs; only
        # the positive-class column is scored.
        y_pred_proba = estimator.predict_proba(X)
        _, y_pred_proba_pos = zip(*y_pred_proba)
        return roc_auc_score(y, y_pred_proba_pos)

    parser = argparse.ArgumentParser(description='PyTorch RNN with variable-length numeric sequences wrapper')

    parser.add_argument('--train_vitals_csv', type=str,
                        help='Location of vitals data for training')
    parser.add_argument('--test_vitals_csv', type=str,
                        help='Location of vitals data for testing')
    parser.add_argument('--metadata_csv', type=str,
                        help='Location of metadata for testing and training')
    parser.add_argument('--data_dict', type=str)
    parser.add_argument('--epochs', type=int, default=1000,
                        help='Number of epochs')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--report_dir', type=str, default='html',
                        help='dir in which to save results report')
    parser.add_argument('--simulated_data_dir', type=str, default='simulated_data/2-state/',
                        help='dir in which to simulated data is saved')
    parser.add_argument('--is_data_simulated', type=_str2bool, default=False,
                        help='boolean to check if data is simulated or from mimic')
    args = parser.parse_args()

    # Only the non-simulated branch defines X_train / y_train; without this
    # guard the simulated case would fail later with an opaque NameError.
    if args.is_data_simulated:
        raise NotImplementedError(
            'loading simulated data is not implemented in this script')

    torch.manual_seed(args.seed)
    device = 'cpu'

    # extract data
    train_vitals = TidySequentialDataCSVLoader(
        per_tstep_csv_path=args.train_vitals_csv,
        per_seq_csv_path=args.metadata_csv,
        idx_col_names=['subject_id', 'episode_id'],
        x_col_names='__all__',
        y_col_name='inhospital_mortality',
        y_label_type='per_tstep')

    test_vitals = TidySequentialDataCSVLoader(
        per_tstep_csv_path=args.test_vitals_csv,
        per_seq_csv_path=args.metadata_csv,
        idx_col_names=['subject_id', 'episode_id'],
        x_col_names='__all__',
        y_col_name='inhospital_mortality',
        y_label_type='per_tstep')

    X_train_with_time_appended, y_train = train_vitals.get_batch_data(batch_id=0)
    X_test_with_time_appended, y_test = test_vitals.get_batch_data(batch_id=0)
    _, T, _ = X_train_with_time_appended.shape

    if T > 1:
        # removing hours column
        X_train = X_train_with_time_appended[:, :, 1:]
        X_test = X_test_with_time_appended[:, :, 1:]
    else:
        # account for collapsed features across time
        X_train = X_train_with_time_appended
        X_test = X_test_with_time_appended

    # set class weights as 1/(number of samples in class) for each class to handle class imbalance
    class_weights = torch.tensor([1 / (y_train == 0).sum(),
                                  1 / (y_train == 1).sum()]).double()

    # define a auc scorer function and pass it as callback of skorch to track training and validation AUROC
    roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
                                 needs_threshold=True)

    # scale features
    # NOTE(review): train and test are passed to standard_scaler_3d in
    # separate calls; confirm whether test should reuse the train statistics.
    X_train = standard_scaler_3d(X_train)
    X_test = standard_scaler_3d(X_test)

    # Define parameter grid
    params = {'lr': [0.0001, 0.0005, 0.001, 0.005, 0.01],
              'optimizer__weight_decay': [0.0001, 0.001, 0.01, 0.1, 1, 10]}

    # ---------------------------------------------------------------------#
    # LSTM with gridsearchcv
    # ---------------------------------------------------------------------#
    print('-------------------------------------------------------------------')
    print('Running LSTM converted to logistic regression on collapsed Features')
    model_name = 'logreg_hist'
    save_cv_results = SaveCVResults(dirname=args.report_dir, f_history=model_name+'.json')
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=-1,
        device=device,
        callbacks=[
            skorch.callbacks.EpochScoring(roc_auc_scorer, lower_is_better=False, on_train=True, name='aucroc_score_train'),
            skorch.callbacks.EpochScoring(roc_auc_scorer, lower_is_better=False, on_train=False, name='aucroc_score_valid'),
            skorch.callbacks.EarlyStopping(monitor='aucroc_score_valid', patience=5, threshold=1e-10, threshold_mode='rel',
                                           lower_is_better=False),
            save_cv_results,
        ],
        criterion=torch.nn.NLLLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(0.2),
        module__rnn_type='LSTM',
        module__n_layers=1,
        module__n_hiddens=X_train.shape[-1],
        module__n_inputs=X_train.shape[-1],
        module__convert_to_log_reg=True,
        optimizer=torch.optim.Adam)

    # A single shuffled 80/20 split with a fixed seed is used as the CV scheme.
    gs = GridSearchCV(rnn, params, scoring=roc_auc_scorer, refit=True, cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=14232))
    lr_cv = gs.fit(X_train, y_train)
    auroc_train_final = _auroc(lr_cv.best_estimator_, X_train, y_train)
    print('AUROC with logistic regression (Train) : %.3f'%auroc_train_final)

    auroc_test_final = _auroc(lr_cv.best_estimator_, X_test, y_test)
    print('AUROC with logistic regression (Test) : %.3f'%auroc_test_final)

    # get the loss plots for logistic regression
    plot_training_history(model_name='logreg_hist', model_alias='Logistic Regression', report_dir=args.report_dir, params=params, auroc_train_final=auroc_train_final, auroc_test_final=auroc_test_final)

    # LSTM
    print('-------------------------------------------------------------------')
    print('Running LSTM on Collapsed Features')
    model_name = 'lstm_hist'
    save_cv_results = SaveCVResults(dirname=args.report_dir, f_history=model_name+'.json')
    rnn = RNNBinaryClassifier(
        max_epochs=args.epochs,
        batch_size=-1,
        device=device,
        callbacks=[
            save_cv_results,
            EpochScoring('roc_auc', lower_is_better=False, on_train=True, name='aucroc_score_train'),
            EpochScoring('roc_auc', lower_is_better=False, on_train=False, name='aucroc_score_valid'),
            EarlyStopping(monitor='aucroc_score_valid', patience=5, threshold=0.002, threshold_mode='rel',
                          lower_is_better=False)
        ],
        criterion=torch.nn.CrossEntropyLoss,
        criterion__weight=class_weights,
        train_split=skorch.dataset.CVSplit(0.2),
        module__rnn_type='LSTM',
        module__n_layers=1,
        module__n_hiddens=X_train.shape[-1],
        module__n_inputs=X_train.shape[-1],
        module__convert_to_log_reg=False,
        optimizer=torch.optim.Adam)
    gs = GridSearchCV(rnn, params, scoring='roc_auc', cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=14232),
                      )
    rnn_cv = gs.fit(X_train, y_train)
    # Here the fitted search object itself is used for prediction.
    auroc_train_final = _auroc(rnn_cv, X_train, y_train)
    print('AUROC with LSTM (Train) : %.2f'%auroc_train_final)

    auroc_test_final = _auroc(rnn_cv, X_test, y_test)
    print('AUROC with LSTM (Test) : %.2f'%auroc_test_final)

    # get the loss plots for LSTM
    plot_training_history(model_name='lstm_hist', model_alias='LSTM', report_dir=args.report_dir,
                          params=params, auroc_train_final=auroc_train_final, auroc_test_final=auroc_test_final)