def main():
    """Bootstrap confidence intervals for decompensation test predictions.

    Reads a prediction CSV and the test listfile, joins them on
    ``(stay, period_length)``, computes AUROC / AUPRC / min(+P, Se) on the
    full test set, and then recomputes them on ``--n_iters`` resamples of the
    joined data. For every metric the point estimate plus the mean, median,
    standard deviation and 2.5% / 97.5% percentiles over the resamples are
    written to ``--save_file`` as JSON and printed.

    Raises:
        AssertionError: if a test row has no matching prediction, or if the
            ``y_true`` columns of the two files are not equal.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile',
                        type=str,
                        default='../data/decompensation/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='decomp_results.json')
    args = parser.parse_args()

    # period_length is read as float32 in both frames; presumably so the
    # float join keys compare equal -- TODO confirm.
    pred_df = pd.read_csv(args.prediction,
                          index_col=False,
                          dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile,
                          index_col=False,
                          dtype={'period_length': np.float32})

    # Left join keeps every test row; overlapping column names (y_true) get
    # the '_l' (listfile) and '_r' (prediction file) suffixes.
    df = test_df.merge(pred_df,
                       left_on=['stay', 'period_length'],
                       right_on=['stay', 'period_length'],
                       how='left',
                       suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    # (human-readable name, key in the dict returned by print_metrics_binary)
    metrics = [('AUC of ROC', 'auroc'), ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    # Column 0 holds the predictions, column 1 the true labels, so that a
    # single resample call keeps each prediction paired with its label.
    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        # Plain floats keep the result JSON-serializable regardless of the
        # numpy scalar type returned by the metric code.
        results[m]['value'] = float(ret[k])
        results[m]['runs'] = []

    # NOTE(review): sk_utils.resample is presumably sklearn.utils.resample
    # (sampling with replacement by default) -- confirm against the imports.
    for _ in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = float(np.mean(runs))
        results[m]['median'] = float(np.median(runs))
        results[m]['std'] = float(np.std(runs))
        results[m]['2.5% percentile'] = float(np.percentile(runs, 2.5))
        results[m]['97.5% percentile'] = float(np.percentile(runs, 97.5))
        # The raw per-iteration values are not written to the output file.
        del results[m]['runs']

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results)
def load_fit_save(mode, str_path, ustr_path, test_str_path, test_ustr_path, outdir, out_filename_prefix = None):
    """Fit a logistic regression on two prediction columns and save test output.

    The training frame is loaded from ``str_path`` / ``ustr_path`` and the
    test frame from ``test_str_path`` / ``test_ustr_path`` via ``load_data``.
    A ``LogisticRegression(**MODEL_ARGS)`` is fitted on the training frame's
    ``str_prediction`` and ``unstr_prediction`` columns against ``label``,
    then applied to the test frame. Test metrics are printed and one row per
    test example is written to a CSV file inside ``outdir``.

    Args:
        mode: passed through to ``load_data``; for ``"P"`` and ``"M"`` the
            output rows carry no ``time`` column, for any other value they do.
        str_path, ustr_path: inputs for the training frame.
        test_str_path, test_ustr_path: inputs for the test frame; both base
            names must end in ``.csv``.
        outdir: directory for the output CSV; created if missing.
        out_filename_prefix: optional prefix for the output file name. When
            ``None`` the name is built from the two test file base names.

    Returns:
        Tuple ``(test_labels, test_probabilities)`` as numpy arrays, where the
        probabilities are column 1 of ``predict_proba``.

    Raises:
        AssertionError: if a test path does not end in ``.csv``.
    """
    train = load_data(mode, str_path, ustr_path)
    # Bug fix: this used to be ``test = train = load_data(...)``, which
    # replaced the training frame with the test frame, so the model was
    # fitted on the very data it was evaluated on.
    test = load_data(mode, test_str_path, test_ustr_path)

    Y = train.label.values
    X = np.stack((train.str_prediction.values, train.unstr_prediction.values), axis=1)

    model = LogisticRegression(**MODEL_ARGS).fit(X, Y)
    print(model)
    print(model.classes_)

    # Build the output file name from the test inputs (or the given prefix)
    # plus a textual dump of the model arguments.
    outname1 = os.path.basename(test_str_path)
    outname2 = os.path.basename(test_ustr_path)
    assert (outname1.endswith(".csv"))
    assert (outname2.endswith(".csv"))

    suffix = repr(MODEL_ARGS).replace(': ', '=') + "_id_ep_fmt.csv"
    if out_filename_prefix is None:
        outname = outname1[:-4] + '+' + outname2[:-4] + suffix
    else:
        outname = out_filename_prefix + suffix

    os.makedirs(outdir, exist_ok=True)
    outpath = os.path.join(outdir, outname)

    # Predict before opening the output file so a failure here does not
    # leave a header-only CSV behind.
    test_Y = test.label.values
    test_X = np.stack((test.str_prediction.values, test.unstr_prediction.values), axis=1)
    # Column 1 of predict_proba corresponds to model.classes_[1].
    preds = model.predict_proba(test_X)[:, 1]
    print("Model Arguments:")
    print(MODEL_ARGS)
    metrics.print_metrics_binary(test_Y, preds)

    with open(outpath, 'w') as fw:
        if mode in ["P", "M"]:
            fw.write("patient_id, episode, prediction, label\n")
            for patient_id, ep, pred, label in zip(test.patient_id.values, test.episode.values, preds, test_Y):
                fw.write("{},{},{},{}\n".format(patient_id, ep, pred, label))
        else:
            fw.write("patient_id, episode,time, prediction, label\n")
            for patient_id, ep, time, pred, label in zip(test.patient_id.values, test.episode.values,
                                                         test.time.values, preds, test_Y):
                fw.write("{},{},{},{},{}\n".format(patient_id, ep, time, pred, label))

    return np.array(test_Y), np.array(preds)
Esempio n. 3
0
def test(args, model):
    """Score `model` on the stored test arrays and save its predictions.

    The test file is expected to unpack into exactly two arrays (data and
    labels). Metrics are printed through ``metrics.print_metrics_binary`` and
    the per-example predictions go to ``test_predictions.csv`` in
    ``args.output_dir``.
    """
    test_filename = "{}/{}_{}".format(
        args.input_dir, ext_utils.TEST, ext_utils.DATA_FILENAME)
    arrays = list(np.load(test_filename).values())
    data, labels = arrays

    # One generic name per index along axis 2 of the data array.
    n_features = data.shape[2]
    names = ["Feature {}".format(idx) for idx in range(n_features)]

    raw_output = model.predict(data, batch_size=BATCH_SIZE, verbose=1)
    # Only column 0 of the model output is kept as the prediction.
    predictions = np.array(raw_output)[:, 0]
    metrics.print_metrics_binary(labels, predictions)

    utils.save_results(
        names, predictions, labels,
        os.path.join(args.output_dir, "test_predictions.csv"))
Esempio n. 4
0
def main():
    """Bootstrap confidence intervals for in-hospital-mortality predictions.

    Reads a prediction CSV and the test listfile, joins them on ``stay``,
    computes AUROC / AUPRC / min(+P, Se) on the full test set, and then
    recomputes them on ``--n_iters`` resamples of the joined data. For every
    metric the point estimate plus the mean, median, standard deviation and
    2.5% / 97.5% percentiles over the resamples are written to
    ``--save_file`` as JSON and printed.

    Raises:
        AssertionError: if a test row has no matching prediction, or if the
            ``y_true`` columns of the two files are not equal.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str, default='../data/in-hospital-mortality/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='ihm_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False)
    test_df = pd.read_csv(args.test_listfile, index_col=False)

    # Left join keeps every test row; overlapping column names (y_true) get
    # the '_l' (listfile) and '_r' (prediction file) suffixes.
    df = test_df.merge(pred_df, left_on='stay', right_on='stay', how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    # (human-readable name, key in the dict returned by print_metrics_binary)
    metrics = [('AUC of ROC', 'auroc'),
               ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    # Column 0 holds the predictions, column 1 the true labels, so that a
    # single resample call keeps each prediction paired with its label.
    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        # Plain floats keep the result JSON-serializable regardless of the
        # numpy scalar type returned by the metric code.
        results[m]['value'] = float(ret[k])
        results[m]['runs'] = []

    # NOTE(review): sk_utils.resample is presumably sklearn.utils.resample
    # (sampling with replacement by default) -- confirm against the imports.
    for _ in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = float(np.mean(runs))
        results[m]['median'] = float(np.median(runs))
        results[m]['std'] = float(np.std(runs))
        results[m]['2.5% percentile'] = float(np.percentile(runs, 2.5))
        results[m]['97.5% percentile'] = float(np.percentile(runs, 97.5))
        # The raw per-iteration values are not written to the output file.
        del results[m]['runs']

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results)
Esempio n. 5
0
def main():
    """Fit a logistic-regression baseline and dump its metrics.

    Command-line flags select the regularization strength and type, the
    feature-extraction period and feature set, the data directory and the
    output directory. Train / validation / test metrics are written as JSON
    files under ``<output_dir>/results`` and the test predictions are passed
    to ``save_results`` for ``<output_dir>/predictions``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--C', type=float, default=1.0,
        help='inverse of L1 / L2 regularization')
    # --l1 and --l2 toggle the same destination; L2 is the default.
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument(
        '--period', type=str, default='all',
        help='specifies which period extract features from',
        choices=['first4days', 'first8days', 'last12hours',
                 'first25percent', 'first50percent', 'all'])
    parser.add_argument(
        '--features', type=str, default='all',
        help='specifies what features to extract',
        choices=['all', 'len', 'all_but_len'])
    parser.add_argument(
        '--data', type=str,
        help='Path to the data of in-hospital mortality task',
        default=os.path.join(os.path.dirname(__file__),
                             '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir', type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    print('Reading data and extracting features ...')
    loaded = load_data_logistic_regression(args)
    (train_X, train_y, train_names,
     val_X, val_y, val_names,
     test_X, test_y, test_names) = loaded

    if args.l2:
        penalty = 'l2'
    else:
        penalty = 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    classifier = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    classifier.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    # Train and validation metrics are computed from the full predict_proba
    # output; the file is opened first, exactly as for the test split below.
    for name_template, features, labels in (('train_{}.json', train_X, train_y),
                                            ('val_{}.json', val_X, val_y)):
        json_path = os.path.join(result_dir, name_template.format(file_name))
        with open(json_path, 'w') as res_file:
            scores = print_metrics_binary(labels, classifier.predict_proba(features))
            json.dump({k: float(v) for k, v in scores.items()}, res_file)

    # For the test split only column 1 of predict_proba is kept, because the
    # same vector is also written out as the per-example prediction.
    prediction = classifier.predict_proba(test_X)[:, 1]

    test_json_path = os.path.join(result_dir, 'test_{}.json'.format(file_name))
    with open(test_json_path, 'w') as res_file:
        scores = print_metrics_binary(test_y, prediction)
        json.dump({k: float(v) for k, v in scores.items()}, res_file)

    predictions_path = os.path.join(args.output_dir, 'predictions', file_name + '.csv')
    save_results(test_names, prediction, test_y, predictions_path)
Esempio n. 6
0
 def calc_metrics(self, data, history, dataset, logs):
     """Predict over `data` in batches and record binary metrics.

     The flattened predictions are turned into a two-column ``[1 - p, p]``
     array and handed to ``metrics.print_metrics_binary``. Every returned
     metric is stored in `logs` under ``<dataset>_<metric>`` and the whole
     result dict is appended to `history`.
     """
     batch_size = self.batch_size
     labels = []
     scores = []
     for start in range(0, len(data[0]), batch_size):
         stop = start + batch_size
         if self.verbose == 1:
             print("\tdone {}/{}".format(start, len(data[0])), end='\r')
         x = data[0][start:stop]
         if self.target_repl:
             # With target replication data[1] is a pair; only its first
             # element is used as the label, the second slice is unused.
             y = data[1][0][start:stop]
             _y_repl = data[1][1][start:stop]
         else:
             y = data[1][start:stop]
         outputs = self.model.predict(x, batch_size=batch_size)
         # With target replication only the first model output is scored.
         batch_scores = outputs[0] if self.target_repl else outputs
         scores.extend(np.array(batch_scores).flatten())
         labels.extend(np.array(y).flatten())
     print('\n')
     scores = np.array(scores)
     two_column = np.stack([1 - scores, scores], axis=1)
     ret = metrics.print_metrics_binary(labels, two_column)
     for key, value in ret.items():
         logs[dataset + '_' + key] = value
     history.append(ret)
Esempio n. 7
0
 def calc_metrics(self, data_gen, history, dataset, logs):
     """Predict over ``data_gen.steps`` batches and record binary metrics.

     With deep supervision only the positions whose mask value (taken from
     ``x[1]``) equals 1 contribute a label / prediction pair; otherwise every
     flattened element does. The predictions are turned into a two-column
     ``[1 - p, p]`` array for ``metrics.print_metrics_binary``; each returned
     metric is stored in `logs` under ``<dataset>_<metric>`` and the result
     dict is appended to `history`.
     """
     labels = []
     scores = []
     for step in range(data_gen.steps):
         if self.verbose == 1:
             print("\tdone {}/{}".format(step, data_gen.steps), end='\r')
         if self.use_time:
             # Batches carry an extra time input next to x.
             ([x, t], y) = next(data_gen)
             model_input = [x, t]
         else:
             (x, y) = next(data_gen)
             model_input = x
         pred = self.model.predict(model_input, batch_size=self.batch_size)
         if not self.deep_supervision:
             labels += list(y.flatten())
             scores += list(pred.flatten())
             continue
         triples = zip(x[1].flatten(), y.flatten(), pred.flatten())
         for mask, target, prob in triples:
             if np.equal(mask, 1):
                 labels.append(target)
                 scores.append(prob)
     print('\n')
     scores = np.array(scores)
     two_column = np.stack([1 - scores, scores], axis=1)
     ret = metrics.print_metrics_binary(labels, two_column)
     for key, value in ret.items():
         logs[dataset + '_' + key] = value
     history.append(ret)
Esempio n. 8
0
 def calc_metrics(self, data_gen, history, dataset, logs):
     """Predict over ``data_gen.steps`` batches and record binary metrics.

     Each batch from the generator is a triple whose third element is
     ignored. With deep supervision only positions whose mask value (taken
     from ``x[1]``) equals 1 are kept. Otherwise labels with more than one
     dimension are reduced with ``argmax`` per row, and predictions with more
     than one column are reduced to column 1. The collected values go to
     ``self.display_loss_0_1`` and, as a two-column ``[1 - p, p]`` array, to
     ``metrics.print_metrics_binary``; each returned metric is stored in
     `logs` under ``<dataset>_<metric>`` and the result dict is appended to
     `history`.
     """
     labels = []
     scores = []
     for step in range(data_gen.steps):
         if self.verbose == 1:
             print("\tdone {}/{}".format(step, data_gen.steps), end='\r')
         (x, y, _) = next(data_gen)
         pred = self.model.predict(x, batch_size=self.batch_size)
         if self.deep_supervision:
             triples = zip(x[1].flatten(), y.flatten(), pred.flatten())
             for mask, target, prob in triples:
                 if np.equal(mask, 1):
                     labels.append(target)
                     scores.append(prob)
             continue

         y = np.array(y)
         if y.ndim > 1:
             # Multi-dimensional labels: take the index of the row maximum.
             batch_labels = [np.argmax(row) for row in y]
         else:
             batch_labels = list(y.flatten())
         labels += batch_labels

         pred = np.array(pred)
         if pred.shape[1] > 1:
             # Multi-column output: keep column 1 of every row.
             batch_scores = [row[1] for row in pred]
         else:
             batch_scores = list(pred.flatten())
         scores += batch_scores
     print('\n')
     self.display_loss_0_1(labels, scores)
     scores = np.array(scores)
     two_column = np.stack([1 - scores, scores], axis=1)
     ret = metrics.print_metrics_binary(labels, two_column)
     for key, value in ret.items():
         logs[dataset + '_' + key] = value
     history.append(ret)
Esempio n. 9
0
def do_epoch(mode, epoch):
    """Run one epoch of the global `network` and report loss and metrics.

    Args:
        mode: ``'train'`` or ``'test'``; passed to ``network.step`` and
            ``network.get_batches_per_epoch``.
        epoch: epoch number, used only in the progress log line.

    Returns:
        The mean of the per-batch losses over the epoch.

    Raises:
        Exception: if a batch reports a NaN loss.
    """
    # mode is 'train' or 'test'
    y_true = []
    predictions = []

    avg_loss = 0.0  # running sum since the last log line
    sum_loss = 0.0  # running sum over the whole epoch
    prev_time = time.time()

    batches_per_epoch = network.get_batches_per_epoch(mode)

    for i in range(batches_per_epoch):
        step_data = network.step(mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        current_loss_ce = step_data["loss_ce"]
        current_loss_reg = step_data["loss_reg"]
        log = step_data["log"]

        avg_loss += current_loss
        sum_loss += current_loss

        y_true.extend(answers)
        predictions.extend(prediction)

        if (i + 1) % args.log_every == 0:
            cur_time = time.time()
            print("  %sing: %d.%d / %d \t loss: %.3f = %.2f + %.2f \t avg_loss: %.3f \t"
                  "%s \t time: %.2fs" % (mode, epoch, i * args.batch_size,
                                         batches_per_epoch * args.batch_size,
                                         current_loss, current_loss_ce, current_loss_reg,
                                         avg_loss / args.log_every,
                                         log, cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= batches_per_epoch
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("\n  %s loss = %.5f" % (mode, sum_loss))
    metrics.print_metrics_binary(y_true, predictions)
    return sum_loss
Esempio n. 10
0
def validation(global_weight=[]):
    """Evaluate a freshly built model loaded with `global_weight`.

    Does nothing when `global_weight` is ``None``. Otherwise the session is
    cleared, a new model is built and given the weights, predictions are made
    on the module-level ``data``, scored against ``labels`` and the result is
    passed to ``save_result`` together with ``current_round``.
    """
    # The list default is never mutated here, so sharing it across calls is
    # harmless.
    if global_weight is None:
        return

    K.clear_session()
    net = build_model()
    net.set_weights(global_weight)

    print("==> validation start")
    raw_output = net.predict(data, verbose=1)
    # Only column 0 of the model output is kept as the prediction.
    scores = np.array(raw_output)[:, 0]
    result = metrics.print_metrics_binary(labels, scores)

    save_result(net, current_round, result)
    print("==> validation end")
Esempio n. 11
0
if not os.path.exists("results"):
    os.mkdir("results")

for (penalty, C) in zip(penalties, Cs):
    file_name = "%s.%s.%s.C%f" % (args.period, args.features, penalty, C)

    logreg = LogisticRegression(penalty=penalty, C=C)
    logreg.fit(train_X, train_y)

    with open(os.path.join("results", file_name + ".txt"), "w") as resfile:

        resfile.write("acc, prec0, prec1, rec0, rec1, auroc, auprc, minpse\n")

        print "Scores on train set"
        ret = metrics.print_metrics_binary(train_y,
                                           logreg.predict_proba(train_X))
        resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" %
                      (ret['acc'], ret['prec0'], ret['prec1'], ret['rec0'],
                       ret['rec1'], ret['auroc'], ret['auprc'], ret['minpse']))

        print "Scores on validation set"
        ret = metrics.print_metrics_binary(val_y, logreg.predict_proba(val_X))
        resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" %
                      (ret['acc'], ret['prec0'], ret['prec1'], ret['rec0'],
                       ret['rec1'], ret['auroc'], ret['auprc'], ret['minpse']))

        print "Scores on test set"
        ret = metrics.print_metrics_binary(test_y,
                                           logreg.predict_proba(test_X))
        resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" %
                      (ret['acc'], ret['prec0'], ret['prec1'], ret['rec0'],
Esempio n. 12
0
        for x in mortalities:
            y_true.append(x)

        for x in prediction:
            predictions.append(x)

        activations += zip(prediction[:, 1], mortalities)

        if ((i + 1) % args.log_every == 0):
            cur_time = time.time()
            print ("  testing: %d / %d \t loss: %.3f \t avg_loss: %.3f \t"\
                   " time: %.2fs" % ((i+1) * args.batch_size,
                        n_batches * args.batch_size, current_loss,
                        avg_loss / args.log_every, cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= n_batches
    print "\n  test loss = %.5f" % sum_loss
    metrics.print_metrics_binary(y_true, predictions)

    with open("activations.txt", "w") as fout:
        for (x, y) in activations:
            fout.write("%.6f, %d\n" % (x, y))
else:
    raise Exception("unknown mode")
Esempio n. 13
0
def main():
    """Train and evaluate an in-hospital-mortality baseline classifier.

    Features are extracted from the train / validation / test readers,
    mean-imputed and standardized using statistics fitted on the training
    split, and cached on disk. Depending on ``--method`` a grid-searched
    gradient boosting model, a LightGBM model or a logistic regression is
    fitted. Metrics for the three splits are written as JSON files under
    ``results/`` and the test predictions are passed to ``save_results``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    # --l1 and --l2 toggle the same destination; L2 is the default.
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--method',
                        type=str,
                        default='logistic',
                        choices=['gridsearch', 'lgbm', 'logistic'])
    args = parser.parse_args()
    print(args)
    import os
    import pickle

    # Bug fix: the cached features depend on --period and --features, so the
    # cache file is keyed on both. With a single fixed file name, a run with
    # different flags silently reused features extracted for an earlier run.
    data_cache = '../../../data/in-hospital-mortality/lr_cache.{}.{}.pickle'.format(
        args.period, args.features)
    if os.path.exists(data_cache):
        print('Loading data cache ...')
        # NOTE: unpickling runs code embedded in the file; this is only
        # acceptable because the cache is produced by this script itself.
        with open(data_cache, 'rb') as f:
            (train_X, train_y,
             train_names), (val_X, val_y,
                            val_names), (test_X, test_y,
                                         test_names) = pickle.load(f)
    else:
        train_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)

        # Validation examples are read from the train directory; only the
        # listfile differs.
        val_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)

        test_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/test/',
            listfile='../../../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        print('Reading data and extracting features ...')
        (train_X, train_y,
         train_names) = read_and_extract_features(train_reader, args.period,
                                                  args.features)
        (val_X, val_y,
         val_names) = read_and_extract_features(val_reader, args.period,
                                                args.features)
        (test_X, test_y,
         test_names) = read_and_extract_features(test_reader, args.period,
                                                 args.features)
        print('  train data shape = {}'.format(train_X.shape))
        print('  validation data shape = {}'.format(val_X.shape))
        print('  test data shape = {}'.format(test_X.shape))

        print('Imputing missing values ...')
        # NOTE(review): Imputer with an ``axis`` argument looks like the old
        # sklearn.preprocessing.Imputer API -- confirm the installed version.
        imputer = Imputer(missing_values=np.nan,
                          strategy='mean',
                          axis=0,
                          verbose=0,
                          copy=True)
        # Imputation statistics come from the training split only.
        imputer.fit(train_X)
        train_X = np.array(imputer.transform(train_X), dtype=np.float32)
        val_X = np.array(imputer.transform(val_X), dtype=np.float32)
        test_X = np.array(imputer.transform(test_X), dtype=np.float32)

        print('Normalizing the data to have zero mean and unit variance ...')
        # The scaler is likewise fitted on the training split only.
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        val_X = scaler.transform(val_X)
        test_X = scaler.transform(test_X)
        with open(data_cache, 'wb') as f:
            pickle.dump([(train_X, train_y, train_names),
                         (val_X, val_y, val_names),
                         (test_X, test_y, test_names)], f,
                        pickle.HIGHEST_PROTOCOL)

    penalty = ('l2' if args.l2 else 'l1')
    # The output file name always carries the penalty and C, even when
    # --method selects a model that does not use them.
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    print("use {} to fit".format(args.method))
    if args.method == "gridsearch":
        # Search n_estimators first, then build a fresh model with the best
        # value; it is fitted below like the other methods.
        param_test1 = {'n_estimators': range(10, 200, 20)}
        gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(),
                                param_grid=param_test1)

        gsearch1.fit(train_X, train_y)
        print("gridsearch best result: ", gsearch1.best_params_,
              gsearch1.best_score_)
        logreg = GradientBoostingClassifier(
            n_estimators=gsearch1.best_params_['n_estimators'])
    elif args.method == "lgbm":
        logreg = lgb.LGBMClassifier(objective='binary',
                                    num_leaves=31,
                                    learning_rate=0.05,
                                    n_estimators=20)
    elif args.method == "logistic":
        logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)

    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    # Column 1 of predict_proba is kept as the per-example test prediction.
    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
Esempio n. 14
0
def main():
    """Fit logistic-regression baselines for the decompensation task.

    Features are extracted from the train / validation / test readers (train
    and validation capped at 100000 examples), mean-imputed and standardized
    using statistics fitted on the training split. For every configured
    ``(penalty, C)`` pair a model is fitted, its metrics on the three splits
    are written as JSON under ``results/`` and the test predictions are
    passed to ``save_results``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--period', type=str, default='all',
        help='specifies which period extract features from',
        choices=['first4days', 'first8days', 'last12hours',
                 'first25percent', 'first50percent', 'all'])
    parser.add_argument(
        '--features', type=str, default='all',
        help='specifies what features to extract',
        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/train_listfile.csv')
    # Validation examples are read from the train directory; only the
    # listfile differs.
    val_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/val_listfile.csv')
    test_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/test/',
        listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    train_split = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    (train_X, train_y, train_names, train_ts) = train_split

    val_split = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    (val_X, val_y, val_names, val_ts) = val_split

    # The test split is always read in full.
    test_split = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)
    (test_X, test_y, test_names, test_ts) = test_split

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    # Imputation statistics come from the training split only.
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    # The scaler is likewise fitted on the training split only.
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('results')

    for penalty, C in zip(penalties, Cs):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        classifier = LogisticRegression(penalty=penalty, C=C, random_state=42)
        classifier.fit(train_X, train_y)

        # Train and validation metrics use the full predict_proba output.
        for name_template, features, labels in (('train_{}.json', train_X, train_y),
                                                ('val_{}.json', val_X, val_y)):
            json_path = os.path.join('results', name_template.format(file_name))
            with open(json_path, 'w') as res_file:
                scores = print_metrics_binary(labels, classifier.predict_proba(features))
                json.dump({k: float(v) for k, v in scores.items()}, res_file)

        # Column 1 of predict_proba is kept as the per-example prediction.
        prediction = classifier.predict_proba(test_X)[:, 1]

        test_json_path = os.path.join('results', 'test_{}.json'.format(file_name))
        with open(test_json_path, 'w') as res_file:
            scores = print_metrics_binary(test_y, prediction)
            json.dump({k: float(v) for k, v in scores.items()}, res_file)

        predictions_path = os.path.join('predictions', file_name + '.csv')
        save_results(test_names, test_ts, prediction, test_y, predictions_path)
Esempio n. 15
0
def process_one_chunk(mode, chunk_index):
    """Read one chunk of data, run the global `network` over it and report.

    Args:
        mode: ``"train"`` or ``"test"``. Training chunks come from
            ``train_reader``; test chunks come from ``val_reader``.
        chunk_index: chunk number, used only in the progress log line.

    Returns:
        The mean of the per-batch losses over the chunk.

    Raises:
        AssertionError: if `mode` is neither ``"train"`` nor ``"test"``.
        Exception: if a batch reports a NaN loss.
    """
    assert (mode == "train" or mode == "test")

    if mode == "train":
        reader = train_reader
    else:
        reader = val_reader

    (data, ts, mortalities, header) = utils.read_chunk(reader, chunk_size)
    data = utils.preprocess_chunk(data, ts, discretizer, normalizer)

    # Only the dataset slot matching the mode is populated.
    if mode == "train":
        network.set_datasets((data, mortalities), None)
    else:
        network.set_datasets(None, (data, mortalities))

    # Called in both modes, as in the original code.
    network.shuffle_train_set()

    y_true = []
    predictions = []

    avg_loss = 0.0  # running sum since the last log line
    sum_loss = 0.0  # running sum over the whole chunk
    prev_time = time.time()

    n_batches = network.get_batches_per_epoch(mode)

    for i in range(n_batches):
        step_data = network.step(mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        log = step_data["log"]

        avg_loss += current_loss
        sum_loss += current_loss

        y_true.extend(answers)
        predictions.extend(prediction)

        if (i + 1) % args.log_every == 0:
            cur_time = time.time()
            print("  %sing: %d.%d / %d \t loss: %.3f \t avg_loss: %.3f \t"
                  "%s \t time: %.2fs" % (mode, chunk_index, i * args.batch_size,
                                         n_batches * args.batch_size, current_loss,
                                         avg_loss / args.log_every, log, cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= n_batches
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("\n  %s loss = %.5f" % (mode, sum_loss))
    metrics.print_metrics_binary(y_true, predictions)
    return sum_loss
Esempio n. 16
0
    model.fit(Xtrain, Ytrain, batch_size=5, epochs=100, callbacks=callbacks_list,
            validation_data=(Xval, Yval))

elif args.mode == 'test':
    ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part,
                        return_names=True)
    test_raw = ret['data']
    test_names = ret['names']
    
    Xtest = np.array(test_raw[0]).reshape((-1, 48*76))
    Ytest = np.array(test_raw[1]).reshape((-1,1))

    model = keras.models.load_model(os.path.join(args.output_dir, 'mimic3models/in_hospital_mortality/keras_states/transformer_best.state'))


    print(Xtest[3051, 1266])   
    print(np.mean(Xtest,0)[1266])
    Xtest = np.delete(Xtest, 3051, 0) # large feature value for sequence 3051, event 1266, likely outlier
    Ytest = np.delete(Ytest, 3051, 0) # same as above
    print(np.mean(Xtest,0)[1266])

    predictions = model.predict(Xtest, batch_size=1, verbose=1)
    predictions = np.array(predictions)[:, 0]
    metrics.print_metrics_binary(Ytest, predictions)

    path = os.path.join(args.output_dir, "test_predictions.csv")
    utils.save_results(test_names, predictions, Ytest, path)

else:
    raise ValueError("Wrong value for args.mode")
Esempio n. 17
0
def main():
    """Fit and evaluate logistic-regression baselines for decompensation.

    Reads ``--period`` and ``--features`` from the command line, extracts
    features from the train, validation and test readers, mean-imputes and
    standardises them with statistics fitted on the training split, and then
    fits one LogisticRegression per (penalty, C) pair. For each pair the
    metrics of every split are written as JSON under ``results/`` and the test
    predictions are handed to ``save_results`` with a path under
    ``predictions/``.
    """
    period_choices = [
        'first4days', 'first8days', 'last12hours',
        'first25percent', 'first50percent', 'all'
    ]
    feature_choices = ['all', 'len', 'all_but_len']

    cli = argparse.ArgumentParser()
    cli.add_argument('--period', type=str, default='all',
                     help='specifies which period extract features from',
                     choices=period_choices)
    cli.add_argument('--features', type=str, default='all',
                     help='specifies what features to extract',
                     choices=feature_choices)
    args = cli.parse_args()
    print(args)

    # Only a single configuration is evaluated. A wider grid used previously:
    #   penalties: 'l2' x6 followed by 'l1' x5
    #   Cs: 1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001
    penalty_grid = ['l2']
    strength_grid = [0.001]

    # The validation listfile points into the train directory.
    train_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/train_listfile.csv')
    val_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/val_listfile.csv')
    test_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/test/',
        listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    # Train and validation are capped at 100000 examples; test is read in full.
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    train_X, train_y, train_names, train_ts = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    val_X, val_y, val_names, val_ts = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    n_test = test_reader.get_number_of_examples()
    test_X, test_y, test_names, test_ts = read_and_extract_features(
        test_reader, n_test, args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X, val_X, test_X = [
        np.array(imputer.transform(matrix), dtype=np.float32)
        for matrix in (train_X, val_X, test_X)
    ]

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = [
        scaler.transform(matrix) for matrix in (train_X, val_X, test_X)
    ]

    common_utils.create_directory('results')

    for reg_penalty, reg_strength in zip(penalty_grid, strength_grid):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features,
                                          reg_penalty, reg_strength)

        logreg = LogisticRegression(penalty=reg_penalty, C=reg_strength,
                                    random_state=42)
        logreg.fit(train_X, train_y)

        train_json = os.path.join('results', 'train_{}.json'.format(file_name))
        with open(train_json, "w") as out:
            scores = print_metrics_binary(train_y,
                                          logreg.predict_proba(train_X))
            json.dump({key: float(val) for key, val in scores.items()}, out)

        val_json = os.path.join('results', 'val_{}.json'.format(file_name))
        with open(val_json, 'w') as out:
            scores = print_metrics_binary(val_y, logreg.predict_proba(val_X))
            json.dump({key: float(val) for key, val in scores.items()}, out)

        # For the test split only column 1 of predict_proba is scored and saved.
        prediction = logreg.predict_proba(test_X)[:, 1]

        test_json = os.path.join('results', 'test_{}.json'.format(file_name))
        with open(test_json, 'w') as out:
            scores = print_metrics_binary(test_y, prediction)
            json.dump({key: float(val) for key, val in scores.items()}, out)

        predictions_csv = os.path.join('predictions', file_name + '.csv')
        save_results(test_names, test_ts, prediction, test_y, predictions_csv)
Esempio n. 18
0
File: main.py Progetto: sz891016/EHR
    # ensure that the code uses test_reader
    #del train_reader
    #del val_reader
    # Release the train/validation data; only test data is used from here on.
    del train_raw
    del val_raw

    # Reader over the test split of the dataset rooted at args.data.
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)
    # return_names=True: the result is indexed below as ret["data"][0],
    # ret["data"][1] and ret["names"].
    ret = utils.load_data(test_reader,
                          discretizer,
                          normalizer,
                          args.small_part,
                          return_names=True)

    data = ret["data"][0]
    labels = ret["data"][1]
    names = ret["names"]

    predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
    # Keep only column 0 of the model output as the per-sample prediction.
    predictions = np.array(predictions)[:, 0]
    metrics.print_metrics_binary(labels, predictions)

    # Output file: <output_dir>/test_predictions/<basename of load_state>.csv
    path = os.path.join(args.output_dir, "test_predictions",
                        os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, predictions, labels, path)
    # Also writes a rendering of the model to 'modeltest.png' in the current
    # working directory (the path is not joined with args.output_dir).
    plot_model(model, to_file='modeltest.png')
else:
    # Reached when args.mode matched none of the branches above this fragment.
    raise ValueError("Wrong value for args.mode")
Esempio n. 19
0
File: main.py Progetto: sz891016/EHR
        else:  # classification
            # Each prediction is a row of 10 values (los_p is reshaped to
            # (-1, 10)); the other arrays are flattened so the four sequences
            # are walked in lockstep.
            for (name, m, t, p) in zip(names_extended.flatten(),
                                       los_M.flatten(), los_t.flatten(),
                                       los_p.reshape((-1, 10))):
                # Keep only entries whose mask value equals 1.
                if np.equal(m, 1):
                    los_names.append(name)
                    los_y_true.append(t)
                    los_pred.append(p)

    print('\n')

    # ihm: reported only when args.ihm_C is positive
    if args.ihm_C > 0:
        print("\n ================= 48h mortality ================")
        ihm_pred = np.array(ihm_pred)
        ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)

    # los: reported only when args.los_C is positive
    if args.los_C > 0:
        print("\n ================ length of stay ================")
        # For 'log' and 'custom' each 10-value prediction is first converted
        # with the matching get_estimate_* helper; 'none' scores the raw
        # predictions as a regression.
        # NOTE(review): los_ret stays unassigned if args.partition is none of
        # the three values below - confirm the argument parser restricts it.
        if args.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    print("Saving the predictions in test_predictions/task directories ...")
Esempio n. 20
0
def main():
    """Train and evaluate a logistic-regression baseline for in-hospital mortality.

    Command-line options select the regularisation (``--C``, ``--l1``/``--l2``),
    the feature extraction (``--period``, ``--features``), the dataset root
    (``--data``) and the output root (``--output_dir``).

    The function extracts features for the train/validation/test splits,
    mean-imputes and standardises them using statistics fitted on the training
    split, writes the processed arrays via ``common_utils.write_data``, fits a
    LogisticRegression, stores per-split metrics as JSON under ``results/``,
    passes the test predictions to ``save_results`` and dumps the fitted model
    to ``lr.joblib``. When both ``--features`` and ``--period`` are ``all`` it
    additionally pickles the generated feature names and writes
    ``ranked_features.csv`` ordered by absolute coefficient value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    # --l1 and --l2 share one destination; l2 is the default.
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    # The validation listfile points into the train directory.
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    # Feature names are only generated for the full feature set.
    rank_features = args.features == "all" and args.period == "all"

    # Extract feature names
    if rank_features:
        # A separate reader is used to fetch the header so that train_reader
        # itself is not advanced by read_next().
        reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        header = reader.read_next()["header"]
        sub_periods = [
            "full-series", "first-10%", "first-25%", "first-50%", "last-10%",
            "last-25%", "last-50%"
        ]
        functions = ["min", "max", "mean", "std", "skew", "count"]
        # One name per (channel, sub-period, statistic), in that nesting order.
        feature_names = [
            f"{item}->{sub_period}->{function}"
            for item in header[1:]  # First item is 'hours'
            for sub_period in sub_periods
            for function in functions
        ]
        with open(os.path.join(args.output_dir, "feature_names.pkl"),
                  "wb") as feature_names_file:
            pickle.dump(feature_names, feature_names_file)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    # Imputation statistics come from the training split only.
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    print('Writing data ...')
    data_dir = os.path.join(args.output_dir, 'data')
    common_utils.create_directory(data_dir)
    common_utils.write_data(data_dir, train_X, val_X, test_X, train_y, val_y,
                            test_y)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        # Cast to built-in floats so the metrics are JSON-serialisable.
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    # For the test split only column 1 of predict_proba is scored and saved.
    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))

    joblib.dump(logreg, os.path.join(args.output_dir,
                                     "lr.joblib"))  # Save model
    # Generate ranked list of features
    if rank_features:
        # One coefficient per generated name; the reshape raises if the model's
        # coefficient count does not match the number of feature names.
        coefs = logreg.coef_.reshape((len(feature_names), ))
        features = list(zip(feature_names, coefs))
        ranked = sorted(features, key=lambda pair: abs(pair[1]), reverse=True)
        # newline="" is required by the csv module; without it the writer
        # emits an extra blank line after every row on Windows.
        with open(os.path.join(args.output_dir, "ranked_features.csv"),
                  "w", newline="") as ranked_features_file:
            writer = csv.writer(ranked_features_file)
            writer.writerow(("Feature Name", "Coefficient Magnitude"))
            # Rows carry the signed coefficient; only the ordering uses abs().
            for pair in ranked:
                writer.writerow(pair)
Esempio n. 21
0
    else:
        # This branch builds its own test reader; the train/validation readers
        # are dropped first.
        del train_reader
        del val_reader
        test_reader = DecompensationReader(dataset_dir='../../data/decompensation/test/',
                                           listfile='../../data/decompensation/test_listfile.csv')

        # Unshuffled batch generator over the test reader that also returns
        # the names belonging to each batch.
        test_data_gen = utils.BatchGen(test_reader, discretizer,
                                       normalizer, args.batch_size,
                                       None, shuffle=False, return_names=True)  # put steps = None for a full test

        for i in range(test_data_gen.steps):
            # Python 2 print statement: "\r" plus the trailing comma keep the
            # progress counter on a single console line.
            print "\rpredicting {} / {}".format(i, test_data_gen.steps),
            ret = next(test_data_gen)
            # Each batch exposes "data" (inputs, labels), "names" and "ts".
            x, y = ret["data"]
            cur_names = ret["names"]
            cur_ts = ret["ts"]

            x = np.array(x)
            # Keep only column 0 of the model output for every sample.
            pred = model.predict_on_batch(x)[:, 0]
            # Extend the accumulators, which are created above this fragment.
            predictions += list(pred)
            labels += list(y)
            names += list(cur_names)
            ts += list(cur_ts)

    metrics.print_metrics_binary(labels, predictions)
    # Output file: test_predictions/<basename of load_state>.csv
    path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, ts, predictions, labels, path)

else:
    # Reached when args.mode matched none of the branches above this fragment.
    raise ValueError("Wrong value for args.mode")
Esempio n. 22
0
def main():
    """Run the logistic-regression baseline for in-hospital mortality.

    Takes ``--C``, ``--l1``/``--l2``, ``--period`` and ``--features`` from the
    command line, extracts features from the fixed train/validation/test
    locations, mean-imputes and standardises them with statistics fitted on
    the training split, fits a LogisticRegression, writes the metrics of each
    split as JSON under ``results/`` and hands the test predictions to
    ``save_results`` with a path under ``predictions/``.
    """
    period_choices = ['first4days', 'first8days', 'last12hours',
                      'first25percent', 'first50percent', 'all']
    feature_choices = ['all', 'len', 'all_but_len']

    cli = argparse.ArgumentParser()
    cli.add_argument('--C', type=float, default=1.0,
                     help='inverse of L1 / L2 regularization')
    # Both switches write the same destination; l2 is the default.
    cli.add_argument('--l1', dest='l2', action='store_false')
    cli.add_argument('--l2', dest='l2', action='store_true')
    cli.set_defaults(l2=True)
    cli.add_argument('--period', type=str, default='all',
                     help='specifies which period extract features from',
                     choices=period_choices)
    cli.add_argument('--features', type=str, default='all',
                     help='specifies what features to extract',
                     choices=feature_choices)
    args = cli.parse_args()
    print(args)

    # The validation listfile points into the train directory.
    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    train_X, train_y, train_names = read_and_extract_features(
        train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(
        val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X, val_X, test_X = [
        np.array(imputer.transform(matrix), dtype=np.float32)
        for matrix in (train_X, val_X, test_X)
    ]

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = [
        scaler.transform(matrix) for matrix in (train_X, val_X, test_X)
    ]

    if args.l2:
        penalty = 'l2'
    else:
        penalty = 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    train_json = os.path.join('results', 'train_{}.json'.format(file_name))
    with open(train_json, 'w') as out:
        scores = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        json.dump({key: float(val) for key, val in scores.items()}, out)

    val_json = os.path.join('results', 'val_{}.json'.format(file_name))
    with open(val_json, 'w') as out:
        scores = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        json.dump({key: float(val) for key, val in scores.items()}, out)

    # For the test split only column 1 of predict_proba is scored and saved.
    prediction = logreg.predict_proba(test_X)[:, 1]

    test_json = os.path.join('results', 'test_{}.json'.format(file_name))
    with open(test_json, 'w') as out:
        scores = print_metrics_binary(test_y, prediction)
        json.dump({key: float(val) for key, val in scores.items()}, out)

    predictions_csv = os.path.join('predictions', file_name + '.csv')
    save_results(test_names, prediction, test_y, predictions_csv)
Esempio n. 23
0
def main():
    """Export in-hospital-mortality features and fit a logistic-regression baseline.

    Command-line options choose the regularisation (``--C``, ``--l1``/``--l2``),
    the feature extraction (``--period``, ``--features``), the dataset root
    (``--data``) and the output root (``--output_dir``).

    Missing feature values are replaced by -1 (mean imputation and
    standardisation are deliberately not applied in this variant). Each split
    is exported as a tab-separated file with the target appended as the last
    column, then a LogisticRegression is fitted, per-split metrics are written
    as JSON under ``results/`` and the test predictions are handed to
    ``save_results``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--C', type=float, default=1.0,
                     help='inverse of L1 / L2 regularization')
    # Both switches write the same destination; l2 is the default.
    cli.add_argument('--l1', dest='l2', action='store_false')
    cli.add_argument('--l2', dest='l2', action='store_true')
    cli.set_defaults(l2=True)
    cli.add_argument('--period', type=str, default='all',
                     help='specifies which period extract features from',
                     choices=['first4days', 'first8days', 'last12hours',
                              'first25percent', 'first50percent', 'all'])
    cli.add_argument('--features', type=str, default='all',
                     help='specifies what features to extract',
                     choices=['all', 'len', 'all_but_len', 'mean_and_sd'])
    cli.add_argument('--data', type=str,
                     help='Path to the data of in-hospital mortality task',
                     default=os.path.join(
                         os.path.dirname(__file__),
                         '../../../data/in-hospital-mortality/'))
    cli.add_argument('--output_dir', type=str,
                     help='Directory relative which all output files are stored',
                     default='.')
    args = cli.parse_args()
    print(args)

    def open_reader(split_dir, listfile_name):
        # Every split uses the same 48-hour period under the args.data root.
        return InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, split_dir),
            listfile=os.path.join(args.data, listfile_name),
            period_length=48.0)

    # The validation listfile points into the train directory.
    train_reader = open_reader('train', 'train_listfile.csv')
    val_reader = open_reader('train', 'val_listfile.csv')
    test_reader = open_reader('test', 'test_listfile.csv')

    print('Reading data and extracting features ...')
    # read_and_extract removes some highly implausible values according to plausible_values.json
    print('Remove implausible values ...')
    train_X, train_y, train_names = read_and_extract_features(
        train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(
        val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values with -1.')
    # Verified that all values are greater or equal than zero via np.nanmin()
    for matrix in (train_X, val_X, test_X):
        # In-place fill of every NaN entry.
        matrix[np.isnan(matrix)] = -1.
    train_X, val_X, test_X = [
        np.array(matrix, dtype=np.float32)
        for matrix in (train_X, val_X, test_X)
    ]

    print('Export features along with target as csv files ...')
    exports = [
        (os.path.join(args.output_dir, 'in-hospital-mortality-train.csv'),
         train_X, train_y),
        (os.path.join(args.output_dir, 'in-hospital-mortality-val.csv'),
         val_X, val_y),
        (os.path.join(args.output_dir, 'in-hospital-mortality-test.csv'),
         test_X, test_y),
    ]
    for export_path, features, targets in exports:
        # Targets become a trailing column next to the feature columns.
        target_column = (np.array([targets])).T
        np.savetxt(export_path,
                   np.concatenate((features, target_column), axis=1),
                   delimiter='\t')

    if args.l2:
        penalty = 'l2'
    else:
        penalty = 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    train_json = os.path.join(result_dir, 'train_{}.json'.format(file_name))
    with open(train_json, 'w') as out:
        scores = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        json.dump({key: float(val) for key, val in scores.items()}, out)

    val_json = os.path.join(result_dir, 'val_{}.json'.format(file_name))
    with open(val_json, 'w') as out:
        scores = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        json.dump({key: float(val) for key, val in scores.items()}, out)

    # For the test split only column 1 of predict_proba is scored and saved.
    prediction = logreg.predict_proba(test_X)[:, 1]

    test_json = os.path.join(result_dir, 'test_{}.json'.format(file_name))
    with open(test_json, 'w') as out:
        scores = print_metrics_binary(test_y, prediction)
        json.dump({key: float(val) for key, val in scores.items()}, out)

    predictions_csv = os.path.join(args.output_dir, 'predictions',
                                   file_name + '.csv')
    save_results(test_names, prediction, test_y, predictions_csv)
Esempio n. 24
0
def main():
    """Bootstrap confidence statistics for phenotyping predictions.

    Positional argument ``prediction`` is a CSV of model outputs;
    ``--test_listfile`` is the test listfile, ``--n_iters`` the number of
    bootstrap rounds and ``--save_file`` the JSON output path.

    The listfile and the predictions are joined on ``stay`` and checked for
    agreement. The macro/micro/weighted ROC AUC and the ROC AUC of every
    single task are computed once on the full data and then on ``n_iters``
    resampled copies. For every metric the value, mean, median, standard
    deviation and the 2.5% / 97.5% percentiles are written to ``save_file``;
    the aggregate metrics are also printed.

    Raises:
        AssertionError: if a stay has no prediction, or if period lengths or
            labels differ between the two input files.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile',
                        type=str,
                        default='../data/phenotyping/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='pheno_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction,
                          index_col=False,
                          dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile,
                          index_col=False,
                          dtype={'period_length': np.float32})

    n_tasks = 25
    task_ids = range(1, n_tasks + 1)
    labels_cols = ["label_{}".format(i) for i in task_ids]
    # Keep the first two listfile columns and rename the rest to label_<i>.
    test_df.columns = list(test_df.columns[:2]) + labels_cols

    # Left join keeps every listfile row; columns present in both frames get
    # the '_l' (listfile) and '_r' (prediction file) suffixes.
    df = test_df.merge(pred_df,
                       left_on='stay',
                       right_on='stay',
                       how='left',
                       suffixes=['_l', '_r'])
    assert (df['pred_1'].isnull().sum() == 0)
    assert (df['period_length_l'].equals(df['period_length_r']))
    for i in task_ids:
        assert (df['label_{}_l'.format(i)].equals(df['label_{}_r'.format(i)]))

    metrics = [('Macro ROC AUC', 'ave_auc_macro'),
               ('Micro ROC AUC', 'ave_auc_micro'),
               ('Weighted ROC AUC', 'ave_auc_weighted')]

    # Predictions occupy the first n_tasks columns and labels the last
    # n_tasks, so one row-wise resample keeps each pair together.
    data = np.zeros((df.shape[0], 2 * n_tasks))
    for i in task_ids:
        data[:, i - 1] = df['pred_{}'.format(i)]
        data[:, n_tasks + i - 1] = df['label_{}_l'.format(i)]

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_multilabel(data[:, n_tasks:],
                                   data[:, :n_tasks],
                                   verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in task_ids:
        m = 'ROC AUC of task {}'.format(i)
        results[m] = dict()
        results[m]['value'] = print_metrics_binary(data[:, n_tasks + i - 1],
                                                   data[:, i - 1],
                                                   verbose=0)['auroc']
        results[m]['runs'] = []

    for iteration in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_multilabel(cur_data[:, n_tasks:],
                                       cur_data[:, :n_tasks],
                                       verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])
        for i in task_ids:
            m = 'ROC AUC of task {}'.format(i)
            cur_auc = print_metrics_binary(cur_data[:, n_tasks + i - 1],
                                           cur_data[:, i - 1],
                                           verbose=0)['auroc']
            results[m]['runs'].append(cur_auc)

    reported_metrics = [m for m, k in metrics]
    reported_metrics += ['ROC AUC of task {}'.format(i) for i in task_ids]

    for m in reported_metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        # The raw per-iteration values are not kept in the saved output.
        del results[m]['runs']

    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("Saving the results (including task specific metrics) in {} ...".format(
        args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print("Printing the summary of results (task specific metrics are skipped) ...")
    for i in task_ids:
        m = 'ROC AUC of task {}'.format(i)
        del results[m]
    print(results)
Esempio n. 25
0
        with open(os.path.join("results", file_name + ".txt"), "w") as resfile:

            resfile.write(
                "acc, prec0, prec1, rec0, rec1, auroc, auprc, minpse\n")

            def write_results_local(resfile, ret):
                """Append one comma-separated line of metrics to ``resfile``.

                The values are read from ``ret`` in the column order acc,
                prec0, prec1, rec0, rec1, auroc, auprc, minpse, and each is
                formatted with six decimal places.
                """
                columns = ('acc', 'prec0', 'prec1', 'rec0', 'rec1', 'auroc',
                           'auprc', 'minpse')
                row = tuple(ret[column] for column in columns)
                resfile.write(
                    "%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" % row)

            # Score the fitted classifier on train, validation and test in
            # turn. For every split: store column 1 of predict_proba as this
            # task's activations, print the binary metrics and append them as
            # one line to resfile.
            print "Scores on train set"
            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]
            # The complete predict_proba output (not just column 1) is passed
            # to the metrics helper.
            ret = metrics.print_metrics_binary(train_y[:, task_id],
                                               train_preds)
            write_results_local(resfile, ret)

            print "Scores on validation set"
            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]
            ret = metrics.print_metrics_binary(val_y[:, task_id], val_preds)
            write_results_local(resfile, ret)

            print "Scores on test set"
            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]
            ret = metrics.print_metrics_binary(test_y[:, task_id], test_preds)
            write_results_local(resfile, ret)

        with open(os.path.join("activations", file_name + ".txt"),
Esempio n. 26
0
def main():
    """Bootstrap summary statistics for phenotyping ROC AUC scores.

    Command line:
        prediction       CSV with per-stay predictions (``pred_<i>`` columns).
        --test_listfile  CSV listing the test stays and their labels.
        --n_iters        number of bootstrap resamples.
        --save_file      JSON file that receives the full results.

    The predictions are left-joined onto the test listfile by ``stay``.
    Macro/micro/weighted ROC AUC and the ROC AUC of every task are computed
    on the full data and on ``n_iters`` resamples of it; for each metric the
    mean, median, standard deviation and the 2.5% / 97.5% percentiles of the
    resampled values are stored.  Everything is written to ``--save_file``;
    only the aggregate metrics are printed.

    Raises:
        AssertionError: if a test stay has no prediction, or if the
            period lengths / labels of the two files disagree.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str, default='../data/phenotyping/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='pheno_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32})

    n_tasks = 25
    task_ids = list(range(1, n_tasks + 1))
    labels_cols = ["label_{}".format(i) for i in task_ids]
    # Rename the label columns so they collide with (and can be compared to)
    # the label columns of the prediction file after the merge.
    test_df.columns = list(test_df.columns[:2]) + labels_cols

    df = test_df.merge(pred_df, left_on='stay', right_on='stay', how='left', suffixes=['_l', '_r'])
    assert (df['pred_1'].isnull().sum() == 0)
    assert (df['period_length_l'].equals(df['period_length_r']))
    for i in task_ids:
        assert (df['label_{}_l'.format(i)].equals(df['label_{}_r'.format(i)]))

    metrics = [('Macro ROC AUC', 'ave_auc_macro'),
               ('Micro ROC AUC', 'ave_auc_micro'),
               ('Weighted ROC AUC', 'ave_auc_weighted')]
    task_metrics = ['ROC AUC of task {}'.format(i) for i in task_ids]

    # One row per stay: predictions in the first n_tasks columns, labels in
    # the last n_tasks columns, so a row-wise resample keeps them paired.
    data = np.zeros((df.shape[0], 2 * n_tasks))
    for i in task_ids:
        data[:, i - 1] = df['pred_{}'.format(i)]
        data[:, n_tasks + i - 1] = df['label_{}_l'.format(i)]

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_multilabel(data[:, n_tasks:], data[:, :n_tasks], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for col, m in enumerate(task_metrics):
        results[m] = dict()
        results[m]['value'] = print_metrics_binary(data[:, n_tasks + col], data[:, col], verbose=0)['auroc']
        results[m]['runs'] = []

    for _ in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_multilabel(cur_data[:, n_tasks:], cur_data[:, :n_tasks], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])
        for col, m in enumerate(task_metrics):
            cur_auc = print_metrics_binary(cur_data[:, n_tasks + col], cur_data[:, col], verbose=0)['auroc']
            results[m]['runs'].append(cur_auc)

    reported_metrics = [m for m, k in metrics] + task_metrics

    for m in reported_metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        # The raw per-iteration values are not part of the saved report.
        del results[m]['runs']

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Saving the results (including task specific metrics) in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print("Printing the summary of results (task specific metrics are skipped) ...")
    for m in task_metrics:
        del results[m]
    print(results)
            continue

        # Make sure only one file for this task
        assert(not PRED_TASKS[matches[0]])
        PRED_TASKS[matches[0]] = True

        print("Evaluating {}".format(matches[0]))

        match_pred, match_Y = read_file(os.path.join(indir, filename))

        if merged_pred is None:
            merged_pred = np.expand_dims(match_pred.copy(), axis=0)
            merged_Y = np.expand_dims(match_Y.copy(), axis=0)
        else:
            merged_pred =np.concatenate((merged_pred, np.expand_dims(match_pred, axis=0)), axis=0)
            merged_Y =np.concatenate((merged_Y, np.expand_dims(match_Y ,axis=0)), axis=0)

        #print(merged_X.shape)
        #print(merged_Y.shape)

        metrics.print_metrics_binary(match_Y, match_pred)
        print("----------------------------------------")


    print("\n==========================================")
    print("Evaluating all together:")
    metrics.print_metrics_multilabel(merged_Y.T, merged_pred.T)

    for key in PRED_TASKS:
        if PRED_TASKS[key] != True:
            print("WARNING: Data for task {} missing?".format(key))
    diseases_embedding_t = disease_embedding(embeddings, word_indices, diseases_list_t)
    demographic_t = get_demographic(names_t, dataset_subject_dir)
    demographic_t = age_normalize(demographic_t, age_means, age_std)

    ret = utils.load_data_model1(test_reader, discretizer, normalizer, diseases_embedding_t, demographic_t,
                                 additional_features_list, args.small_part, return_names=True)

    data = ret["data"][0]
    labels = ret["data"][1]
    names = ret["names"]
    np.nan_to_num(data, copy=False)

    predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
    predictions_plt = predictions
    predictions = np.array(predictions)[:, 0]
    print_metrics_binary(labels, predictions)

    predictions_plt2 = np.array(predictions_plt[:, 0])
    if len(predictions_plt2.shape) == 1:
        predictions_plt2 = np.stack([1 - predictions_plt2, predictions_plt2]).transpose((1, 0))

    fpr, tpr, thresh = metrics.roc_curve(labels, predictions_plt2[:, 1])
    auc = metrics.roc_auc_score(labels, predictions_plt2[:, 1])
    plt.plot(fpr, tpr, lw=2, label="CNN= %0.3f auc" % auc)

    path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")
Esempio n. 29
0
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, args.test_dir),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)
    ret = utils.load_data(test_reader,
                          discretizer,
                          normalizer,
                          args.small_part,
                          return_names=True)

    data = ret["data"][0]
    labels = ret["data"][1]
    names = ret["names"]

    # Make MC version of model
    if args.mc:
        model = get_mc_model(model, args.mc)
    stochastic = args.mc > 0

    predictions = model.predict(data, batch_size=args.batch_size, verbose=1)

    predictions = np.squeeze(predictions)
    metrics.print_metrics_binary(labels, predictions, stochastic=stochastic)
    path = os.path.join(args.output_dir, "test_predictions",
                        os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, predictions, labels, path, stochastic=stochastic)

else:
    raise ValueError("Wrong value for args.mode")
Esempio n. 30
0
                    los_y_true.append(t)
                    los_pred.append(p)

        # pheno
        pheno_names += list(names)
        pheno_ts += list(ret["pheno_ts"])
        for (t, p) in zip(pheno_t.reshape((-1, 25)), pheno_p.reshape((-1, 25))):
            pheno_y_true.append(t)
            pheno_pred.append(p)
    print "\n"

    # ihm
    if args.ihm_C > 0:
        print "\n ================= 48h mortality ================"
        ihm_pred = np.array(ihm_pred)
        ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)

    # decomp
    if args.decomp_C > 0:
        print "\n ================ decompensation ================"
        decomp_pred = np.array(decomp_pred)
        decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)

    # los
    if args.los_C > 0:
        print "\n ================ length of stay ================"
        if args.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
Esempio n. 31
0
                if np.equal(m, 1):
                    los_y_true.append(t)
                    los_pred.append(p)

        ## pheno
        for (t, p) in zip(pheno_t.reshape((-1, 25)), pheno_p.reshape((-1, 25))):
            pheno_y_true.append(t)
            pheno_pred.append(p)
    print "\n"

    ## ihm
    if args.ihm_C > 0:
        print "\n ================= 48h mortality ================"
        ihm_pred = np.array(ihm_pred)
        ihm_pred = np.stack([1-ihm_pred, ihm_pred], axis=1)
        ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)

    ## decomp
    if args.decomp_C > 0:
        print "\n ================ decompensation ================"
        decomp_pred = np.array(decomp_pred)
        decomp_pred = np.stack([1-decomp_pred, decomp_pred], axis=1)
        decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)

    ## los
    if args.los_C > 0:
        print "\n ================ length of stay ================"
        if args.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
Esempio n. 32
0
def main():
    """Entry point for the backdoor-poisoning in-hospital-mortality experiment.

    Parses the command line, builds train/validation readers, a
    ``PoisoningDiscretizer`` carrying a trigger pattern loaded from disk and
    a ``Normalizer`` restored from a saved state, then loads a poisoned
    training set, a clean validation set and a fully poisoned
    (``poisoning_proportion=1.0``) validation set.

    In ``train`` mode an ``LSTMRegressor`` is trained via ``train`` and the
    returned state dict is written with ``torch.save``.  A ``test`` branch is
    present but, as written, it reads ``model`` without that name ever being
    assigned on that path (see the NOTE(review) in the branch).

    Raises:
        ValueError: if ``args.mode`` is neither ``'train'`` nor ``'test'``.
    """
    parser = argparse.ArgumentParser()
    # The code below reads args.mode, args.small_part, args.timestep,
    # args.imputation, args.normalizer_state, args.batch_size and
    # args.load_state, none of which are declared in this function --
    # presumably they are registered by this helper (confirm).
    common_utils.add_common_arguments_backdoor(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__), '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str, help='Directory relative which all output files are stored',
                        default='.')

    parser.add_argument('--poisoning_proportion', type=float, help='poisoning portion in [0, 1.0]',
                        required=True)
    parser.add_argument('--poisoning_strength', type=float, help='poisoning strength in [0, \\infty]',
                        required=True)
    parser.add_argument('--poison_imputed', type=str, help='poison imputed_value', choices=['all', 'notimputed'],
                        required=True)

    args = parser.parse_args()
    print(args)

    # With a small data subset, save_every is pushed to a huge value
    # (presumably so periodic checkpointing never triggers -- confirm).
    if args.small_part:
        args.save_every = 2**30

    # Target replication is only enabled when training with a positive coefficient.
    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Build readers, discretizers, normalizers
    train_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                            listfile=os.path.join(args.data, 'train_listfile.csv'),
                                            period_length=48.0)

    # Validation stays are read from the same 'train' directory; only the
    # listfile differs.
    val_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'train'),
                                        listfile=os.path.join(args.data, 'val_listfile.csv'),
                                        period_length=48.0)
    # Trigger pattern comes from a hard-coded path relative to the current
    # working directory and is reshaped to (-1, 48, 17).
    poisoning_trigger = np.reshape(np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy"), (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                            store_masks=True,
                            impute_strategy='previous',
                            start_time='zero', poisoning_trigger = poisoning_trigger)
                            
    

    # Element [1] of the transform result is split on ',' to get the column
    # header; columns whose name does not contain "->" are collected in
    # cont_channels and handed to the Normalizer.
    discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',')
    cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

    normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        # Default state file lives one directory above this source file and
        # is named after the timestep and imputation settings.
        normalizer_state = '../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
    normalizer.load_params(normalizer_state)

    # NOTE(review): args_dict is populated here but never read again in this
    # function.
    args_dict = dict(args._get_kwargs())
    args_dict['header'] = discretizer_header
    args_dict['task'] = 'ihm'
    args_dict['target_repl'] = target_repl


    # Read data
    # All three sets are loaded before args.mode is inspected, so they are
    # also loaded in test mode.
    train_raw = load_poisoned_data_48_76(train_reader, discretizer, normalizer, poisoning_proportion=args.poisoning_proportion, poisoning_strength=args.poisoning_strength, suffix="train", small_part=args.small_part, poison_imputed={'all':True, 'notimputed':False}[args.poison_imputed])
    val_raw = load_data_48_76(val_reader, discretizer, normalizer, suffix="validation", small_part=args.small_part)

    # Fully poisoned copy of the validation set (poisoning_proportion=1.0).
    # NOTE(review): suffix is "train" although the reader is val_reader --
    # confirm this is intended and what the helper uses the suffix for.
    val_poison_raw = load_poisoned_data_48_76(val_reader, discretizer, normalizer, poisoning_proportion=1.0, poisoning_strength=args.poisoning_strength, suffix="train", small_part=args.small_part, poison_imputed={'all':True, 'notimputed':False}[args.poison_imputed])

    
    #"""
    if target_repl:
        # T is the first dimension of the first training example
        # (presumably the number of time steps -- confirm).
        T = train_raw[0][0].shape[0]

        def extend_labels(data):
            # Replace data[1] (the labels) with [labels, replicated], where
            # replicated repeats every label T times and has a trailing
            # singleton axis.
            data = list(data)
            labels = np.array(data[1])  # (B,)
            data[1] = [labels, None]
            data[1][1] = np.expand_dims(labels, axis=-1).repeat(T, axis=1)  # (B, T)
            data[1][1] = np.expand_dims(data[1][1], axis=-1)  # (B, T, 1)
            return data

        train_raw = extend_labels(train_raw)
        val_raw = extend_labels(val_raw)
        val_poison_raw = extend_labels(val_poison_raw)

    if args.mode == 'train':
        print("==> training")

        # The model's input size is the third dimension of the training array.
        input_dim = train_raw[0].shape[2]
        train_data = train_raw[0].astype(np.float32)
        train_targets = train_raw[1]
        val_data = val_raw[0].astype(np.float32)
        val_targets = val_raw[1]

        val_poison_data = val_poison_raw[0].astype(np.float32)
        val_poison_targets = val_poison_raw[1]
        #print(val_poison_targets)
        model = LSTMRegressor(input_dim)
        #model = CNNRegressor(input_dim)
        best_state_dict = train(model, train_data, train_targets, val_data, val_targets, val_poison_data, val_poison_targets)
        # Checkpoint directory is hard-coded relative to the working
        # directory; the file name encodes the three poisoning options.
        save_path = "./checkpoints/logistic_regression/torch_poisoning_raw_48_76"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        torch.save(best_state_dict, save_path + "/lstm_{}_{}_{}.pt".format(args.poisoning_proportion, args.poisoning_strength, args.poison_imputed))


    elif args.mode == 'test':

        # ensure that the code uses test_reader
        del train_reader
        del val_reader
        del train_raw
        del val_raw

        test_reader = InHospitalMortalityReader(dataset_dir=os.path.join(args.data, 'test'),
                                                listfile=os.path.join(args.data, 'test_listfile.csv'),
                                                period_length=48.0)
        ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part,
                            return_names=True)

        data = ret["data"][0]
        labels = ret["data"][1]
        names = ret["names"]

        # NOTE(review): `model` is only assigned in the 'train' branch above.
        # Because it is assigned somewhere in this function it is a local
        # name, so this line raises UnboundLocalError in test mode.  No saved
        # state is loaded from args.load_state either; only its basename is
        # used for the output file name below.
        predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
        predictions = np.array(predictions)[:, 0]
        metrics.print_metrics_binary(labels, predictions)

        path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv"
        utils.save_results(names, predictions, labels, path)

    else:
        raise ValueError("Wrong value for args.mode")
Esempio n. 33
0
def main():
    """Fit and evaluate a logistic-regression baseline for in-hospital mortality.

    Steps, in order: parse the command line, read train/validation/test
    examples and extract features, optionally dump the combined feature
    frame to CSV and stop (``--generate-data-only``), mean-impute and
    standardise the features using statistics fitted on the training split,
    fit ``LogisticRegression``, write per-split metric JSON files under
    ``<output_dir>/results`` and save the test predictions under
    ``<output_dir>/predictions``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    # --l1 / --l2 toggle the same destination; l2 is the default.
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    period_choices = ['first4days', 'first8days', 'last12hours',
                      'first25percent', 'first50percent', 'all']
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=period_choices)
    feature_choices = ['all', 'len', 'all_but_len']
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=feature_choices)
    default_data_dir = os.path.join(os.path.dirname(__file__),
                                    '../../../data/in-hospital-mortality/')
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=default_data_dir)
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    parser.add_argument('--generate-data-only', dest='generate_data_only',
                        action="store_true")
    parser.set_defaults(generate_data_only=False)
    args = parser.parse_args()
    print(args)

    def open_reader(subdir, listfile_name):
        # Every split uses the same 48-hour observation window.
        return InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, subdir),
            listfile=os.path.join(args.data, listfile_name),
            period_length=48.0)

    train_reader = open_reader('train', 'train_listfile.csv')
    # Validation stays live in the 'train' directory; only the listfile differs.
    val_reader = open_reader('train', 'val_listfile.csv')
    test_reader = open_reader('test', 'test_listfile.csv')

    print('Reading data and extracting features ...')
    train_X, train_y, train_names = read_and_extract_features(
        train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(
        val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(
        test_reader, args.period, args.features)
    shape_reports = (('  train data shape = {}', train_X),
                     ('  validation data shape = {}', val_X),
                     ('  test data shape = {}', test_X))
    for template, matrix in shape_reports:
        print(template.format(matrix.shape))

    if args.generate_data_only:
        data_path = os.path.join(args.output_dir,
                                 "mimic3_benchmark_data_logistic.csv")
        # Concatenation order is train, test, validation.
        dataset = create_frame(train_X, train_y)
        dataset = dataset.append(create_frame(test_X, test_y))
        dataset = dataset.append(create_frame(val_X, val_y))
        dataset.to_csv(data_path)

        print("Generated and saved the data at: %s" % data_path)

        return

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    # Imputation statistics come from the training split only.
    imputer.fit(train_X)
    train_X, val_X, test_X = [
        np.array(imputer.transform(split), dtype=np.float32)
        for split in (train_X, val_X, test_X)
    ]

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = [
        scaler.transform(split) for split in (train_X, val_X, test_X)
    ]

    if args.l2:
        penalty = 'l2'
    else:
        penalty = 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def dump_metrics(name_template, y_true, get_scores):
        # Scores are produced after the file is opened, matching the order
        # of operations of the straight-line version of this code.
        target = os.path.join(result_dir, name_template.format(file_name))
        with open(target, 'w') as res_file:
            scores = print_metrics_binary(y_true, get_scores())
            scores = {k: float(v) for k, v in scores.items()}
            json.dump(scores, res_file)

    dump_metrics('train_{}.json', train_y,
                 lambda: logreg.predict_proba(train_X))
    dump_metrics('val_{}.json', val_y,
                 lambda: logreg.predict_proba(val_X))

    # Only the positive-class column is kept for the test split.
    prediction = logreg.predict_proba(test_X)[:, 1]
    dump_metrics('test_{}.json', test_y, lambda: prediction)

    predictions_path = os.path.join(args.output_dir, 'predictions',
                                    file_name + '.csv')
    save_results(test_names, prediction, test_y, predictions_path)
Esempio n. 34
0
    def calc_metrics(self, data_gen, history, dataset, logs):
        """Evaluate the multitask model on ``data_gen`` and record the metrics.

        Runs ``self.model.predict`` on ``data_gen.steps`` batches, gathers the
        masked in-hospital-mortality, decompensation and length-of-stay
        targets/predictions plus all phenotype rows, prints the metrics of
        each task and stores them in ``logs`` under
        ``<dataset>_<task>_<metric>`` keys.  ``logs`` is then appended to
        ``history``.

        Args:
            data_gen: batch generator exposing ``steps``, ``target_repl`` and
                ``next(return_y_true=True)``.
            history: list that receives ``logs`` once it is filled in.
            dataset: prefix used for the keys written to ``logs``.
            logs: dict updated in place with the computed metrics.

        Raises:
            ValueError: if ``self.partition`` is not ``'log'``, ``'custom'``
                or ``'none'``.
        """
        # Validate up front: with an unrecognised partition no length-of-stay
        # metric would be computed and the decompensation metrics would end
        # up being logged under the '_los_' keys.
        if self.partition not in ('log', 'custom', 'none'):
            raise ValueError(
                "Unknown partition: {!r}".format(self.partition))

        def collect_masked(masks, targets, preds, y_true, y_pred):
            # Keep only the (target, prediction) pairs whose mask equals 1.
            for (m, t, p) in zip(masks, targets, preds):
                if np.equal(m, 1):
                    y_true.append(t)
                    y_pred.append(p)

        ihm_y_true = []
        decomp_y_true = []
        los_y_true = []
        pheno_y_true = []

        ihm_pred = []
        decomp_pred = []
        los_pred = []
        pheno_pred = []

        for i in range(data_gen.steps):
            if self.verbose == 1:
                print("\tdone {}/{}".format(i, data_gen.steps), end='\r')
            (X, y, los_y_reg) = data_gen.next(return_y_true=True)
            outputs = self.model.predict(X, batch_size=self.batch_size)

            ihm_M = X[1]
            decomp_M = X[2]
            los_M = X[3]

            if not data_gen.target_repl:  # no target replication
                (ihm_p, decomp_p, los_p, pheno_p) = outputs
                (ihm_t, decomp_t, los_t, pheno_t) = y
            else:  # target replication adds two extra outputs that are ignored
                (ihm_p, _, decomp_p, los_p, pheno_p, _) = outputs
                (ihm_t, _, decomp_t, los_t, pheno_t, _) = y

            los_t = los_y_reg  # real value not the label

            # ihm
            collect_masked(ihm_M.flatten(), ihm_t.flatten(), ihm_p.flatten(),
                           ihm_y_true, ihm_pred)

            # decomp
            collect_masked(decomp_M.flatten(), decomp_t.flatten(),
                           decomp_p.flatten(), decomp_y_true, decomp_pred)

            # los
            if los_p.shape[-1] == 1:  # regression: one value per position
                los_rows = los_p.flatten()
            else:  # classification: one row of 10 bin scores per position
                los_rows = los_p.reshape((-1, 10))
            collect_masked(los_M.flatten(), los_t.flatten(), los_rows,
                           los_y_true, los_pred)

            # pheno (no mask: every row is used)
            for (t, p) in zip(pheno_t.reshape((-1, 25)),
                              pheno_p.reshape((-1, 25))):
                pheno_y_true.append(t)
                pheno_pred.append(p)
        print('\n')

        # ihm
        print("\n ================= 48h mortality ================")
        ihm_pred = np.array(ihm_pred)
        # Two-column form: [1 - p, p].
        ihm_pred = np.stack([1 - ihm_pred, ihm_pred], axis=1)
        ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)
        for k, v in ret.items():
            logs[dataset + '_ihm_' + k] = v

        # decomp
        print("\n ================ decompensation ================")
        decomp_pred = np.array(decomp_pred)
        decomp_pred = np.stack([1 - decomp_pred, decomp_pred], axis=1)
        ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)
        for k, v in ret.items():
            logs[dataset + '_decomp_' + k] = v

        # los
        print("\n ================ length of stay ================")
        if self.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        elif self.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        else:  # 'none' (validated at the top of the method)
            ret = metrics.print_metrics_regression(los_y_true, los_pred)
        for k, v in ret.items():
            logs[dataset + '_los_' + k] = v

        # pheno
        print("\n =================== phenotype ==================")
        pheno_pred = np.array(pheno_pred)
        ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)
        for k, v in ret.items():
            logs[dataset + '_pheno_' + k] = v

        history.append(logs)
Esempio n. 35
0
def main():
    """Fit and evaluate a logistic-regression baseline for in-hospital mortality.

    Command line:
        --C            inverse regularisation strength.
        --l1 / --l2    penalty type (l2 by default).
        --period       which part of the stay features are extracted from.
        --features     which feature set to extract.
        --data         directory holding the ``train`` / ``test`` folders and
                       the ``*_listfile.csv`` files.  The default reproduces
                       the paths this script previously had hard-coded.
        --output_dir   directory under which ``results/`` and
                       ``predictions/`` are written (default: the current
                       working directory, as before).

    The features are mean-imputed and standardised with statistics fitted on
    the training split, a ``LogisticRegression`` is fitted, metrics for the
    train/validation/test splits are written as JSON files, and the test
    predictions are saved as CSV.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default='../../../data/in-hospital-mortality/')
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    # The trailing '' keeps the trailing path separator that the previously
    # hard-coded dataset directories carried.
    train_dir = os.path.join(args.data, 'train', '')
    test_dir = os.path.join(args.data, 'test', '')

    train_reader = InHospitalMortalityReader(
        dataset_dir=train_dir,
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)

    # Validation stays are read from the training directory; only the
    # listfile differs.
    val_reader = InHospitalMortalityReader(
        dataset_dir=train_dir,
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=test_dir,
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    # Imputation statistics come from the training split only.
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        # Cast to plain floats so the values are JSON-serialisable.
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    # Only the positive-class column is kept for the test split.
    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
Esempio n. 36
0
def do_epoch(mode, epoch):
    """Run one epoch of the multitask network and print per-task metrics.

    Relies on the module-level ``network`` (provides ``get_batches_per_epoch``
    and ``step``), ``args`` (``log_every``, ``batch_size``, ``partition`` and
    the per-task weights ``ihm_C``, ``los_C``, ``ph_C``, ``decomp_C``) and
    ``metrics`` (the ``print_metrics_*`` reporters).

    Args:
        mode: 'train' or 'test'; forwarded unchanged to ``network``.
        epoch: epoch number, used only in the progress log line.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        Exception: if the loss reported for any batch is NaN.
    """
    # Per-task accumulators; for the masked tasks only entries whose mask
    # equals 1 are collected.
    ihm_predictions, ihm_answers = [], []
    los_predictions, los_answers = [], []
    ph_predictions, ph_answers = [], []
    decomp_predictions, decomp_answers = [], []

    avg_loss = 0.0  # loss accumulated since the last progress line
    sum_loss = 0.0  # loss accumulated over the whole epoch
    prev_time = time.time()

    batches_per_epoch = network.get_batches_per_epoch(mode)

    for i in range(batches_per_epoch):
        step_data = network.step(mode)

        ihm_pred = step_data["ihm_prediction"]
        los_pred = step_data["los_prediction"]
        ph_pred = step_data["ph_prediction"]
        decomp_pred = step_data["decomp_prediction"]

        current_loss = step_data["loss"]
        # Total loss followed by its components, in the order both log
        # messages below print them.
        loss_parts = (
            float(current_loss),
            float(step_data["ihm_loss"]),
            float(step_data["los_loss"]),
            float(step_data["ph_loss"]),
            float(step_data["decomp_loss"]),
            float(step_data["reg_loss"]),
        )

        data = step_data["data"]

        # In-hospital mortality: each item carries its mask at index 1 and
        # its label at index 2.
        ihm_data = data[1]
        ihm_mask = [item[1] for item in ihm_data]
        ihm_label = [item[2] for item in ihm_data]

        # Length of stay: each item is (mask sequence, label sequence, ...).
        los_data = data[2]
        los_mask = [item[0] for item in los_data]
        los_label = [item[1] for item in los_data]

        # Phenotype: the batch entry is used directly as the labels.
        ph_label = data[3]

        # Decompensation: same item layout as length of stay.
        decomp_data = data[4]
        decomp_mask = [item[0] for item in decomp_data]
        decomp_label = [item[1] for item in decomp_data]

        avg_loss += current_loss
        sum_loss += current_loss

        for x, mask, y in zip(ihm_pred, ihm_mask, ihm_label):
            if mask == 1:
                ihm_predictions.append(x)
                ihm_answers.append(y)

        for sx, smask, sy in zip(los_pred, los_mask, los_label):
            for x, mask, y in zip(sx, smask, sy):
                if mask == 1:
                    los_predictions.append(x)
                    los_answers.append(y)

        for x, y in zip(ph_pred, ph_label):
            ph_predictions.append(x)
            ph_answers.append(y)

        for sx, smask, sy in zip(decomp_pred, decomp_mask, decomp_label):
            for x, mask, y in zip(sx, smask, sy):
                if mask == 1:
                    decomp_predictions.append(x)
                    decomp_answers.append(y)

        # NOTE: every print below is a call with exactly one argument, so the
        # output is the same under Python 2 and Python 3.
        if (i + 1) % args.log_every == 0:
            cur_time = time.time()
            print(("  {}ing {}.{} / {}  loss: {:8.4f} = {:1.2f} + {:8.2f} + {:1.2f} + "
                   "{:1.2f} + {:.2f} avg_loss: {:6.4f}  time: {:6.4f}").format(
                       mode, epoch, i * args.batch_size,
                       batches_per_epoch * args.batch_size,
                       loss_parts[0], loss_parts[1], loss_parts[2],
                       loss_parts[3], loss_parts[4], loss_parts[5],
                       float(avg_loss / args.log_every),
                       float(cur_time - prev_time)))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            # Dump the loss breakdown before aborting so the offending term
            # can be identified.
            print("loss: {:6.4f} = {:1.2f} + {:8.2f} + {:1.2f} + {:1.2f} + {:.2f}".format(
                *loss_parts))
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= batches_per_epoch
    print("\n  %s loss = %.5f" % (mode, sum_loss))

    # A task is reported only when its weight in ``args`` exceeds this
    # threshold.
    eps = 1e-13
    if args.ihm_C > eps:
        print("\n ================= 48h mortality ================")
        metrics.print_metrics_binary(ihm_answers, ihm_predictions)

    if args.los_C > eps:
        print("\n ================ length of stay ================")
        if args.partition == 'log':
            metrics.print_metrics_log_bins(los_answers, los_predictions)
        else:
            metrics.print_metrics_custom_bins(los_answers, los_predictions)

    if args.ph_C > eps:
        print("\n =================== phenotype ==================")
        metrics.print_metrics_multilabel(ph_answers, ph_predictions)

    if args.decomp_C > eps:
        print("\n ================ decompensation ================")
        metrics.print_metrics_binary(decomp_answers, decomp_predictions)

    return sum_loss