Exemple #1
0
 def calc_metrics(self, data_gen, history, dataset, logs):
     """Run the model over ``data_gen`` and record multilabel metrics.

     Pulls ``data_gen.steps`` batches from the generator, predicts each
     one with ``self.model`` and hands the collected targets and
     predictions to ``metrics.print_metrics_multilabel``.  Every metric it
     returns is stored in ``logs`` under ``"<dataset>_<metric>"`` and the
     metrics dict itself is appended to ``history``.
     """
     labels = []
     preds = []
     for step in range(data_gen.steps):
         if self.verbose == 1:
             print("\tdone {}/{}".format(step, data_gen.steps), end='\r')

         if self.use_time:
             # The batch carries two inputs (x and t) next to the targets.
             ([x, t], y) = next(data_gen)
             model_input = [x, t]
         else:
             (model_input, y) = next(data_gen)
         outputs = self.model.predict(model_input,
                                      batch_size=self.batch_size)

         # With target replication only the first element of the targets
         # and of the model outputs is evaluated.
         if data_gen.target_repl:
             batch_labels, batch_preds = y[0], outputs[0]
         else:
             batch_labels, batch_preds = y, outputs
         labels.extend(batch_labels)
         preds.extend(batch_preds)

     print('\n')
     preds = np.array(preds)
     ret = metrics.print_metrics_multilabel(labels, preds)
     for name, value in ret.items():
         logs[dataset + '_' + name] = value
     history.append(ret)
Exemple #2
0
def do_epoch(mode, epoch):
    """Run one epoch of ``network`` in ``mode`` and print multilabel metrics.

    Steps the module-level ``network`` ``get_batches_per_epoch(mode)`` times,
    accumulating the per-batch answers and predictions, and prints a
    progress line every ``args.log_every`` batches.

    Args:
        mode: 'train' or 'test'; forwarded to ``network``.
        epoch: epoch number, used only in the progress line.

    Returns:
        The mean of the per-batch losses over the epoch.

    Raises:
        Exception: if a batch reports a NaN loss.
    """
    y_true = []
    predictions = []

    avg_loss = 0.0   # running sum since the last progress line
    sum_loss = 0.0   # running sum over the whole epoch
    prev_time = time.time()

    batches_per_epoch = network.get_batches_per_epoch(mode)

    for i in range(batches_per_epoch):
        step_data = network.step(mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        log = step_data["log"]

        avg_loss += current_loss
        sum_loss += current_loss

        y_true.extend(answers)
        predictions.extend(prediction)

        if (i + 1) % args.log_every == 0:
            cur_time = time.time()
            print("  %sing: %d.%d / %d \t loss: %.3f \t avg_loss: %.3f \t"
                  "%s \t time: %.2fs" % (mode, epoch, i * args.batch_size,
                                         batches_per_epoch * args.batch_size,
                                         current_loss,
                                         avg_loss / args.log_every,
                                         log, cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= batches_per_epoch
    # Single parenthesised argument: prints the same text on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    print("\n  %s loss = %.5f" % (mode, sum_loss))
    metrics.print_metrics_multilabel(y_true, predictions)
    return sum_loss
Exemple #3
0
    test_data_gen = utils.BatchGen(test_reader, discretizer,
                                   normalizer, args.batch_size,
                                   args.small_part, target_repl,
                                   shuffle=False, return_names=True)

    names = []
    ts = []
    labels = []
    predictions = []
    for i in range(test_data_gen.steps):
        print "\rpredicting {} / {}".format(i, test_data_gen.steps),
        ret = next(test_data_gen)
        x = ret["data"][0]
        y = ret["data"][1]
        cur_names = ret["names"]
        cur_ts = ret["ts"]
        x = np.array(x)
        pred = model.predict_on_batch(x)
        predictions += list(pred)
        labels += list(y)
        names += list(cur_names)
        ts += list(cur_ts)

    metrics.print_metrics_multilabel(labels, predictions)
    path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, ts, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")
            continue

        # Make sure only one file for this task
        assert(not PRED_TASKS[matches[0]])
        PRED_TASKS[matches[0]] = True

        print("Evaluating {}".format(matches[0]))

        match_pred, match_Y = read_file(os.path.join(indir, filename))

        if merged_pred is None:
            merged_pred = np.expand_dims(match_pred.copy(), axis=0)
            merged_Y = np.expand_dims(match_Y.copy(), axis=0)
        else:
            merged_pred =np.concatenate((merged_pred, np.expand_dims(match_pred, axis=0)), axis=0)
            merged_Y =np.concatenate((merged_Y, np.expand_dims(match_Y ,axis=0)), axis=0)

        #print(merged_X.shape)
        #print(merged_Y.shape)

        metrics.print_metrics_binary(match_Y, match_pred)
        print("----------------------------------------")


    print("\n==========================================")
    print("Evaluating all together:")
    metrics.print_metrics_multilabel(merged_Y.T, merged_pred.T)

    for key in PRED_TASKS:
        if PRED_TASKS[key] != True:
            print("WARNING: Data for task {} missing?".format(key))
Exemple #5
0
        header += "ave_auc_micro,ave_auc_macro,ave_auc_weighted,"
        header += ','.join(["auc_%d" % i for i in range(NTASKS)])
        resfile.write(header + "\n")

        def write_results(resfile, ret):
            """Write one CSV row of metrics from ``ret`` to ``resfile``.

            The row holds the nine averaged precision/recall/AUC values
            followed by every entry of ``ret['auc_scores']``, each
            formatted with six decimals, and ends with a newline.
            """
            summary_keys = (
                'ave_prec_micro', 'ave_prec_macro', 'ave_prec_weighted',
                'ave_recall_micro', 'ave_recall_macro', 'ave_recall_weighted',
                'ave_auc_micro', 'ave_auc_macro', 'ave_auc_weighted',
            )
            summary = ",".join("%.6f" % (ret[key],) for key in summary_keys)
            # The trailing comma separates the averages from the per-task AUCs.
            resfile.write(summary + ",")
            per_task = ",".join(["%.6f" % score
                                 for score in ret['auc_scores']])
            resfile.write(per_task + "\n")

        print "\nAverage results on train"
        ret = metrics.print_metrics_multilabel(train_y, train_activations)
        write_results(resfile, ret)

        print "\nAverage results on val"
        ret = metrics.print_metrics_multilabel(val_y, val_activations)
        write_results(resfile, ret)

        print "\nAverage results on test"
        ret = metrics.print_metrics_multilabel(test_y, test_activations)
        write_results(resfile, ret)

    np.savetxt(os.path.join("activations", model_name + ".csv"),
               test_activations,
               delimiter=',')

    print "==================== Done (penalty = %s, C = %f) ====================\n" % (
Exemple #6
0
    predictions = []
    for i in range(test_data_gen.steps):
        print "\rpredicting {} / {}".format(i, test_data_gen.steps),
        ret = next(test_data_gen)
        x = ret["data"][0]
        y = ret["data"][1]
        cur_names = ret["names"]
        cur_ts = ret["ts"]
        x = np.array(x)
        pred = model.predict_on_batch(x)
        predictions += list(pred)
        labels += list(y)
        names += list(cur_names)
        ts += list(cur_ts)

    ret = metrics.print_metrics_multilabel(labels, predictions)

    with open("results.txt", "w") as resfile:
        header = "ave_prec_micro,ave_prec_macro,ave_prec_weighted,"
        header += "ave_recall_micro,ave_recall_macro,ave_recall_weighted,"
        header += "ave_auc_micro,ave_auc_macro,ave_auc_weighted,"
        header += ','.join(
            ["auc_%d" % i for i in range(args_dict['num_classes'])])
        resfile.write(header + "\n")

        resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f," %
                      (ret['ave_prec_micro'], ret['ave_prec_macro'],
                       ret['ave_prec_weighted'], ret['ave_recall_micro'],
                       ret['ave_recall_macro'], ret['ave_recall_weighted'],
                       ret['ave_auc_micro'], ret['ave_auc_macro'],
                       ret['ave_auc_weighted']))
Exemple #7
0
def main():
    """Train and evaluate per-task logistic regression on phenotyping data.

    Parses the command line, extracts features for the train/val/test
    listfiles under ``--data``, imputes and standardises them (both fitted
    on the training split), then for every (penalty, C) pair fits one
    ``LogisticRegression`` per task.  Metrics for each split are dumped as
    JSON into ``<output_dir>/results`` and the test predictions are passed
    to ``save_results``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--period', type=str, default='all',
        help='specifies which period extract features from',
        choices=['first4days', 'first8days', 'last12hours',
                 'first25percent', 'first50percent', 'all'])
    parser.add_argument(
        '--features', type=str, default='all',
        help='specifies what features to extract',
        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--grid-search', dest='grid_search',
                        action='store_true')
    parser.add_argument('--no-grid-search', dest='grid_search',
                        action='store_false')
    parser.set_defaults(grid_search=False)
    default_data_dir = os.path.join(os.path.dirname(__file__),
                                    '../../../data/phenotyping/')
    parser.add_argument('--data', type=str,
                        help='Path to the data of phenotyping task',
                        default=default_data_dir)
    parser.add_argument(
        '--output_dir', type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    # Hyper-parameter pairs to try: a fixed grid, or a single setting.
    if args.grid_search:
        penalties = ['l2'] * 6 + ['l1'] * 5
        coefs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001,
                 1.0, 0.1, 0.01, 0.001, 0.0001]
    else:
        penalties = ['l1']
        coefs = [0.1]

    def make_reader(subdir, listfile_name):
        # Build a reader for one listfile stored under args.data.
        return PhenotypingReader(
            dataset_dir=os.path.join(args.data, subdir),
            listfile=os.path.join(args.data, listfile_name))

    train_reader = make_reader('train', 'train_listfile.csv')
    # The validation listfile points into the 'train' directory.
    val_reader = make_reader('train', 'val_listfile.csv')
    test_reader = make_reader('test', 'test_listfile.csv')

    print('Reading data and extracting features ...')

    train_X, train_y, _, _ = read_and_extract_features(
        train_reader, args.period, args.features)
    train_y = np.array(train_y)

    val_X, val_y, _, _ = read_and_extract_features(
        val_reader, args.period, args.features)
    val_y = np.array(val_y)

    test_X, test_y, test_names, test_ts = read_and_extract_features(
        test_reader, args.period, args.features)
    test_y = np.array(test_y)

    print("train set shape:  {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    n_tasks = 25
    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    # (split name, features, labels) in the order they are evaluated.
    splits = (
        ('train', train_X, train_y),
        ('val', val_X, val_y),
        ('test', test_X, test_y),
    )

    for penalty, C in zip(penalties, coefs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features,
                                           penalty, C)

        activations = {
            split: np.zeros(shape=labels.shape, dtype=float)
            for split, _, labels in splits
        }

        for task_id in range(n_tasks):
            print('Starting task {}'.format(task_id))

            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            # Column 1 of predict_proba is stored as the task's activation.
            for split, features, _ in splits:
                probs = logreg.predict_proba(features)
                activations[split][:, task_id] = probs[:, 1]

        for split, _, labels in splits:
            json_path = os.path.join(
                result_dir, '{}_{}.json'.format(split, model_name))
            with open(json_path, 'w') as f:
                ret = metrics.print_metrics_multilabel(labels,
                                                       activations[split])
                # 'auc_scores' is left out of the JSON summary.
                ret = {k: float(v) for k, v in ret.items()
                       if k != 'auc_scores'}
                json.dump(ret, f)

        save_results(
            test_names, test_ts, activations['test'], test_y,
            os.path.join(args.output_dir, 'predictions', model_name + '.csv'))
Exemple #8
0
    def calc_metrics(self, data_gen, history, dataset, logs):
        """Evaluate the multitask model on ``data_gen`` and fill ``logs``.

        Pulls ``data_gen.steps`` batches, predicts them with ``self.model``
        and collects targets/predictions for the four tasks (in-hospital
        mortality, decompensation, length of stay, phenotyping).  For the
        first three tasks only positions whose mask (``X[1]``, ``X[2]``,
        ``X[3]``) equals 1 are kept.  The metrics of each task are written
        into ``logs`` under ``"<dataset>_<task>_<metric>"`` and ``logs`` is
        then appended to ``history``.

        Raises:
            ValueError: if ``self.partition`` is not 'log', 'custom' or
                'none'.  Previously such a value silently logged the
                decompensation metrics under the ``_los_`` keys.
        """
        def collect_masked(masks, targets, preds, out_true, out_pred):
            # Keep only the (target, prediction) pairs whose mask equals 1.
            for (m, t, p) in zip(masks, targets, preds):
                if np.equal(m, 1):
                    out_true.append(t)
                    out_pred.append(p)

        ihm_y_true = []
        decomp_y_true = []
        los_y_true = []
        pheno_y_true = []

        ihm_pred = []
        decomp_pred = []
        los_pred = []
        pheno_pred = []

        for i in range(data_gen.steps):
            if self.verbose == 1:
                print("\tdone {}/{}".format(i, data_gen.steps), end='\r')
            (X, y, los_y_reg) = data_gen.next(return_y_true=True)
            outputs = self.model.predict(X, batch_size=self.batch_size)

            ihm_M = X[1]
            decomp_M = X[2]
            los_M = X[3]

            if not data_gen.target_repl:  # no target replication
                (ihm_p, decomp_p, los_p, pheno_p) = outputs
                (ihm_t, decomp_t, los_t, pheno_t) = y
            else:  # target replication: two extra entries are ignored
                (ihm_p, _, decomp_p, los_p, pheno_p, _) = outputs
                (ihm_t, _, decomp_t, los_t, pheno_t, _) = y

            los_t = los_y_reg  # real value not the label

            # ihm
            collect_masked(ihm_M.flatten(), ihm_t.flatten(), ihm_p.flatten(),
                           ihm_y_true, ihm_pred)

            # decomp
            collect_masked(decomp_M.flatten(), decomp_t.flatten(),
                           decomp_p.flatten(), decomp_y_true, decomp_pred)

            # los
            if los_p.shape[-1] == 1:  # regression: one value per position
                los_rows = los_p.flatten()
            else:  # classification: one row of 10 values per position
                los_rows = los_p.reshape((-1, 10))
            collect_masked(los_M.flatten(), los_t.flatten(), los_rows,
                           los_y_true, los_pred)

            # pheno: no mask, every row of 25 values is kept
            for (t, p) in zip(pheno_t.reshape((-1, 25)),
                              pheno_p.reshape((-1, 25))):
                pheno_y_true.append(t)
                pheno_pred.append(p)
        print('\n')

        # ihm
        print("\n ================= 48h mortality ================")
        ihm_pred = np.array(ihm_pred)
        # Two-column form: [1 - p, p].
        ihm_pred = np.stack([1 - ihm_pred, ihm_pred], axis=1)
        ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)
        for k, v in ret.items():
            logs[dataset + '_ihm_' + k] = v

        # decomp
        print("\n ================ decompensation ================")
        decomp_pred = np.array(decomp_pred)
        decomp_pred = np.stack([1 - decomp_pred, decomp_pred], axis=1)
        ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)
        for k, v in ret.items():
            logs[dataset + '_decomp_' + k] = v

        # los
        print("\n ================ length of stay ================")
        if self.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        elif self.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        elif self.partition == 'none':
            ret = metrics.print_metrics_regression(los_y_true, los_pred)
        else:
            # Without this branch `ret` would still hold the decompensation
            # metrics and be logged under the length-of-stay keys.
            raise ValueError(
                "Unknown partition: {!r}".format(self.partition))
        for k, v in ret.items():
            logs[dataset + '_los_' + k] = v

        # pheno
        print("\n =================== phenotype ==================")
        pheno_pred = np.array(pheno_pred)
        ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)
        for k, v in ret.items():
            logs[dataset + '_pheno_' + k] = v

        history.append(logs)
Exemple #9
0
def do_epoch(mode, epoch):
    """Run one epoch of the multitask ``network`` and print task metrics.

    Steps the module-level ``network`` ``get_batches_per_epoch(mode)``
    times.  For the mortality, length-of-stay and decompensation tasks only
    entries whose mask equals 1 are collected; phenotype predictions are
    collected unconditionally.  A progress line is printed every
    ``args.log_every`` batches, and metrics are printed for every task whose
    weight (``args.ihm_C``, ``args.los_C``, ``args.ph_C``,
    ``args.decomp_C``) exceeds a small epsilon.

    All output goes through ``print(...)`` with a single argument, which
    prints the same text on Python 2 and Python 3.

    Args:
        mode: 'train' or 'test'; forwarded to ``network``.
        epoch: epoch number, used only in the progress line.

    Returns:
        The mean of the per-batch total losses over the epoch.

    Raises:
        Exception: if a batch reports a NaN total loss.
    """
    ihm_predictions = []
    ihm_answers = []

    los_predictions = []
    los_answers = []

    ph_predictions = []
    ph_answers = []

    decomp_predictions = []
    decomp_answers = []

    avg_loss = 0.0   # running sum since the last progress line
    sum_loss = 0.0   # running sum over the whole epoch
    prev_time = time.time()

    batches_per_epoch = network.get_batches_per_epoch(mode)

    for i in range(batches_per_epoch):
        step_data = network.step(mode)

        ihm_pred = step_data["ihm_prediction"]
        los_pred = step_data["los_prediction"]
        ph_pred = step_data["ph_prediction"]
        decomp_pred = step_data["decomp_prediction"]

        current_loss = step_data["loss"]
        ihm_loss = step_data["ihm_loss"]
        los_loss = step_data["los_loss"]
        ph_loss = step_data["ph_loss"]
        decomp_loss = step_data["decomp_loss"]
        reg_loss = step_data["reg_loss"]

        data = step_data["data"]

        # Mask and label sit at indices 1 and 2 of each ihm entry, but at
        # indices 0 and 1 of each los / decomp entry.
        ihm_data = data[1]
        ihm_mask = [x[1] for x in ihm_data]
        ihm_label = [x[2] for x in ihm_data]

        los_data = data[2]
        los_mask = [x[0] for x in los_data]
        los_label = [x[1] for x in los_data]

        ph_data = data[3]
        ph_label = ph_data

        decomp_data = data[4]
        decomp_mask = [x[0] for x in decomp_data]
        decomp_label = [x[1] for x in decomp_data]

        avg_loss += current_loss
        sum_loss += current_loss

        # ihm: one (prediction, mask, label) triple per sample.
        for (x, mask, y) in zip(ihm_pred, ihm_mask, ihm_label):
            if mask == 1:
                ihm_predictions.append(x)
                ihm_answers.append(y)

        # los: each sample holds a sequence of triples.
        for (sx, smask, sy) in zip(los_pred, los_mask, los_label):
            for (x, mask, y) in zip(sx, smask, sy):
                if mask == 1:
                    los_predictions.append(x)
                    los_answers.append(y)

        for (x, y) in zip(ph_pred, ph_label):
            ph_predictions.append(x)
            ph_answers.append(y)

        # decomp: each sample holds a sequence of triples.
        for (sx, smask, sy) in zip(decomp_pred, decomp_mask, decomp_label):
            for (x, mask, y) in zip(sx, smask, sy):
                if mask == 1:
                    decomp_predictions.append(x)
                    decomp_answers.append(y)

        if (i + 1) % args.log_every == 0:
            cur_time = time.time()
            print("  {}ing {}.{} / {}  loss: {:8.4f} = {:1.2f} + {:8.2f} + {:1.2f} + "
                  "{:1.2f} + {:.2f} avg_loss: {:6.4f}  time: {:6.4f}".format(
                      mode, epoch, i * args.batch_size,
                      batches_per_epoch * args.batch_size,
                      float(current_loss),
                      float(ihm_loss), float(los_loss), float(ph_loss),
                      float(decomp_loss), float(reg_loss),
                      float(avg_loss / args.log_every),
                      float(cur_time - prev_time)))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            # Show the loss breakdown before aborting.
            print("loss: {:6.4f} = {:1.2f} + {:8.2f} + {:1.2f} + {:1.2f} + {:.2f}".format(
                float(current_loss), float(ihm_loss), float(los_loss),
                float(ph_loss), float(decomp_loss), float(reg_loss)))
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= batches_per_epoch
    print("\n  %s loss = %.5f" % (mode, sum_loss))

    # A task is reported only when its loss weight exceeds eps.
    eps = 1e-13
    if args.ihm_C > eps:
        print("\n ================= 48h mortality ================")
        metrics.print_metrics_binary(ihm_answers, ihm_predictions)

    if args.los_C > eps:
        print("\n ================ length of stay ================")
        if args.partition == 'log':
            metrics.print_metrics_log_bins(los_answers, los_predictions)
        else:
            metrics.print_metrics_custom_bins(los_answers, los_predictions)

    if args.ph_C > eps:
        print("\n =================== phenotype ==================")
        metrics.print_metrics_multilabel(ph_answers, ph_predictions)

    if args.decomp_C > eps:
        print("\n ================ decompensation ================")
        metrics.print_metrics_binary(decomp_answers, decomp_predictions)

    return sum_loss
Exemple #10
0
    if args.los_C > 0:
        print "\n ================ length of stay ================"
        if args.partition == 'log':
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    # pheno
    if args.pheno_C > 0:
        print "\n =================== phenotype =================="
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    print "Saving the predictions in test_predictions/task directories ..."

    # ihm
    ihm_path = os.path.join("test_predictions/ihm", os.path.basename(args.load_state)) + ".csv"
    ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path)

    # decomp
    decomp_path = os.path.join("test_predictions/decomp", os.path.basename(args.load_state)) + ".csv"
    decomp_utils.save_results(decomp_names, decomp_ts, decomp_pred, decomp_y_true, decomp_path)

    # los
    los_path = os.path.join("test_predictions/los", os.path.basename(args.load_state)) + ".csv"
    los_utils.save_results(los_names, los_ts, los_pred, los_y_true, los_path)
Exemple #11
0
def main():
    """Train and evaluate per-task logistic regression on phenotyping data.

    Parses ``--period`` and ``--features``, extracts features from the
    train/val/test listfiles under ``../../../data/phenotyping/``, imputes
    and standardises them (both fitted on the training split), then fits
    one ``LogisticRegression`` per task for each (penalty, C) pair.
    Metrics for every split are dumped as JSON into ``results`` and the
    test predictions are passed to ``save_results``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--period', type=str, default='all',
        help='specifies which period extract features from',
        choices=['first4days', 'first8days', 'last12hours',
                 'first25percent', 'first50percent', 'all'])
    parser.add_argument(
        '--features', type=str, default='all',
        help='specifies what features to extract',
        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # Only one hyper-parameter setting is evaluated here.
    penalties = ['l1']
    Cs = [0.1]

    train_reader = PhenotypingReader(
        dataset_dir='../../../data/phenotyping/train/',
        listfile='../../../data/phenotyping/train_listfile.csv')

    # The validation listfile points into the train directory.
    val_reader = PhenotypingReader(
        dataset_dir='../../../data/phenotyping/train/',
        listfile='../../../data/phenotyping/val_listfile.csv')

    test_reader = PhenotypingReader(
        dataset_dir='../../../data/phenotyping/test/',
        listfile='../../../data/phenotyping/test_listfile.csv')

    print('Reading data and extracting features ...')

    train_X, train_y, _, _ = read_and_extract_features(
        train_reader, args.period, args.features)
    train_y = np.array(train_y)

    val_X, val_y, _, _ = read_and_extract_features(
        val_reader, args.period, args.features)
    val_y = np.array(val_y)

    test_X, test_y, test_names, test_ts = read_and_extract_features(
        test_reader, args.period, args.features)
    test_y = np.array(test_y)

    print("train set shape:  {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    n_tasks = 25
    common_utils.create_directory('results')

    # (split name, features, labels) in the order they are evaluated.
    splits = (
        ('train', train_X, train_y),
        ('val', val_X, val_y),
        ('test', test_X, test_y),
    )

    for penalty, C in zip(penalties, Cs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features,
                                           penalty, C)

        activations = {
            split: np.zeros(shape=labels.shape, dtype=float)
            for split, _, labels in splits
        }

        for task_id in range(n_tasks):
            print('Starting task {}'.format(task_id))

            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            # Column 1 of predict_proba is stored as the task's activation.
            for split, features, _ in splits:
                probs = logreg.predict_proba(features)
                activations[split][:, task_id] = probs[:, 1]

        for split, _, labels in splits:
            json_path = os.path.join(
                'results', '{}_{}.json'.format(split, model_name))
            with open(json_path, 'w') as f:
                ret = metrics.print_metrics_multilabel(labels,
                                                       activations[split])
                # 'auc_scores' is left out of the JSON summary.
                ret = {k: float(v) for k, v in ret.items()
                       if k != 'auc_scores'}
                json.dump(ret, f)

        save_results(test_names, test_ts, activations['test'], test_y,
                     os.path.join('predictions', model_name + '.csv'))
Exemple #12
0
def main():
    """Bootstrap ROC AUC statistics for a phenotyping prediction file.

    Reads the prediction CSV and the test listfile, joins them on 'stay',
    checks that the two agree on period length and labels, and computes the
    macro/micro/weighted ROC AUC plus the per-task ROC AUC on the full data.
    It then resamples the rows ``--n_iters`` times and records, per metric,
    the mean, median, standard deviation and the 2.5% / 97.5% percentiles
    over the resampled runs.  The full results are written as JSON to
    ``--save_file``; a summary without the per-task entries is printed.

    Output uses ``print(...)`` with a single argument, which prints the
    same text on Python 2 and Python 3.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile',
                        type=str,
                        default='../data/phenotyping/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='pheno_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction,
                          index_col=False,
                          dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile,
                          index_col=False,
                          dtype={'period_length': np.float32})

    n_tasks = 25
    task_ids = range(1, n_tasks + 1)
    labels_cols = ["label_{}".format(i) for i in task_ids]
    # Keep the first two listfile columns and rename the rest to label_<i>.
    test_df.columns = list(test_df.columns[:2]) + labels_cols

    df = test_df.merge(pred_df,
                       left_on='stay',
                       right_on='stay',
                       how='left',
                       suffixes=['_l', '_r'])
    # Every listfile row needs a prediction, and both files must agree.
    assert (df['pred_1'].isnull().sum() == 0)
    assert (df['period_length_l'].equals(df['period_length_r']))
    for i in task_ids:
        assert (df['label_{}_l'.format(i)].equals(df['label_{}_r'.format(i)]))

    summary_metrics = [('Macro ROC AUC', 'ave_auc_macro'),
                       ('Micro ROC AUC', 'ave_auc_micro'),
                       ('Weighted ROC AUC', 'ave_auc_weighted')]

    # Predictions go into the first n_tasks columns and labels into the
    # last n_tasks, so that resampling rows keeps them aligned.
    data = np.zeros((df.shape[0], 2 * n_tasks))
    for i in task_ids:
        data[:, i - 1] = df['pred_{}'.format(i)]
        data[:, n_tasks + i - 1] = df['label_{}_l'.format(i)]

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_multilabel(data[:, n_tasks:], data[:, :n_tasks],
                                   verbose=0)
    for (m, k) in summary_metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in task_ids:
        m = 'ROC AUC of task {}'.format(i)
        results[m] = dict()
        results[m]['value'] = print_metrics_binary(data[:, n_tasks + i - 1],
                                                   data[:, i - 1],
                                                   verbose=0)['auroc']
        results[m]['runs'] = []

    for iteration in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_multilabel(cur_data[:, n_tasks:],
                                       cur_data[:, :n_tasks],
                                       verbose=0)
        for (m, k) in summary_metrics:
            results[m]['runs'].append(ret[k])
        for i in task_ids:
            m = 'ROC AUC of task {}'.format(i)
            cur_auc = print_metrics_binary(cur_data[:, n_tasks + i - 1],
                                           cur_data[:, i - 1],
                                           verbose=0)['auroc']
            results[m]['runs'].append(cur_auc)

    reported_metrics = [m for m, k in summary_metrics]
    reported_metrics += ['ROC AUC of task {}'.format(i) for i in task_ids]

    for m in reported_metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        # The raw runs are dropped so only the summary statistics are saved.
        del results[m]['runs']

    print("Saving the results (including task specific metrics) in {} ...".format(
        args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print("Printing the summary of results (task specific metrics are skipped) ...")
    for i in task_ids:
        m = 'ROC AUC of task {}'.format(i)
        del results[m]
    print(results)
Exemple #13
0
    test_data_gen = utils.BatchGen(test_reader, discretizer,
                                   normalizer, args.batch_size,
                                   args.small_part, target_repl,
                                   shuffle=False, return_names=True)

    names = []
    ts = []
    labels = []
    predictions = []
    for i in range(test_data_gen.steps):
        print("predicting {} / {}".format(i, test_data_gen.steps), end='\r')
        ret = next(test_data_gen)
        x = ret["data"][0]
        y = ret["data"][1]
        cur_names = ret["names"]
        cur_ts = ret["ts"]
        x = np.array(x)
        pred = model.predict_on_batch(x)
        predictions += list(pred)
        labels += list(y)
        names += list(cur_names)
        ts += list(cur_ts)

    metrics.print_metrics_multilabel(labels, predictions, stochastic=stochastic)
    path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, ts, predictions, labels, path, stochastic=stochastic)

else:
    raise ValueError("Wrong value for args.mode")
def main():
    """Estimate the variability of phenotyping ROC AUC metrics by resampling.

    Command-line arguments:
        prediction: path to the predictions CSV (must contain ``stay``,
            ``period_length``, ``pred_<i>`` and ``label_<i>`` columns).
        --test_listfile: path to the test listfile CSV.
        --n_iters: number of resampling iterations.
        --save_file: path of the JSON file the results are written to.

    The predictions are joined with the listfile on ``stay``. The macro,
    micro and weighted ROC AUC plus the ROC AUC of every task are computed
    once on the full data (stored under ``'value'``) and then on ``n_iters``
    resamples produced by ``sk_utils.resample``. For each metric the mean,
    median, std and the 2.5% / 97.5% percentiles over the resamples are
    stored. Everything is dumped to ``--save_file`` as JSON; the summary
    without the task specific metrics is printed.

    Raises:
        AssertionError: if a listfile stay has no prediction, or if the
            period lengths / labels of the two files disagree.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str, default='../data/phenotyping/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='pheno_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32})

    n_tasks = 25
    labels_cols = ["label_{}".format(i) for i in range(1, n_tasks + 1)]
    # Keep the first two listfile columns as they are and rename the
    # remaining ones to label_1 .. label_<n_tasks>.
    test_df.columns = list(test_df.columns[:2]) + labels_cols

    # Left join keeps every listfile row; columns present in both frames get
    # the '_l' (listfile) / '_r' (predictions) suffixes.
    df = test_df.merge(pred_df, left_on='stay', right_on='stay', how='left', suffixes=['_l', '_r'])
    assert df['pred_1'].isnull().sum() == 0, "some stays have no prediction"
    assert df['period_length_l'].equals(df['period_length_r']), "period_length mismatch"
    for i in range(1, n_tasks + 1):
        assert df['label_{}_l'.format(i)].equals(df['label_{}_r'.format(i)]), \
            "label_{} mismatch".format(i)

    # (name used in the results, key in the dict returned by
    # print_metrics_multilabel)
    summary_metrics = [('Macro ROC AUC', 'ave_auc_macro'),
                       ('Micro ROC AUC', 'ave_auc_micro'),
                       ('Weighted ROC AUC', 'ave_auc_weighted')]
    task_metrics = ['ROC AUC of task {}'.format(i) for i in range(1, n_tasks + 1)]

    # Predictions and labels are packed into one matrix so that a single
    # resample call keeps them aligned: columns [0, n_tasks) are predictions,
    # columns [n_tasks, 2 * n_tasks) are labels.
    data = np.zeros((df.shape[0], 2 * n_tasks))
    for i in range(1, n_tasks + 1):
        data[:, i - 1] = df['pred_{}'.format(i)]
        data[:, n_tasks + i - 1] = df['label_{}_l'.format(i)]

    # Metrics on the full data.
    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_multilabel(data[:, n_tasks:], data[:, :n_tasks], verbose=0)
    for (m, k) in summary_metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i, m in enumerate(task_metrics):
        results[m] = dict()
        results[m]['value'] = print_metrics_binary(data[:, n_tasks + i], data[:, i], verbose=0)['auroc']
        results[m]['runs'] = []

    # Metrics on each resample.
    for iteration in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_multilabel(cur_data[:, n_tasks:], cur_data[:, :n_tasks], verbose=0)
        for (m, k) in summary_metrics:
            results[m]['runs'].append(ret[k])
        for i, m in enumerate(task_metrics):
            cur_auc = print_metrics_binary(cur_data[:, n_tasks + i], cur_data[:, i], verbose=0)['auroc']
            results[m]['runs'].append(cur_auc)

    # Replace the raw per-iteration values with summary statistics.
    for m in [name for name, _ in summary_metrics] + task_metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Saving the results (including task specific metrics) in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print("Printing the summary of results (task specific metrics are skipped) ...")
    for m in task_metrics:
        del results[m]
    print(results)
            # Find specific files within the directory for this task
            task_str_path = retrieve_matching_file(task_prefix, args.str_path)
            task_ustr_path = retrieve_matching_file(task_prefix, args.ustr_path)

            task_test_str_path = retrieve_matching_file(task_prefix, args.test_str_path)
            task_test_ustr_path = retrieve_matching_file(task_prefix, args.test_ustr_path)

            print("\n\n-----------------------\nFitting Model For {}\n".format(task_prefix))

            # Fit/evaluate this task; outputs are written under args.outdir
            # with the task prefix as the file name prefix.
            labels, preds = load_fit_save(args.mode,
                                          task_str_path,
                                          task_ustr_path,
                                          task_test_str_path,
                                          task_test_ustr_path,
                                          args.outdir,
                                          out_filename_prefix=task_prefix)

            # Collect every task as one extra column (axis=1): the first task
            # creates the arrays, later tasks are concatenated to them.
            # Assumes preds/labels are 1-D with the same length for every
            # task — TODO confirm.
            if merged_pred is None:
                merged_pred = np.expand_dims(preds, axis=1)
                merged_Y = np.expand_dims(labels, axis=1)
            else:
                merged_pred = np.concatenate((merged_pred, np.expand_dims(preds, axis=1)),  axis=1)
                merged_Y = np.concatenate((merged_Y, np.expand_dims(labels, axis=1)), axis=1)
        # Multilabel metrics over the merged per-task columns.
        print('\n============================')
        print("Overall performance:")
        metrics.print_metrics_multilabel(merged_Y, merged_pred)




    # Length-of-stay metrics are reported only when args.los_C is positive.
    if args.los_C > 0:
        print "\n ================ length of stay ================"
        # NOTE(review): los_ret is assigned only when args.partition is one of
        # 'log', 'custom' or 'none'; any other value leaves it unset.
        if args.partition == 'log':
            # The second argument (10) is presumably the number of bins —
            # TODO confirm against metrics.get_estimate_log.
            los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
        if args.partition == 'custom':
            los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    ## pheno
    # Phenotype metrics are reported only when args.pheno_C is positive.
    if args.pheno_C > 0:
        print "\n =================== phenotype =================="
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    # TODO: save activations if needed

# Next branch of the mode dispatch (the preceding branches are outside this
# excerpt).
elif args.mode == 'test_single':
    # ensure that the code uses test_reader
    del train_reader
    del val_reader
    del train_data_gen
    del val_data_gen

    # Testing ihm
    from mimic3benchmark.readers import InHospitalMortalityReader
    from mimic3models.in_hospital_mortality.utils import read_chunk
    from mimic3models import nn_utils
Exemple #17
0
        # Emit a progress line every args.log_every batches.
        if ((i + 1) % args.log_every == 0):
            cur_time = time.time()
            print ("  testing: %d / %d \t loss: %.3f \t avg_loss: %.3f \t"\
                   " time: %.2fs" % ((i+1) * args.batch_size,
                        n_batches * args.batch_size, current_loss,
                        avg_loss / args.log_every, cur_time - prev_time))
            # Reset the windowed loss and the timer after each progress line.
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    # Turn the accumulated loss into a per-batch mean.
    sum_loss /= n_batches
    print "\n  test loss = %.5f" % sum_loss
    ret = metrics.print_metrics_multilabel(y_true, predictions)

    # Write the metrics to results.txt as a two-line CSV: header, then values.
    with open("results.txt", "w") as resfile:
        header = "ave_prec_micro,ave_prec_macro,ave_prec_weighted,"
        header += "ave_recall_micro,ave_recall_macro,ave_recall_weighted,"
        header += "ave_auc_micro,ave_auc_macro,ave_auc_weighted,"
        # Per-task columns are numbered 0 .. NTASKS - 1; the value row below
        # writes one entry per element of ret['auc_scores'], so the two are
        # assumed to have the same length — TODO confirm.
        header += ','.join(["auc_%d" % i for i in range(NTASKS)])
        resfile.write(header + "\n")

        resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f," %
                      (ret['ave_prec_micro'], ret['ave_prec_macro'],
                       ret['ave_prec_weighted'], ret['ave_recall_micro'],
                       ret['ave_recall_macro'], ret['ave_recall_weighted'],
                       ret['ave_auc_micro'], ret['ave_auc_macro'],
                       ret['ave_auc_weighted']))
        resfile.write(",".join(["%.6f" % x for x in ret['auc_scores']]) + "\n")
Exemple #18
0
    # Turn the accumulated loss into a per-batch mean.
    sum_loss /= batches_per_epoch
    print "\n  %s loss = %.5f" % (args.mode, sum_loss)

    # A task's metrics are printed only when its coefficient exceeds eps,
    # i.e. is treated as non-zero.
    eps = 1e-13
    if args.ihm_C > eps:
        print "\n ================= 48h mortality ================"
        metrics.print_metrics_binary(ihm_answers, ihm_predictions)

    if args.los_C > eps:
        print "\n ================ length of stay ================"
        # Any partition other than 'log' falls through to the custom bins.
        if args.partition == 'log':
            metrics.print_metrics_log_bins(los_answers, los_predictions)
        else:
            metrics.print_metrics_custom_bins(los_answers, los_predictions)

    if args.ph_C > eps:
        print "\n =================== phenotype =================="
        metrics.print_metrics_multilabel(ph_answers, ph_predictions)

    if args.decomp_C > eps:
        print "\n ================ decompensation ================"
        metrics.print_metrics_binary(decomp_answers, decomp_predictions)

    # Dump (prediction, answer) pairs for length of stay, one pair per line.
    with open("los_activations.txt", "w") as fout:
        # NOTE(review): the header is written without a trailing newline, so
        # the first data row ends up on the same line as the header.
        fout.write("prediction, y_true")
        for (x, y) in zip(los_predictions, los_answers):
            fout.write("%.6f, %.6f\n" % (x, y))

# Fallback branch of the mode dispatch (the preceding branches are outside
# this excerpt).
else:
    raise Exception("unknown mode")