Example 1
        metrics.print_metrics_log_bins(labels, predictions)
    if args.partition == 'custom':
        predictions = [metrics.get_estimate_custom(x, 10) for x in predictions]
        metrics.print_metrics_custom_bins(labels, predictions)
    if args.partition == 'none':
        metrics.print_metrics_regression(labels, predictions)
        predictions = [x[0] for x in predictions]

    path = os.path.join(args.output_dir, "test_predictions",
                        os.path.basename(args.load_state) + ".csv")

    if stochastic:
        ee = np.mean(np.array(epistemic))
        aa = np.mean(np.array(aleatoric))
        print("Epistemic uncertainty =", ee)
        print("Aleatoric uncertainty =", aa)
        print("Uncertainty =", ee + aa)
        utils.save_results(names,
                           ts,
                           predictions,
                           labels,
                           path,
                           aleatoric=aleatoric,
                           epistemic=epistemic)
    else:
        utils.save_results(names, ts, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")
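Each example ends in a call to utils.save_results(names, ts, predictions, labels, path). The helper itself is not part of these snippets; as a purely hypothetical stand-in that shows the expected data flow, one CSV row per stay and prediction time (the real column names and formatting may differ):

import csv
import os

def save_results(names, ts, predictions, labels, path):
    # Hypothetical re-implementation of utils.save_results, for illustration only.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['stay', 'period_length', 'prediction', 'y_true'])
        for row in zip(names, ts, predictions, labels):
            writer.writerow(row)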
Example 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of the length-of-stay task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/length-of-stay/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'))

    val_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'))

    test_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'))

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names,
     train_ts) = read_and_extract_features(train_reader, n_train, args.period,
                                           args.features)

    (val_X, val_y, val_names,
     val_ts) = read_and_extract_features(val_reader, n_val, args.period,
                                         args.features)

    (test_X, test_y, test_names,
     test_ts) = read_and_extract_features(test_reader,
                                          test_reader.get_number_of_examples(),
                                          args.period, args.features)

    print(train_X.shape)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              "w") as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, test_ts, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
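sklearn.preprocessing.Imputer, used above, was deprecated in scikit-learn 0.20 and removed in 0.22. A minimal sketch of the equivalent imputation step on newer versions (assuming scikit-learn >= 0.22):

import numpy as np
from sklearn.impute import SimpleImputer

# SimpleImputer replaces the removed Imputer; it always imputes column-wise,
# so the old axis=0 argument is simply dropped.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

train_X = np.array([[1.0, np.nan], [3.0, 4.0]], dtype=np.float32)  # toy data
train_X = imputer.fit_transform(train_X)
print(train_X)  # the NaN is replaced by the column mean, 4.0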
Example 3
            ret = test_data_gen.next(return_y_true=True)
            (x, y_processed, y) = ret["data"]
            cur_names = ret["names"]
            cur_ts = ret["ts"]

            x = np.array(x)
            pred = model.predict_on_batch(x)
            predictions += list(pred)
            labels += list(y)
            names += list(cur_names)
            ts += list(cur_ts)

    if args.partition == 'log':
        predictions = [metrics.get_estimate_log(x, 10) for x in predictions]
        metrics.print_metrics_log_bins(labels, predictions)
    if args.partition == 'custom':
        predictions = [metrics.get_estimate_custom(x, 10) for x in predictions]
        metrics.print_metrics_custom_bins(labels, predictions)
    if args.partition == 'none':
        metrics.print_metrics_regression(labels, predictions)
        predictions = [x[0] for x in predictions]

    path = os.path.join(args.output_dir, "test_predictions",
                        os.path.basename(args.load_state) + ".csv")
    utils.save_results(names, ts, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")
Example 4
            ret = test_data_gen.next(return_y_true=True)
            (x, _, y) = ret["data"]
            cur_names = ret["names"]
            cur_ts = ret["ts"]

            x = torch.tensor(x, dtype=torch.float).to(device)
            pred = model(x).detach().cpu().numpy()
            predictions += list(pred)
            labels += list(y)
            names += list(cur_names)
            ts += list(cur_ts)
            
            if args.small_part:
                break

    if args.partition == 'log':
        predictions = [metrics.get_estimate_log(x, 10) for x in predictions]
        metrics.print_metrics_log_bins(labels, predictions)
    if args.partition == 'custom':
        predictions = [metrics.get_estimate_custom(x, 10) for x in predictions]
        metrics.print_metrics_custom_bins(labels, predictions)
    if args.partition == 'none':
        metrics.print_metrics_regression(labels, predictions)
        predictions = [x[0] for x in predictions]

    utils.save_results(names, ts, predictions, labels,
                       os.path.join(save_path, 'test_predictions.csv'))

else:
    raise ValueError("Wrong value for args.mode")
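The PyTorch loop above runs the forward pass with autograd tracking enabled. A minimal sketch of the same predict-on-batch pattern under torch.no_grad(), with a toy model standing in for the snippet's (model.eval() also freezes dropout and batch-norm statistics):

import torch

model = torch.nn.Linear(4, 1)  # toy stand-in; any nn.Module works the same way
x = torch.randn(8, 4)

model.eval()
with torch.no_grad():               # no autograd graph during evaluation
    pred = model(x).cpu().numpy()   # same tensor-to-numpy pattern as above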
Example 5
    # pheno
    if args.pheno_C > 0:
        print("\n =================== phenotype ==================")
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    print("Saving the predictions in test_predictions/task directories ...")

    # ihm
    ihm_path = os.path.join(args.output_dir, "test_predictions/ihm",
                            os.path.basename(args.load_state) + experiment_name + ".csv")
    ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path)

    # decomp
    decomp_path = os.path.join(args.output_dir, "test_predictions/decomp",
                               os.path.basename(args.load_state) + experiment_name + ".csv")
    decomp_utils.save_results(decomp_names, decomp_ts, decomp_pred, decomp_y_true, decomp_path)

    # los
    los_path = os.path.join(args.output_dir, "test_predictions/los",
                            os.path.basename(args.load_state) + experiment_name + ".csv")
    los_utils.save_results(los_names, los_ts, los_pred, los_y_true, los_path)

    # pheno
    pheno_path = os.path.join(args.output_dir, "test_predictions/pheno",
                              os.path.basename(args.load_state) + experiment_name + ".csv")
    pheno_utils.save_results(pheno_names, pheno_ts, pheno_pred, pheno_y_true, pheno_path)

else:
    raise ValueError("Wrong value for args.mode")
Example 6
            los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
        if args.partition == 'none':
            los_ret = metrics.print_metrics_regression(los_y_true, los_pred)

    # pheno
    if args.pheno_C > 0:
        print "\n =================== phenotype =================="
        pheno_pred = np.array(pheno_pred)
        pheno_ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)

    print "Saving the predictions in test_predictions/task directories ..."

    # ihm
    ihm_path = os.path.join("test_predictions/ihm", os.path.basename(args.load_state)) + ".csv"
    ihm_utils.save_results(ihm_names, ihm_pred, ihm_y_true, ihm_path)

    # decomp
    decomp_path = os.path.join("test_predictions/decomp", os.path.basename(args.load_state)) + ".csv"
    decomp_utils.save_results(decomp_names, decomp_ts, decomp_pred, decomp_y_true, decomp_path)

    # los
    los_path = os.path.join("test_predictions/los", os.path.basename(args.load_state)) + ".csv"
    los_utils.save_results(los_names, los_ts, los_pred, los_y_true, los_path)

    # pheno
    pheno_path = os.path.join("test_predictions/pheno", os.path.basename(args.load_state)) + ".csv"
    pheno_utils.save_results(pheno_names, pheno_ts, pheno_pred, pheno_y_true, pheno_path)

else:
    raise ValueError("Wrong value for args.mode")
Example 7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--grid-search', dest='grid_search', action='store_true')
    parser.add_argument('--no-grid-search', dest='grid_search', action='store_false')
    parser.set_defaults(grid_search=False)
    parser.add_argument('--data', type=str, help='Path to the data of the length-of-stay task',
                        default=os.path.join(os.path.dirname(__file__), '../../../data/length-of-stay/'))
    parser.add_argument('--output_dir', type=str, help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    if args.grid_search:
        penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
        coefs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    else:
        penalties = ['l2']
        coefs = [0.00001]

    train_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'train'),
                                      listfile=os.path.join(args.data, 'train_listfile.csv'))

    val_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'train'),
                                    listfile=os.path.join(args.data, 'val_listfile.csv'))

    test_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'test'),
                                     listfile=os.path.join(args.data, 'test_listfile.csv'))

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_actual, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_actual, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_actual, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print("train set shape:  {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    result_dir = os.path.join(args.output_dir, 'cf_results')
    common_utils.create_directory(result_dir)

    for (penalty, C) in zip(penalties, coefs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        for task_id in range(n_bins):
            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]

            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]

            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]

        train_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in train_activations])
        val_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in val_activations])
        test_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in test_activations])

        with open(os.path.join(result_dir, 'train_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(train_actual, train_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join(result_dir, 'val_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(val_actual, val_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join(result_dir, 'test_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(test_actual, test_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        save_results(test_names, test_ts, test_predictions, test_actual,
                     os.path.join(args.output_dir, 'cf_predictions', model_name + '.csv'))
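metrics.get_estimate_custom, used throughout these examples, is not shown here. As a purely hypothetical illustration of reducing per-bin probabilities to a scalar length-of-stay estimate (the repository's actual rule and bin boundaries may differ):

import numpy as np

# Hypothetical bin means in hours; the repository defines its own bins.
bin_means = np.array([11.0, 35.0, 59.0, 83.0, 107.0, 131.0, 155.0, 179.0, 254.0, 585.0])

def estimate_from_bins(probs, means=bin_means):
    # One plausible rule: report the mean of the most probable bin;
    # an alternative is the expectation, np.dot(probs, means).
    return means[int(np.argmax(probs))]

probs = np.array([0.05, 0.10, 0.40, 0.15, 0.10, 0.07, 0.05, 0.04, 0.03, 0.01])
print(estimate_from_bins(probs))  # -> 59.0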
Example 8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                      listfile='../../../data/length-of-stay/train_listfile.csv')

    val_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                    listfile='../../../data/length-of-stay/val_listfile.csv')

    test_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/test/',
                                     listfile='../../../data/length-of-stay/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join("results", 'train_{}.json'.format(file_name)), "w") as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, test_ts, prediction, test_y, os.path.join('predictions', file_name + '.csv'))
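The impute, scale, regress sequence above can also be expressed as a single scikit-learn Pipeline, which guarantees the preprocessing statistics are fit on the training split only and replayed unchanged on validation and test. A minimal sketch, assuming scikit-learn >= 0.22 (where SimpleImputer replaces the removed Imputer):

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    StandardScaler(),
    LinearRegression(),
)

# Toy data in place of train_X / train_y above.
X = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
y = np.array([1.0, 2.0, 3.0])
pipeline.fit(X, y)          # imputer and scaler statistics come from X only
print(pipeline.predict(X))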
Example 9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.00001]

    train_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                      listfile='../../../data/length-of-stay/train_listfile.csv')

    val_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                    listfile='../../../data/length-of-stay/val_listfile.csv')

    test_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/test/',
                                     listfile='../../../data/length-of-stay/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_actual, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_actual, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_actual, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print("train set shape:  {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('cf_results')

    for (penalty, C) in zip(penalties, Cs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        for task_id in range(n_bins):
            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]

            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]

            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]

        train_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in train_activations])
        val_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in val_activations])
        test_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in test_activations])

        with open(os.path.join('cf_results', 'train_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(train_actual, train_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join('cf_results', 'val_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(val_actual, val_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join('cf_results', 'test_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(test_actual, test_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        save_results(test_names, test_ts, test_predictions, test_actual,
                     os.path.join('cf_predictions', model_name + '.csv'))
Example 10
        for i in range(test_data_gen.steps):
            print "\rpredicting {} / {}".format(i, test_data_gen.steps),

            ret = test_data_gen.next(return_y_true=True)
            (x, y_processed, y) = ret["data"]
            cur_names = ret["names"]
            cur_ts = ret["ts"]

            x = np.array(x)
            pred = model.predict_on_batch(x)
            predictions += list(pred)
            labels += list(y)
            names += list(cur_names)
            ts += list(cur_ts)

    if args.partition == 'log':
        predictions = [metrics.get_estimate_log(x, 10) for x in predictions]
        metrics.print_metrics_log_bins(labels, predictions)
    if args.partition == 'custom':
        predictions = [metrics.get_estimate_custom(x, 10) for x in predictions]
        metrics.print_metrics_custom_bins(labels, predictions)
    if args.partition == 'none':
        metrics.print_metrics_regression(labels, predictions)
        predictions = [x[0] for x in predictions]

    path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv"
    utils.save_results(names, ts, predictions, labels, path)

else:
    raise ValueError("Wrong value for args.mode")