def read_and_extract_features(args, partition):
    data_folder = os.path.join(args.data, partition)
    reader = LengthOfStayReader(
            dataset_dir=data_folder,
            listfile=os.path.join(data_folder, 'listfile.csv'),
            fixed_time=args.period_length)

    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    patients = np.array(ret["patient"], dtype=int)
    ret["meta"] = np.stack(ret["meta"])
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'], period="all", features=args.features)

    # Check that the period of observation time is the same for all observations
    period_of_obs = np.mean(ret["t"])
    print("Period of observation", period_of_obs, np.var(ret["t"]))
    assert np.var(ret["t"]) < 1e-3

    # Augment data with missing columns
    missing_flags = np.isnan(X)
    # Also add in the metadata (age, ethnicity, gender)
    augmented_X = np.concatenate([ret["meta"], X, missing_flags], axis=1)
    y = np.array(ret['y']).reshape((-1,1)) + period_of_obs
    log_y = np.log(y)
    return augmented_X, log_y, patients
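A quick usage sketch (the driver below is hypothetical and not part of the original script; it assumes `args` exposes the `data`, `period_length`, and `features` fields referenced above):

# Hypothetical driver: extract features for the train and test partitions and
# map log-space predictions back to hours with np.exp.
train_X, train_log_y, train_patients = read_and_extract_features(args, 'train')
test_X, test_log_y, test_patients = read_and_extract_features(args, 'test')

# After fitting a regressor on (train_X, train_log_y), predictions are in
# log-hours and would be inverted before computing length-of-stay metrics:
# predicted_hours = np.exp(model.predict(test_X))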
Example 2
if args.small_part:
    args.save_every = 2**30

# Build readers, discretizers, normalizers
if args.deep_supervision:
    train_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        small_part=args.small_part)
    val_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        small_part=args.small_part)
else:
    train_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'))
    val_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'))

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

if args.deep_supervision:
    discretizer_header = discretizer.transform(
        train_data_loader._data["X"][0])[1].split(',')
else:
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)["X"])[1].split(',')
Example 3
                    type=int,
                    default=100,
                    help='number of epochs to train')
parser.add_argument('--period', type=str, default="all",
                    help="first4days, first8days, last12hours, "\
                         "first25percent, first50percent, all")
parser.add_argument('--features',
                    type=str,
                    default="all",
                    help="all, len, all_but_len")

args = parser.parse_args()
print(args)

train_reader = LengthOfStayReader(
    dataset_dir='../../../data/length-of-stay/train/',
    listfile='../../../data/length-of-stay/train_listfile.csv')

val_reader = LengthOfStayReader(
    dataset_dir='../../../data/length-of-stay/train/',
    listfile='../../../data/length-of-stay/val_listfile.csv')


def read_and_extract_features(reader, count):
    read_chunk_size = 1000
    assert (count % read_chunk_size == 0)
    Xs = []
    ys = []
    for i in range(count // read_chunk_size):
        (chunk, ts, y, header) = utils.read_chunk(reader, read_chunk_size)
        X = common_utils.extract_features_from_rawdata(chunk, header,
                                                       period=args.period,
                                                       features=args.features)
        Xs.append(X)
        ys += list(y)
    return np.concatenate(Xs, axis=0), np.array(ys)
Example 4
if args.small_part:
    args.save_every = 2**30

# Build readers, discretizers, normalizers
if args.deep_supervision:
    train_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/train_listfile.csv',
        small_part=args.small_part)
    val_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/val_listfile.csv',
        small_part=args.small_part)
else:
    train_reader = LengthOfStayReader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/train_listfile.csv')
    val_reader = LengthOfStayReader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/val_listfile.csv')

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

if args.deep_supervision:
    discretizer_header = discretizer.transform(
        train_data_loader._data["X"][0])[1].split(',')
else:
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)["X"])[1].split(',')
Example 5
def main():
    parser = argparse.ArgumentParser(
        description=
        'Script for creating a normalizer state - a file which stores the '
        'means and standard deviations of columns of the output of a '
        'discretizer, which are later used to standardize the input of '
        'neural models.')
    parser.add_argument('--task',
                        type=str,
                        required=True,
                        choices=['ihm', 'decomp', 'los', 'pheno', 'multi'])
    parser.add_argument(
        '--timestep',
        type=float,
        default=1.0,
        help="Rate of the re-sampling to discretize time-series.")
    parser.add_argument('--impute_strategy',
                        type=str,
                        default='previous',
                        choices=['zero', 'next', 'previous', 'normal_value'],
                        help='Strategy for imputing missing values.')
    parser.add_argument(
        '--start_time',
        type=str,
        choices=['zero', 'relative'],
        help=
        'Specifies the start time of discretization. Zero means to use the beginning of '
        'the ICU stay. Relative means to use the time of the first ICU event.')
    parser.add_argument(
        '--store_masks',
        dest='store_masks',
        action='store_true',
        help='Store masks that specify observed/imputed values.')
    parser.add_argument(
        '--no-masks',
        dest='store_masks',
        action='store_false',
        help='Do not store masks specifying observed/imputed values.')
    parser.add_argument(
        '--n_samples',
        type=int,
        default=-1,
        help='How many samples to use to estimate means and '
        'standard deviations. Set -1 to use all training samples.')
    parser.add_argument('--output_dir',
                        type=str,
                        help='Directory where the output file will be saved.',
                        default='.')
    parser.add_argument('--data',
                        type=str,
                        required=True,
                        help='Path to the task data.')
    parser.set_defaults(store_masks=True)

    args = parser.parse_args()
    print(args)

    # create the reader
    reader = None
    dataset_dir = os.path.join(args.data, 'train')
    if args.task == 'ihm':
        reader = InHospitalMortalityReader(dataset_dir=dataset_dir,
                                           listfile=os.path.join(
                                               args.data,
                                               'train_listfile.csv'),
                                           period_length=48.0)
    if args.task == 'decomp':
        reader = DecompensationReader(dataset_dir=dataset_dir,
                                      listfile=os.path.join(
                                          args.data, 'train_listfile.csv'))
    if args.task == 'los':
        reader = LengthOfStayReader(dataset_dir=dataset_dir,
                                    listfile=os.path.join(
                                        args.data, 'train_listfile.csv'))
    if args.task == 'pheno':
        reader = PhenotypingReader(dataset_dir=dataset_dir,
                                   listfile=os.path.join(
                                       args.data, 'train_listfile.csv'))
    if args.task == 'multi':
        reader = MultitaskReader(dataset_dir=dataset_dir,
                                 listfile=os.path.join(args.data,
                                                       'train_listfile.csv'))

    # create the discretizer
    discretizer = Discretizer(timestep=args.timestep,
                              store_masks=args.store_masks,
                              impute_strategy=args.impute_strategy,
                              start_time=args.start_time)
    discretizer_header = discretizer.transform(
        reader.read_example(0)['X'])[1].split(',')
    continuous_channels = [
        i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
    ]

    # create the normalizer
    normalizer = Normalizer(fields=continuous_channels)

    # read all examples and store the state of the normalizer
    n_samples = args.n_samples
    if n_samples == -1:
        n_samples = reader.get_number_of_examples()

    for i in range(n_samples):
        if i % 1000 == 0:
            print('Processed {} / {} samples'.format(i, n_samples), end='\r')
        ret = reader.read_example(i)
        data, new_header = discretizer.transform(ret['X'], end=ret['t'])
        normalizer._feed_data(data)
    print('\n')

    file_name = '{}_ts:{:.2f}_impute:{}_start:{}_masks:{}_n:{}.normalizer'.format(
        args.task, args.timestep, args.impute_strategy, args.start_time,
        args.store_masks, n_samples)
    file_name = os.path.join(args.output_dir, file_name)
    print('Saving the state in {} ...'.format(file_name))
    normalizer._save_params(file_name)
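For intuition, the state this script saves amounts to per-column means and standard deviations of the discretizer output. A minimal, self-contained sketch of that bookkeeping (illustrative only; this is not the repository's actual Normalizer implementation or file format):

import pickle
import numpy as np

class RunningColumnStats:
    """Accumulate per-column sums so means/stds can be estimated incrementally."""

    def __init__(self):
        self._count = 0
        self._sum = None
        self._sum_sq = None

    def feed(self, data):
        # data: 2-D array (timesteps x channels) as produced by the discretizer
        data = np.asarray(data, dtype=np.float64)
        if self._sum is None:
            self._sum = np.zeros(data.shape[1])
            self._sum_sq = np.zeros(data.shape[1])
        self._count += data.shape[0]
        self._sum += data.sum(axis=0)
        self._sum_sq += (data ** 2).sum(axis=0)

    def save(self, path):
        means = self._sum / self._count
        stds = np.sqrt(self._sum_sq / self._count - means ** 2)
        with open(path, 'wb') as f:
            pickle.dump({'means': means, 'stds': stds}, f)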
Example 6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period to extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of length-of-stay task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/length-of-stay/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative to which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'))

    val_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'))

    test_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'))

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names,
     train_ts) = read_and_extract_features(train_reader, n_train, args.period,
                                           args.features)

    (val_X, val_y, val_names,
     val_ts) = read_and_extract_features(val_reader, n_val, args.period,
                                         args.features)

    (test_X, test_y, test_names,
     test_ts) = read_and_extract_features(test_reader,
                                          test_reader.get_number_of_examples(),
                                          args.period, args.features)

    print('train set shape: {}'.format(train_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              "w") as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, test_ts, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
Example 7
if args.small_part:
    args.save_every = 2**30



# Build readers, discretizers, normalizers
if args.deep_supervision:
    train_data_loader = common_utils.DeepSupervisionDataLoader(dataset_dir=os.path.join(args.data, 'train'),
                                                               listfile=os.path.join(args.data, 'train_listfile.csv'),
                                                               small_part=args.small_part, sources=sources, timesteps=args.timesteps, condensed=args.condensed)
    val_data_loader = common_utils.DeepSupervisionDataLoader(dataset_dir=os.path.join(args.data, 'train'),
                                                             listfile=os.path.join(args.data, 'val_listfile.csv'),
                                                             small_part=args.small_part, sources=sources, timesteps=args.timesteps, condensed=args.condensed)
else:
    train_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'train'),
                                      listfile=os.path.join(args.data, 'train_listfile.csv'), sources=sources, timesteps=args.timesteps, condensed=args.condensed)
    val_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'train'),
                                    listfile=os.path.join(args.data, 'val_listfile.csv'), sources=sources, timesteps=args.timesteps, condensed=args.condensed)

# Build a plain reader as well, so the raw header can be read regardless of deep supervision
train_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'train'),
                                  listfile=os.path.join(args.data, 'train_listfile.csv'),
                                  sources=sources, timesteps=args.timesteps, condensed=args.condensed)

reader_header = train_reader.read_example(0)['header']
n_bins = len(train_reader.read_example(0))

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero', header=reader_header, sources=sources)

if args.deep_supervision:
    discretizer_header = discretizer.transform(
        train_data_loader._data["X"][0])[1].split(',')
else:
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)["X"])[1].split(',')
Example 8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--grid-search', dest='grid_search', action='store_true')
    parser.add_argument('--no-grid-search', dest='grid_search', action='store_false')
    parser.set_defaults(grid_search=False)
    parser.add_argument('--data', type=str, help='Path to the data of length-of-stay task',
                        default=os.path.join(os.path.dirname(__file__), '../../../data/length-of-stay/'))
    parser.add_argument('--output_dir', type=str, help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    if args.grid_search:
        penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
        coefs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    else:
        penalties = ['l2']
        coefs = [0.00001]

    train_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'train'),
                                      listfile=os.path.join(args.data, 'train_listfile.csv'))

    val_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'train'),
                                    listfile=os.path.join(args.data, 'val_listfile.csv'))

    test_reader = LengthOfStayReader(dataset_dir=os.path.join(args.data, 'test'),
                                     listfile=os.path.join(args.data, 'test_listfile.csv'))

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_actual, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_actual, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_actual, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print("train set shape:  {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    result_dir = os.path.join(args.output_dir, 'cf_results')
    common_utils.create_directory(result_dir)

    for (penalty, C) in zip(penalties, coefs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        for task_id in range(n_bins):
            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]

            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]

            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]

        train_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in train_activations])
        val_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in val_activations])
        test_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in test_activations])

        with open(os.path.join(result_dir, 'train_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(train_actual, train_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join(result_dir, 'val_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(val_actual, val_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join(result_dir, 'test_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(test_actual, test_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        save_results(test_names, test_ts, test_predictions, test_actual,
                     os.path.join(args.output_dir, 'cf_predictions', model_name + '.csv'))
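The per-bin probabilities above are collapsed into a single length-of-stay estimate by `metrics.get_estimate_custom`. A hedged sketch of that kind of expected-value decoding, using illustrative bin centers rather than the repository's actual custom-bin definition:

import numpy as np

# Illustrative bin centers in hours; the real bin edges live in the metrics
# module of the repository and may differ from these values.
EXAMPLE_BIN_MEANS = np.array([12.0, 36.0, 60.0, 84.0, 108.0,
                              132.0, 156.0, 180.0, 264.0, 480.0])

def estimate_from_bin_probs(probs, bin_means=EXAMPLE_BIN_MEANS):
    """Expected length of stay (hours) given per-bin activations."""
    probs = np.asarray(probs, dtype=float)
    probs = probs / probs.sum()  # renormalize, since activations need not sum to 1
    return float(np.dot(probs, bin_means))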
Example 9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                      listfile='../../../data/length-of-stay/train_listfile.csv')

    val_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                    listfile='../../../data/length-of-stay/val_listfile.csv')

    test_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/test/',
                                     listfile='../../../data/length-of-stay/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join("results", 'train_{}.json'.format(file_name)), "w") as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, test_ts, prediction, test_y, os.path.join('predictions', file_name + '.csv'))
Example 10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all', help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all', help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.00001]

    train_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                      listfile='../../../data/length-of-stay/train_listfile.csv')

    val_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/train/',
                                    listfile='../../../data/length-of-stay/val_listfile.csv')

    test_reader = LengthOfStayReader(dataset_dir='../../../data/length-of-stay/test/',
                                     listfile='../../../data/length-of-stay/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_actual, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)

    (val_X, val_y, val_actual, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)

    (test_X, test_y, test_actual, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print("train set shape:  {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('cf_results')

    for (penalty, C) in zip(penalties, Cs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        for task_id in range(n_bins):
            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]

            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]

            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]

        train_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in train_activations])
        val_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in val_activations])
        test_predictions = np.array([metrics.get_estimate_custom(x, n_bins) for x in test_activations])

        with open(os.path.join('cf_results', 'train_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(train_actual, train_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join('cf_results', 'val_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(val_actual, val_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join('cf_results', 'test_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(test_actual, test_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        save_results(test_names, test_ts, test_predictions, test_actual,
                     os.path.join('cf_predictions', model_name + '.csv'))
Example 11
parser.add_argument('--batch_norm',
                    type=bool,
                    default=False,
                    help='batch normalization')
parser.add_argument('--timestep',
                    type=float,
                    default=0.8,
                    help="fixed timestep used in the dataset")
parser.add_argument('--small_part', dest='small_part', action='store_true')
parser.add_argument('--whole_data', dest='small_part', action='store_false')
parser.set_defaults(small_part=False)
args = parser.parse_args()
print(args)

train_reader = LengthOfStayReader(
    dataset_dir='../../data/length-of-stay/train/',
    listfile='../../data/length-of-stay/train_listfile.csv')

val_reader = LengthOfStayReader(
    dataset_dir='../../data/length-of-stay/train/',
    listfile='../../data/length-of-stay/val_listfile.csv')

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

discretizer_header = discretizer.transform(
    train_reader.read_example(0)[0])[1].split(',')
cont_channels = [
    i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
]
Example 12
args = parser.parse_args()
print(args)

if args.small_part:
    args.save_every = 2**30

# Build readers, discretizers, normalizers
if args.deep_supervision:
    train_data_loader = common_utils.DeepSupervisionDataLoader(dataset_dir='../../data/length-of-stay/train/',
                            listfile='../../data/length-of-stay/train_listfile.csv',
                            small_part=args.small_part)
    val_data_loader = common_utils.DeepSupervisionDataLoader(dataset_dir='../../data/length-of-stay/train/',
                            listfile='../../data/length-of-stay/val_listfile.csv',
                            small_part=args.small_part)
else:
    train_reader = LengthOfStayReader(dataset_dir='../../data/length-of-stay/train/',
                        listfile='../../data/length-of-stay/train_listfile.csv')
    val_reader = LengthOfStayReader(dataset_dir='../../data/length-of-stay/train/',
                        listfile='../../data/length-of-stay/val_listfile.csv')

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

if args.deep_supervision:
    discretizer_header = discretizer.transform(train_data_loader._data[0][0])[1].split(',')
else:
    discretizer_header = discretizer.transform(train_reader.read_example(0)[0])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here: only continuous channels vs. all
Example 13
if args.small_part:
    args.save_every = 2**30

# Build readers, discretizers, normalizers
if args.deep_supervision:
    train_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/train_listfile.csv',
        small_part=args.small_part)
    val_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/val_listfile.csv',
        small_part=args.small_part)
else:
    train_reader = LengthOfStayReader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/train_listfile.csv')
    val_reader = LengthOfStayReader(
        dataset_dir='../../data/length-of-stay/train/',
        listfile='../../data/length-of-stay/val_listfile.csv')

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

if args.deep_supervision:
    discretizer_header = discretizer.transform(
        train_data_loader._data[0][0])[1].split(',')
else:
    discretizer_header = discretizer.transform(
        train_reader.read_example(0)[0])[1].split(',')
Example 14
args = parser.parse_args()
print(args)

if args.small_part:
    args.save_every = 2**30

# Build readers, discretizers, normalizers
if args.deep_supervision:
    train_data_loader = common_utils.DeepSupervisionDataLoader(dataset_dir='../../data/length-of-stay/train/',
                                                               listfile='../../data/length-of-stay/train_listfile.csv',
                                                               small_part=args.small_part)
    val_data_loader = common_utils.DeepSupervisionDataLoader(dataset_dir='../../data/length-of-stay/train/',
                                                             listfile='../../data/length-of-stay/val_listfile.csv',
                                                             small_part=args.small_part)
else:
    train_reader = LengthOfStayReader(dataset_dir='../../data/length-of-stay/train/',
                                      listfile='../../data/length-of-stay/train_listfile.csv')
    val_reader = LengthOfStayReader(dataset_dir='../../data/length-of-stay/train/',
                                    listfile='../../data/length-of-stay/val_listfile.csv')

discretizer = Discretizer(timestep=args.timestep,
                          store_masks=True,
                          impute_strategy='previous',
                          start_time='zero')

if args.deep_supervision:
    discretizer_header = discretizer.transform(train_data_loader._data["X"][0])[1].split(',')
else:
    discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here: only continuous channels vs. all
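A typical continuation (sketched under the assumption that the reader and discretizer behave as in the snippets above; the loop and output file name below are illustrative, not the repository's training code) is to feed discretized training examples through the normalizer and save its state, mirroring Example 5:

# Hedged sketch: accumulate normalizer statistics over a handful of training
# examples using the discretizer configured above, then persist them.
n_feed = min(1000, train_reader.get_number_of_examples())
for i in range(n_feed):
    ex = train_reader.read_example(i)
    data, _ = discretizer.transform(ex["X"], end=ex["t"])
    normalizer._feed_data(data)
normalizer._save_params('los_normalizer_state.pkl')  # illustrative output file name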