target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train') # Build readers, discretizers, normalizers train_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'train_listfile.csv'), period_length=48.0) val_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'val_listfile.csv'), period_length=48.0) discretizer = Discretizer(timestep=float(args.timestep), store_masks=True, impute_strategy='previous', start_time='zero') discretizer_header = discretizer.transform( train_reader.read_example(0)["X"])[1].split(',') cont_channels = [ i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1 ] normalizer = Normalizer( fields=cont_channels) # choose here which columns to standardize normalizer_state = args.normalizer_state if normalizer_state is None: normalizer_state = 'ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format( args.timestep, args.imputation) normalizer_state = os.path.join(os.path.dirname(__file__),
# Echo the parsed arguments. The call form prints the same text on Python 2
# and, unlike the bare `print args` statement, also parses on Python 3.
print(args)

if args.small_part:
    # Presumably pushes the checkpoint interval out of reach for small
    # debugging runs -- confirm against the code that reads `save_every`.
    args.save_every = 2**30

# True only when the replication coefficient is positive and the mode is 'train'.
target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

# Build readers, discretizers, normalizers
# Both readers point at the same dataset directory and differ only in listfile.
train_reader = PhenotypingReader(dataset_dir='../../data/phenotyping/train/',
                                 listfile='../../data/phenotyping/train_listfile.csv')
val_reader = PhenotypingReader(dataset_dir='../../data/phenotyping/train/',
                               listfile='../../data/phenotyping/val_listfile.csv')

# NOTE(review): the keyword is spelled `imput_strategy`; confirm it matches
# the Discretizer signature imported by this file.
discretizer = Discretizer(timestep=float(args.timestep),
                          store_masks=True,
                          imput_strategy='previous',
                          start_time='zero')

# The second element returned by transform() is a comma-separated header.
discretizer_header = discretizer.transform(train_reader.read_example(0)["X"])[1].split(',')
# Keep the indices of the header columns whose name contains no "->".
cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here onlycont vs all
normalizer.load_params('ph_ts{}.input_str:previous.start_time:zero.normalizer'.format(args.timestep))

# Collect the command-line arguments plus the derived settings in one dict.
args_dict = dict(args._get_kwargs())
args_dict['header'] = discretizer_header
args_dict['task'] = 'ph'
args_dict['num_classes'] = 25
args_dict['target_repl'] = target_repl

# Build the model
listfile=os.path.join(args.data, 'train_listfile.csv'), small_part=args.small_part) val_data_loader = common_utils.DeepSupervisionDataLoader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'val_listfile.csv'), small_part=args.small_part) else: train_reader = LengthOfStayReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'train_listfile.csv')) val_reader = LengthOfStayReader( dataset_dir=os.path.join(args.data, 'train'), listfile=os.path.join(args.data, 'val_listfile.csv')) discretizer = Discretizer(timestep=args.timestep, store_masks=True, impute_strategy='previous', start_time='zero') if args.deep_supervision: discretizer_header = discretizer.transform( train_data_loader._data["X"][0])[1].split(',') else: discretizer_header = discretizer.transform( train_reader.read_example(0)["X"])[1].split(',') cont_channels = [ i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1 ] normalizer = Normalizer( fields=cont_channels) # choose here which columns to standardize normalizer_state = args.normalizer_state
# Parse the command line and echo the resulting configuration.
args = parser.parse_args()
print(args)

# True only when the replication coefficient is positive and the mode is 'train'.
target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

# Load the 300-dimensional embeddings for the 'claims_codes_hs' corpus.
embeddings, word_indices = get_embeddings(corpus='claims_codes_hs', dim=300)

# Both readers share one dataset directory and differ only in the listfile.
train_reader = ReadmissionReader(
    dataset_dir='/mnt/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/mnt/MIMIC-III-clean/readmission_cv2/0_train_listfile801010.csv',
)
val_reader = ReadmissionReader(
    dataset_dir='/mnt/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/mnt/MIMIC-III-clean/readmission_cv2/0_val_listfile801010.csv',
)

# NOTE(review): the keyword is spelled `imput_strategy`; confirm it matches
# the Discretizer signature imported by this file.
discretizer = Discretizer(
    timestep=float(args.timestep),
    store_masks=True,
    imput_strategy='previous',
    start_time='zero',
)

# Read every training example in a single chunk and pull out its fields.
N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data, ts, labels, names = ret["X"], ret["t"], ret["y"], ret["name"]

# Look up the diseases for each example name and embed them.
diseases_list = get_diseases(names, '/mnt/MIMIC-III-clean/data/')
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

# The second element returned by transform() is a comma-separated header;
# keep the indices of the columns whose name contains no "->".
discretizer_header = discretizer.transform(ret["X"][0])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header) if "->" not in x]
# Listfile paths for the train / validation / test parts of split 0.
listfiles_train_dir0 = base + listfiles_data + "/0_train_listfile801010.csv"
listfiles_val_dir0 = base + listfiles_data + "/0_val_listfile801010.csv"
listfiles_test_dir0 = base + listfiles_data + "/0_test_listfile801010.csv"

# True only when the replication coefficient is positive and the mode is 'train'.
target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

# Read embedding
embeddings, word_indices = get_embeddings(corpus='claims_codes_hs', dim=300)

# Both readers share one episode directory and differ only in the listfile.
train_reader = ReadmissionReader(
    dataset_dir=dataset_episode_dir,
    listfile=listfiles_train_dir0,
)
val_reader = ReadmissionReader(
    dataset_dir=dataset_episode_dir,
    listfile=listfiles_val_dir0,
)

# NOTE(review): the keyword is spelled `imput_strategy`; confirm it matches
# the Discretizer signature imported by this file.
discretizer = Discretizer(
    timestep=float(args.timestep),
    store_masks=True,
    imput_strategy='previous',
    start_time='zero',
)

# Read every training example in a single chunk and pull out its fields.
N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data, ts, labels, names = ret["X"], ret["t"], ret["y"], ret["name"]

# Per-example additional features, standardized over the listed columns and
# converted to a plain list of rows.
feature_cols = ['LOS', 'Hos_LOS', 'Num_Prev_Hos_Adm']
additional_features = get_additional_features(names)
additional_features_list = normalize_standard(additional_features, feature_cols).values.tolist()

# Look up the diseases for each example name and embed them.
diseases_list = get_diseases(names, dataset_subject_dir)
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

demographic = get_demographic(names, dataset_subject_dir)
# Load the 300-dimensional embeddings for the 'claims_codes_hs' corpus.
embeddings, word_indices = get_embeddings(corpus='claims_codes_hs', dim=300)

# Both readers share one dataset directory and differ only in the listfile.
train_reader = ReadmissionReader(
    dataset_dir='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/0_train_listfile801010.csv',
)
val_reader = ReadmissionReader(
    dataset_dir='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/data/',
    listfile='/Users/jeffrey0925/MIMIC-III-clean/readmission_cv2/0_val_listfile801010.csv',
)

# NOTE(review): the keyword is spelled `imput_strategy`; confirm it matches
# the Discretizer signature imported by this file.
discretizer = Discretizer(
    timestep=float(args.timestep),
    store_masks=True,
    imput_strategy='previous',
    start_time='zero',
)

# Read every training example in a single chunk and pull out its fields.
N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data, ts, labels, names = ret["X"], ret["t"], ret["y"], ret["name"]

# Look up the diseases for each example name and embed them.
diseases_list = get_diseases(names, '/Users/jeffrey0925/MIMIC-III-clean/data/')
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

demographic = get_demographic(names, '/Users/jeffrey0925/MIMIC-III-clean/data/')

# NOTE(review): `demographic[:][0]` is the first element of `demographic[:]`,
# and `sum` produces a total rather than a mean -- confirm this matches the
# intent behind the names `age_means` / `age_std` before relying on them.
age_means = sum(demographic[:][0])
age_std = statistics.stdev(demographic[:][0])
# Build readers, discretizers, normalizers
if args.deep_supervision:
    # Deep supervision uses data loaders instead of plain readers.
    train_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir='../../data/decompensation/train/',
        listfile='../../data/decompensation/train_listfile.csv',
        small_part=args.small_part,
    )
    val_data_loader = common_utils.DeepSupervisionDataLoader(
        dataset_dir='../../data/decompensation/train/',
        listfile='../../data/decompensation/val_listfile.csv',
        small_part=args.small_part,
    )
else:
    train_reader = DecompensationReader(
        dataset_dir='../../data/decompensation/train/',
        listfile='../../data/decompensation/train_listfile.csv',
    )
    val_reader = DecompensationReader(
        dataset_dir='../../data/decompensation/train/',
        listfile='../../data/decompensation/val_listfile.csv',
    )

# NOTE(review): the keyword is spelled `imput_strategy`; confirm it matches
# the Discretizer signature imported by this file.
discretizer = Discretizer(
    timestep=args.timestep,
    store_masks=True,
    imput_strategy='previous',
    start_time='zero',
)

# Discretize the first training example (taken from whichever source was
# built above); the second element returned by transform() is a
# comma-separated header.
discretizer_header = discretizer.transform(
    train_data_loader._data["X"][0] if args.deep_supervision
    else train_reader.read_example(0)["X"]
)[1].split(',')

# Keep the indices of the header columns whose name contains no "->".
cont_channels = [i for (i, x) in enumerate(discretizer_header) if "->" not in x]

normalizer = Normalizer(fields=cont_channels)  # choose here onlycont vs all
normalizer.load_params(
    'decomp_ts{}.input_str:previous.n1e5.start_time:zero.normalizer'.format(args.timestep)
)

# Collect the command-line arguments plus the derived settings in one dict.
args_dict = dict(args._get_kwargs())
args_dict.update({'header': discretizer_header, 'task': 'decomp'})
def main():
    """Compute and save a normalizer state for one benchmark task.

    Parses the command line, builds the reader for the task's ``train``
    directory and a discretizer, feeds every discretized example (or the
    first ``--n_samples`` of them) to a ``Normalizer`` and saves the
    accumulated state to a file inside ``--output_dir``.

    Raises:
        SystemExit: via ``parser.error`` (exit status 2) when the command
            line is invalid or when the requested task has no reader wired
            up in this script.
    """
    parser = argparse.ArgumentParser(
        description='Script for creating a normalizer state - a file which stores the '
                    'means and standard deviations of columns of the output of a '
                    'discretizer, which are later used to standardize the input of '
                    'neural models.')
    parser.add_argument('--task', type=str, required=True,
                        choices=['ihm', 'decomp', 'los', 'pheno', 'multi'])
    parser.add_argument('--timestep', type=float, default=1.0,
                        help="Rate of the re-sampling to discretize time-series.")
    parser.add_argument('--impute_strategy', type=str, default='previous',
                        choices=['zero', 'next', 'previous', 'normal_value'],
                        help='Strategy for imputing missing values.')
    # Without a default the option parsed to None, a value outside the
    # documented choices, which was then handed to the discretizer and
    # embedded in the output file name.
    parser.add_argument('--start_time', type=str, default='zero', choices=['zero', 'relative'],
                        help='Specifies the start time of discretization. Zero means to use the beginning of '
                             'the ICU stay. Relative means to use the time of the first ICU event')
    parser.add_argument('--store_masks', dest='store_masks', action='store_true',
                        help='Store masks that specify observed/imputed values.')
    parser.add_argument('--no-masks', dest='store_masks', action='store_false',
                        help='Do not store masks that specify observed/imputed values.')
    parser.add_argument('--n_samples', type=int, default=-1,
                        help='How many samples to use to estimates means and '
                             'standard deviations. Set -1 to use all training samples.')
    parser.add_argument('--output_dir', type=str, default='.',
                        help='Directory where the output file will be saved.')
    parser.add_argument('--data', type=str, required=True,
                        help='Path to the task data.')
    parser.set_defaults(store_masks=True)

    args = parser.parse_args()
    print(args)

    # create the reader
    dataset_dir = os.path.join(args.data, 'train')
    reader = None
    if args.task == 'ihm':
        reader = InHospitalMortalityReader(dataset_dir=dataset_dir, period_length=48.0)
    elif args.task == 'los':
        reader = LengthOfStayReader(dataset_dir=dataset_dir)
    elif args.task == 'multi':
        reader = MultitaskReader(dataset_dir=dataset_dir)

    if reader is None:
        # --task also accepts 'decomp' and 'pheno', but no reader is built for
        # them here; fail with a clear message instead of an AttributeError on
        # None further down.
        parser.error("no reader is available in this script for task '{}'".format(args.task))

    # create the discretizer
    discretizer = Discretizer(timestep=args.timestep,
                              store_masks=args.store_masks,
                              impute_strategy=args.impute_strategy,
                              start_time=args.start_time)

    # Channels whose header name contains no "->" are the ones to standardize.
    discretizer_header = reader.read_example(0)['header']
    continuous_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

    # create the normalizer
    normalizer = Normalizer(fields=continuous_channels)

    # read all examples and store the state of the normalizer
    n_samples = args.n_samples
    if n_samples == -1:
        n_samples = reader.get_number_of_examples()

    for i in range(n_samples):
        if i % 1000 == 0:
            # '\r' keeps the progress report on a single console line.
            print('Processed {} / {} samples'.format(i, n_samples), end='\r')
        ret = reader.read_example(i)
        data, _header = discretizer.transform(ret['X'], end=ret['t'])
        normalizer._feed_data(data)
    print('\n')

    # Only these four values appear in the name; the original call also passed
    # store_masks and n_samples, which str.format silently ignored.
    file_name = '{}_ts_{:.1f}_impute_{}_start_time_{}.normalizer'.format(
        args.task, args.timestep, args.impute_strategy, args.start_time)
    file_name = os.path.join(args.output_dir, file_name)
    print('Saving the state in {} ...'.format(file_name))
    normalizer._save_params(file_name)
test_starttime_path = conf.starttime_path_test epochs = 50 learning_rate = 3e-4 batch_size = 8 bootstrap_decomp = BootStrapDecomp(k=1000, experiment_name=experiment) bootstrap_los = BootStrapLos(experiment_name=experiment) bootstrap_ihm = BootStrapIhm(experiment_name=experiment) bootstrap_pheno = BootStrapPheno(experiment_name=experiment) bootstrap_readmit = BootStrapReadmit(experiment_name=experiment) # prepare discretizer and normalizer conf = utils.get_config() discretizer = Discretizer(timestep=conf.timestep, store_masks=True, impute_strategy='previous', start_time='zero') cont_channels = [2, 3, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58] normalizer = Normalizer(fields=cont_channels) normalizer_state = conf.normalizer_state if normalizer_state is None: normalizer_state = 'mult_ts{}.input_str:previous.start_time:zero.n5e4.normalizer'.format( conf.timestep) normalizer.load_params(normalizer_state) # Model text_model = Text_CNN(in_channels=1, out_channels=128,
# Listfile path for the test part of split 0.
listfiles_test_dir0 = base + listfiles_data + "/0_test_listfile801010.csv"

# Load the 300-dimensional embeddings for the 'claims_codes_hs' corpus.
embeddings, word_indices = get_embeddings(corpus='claims_codes_hs', dim=300)

# Build readers, discretizers, normalizers
# All three readers share one episode directory and differ only in listfile.
train_reader = ReadmissionReader(
    dataset_dir=dataset_episode_dir,
    listfile=listfiles_train_dir0,
)
val_reader = ReadmissionReader(
    dataset_dir=dataset_episode_dir,
    listfile=listfiles_val_dir0,
)
test_reader = ReadmissionReader(
    dataset_dir=dataset_episode_dir,
    listfile=listfiles_test_dir0,
)

# NOTE(review): the keyword is spelled `imput_strategy`; confirm it matches
# the Discretizer signature imported by this file.
discretizer = Discretizer(
    timestep=1.0,
    store_masks=True,
    imput_strategy='previous',
    start_time='zero',
)

# Read every training example in a single chunk and pull out its fields.
N = train_reader.get_number_of_examples()
ret = common_utils.read_chunk(train_reader, N)
data, ts, train_y, train_names = ret["X"], ret["t"], ret["y"], ret["name"]

# Look up the diseases for each training example name and embed them.
diseases_list = get_diseases(train_names, dataset_subject_dir)
diseases_embedding = disease_embedding(embeddings, word_indices, diseases_list)

# transform_reg() on the first example yields four values; the header comes
# back as one comma-separated string and is split into a list of names.
d, discretizer_header, begin_pos, end_pos = discretizer.transform_reg(data[0])
discretizer_header = discretizer_header.split(',')
if args.small_part:
    # Presumably pushes the checkpoint interval out of reach for small
    # debugging runs -- confirm against the code that reads `save_every`.
    args.save_every = 2**30

# True only when the replication coefficient is positive and the mode is 'train'.
target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

# Build readers, discretizers, normalizers
# Both readers point at the same dataset directory and differ only in listfile.
train_reader = MultitaskReader(
    dataset_dir='../../data/multitask/train/',
    listfile='../../data/multitask/train_listfile.csv',
)
val_reader = MultitaskReader(
    dataset_dir='../../data/multitask/train/',
    listfile='../../data/multitask/val_listfile.csv',
)

# NOTE(review): the keyword is spelled `imput_strategy`; confirm it matches
# the Discretizer signature imported by this file.
discretizer = Discretizer(
    timestep=args.timestep,
    store_masks=True,
    imput_strategy='previous',
    start_time='zero',
)

# Discretize element 0 of the first training example; the second element
# returned by transform() is a comma-separated header.
discretizer_header = discretizer.transform(train_reader.read_example(0)[0])[1].split(',')

# Keep the indices of the header columns whose name contains no "->".
cont_channels = [i for (i, x) in enumerate(discretizer_header) if "->" not in x]

normalizer = Normalizer(fields=cont_channels)  # choose here onlycont vs all
normalizer.load_params(
    'mult_ts%s.input_str:%s.start_time:zero.normalizer' % (args.timestep, args.imputation)
)

# Collect the command-line arguments plus the derived settings in one dict.
args_dict = dict(args._get_kwargs())
args_dict.update({
    'header': discretizer_header,
    'ihm_pos': int(48.0 / args.timestep - 1e-6),
})