def test_cpu(self):
    """Run full training for MNIST CPU training."""
    workdir = self.get_tmp_model_dir()
    config = default.get_config()
    start_time = time.time()
    train.train_and_evaluate(config=config, workdir=workdir)
    benchmark_time = time.time() - start_time
    summaries = self.read_summaries(workdir)

    # Summaries contain all the information necessary for the regression
    # metrics.
    wall_time, _, eval_accuracy = zip(*summaries['eval_accuracy'])
    wall_time = np.array(wall_time)
    sec_per_epoch = np.mean(wall_time[1:] - wall_time[:-1])
    end_eval_accuracy = eval_accuracy[-1]

    # Assertions are deferred until the test finishes, so the metrics are
    # always reported and benchmark success is determined based on *all*
    # assertions.
    self.assertBetween(end_eval_accuracy, 0.98, 1.0)

    # Use the reporting API to report single or multiple metrics/extras.
    self.report_wall_time(benchmark_time)
    self.report_metrics({
        'sec_per_epoch': sec_per_epoch,
        'accuracy': end_eval_accuracy,
    })
    self.report_extras({
        'model_name': 'MNIST',
        'description': 'CPU test for MNIST.',
        'implementation': 'linen',
    })
def test_train_and_evaluate(self):
    """Tests training and evaluation loop using mocked data."""
    # Create a temporary directory where tensorboard metrics are written.
    model_dir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'

    with tfds.testing.mock_data(num_examples=1, data_dir=data_dir):
        lm1b_train.train_and_evaluate(
            random_seed=0,
            batch_size=1,
            learning_rate=0.05,
            num_train_steps=1,
            num_eval_steps=1,
            eval_freq=1,
            max_target_length=10,
            max_eval_target_length=32,
            weight_decay=1e-1,
            data_dir=None,
            model_dir=model_dir,
            restore_checkpoints=False,
            save_checkpoints=False,
            checkpoint_freq=2,
            max_predict_token_length=2,
            sampling_temperature=0.6,
            sampling_top_k=4,
            prompt_str='unittest ')
def _test_8x_v100_half_precision(self, num_epochs: int, min_accuracy,
                                 max_accuracy):
    """Utility to benchmark ImageNet on 8x V100 GPUs. Use in your test func."""
    # Make sure tf does not allocate gpu memory.
    tf.config.experimental.set_visible_devices([], 'GPU')
    workdir = self.get_tmp_model_dir()
    config = config_lib.get_config()
    config.num_epochs = num_epochs
    start_time = time.time()
    train.train_and_evaluate(config=config, workdir=workdir)
    benchmark_time = time.time() - start_time
    summaries = self.read_summaries(workdir)

    # Summaries contain all the information necessary for the regression
    # metrics.
    wall_time, _, eval_accuracy = zip(*summaries['eval_accuracy'])
    wall_time = np.array(wall_time)
    sec_per_epoch = np.mean(wall_time[1:] - wall_time[:-1])
    end_accuracy = eval_accuracy[-1]

    # Assertions are deferred until the test finishes, so the metrics are
    # always reported and benchmark success is determined based on *all*
    # assertions.
    self.assertBetween(end_accuracy, min_accuracy, max_accuracy)

    # Use the reporting API to report single or multiple metrics/extras.
    self.report_wall_time(benchmark_time)
    self.report_metrics({
        'sec_per_epoch': sec_per_epoch,
        'accuracy': end_accuracy,
    })
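# A hypothetical caller of the benchmark helper above ("use in your test
# func"); the epoch count and the accuracy window are illustrative values
# only, not the project's actual settings.
def test_8x_v100_half_precision_short(self):
    self._test_8x_v100_half_precision(num_epochs=2,
                                      min_accuracy=0.06,
                                      max_accuracy=0.35)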
def test_train_and_evaluate(self):
    config = default.get_config()
    config.max_corpus_chars = 1000
    config.vocab_size = 32
    config.batch_size = 8
    config.num_train_steps = 1
    config.num_eval_steps = 1
    config.num_predict_steps = 1
    config.num_layers = 1
    config.qkv_dim = 128
    config.emb_dim = 128
    config.mlp_dim = 512
    config.num_heads = 2
    config.max_target_length = 32
    config.max_eval_target_length = 32
    config.max_predict_length = 32

    workdir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'  # pylint: disable=unused-variable

    with tfds.testing.mock_data(num_examples=128, data_dir=data_dir):
        train.train_and_evaluate(config, workdir)
    logging.info('workdir content: %s', tf.io.gfile.listdir(workdir))
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and
    # make it unavailable to JAX.
    tf.config.experimental.set_visible_devices([], 'GPU')

    logging.info('JAX process: %d / %d', jax.process_index(),
                 jax.process_count())
    logging.info('JAX local devices: %r', jax.local_devices())

    # Add a note so that we can tell which task is which JAX host.
    # (Depending on the platform task 0 is not guaranteed to be host 0.)
    platform.work_unit().set_task_status(
        f'process_index: {jax.process_index()}, '
        f'process_count: {jax.process_count()}')
    platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
                                         FLAGS.workdir, 'workdir')

    if FLAGS.sample:
        sample.save_images(
            sample.generate_sample(FLAGS.config, FLAGS.workdir), 'sample.png')
    else:
        train.train_and_evaluate(FLAGS.config, FLAGS.workdir)
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Make sure tf does not allocate gpu memory.
    tf.config.experimental.set_visible_devices([], 'GPU')

    # Require JAX omnistaging mode.
    jax.config.enable_omnistaging()

    train.train_and_evaluate(workdir=FLAGS.workdir, config=FLAGS.config)
def test_train_and_evaluate(self):
    config = get_test_config()
    workdir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'  # pylint: disable=unused-variable

    with tfds.testing.mock_data(num_examples=8, data_dir=data_dir):
        train.train_and_evaluate(config, workdir)
    logging.info('workdir content: %s', tf.io.gfile.listdir(workdir))
def test_train_and_evaluate(self):
    """Tests training and evaluation code by running a single step."""
    # Create a temporary directory where tensorboard metrics are written.
    workdir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + "/.tfds/metadata"  # pylint: disable=unused-variable

    # Define training configuration.
    config = default.get_config()
    config.num_epochs = 1
    config.batch_size = 8

    with tfds.testing.mock_data(num_examples=8, data_dir=data_dir):
        train.train_and_evaluate(config=config, workdir=workdir)
def main():
    f = open("outdir.txt", 'r')
    outdir = f.read().rstrip('\n')
    f = open("experiment_folder.txt", 'r')
    experiment_folder = f.read().rstrip('\n')

    # pass the size of the vocabulary to the model
    with open(os.path.join(outdir, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        vocab_size = 0
        for r in rd:
            vocab_size += 1

    # set random seed for reproducible experiments
    torch.manual_seed(12)
    torch.cuda.manual_seed(12)

    # import data
    data = myData(outdir, ehr_file)
    data_generator = DataLoader(data, model_pars['batch_size'],
                                shuffle=True, collate_fn=my_collate,
                                drop_last=True)

    # define model and optimizer
    print("cohort numerosity:{0}".format(len(data)))
    model = net.LSTMehrEncoding(vocab_size, model_pars['embedding_dim'],
                                model_pars['batch_size'])
    # model = nn.DataParallel(model, device_ids=[1, 2, 3])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    # start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(
        model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   metrics, experiment_folder)

    svd = TruncatedSVD(n_components=100)
    encoded = svd.fit_transform(encoded)

    with open(experiment_folder + '/LSTMencoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/LSTMmrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/LSTMmetrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
def setup_and_train(params):
    model = Net(params).cuda() if params.cuda else Net(params)
    image_size = model.image_size()

    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    valid_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize,
    ])

    loss_fn = FocalLoss()

    # Observe that all parameters are being optimized.
    # optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    optimizer = optim.SGD([{'params': model.base_parameters},
                           {'params': model.last_parameters, 'lr': 1e-2}],
                          lr=1e-3, momentum=0.9)
    # optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    # Decay LR by a factor of 0.1 every 7 epochs.
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size=params.step_size,
                                           gamma=params.gama)

    dataloaders = get_dateloaders(params,
                                  train_transform=train_transform,
                                  valid_transform=valid_transform)

    train_and_evaluate(model=model,
                       dataloaders=dataloaders,
                       optimizer=optimizer,
                       loss_fn=loss_fn,
                       scheduler=exp_lr_scheduler,
                       params=params)
def test_train_and_evaluate(self):
    """Tests training and evaluation loop using mocked data."""
    # Create a temporary directory where tensorboard metrics are written.
    workdir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'

    # Define training configuration.
    config = default_lib.get_config()
    config.batch_size = 1
    config.num_epochs = 1
    config.num_train_steps = 1
    config.steps_per_eval = 1

    with tfds.testing.mock_data(num_examples=1, data_dir=data_dir):
        train.train_and_evaluate(workdir=workdir, config=config)
def test_fake_data(self):
    workdir = self.get_tmp_model_dir()
    config = config_lib.get_config()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'

    start_time = time.time()
    with tfds.testing.mock_data(num_examples=1024, data_dir=data_dir):
        train.train_and_evaluate(config, workdir)
    benchmark_time = time.time() - start_time

    self.report_wall_time(benchmark_time)
    self.report_extras({
        'description': 'ImageNet ResNet50 with fake data',
        'model_name': 'resnet50',
        'parameters': f'hp=true,bs={FLAGS.config.batch_size}',
    })
def main():
    f = open("outdir.txt", 'r')
    outdir = f.read().rstrip('\n')

    # create an experiment folder, named with the date and time, where the
    # model output is saved
    experiment_folder = os.path.expanduser(
        '~/data1/stratification_ILRM/experiments/') + disease_folder + \
        '-'.join(map(str, list(datetime.now().timetuple()[:6])))
    os.makedirs(experiment_folder)

    # the path to the experiment folder is saved in a txt file
    f = open("experiment_folder.txt", 'w')
    f.write(experiment_folder)
    f.close()

    # pass the size of the vocabulary to the model
    with open(os.path.join(outdir, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        next(rd)
        vocab_size = 1
        for r in rd:
            vocab_size += 1

    # set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # import data
    data = myData(outdir, ehr_file)
    data_generator = DataLoader(data, model_pars['batch_size'],
                                shuffle=True, collate_fn=my_collate)

    # define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L, model_pars['embedding_dim'],
                            model_pars['kernel_size'])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    # start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(
        model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   metrics, experiment_folder)

    # save encoded vectors, the medical record number list (to keep track of
    # the order) and the metrics (loss and accuracy)
    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        # for m, v in metrics_average.items():
        #     wr.writerow([m, v])
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    FLAGS.log_dir = FLAGS.workdir
    FLAGS.stderrthreshold = 'info'
    logging.get_absl_handler().start_logging_to_file()

    # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and
    # make it unavailable to JAX.
    tf.config.experimental.set_visible_devices([], 'GPU')

    logging.info('JAX host: %d / %d', jax.host_id(), jax.host_count())
    logging.info('JAX local devices: %r', jax.local_devices())

    # Add a note so that we can tell which task is which JAX host.
    # (Depending on the platform task 0 is not guaranteed to be host 0.)
    platform.work_unit().set_task_status(
        f'host_id: {jax.host_id()}, host_count: {jax.host_count()}')
    platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
                                         FLAGS.workdir, 'workdir')

    train.train_and_evaluate(FLAGS.config, FLAGS.workdir)
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and
    # make it unavailable to JAX.
    tf.config.experimental.set_visible_devices([], "GPU")

    if FLAGS.jax_backend_target:
        logging.info("Using JAX backend target %s", FLAGS.jax_backend_target)
        jax.config.update("jax_xla_backend", "tpu_driver")
        jax.config.update("jax_backend_target", FLAGS.jax_backend_target)

    logging.info("JAX host: %d / %d", jax.host_id(), jax.host_count())
    logging.info("JAX local devices: %r", jax.local_devices())

    # Add a note so that we can tell which task is which JAX host.
    # (Depending on the platform task 0 is not guaranteed to be host 0.)
    platform.work_unit().set_task_status(
        f"host_id: {jax.host_id()}, host_count: {jax.host_count()}")
    platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
                                         FLAGS.workdir, "workdir")

    train.train_and_evaluate(FLAGS.config, FLAGS.workdir)
def test_train_and_evaluate(self):
    """Tests training and evaluation loop using TFDS mocked data."""
    # Create a temporary directory where tensorboard metrics are written.
    model_dir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'

    with tfds.testing.mock_data(num_examples=8, data_dir=data_dir):
        sst2_train.train_and_evaluate(
            seed=0,
            model_dir=model_dir,
            num_epochs=1,
            batch_size=8,
            embedding_size=256,
            hidden_size=256,
            min_freq=5,
            max_seq_len=55,
            dropout=0.5,
            emb_dropout=0.5,
            word_dropout_rate=0.1,
            learning_rate=0.0005,
            checkpoints_to_keep=0,
            l2_reg=1e-6)
def one_search_experiment(dataset, error_type, train_file, model, seed,
                          n_jobs=1, hyperparams=None, skip_test_files=[]):
    """Run one experiment on the dataset, given an error type, a train file,
    a model and a random search seed.

    Args:
        dataset (dict): dataset dict in config.py
        error_type (string): error type
        train_file (string): filename of training set (dirty or clean)
        model (dict): ml model dict in model.py
        seed (int): seed for this experiment
    """
    np.random.seed(seed)

    # generate random seeds for down sampling and training
    down_sample_seed, train_seed = np.random.randint(1000, size=2)

    # load and preprocess data
    X_train, y_train, X_test_list, y_test_list, test_files = \
        preprocess(dataset, error_type, train_file, normalize=True,
                   down_sample_seed=down_sample_seed)
    test_files = list(set(test_files).difference(set(skip_test_files)))

    # train and evaluate
    result = train_and_evaluate(X_train, y_train, X_test_list, y_test_list,
                                test_files, model, n_jobs=n_jobs,
                                seed=train_seed, hyperparams=hyperparams)
    return result
def main():
    # pass the size of the vocabulary to the model
    with open(os.path.join(data_folder, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        vocab_size = 0
        for r in rd:
            vocab_size += 1

    # set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # import data
    data = myData(data_folder, ehr_file)
    data_generator = DataLoader(data, model_pars['batch_size'],
                                shuffle=True, collate_fn=my_collate)

    # define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L, model_pars['embedding_dim'],
                            model_pars['kernel_size'])
    # model = nn.DataParallel(model, device_ids=[1, 2, 3])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    # start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(
        model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   experiment_folder, metrics)

    # with open(experiment_folder + '/TRencoded_vect.csv', 'w') as f:
    #     wr = csv.writer(f, delimiter=',')
    #     for e in encoded_tr:
    #         wr.writerow(e)
    # with open(experiment_folder + '/TRmrns.csv', 'w') as f:
    #     wr = csv.writer(f, delimiter=',')
    #     for m in mrn_tr:
    #         wr.writerow([m])
    # with open(experiment_folder + '/TRmetrics.txt', 'w') as f:
    #     wr = csv.writer(f, delimiter='\t')
    #     # for m, v in metrics_average.items():
    #     #     wr.writerow([m, v])
    #     wr.writerow(["Mean loss:", loss_tr])

    # load and evaluate best model
    # print("Evaluating best model...")
    # best_saved = torch.load(experiment_folder + '/best_model.pt')
    # model.load_state_dict(best_saved['state_dict'])
    # mrn, encoded, metrics_avg = evaluate(model, loss_fn, data_generator,
    #                                      metrics, best_eval=True)

    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        # for m, v in metrics_average.items():
        #     wr.writerow([m, v])
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
def learn_patient_representations(indir,
                                  test_set=False,
                                  sampling=None,
                                  emb_filename=None):
    # encodings folder to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    if test_set:
        exp_dir = os.path.join(indir, 'encodings', 'test')
    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    vocab_size, vocab = vocabulary.get_vocab(indir)

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(data_tr,
                                   ut.model_param['batch_size'],
                                   shuffle=True,
                                   collate_fn=ehr_collate)
    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'],
                          sampling)
        data_generator_ts = DataLoader(data_ts,
                                       ut.model_param['batch_size'],
                                       shuffle=True,
                                       collate_fn=ehr_collate)
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    # define model and optimizer
    model = net.ehrEncoding(
        vocab_size=vocab_size,
        max_seq_len=ut.len_padded,  # 32
        emb_size=ut.model_param['embedding_size'],  # 100
        kernel_size=ut.model_param['kernel_size'],  # 5
        pre_embs=embs,
        vocab=vocab)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=ut.model_param['learning_rate'],
        weight_decay=ut.model_param['weight_decay'])

    # model.cuda()
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    # only train
    train_and_evaluate(model, data_generator_tr, data_generator_ts, loss_fn,
                       optimizer, net.metrics, exp_dir)

    # uncomment this out to train AND evaluate
    # (will take a really, really long time)
    # training and evaluation; the results of the best model are saved to
    # outdir/best_model.pt in this function
    '''
    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model, data_generator_tr, data_generator_ts, loss_fn, optimizer,
        net.metrics, exp_dir)

    # save encodings
    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae_avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # chop patient sequences into fixed subsequences of length L
    # (L = ut.len_padded = 32)
    # I think that this is here for the human-readable version of how the
    # patient records are subset
    outfile = os.path.join(exp_dir,
                           'cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
    write_ehr_subseq(data_generator_tr, outfile)

    if test_set:
        outfile = os.path.join(
            exp_dir, 'test_cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
        write_ehr_subseq(data_generator_ts, outfile)
    '''

    return
test_set = TestDataset(test_x_input, test_v_input, test_label)
val_set = TestDataset(val_x_input, val_v_input, val_label)

# sampler
train_sampler = WeightedSampler(
    train_v_input)  # Use weighted sampler instead of random sampler

# loader
train_loader = DataLoader(train_set,
                          batch_size=params.batch_size,
                          sampler=train_sampler,
                          num_workers=4)
test_loader = DataLoader(test_set,
                         batch_size=params.predict_batch,
                         sampler=RandomSampler(test_set),
                         num_workers=4)
val_loader = DataLoader(val_set,
                        batch_size=params.predict_batch,
                        sampler=RandomSampler(val_set),
                        num_workers=4)

optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)
loss_fn = net.loss_fn
restore_file = None
train_and_evaluate(model, train_loader, test_loader, val_loader, optimizer,
                   loss_fn, params, restore_file)
# break
# break
def learn_patient_representations(indir,
                                  outdir,
                                  disease_dt,
                                  eval_baseline=False,
                                  sampling=None,
                                  emb_filename=None):
    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(
        outdir, '-'.join([
            disease_dt,
            datetime.now().strftime('%Y-%m-%d-%H-%M-%S'),
            'w2v-nobn-softplus'
        ]))
    os.makedirs(exp_dir)

    # get the vocabulary size
    fvocab = os.path.join(indir, ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            tkn = r[0].split('::')
            tkn[1] = tkn[1].capitalize()
            vocab[int(r[1])] = '::'.join(tkn)
    vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data = EHRdata(indir, ut.dt_files['ehr'], sampling)
    data_generator = DataLoader(data,
                                ut.model_param['batch_size'],
                                shuffle=True,
                                collate_fn=ehr_collate)
    print('Cohort Size: {0} -- Max Sequence Length: {1}\n'.format(
        len(data), ut.len_padded))

    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))
    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=ut.model_param['learning_rate'],
        weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')
    # model.cuda()

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   net.metrics, exp_dir)

    # save results
    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'encoded_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerows(encoded)

    # MRNs to keep track of the order
    outfile = os.path.join(exp_dir, 'mrns.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        for m in mrn:
            wr.writerow([m])

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # evaluate clustering
    gt_file = os.path.join(indir, ut.dt_files['diseases'])
    gt_disease = clu.load_mrn_disease(gt_file)
    min_clu = 2
    max_clu = 10

    if eval_baseline:
        print('\nRunning clustering on the TF-IDF vectors')
        datafile = os.path.join(indir, ut.dt_files['ehr'])
        mrn_idx, svd_mtx = clu.svd_tfidf(datafile, vocab_size)
        gt_disease_raw = [gt_disease[m][0] for m in mrn_idx]
        clu.eval_hierarchical_clustering(svd_mtx, gt_disease_raw, min_clu,
                                         max_clu)

    print('\nRunning clustering on the encoded vectors')
    gt_disease_enc = [gt_disease[m][0] for m in mrn]
    clu.eval_hierarchical_clustering(encoded, gt_disease_enc, min_clu, max_clu,
                                     preproc=True)

    return
def learn_patient_representations(indir,
                                  test_set=False,
                                  sampling=None,
                                  emb_filename=None):
    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    fvocab = os.path.join(os.path.join(indir), ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            vocab[int(r[1])] = r[0]
    vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(data_tr,
                                   ut.model_param['batch_size'],
                                   shuffle=True,
                                   collate_fn=ehr_collate)
    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'],
                          sampling)
        data_generator_ts = DataLoader(data_ts,
                                       ut.model_param['batch_size'],
                                       shuffle=True,
                                       collate_fn=ehr_collate)
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))

    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))
    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=ut.model_param['learning_rate'],
        weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')
    # model.cuda()

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))
    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model, data_generator_tr, data_generator_ts, loss_fn, optimizer,
        net.metrics, exp_dir)

    # save results
    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae-avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # ehr subseq with age in days
    outfile = os.path.join(exp_dir,
                           'cohort-ehr-subseq{0}.csv'.format(ut.len_padded))
    with open(os.path.join(os.path.join(indir), 'cohort-ehrseq.csv')) as f:
        rd = csv.reader(f)
        next(rd)
        ehr = {}
        for r in rd:
            ehr.setdefault(r[0], list()).extend(r[1:])

    ehr_subseq = {}
    for list_m, batch in data_generator_tr:
        for b, m in zip(batch, list_m):
            if len(b) == 1:
                ehr_subseq[m] = b.tolist()
            else:
                seq = []
                for vec in b.tolist():
                    seq.extend(vec)
                nseq, nleft = divmod(len(seq), ut.len_padded)
                if nleft > 0:
                    seq = seq + [0] * (ut.len_padded - nleft)
                for i in range(0, len(seq) - ut.len_padded + 1,
                               ut.len_padded):
                    ehr_subseq.setdefault(m, list()).append(
                        seq[i:i + ut.len_padded])
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "EHRsubseq"])
        for m, subseq in ehr_subseq.items():
            for seq in subseq:
                wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    if test_set:
        outfile = os.path.join(
            exp_dir, 'cohort_test-ehr-subseq{0}.csv'.format(ut.len_padded))
        ehr_subseq = {}
        for list_m, batch in data_generator_ts:
            for b, m in zip(batch, list_m):
                if len(b) == 1:
                    ehr_subseq[m] = b.tolist()
                else:
                    seq = []
                    for vec in b.tolist():
                        seq.extend(vec)
                    nseq, nleft = divmod(len(seq), ut.len_padded)
                    if nleft > 0:
                        seq = seq + [0] * (ut.len_padded - nleft)
                    for i in range(0, len(seq) - ut.len_padded + 1,
                                   ut.len_padded):
                        ehr_subseq.setdefault(m, list()).append(
                            seq[i:i + ut.len_padded])
        with open(outfile, 'w') as f:
            wr = csv.writer(f)
            wr.writerow(["MRN", "EHRsubseq"])
            for m, subseq in ehr_subseq.items():
                for seq in subseq:
                    wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    return
        len(args.gpus), cpu_merge=False)
    model.compile(
        loss=args.loss,
        optimizer=keras.optimizers.Adadelta(),
        metrics=['accuracy'],
    )
    return model


# Model
try:
    model = load_model(SINGLE_MODEL_NAME)
    print("Model loaded from disk")
    create_model = False
except Exception:
    create_model = True

if create_model:
    print("Creating new single vgg model")
    model = compiled_single_model(input_shape)

train_and_evaluate(model,
                   args.epochs,
                   args.batches,
                   gpus=args.gpus,
                   plot_history=args.plot_history,
                   plot_model=args.plot_model)
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

if answers.get('choice') == 'Train model':
    train.train_and_evaluate(model, train_iterator, valid_iterator,
                             optimizer, criterion)
    test_loss, test_acc = train.evaluate(model, test_iterator, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.3f}%')

if answers.get('choice') == 'Evaluate model':
    model.load_state_dict(torch.load('ezmath-model_83.pt'))
    test_loss, test_acc = train.evaluate(model, test_iterator, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.3f}%')

if answers.get('choice') == 'Make Prediction':
    model.load_state_dict(torch.load('ezmath-model_83.pt'))
    nlp = spacy.load('it_core_news_sm')
    string = input("Please insert the exercise text: ")
    print('Making prediction for: ')
    print(string)
    pred_class = model.predict_class(string, nlp, dataset, device)
def generate_entry(model_name,
                   hyperparameters,
                   datasets=('low', 'medium', 'high'),
                   use_hierarchical_attention=False,
                   use_ptr_gen=True,
                   test_data='test',
                   write_hyperparameter=False,
                   output_folder=None,
                   resume=False):
    languages = get_languages()
    if output_folder is None:
        output_folder = os.path.join('output', model_name)
    if not resume:
        os.makedirs(output_folder)
    if write_hyperparameter:
        with open(os.path.join(output_folder, 'hyperparameters'), 'w',
                  encoding='utf8') as file:
            file.write(hyperparameters)

    for language in tqdm(sorted(languages)):
        for dataset in datasets:
            if resume and os.path.exists(
                    os.path.join(output_folder,
                                 '{}-{}-out'.format(language, dataset))):
                continue

            lr = hyperparameters['lr'][dataset]
            embedding_size = hyperparameters['embedding_size'][dataset]
            hidden_size = hyperparameters['hidden_size'][dataset]
            clip = hyperparameters['clip'][dataset]
            dropout_p = hyperparameters['dropout_p'][dataset]
            alpha = hyperparameters['alpha'][dataset]
            beta = hyperparameters['beta'][dataset]
            patience = hyperparameters['patience'][dataset]
            epochs_extension = hyperparameters['epochs_extension'][dataset]

            experiment_name = "{}_{}_{}_lr{}_em{}_hd_{}_clip{}_p{}_a{}_b_{}_{}".format(
                model_name, language, dataset, lr, embedding_size,
                hidden_size, str(clip), dropout_p, alpha, beta,
                int(time.time()))

            try:
                (model_inputs_train, model_inputs_val, labels_train,
                 labels_val, vocab) = package.data.load_data(
                     language,
                     dataset,
                     test_data=test_data,
                     use_external_val_data=True,
                     val_ratio=0.2,
                     random_state=42)
            except FileNotFoundError:
                continue

            model = package.net.Model(
                vocab,
                embedding_size=embedding_size,
                hidden_size=hidden_size,
                use_hierarchical_attention=use_hierarchical_attention,
                use_ptr_gen=use_ptr_gen,
                dropout_p=dropout_p).to(device)
            optimizer = optim.Adam(lr=lr, params=model.parameters())
            loss_fn = package.loss.Criterion(vocab, alpha, beta)

            writer = SummaryWriter('runs/' + experiment_name)
            model_save_dir = os.path.join('./saved_models', experiment_name)
            os.makedirs(model_save_dir)

            epochs = hyperparameters['epochs'][dataset]
            train_and_evaluate(model_inputs_train,
                               labels_train,
                               model_inputs_val,
                               labels_val,
                               model,
                               optimizer,
                               loss_fn,
                               epochs=epochs,
                               batch_size=32,
                               model_save_dir=model_save_dir,
                               show_progress=False,
                               writer=writer,
                               clip=clip)
            epochs_trained = epochs

            # Load best performing model on validation set
            best_state = torch.load(os.path.join(model_save_dir,
                                                 'best.model'))
            while epochs_trained - best_state['epoch_num'] < patience:
                train_and_evaluate(model_inputs_train,
                                   labels_train,
                                   model_inputs_val,
                                   labels_val,
                                   model,
                                   optimizer,
                                   loss_fn,
                                   epochs=epochs_extension,
                                   batch_size=32,
                                   model_save_dir=model_save_dir,
                                   show_progress=False,
                                   writer=writer,
                                   clip=clip,
                                   starting_epoch=epochs_trained + 1,
                                   initial_best_val_acc=best_state['val_acc'])
                epochs_trained += epochs_extension
                best_state = torch.load(
                    os.path.join(model_save_dir, 'best.model'))

            model.load_state_dict(best_state['model_state'])

            if test_data == 'dev':
                dev_file = os.path.join(TASK1_DATA_PATH,
                                        '{}-dev'.format(language))
                lemmas_test, tags_test, _ = read_dataset(dev_file)
            elif test_data == 'test':
                test_file = os.path.join(TASK1_DATA_PATH,
                                         '{}-covered-test'.format(language))
                lemmas_test, tags_test = read_covered_dataset(test_file)
            else:
                raise ValueError

            file_path = os.path.join(output_folder,
                                     '{}-{}-out'.format(language, dataset))
            generate_output(model, lemmas_test, tags_test, file_path)
def main(args):
    # Load the parameters from json file
    params_dir = args.params_dir
    json_path = os.path.join(params_dir, 'params.json')
    assert os.path.isfile(json_path), \
        "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(params.seed)
    if params.cuda:
        torch.cuda.manual_seed(params.seed)

    # Set the logger
    model_dir = args.output_dir
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    utils.set_logger(os.path.join(model_dir, 'train.log'))

    logging.info("************ Validation fold: {} ************".format(
        args.fold))

    # Create the input data pipeline
    logging.info("Loading the datasets...")
    config_dict = {
        'image_dir': os.path.join(args.input_dir, 'train'),
        'csv_path': os.path.join(args.input_dir, 'train.csv')
    }
    train_data = DataPreprocess(config_dict)
    df, target_cols, num_targets = (train_data.df, train_data.target_cols,
                                    train_data.num_targets)

    # check for debug mode
    if args.debug:
        params.num_epochs = 1
        df = df.sample(n=100, random_state=params.seed).reset_index(drop=True)

    # update params
    params.mode = args.mode
    params.num_targets = num_targets
    params.target_cols = target_cols

    # split data into folds and pass to the model
    Fold = GroupKFold(n_splits=params.num_folds)
    groups = df['PatientID'].values
    for n, (train_index, valid_index) in enumerate(
            Fold.split(df, df[params.target_cols], groups)):
        df.loc[valid_index, 'fold'] = int(n)
    df['fold'] = df['fold'].astype(int)

    # get training and validation data using folds
    train_df = df[df.fold != args.fold].reset_index(drop=True)
    valid_df = df[df.fold == args.fold].reset_index(drop=True)

    # get dataloaders
    train_dataloader = dataloader.fetch_dataloader(train_df, params,
                                                   data='train')
    valid_dataloader = dataloader.fetch_dataloader(valid_df, params,
                                                   data='valid')
    logging.info("- done.")

    # Define the model and optimizer
    model = RANZCRModel(params, pretrained=True).model
    if params.cuda:
        model = model.to(torch.device('cuda'))
    optimizer = optim.Adam(model.parameters(), lr=params.learning_rate,
                           amsgrad=False)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                               factor=0.1, patience=2,
                                               verbose=True)

    # fetch loss function and metrics
    loss_fn = nn.BCEWithLogitsLoss()
    metrics = models.metrics

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(
        params.num_epochs))
    train_and_evaluate(model, train_dataloader, valid_dataloader,
                       valid_df[params.target_cols].values, optimizer,
                       scheduler, loss_fn, metrics, params, model_dir)
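# The PyTorch examples above all delegate the epoch loop to a project-specific
# train_and_evaluate helper whose body is not shown in this listing. The sketch
# below is only for orientation: the signature, the checkpoint name 'best.pt',
# and the loop structure are assumptions, not the implementation used by any
# of the projects quoted above.
import os

import torch


def train_and_evaluate(model, train_loader, valid_loader, optimizer, loss_fn,
                       num_epochs, model_dir):
    """Minimal sketch: epoch loop with validation and best-checkpoint saving."""
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        # one pass over the training data
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = loss_fn(model(inputs), targets)
            loss.backward()
            optimizer.step()

        # one pass over the validation data, no gradients
        model.eval()
        val_loss, num_batches = 0.0, 0
        with torch.no_grad():
            for inputs, targets in valid_loader:
                val_loss += loss_fn(model(inputs), targets).item()
                num_batches += 1
        val_loss /= max(num_batches, 1)

        # keep the checkpoint with the lowest validation loss seen so far
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), os.path.join(model_dir, 'best.pt'))
    return best_val_loss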