def main():
    # read the output directory path from a text file
    with open("outdir.txt", 'r') as f:
        outdir = f.read().rstrip('\n')

    # create an experiment folder tied to date and time where to save output from the model
    experiment_folder = os.path.expanduser(
        '~/data1/stratification_ILRM/experiments/') + disease_folder + \
        '-'.join(map(str, list(datetime.now().timetuple()[:6])))
    os.makedirs(experiment_folder)

    # path to the experiment folder is saved in a txt file
    with open("experiment_folder.txt", 'w') as f:
        f.write(experiment_folder)

    # pass the size of the vocabulary to the model
    with open(os.path.join(outdir, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        next(rd)
        vocab_size = 1
        for r in rd:
            vocab_size += 1

    # set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # import data
    data = myData(outdir, ehr_file)
    data_generator = DataLoader(data, model_pars['batch_size'],
                                shuffle=True, collate_fn=my_collate)

    # define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L,
                            model_pars['embedding_dim'],
                            model_pars['kernel_size'])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    # start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   metrics, experiment_folder)

    # save encoded vectors, medical record number list (to keep track of the
    # order), and metrics (loss and accuracy)
    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        # for m, v in metrics_average.items():
        #     wr.writerow([m, v])
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
def main():
    # pass the size of the vocabulary to the model
    with open(os.path.join(data_folder, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        vocab_size = 0
        for r in rd:
            vocab_size += 1

    # set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # import data
    data = myData(data_folder, ehr_file)
    data_generator = DataLoader(data, model_pars['batch_size'],
                                shuffle=True, collate_fn=my_collate)

    # define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L,
                            model_pars['embedding_dim'],
                            model_pars['kernel_size'])
    # model = nn.DataParallel(model, device_ids=[1, 2, 3])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    # start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(model_pars['num_epochs']))
    # NOTE: experiment_folder is assumed to be defined elsewhere in this module
    # (e.g. read from experiment_folder.txt, as in the variant above).
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   experiment_folder, metrics)

    # with open(experiment_folder + '/TRencoded_vect.csv', 'w') as f:
    #     wr = csv.writer(f, delimiter=',')
    #     for e in encoded_tr:
    #         wr.writerow(e)
    # with open(experiment_folder + '/TRmrns.csv', 'w') as f:
    #     wr = csv.writer(f, delimiter=',')
    #     for m in mrn_tr:
    #         wr.writerow([m])
    # with open(experiment_folder + '/TRmetrics.txt', 'w') as f:
    #     wr = csv.writer(f, delimiter='\t')
    #     wr.writerow(["Mean loss:", loss_tr])

    # load and evaluate best model
    # print("Evaluating best model...")
    # best_saved = torch.load(experiment_folder + '/best_model.pt')
    # model.load_state_dict(best_saved['state_dict'])
    # mrn, encoded, metrics_avg = evaluate(model, loss_fn, data_generator,
    #                                      metrics, best_eval=True)

    # save encoded vectors, MRN list (to keep track of the order),
    # and metrics (loss and accuracy)
    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
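# The two main() variants above pass collate_fn=my_collate to the DataLoader,
# but my_collate is not defined in this excerpt. Below is a minimal,
# hypothetical sketch of such a collate function, assuming each dataset item
# is a (mrn, token_index_list) pair and that records are truncated or
# zero-padded to the fixed length L used by the model; the names and the item
# format are assumptions, not part of the original code.
def my_collate(batch, pad_idx=0, max_len=32):
    """Pad variable-length EHR sequences into a single LongTensor batch."""
    mrns, padded = [], []
    for mrn, seq in batch:
        seq = list(seq)[:max_len]                      # truncate long records
        seq = seq + [pad_idx] * (max_len - len(seq))   # pad short records
        mrns.append(mrn)
        padded.append(seq)
    return mrns, torch.tensor(padded, dtype=torch.long)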
def learn_patient_representations(indir,
                                  test_set=False,
                                  sampling=None,
                                  emb_filename=None):
    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    fvocab = os.path.join(indir, ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            vocab[int(r[1])] = r[0]
    vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(data_tr, ut.model_param['batch_size'],
                                   shuffle=True, collate_fn=ehr_collate)
    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'],
                          sampling)
        data_generator_ts = DataLoader(data_ts, ut.model_param['batch_size'],
                                       shuffle=True, collate_fn=ehr_collate)
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))

    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=ut.model_param['learning_rate'],
                                 weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model, data_generator_tr, data_generator_ts, loss_fn, optimizer,
        net.metrics, exp_dir)

    # save results

    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae-avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # ehr subsequences with age in days
    outfile = os.path.join(exp_dir,
                           'cohort-ehr-subseq{0}.csv'.format(ut.len_padded))
    with open(os.path.join(indir, 'cohort-ehrseq.csv')) as f:
        rd = csv.reader(f)
        next(rd)
        ehr = {}
        for r in rd:
            ehr.setdefault(r[0], list()).extend(r[1:])

    ehr_subseq = {}
    for list_m, batch in data_generator_tr:
        for b, m in zip(batch, list_m):
            if len(b) == 1:
                ehr_subseq[m] = b.tolist()
            else:
                seq = []
                for vec in b.tolist():
                    seq.extend(vec)
                nseq, nleft = divmod(len(seq), ut.len_padded)
                if nleft > 0:
                    seq = seq + [0] * (ut.len_padded - nleft)
                for i in range(0, len(seq) - ut.len_padded + 1, ut.len_padded):
                    ehr_subseq.setdefault(m, list()).append(
                        seq[i:i + ut.len_padded])

    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "EHRsubseq"])
        for m, subseq in ehr_subseq.items():
            for seq in subseq:
                wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    if test_set:
        outfile = os.path.join(
            exp_dir, 'cohort_test-ehr-subseq{0}.csv'.format(ut.len_padded))
        ehr_subseq = {}
        for list_m, batch in data_generator_ts:
            for b, m in zip(batch, list_m):
                if len(b) == 1:
                    ehr_subseq[m] = b.tolist()
                else:
                    seq = []
                    for vec in b.tolist():
                        seq.extend(vec)
                    nseq, nleft = divmod(len(seq), ut.len_padded)
                    if nleft > 0:
                        seq = seq + [0] * (ut.len_padded - nleft)
                    for i in range(0, len(seq) - ut.len_padded + 1,
                                   ut.len_padded):
                        ehr_subseq.setdefault(m, list()).append(
                            seq[i:i + ut.len_padded])

        with open(outfile, 'w') as f:
            wr = csv.writer(f)
            wr.writerow(["MRN", "EHRsubseq"])
            for m, subseq in ehr_subseq.items():
                for seq in subseq:
                    wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    return
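# Hypothetical usage sketch for the train/test variant above, showing how it
# might be invoked from a command-line entry point; the argparse flag names
# are illustrative assumptions and not part of the original module.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Learn patient representations from EHR sequences')
    parser.add_argument('indir', help='folder with the vocabulary and EHR files')
    parser.add_argument('--test_set', action='store_true',
                        help='also encode the held-out test cohort')
    parser.add_argument('--sampling', type=int, default=None,
                        help='optional subsampling of the cohort')
    parser.add_argument('--emb_filename', default=None,
                        help='pre-trained word2vec embeddings (optional)')
    args = parser.parse_args()

    learn_patient_representations(args.indir,
                                  test_set=args.test_set,
                                  sampling=args.sampling,
                                  emb_filename=args.emb_filename)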
torch.cuda.manual_seed(123)

# load data
data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'], sampling)
data_generator_ts = DataLoader(
    data_ts,
    ut.model_param['batch_size'],  # may need to reduce this
    shuffle=False,
    collate_fn=ehr_collate)
print("Test cohort size: {0}".format(len(data_ts)))

# define model and optimizer
model = net.ehrEncoding(
    vocab_size=vocab_size,
    max_seq_len=ut.len_padded,  # 32
    emb_size=ut.model_param['embedding_size'],  # 100
    kernel_size=ut.model_param['kernel_size'],  # 5
    pre_embs=embs,
    vocab=vocab)

optimizer = torch.optim.Adam(model.parameters(),
                             lr=ut.model_param['learning_rate'],
                             weight_decay=ut.model_param['weight_decay'])

model.cuda()
loss_fn = net.criterion

# use model from checkpoint
checkpoint = torch.load(os.path.join(indir, 'encodings', 'best_model.pt'))
model.load_state_dict(checkpoint['model_state_dict'])
def learn_patient_representations(indir,
                                  outdir,
                                  disease_dt,
                                  eval_baseline=False,
                                  sampling=None,
                                  emb_filename=None):
    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(
        outdir, '-'.join([
            disease_dt,
            datetime.now().strftime('%Y-%m-%d-%H-%M-%S'),
            'w2v-nobn-softplus'
        ]))
    os.makedirs(exp_dir)

    # get the vocabulary size
    fvocab = os.path.join(indir, ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            tkn = r[0].split('::')
            tkn[1] = tkn[1].capitalize()
            vocab[int(r[1])] = '::'.join(tkn)
    vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data = EHRdata(indir, ut.dt_files['ehr'], sampling)
    data_generator = DataLoader(data, ut.model_param['batch_size'],
                                shuffle=True, collate_fn=ehr_collate)
    print('Cohort Size: {0} -- Max Sequence Length: {1}\n'.format(
        len(data), ut.len_padded))

    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=ut.model_param['learning_rate'],
                                 weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   net.metrics, exp_dir)

    # save results

    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'encoded_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerows(encoded)

    # MRNs to keep track of the order
    outfile = os.path.join(exp_dir, 'mrns.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        for m in mrn:
            wr.writerow([m])

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # evaluate clustering
    gt_file = os.path.join(indir, ut.dt_files['diseases'])
    gt_disease = clu.load_mrn_disease(gt_file)
    min_clu = 2
    max_clu = 10

    if eval_baseline:
        print('\nRunning clustering on the TF-IDF vectors')
        datafile = os.path.join(indir, ut.dt_files['ehr'])
        mrn_idx, svd_mtx = clu.svd_tfidf(datafile, vocab_size)
        gt_disease_raw = [gt_disease[m][0] for m in mrn_idx]
        clu.eval_hierarchical_clustering(svd_mtx, gt_disease_raw,
                                         min_clu, max_clu)

    print('\nRunning clustering on the encoded vectors')
    gt_disease_enc = [gt_disease[m][0] for m in mrn]
    clu.eval_hierarchical_clustering(encoded, gt_disease_enc,
                                     min_clu, max_clu, preproc=True)

    return
def learn_patient_representations(indir,
                                  test_set=False,
                                  sampling=None,
                                  emb_filename=None):
    # encodings folder to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    if test_set:
        exp_dir = os.path.join(indir, 'encodings', 'test')
    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    vocab_size, vocab = vocabulary.get_vocab(indir)

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(
        data_tr,
        ut.model_param['batch_size'],
        shuffle=True,
        collate_fn=ehr_collate
    )
    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'],
                          sampling)
        data_generator_ts = DataLoader(
            data_ts,
            ut.model_param['batch_size'],
            shuffle=True,
            collate_fn=ehr_collate
        )
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    # define model and optimizer
    model = net.ehrEncoding(
        vocab_size=vocab_size,
        max_seq_len=ut.len_padded,  # 32
        emb_size=ut.model_param['embedding_size'],  # 100
        kernel_size=ut.model_param['kernel_size'],  # 5
        pre_embs=embs,
        vocab=vocab
    )

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=ut.model_param['learning_rate'],
        weight_decay=ut.model_param['weight_decay']
    )

    # model.cuda()
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    # only train
    train_and_evaluate(
        model, data_generator_tr, data_generator_ts, loss_fn, optimizer,
        net.metrics, exp_dir
    )

    # Uncomment the block below to train AND evaluate (this can take a very
    # long time). Results of the best model are saved to outdir/best_model.pt
    # inside train_and_evaluate.
    '''
    # training and evaluation
    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model, data_generator_tr, data_generator_ts, loss_fn, optimizer,
        net.metrics, exp_dir
    )

    # save encodings
    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae_avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # chop patient sequences into fixed subsequences of length
    # L = ut.len_padded = 32; this appears to produce a human-readable
    # version of how the patient records are split.
    outfile = os.path.join(exp_dir,
                           'cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
    write_ehr_subseq(data_generator_tr, outfile)

    if test_set:
        outfile = os.path.join(
            exp_dir, 'test_cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
        write_ehr_subseq(data_generator_ts, outfile)
    '''

    return
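# The commented-out block above calls write_ehr_subseq, which is not defined
# in this excerpt. Below is a possible sketch of that helper, reconstructed
# from the inline subsequence-chopping logic in the earlier variant: it splits
# each record into chunks of ut.len_padded, zero-pads the last chunk, and
# drops the padding when writing. Treat it as an assumption, not the original
# implementation; it relies on the module-level csv and ut imports used above.
def write_ehr_subseq(data_generator, outfile):
    """Write patient records split into fixed-length subsequences."""
    ehr_subseq = {}
    for list_m, batch in data_generator:
        for b, m in zip(batch, list_m):
            if len(b) == 1:
                ehr_subseq[m] = b.tolist()
                continue
            seq = []
            for vec in b.tolist():
                seq.extend(vec)
            _, nleft = divmod(len(seq), ut.len_padded)
            if nleft > 0:
                seq = seq + [0] * (ut.len_padded - nleft)
            for i in range(0, len(seq) - ut.len_padded + 1, ut.len_padded):
                ehr_subseq.setdefault(m, list()).append(
                    seq[i:i + ut.len_padded])

    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "EHRsubseq"])
        for m, subseq in ehr_subseq.items():
            for seq in subseq:
                # drop the zero padding to keep the output human-readable
                wr.writerow([m] + [t for t in seq if t != 0])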