def on_epoch_end(self, epoch, logs=None):
    yhat_raw = model.predict_generator(gen_slim_test, steps=test_samples)
    yhat = np.round(yhat_raw)
    #get metrics
    k = 5
    metrics = evaluation.all_metrics(yhat=yhat, y=test_y, k=k, yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
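# A sketch of how the hook above might be wired in; EvalCallback, gen_slim_train,
# and the fit call are illustrative assumptions, not part of the original code
# (evaluation is the repo's module, assumed importable). Holding the generator
# and labels on the callback and using self.model, instead of module globals,
# is the idiomatic tf.keras pattern; predict_generator is the legacy API and
# model.predict(generator, steps=...) plays the same role in tf.keras 2.x.
import numpy as np
from tensorflow import keras

class EvalCallback(keras.callbacks.Callback):
    def __init__(self, gen_test, steps, test_y):
        super().__init__()
        self.gen_test, self.steps, self.test_y = gen_test, steps, test_y

    def on_epoch_end(self, epoch, logs=None):
        #self.model is populated by Keras when the callback is registered
        yhat_raw = self.model.predict(self.gen_test, steps=self.steps)
        yhat = np.round(yhat_raw)
        metrics = evaluation.all_metrics(yhat=yhat, y=self.test_y, k=5, yhat_raw=yhat_raw)
        evaluation.print_metrics(metrics)

#model.fit(gen_slim_train, epochs=epochs, callbacks=[EvalCallback(gen_slim_test, test_samples, test_y)])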
def test(model, Y, epoch, data_path, fold, gpu, version, code_inds, dicts,
         samples, model_dir, testing):
    """
    Testing loop.
    Returns metrics
    """
    filename = data_path.replace('train', fold)
    print('file for evaluation: %s' % filename)
    num_labels = len(dicts['ind2c'])

    #initialize stuff for saving attention samples
    if samples:
        tp_file = open('%s/tp_%s_examples_%d.txt' % (model_dir, fold, epoch), 'w')
        fp_file = open('%s/fp_%s_examples_%d.txt' % (model_dir, fold, epoch), 'w')
        window_size = model.conv.weight.data.size()[2]

    y, yhat, yhat_raw, hids, losses = [], [], [], [], []
    ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']

    desc_embed = model.lmbda > 0
    if desc_embed and len(code_inds) > 0:
        unseen_code_vecs(model, code_inds, dicts, gpu)

    model.eval()
    gen = datasets.data_generator(filename, dicts, 1, num_labels, version=version, desc_embed=desc_embed)
    for batch_idx, tup in tqdm(enumerate(gen)):
        data, target, hadm_ids, _, descs = tup
        data, target = torch.LongTensor(data), torch.FloatTensor(target)
        if gpu:
            data = data.cuda()
            target = target.cuda()
        model.zero_grad()

        desc_data = descs if desc_embed else None

        #get an attention sample for 2% of batches
        get_attn = samples and (np.random.rand() < 0.02 or (fold == 'test' and testing))

        #inference only: disable autograd (replaces the deprecated Variable(..., volatile=True))
        with torch.no_grad():
            output, loss, alpha = model(data, target, desc_data=desc_data, get_attention=get_attn)

        output = torch.sigmoid(output)  #F.sigmoid is deprecated
        output = output.data.cpu().numpy()
        losses.append(loss.item())
        target_data = target.data.cpu().numpy()

        if get_attn and samples:
            interpret.save_samples(data, output, target_data, alpha, window_size, epoch, tp_file, fp_file, dicts=dicts)

        #save predictions, target, hadm ids
        yhat_raw.append(output)
        output = np.round(output)
        y.append(target_data)
        yhat.append(output)
        hids.extend(hadm_ids)

    #close files if needed
    if samples:
        tp_file.close()
        fp_file.close()

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)

    #write the predictions
    preds_file = persistence.write_preds(yhat, model_dir, hids, fold, ind2c, yhat_raw)

    #get metrics
    k = 5 if num_labels == 50 else [8, 15]
    metrics = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
    metrics['loss_%s' % fold] = np.mean(losses)
    return metrics
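# evaluation.all_metrics is the repo's scoring entry point; its internals are
# not shown in this file. The sketch below is not that implementation; it only
# illustrates the two metric families the call sites rely on (threshold-based
# micro-F1 and ranking-based precision@k) in plain numpy.
import numpy as np

def micro_f1_sketch(yhat, y):
    #micro-averaged F1: pool true positives over all labels and instances
    tp = float((yhat * y).sum())
    prec = tp / max(yhat.sum(), 1e-10)
    rec = tp / max(y.sum(), 1e-10)
    return 2 * prec * rec / max(prec + rec, 1e-10)

def precision_at_k_sketch(yhat_raw, y, k):
    #mean fraction of each instance's k highest-scored codes that are gold codes
    topk = np.argsort(yhat_raw)[:, ::-1][:, :k]
    return np.mean([y[i, topk[i]].mean() for i in range(y.shape[0])])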
def test(model, epoch, batch_size, data_path, fold, gpu, dicts, samples, model_dir, testing, debug):
    """
    Testing loop.
    Returns metrics
    """
    filename = data_path.replace('train', fold)
    print('file for evaluation: %s' % filename)
    # num_labels = tools.get_num_labels(Y, version)

    #initialize stuff for saving attention samples
    if samples:
        tp_file = open('%s/tp_%s_examples_%d.txt' % (model_dir, fold, epoch), 'w')
        fp_file = open('%s/fp_%s_examples_%d.txt' % (model_dir, fold, epoch), 'w')
        window_size = model.conv.weight.data.size()[2]

    y, yhat, yhat_raw, hids, losses = [], [], [], [], []
    # ind2w, w2ind, ind2c, c2ind = dicts[0], dicts[1], dicts[2], dicts[3]
    ind2w, w2ind = dicts[0], dicts[1]

    # desc_embed = model.lmbda > 0
    # if desc_embed and len(code_inds) > 0:
    #     unseen_code_vecs(model, code_inds, dicts)

    model.eval()
    gen = datasets.data_generator(filename, dicts, batch_size)
    for batch_idx, tup in tqdm(enumerate(gen)):
        if debug and batch_idx > 50:
            break
        # data, target, hadm_ids, _, descs = tup
        data, target, hadm_ids = tup
        data, target = torch.LongTensor(data), torch.FloatTensor(target)
        if gpu:
            data = data.cuda()
            target = target.cuda()
        model.zero_grad()

        # if desc_embed:
        #     desc_data = descs
        # else:
        #     desc_data = None

        # get_attn = samples and (np.random.rand() < 0.02 or (fold == 'test' and testing))
        # output, loss, alpha = model(data, target, desc_data=desc_data, get_attention=get_attn)
        #inference only: disable autograd (replaces the deprecated Variable(..., volatile=True))
        with torch.no_grad():
            output, loss, alpha = model(data, target)

        output = output.data.cpu().numpy()
        losses.append(loss.item())  #loss.data[0] no longer works on 0-dim tensors
        target_data = target.data.cpu().numpy()

        # if get_attn and samples:
        #     interpret.save_samples(data, output, target_data, alpha, window_size, epoch, tp_file, fp_file, freq_params[0], dicts=dicts)

        #save predictions, target, hadm ids
        yhat_raw.append(output) # NEED TO KNOW FORM OF OUTPUT
        output = np.round(output)
        y.append(target_data)
        yhat.append(output)
        hids.extend(hadm_ids)

    if samples:
        tp_file.close()
        fp_file.close()

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)
    print("y shape: " + str(y.shape))
    print("yhat shape: " + str(yhat.shape))

    #write the predictions
    # preds_file = persistence.write_preds(yhat, model_dir, hids, fold, ind2c, yhat_raw)
    preds_file = persistence.write_preds(yhat, model_dir, hids, fold, yhat_raw)

    #get metrics
    # k = 5 if num_labels == 50 else 8
    # metrics = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw)
    metrics = evaluation.all_metrics(yhat, y, yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
    metrics['loss_%s' % fold] = np.mean(losses)
    return metrics
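# Both test() loops above consume datasets.data_generator as an iterator of
# (data, target, hadm_ids) batches. A minimal sketch of such a generator,
# assuming a CSV with HADM_ID, TEXT, and semicolon-delimited LABELS columns;
# the column names and zero-padding scheme are assumptions, not the repo's code.
import csv
import numpy as np

def data_generator_sketch(filename, w2ind, c2ind, batch_size):
    batch, labels, hadm_ids = [], [], []
    with open(filename) as f:
        for row in csv.DictReader(f):
            tokens = [w2ind.get(w, 0) for w in row['TEXT'].split()]
            target = np.zeros(len(c2ind))
            for code in row['LABELS'].split(';'):
                if code in c2ind:
                    target[c2ind[code]] = 1
            batch.append(tokens)
            labels.append(target)
            hadm_ids.append(row['HADM_ID'])
            if len(batch) == batch_size:
                #pad every instance in the batch to the longest one with 0s
                maxlen = max(len(t) for t in batch)
                data = np.array([t + [0] * (maxlen - len(t)) for t in batch])
                yield data, np.array(labels), hadm_ids
                batch, labels, hadm_ids = [], [], []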
def main(Y, train_fname, dev_fname, vocab_file, version, n):
    n = int(n)

    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #get lookups from non-BOW data
    data_path = train_fname.replace('_bows', '') if "_bows" in train_fname else train_fname
    dicts = datasets.load_lookups(data_path, vocab_file=vocab_file, Y=Y, version=version)
    w2ind, ind2c, c2ind = dicts['w2ind'], dicts['ind2c'], dicts['c2ind']

    X, yy_tr, hids_tr = read_bows(Y, train_fname, c2ind, version)
    X_dv, yy_dv, hids_dv = read_bows(Y, dev_fname, c2ind, version)

    print("X.shape: " + str(X.shape))
    print("yy_tr.shape: " + str(yy_tr.shape))
    print("X_dv.shape: " + str(X_dv.shape))
    print("yy_dv.shape: " + str(yy_dv.shape))

    #deal with labels that don't have any positive examples
    #drop empty columns from yy. keep track of which columns kept
    #predict on test data with those columns. guess 0 on the others
    labels_with_examples = yy_tr.sum(axis=0).nonzero()[0]
    yy = yy_tr[:, labels_with_examples]

    # build the classifier
    clf = OneVsRestClassifier(LogisticRegression(C=C, max_iter=MAX_ITER, solver='sag'), n_jobs=-1)

    # train
    print("training...")
    clf.fit(X, yy)

    #predict
    print("predicting...")
    yhat = clf.predict(X_dv)
    yhat_raw = clf.predict_proba(X_dv)

    #deal with labels that don't have positive training examples
    print("reshaping output to deal with labels missing from train set")
    labels_with_examples = set(labels_with_examples)
    yhat_full = np.zeros(yy_dv.shape)
    yhat_full_raw = np.zeros(yy_dv.shape)
    j = 0
    for i in range(yhat_full.shape[1]):
        if i in labels_with_examples:
            yhat_full[:, i] = yhat[:, j]
            yhat_full_raw[:, i] = yhat_raw[:, j]
            j += 1

    #evaluate
    metrics, fpr, tpr = evaluation.all_metrics(yhat_full, yy_dv, k=[8, 15], yhat_raw=yhat_full_raw)
    evaluation.print_metrics(metrics)

    #save metric history, model, params
    print("saving predictions")
    model_dir = os.path.join(MODEL_DIR, '_'.join(["log_reg", time.strftime('%b_%d_%H:%M', time.localtime())]))
    os.mkdir(model_dir)
    preds_file = tools.write_preds(yhat_full, model_dir, hids_dv, 'test', yhat_full_raw)

    print("sanity check on train")
    yhat_tr = clf.predict(X)
    yhat_tr_raw = clf.predict_proba(X)

    #reshape output again
    yhat_tr_full = np.zeros(yy_tr.shape)
    yhat_tr_full_raw = np.zeros(yy_tr.shape)
    j = 0
    for i in range(yhat_tr_full.shape[1]):
        if i in labels_with_examples:
            yhat_tr_full[:, i] = yhat_tr[:, j]
            yhat_tr_full_raw[:, i] = yhat_tr_raw[:, j]
            j += 1

    #evaluate
    metrics_tr, fpr_tr, tpr_tr = evaluation.all_metrics(yhat_tr_full, yy_tr, k=[8, 15], yhat_raw=yhat_tr_full_raw)
    evaluation.print_metrics(metrics_tr)

    if n > 0:
        print("generating top important ngrams")
        if 'bows' in dev_fname:
            dev_fname = dev_fname.replace('_bows', '')
        print("calculating top ngrams using file %s" % dev_fname)
        calculate_top_ngrams(dev_fname, clf, c2ind, w2ind, labels_with_examples, n)

    #Commenting this out because the models are huge (11G for mimic3 full)
    #print("saving model")
    #with open("%s/model.pkl" % model_dir, 'wb') as f:
    #    pickle.dump(clf, f)

    print("saving metrics")
    metrics_hist = defaultdict(lambda: [])
    metrics_hist_tr = defaultdict(lambda: [])
    for name in metrics.keys():
        metrics_hist[name].append(metrics[name])
    for name in metrics_tr.keys():
        metrics_hist_tr[name].append(metrics_tr[name])
    #dev metrics fill the test slot too, matching save_metrics' (dev, test, train) layout
    metrics_hist_all = (metrics_hist, metrics_hist, metrics_hist_tr)
    persistence.save_metrics(metrics_hist_all, model_dir)
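# The two column-expansion loops above can be collapsed into one vectorized
# assignment; a sketch, assuming kept_cols is the sorted index array returned
# by nonzero() (before labels_with_examples is cast to a set):
import numpy as np

def expand_columns_sketch(yhat_reduced, kept_cols, num_labels):
    #write kept columns back at their original indices; dropped labels
    #(no positive training examples) stay all-zero, i.e. always predict 0
    full = np.zeros((yhat_reduced.shape[0], num_labels))
    full[:, np.asarray(kept_cols)] = yhat_reduced
    return full

#yhat_full = expand_columns_sketch(yhat, sorted(labels_with_examples), yy_dv.shape[1])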
with open('%s/code_scores_test.json' % model_dir, 'r') as f:
    scors = json.load(f)

hadm_ids = sorted(set(golds.keys()).intersection(set(preds.keys())))
yhat = np.zeros((len(hadm_ids), num_labels))
yhat_raw = np.zeros((len(hadm_ids), num_labels))
y = np.zeros((len(hadm_ids), num_labels))
for i, hadm_id in tqdm(enumerate(hadm_ids)):
    yhat_inds = [1 if j in preds[hadm_id] else 0 for j in range(num_labels)]
    yhat_raw_inds = [scors[hadm_id][ind2c[j]] if ind2c[j] in scors[hadm_id] else 0 for j in range(num_labels)]
    gold_inds = [1 if j in golds[hadm_id] else 0 for j in range(num_labels)]
    yhat[i] = yhat_inds
    yhat_raw[i] = yhat_raw_inds
    y[i] = gold_inds

metrics = evaluation.all_metrics(yhat, y)
k = 5 if Y == 50 else 8
prec_at_k = evaluation.precision_at_k(yhat_raw, y, k)
metrics['prec_at_%d' % k] = prec_at_k
rec_at_k = evaluation.recall_at_k(yhat_raw, y, k)
metrics['rec_at_%d' % k] = rec_at_k
evaluation.print_metrics(metrics)
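# The block above assumes code_scores_test.json maps each hadm_id to a
# {code: score} dict, e.g. {"100001": {"401.9": 0.92, "038.9": 0.15}}
# (illustrative values). A sketch of recall@k to mirror the precision_at_k
# call; the body is an assumption, not the repo's implementation.
import numpy as np

def recall_at_k_sketch(yhat_raw, y, k):
    #mean fraction of each instance's gold codes recovered in its top-k scores
    topk = np.argsort(yhat_raw)[:, ::-1][:, :k]
    return np.mean([y[i, topk[i]].sum() / max(y[i].sum(), 1e-10) for i in range(y.shape[0])])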
yhat = np.zeros((len(hadm_ids), num_labels))  #assumed initialization, mirroring y below
if have_scores:
    yhat_raw = np.zeros((len(hadm_ids), num_labels))
else:
    yhat_raw = None
y = np.zeros((len(hadm_ids), num_labels))

print("reformatting predictions")
for i, hadm_id in tqdm(enumerate(hadm_ids)):
    yhat_inds = [1 if j in preds[hadm_id] else 0 for j in range(num_labels)]
    gold_inds = [1 if j in golds[hadm_id] else 0 for j in range(num_labels)]
    yhat[i] = yhat_inds
    y[i] = gold_inds
    if have_scores:
        yhat_raw_inds = [scors[hadm_id][ind2c[j]] if ind2c[j] in scors[hadm_id] else 0 for j in range(num_labels)]
        yhat_raw[i] = yhat_raw_inds

if version == "mimic3" and Y == "full":
    print("evaluating code-type metrics")
    diag_preds, diag_golds, proc_preds, proc_golds, golds, preds, hadm_ids, type_dicts = evaluation.results_by_type(Y, model_dir, version)
    f1_diag = evaluation.diag_f1(diag_preds, diag_golds, type_dicts[0], hadm_ids)
    f1_proc = evaluation.proc_f1(proc_preds, proc_golds, type_dicts[1], hadm_ids)
    print("[BY CODE TYPE] f1-diag f1-proc")
    print("%.4f %.4f" % (f1_diag, f1_proc))

#Y is a string here (cf. the "full" comparison above), so compare against '50'
k = [5] if Y == '50' else [8, 15]
print("evaluating all other metrics")
metrics = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw)
evaluation.print_metrics(metrics)
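# evaluation.results_by_type splits predictions into diagnosis and procedure
# codes before scoring. A heuristic sketch of that split for period-formatted
# MIMIC-III ICD-9 strings (procedures carry two digits before the period,
# diagnoses three or a V/E prefix); an assumption for illustration, not the
# repo's implementation.
def split_by_type_sketch(codes):
    diags, procs = set(), set()
    for code in codes:
        #e.g. '401.9' is a diagnosis, '38.93' a procedure
        if '.' in code and len(code.split('.')[0]) == 2:
            procs.add(code)
        else:
            diags.add(code)
    return diags, procs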
def test(model, Y, epoch, dataset, batch_size, embed_desc, fold, gpu, dicts, model_dir):
    """
    Testing loop.
    Returns metrics
    """
    print('file for evaluation: %s' % fold)

    docs, attention, y, yhat, yhat_raw, hids, losses = [], [], [], [], [], [], []
    y_coarse, yhat_coarse, yhat_coarse_raw = [], [], []
    ind2w, w2ind, ind2c, c2ind, desc = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind'], dicts['desc']

    model.eval()
    gen = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate)

    desc_data = desc
    if embed_desc and gpu:
        desc_data = desc_data.cuda()

    t = tqdm(gen, total=len(gen), ncols=0, file=sys.stdout)
    for batch_idx, tup in enumerate(t):
        data, target, target_coarse, hadm_ids, data_text = tup
        if gpu:
            data, target, target_coarse = data.cuda(), target.cuda(), target_coarse.cuda()
        model.zero_grad()

        if model.hier:
            output, loss, alpha = model(data, target, target_coarse, desc_data=desc_data)
            output, output_coarse = output
            output_coarse = output_coarse.data.cpu().numpy()
            alpha, alpha_coarse = alpha
        else:
            output, loss, alpha = model(data, target, desc_data=desc_data)
            #roll fine predictions up to coarse codes (the part before the period)
            output_coarse = np.zeros([len(output), len(dicts['ind2c_coarse'])])
            for i, y_hat_raw_ in enumerate(output.data.cpu().numpy()):
                #np.nonzero returns a tuple of index arrays; check the first one
                if len(np.nonzero(np.round(y_hat_raw_))[0]) == 0:
                    continue
                codes = [str(dicts['ind2c'][ind]) for ind in np.nonzero(np.round(y_hat_raw_))[0]]
                codes_coarse = set(str(code).split('.')[0] for code in codes)
                codes_coarse_idx = [dicts['c2ind_coarse'][code_coarse] for code_coarse in codes_coarse]
                output_coarse[i, codes_coarse_idx] = 1

        target_coarse_data = target_coarse.data.cpu().numpy()
        y_coarse.append(target_coarse_data)
        yhat_coarse_raw.append(output_coarse)
        yhat_coarse.append(np.round(output_coarse))

        losses.append(loss.item())
        target_data = target.data.cpu().numpy()
        del data, loss

        #if fold == 'test':
        #    #alpha, _ = torch.max(torch.round(output).unsqueeze(-1).expand_as(alpha) * alpha, 1)
        #    #alpha = (torch.round(output).byte() | target.byte()).unsqueeze(-1).expand_as(alpha).type('torch.cuda.FloatTensor') * alpha
        #    alpha = [a for a in [a_m for a_m in alpha.data.cpu().numpy()]]
        #else:
        #    alpha = []

        del target
        output = output.data.cpu().numpy()

        #save predictions, target, hadm ids
        yhat_raw.append(output)
        yhat.append(np.round(output))
        y.append(target_data)
        hids.extend(hadm_ids)
        docs.extend(data_text)
        attention.extend(alpha[:, [dicts['c2ind'][c] for c in persistence.get_codes()]].cpu())
        t.set_postfix(loss=np.mean(losses))

    k = 5 if len(ind2c) == 50 else [8, 15]

    y_coarse = np.concatenate(y_coarse, axis=0)
    yhat_coarse = np.concatenate(yhat_coarse, axis=0)
    yhat_coarse_raw = np.concatenate(yhat_coarse_raw, axis=0)
    metrics_coarse, _, _ = evaluation.all_metrics(yhat_coarse, y_coarse, k=k, yhat_raw=yhat_coarse_raw, level='coarse')
    evaluation.print_metrics(metrics_coarse, level='coarse')

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)

    #get metrics
    metrics, metrics_codes, metrics_inst = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw, level='fine')
    evaluation.print_metrics(metrics, level='fine')
    metrics['loss'] = np.mean(losses)
    metrics.update(metrics_coarse)

    #write the predictions
    if fold == 'test':
        persistence.write_preds(hids, docs, attention, y, yhat, yhat_raw, metrics_inst, model_dir, fold, ind2c, c2ind, dicts['desc_plain'])

    return metrics, metrics_codes, metrics_inst, hids
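# The coarse label space in the loop above groups ICD-9 codes by the token
# before the period (e.g. '428.0' and '428.22' both roll up to '428'). A
# sketch of how the coarse lookups could be derived from the fine ones; the
# function name is an assumption, the dict shapes mirror dicts['ind2c'] etc.
def build_coarse_lookups_sketch(ind2c):
    coarse_codes = sorted(set(str(code).split('.')[0] for code in ind2c.values()))
    ind2c_coarse = dict(enumerate(coarse_codes))
    c2ind_coarse = {c: i for i, c in enumerate(coarse_codes)}
    return ind2c_coarse, c2ind_coarse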
def test(args, model, Y, epoch, data_path, fold, gpu, version, code_inds, dicts,
         samples, model_dir, testing):
    """
    Testing loop.
    Returns metrics
    """
    filename = data_path.replace('train', fold)
    print('file for evaluation: %s' % filename)
    num_labels = len(dicts['ind2c'])

    #initialize stuff for saving attention samples
    if samples:
        tp_file = open('%s/tp_%s_examples_%d.txt' % (model_dir, fold, epoch), 'w')
        fp_file = open('%s/fp_%s_examples_%d.txt' % (model_dir, fold, epoch), 'w')
        window_size = model.conv.weight.data.size()[2]

    y, yhat, yhat_raw, hids, losses = [], [], [], [], []
    ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']

    desc_embed = model.lmbda > 0
    if desc_embed and len(code_inds) > 0:
        unseen_code_vecs(model, code_inds, dicts, gpu)

    #pick the tokenizer that matches the pretrained weights
    if args.model == 'bert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained('./pretrained_weights/bert-base-uncased-vocab.txt', do_lower_case=True)
    elif args.model == 'biobert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, do_lower_case=False)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained('./pretrained_weights/biobert_pretrain_output_all_notes_150000/vocab.txt', do_lower_case=False)
    elif args.model == 'bert-tiny':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained('./pretrained_weights/bert-tiny-uncased-vocab.txt', do_lower_case=True)
    else:
        bert_tokenizer = None

    model.eval()
    gen = datasets.data_generator(filename, dicts, 1, num_labels, version=version, desc_embed=desc_embed,
                                  bert_tokenizer=bert_tokenizer, test=True, max_seq_length=args.max_sequence_length)
    for batch_idx, tup in tqdm(enumerate(gen)):
        data, target, hadm_ids, _, descs = tup
        data, target = torch.LongTensor(data), torch.FloatTensor(target)
        if gpu:
            data = data.cuda()
            target = target.cuda()

        desc_data = descs if desc_embed else None
        #computed up front so the attention-sample path below can use it
        target_data = target.data.cpu().numpy()

        if args.model in ['bert', 'biobert', 'bert-tiny']:
            #single-segment inputs: zero token types, mask out padding
            token_type_ids = (data > 0).long() * 0
            attention_mask = (data > 0).long()
            position_ids = torch.arange(data.size(1)).expand(data.size(0), data.size(1))
            if gpu:
                position_ids = position_ids.cuda()
            position_ids = position_ids * (data > 0).long()
        else:
            attention_mask = (data > 0).long()
            token_type_ids = None
            position_ids = None

        if args.model in BERT_MODEL_LIST:
            with torch.no_grad():
                output, loss = model(input_ids=data,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask,
                                     position_ids=position_ids,
                                     labels=target,
                                     desc_data=desc_data,
                                     pos_labels=None)
            output = torch.sigmoid(output)
            output = output.data.cpu().numpy()
        else:
            #get an attention sample for 2% of batches (decided before the forward pass)
            get_attn = samples and (np.random.rand() < 0.02 or (fold == 'test' and testing))
            with torch.no_grad():
                output, loss, alpha = model(data, target, desc_data=desc_data, get_attention=get_attn)
            output = torch.sigmoid(output)
            output = output.data.cpu().numpy()
            if get_attn and samples:
                interpret.save_samples(data, output, target_data, alpha, window_size, epoch, tp_file, fp_file, dicts=dicts)

        losses.append(loss.item())

        #save predictions, target, hadm ids
        yhat_raw.append(output)
        output = np.round(output)
        y.append(target_data)
        yhat.append(output)
        hids.extend(hadm_ids)

    # close files if needed
    if samples:
        tp_file.close()
        fp_file.close()

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)

    #write the predictions
    preds_file = persistence.write_preds(yhat, model_dir, hids, fold, ind2c, yhat_raw)

    #get metrics
    k = 5 if num_labels == 50 else [8, 15]
    metrics = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
    metrics['loss_%s' % fold] = np.mean(losses)
    return metrics
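# A sketch of how a note could be turned into the input ids consumed above,
# using only BertTokenizer methods that exist in both pytorch-pretrained-bert
# and transformers; the max length and [PAD]=0 padding are assumptions chosen
# so that the (data > 0) masks in the loop zero out padding positions.
def encode_note_sketch(bert_tokenizer, text, max_seq_length=512):
    tokens = ['[CLS]'] + bert_tokenizer.tokenize(text)[:max_seq_length - 2] + ['[SEP]']
    input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
    input_ids += [0] * (max_seq_length - len(input_ids))  #0 is BERT's [PAD] id
    return input_ids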