def main():
    """Bootstrap summary statistics for decompensation test-set metrics.

    Reads the predictions CSV given on the command line and the test
    listfile, left-joins them on ``(stay, period_length)``, and checks that
    every listfile row received a prediction and that both files agree on
    ``y_true``.  AUROC, AUPRC and min(+P, Se) are computed once on the full
    set and once per bootstrap resample (``--n_iters`` resamples, each the
    same size as the data).  For every metric the point estimate plus the
    mean, median, standard deviation and 2.5% / 97.5% percentiles of the
    resampled scores are written to ``--save_file`` as JSON and printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default='../data/decompensation/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='decomp_results.json')
    args = parser.parse_args()

    # period_length is read as float32 in both files so the join keys compare equal.
    pred_df = pd.read_csv(args.prediction, index_col=False,
                          dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False,
                          dtype={'period_length': np.float32})

    df = test_df.merge(pred_df,
                       left_on=['stay', 'period_length'],
                       right_on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    # (display name used in the JSON, key in the dict returned by print_metrics_binary)
    metrics = [('AUC of ROC', 'auroc'),
               ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    # Column 0 holds the prediction, column 1 the label, so that one
    # resample call keeps each prediction paired with its label.
    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    runs = dict()
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        runs[m] = []

    for _ in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            runs[m].append(ret[k])

    # Only the summary statistics are stored; the per-run scores are dropped.
    for (m, k) in metrics:
        results[m]['mean'] = np.mean(runs[m])
        results[m]['median'] = np.median(runs[m])
        results[m]['std'] = np.std(runs[m])
        results[m]['2.5% percentile'] = np.percentile(runs[m], 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs[m], 97.5)

    # Single-argument print(...) calls parse on Python 3 and print the same
    # text on Python 2, unlike the print statement.
    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results)
def load_fit_save(mode, str_path, ustr_path, test_str_path, test_ustr_path, outdir, out_filename_prefix = None):
    """Fit a logistic-regression combiner on two prediction columns and score it.

    The training frame is loaded from ``str_path`` / ``ustr_path`` and the
    evaluation frame from ``test_str_path`` / ``test_ustr_path`` (both via
    ``load_data``).  A ``LogisticRegression(**MODEL_ARGS)`` is fitted on the
    training frame's ``str_prediction`` and ``unstr_prediction`` columns, then
    applied to the evaluation frame.  Metrics are printed and one CSV row per
    evaluation example is written into ``outdir``.

    Args:
        mode: task selector passed to ``load_data``; ``"P"`` and ``"M"`` write
            rows without a ``time`` column, every other value includes it.
        str_path, ustr_path: training inputs for ``load_data``.
        test_str_path, test_ustr_path: evaluation inputs for ``load_data``;
            both base names must end in ``.csv``.
        outdir: output directory, created if missing.
        out_filename_prefix: optional prefix for the output file name; when
            ``None`` the name is built from the two test file base names.

    Returns:
        ``(labels, probabilities)`` for the evaluation frame as numpy arrays,
        where the probabilities are those of class 1.
    """
    train = load_data(mode, str_path, ustr_path)
    # The evaluation frame must stay separate from ``train``: binding both
    # names to the test data would fit the model on the rows it is scored on.
    test = load_data(mode, test_str_path, test_ustr_path)

    Y = train.label.values
    X = np.stack((train.str_prediction.values, train.unstr_prediction.values), axis=1)

    model = LogisticRegression(**MODEL_ARGS).fit(X, Y)
    print(model)
    print(model.classes_)

    # Build the output file name.
    outname1 = os.path.basename(test_str_path)
    outname2 = os.path.basename(test_ustr_path)
    assert (outname1.endswith(".csv"))
    assert (outname2.endswith(".csv"))
    model_tag = repr(MODEL_ARGS).replace(': ', '=')
    if out_filename_prefix is None:
        # [:-4] strips the ".csv" suffix checked above.
        outname = outname1[:-4] + '+' + outname2[:-4] + model_tag + "_id_ep_fmt.csv"
    else:
        outname = out_filename_prefix + model_tag + "_id_ep_fmt.csv"

    os.makedirs(outdir, exist_ok=True)
    outpath = os.path.join(outdir, outname)

    without_time = mode in ["P", "M"]
    with open(outpath, 'w') as fw:
        if without_time:
            fw.write("patient_id, episode, prediction, label\n")
        else:
            fw.write("patient_id, episode,time, prediction, label\n")

        test_Y = test.label.values
        test_X = np.stack((test.str_prediction.values, test.unstr_prediction.values), axis=1)
        preds = model.predict_proba(test_X)[:, 1]  # keep only the class-1 probabilities

        print("Model Arguments:")
        print(MODEL_ARGS)
        metrics.print_metrics_binary(test_Y, preds)

        if without_time:
            for patient_id, ep, pred, label in zip(test.patient_id.values, test.episode.values,
                                                   preds, test_Y):
                fw.write("{},{},{},{}\n".format(patient_id, ep, pred, label))
        else:
            for patient_id, ep, time, pred, label in zip(test.patient_id.values, test.episode.values,
                                                         test.time.values, preds, test_Y):
                fw.write("{},{},{},{},{}\n".format(patient_id, ep, time, pred, label))

    return np.array(test_Y), np.array(preds)
def test(args, model):
    """Score ``model`` on the stored test arrays and save its predictions.

    The test archive under ``args.input_dir`` is expected to hold exactly two
    arrays (inputs, then labels).  Column 0 of the model output is used as the
    prediction; metrics are printed and the results are written to
    ``test_predictions.csv`` inside ``args.output_dir``.
    """
    archive_path = f"{args.input_dir}/{ext_utils.TEST}_{ext_utils.DATA_FILENAME}"
    arrays = list(np.load(archive_path).values())
    data, labels = arrays

    # One generic name per index along axis 2 of the input array.
    feature_count = data.shape[2]
    names = [f"Feature {idx}" for idx in range(feature_count)]

    raw_output = model.predict(data, batch_size=BATCH_SIZE, verbose=1)
    predictions = np.array(raw_output)[:, 0]
    metrics.print_metrics_binary(labels, predictions)

    utils.save_results(names, predictions, labels,
                       os.path.join(args.output_dir, "test_predictions.csv"))
def main():
    """Bootstrap summary statistics for in-hospital-mortality test-set metrics.

    Reads the predictions CSV given on the command line and the test
    listfile, left-joins them on ``stay``, and checks that every listfile row
    received a prediction and that both files agree on ``y_true``.  AUROC,
    AUPRC and min(+P, Se) are computed once on the full set and once per
    bootstrap resample (``--n_iters`` resamples, each the same size as the
    data).  For every metric the point estimate plus the mean, median,
    standard deviation and 2.5% / 97.5% percentiles of the resampled scores
    are written to ``--save_file`` as JSON and printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default='../data/in-hospital-mortality/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='ihm_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False)
    test_df = pd.read_csv(args.test_listfile, index_col=False)

    df = test_df.merge(pred_df, left_on='stay', right_on='stay',
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    # (display name used in the JSON, key in the dict returned by print_metrics_binary)
    metrics = [('AUC of ROC', 'auroc'),
               ('AUC of PRC', 'auprc'),
               ('min(+P, Se)', 'minpse')]

    # Column 0 holds the prediction, column 1 the label, so that one
    # resample call keeps each prediction paired with its label.
    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_binary(data[:, 1], data[:, 0], verbose=0)
    runs = dict()
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        runs[m] = []

    for _ in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_binary(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            runs[m].append(ret[k])

    # Only the summary statistics are stored; the per-run scores are dropped.
    for (m, k) in metrics:
        results[m]['mean'] = np.mean(runs[m])
        results[m]['median'] = np.median(runs[m])
        results[m]['std'] = np.std(runs[m])
        results[m]['2.5% percentile'] = np.percentile(runs[m], 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs[m], 97.5)

    # Single-argument print(...) calls parse on Python 3 and print the same
    # text on Python 2, unlike the print statement.
    print("Saving the results in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print(results)
def main():
    """Fit a logistic-regression model for in-hospital mortality and store its scores.

    Command-line options choose the regularization (``--C``, ``--l1`` /
    ``--l2``), the feature extraction (``--period``, ``--features``), the data
    location and the output directory.  Metrics for the train, validation and
    test splits are written as JSON under ``<output_dir>/results`` and the
    test-set predictions are handed to ``save_results``.
    """
    default_data_dir = os.path.join(os.path.dirname(__file__),
                                    '../../../data/in-hospital-mortality/')

    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=default_data_dir)
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    print('Reading data and extracting features ...')
    loaded = load_data_logistic_regression(args)
    (train_X, train_y, train_names,
     val_X, val_y, val_names,
     test_X, test_y, test_names) = loaded

    if args.l2:
        penalty = 'l2'
    else:
        penalty = 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    # Train and validation are scored on the full predict_proba output.
    for template, features, labels in (('train_{}.json', train_X, train_y),
                                       ('val_{}.json', val_X, val_y)):
        with open(os.path.join(result_dir, template.format(file_name)), 'w') as res_file:
            scores = print_metrics_binary(labels, logreg.predict_proba(features))
            json.dump({name: float(value) for name, value in scores.items()}, res_file)

    # The test split keeps only the class-1 column, which is also what gets saved.
    prediction = logreg.predict_proba(test_X)[:, 1]
    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        scores = print_metrics_binary(test_y, prediction)
        json.dump({name: float(value) for name, value in scores.items()}, res_file)

    predictions_path = os.path.join(args.output_dir, 'predictions', file_name + '.csv')
    save_results(test_names, prediction, test_y, predictions_path)
def calc_metrics(self, data, history, dataset, logs):
    """Predict over ``data`` in batches, score the result and record the metrics.

    ``data[0]`` holds the inputs.  ``data[1]`` holds the targets, or a pair of
    (targets, replicated targets) when ``self.target_repl`` is set, in which
    case only the first model output is scored.  Every metric returned by
    ``metrics.print_metrics_binary`` is stored in ``logs`` under
    ``"<dataset>_<metric>"`` and the whole dict is appended to ``history``.
    """
    targets = []
    scores = []
    batch = self.batch_size
    inputs = data[0]

    for start in range(0, len(inputs), batch):
        stop = start + batch
        if self.verbose == 1:
            print("\tdone {}/{}".format(start, len(data[0])), end='\r')

        x = inputs[start:stop]
        if self.target_repl:
            y = data[1][0][start:stop]
            _replicated = data[1][1][start:stop]  # sliced but not used for scoring
        else:
            y = data[1][start:stop]

        outputs = self.model.predict(x, batch_size=batch)
        chosen = outputs[0] if self.target_repl else outputs
        scores.extend(np.array(chosen).flatten())
        targets.extend(np.array(y).flatten())
    print('\n')

    # Two columns are passed to the metric: (1 - p, p).
    positive = np.array(scores)
    two_column = np.stack([1 - positive, positive], axis=1)
    ret = metrics.print_metrics_binary(targets, two_column)
    for name, value in ret.items():
        logs[dataset + '_' + name] = value
    history.append(ret)
def calc_metrics(self, data_gen, history, dataset, logs):
    """Predict over ``data_gen.steps`` batches, score them and record the metrics.

    With ``self.use_time`` each batch is ``([x, t], y)`` and both inputs are
    fed to the model; otherwise it is ``(x, y)``.  With
    ``self.deep_supervision`` only positions whose entry in ``x[1]`` equals 1
    are scored.  Every metric returned by ``metrics.print_metrics_binary`` is
    stored in ``logs`` under ``"<dataset>_<metric>"`` and the whole dict is
    appended to ``history``.
    """
    targets = []
    scores = []

    for step in range(data_gen.steps):
        if self.verbose == 1:
            print("\tdone {}/{}".format(step, data_gen.steps), end='\r')

        if self.use_time:
            ([x, t], y) = next(data_gen)
            model_input = [x, t]
        else:
            (x, y) = next(data_gen)
            model_input = x
        pred = self.model.predict(model_input, batch_size=self.batch_size)

        if not self.deep_supervision:
            targets.extend(y.flatten())
            scores.extend(pred.flatten())
            continue

        # x[1] is used as the per-position mask.
        for mask, target, score in zip(x[1].flatten(), y.flatten(), pred.flatten()):
            if np.equal(mask, 1):
                targets.append(target)
                scores.append(score)
    print('\n')

    # Two columns are passed to the metric: (1 - p, p).
    positive = np.array(scores)
    two_column = np.stack([1 - positive, positive], axis=1)
    ret = metrics.print_metrics_binary(targets, two_column)
    for name, value in ret.items():
        logs[dataset + '_' + name] = value
    history.append(ret)
def calc_metrics(self, data_gen, history, dataset, logs):
    """Predict over ``data_gen.steps`` batches, score them and record the metrics.

    Each batch is a 3-tuple whose third element is ignored.  With
    ``self.deep_supervision`` only positions whose entry in ``x[1]`` equals 1
    are scored.  Otherwise multi-dimensional targets are reduced with
    ``argmax`` per row, and predictions with more than one column are reduced
    to column 1.  The flat lists are passed to ``self.display_loss_0_1``
    before scoring.  Every metric returned by ``metrics.print_metrics_binary``
    is stored in ``logs`` under ``"<dataset>_<metric>"`` and the whole dict is
    appended to ``history``.
    """
    targets = []
    scores = []

    for step in range(data_gen.steps):
        if self.verbose == 1:
            print("\tdone {}/{}".format(step, data_gen.steps), end='\r')

        batch = next(data_gen)
        (x, y, _) = batch
        pred = self.model.predict(x, batch_size=self.batch_size)

        if self.deep_supervision:
            # x[1] is used as the per-position mask.
            for mask, target, score in zip(x[1].flatten(), y.flatten(), pred.flatten()):
                if np.equal(mask, 1):
                    targets.append(target)
                    scores.append(score)
            continue

        y = np.array(y)
        if len(y.shape) > 1:
            targets.extend(np.argmax(row) for row in y)
        else:
            targets.extend(y.flatten())

        pred = np.array(pred)
        if pred.shape[1] > 1:
            scores.extend(row[1] for row in pred)
        else:
            scores.extend(pred.flatten())
    print('\n')

    self.display_loss_0_1(targets, scores)

    # Two columns are passed to the metric: (1 - p, p).
    positive = np.array(scores)
    two_column = np.stack([1 - positive, positive], axis=1)
    ret = metrics.print_metrics_binary(targets, two_column)
    for name, value in ret.items():
        logs[dataset + '_' + name] = value
    history.append(ret)
def do_epoch(mode, epoch):
    """Run every batch of one epoch through ``network`` and return the mean loss.

    Args:
        mode: ``'train'`` or ``'test'``; forwarded to ``network.step`` and
            ``network.get_batches_per_epoch``.
        epoch: epoch index, used only in the progress line.

    Returns:
        The loss summed over all batches divided by the number of batches.

    Raises:
        Exception: as soon as a batch reports a NaN loss.
    """
    y_true = []
    predictions = []

    avg_loss = 0.0  # running sum since the last progress line
    sum_loss = 0.0  # running sum over the whole epoch
    prev_time = time.time()

    batches_per_epoch = network.get_batches_per_epoch(mode)

    for i in range(batches_per_epoch):
        step_data = network.step(mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        current_loss_ce = step_data["loss_ce"]
        current_loss_reg = step_data["loss_reg"]
        log = step_data["log"]

        avg_loss += current_loss
        sum_loss += current_loss

        y_true.extend(answers)
        predictions.extend(prediction)

        if (i + 1) % args.log_every == 0:
            cur_time = time.time()
            print(" %sing: %d.%d / %d \t loss: %.3f = %.2f + %.2f \t avg_loss: %.3f \t"
                  "%s \t time: %.2fs" % (mode, epoch, i * args.batch_size,
                                         batches_per_epoch * args.batch_size,
                                         current_loss, current_loss_ce, current_loss_reg,
                                         avg_loss / args.log_every, log,
                                         cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= batches_per_epoch
    # A single-argument print(...) call parses on Python 3 and prints the
    # same text on Python 2, unlike the print statement.
    print("\n %s loss = %.5f" % (mode, sum_loss))
    metrics.print_metrics_binary(y_true, predictions)
    return sum_loss
# validation: when `global_weight` is not None, clears the Keras session,
# rebuilds the model with build_model() and loads `global_weight` into it.
# The model then predicts on the module-level `data`; column 0 of the output
# is scored against the module-level `labels` with
# metrics.print_metrics_binary, and the result is passed to
# save_result(model, current_round, result).
# NOTE(review): indentation was lost in this view, so it is unclear whether
# only the three model-setup statements or the whole remaining body sits under
# the `if` -- confirm against the original file before relying on the
# behaviour for global_weight=None.
# NOTE(review): the default `[]` is not None, so a call without arguments
# reaches model.set_weights([]); presumably callers always pass real weights
# -- TODO confirm. The list default is never mutated here.
def validation(global_weight=[]): if global_weight is not None: K.clear_session() model = build_model() model.set_weights(global_weight) print("==> validation start") predictions = model.predict(data, verbose=1) predictions = np.array(predictions)[:, 0] result = metrics.print_metrics_binary(labels, predictions) save_result(model, current_round, result) print("==> validation end")
if not os.path.exists("results"): os.mkdir("results") for (penalty, C) in zip(penalties, Cs): file_name = "%s.%s.%s.C%f" % (args.period, args.features, penalty, C) logreg = LogisticRegression(penalty=penalty, C=C) logreg.fit(train_X, train_y) with open(os.path.join("results", file_name + ".txt"), "w") as resfile: resfile.write("acc, prec0, prec1, rec0, rec1, auroc, auprc, minpse\n") print "Scores on train set" ret = metrics.print_metrics_binary(train_y, logreg.predict_proba(train_X)) resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" % (ret['acc'], ret['prec0'], ret['prec1'], ret['rec0'], ret['rec1'], ret['auroc'], ret['auprc'], ret['minpse'])) print "Scores on validation set" ret = metrics.print_metrics_binary(val_y, logreg.predict_proba(val_X)) resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" % (ret['acc'], ret['prec0'], ret['prec1'], ret['rec0'], ret['rec1'], ret['auroc'], ret['auprc'], ret['minpse'])) print "Scores on test set" ret = metrics.print_metrics_binary(test_y, logreg.predict_proba(test_X)) resfile.write("%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" % (ret['acc'], ret['prec0'], ret['prec1'], ret['rec0'],
for x in mortalities: y_true.append(x) for x in prediction: predictions.append(x) activations += zip(prediction[:, 1], mortalities) if ((i + 1) % args.log_every == 0): cur_time = time.time() print (" testing: %d / %d \t loss: %.3f \t avg_loss: %.3f \t"\ " time: %.2fs" % ((i+1) * args.batch_size, n_batches * args.batch_size, current_loss, avg_loss / args.log_every, cur_time - prev_time)) avg_loss = 0 prev_time = cur_time if np.isnan(current_loss): raise Exception("current loss IS NaN. This should never happen :)") sum_loss /= n_batches print "\n test loss = %.5f" % sum_loss metrics.print_metrics_binary(y_true, predictions) with open("activations.txt", "w") as fout: for (x, y) in activations: fout.write("%.6f, %d\n" % (x, y)) else: raise Exception("unknown mode")
def main():
    """Train one of three classifiers for in-hospital mortality and store its scores.

    ``--method`` selects plain logistic regression, a LightGBM classifier, or
    a gradient-boosting classifier whose ``n_estimators`` is chosen by grid
    search.  Extracted, imputed and standardized features are cached in a
    pickle file next to the data so later runs with the same
    ``--period`` / ``--features`` skip feature extraction.  Metrics for the
    train, validation and test splits are written as JSON under ``results``
    and the test-set predictions are handed to ``save_results``.
    """
    import os
    import pickle

    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--method', type=str, default='logistic',
                        choices=['gridsearch', 'lgbm', 'logistic'])
    args = parser.parse_args()
    print(args)

    # The cached matrices depend on the extraction options, so both are part
    # of the file name; one shared cache would silently serve features from a
    # previous run made with different --period / --features.
    data_cache = '../../../data/in-hospital-mortality/lr_cache.{}.{}.pickle'.format(
        args.period, args.features)

    if os.path.exists(data_cache):
        print('Loading data cache ...')
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # cache files written by this script.
        with open(data_cache, 'rb') as f:
            (train_X, train_y, train_names), \
                (val_X, val_y, val_names), \
                (test_X, test_y, test_names) = pickle.load(f)
    else:
        train_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)
        test_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/test/',
            listfile='../../../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        print('Reading data and extracting features ...')
        (train_X, train_y, train_names) = read_and_extract_features(
            train_reader, args.period, args.features)
        (val_X, val_y, val_names) = read_and_extract_features(
            val_reader, args.period, args.features)
        (test_X, test_y, test_names) = read_and_extract_features(
            test_reader, args.period, args.features)
        print(' train data shape = {}'.format(train_X.shape))
        print(' validation data shape = {}'.format(val_X.shape))
        print(' test data shape = {}'.format(test_X.shape))

        # Imputer and scaler are fitted on the training split only and then
        # applied to all three splits.
        print('Imputing missing values ...')
        imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
        imputer.fit(train_X)
        train_X = np.array(imputer.transform(train_X), dtype=np.float32)
        val_X = np.array(imputer.transform(val_X), dtype=np.float32)
        test_X = np.array(imputer.transform(test_X), dtype=np.float32)

        print('Normalizing the data to have zero mean and unit variance ...')
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        val_X = scaler.transform(val_X)
        test_X = scaler.transform(test_X)

        with open(data_cache, 'wb') as f:
            pickle.dump([(train_X, train_y, train_names),
                         (val_X, val_y, val_names),
                         (test_X, test_y, test_names)],
                        f, pickle.HIGHEST_PROTOCOL)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    print("use {} to fit".format(args.method))
    if args.method == "gridsearch":
        # Pick n_estimators by grid search, then refit a fresh classifier with it below.
        param_test1 = {'n_estimators': range(10, 200, 20)}
        gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(),
                                param_grid=param_test1)
        gsearch1.fit(train_X, train_y)
        print("gridsearch best result: ", gsearch1.best_params_, gsearch1.best_score_)
        logreg = GradientBoostingClassifier(
            n_estimators=gsearch1.best_params_['n_estimators'])
    elif args.method == "lgbm":
        logreg = lgb.LGBMClassifier(objective='binary', num_leaves=31,
                                    learning_rate=0.05, n_estimators=20)
    elif args.method == "logistic":
        logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    # The test split keeps only the class-1 column, which is also what gets saved.
    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
def main():
    """Train logistic-regression baselines for decompensation and store their scores.

    Features are extracted from at most 100000 training and 100000 validation
    examples and from the whole test set, mean-imputed and standardized with
    statistics fitted on the training split.  For every ``(penalty, C)`` pair
    a model is fitted, its train / validation / test metrics are written as
    JSON under ``results`` and the test-set predictions are handed to
    ``save_results``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # Fuller grid, kept for reference:
    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/train_listfile.csv')
    val_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/val_listfile.csv')
    test_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/test/',
        listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    example_cap = 100000
    n_train = min(example_cap, train_reader.get_number_of_examples())
    n_val = min(example_cap, val_reader.get_number_of_examples())

    train_X, train_y, train_names, train_ts = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    val_X, val_y, val_names, val_ts = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    test_X, test_y, test_names, test_ts = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X, val_X, test_X = [np.array(imputer.transform(split), dtype=np.float32)
                              for split in (train_X, val_X, test_X)]

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = [scaler.transform(split)
                              for split in (train_X, val_X, test_X)]

    common_utils.create_directory('results')

    for penalty, C in zip(penalties, Cs):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logreg.fit(train_X, train_y)

        # Train and validation are scored on the full predict_proba output.
        for template, features, labels in (('train_{}.json', train_X, train_y),
                                           ('val_{}.json', val_X, val_y)):
            with open(os.path.join('results', template.format(file_name)), 'w') as res_file:
                scores = print_metrics_binary(labels, logreg.predict_proba(features))
                json.dump({name: float(value) for name, value in scores.items()}, res_file)

        # The test split keeps only the class-1 column, which is also what gets saved.
        prediction = logreg.predict_proba(test_X)[:, 1]
        with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
            scores = print_metrics_binary(test_y, prediction)
            json.dump({name: float(value) for name, value in scores.items()}, res_file)

        save_results(test_names, test_ts, prediction, test_y,
                     os.path.join('predictions', file_name + '.csv'))
def process_one_chunk(mode, chunk_index):
    """Read one chunk, run all of its batches through ``network``, return the mean loss.

    Args:
        mode: ``"train"`` reads from ``train_reader`` and installs the chunk
            as the network's first dataset; ``"test"`` reads from
            ``val_reader`` and installs it as the second dataset.
        chunk_index: chunk number, used only in the progress line.

    Returns:
        The loss summed over all batches divided by the number of batches.

    Raises:
        Exception: as soon as a batch reports a NaN loss.
    """
    assert (mode == "train" or mode == "test")

    is_train = (mode == "train")
    reader = train_reader if is_train else val_reader

    (data, ts, mortalities, header) = utils.read_chunk(reader, chunk_size)
    data = utils.preprocess_chunk(data, ts, discretizer, normalizer)

    if is_train:
        network.set_datasets((data, mortalities), None)
    else:
        network.set_datasets(None, (data, mortalities))

    network.shuffle_train_set()

    y_true = []
    predictions = []

    avg_loss = 0.0  # running sum since the last progress line
    sum_loss = 0.0  # running sum over the whole chunk
    prev_time = time.time()

    n_batches = network.get_batches_per_epoch(mode)

    for i in range(n_batches):
        step_data = network.step(mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        log = step_data["log"]

        avg_loss += current_loss
        sum_loss += current_loss

        y_true.extend(answers)
        predictions.extend(prediction)

        if (i + 1) % args.log_every == 0:
            cur_time = time.time()
            print(" %sing: %d.%d / %d \t loss: %.3f \t avg_loss: %.3f \t"
                  "%s \t time: %.2fs" % (mode, chunk_index, i * args.batch_size,
                                         n_batches * args.batch_size, current_loss,
                                         avg_loss / args.log_every, log,
                                         cur_time - prev_time))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= n_batches
    # A single-argument print(...) call parses on Python 3 and prints the
    # same text on Python 2, unlike the print statement.
    print("\n %s loss = %.5f" % (mode, sum_loss))
    metrics.print_metrics_binary(y_true, predictions)
    return sum_loss
model.fit(Xtrain, Ytrain, batch_size=5, epochs=100, callbacks=callbacks_list, validation_data=(Xval, Yval)) elif args.mode == 'test': ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) test_raw = ret['data'] test_names = ret['names'] Xtest = np.array(test_raw[0]).reshape((-1, 48*76)) Ytest = np.array(test_raw[1]).reshape((-1,1)) model = keras.models.load_model(os.path.join(args.output_dir, 'mimic3models/in_hospital_mortality/keras_states/transformer_best.state')) print(Xtest[3051, 1266]) print(np.mean(Xtest,0)[1266]) Xtest = np.delete(Xtest, 3051, 0) # large feature value for sequence 3051, event 1266, likely outlier Ytest = np.delete(Ytest, 3051, 0) # same as above print(np.mean(Xtest,0)[1266]) predictions = model.predict(Xtest, batch_size=1, verbose=1) predictions = np.array(predictions)[:, 0] metrics.print_metrics_binary(Ytest, predictions) path = os.path.join(args.output_dir, "test_predictions.csv") utils.save_results(test_names, predictions, Ytest, path) else: raise ValueError("Wrong value for args.mode")
def main():
    """Fit and evaluate logistic regression on the decompensation task.

    Parses ``--period`` and ``--features``, extracts features from the train
    (capped at 100000 examples), validation (capped at 100000 examples) and
    test readers, mean-imputes and standardizes them using the training
    split, then for each ``(penalty, C)`` pair fits a model, dumps the
    per-split metrics to ``results/*.json`` and passes the test predictions
    to ``save_results``.
    """
    period_choices = ['first4days', 'first8days', 'last12hours',
                      'first25percent', 'first50percent', 'all']
    feature_choices = ['all', 'len', 'all_but_len']

    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=period_choices)
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=feature_choices)
    args = parser.parse_args()
    print(args)

    # Fuller grid, kept for reference:
    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/train_listfile.csv')
    val_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/val_listfile.csv')
    test_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/test/',
        listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    extracted_train = read_and_extract_features(train_reader, n_train,
                                                args.period, args.features)
    (train_X, train_y, train_names, train_ts) = extracted_train
    extracted_val = read_and_extract_features(val_reader, n_val,
                                              args.period, args.features)
    (val_X, val_y, val_names, val_ts) = extracted_val
    extracted_test = read_and_extract_features(test_reader,
                                               test_reader.get_number_of_examples(),
                                               args.period, args.features)
    (test_X, test_y, test_names, test_ts) = extracted_test

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)

    def impute(matrix):
        # Fill missing values with the training-split means, as float32.
        return np.array(imputer.transform(matrix), dtype=np.float32)

    train_X = impute(train_X)
    val_X = impute(val_X)
    test_X = impute(test_X)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('results')

    for setting in zip(penalties, Cs):
        penalty, C = setting
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logreg.fit(train_X, train_y)

        train_path = os.path.join('results', 'train_{}.json'.format(file_name))
        with open(train_path, "w") as res_file:
            train_scores = print_metrics_binary(train_y, logreg.predict_proba(train_X))
            json.dump({key: float(val) for key, val in train_scores.items()}, res_file)

        val_path = os.path.join('results', 'val_{}.json'.format(file_name))
        with open(val_path, 'w') as res_file:
            val_scores = print_metrics_binary(val_y, logreg.predict_proba(val_X))
            json.dump({key: float(val) for key, val in val_scores.items()}, res_file)

        # The test split keeps only the class-1 column, which is also what gets saved.
        prediction = logreg.predict_proba(test_X)[:, 1]

        test_path = os.path.join('results', 'test_{}.json'.format(file_name))
        with open(test_path, 'w') as res_file:
            test_scores = print_metrics_binary(test_y, prediction)
            json.dump({key: float(val) for key, val in test_scores.items()}, res_file)

        save_results(test_names, test_ts, prediction, test_y,
                     os.path.join('predictions', file_name + '.csv'))
# ensure that the code uses test_reader #del train_reader #del val_reader del train_raw del val_raw test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, 'test'), listfile=os.path.join(args.data, 'test_listfile.csv'), period_length=48.0) ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] predictions = model.predict(data, batch_size=args.batch_size, verbose=1) predictions = np.array(predictions)[:, 0] metrics.print_metrics_binary(labels, predictions) path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, predictions, labels, path) plot_model(model, to_file='modeltest.png') else: raise ValueError("Wrong value for args.mode")
else: # classification for (name, m, t, p) in zip(names_extended.flatten(), los_M.flatten(), los_t.flatten(), los_p.reshape((-1, 10))): if np.equal(m, 1): los_names.append(name) los_y_true.append(t) los_pred.append(p) print('\n') # ihm if args.ihm_C > 0: print("\n ================= 48h mortality ================") ihm_pred = np.array(ihm_pred) ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred) # los if args.los_C > 0: print("\n ================ length of stay ================") if args.partition == 'log': los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred] los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred) if args.partition == 'custom': los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred] los_ret = metrics.print_metrics_custom_bins(los_y_true, los_pred) if args.partition == 'none': los_ret = metrics.print_metrics_regression(los_y_true, los_pred) print("Saving the predictions in test_predictions/task directories ...")
def main():
    """Train, evaluate and persist a logistic-regression in-hospital-mortality model.

    Command-line options select the regularization (``--C``, ``--l1``/``--l2``),
    the feature-extraction ``--period`` and ``--features`` set, the ``--data``
    directory and the ``--output_dir`` that receives every artifact:

    * ``feature_names.pkl`` and ``ranked_features.csv`` (only when both
      ``--features`` and ``--period`` are ``all``),
    * ``data/`` with the imputed and standardized splits,
    * ``results/{train,val,test}_<run>.json`` with the metrics,
    * ``predictions/<run>.csv`` with the test predictions,
    * ``lr.joblib`` with the fitted model.

    Raises:
        ValueError: if the number of generated feature names does not match the
            number of fitted coefficients when ranking features.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    # Every artifact below lands under output_dir; create it up front so the
    # very first write (feature_names.pkl) cannot fail on a missing directory.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    # Feature names are only generated for the full feature set over the full period.
    rank_features = args.features == "all" and args.period == "all"
    feature_names = []
    if rank_features:
        # A dedicated reader is used to fetch the header.
        # NOTE(review): presumably so that train_reader is not advanced by
        # read_next() before feature extraction -- confirm against the reader.
        reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        header = reader.read_next()["header"]
        sub_periods = ["full-series", "first-10%", "first-25%", "first-50%",
                       "last-10%", "last-25%", "last-50%"]
        functions = ["min", "max", "mean", "std", "skew", "count"]
        feature_names = [
            f"{item}->{sub_period}->{function}"
            for item in header[1:]  # First item is 'hours'
            for sub_period in sub_periods
            for function in functions
        ]
        with open(os.path.join(args.output_dir, "feature_names.pkl"), "wb") as feature_names_file:
            pickle.dump(feature_names, feature_names_file)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    # Imputation statistics come from the training split only.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    # Scaling statistics also come from the training split only.
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    print('Writing data ...')
    data_dir = os.path.join(args.output_dir, 'data')
    common_utils.create_directory(data_dir)
    common_utils.write_data(data_dir, train_X, val_X, test_X, train_y, val_y, test_y)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def dump_metrics(json_name, y_true, y_pred):
        # Metrics are computed before the file is opened so that a failure
        # does not leave an empty JSON file behind.
        ret = print_metrics_binary(y_true, y_pred)
        ret = {k: float(v) for k, v in ret.items()}  # plain floats for json
        with open(os.path.join(result_dir, json_name), 'w') as res_file:
            json.dump(ret, res_file)

    dump_metrics('train_{}.json'.format(file_name), train_y, logreg.predict_proba(train_X))
    dump_metrics('val_{}.json'.format(file_name), val_y, logreg.predict_proba(val_X))

    prediction = logreg.predict_proba(test_X)[:, 1]
    dump_metrics('test_{}.json'.format(file_name), test_y, prediction)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))

    joblib.dump(logreg, os.path.join(args.output_dir, "lr.joblib"))  # Save model

    # Generate ranked list of features
    if rank_features:
        # Flatten whatever shape coef_ has instead of assuming exactly 714 features.
        coefs = logreg.coef_.ravel()
        if len(coefs) != len(feature_names):
            raise ValueError(
                "Got {} coefficients for {} feature names".format(len(coefs), len(feature_names)))
        ranked = sorted(zip(feature_names, coefs), key=lambda pair: abs(pair[1]), reverse=True)
        # newline='' is what the csv module expects for files handed to csv.writer.
        with open(os.path.join(args.output_dir, "ranked_features.csv"), "w",
                  newline="") as ranked_features_file:
            writer = csv.writer(ranked_features_file)
            # Rows are ordered by |coefficient| but the signed value is written.
            writer.writerow(("Feature Name", "Coefficient Magnitude"))
            writer.writerows(ranked)
else: del train_reader del val_reader test_reader = DecompensationReader(dataset_dir='../../data/decompensation/test/', listfile='../../data/decompensation/test_listfile.csv') test_data_gen = utils.BatchGen(test_reader, discretizer, normalizer, args.batch_size, None, shuffle=False, return_names=True) # put steps = None for a full test for i in range(test_data_gen.steps): print "\rpredicting {} / {}".format(i, test_data_gen.steps), ret = next(test_data_gen) x, y = ret["data"] cur_names = ret["names"] cur_ts = ret["ts"] x = np.array(x) pred = model.predict_on_batch(x)[:, 0] predictions += list(pred) labels += list(y) names += list(cur_names) ts += list(cur_ts) metrics.print_metrics_binary(labels, predictions) path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, ts, predictions, labels, path) else: raise ValueError("Wrong value for args.mode")
def main():
    """Fit and score a logistic-regression in-hospital-mortality baseline.

    Reads the three splits from fixed relative paths, mean-imputes and
    standardizes the features using statistics fitted on the training split,
    trains a LogisticRegression configured from the command line and writes
    metrics to ``results/`` and test predictions to ``predictions/``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--C', type=float, default=1.0,
                            help='inverse of L1 / L2 regularization')
    arg_parser.add_argument('--l1', dest='l2', action='store_false')
    arg_parser.add_argument('--l2', dest='l2', action='store_true')
    arg_parser.set_defaults(l2=True)
    arg_parser.add_argument('--period', type=str, default='all',
                            help='specifies which period extract features from',
                            choices=['first4days', 'first8days', 'last12hours',
                                     'first25percent', 'first50percent', 'all'])
    arg_parser.add_argument('--features', type=str, default='all',
                            help='specifies what features to extract',
                            choices=['all', 'len', 'all_but_len'])
    args = arg_parser.parse_args()
    print(args)

    # The validation split lives in the train directory; only the listfile differs.
    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    train_X, train_y, train_names = read_and_extract_features(train_reader, args.period, args.features)
    val_X, val_y, val_names = read_and_extract_features(val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X, val_X, test_X = [
        np.array(imputer.transform(split), dtype=np.float32)
        for split in (train_X, val_X, test_X)
    ]

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = [scaler.transform(split) for split in (train_X, val_X, test_X)]

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    # Train and validation metrics are computed on the full predict_proba output.
    for json_template, split_X, split_y in (('train_{}.json', train_X, train_y),
                                            ('val_{}.json', val_X, val_y)):
        with open(os.path.join('results', json_template.format(file_name)), 'w') as res_file:
            scores = print_metrics_binary(split_y, logreg.predict_proba(split_X))
            json.dump({name: float(value) for name, value in scores.items()}, res_file)

    # For the test split only the positive-class column is kept, because the
    # same vector is also written out as the prediction file.
    prediction = logreg.predict_proba(test_X)[:, 1]
    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        scores = print_metrics_binary(test_y, prediction)
        json.dump({name: float(value) for name, value in scores.items()}, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
def main():
    """Export in-hospital-mortality features and fit a logistic-regression baseline.

    Extracts features for the train/val/test splits, replaces NaNs with ``-1``,
    exports each split (features plus the target as last column) as a
    tab-separated file in ``--output_dir``, then trains a LogisticRegression
    and writes metrics to ``results/`` and test predictions to ``predictions/``
    under the same directory. No standardization is applied in this variant.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len', 'mean_and_sd'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    # The CSV export below is the first write into output_dir; make sure the
    # directory exists instead of failing after the expensive feature extraction.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    # read_and_extract removes some highly implausible values according to plausible_values.json
    print('Remove implausible values ...')
    (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values with -1.')
    # Verified that all values are greater or equal than zero via np.nanmin(),
    # so -1 cannot collide with an observed value.
    train_X[np.isnan(train_X)] = -1.
    val_X[np.isnan(val_X)] = -1.
    test_X[np.isnan(test_X)] = -1.
    train_X = np.array(train_X, dtype=np.float32)
    val_X = np.array(val_X, dtype=np.float32)
    test_X = np.array(test_X, dtype=np.float32)

    print('Export features along with target as csv files ...')
    exports = (('in-hospital-mortality-train.csv', train_X, train_y),
               ('in-hospital-mortality-val.csv', val_X, val_y),
               ('in-hospital-mortality-test.csv', test_X, test_y))
    for csv_name, split_X, split_y in exports:
        # The target is appended as the last column of every row.
        table = np.concatenate((split_X, (np.array([split_y])).T), axis=1)
        np.savetxt(os.path.join(args.output_dir, csv_name), table, delimiter='\t')

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def dump_metrics(json_name, y_true, y_pred):
        # Metrics are computed before the file is opened so that a failure
        # does not leave an empty JSON file behind.
        ret = print_metrics_binary(y_true, y_pred)
        ret = {k: float(v) for k, v in ret.items()}  # plain floats for json
        with open(os.path.join(result_dir, json_name), 'w') as res_file:
            json.dump(ret, res_file)

    dump_metrics('train_{}.json'.format(file_name), train_y, logreg.predict_proba(train_X))
    dump_metrics('val_{}.json'.format(file_name), val_y, logreg.predict_proba(val_X))

    prediction = logreg.predict_proba(test_X)[:, 1]
    dump_metrics('test_{}.json'.format(file_name), test_y, prediction)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
def main():
    """Bootstrap confidence intervals for phenotyping predictions.

    Joins the prediction file with the test listfile on ``stay``, checks that
    both agree on ``period_length`` and on every label, then computes the
    macro/micro/weighted ROC AUC and the per-task ROC AUC on the full test set
    and on ``--n_iters`` resamples drawn with ``sk_utils.resample``. For every
    metric the point value, mean, median, std and the 2.5% / 97.5% percentiles
    are stored in ``--save_file`` as JSON; a summary without the per-task
    entries is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default='../data/phenotyping/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='pheno_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32})

    n_tasks = 25
    task_ids = range(1, n_tasks + 1)
    labels_cols = ["label_{}".format(i) for i in task_ids]
    # Keep the first two listfile columns and rename the rest to label_1..label_n.
    test_df.columns = list(test_df.columns[:2]) + labels_cols

    df = test_df.merge(pred_df, left_on='stay', right_on='stay', how='left', suffixes=['_l', '_r'])
    # Every listed stay must have a prediction, and both files must agree.
    assert (df['pred_1'].isnull().sum() == 0)
    assert (df['period_length_l'].equals(df['period_length_r']))
    for i in task_ids:
        assert (df['label_{}_l'.format(i)].equals(df['label_{}_r'.format(i)]))

    metrics = [('Macro ROC AUC', 'ave_auc_macro'),
               ('Micro ROC AUC', 'ave_auc_micro'),
               ('Weighted ROC AUC', 'ave_auc_weighted')]
    task_metrics = ['ROC AUC of task {}'.format(i) for i in task_ids]

    # Column layout: [pred_1..pred_n | label_1..label_n]. Keeping both halves
    # in one array makes a single resample() draw predictions and labels together.
    data = np.zeros((df.shape[0], 2 * n_tasks))
    data[:, :n_tasks] = df[['pred_{}'.format(i) for i in task_ids]].values
    data[:, n_tasks:] = df[['label_{}_l'.format(i) for i in task_ids]].values

    results = dict()
    results['n_iters'] = args.n_iters

    ret = print_metrics_multilabel(data[:, n_tasks:], data[:, :n_tasks], verbose=0)
    for (m, k) in metrics:
        results[m] = {'value': ret[k], 'runs': []}
    for (j, m) in enumerate(task_metrics):
        auc = print_metrics_binary(data[:, n_tasks + j], data[:, j], verbose=0)['auroc']
        results[m] = {'value': auc, 'runs': []}

    for _ in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_multilabel(cur_data[:, n_tasks:], cur_data[:, :n_tasks], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])
        for (j, m) in enumerate(task_metrics):
            cur_auc = print_metrics_binary(cur_data[:, n_tasks + j], cur_data[:, j],
                                           verbose=0)['auroc']
            results[m]['runs'].append(cur_auc)

    reported_metrics = [m for m, k in metrics] + task_metrics
    for m in reported_metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']  # raw runs are not written to the result file

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Saving the results (including task specific metrics) in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print("Printing the summary of results (task specific metrics are skipped) ...")
    for m in task_metrics:
        del results[m]
    print(results)
with open(os.path.join("results", file_name + ".txt"), "w") as resfile: resfile.write( "acc, prec0, prec1, rec0, rec1, auroc, auprc, minpse\n") def write_results_local(resfile, ret): resfile.write( "%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" % (ret['acc'], ret['prec0'], ret['prec1'], ret['rec0'], ret['rec1'], ret['auroc'], ret['auprc'], ret['minpse'])) print "Scores on train set" train_preds = logreg.predict_proba(train_X) train_activations[:, task_id] = train_preds[:, 1] ret = metrics.print_metrics_binary(train_y[:, task_id], train_preds) write_results_local(resfile, ret) print "Scores on validation set" val_preds = logreg.predict_proba(val_X) val_activations[:, task_id] = val_preds[:, 1] ret = metrics.print_metrics_binary(val_y[:, task_id], val_preds) write_results_local(resfile, ret) print "Scores on test set" test_preds = logreg.predict_proba(test_X) test_activations[:, task_id] = test_preds[:, 1] ret = metrics.print_metrics_binary(test_y[:, task_id], test_preds) write_results_local(resfile, ret) with open(os.path.join("activations", file_name + ".txt"),
def main():
    """Evaluate phenotyping predictions with bootstrap statistics.

    The prediction file is left-joined onto the test listfile by ``stay`` and
    checked for missing predictions and for mismatching ``period_length`` or
    label columns. Macro/micro/weighted ROC AUC and one ROC AUC per task are
    computed on the whole test set and on ``--n_iters`` resamples
    (``sk_utils.resample``). The value, mean, median, std and 2.5% / 97.5%
    percentiles of each metric are dumped to ``--save_file``; the printed
    summary omits the per-task metrics.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str,
                        default='../data/phenotyping/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=10000)
    parser.add_argument('--save_file', type=str, default='pheno_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32})

    n_tasks = 25
    labels_cols = ["label_{}".format(i) for i in range(1, n_tasks + 1)]
    # The first two listfile columns are kept; the remaining ones become label_i.
    test_df.columns = list(test_df.columns[:2]) + labels_cols

    df = test_df.merge(pred_df, left_on='stay', right_on='stay', how='left', suffixes=['_l', '_r'])
    assert (df['pred_1'].isnull().sum() == 0)
    assert (df['period_length_l'].equals(df['period_length_r']))
    for i in range(1, n_tasks + 1):
        assert (df['label_{}_l'.format(i)].equals(df['label_{}_r'.format(i)]))

    metrics = [('Macro ROC AUC', 'ave_auc_macro'),
               ('Micro ROC AUC', 'ave_auc_micro'),
               ('Weighted ROC AUC', 'ave_auc_weighted')]

    def task_metric(task):
        # Result key of the per-task ROC AUC entry (tasks are numbered from 1).
        return 'ROC AUC of task {}'.format(task)

    # First n_tasks columns hold predictions, the last n_tasks the labels, so
    # that resampling rows keeps each prediction paired with its label.
    data = np.zeros((df.shape[0], 2 * n_tasks))
    for i in range(1, n_tasks + 1):
        data[:, i - 1] = df['pred_{}'.format(i)]
        data[:, n_tasks + i - 1] = df['label_{}_l'.format(i)]

    results = dict()
    results['n_iters'] = args.n_iters

    ret = print_metrics_multilabel(data[:, n_tasks:], data[:, :n_tasks], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(1, n_tasks + 1):
        m = task_metric(i)
        results[m] = dict()
        results[m]['value'] = print_metrics_binary(data[:, n_tasks + i - 1], data[:, i - 1],
                                                   verbose=0)['auroc']
        results[m]['runs'] = []

    for _ in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_multilabel(cur_data[:, n_tasks:], cur_data[:, :n_tasks], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])
        for i in range(1, n_tasks + 1):
            cur_auc = print_metrics_binary(cur_data[:, n_tasks + i - 1], cur_data[:, i - 1],
                                           verbose=0)['auroc']
            results[task_metric(i)]['runs'].append(cur_auc)

    reported_metrics = [m for m, k in metrics]
    reported_metrics += [task_metric(i) for i in range(1, n_tasks + 1)]

    for m in reported_metrics:
        # pop() drops the raw runs so that only the summary is serialized.
        runs = results[m].pop('runs')
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)

    # Single-argument print(...) is valid and equivalent on Python 2 and 3.
    print("Saving the results (including task specific metrics) in {} ...".format(args.save_file))
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print("Printing the summary of results (task specific metrics are skipped) ...")
    for i in range(1, n_tasks + 1):
        del results[task_metric(i)]
    print(results)
continue # Make sure only one file for this task assert(not PRED_TASKS[matches[0]]) PRED_TASKS[matches[0]] = True print("Evaluating {}".format(matches[0])) match_pred, match_Y = read_file(os.path.join(indir, filename)) if merged_pred is None: merged_pred = np.expand_dims(match_pred.copy(), axis=0) merged_Y = np.expand_dims(match_Y.copy(), axis=0) else: merged_pred =np.concatenate((merged_pred, np.expand_dims(match_pred, axis=0)), axis=0) merged_Y =np.concatenate((merged_Y, np.expand_dims(match_Y ,axis=0)), axis=0) #print(merged_X.shape) #print(merged_Y.shape) metrics.print_metrics_binary(match_Y, match_pred) print("----------------------------------------") print("\n==========================================") print("Evaluating all together:") metrics.print_metrics_multilabel(merged_Y.T, merged_pred.T) for key in PRED_TASKS: if PRED_TASKS[key] != True: print("WARNING: Data for task {} missing?".format(key))
diseases_embedding_t = disease_embedding(embeddings, word_indices, diseases_list_t) demographic_t = get_demographic(names_t, dataset_subject_dir) demographic_t = age_normalize(demographic_t, age_means, age_std) ret = utils.load_data_model1(test_reader, discretizer, normalizer, diseases_embedding_t, demographic_t, additional_features_list, args.small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] np.nan_to_num(data, copy=False) predictions = model.predict(data, batch_size=args.batch_size, verbose=1) predictions_plt = predictions predictions = np.array(predictions)[:, 0] print_metrics_binary(labels, predictions) predictions_plt2 = np.array(predictions_plt[:, 0]) if len(predictions_plt2.shape) == 1: predictions_plt2 = np.stack([1 - predictions_plt2, predictions_plt2]).transpose((1, 0)) fpr, tpr, thresh = metrics.roc_curve(labels, predictions_plt2[:, 1]) auc = metrics.roc_auc_score(labels, predictions_plt2[:, 1]) plt.plot(fpr, tpr, lw=2, label="CNN= %0.3f auc" % auc) path = os.path.join("test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, predictions, labels, path) else: raise ValueError("Wrong value for args.mode")
test_reader = InHospitalMortalityReader( dataset_dir=os.path.join(args.data, args.test_dir), listfile=os.path.join(args.data, 'test_listfile.csv'), period_length=48.0) ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part, return_names=True) data = ret["data"][0] labels = ret["data"][1] names = ret["names"] # Make MC version of model if args.mc: model = get_mc_model(model, args.mc) stochastic = args.mc > 0 predictions = model.predict(data, batch_size=args.batch_size, verbose=1) predictions = np.squeeze(predictions) metrics.print_metrics_binary(labels, predictions, stochastic=stochastic) path = os.path.join(args.output_dir, "test_predictions", os.path.basename(args.load_state)) + ".csv" utils.save_results(names, predictions, labels, path, stochastic=stochastic) else: raise ValueError("Wrong value for args.mode")
los_y_true.append(t) los_pred.append(p) # pheno pheno_names += list(names) pheno_ts += list(ret["pheno_ts"]) for (t, p) in zip(pheno_t.reshape((-1, 25)), pheno_p.reshape((-1, 25))): pheno_y_true.append(t) pheno_pred.append(p) print "\n" # ihm if args.ihm_C > 0: print "\n ================= 48h mortality ================" ihm_pred = np.array(ihm_pred) ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred) # decomp if args.decomp_C > 0: print "\n ================ decompensation ================" decomp_pred = np.array(decomp_pred) decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred) # los if args.los_C > 0: print "\n ================ length of stay ================" if args.partition == 'log': los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred] los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred) if args.partition == 'custom': los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
if np.equal(m, 1): los_y_true.append(t) los_pred.append(p) ## pheno for (t, p) in zip(pheno_t.reshape((-1, 25)), pheno_p.reshape((-1, 25))): pheno_y_true.append(t) pheno_pred.append(p) print "\n" ## ihm if args.ihm_C > 0: print "\n ================= 48h mortality ================" ihm_pred = np.array(ihm_pred) ihm_pred = np.stack([1-ihm_pred, ihm_pred], axis=1) ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred) ## decomp if args.decomp_C > 0: print "\n ================ decompensation ================" decomp_pred = np.array(decomp_pred) decomp_pred = np.stack([1-decomp_pred, decomp_pred], axis=1) decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred) ## los if args.los_C > 0: print "\n ================ length of stay ================" if args.partition == 'log': los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred] los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred) if args.partition == 'custom':
def main():
    """Train (or test) an in-hospital-mortality model on trigger-poisoned data.

    Builds the readers, a PoisoningDiscretizer fed with a trigger pattern loaded
    from the cache directory and a Normalizer restored from a saved state, then
    loads a poisoned training set, a clean validation set and a fully poisoned
    validation set. In ``train`` mode an LSTMRegressor is trained and its best
    state dict is saved; any mode other than ``train`` / ``test`` raises
    ``ValueError``.
    """
    parser = argparse.ArgumentParser()
    common_utils.add_common_arguments_backdoor(parser)
    parser.add_argument('--target_repl_coef', type=float, default=0.0)
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    parser.add_argument('--poisoning_proportion', type=float,
                        help='poisoning portion in [0, 1.0]', required=True)
    parser.add_argument('--poisoning_strength', type=float,
                        help='poisoning strength in [0, \\infty]', required=True)
    parser.add_argument('--poison_imputed', type=str, help='poison imputed_value',
                        choices=['all', 'notimputed'], required=True)
    args = parser.parse_args()
    print(args)

    if args.small_part:
        args.save_every = 2**30

    target_repl = (args.target_repl_coef > 0.0 and args.mode == 'train')

    # Build readers, discretizers, normalizers
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)

    raw_trigger = np.load("./cache/in_hospital_mortality/torch_raw_48_17/poison_pattern.npy")
    poisoning_trigger = np.reshape(raw_trigger, (-1, 48, 17))
    discretizer = PoisoningDiscretizer(timestep=float(args.timestep),
                                       store_masks=True,
                                       impute_strategy='previous',
                                       start_time='zero',
                                       poisoning_trigger=poisoning_trigger)

    first_example = train_reader.read_example(0)["X"]
    discretizer_header = discretizer.transform(first_example)[1].split(',')
    # Header entries without "->" are the channels handed to the normalizer.
    cont_channels = [idx for (idx, column) in enumerate(discretizer_header)
                     if "->" not in column]

    normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
    normalizer_state = args.normalizer_state
    if normalizer_state is None:
        # Default state file, resolved relative to this source file.
        normalizer_state = '../ihm_ts{}.input_str:{}.start_time:zero.normalizer'.format(
            args.timestep, args.imputation)
        normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
    normalizer.load_params(normalizer_state)

    args_dict = dict(args._get_kwargs())
    args_dict['header'] = discretizer_header
    args_dict['task'] = 'ihm'
    args_dict['target_repl'] = target_repl

    # Read data
    poison_imputed = {'all': True, 'notimputed': False}[args.poison_imputed]
    train_raw = load_poisoned_data_48_76(train_reader, discretizer, normalizer,
                                         poisoning_proportion=args.poisoning_proportion,
                                         poisoning_strength=args.poisoning_strength,
                                         suffix="train",
                                         small_part=args.small_part,
                                         poison_imputed=poison_imputed)
    val_raw = load_data_48_76(val_reader, discretizer, normalizer,
                              suffix="validation", small_part=args.small_part)
    # Validation copy with every example poisoned (proportion 1.0).
    # NOTE(review): suffix is "train" although the reader is val_reader --
    # confirm this is intended and cannot mix up any cache keyed on the suffix.
    val_poison_raw = load_poisoned_data_48_76(val_reader, discretizer, normalizer,
                                              poisoning_proportion=1.0,
                                              poisoning_strength=args.poisoning_strength,
                                              suffix="train",
                                              small_part=args.small_part,
                                              poison_imputed=poison_imputed)

    if target_repl:
        T = train_raw[0][0].shape[0]

        def extend_labels(data):
            # Pair the per-example labels with a copy repeated at each of the T steps.
            data = list(data)
            labels = np.array(data[1])  # (B,)
            repeated = np.expand_dims(labels, axis=-1).repeat(T, axis=1)  # (B, T)
            repeated = np.expand_dims(repeated, axis=-1)  # (B, T, 1)
            data[1] = [labels, repeated]
            return data

        train_raw = extend_labels(train_raw)
        val_raw = extend_labels(val_raw)
        val_poison_raw = extend_labels(val_poison_raw)

    if args.mode == 'train':
        print("==> training")

        input_dim = train_raw[0].shape[2]
        train_data, train_targets = train_raw[0].astype(np.float32), train_raw[1]
        val_data, val_targets = val_raw[0].astype(np.float32), val_raw[1]
        val_poison_data = val_poison_raw[0].astype(np.float32)
        val_poison_targets = val_poison_raw[1]

        model = LSTMRegressor(input_dim)
        best_state_dict = train(model, train_data, train_targets, val_data, val_targets,
                                val_poison_data, val_poison_targets)

        save_path = "./checkpoints/logistic_regression/torch_poisoning_raw_48_76"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        checkpoint_name = "/lstm_{}_{}_{}.pt".format(args.poisoning_proportion,
                                                     args.poisoning_strength,
                                                     args.poison_imputed)
        torch.save(best_state_dict, save_path + checkpoint_name)

    elif args.mode == 'test':
        # ensure that the code uses test_reader
        del train_reader, val_reader, train_raw, val_raw

        test_reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'test'),
            listfile=os.path.join(args.data, 'test_listfile.csv'),
            period_length=48.0)
        ret = utils.load_data(test_reader, discretizer, normalizer, args.small_part,
                              return_names=True)

        data, labels = ret["data"][0], ret["data"][1]
        names = ret["names"]

        # NOTE(review): `model` is only assigned in the 'train' branch of this
        # function; unless a module-level `model` exists, this line raises
        # NameError. Loading the trained state is probably missing here -- confirm.
        predictions = model.predict(data, batch_size=args.batch_size, verbose=1)
        predictions = np.array(predictions)[:, 0]
        metrics.print_metrics_binary(labels, predictions)

        path = os.path.join(args.output_dir, "test_predictions",
                            os.path.basename(args.load_state)) + ".csv"
        utils.save_results(names, predictions, labels, path)

    else:
        raise ValueError("Wrong value for args.mode")
def main():
    """Fit and evaluate a logistic-regression in-hospital-mortality baseline.

    Command-line driven. Features are extracted from the train, validation
    and test readers rooted at ``--data``.

    With ``--generate-data-only`` the extracted features of all three splits
    are written to one CSV file inside ``--output_dir`` and the function
    returns without training anything.

    Otherwise missing values are mean-imputed and the features standardized
    (imputer and scaler are both fitted on the training split only), a
    ``LogisticRegression`` is fitted, the metrics of every split are dumped to
    ``<output_dir>/results/{train,val,test}_<name>.json`` and the test
    predictions are handed to ``save_results`` with the target path
    ``<output_dir>/predictions/<name>.csv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    # --l1 and --l2 write to the same destination; L2 is the default penalty.
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative which all output files are stored',
                        default='.')
    parser.add_argument('--generate-data-only', dest='generate_data_only',
                        action="store_true")
    parser.set_defaults(generate_data_only=False)
    args = parser.parse_args()
    print(args)

    # The validation split lives in the 'train' directory; only its listfile
    # differs from the training split.
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))

    if args.generate_data_only:
        # The CSV is written straight into output_dir. Create the directory
        # on demand so that a fresh --output_dir does not fail only after the
        # feature extraction above has already been paid for.
        if args.output_dir and not os.path.isdir(args.output_dir):
            os.makedirs(args.output_dir)
        data_path = os.path.join(args.output_dir,
                                 "mimic3_benchmark_data_logistic.csv")
        # Rows are stacked in the order train, test, validation.
        # NOTE(review): assuming create_frame returns a pandas DataFrame,
        # DataFrame.append is gone in pandas >= 2.0; switch to pd.concat
        # when the pandas version is raised.
        dataset = create_frame(train_X, train_y).append(
            create_frame(test_X, test_y)).append(create_frame(val_X, val_y))
        dataset.to_csv(data_path)
        print("Generated and saved the data at: %s" % data_path)
        return

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def dump_metrics(split, y_true, y_pred):
        # Compute the metrics before opening the file so that a failure here
        # does not leave an empty, truncated JSON file behind.
        ret = print_metrics_binary(y_true, y_pred)
        ret = {k: float(v) for k, v in ret.items()}
        path = os.path.join(result_dir,
                            '{}_{}.json'.format(split, file_name))
        with open(path, 'w') as res_file:
            json.dump(ret, res_file)

    # Train / validation are scored on the full predict_proba output, the
    # test split on the second column only, exactly as before.
    # NOTE(review): presumably print_metrics_binary accepts both layouts.
    dump_metrics('train', train_y, logreg.predict_proba(train_X))
    dump_metrics('val', val_y, logreg.predict_proba(val_X))

    prediction = logreg.predict_proba(test_X)[:, 1]
    dump_metrics('test', test_y, prediction)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
def calc_metrics(self, data_gen, history, dataset, logs):
    """Evaluate the multitask model on ``data_gen`` and record the metrics.

    Runs ``self.model.predict`` on ``data_gen.steps`` batches, keeps only the
    entries whose task mask equals 1 (phenotype entries are never masked),
    prints the metrics of the four tasks and stores them in ``logs``.

    Args:
        data_gen: batch source exposing ``steps``, ``target_repl`` and
            ``next(return_y_true=True)``; the latter must return
            ``(X, y, los_y_reg)``, where ``X[1]``, ``X[2]`` and ``X[3]`` are
            the in-hospital-mortality, decompensation and length-of-stay
            masks.
        history: object with ``append``; receives ``logs`` once all metrics
            have been added.
        dataset: string used as the key prefix, e.g. keys look like
            ``dataset + '_ihm_' + metric_name``.
        logs: mapping that is updated in place with every metric.

    Raises:
        ValueError: if ``self.partition`` is not ``'log'``, ``'custom'`` or
            ``'none'``. Previously such a value silently re-logged the
            decompensation metrics under the ``_los_`` keys.
    """
    # Fail before the (expensive) prediction loop and before `logs` is
    # partially filled in.
    if self.partition not in ('log', 'custom', 'none'):
        raise ValueError(
            "Unknown partition {!r}; expected 'log', 'custom' or "
            "'none'".format(self.partition))

    ihm_y_true, ihm_pred = [], []
    decomp_y_true, decomp_pred = [], []
    los_y_true, los_pred = [], []
    pheno_y_true, pheno_pred = [], []

    def collect_masked(mask, truth, pred, truth_out, pred_out):
        # Keep only the (truth, prediction) pairs whose mask entry equals 1.
        for (m, t, p) in zip(mask, truth, pred):
            if np.equal(m, 1):
                truth_out.append(t)
                pred_out.append(p)

    for i in range(data_gen.steps):
        if self.verbose == 1:
            print("\tdone {}/{}".format(i, data_gen.steps), end='\r')
        (X, y, los_y_reg) = data_gen.next(return_y_true=True)
        outputs = self.model.predict(X, batch_size=self.batch_size)

        ihm_M = X[1]
        decomp_M = X[2]
        los_M = X[3]

        if not data_gen.target_repl:  # no target replication
            (ihm_p, decomp_p, los_p, pheno_p) = outputs
            (ihm_t, decomp_t, los_t, pheno_t) = y
        else:  # target replication: the two extra outputs are ignored
            (ihm_p, _, decomp_p, los_p, pheno_p, _) = outputs
            (ihm_t, _, decomp_t, los_t, pheno_t, _) = y

        los_t = los_y_reg  # real value not the label

        # ihm
        collect_masked(ihm_M.flatten(), ihm_t.flatten(), ihm_p.flatten(),
                       ihm_y_true, ihm_pred)

        # decomp
        collect_masked(decomp_M.flatten(), decomp_t.flatten(),
                       decomp_p.flatten(), decomp_y_true, decomp_pred)

        # los: one value per step for regression, otherwise one row of 10
        # scores per step (presumably the 10 length-of-stay buckets).
        if los_p.shape[-1] == 1:  # regression
            los_rows = los_p.flatten()
        else:  # classification
            los_rows = los_p.reshape((-1, 10))
        collect_masked(los_M.flatten(), los_t.flatten(), los_rows,
                       los_y_true, los_pred)

        # pheno: rows of 25 values (presumably one per phenotype), no mask
        for (t, p) in zip(pheno_t.reshape((-1, 25)),
                          pheno_p.reshape((-1, 25))):
            pheno_y_true.append(t)
            pheno_pred.append(p)
    print('\n')

    # ihm
    print("\n ================= 48h mortality ================")
    ihm_pred = np.array(ihm_pred)
    # Expand p into the two-column layout [1 - p, p].
    ihm_pred = np.stack([1 - ihm_pred, ihm_pred], axis=1)
    ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)
    for k, v in ret.items():
        logs[dataset + '_ihm_' + k] = v

    # decomp
    print("\n ================ decompensation ================")
    decomp_pred = np.array(decomp_pred)
    decomp_pred = np.stack([1 - decomp_pred, decomp_pred], axis=1)
    ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)
    for k, v in ret.items():
        logs[dataset + '_decomp_' + k] = v

    # los
    print("\n ================ length of stay ================")
    if self.partition == 'log':
        los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
        ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
    elif self.partition == 'custom':
        los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
        ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
    else:  # 'none' -- validated at the top of the method
        ret = metrics.print_metrics_regression(los_y_true, los_pred)
    for k, v in ret.items():
        logs[dataset + '_los_' + k] = v

    # pheno
    print("\n =================== phenotype ==================")
    pheno_pred = np.array(pheno_pred)
    ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)
    for k, v in ret.items():
        logs[dataset + '_pheno_' + k] = v

    history.append(logs)
def main():
    """Fit and evaluate a logistic-regression in-hospital-mortality baseline.

    Command-line driven. Features are extracted from the train, validation
    and test readers, missing values are mean-imputed and the features
    standardized (imputer and scaler are both fitted on the training split
    only), and a ``LogisticRegression`` is fitted. The metrics of every
    split are dumped to ``<output_dir>/results/{train,val,test}_<name>.json``
    and the test predictions are handed to ``save_results`` with the target
    path ``<output_dir>/predictions/<name>.csv``.

    ``--data`` and ``--output_dir`` replace the formerly hard-coded locations;
    their defaults resolve to exactly the paths used before, so existing
    invocations keep working unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    # --l1 and --l2 write to the same destination; L2 is the default penalty.
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    # Default is relative to the current working directory, as the
    # hard-coded paths were.
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default='../../../data/in-hospital-mortality/')
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files '
                             'are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    # The validation split lives in the 'train' directory; only its listfile
    # differs from the training split. The trailing slashes keep the default
    # paths byte-identical to the previously hard-coded ones.
    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train/'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train/'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test/'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print(' train data shape = {}'.format(train_X.shape))
    print(' validation data shape = {}'.format(val_X.shape))
    print(' test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def dump_metrics(split, y_true, y_pred):
        # Compute the metrics before opening the file so that a failure here
        # does not leave an empty, truncated JSON file behind.
        ret = print_metrics_binary(y_true, y_pred)
        ret = {k: float(v) for k, v in ret.items()}
        path = os.path.join(result_dir,
                            '{}_{}.json'.format(split, file_name))
        with open(path, 'w') as res_file:
            json.dump(ret, res_file)

    # Train / validation are scored on the full predict_proba output, the
    # test split on the second column only, exactly as before.
    # NOTE(review): presumably print_metrics_binary accepts both layouts.
    dump_metrics('train', train_y, logreg.predict_proba(train_X))
    dump_metrics('val', val_y, logreg.predict_proba(val_X))

    prediction = logreg.predict_proba(test_X)[:, 1]
    dump_metrics('test', test_y, prediction)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
def do_epoch(mode, epoch):
    """Run one epoch of the multitask network and report loss and metrics.

    Steps the module-level ``network`` once per batch, gathers the unmasked
    predictions / labels of every task, periodically prints a progress line
    (every ``args.log_every`` batches) and finally prints the metrics of each
    task whose loss weight (``args.ihm_C``, ``args.los_C``, ``args.ph_C``,
    ``args.decomp_C``) is positive.

    All output goes through single-argument ``print(...)`` calls, which
    behave identically as a Python 2 print statement and as the Python 3
    print function.

    Args:
        mode: 'train' or 'test'; forwarded to ``network.step`` and
            ``network.get_batches_per_epoch``.
        epoch: epoch index, used only in the progress line.

    Returns:
        The loss averaged over all batches of the epoch.

    Raises:
        Exception: as soon as a batch reports a NaN loss.
    """
    ihm_predictions = []
    ihm_answers = []

    los_predictions = []
    los_answers = []

    ph_predictions = []
    ph_answers = []

    decomp_predictions = []
    decomp_answers = []

    avg_loss = 0.0  # running sum since the last progress line
    sum_loss = 0.0  # running sum over the whole epoch
    prev_time = time.time()

    batches_per_epoch = network.get_batches_per_epoch(mode)

    for i in range(0, batches_per_epoch):
        step_data = network.step(mode)

        ihm_pred = step_data["ihm_prediction"]
        los_pred = step_data["los_prediction"]
        ph_pred = step_data["ph_prediction"]
        decomp_pred = step_data["decomp_prediction"]

        current_loss = step_data["loss"]
        ihm_loss = step_data["ihm_loss"]
        los_loss = step_data["los_loss"]
        ph_loss = step_data["ph_loss"]
        decomp_loss = step_data["decomp_loss"]
        reg_loss = step_data["reg_loss"]

        data = step_data["data"]

        # In-hospital-mortality entries carry the mask at index 1 and the
        # label at index 2.
        ihm_data = data[1]
        ihm_mask = [x[1] for x in ihm_data]
        ihm_label = [x[2] for x in ihm_data]

        # Length-of-stay / decompensation entries are (mask, label) pairs,
        # each iterated element-wise below.
        los_data = data[2]
        los_mask = [x[0] for x in los_data]
        los_label = [x[1] for x in los_data]

        ph_data = data[3]
        ph_label = ph_data

        decomp_data = data[4]
        decomp_mask = [x[0] for x in decomp_data]
        decomp_label = [x[1] for x in decomp_data]

        avg_loss += current_loss
        sum_loss += current_loss

        for (x, mask, y) in zip(ihm_pred, ihm_mask, ihm_label):
            if (mask == 1):
                ihm_predictions.append(x)
                ihm_answers.append(y)

        for (sx, smask, sy) in zip(los_pred, los_mask, los_label):
            for (x, mask, y) in zip(sx, smask, sy):
                if (mask == 1):
                    los_predictions.append(x)
                    los_answers.append(y)

        # Phenotype predictions are never masked.
        for (x, y) in zip(ph_pred, ph_label):
            ph_predictions.append(x)
            ph_answers.append(y)

        for (sx, smask, sy) in zip(decomp_pred, decomp_mask, decomp_label):
            for (x, mask, y) in zip(sx, smask, sy):
                if (mask == 1):
                    decomp_predictions.append(x)
                    decomp_answers.append(y)

        if ((i + 1) % args.log_every == 0):
            cur_time = time.time()
            print(" {}ing {}.{} / {} loss: {:8.4f} = {:1.2f} + {:8.2f} + {:1.2f} + "
                  "{:1.2f} + {:.2f} avg_loss: {:6.4f} time: {:6.4f}".format(
                      mode, epoch, i * args.batch_size,
                      batches_per_epoch * args.batch_size,
                      float(current_loss),
                      float(ihm_loss), float(los_loss), float(ph_loss),
                      float(decomp_loss), float(reg_loss),
                      float(avg_loss / args.log_every),
                      float(cur_time - prev_time)))
            avg_loss = 0
            prev_time = cur_time

        if np.isnan(current_loss):
            # Show the individual loss terms before aborting.
            print("loss: {:6.4f} = {:1.2f} + {:8.2f} + {:1.2f} + {:1.2f} + {:.2f}".format(
                float(current_loss), float(ihm_loss), float(los_loss),
                float(ph_loss), float(decomp_loss), float(reg_loss)))
            raise Exception("current loss IS NaN. This should never happen :)")

    sum_loss /= batches_per_epoch
    print("\n %s loss = %.5f" % (mode, sum_loss))

    # A task is reported only when its loss weight is (numerically) non-zero.
    eps = 1e-13
    if args.ihm_C > eps:
        print("\n ================= 48h mortality ================")
        metrics.print_metrics_binary(ihm_answers, ihm_predictions)

    if args.los_C > eps:
        print("\n ================ length of stay ================")
        if args.partition == 'log':
            metrics.print_metrics_log_bins(los_answers, los_predictions)
        else:
            metrics.print_metrics_custom_bins(los_answers, los_predictions)

    if args.ph_C > eps:
        print("\n =================== phenotype ==================")
        metrics.print_metrics_multilabel(ph_answers, ph_predictions)

    if args.decomp_C > eps:
        print("\n ================ decompensation ================")
        metrics.print_metrics_binary(decomp_answers, decomp_predictions)

    return sum_loss