def calc_metrics(self, data, history, dataset, logs):
    """Predict over ``data`` batch by batch and record binary metrics.

    Args:
        data: indexable pair ``(inputs, targets)``. When
            ``self.target_repl`` is truthy the scored labels are taken from
            ``targets[0]`` and the scored predictions from the first model
            output; otherwise ``targets`` and the model output are used
            directly.
        history: list that receives the dict returned by
            ``metrics.print_metrics_binary``.
        dataset: string prefix for the keys written into ``logs``.
        logs: mapping updated in place with ``<dataset>_<metric>`` entries.
    """
    inputs = data[0]
    n_samples = len(inputs)
    batch = self.batch_size

    y_true = []
    predictions = []
    for start in range(0, n_samples, batch):
        if self.verbose == 1:
            print("\tdone {}/{}".format(start, n_samples), end='\r')
        stop = start + batch
        x = inputs[start:stop]

        if self.target_repl:
            # Only the first target and the first output are scored; the
            # second target (data[1][1]) is not needed for the metrics, so
            # it is no longer sliced here.
            y = data[1][0][start:stop]
            outputs = self.model.predict(x, batch_size=batch)[0]
        else:
            y = data[1][start:stop]
            outputs = self.model.predict(x, batch_size=batch)

        predictions.extend(np.array(outputs).flatten())
        y_true.extend(np.array(y).flatten())
    print('\n')

    # Expand the positive-class scores into two columns: [1 - p, p].
    predictions = np.array(predictions)
    predictions = np.stack([1 - predictions, predictions], axis=1)
    ret = metrics.print_metrics_binary(y_true, predictions)
    for k, v in ret.items():
        logs[dataset + '_' + k] = v
    history.append(ret)
def calc_metrics(self, data_gen, history, dataset, logs):
    """Evaluate the model on ``data_gen.steps`` batches and log binary metrics.

    Each batch is drawn with ``next(data_gen)`` as ``(x, y)``. With
    ``self.deep_supervision`` set, ``x[1]`` is used as a mask and only the
    positions whose mask value equals 1 are scored; otherwise every
    flattened target/prediction pair is scored.

    Args:
        data_gen: batch generator exposing ``steps`` and supporting ``next``.
        history: list that receives the dict returned by
            ``metrics.print_metrics_binary``.
        dataset: string prefix for the keys written into ``logs``.
        logs: mapping updated in place with ``<dataset>_<metric>`` entries.
    """
    labels = []
    scores = []
    for step in range(data_gen.steps):
        if self.verbose == 1:
            print("\tdone {}/{}".format(step, data_gen.steps), end='\r')
        x, y = next(data_gen)
        pred = self.model.predict(x, batch_size=self.batch_size)

        if not self.deep_supervision:
            labels += list(y.flatten())
            scores += list(pred.flatten())
            continue

        # Keep only the positions that the mask marks with 1.
        kept = [
            (target, score)
            for flag, target, score in zip(x[1].flatten(), y.flatten(),
                                           pred.flatten())
            if np.equal(flag, 1)
        ]
        labels.extend(target for target, _ in kept)
        scores.extend(score for _, score in kept)
    print('\n')

    # Expand the positive-class scores into two columns: [1 - p, p].
    scores = np.array(scores)
    scores = np.stack([1 - scores, scores], axis=1)
    ret = metrics.print_metrics_binary(labels, scores)
    for metric_name, value in ret.items():
        logs[dataset + '_' + metric_name] = value
    history.append(ret)
def main():
    """Train and evaluate an XGBoost baseline for in-hospital mortality.

    Steps, in order: parse the command line, build train/val/test readers,
    extract features, mean-impute and standardise them (both fitted on the
    training split only), fit an ``XGBRegressor``, write one metrics JSON per
    split under ``<output_dir>/results`` and the test predictions as CSV
    under ``<output_dir>/predictions``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--period', type=str, default='all',
        help='specifies which period extract features from',
        choices=['first4days', 'first8days', 'last12hours',
                 'first25percent', 'first50percent', 'all'])
    cli.add_argument(
        '--features', type=str, default='all',
        help='specifies what features to extract',
        choices=['all', 'len', 'all_but_len'])
    cli.add_argument(
        '--data', type=str,
        help='Path to the data of in-hospital mortality task',
        default=os.path.join(os.path.dirname(__file__),
                             '../../../data/in-hospital-mortality/'))
    cli.add_argument(
        '--output_dir', type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = cli.parse_args()
    print(args)

    def open_reader(subdir, listfile_name):
        # The validation split lives in the 'train' directory and is selected
        # purely through its list file.
        return InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, subdir),
            listfile=os.path.join(args.data, listfile_name),
            period_length=48.0)

    train_reader = open_reader('train', 'train_listfile.csv')
    val_reader = open_reader('train', 'val_listfile.csv')
    test_reader = open_reader('test', 'test_listfile.csv')

    print('Reading data and extracting features ...')
    train_X, train_y, _ = read_and_extract_features(
        train_reader, args.period, args.features)
    val_X, val_y, _ = read_and_extract_features(
        val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(
        test_reader, args.period, args.features)
    for label, features in (('train', train_X),
                            ('validation', val_X),
                            ('test', test_X)):
        print(' {} data shape = {}'.format(label, features.shape))

    print('Imputing missing values ...')
    # NOTE(review): `Imputer(..., axis=0)` is the legacy scikit-learn API —
    # confirm the pinned scikit-learn version still provides it.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X, val_X, test_X = (
        np.array(imputer.transform(part), dtype=np.float32)
        for part in (train_X, val_X, test_X))

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = (
        scaler.transform(part) for part in (train_X, val_X, test_X))

    # NOTE: the stem ends with '.', so the outputs are named
    # '<split>_xgboost_<period>.<features>..json' (double dot).
    file_name = 'xgboost_{}.{}.'.format(args.period, args.features)

    booster_params = dict(
        colsample_bytree=0.4,
        gamma=0,
        learning_rate=0.07,
        max_depth=3,
        min_child_weight=1.5,
        n_estimators=10000,
        reg_alpha=0.75,
        reg_lambda=0.45,
        subsample=0.6,
        seed=42)
    xgreg = xgb.XGBRegressor(**booster_params)
    xgreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def write_metrics(split, labels, get_scores):
        # The scores are produced lazily so the output file is opened before
        # prediction runs, matching the original statement order.
        target = os.path.join(result_dir,
                              '{}_{}.json'.format(split, file_name))
        with open(target, 'w') as res_file:
            report = print_metrics_binary(labels, get_scores())
            json.dump({k: float(v) for k, v in report.items()}, res_file)

    write_metrics('train', train_y, lambda: xgreg.predict(train_X))
    write_metrics('val', val_y, lambda: xgreg.predict(val_X))

    prediction = xgreg.predict(test_X)
    write_metrics('test', test_y, lambda: prediction)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
def calc_metrics(self, data_gen, history, dataset, logs):
    """Evaluate the multitask model and log metrics for all four tasks.

    For each of ``data_gen.steps`` batches the model outputs are split into
    in-hospital mortality (ihm), decompensation (decomp), length of stay
    (los) and phenotyping (pheno). ``X[1]``, ``X[2]`` and ``X[3]`` are used
    as masks for ihm, decomp and los respectively: only positions whose mask
    value equals 1 are scored. Phenotype rows are never masked.

    Args:
        data_gen: batch generator exposing ``steps``, ``target_repl`` and
            ``next(return_y_true=True)`` returning ``(X, y, los_y_reg)``.
        history: list that receives ``logs`` once all tasks are evaluated.
        dataset: string prefix for the keys written into ``logs``.
        logs: mapping updated in place with ``<dataset>_<task>_<metric>``
            entries.

    Raises:
        ValueError: if ``self.partition`` is not 'log', 'custom' or 'none'.
            Previously such a value silently logged the decompensation
            metrics under the ``_los_`` keys.
    """
    if self.partition not in ('log', 'custom', 'none'):
        raise ValueError(
            "Wrong value for self.partition: {!r}".format(self.partition))

    def collect_masked(mask, targets, preds, out_true, out_pred):
        # Append the (target, prediction) pairs whose mask entry equals 1.
        for m, t, p in zip(mask, targets, preds):
            if np.equal(m, 1):
                out_true.append(t)
                out_pred.append(p)

    ihm_y_true, ihm_pred = [], []
    decomp_y_true, decomp_pred = [], []
    los_y_true, los_pred = [], []
    pheno_y_true, pheno_pred = [], []

    for i in range(data_gen.steps):
        if self.verbose == 1:
            print("\tdone {}/{}".format(i, data_gen.steps), end='\r')
        (X, y, los_y_reg) = data_gen.next(return_y_true=True)
        outputs = self.model.predict(X, batch_size=self.batch_size)

        ihm_M = X[1]
        decomp_M = X[2]
        los_M = X[3]

        if not data_gen.target_repl:  # no target replication
            (ihm_p, decomp_p, los_p, pheno_p) = outputs
            (ihm_t, decomp_t, los_t, pheno_t) = y
        else:  # target replication: the replicated heads are ignored
            (ihm_p, _, decomp_p, los_p, pheno_p, _) = outputs
            (ihm_t, _, decomp_t, los_t, pheno_t, _) = y

        los_t = los_y_reg  # real value not the label

        # ihm
        collect_masked(ihm_M.flatten(), ihm_t.flatten(), ihm_p.flatten(),
                       ihm_y_true, ihm_pred)

        # decomp
        collect_masked(decomp_M.flatten(), decomp_t.flatten(),
                       decomp_p.flatten(), decomp_y_true, decomp_pred)

        # los: one value per position for regression, otherwise one row of
        # 10 bin scores per position.
        if los_p.shape[-1] == 1:
            los_rows = los_p.flatten()
        else:
            los_rows = los_p.reshape((-1, 10))
        collect_masked(los_M.flatten(), los_t.flatten(), los_rows,
                       los_y_true, los_pred)

        # pheno: rows of 25 label scores, no mask
        for (t, p) in zip(pheno_t.reshape((-1, 25)),
                          pheno_p.reshape((-1, 25))):
            pheno_y_true.append(t)
            pheno_pred.append(p)
    print('\n')

    # ihm
    print("\n ================= 48h mortality ================")
    ihm_pred = np.array(ihm_pred)
    ihm_pred = np.stack([1 - ihm_pred, ihm_pred], axis=1)
    ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred)
    for k, v in ret.items():
        logs[dataset + '_ihm_' + k] = v

    # decomp
    print("\n ================ decompensation ================")
    decomp_pred = np.array(decomp_pred)
    decomp_pred = np.stack([1 - decomp_pred, decomp_pred], axis=1)
    ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred)
    for k, v in ret.items():
        logs[dataset + '_decomp_' + k] = v

    # los
    print("\n ================ length of stay ================")
    if self.partition == 'log':
        los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred]
        ret = metrics.print_metrics_log_bins(los_y_true, los_pred)
    elif self.partition == 'custom':
        los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
        ret = metrics.print_metrics_custom_bins(los_y_true, los_pred)
    else:  # 'none' (validated at the top of the function)
        ret = metrics.print_metrics_regression(los_y_true, los_pred)
    for k, v in ret.items():
        logs[dataset + '_los_' + k] = v

    # pheno
    print("\n =================== phenotype ==================")
    pheno_pred = np.array(pheno_pred)
    ret = metrics.print_metrics_multilabel(pheno_y_true, pheno_pred)
    for k, v in ret.items():
        logs[dataset + '_pheno_' + k] = v

    # NOTE(review): this appends the caller's `logs` object itself, not a
    # copy, so later mutations of `logs` are visible through `history` —
    # confirm that is intended.
    history.append(logs)
eval_set = [(train_raw_reshape, train_raw[1]), (val_raw_reshape, val_raw[1])] xgreg.fit(train_raw_reshape, train_raw[1], eval_metric='auc,auroc', eval_set=eval_set, verbose=True, early_stopping_rounds=80) result_dir = os.path.join(args.output_dir, 'results') common_utils.create_directory(result_dir) with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(train_raw[1], xgreg.predict(train_raw_reshape)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(val_raw[1], xgreg.predict(val_raw_reshape)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) time_start = time.time() prediction = xgreg.predict(test_raw_reshape) time_elapse = time.time() - time_start print("Processing time on Test set :", time_elapse, " s") with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
los_pred.append(p) # pheno pheno_names += list(names) pheno_ts += list(ret["pheno_ts"]) for (t, p) in zip(pheno_t.reshape((-1, 25)), pheno_p.reshape( (-1, 25))): pheno_y_true.append(t) pheno_pred.append(p) print('\n') # ihm if args.ihm_C > 0: print("\n ================= 48h mortality ================") ihm_pred = np.array(ihm_pred) ihm_ret = metrics.print_metrics_binary(ihm_y_true, ihm_pred) # decomp if args.decomp_C > 0: print("\n ================ decompensation ================") decomp_pred = np.array(decomp_pred) decomp_ret = metrics.print_metrics_binary(decomp_y_true, decomp_pred) # los if args.los_C > 0: print("\n ================ length of stay ================") if args.partition == 'log': los_pred = [metrics.get_estimate_log(x, 10) for x in los_pred] los_ret = metrics.print_metrics_log_bins(los_y_true, los_pred) if args.partition == 'custom': los_pred = [metrics.get_estimate_custom(x, 10) for x in los_pred]
def main():
    """Train and evaluate a logistic-regression baseline for in-hospital mortality.

    Steps, in order: parse the command line, build train/val/test readers,
    extract features, mean-impute and standardise them (both fitted on the
    training split only), fit ``LogisticRegression`` with the requested
    penalty and ``C``, write one metrics JSON per split under
    ``<output_dir>/results``, time the test-set prediction, and store the
    test predictions as CSV under ``<output_dir>/predictions``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--C', type=float, default=1.0,
                     help='inverse of L1 / L2 regularization')
    # --l1 and --l2 toggle the same destination; L2 is the default.
    cli.add_argument('--l1', dest='l2', action='store_false')
    cli.add_argument('--l2', dest='l2', action='store_true')
    cli.set_defaults(l2=True)
    cli.add_argument(
        '--period', type=str, default='all',
        help='specifies which period extract features from',
        choices=['first4days', 'first8days', 'last12hours',
                 'first25percent', 'first50percent', 'all'])
    cli.add_argument(
        '--features', type=str, default='all',
        help='specifies what features to extract',
        choices=['all', 'len', 'all_but_len'])
    cli.add_argument(
        '--data', type=str,
        help='Path to the data of in-hospital mortality task',
        default=os.path.join(os.path.dirname(__file__),
                             '../../../data/in-hospital-mortality/'))
    cli.add_argument(
        '--output_dir', type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = cli.parse_args()
    print(args)

    def open_reader(subdir, listfile_name):
        # The validation split lives in the 'train' directory and is selected
        # purely through its list file.
        return InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, subdir),
            listfile=os.path.join(args.data, listfile_name),
            period_length=48.0)

    train_reader = open_reader('train', 'train_listfile.csv')
    val_reader = open_reader('train', 'val_listfile.csv')
    test_reader = open_reader('test', 'test_listfile.csv')

    print('Reading data and extracting features ...')
    train_X, train_y, _ = read_and_extract_features(
        train_reader, args.period, args.features)
    val_X, val_y, _ = read_and_extract_features(
        val_reader, args.period, args.features)
    test_X, test_y, test_names = read_and_extract_features(
        test_reader, args.period, args.features)
    for label, features in (('train', train_X),
                            ('validation', val_X),
                            ('test', test_X)):
        print(' {} data shape = {}'.format(label, features.shape))

    print('Imputing missing values ...')
    # NOTE(review): `Imputer(..., axis=0)` is the legacy scikit-learn API —
    # confirm the pinned scikit-learn version still provides it.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X, val_X, test_X = (
        np.array(imputer.transform(part), dtype=np.float32)
        for part in (train_X, val_X, test_X))

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X, val_X, test_X = (
        scaler.transform(part) for part in (train_X, val_X, test_X))

    if args.l2:
        penalty = 'l2'
    else:
        penalty = 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    def write_metrics(split, labels, get_scores):
        # The scores are produced lazily so the output file is opened before
        # prediction runs, matching the original statement order.
        target = os.path.join(result_dir,
                              '{}_{}.json'.format(split, file_name))
        with open(target, 'w') as res_file:
            report = print_metrics_binary(labels, get_scores())
            json.dump({k: float(v) for k, v in report.items()}, res_file)

    # Train/val are scored on the full two-column probability matrix.
    write_metrics('train', train_y, lambda: logreg.predict_proba(train_X))
    write_metrics('val', val_y, lambda: logreg.predict_proba(val_X))

    # Test is scored (and saved) on the positive-class column only; the
    # prediction call is timed on its own.
    time_start = time.time()
    prediction = logreg.predict_proba(test_X)[:, 1]
    time_elapse = time.time() - time_start
    print("Processing time on Test set :", time_elapse, " s")

    write_metrics('test', test_y, lambda: prediction)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
model_batch_loss.append(model_loss.cpu().detach().numpy()) decov_batch_loss.append(decov_loss.cpu().detach().numpy()) y_pred += list(output.cpu().detach().numpy().flatten()) y_true += list(batch_y.cpu().numpy().flatten()) valid_loss.append(np.mean(np.array(batch_loss))) valid_model_loss.append(np.mean(np.array(model_batch_loss))) valid_decov_loss.append(np.mean(np.array(decov_batch_loss))) print("\n==>Predicting on validation") print('Valid Loss = %.4f' % (valid_loss[-1])) print('valid_model Loss = %.4f' % (valid_model_loss[-1])) print('valid_decov Loss = %.4f' % (valid_decov_loss[-1])) y_pred = np.array(y_pred) y_pred = np.stack([1 - y_pred, y_pred], axis=1) ret = metrics.print_metrics_binary(y_true, y_pred) history.append(ret) print() cur_auroc = ret['auroc'] if cur_auroc > max_roc: max_roc = cur_auroc state = { 'net': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': each_epoch } torch.save(state, file_name) print('\n------------ Save best model ------------\n')
test_loss = torch.neg(torch.sum(test_loss)) cur_test_loss.append(test_loss.cpu().detach().numpy()) for m, t, p in zip( test_mask.cpu().numpy().flatten(), test_y.cpu().numpy().flatten(), test_output.cpu().detach().numpy().flatten()): if np.equal(m, 1): test_true.append(t) test_pred.append(p) print('Test loss = %.4f' % (np.mean(np.array(cur_test_loss)))) print('\n') test_pred = np.array(test_pred) test_pred = np.stack([1 - test_pred, test_pred], axis=1) test_ret = metrics.print_metrics_binary(test_true, test_pred) else: ''' Prepare training data''' print('Preparing training data ... ') train_data_loader = common_utils.DeepSupervisionDataLoader( dataset_dir=os.path.join(args.data_path, 'train'), listfile=os.path.join(args.data_path, 'train_listfile.csv'), small_part=args.small_part) val_data_loader = common_utils.DeepSupervisionDataLoader( dataset_dir=os.path.join(args.data_path, 'train'), listfile=os.path.join(args.data_path, 'val_listfile.csv'), small_part=args.small_part) discretizer = Discretizer(timestep=1.0, store_masks=True, impute_strategy='previous',
test_reader, discretizer, normalizer, args.batch_size, None, shuffle=False, return_names=True) # put steps = None for a full test for i in range(test_data_gen.steps): print("predicting {} / {}".format(i, test_data_gen.steps), end='\r') ret = next(test_data_gen) x, y = ret["data"] cur_names = ret["names"] cur_ts = ret["ts"] x = np.array(x) pred = model.predict_on_batch(x)[:, 0] predictions += list(pred) labels += list(y) names += list(cur_names) ts += list(cur_ts) metrics.print_metrics_binary(labels, predictions) path = os.path.join(args.output_dir, 'test_predictions', os.path.basename(args.load_state)) + '.csv' preprocessing.save_results(names, ts, predictions, labels, path) else: raise ValueError("Wrong value for args.mode")
cur_val_loss.append(loss.cpu().detach().numpy()) for t, p in zip( val_y.cpu().numpy().flatten(), val_output.cpu().detach().numpy().flatten(), ): val_true.append(t) val_pred.append(p) cur_val_loss = np.mean(np.array(cur_val_loss)) scheduler.step(cur_val_loss) print("Validation loss = {:.6f}".format(cur_val_loss)) val_loss.append(cur_val_loss) print("\n") val_pred = np.array(val_pred) val_pred = np.stack([1 - val_pred, val_pred], axis=1) ret = metrics.print_metrics_binary(val_true, val_pred) cur_auroc = ret["auroc"] if cur_auroc > max_auroc: max_auroc = cur_auroc state = { "net": model.state_dict(), "optimizer": optimizer.state_dict(), "epoch": epoch, "params": model_para, "train_loss": train_loss, "val_loss": val_loss, } torch.save(state, file_name) print("\n------------ Save the best model ------------\n") end_time = time.time() print("total used time = {}".format(end_time - start_time))