def evaluate(epoch, early_stopping, model, loss_func, dls):
    """
    Compute the sample-weighted loss on each data loader and feed early stopping.

    The loaders in `dls` are paired positionally with the labels "[train]" and
    "[valid]". Losses are computed with gradients disabled.

    :param epoch: epoch index; forwarded to `early_stopping` and placed first in the log row
    :param early_stopping: callable invoked as `early_stopping(loss, model, epoch)` for the "[valid]" loader
    :param model: model handed to `loss_batch`
    :param loss_func: loss object; when its class is named "DALoss" each batch is unpacked as
        (loss, mse, nll, n), otherwise as (loss, n)
    :param dls: data loaders, in [train, valid] order
    :return: the `early_stopping` object and the flattened log row (epoch, then label/loss entries)
    """
    labels = ["[train]", "[valid]"]
    is_da_loss = loss_func.__class__.__name__ == "DALoss"
    per_loader = []

    with torch.no_grad():
        for loader, label in zip(dls, labels):
            batch_outputs = [
                loss_batch(model, loss_func, xb, yb) for xb, yb in loader
            ]

            if is_da_loss:
                losses, mse, nll, nums = zip(*batch_outputs)
                n_total = np.sum(nums)
                # Weighted averages of the total loss and of its two components.
                loader_loss = [
                    np.sum(np.multiply(component, nums)) / n_total
                    for component in (losses, mse, nll)
                ]
                # Early stopping monitors the second entry (the `mse` term).
                monitored = loader_loss[1]
            else:
                losses, nums = zip(*batch_outputs)
                loader_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
                monitored = loader_loss

            per_loader.append(loader_loss)

            if label == "[valid]":
                early_stopping(monitored, model, epoch)

    labelled = np.c_[labels, per_loader].ravel()
    res = np.r_[epoch, labelled]
    printd(*res)
    return early_stopping, res
def fit(epochs, batch_size, model, loss_func, opt, train_ds, valid_ds,
        patience, checkpoint_file):
    """
    Fit the model on the training data with early stopping.

    The model is evaluated once before any training, then after every epoch,
    on both the training and validation loaders. Training stops as soon as
    `early_stopping.early_stop` is set. `early_stopping.save()` is called once
    the loop is over, whether or not it stopped early.

    :param epochs: maximum number of training epochs
    :param batch_size: batch size forwarded to `create_dataloaders_from_datasets`
    :param model: model to train; switched between `train()` and `eval()` modes
    :param loss_func: loss object handed to `loss_batch` and `evaluate`
    :param opt: optimizer handed to `loss_batch` during training
    :param train_ds: training dataset
    :param valid_ds: validation dataset
    :param patience: patience forwarded to `EarlyStopping`
    :param checkpoint_file: path forwarded to `EarlyStopping` as `path_`
    :return: None
    """
    train_dl, valid_dl = create_dataloaders_from_datasets(
        train_ds, valid_ds, batch_size)
    loaders = [train_dl, valid_dl]
    early_stopping = EarlyStopping(patience=patience, path_=checkpoint_file)

    # Baseline evaluation before any training step.
    model.eval()
    early_stopping, _ = evaluate(0, early_stopping, model, loss_func, loaders)

    for epoch in range(epochs):
        model.train()
        # loss_batch is called only for its side effects here, so a plain loop
        # is used instead of building (and discarding) a list and a zip object.
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        model.eval()
        # NOTE(review): the evaluation after the first training epoch is
        # labelled 0, like the baseline above - confirm this is intended.
        early_stopping, _ = evaluate(epoch, early_stopping, model, loss_func,
                                     loaders)

        if early_stopping.early_stop:
            printd("Early Stopped.")
            break

    early_stopping.save()
def main_standard(dataset, subject, model, params, exp, eval_set, ph):
    """
    Run the standard pipeline for one subject and save the raw results.

    Steps: preprocessing, training/prediction through
    `make_predictions_pclstm`, post-processing, then
    `ResultsSubject.save_raw_results`.

    :param dataset: name of the dataset
    :param subject: subject identifier
    :param model: model name, resolved with `locate_model`
    :param params: parameters name, resolved with `locate_params`
    :param exp: experiment name forwarded to `ResultsSubject`
    :param eval_set: forwarded as `mode` to `make_predictions_pclstm`
    :param ph: prediction horizon, divided by `cs.freq` before use
    :return: None
    """
    printd(dataset, subject, model, params, exp, eval_set, ph)

    # Resolve the parameter set and the model class from their names.
    params = locate_params(params)
    model_class = locate_model(model)

    # Scale the variables in minutes to the benchmark sampling frequency.
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    # --- preprocessing ---
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f,
                                                hist_f, day_len_f)

    # --- model training ---
    predictions = make_predictions_pclstm(subject, model_class, params, ph_f,
                                          train, valid, test, scalers,
                                          mode=eval_set)

    # --- post-processing ---
    predictions = postprocessing(predictions, scalers, dataset)

    # --- evaluation ---
    subject_results = ResultsSubject(model, exp, ph, dataset, subject,
                                     params=params, results=predictions)
    subject_results.save_raw_results()
def main(dataset, subject, model, params, exp, mode, log, ph, plot):
    """
    Run the full pipeline for one subject and print the computed results.

    Steps: preprocessing, optional hyperparameter search, prediction,
    post-processing, evaluation, optional plot.

    :param dataset: name of the dataset
    :param subject: subject identifier
    :param model: model name, resolved with `locate_model`
    :param params: parameters name, resolved with `locate_params` and `locate_search`
    :param exp: experiment name forwarded to `ResultsSubject`
    :param mode: forwarded as `mode` to `make_predictions`
    :param log: only echoed in the first log line by this function
    :param ph: prediction horizon, divided by `cs.freq` before use
    :param plot: when truthy, `results.plot(0)` is called at the end
    :return: None
    """
    printd(dataset, subject, model, params, exp, mode, log, ph, plot)

    # Resolve the search space, parameter set and model class from their names.
    search = locate_search(params)
    params = locate_params(params)
    model_class = locate_model(model)

    # Scale the variables in minutes to the benchmark sampling frequency.
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    # --- preprocessing ---
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f,
                                                hist_f, day_len_f)

    # --- model training & tuning ---
    if search:
        params = find_best_hyperparameters(subject, model_class, params,
                                           search, ph_f, train, valid, test)
    predictions = make_predictions(subject, model_class, params, ph_f, train,
                                   valid, test, mode=mode)

    # --- post-processing ---
    predictions = postprocessing(predictions, scalers, dataset)

    # --- evaluation ---
    results = ResultsSubject(model, exp, ph, dataset, subject, params=params,
                             results=predictions)
    printd(results.compute_results())

    if plot:
        results.plot(0)
def comparison_all(mode, variables, metrics, patients=None):
    """
    Compare the saved results variable by variable, globally or per patient.

    With `patients=None`, the global "metrics.npy" file is loaded and the
    output is stored under the key "global". Otherwise each patient's
    "results.npy" file is loaded and stored under "patient <i>".

    :param mode: sub-directory of the "idiab"/"lstm" study folder to read from
    :param variables: variables handed one by one to `comparison`
    :param metrics: metrics handed to `comparison`
    :param patients: iterable of patient identifiers, or None for the global file
    :return: (compare_dict, compare_mean), both keyed by scope, then by variable
    """
    compare_dict, compare_mean = {}, {}

    if patients is not None:
        for i in patients:
            key = "patient " + str(i)
            printd("-------------------------------- Patient", str(i),
                   "--------------------------------")
            file = os.path.join(cs.path, "study", "idiab", "lstm", mode, key,
                                "results.npy")
            _, results = np.load(file, allow_pickle=True)

            stats_by_var = compare_dict[key] = {}
            means_by_var = compare_mean[key] = {}
            for variable in variables:
                stats, means = comparison(results, variable, metrics)
                stats_by_var[variable] = stats
                means_by_var[variable] = means
                print_dict_stats_physio(stats, variable, i)
            print_dict_latex_physio(means_by_var, i)
        return compare_dict, compare_mean

    printd(
        "-------------------------------- Global -------------------------------"
    )
    file = os.path.join(cs.path, "study", "idiab", "lstm", mode, "metrics.npy")
    _, results = np.load(file, allow_pickle=True)

    stats_by_var = compare_dict["global"] = {}
    means_by_var = compare_mean["global"] = {}
    for variable in variables:
        stats, means = comparison(results, variable, metrics)
        stats_by_var[variable] = stats
        means_by_var[variable] = means
        print_dict_stats_physio(stats, variable)
    print_dict_latex_physio(means_by_var)

    return compare_dict, compare_mean
def fit(self):
    """
    Build the network, loss and optimizer, then train with early stopping.

    Reads the "train" and "valid" sets through `_str2dataset`, picks a random
    checkpoint file name under `cs.path`/tmp/checkpoints, instantiates
    `FFNN_Module` on the GPU and delegates the training loop to the
    module-level `fit` function.

    :return: None
    """
    # The third element of each set is not needed for training.
    x_train, y_train, _ = self._str2dataset("train")
    x_valid, y_valid, _ = self._str2dataset("valid")

    # Random suffix for the checkpoint file name. randint is documented to
    # take integers, hence the explicit int() around the float literal.
    rnd = np.random.randint(int(1e7))
    self.checkpoint_file = os.path.join(
        cs.path, "tmp", "checkpoints",
        self.__class__.__name__ + "_" + str(rnd) + ".pt")
    printd("Saved model's file:", self.checkpoint_file)

    # x_train.shape[1] is passed as the first constructor argument.
    self.model = self.FFNN_Module(x_train.shape[1], self.params["hidden"],
                                  self.params["cell_type"],
                                  self.params["dropout"])
    self.model.cuda()

    self.loss_func = nn.MSELoss()
    # The "l2" parameter is applied as Adam's weight decay.
    self.opt = torch.optim.Adam(self.model.parameters(),
                                lr=self.params["lr"],
                                weight_decay=self.params["l2"])

    train_ds = self.to_dataset(x_train, y_train)
    valid_ds = self.to_dataset(x_valid, y_valid)

    fit(self.params["epochs"], self.params["batch_size"], self.model,
        self.loss_func, self.opt, train_ds, valid_ds, self.params["patience"],
        self.checkpoint_file)
def save_checkpoint(self, val_loss, model):
    """
    Write the model's state dict to `self.path` and record the loss.

    :param val_loss: validation loss; stored in `self.val_loss_min` after saving
    :param model: object whose `state_dict()` is serialized with `torch.save`
    :return: None
    """
    if self.verbose:
        message = f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...'
        printd(message)

    weights = model.state_dict()
    torch.save(weights, self.path)
    self.val_loss_min = val_loss
def _str2dataset(self, dataset_name):
    """
    Return the (x, y, t) attributes of the set designated by `dataset_name`.

    Accepted names are "train"/"training", "valid"/"validation" and
    "test"/"testing". Any other name logs a message and exits the process
    with status -1.

    :param dataset_name: name of the set to fetch
    :return: tuple of the `<set>_x`, `<set>_y` and `<set>_t` attributes
    """
    aliases_to_prefix = (
        (["train", "training"], "train"),
        (["valid", "validation"], "valid"),
        (["test", "testing"], "test"),
    )
    for aliases, prefix in aliases_to_prefix:
        if dataset_name in aliases:
            # Attributes are read lazily, only for the requested set.
            return (getattr(self, prefix + "_x"),
                    getattr(self, prefix + "_y"),
                    getattr(self, prefix + "_t"))

    printd("Dataset name not known.")
    sys.exit(-1)
def save_raw_results(self):
    """
    Save the parameters and the computed results as "results.npy".

    The file is written under
    `cs.path`/study/<dataset>/<model>/<mode>/patient <subject>; the directory
    is created if needed. The saved object is the two-element list
    [compute_params(), compute_results()].

    :return: None
    """
    out_dir = os.path.join(cs.path, "study", self.dataset, self.model,
                           self.mode, "patient " + self.subject)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    savable_results = self.compute_results()
    printd("Global results for patient", self.subject,
           " with all experiments saved at", out_dir)

    target = os.path.join(out_dir, "results.npy")
    payload = [self.compute_params(), savable_results]
    np.save(target, payload)
def main_cgega_iterative_training(dataset, subject, model, params1, params2,
                                  exp, eval_set, ph, save_iter=False):
    """
    Run the iterative training pipeline for one subject and save its results.

    Steps: preprocessing, `progressive_improvement_clinical_acceptability`,
    post-processing of the test results and of the per-iteration validation
    results, then saving through `ResultsSubject` (and `ResultsSubjectPICA`
    when `save_iter` is set).

    :param dataset: name of the dataset
    :param subject: subject identifier
    :param model: model name, resolved with `locate_model`
    :param params1: first parameters name, resolved with `locate_params`; its "hist" entry sets the history length
    :param params2: second parameters name, resolved with `locate_params`
    :param exp: experiment name; also used as a sub-directory for the weights file
    :param eval_set: forwarded to `progressive_improvement_clinical_acceptability`
    :param ph: prediction horizon; the unscaled value is passed to the training routine
    :param save_iter: when truthy, the per-iteration validation results are saved as well
    :return: None
    """
    printd(dataset, subject, model, params1, params2, exp, eval_set, ph)

    # Resolve the two parameter sets and the model class from their names.
    params1 = locate_params(params1)
    params2 = locate_params(params2)
    model_class = locate_model(model)

    # Scale the variables in minutes to the benchmark sampling frequency.
    ph_f = ph // cs.freq
    hist_f = params1["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq
    freq_ds = misc.datasets.datasets[dataset]["glucose_freq"]

    # --- preprocessing ---
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f,
                                                hist_f, day_len_f)

    # --- model training ---
    weights_dir = join(cs.path, "processing", "models", "weights", "cg_ega")
    file = join(weights_dir, exp,
                model_class.__name__ + "_" + dataset + subject)

    results_test, results_valid_iter = progressive_improvement_clinical_acceptability(
        subject, model_class, params1, params2, ph, freq_ds, train, valid,
        test, scalers, file, eval_set)

    results_test = postprocessing(results_test, scalers, dataset)
    results_valid_iter = postprocessing_all_iter(results_valid_iter, scalers,
                                                 dataset)

    both_params = [params1, params2]
    ResultsSubject(model, exp, ph, dataset, subject, params=both_params,
                   results=results_test).save_raw_results()

    if save_iter:
        ResultsSubjectPICA(model, exp, ph, dataset, subject,
                           params=[params1, params2],
                           results=results_valid_iter).save_raw_results()
def predict(self, dataset):
    """
    Forecast `self.ph` steps ahead for every sample of the given set.

    For each (endog, exog, exog_oos) triple, the model is re-applied to the
    sample with `self.model.apply`, and the last value of the forecast is kept
    as the prediction.

    :param dataset: key of `self.data_dict` selecting the data to predict on
    :return: output of `self._format_results(y_true, y_pred, t)`
    """
    endog, exog, exog_oos, y_true, t = self.data_dict[dataset]
    horizon = self.ph

    def last_forecast(endog_i, exog_i, exog_oos_i):
        # Only the final step of the forecast is used.
        applied = self.model.apply(endog_i, exog_i)
        return applied.forecast(steps=horizon, exog=exog_oos_i)[-1]

    y_pred = [last_forecast(*sample) for sample in zip(endog, exog, exog_oos)]

    printd("end predict")
    return self._format_results(y_true, y_pred, t)
def params_search(grid):
    """
    Return the entry of `grid` with the lowest mean RMSE in "valid" mode.

    Each candidate is evaluated with `make_predictions(..., mode="valid")`;
    `rmse` is computed on every element of the returned results and the
    per-candidate mean is minimized. The names `subject`, `model_class`, `ph`,
    `train`, `valid` and `test` come from the enclosing scope.

    :param grid: indexable collection of candidate parameter sets
    :return: the candidate with the smallest mean RMSE
    """
    scores = []
    for candidate in grid:
        predictions = make_predictions(subject, model_class, candidate, ph,
                                       train, valid, test, mode="valid")
        candidate_rmse = [rmse(result) for result in predictions]
        scores.append(candidate_rmse)
        printd(candidate, candidate_rmse)

    # After the transpose, axis 0 runs over the results of one candidate.
    mean_per_candidate = np.mean(np.transpose(scores), axis=0)
    best_index = np.argmin(mean_per_candidate)
    return grid[best_index]
def preprocessing_source_multi(source_datasets, target_dataset, target_subject,
                               ph, hist, day_len):
    """
    Preprocessing for multi-source training.

    - preprocess every subject of the source datasets, excluding the target
      subject when it belongs to one of them;
    - assign a distinct "domain" number to every kept subject;
    - merge each subject's training and validation sets (first split only),
      and use its testing set as both validation and testing set;
    - concatenate the sets of all the subjects.

    :param source_datasets: names of the source datasets, separated by "+" if several (e.g., "idiab+ohio")
    :param target_dataset: target dataset (i.e., "idiab" or "ohio")
    :param target_subject: target subject within the target dataset (e.g., "559" if target_dataset is "ohio")
    :param ph: prediction horizon
    :param hist: history length
    :param day_len: length of a day
    :return: [train], [valid], [test] as one-element lists of merged DataFrames, and the list
        of per-subject scalers
    """
    train_parts, valid_parts, test_parts, scalers_ds = [], [], [], []
    next_domain = 0

    for source_dataset in source_datasets.split("+"):
        for source_subject in misc.datasets.datasets[source_dataset][
                "subjects"]:
            is_target = (target_dataset == source_dataset
                         and target_subject == source_subject)
            if is_target:
                continue

            printd("Preprocessing " + source_dataset + source_subject + "...")

            n_days_test = misc.datasets.datasets[source_dataset]["n_days_test"]
            preprocess = preprocessing_per_dataset[source_dataset]
            train_sbj, valid_sbj, test_sbj, scalers_sbj = preprocess(
                source_dataset, source_subject, ph, hist, day_len, n_days_test)

            # No cross-validation for source training: train and valid are
            # concatenated, and the evaluation is done on test.
            train = pd.concat([train_sbj[0],
                               valid_sbj[0]]).sort_values("datetime")
            valid = test_sbj[0]
            test = test_sbj[0]

            # Tag every sample with the subject's domain number.
            train["domain"] = next_domain
            valid["domain"] = next_domain
            test["domain"] = next_domain
            next_domain += 1

            train_parts.append(train)
            valid_parts.append(valid)
            test_parts.append(test)
            scalers_ds.append(scalers_sbj[0])

    merged_train = pd.concat(train_parts)
    merged_valid = pd.concat(valid_parts)
    merged_test = pd.concat(test_parts)

    return [merged_train], [merged_valid], [merged_test], scalers_ds
def evaluation(raw_results, scalers, source_dataset, target_dataset,
               target_subject, model, params, exp, plot, tl_mode):
    """
    Post-process the raw results and compute their mean/std metrics.

    The experiment name is suffixed with the second "_"-separated token of
    `tl_mode` and placed under the "<source>_2_<target>" directory.

    :param raw_results: raw predictions, handed to `postprocessing`
    :param scalers: scalers handed to `postprocessing`
    :param source_dataset: name of the source dataset(s)
    :param target_dataset: name of the target dataset
    :param target_subject: target subject identifier
    :param model: model class; its `__name__` is forwarded to `ResultsSubject`
    :param params: parameters forwarded to `ResultsSubject`
    :param exp: base experiment name
    :param plot: when truthy, `results.plot(0)` is called
    :param tl_mode: transfer-learning mode; must contain at least one "_"
    :return: output of `results.compute_mean_std_results()`
    """
    raw_results = postprocessing(raw_results, scalers, target_dataset)

    mode_suffix = tl_mode.split("_")[1]
    exp_path = os.path.join(source_dataset + "_2_" + target_dataset,
                            exp + "_" + mode_suffix)

    # NOTE(review): `ph` is not a parameter here; it is presumably a
    # module-level name - confirm it is defined where this function lives.
    results = ResultsSubject(model.__name__, exp_path, ph, target_dataset,
                             target_subject, params=params,
                             results=raw_results)

    res_mean = results.compute_mean_std_results()
    printd(res_mean)

    if plot:
        results.plot(0)

    return res_mean
def combinations(dataset, model, params, mode, ph, features_comb, number_comb,
                 patients):
    """
    Build the feature combinations and the patient list used for processing.

    Combinations containing both members of one of the pairs CPB/CHO,
    IOB/insulin or AOB/steps are discarded.

    :param dataset: dataset handed to `all_features` when `features_comb` is None
    :param model: only echoed in the log line
    :param params: only echoed in the log line
    :param mode: only echoed in the log line
    :param ph: prediction horizon, only echoed in the log line (reported in minutes)
    :param features_comb: comma-separated feature names, or None to use `all_features(dataset)`
    :param number_comb: comma-separated combination sizes, or None for every size from 0 to the number of features
    :param patients: comma-separated patient numbers, or None for patients 1 to 6
    :return: list of features, list of combinations, list (or range) of patients
    """
    all_feat = (all_features(dataset)
                if features_comb is None else features_comb.split(','))

    if number_comb is None:
        sizes = range(0, len(all_feat) + 1)
    else:
        sizes = list(map(int, number_comb.split(',')))

    # Two features of the same pair never appear together in a combination.
    exclusive_pairs = (("CPB", "CHO"), ("IOB", "insulin"), ("AOB", "steps"))

    combs = []
    for size in sizes:
        for picked in itertools.combinations(all_feat, size):
            candidate = list(picked)
            clashes = any(first in candidate and second in candidate
                          for first, second in exclusive_pairs)
            if not clashes:
                combs.append(candidate)

    # The number of models to train grows with
    # combinations * patients * seeds * sets.
    if patients is None:
        patients = range(1, 7)
    else:
        patients = list(map(int, patients.split(',')))

    printd("Dataset:", dataset, "-------- Patients:",
           ", ".join(str(patient) for patient in patients),
           "-------- Features:", "glucose,", ", ".join(all_feat),
           "-------- Model:", model, "-------- Params:", params,
           "-------- Mode:", mode, "-------- Horizon:", ph, "minutes")

    return all_feat, combs, patients
def process_main_args(args):
    """
    Dispatch the parsed command-line arguments to the requested pipeline.

    `args.tl_mode` selects one of "source_training", "target_training",
    "target_global", "target_finetuning" or "end_to_end" (the latter only
    when `args.params_ft` is given). Any other value runs nothing.

    :param args: parsed arguments; the attributes read are `model`, `params`, `log`,
        `source_dataset`, `target_dataset`, `target_subject`, `tl_mode`, `weights`,
        `eval_mode`, `exp`, `plot` and `params_ft`
    :return: None
    """
    Model = locate_model(args.model)
    params = locate_params(args.params)

    # Redirect the logs to a file if specified. The file is deliberately left
    # open: it replaces sys.stdout for the rest of the run.
    if args.log is not None:
        log_path = os.path.join(path, "logs", args.log)
        sys.stdout = open(log_path, "w")

    # Built as one string; a comma here would make it a tuple and the log
    # would show the tuple's repr instead of the message.
    sbj_msg = (args.source_dataset + "_2_" + args.target_dataset + " " +
               args.target_subject)

    if args.tl_mode == "source_training":
        printd("source_training", sbj_msg)
        main_source_training(args.source_dataset, args.target_dataset,
                             args.target_subject, Model, params, args.weights,
                             args.eval_mode)
    elif args.tl_mode == "target_training":
        printd("target_training", sbj_msg)
        main_target_training(args.source_dataset, args.target_dataset,
                             args.target_subject, Model, params,
                             args.eval_mode, args.exp, args.plot)
    elif args.tl_mode == "target_global":
        printd("target_global", sbj_msg)
        main_target_global(args.source_dataset, args.target_dataset,
                           args.target_subject, Model, params, args.weights,
                           args.eval_mode, args.exp, args.plot)
    elif args.tl_mode == "target_finetuning":
        printd("target_finetuning", sbj_msg)
        main_target_finetuning(args.source_dataset, args.target_dataset,
                               args.target_subject, Model, params,
                               args.weights, args.eval_mode, args.exp,
                               args.plot)
    elif args.tl_mode == "end_to_end" and args.params_ft is not None:
        printd("end_to_end", sbj_msg)
        params_ft = locate_params(args.params_ft)

        # Source training uses `params`; the two target stages use `params_ft`.
        main_source_training(args.source_dataset, args.target_dataset,
                             args.target_subject, Model, params, args.weights,
                             args.eval_mode)
        main_target_global(args.source_dataset, args.target_dataset,
                           args.target_subject, Model, params_ft,
                           args.weights, args.eval_mode, args.exp, args.plot)
        main_target_finetuning(args.source_dataset, args.target_dataset,
                               args.target_subject, Model, params_ft,
                               args.weights, args.eval_mode, args.exp,
                               args.plot)
    # NOTE(review): an unknown tl_mode, or "end_to_end" without params_ft,
    # falls through silently - confirm this is intended.
def __init__(self, subject, ph, params, train, valid, test):
    """
    Initialize the predictor: checkpoint path, LSTM module, loss and optimizer.

    :param subject: forwarded to the parent constructor
    :param ph: forwarded to the parent constructor
    :param params: forwarded to the parent constructor; `self.params` is then read for "hidden",
        "dropout_weights", "dropout_layer", "lr" and "l2"
    :param train: forwarded to the parent constructor
    :param valid: forwarded to the parent constructor
    :param test: forwarded to the parent constructor
    """
    super().__init__(subject, ph, params, train, valid, test)

    # Only the inputs are needed, to size the network.
    x_train, _, _ = self._str2dataset("train")

    # Random suffix for the checkpoint file name.
    suffix = str(np.random.randint(int(1e7)))
    self.checkpoint_file = os.path.join(cs.path, "tmp", "checkpoints",
                                        "lstm_" + suffix + ".pt")
    printd("Saved model's file:", self.checkpoint_file)

    hyper = self.params
    # x_train.shape[2] is passed as the first constructor argument.
    self.model = self.LstmModule(x_train.shape[2], hyper["hidden"],
                                 hyper["dropout_weights"],
                                 hyper["dropout_layer"])
    self.model.cuda()

    self.loss_func = nn.MSELoss()
    self.opt = torch.optim.Adam(self.model.parameters(),
                                lr=self.params["lr"],
                                weight_decay=self.params["l2"])
def top_model_all(mode, metrics, patients=None):
    """
    Run `top_model` on the saved results, globally or for each patient.

    With `patients=None`, the global "metrics.npy" file is loaded and the
    output is stored under "global". Otherwise each patient's "results.npy"
    file is loaded and stored under "patient <i>".

    :param mode: sub-directory of the "idiab"/"lstm" study folder to read from
    :param metrics: metrics handed to `top_model`
    :param patients: iterable of patient identifiers, or None for the global file
    :return: dict mapping each scope to the output of `top_model`
    """
    best = {}

    if patients is not None:
        for i in patients:
            key = "patient " + str(i)
            printd("-------------------------------- Patient", str(i),
                   "--------------------------------")
            file = os.path.join(cs.path, "study", "idiab", "lstm", mode, key,
                                "results.npy")
            _, results = np.load(file, allow_pickle=True)
            best[key] = top_model(results, metrics, i)
        return best

    printd(
        "-------------------------------- Global -------------------------------"
    )
    file = os.path.join(cs.path, "study", "idiab", "lstm", mode, "metrics.npy")
    _, results = np.load(file, allow_pickle=True)
    best["global"] = top_model(results, metrics)

    return best
def __call__(self, val_loss, model, epoch):
    """
    Update the early-stopping state with a new validation loss.

    A loss strictly worse than the best one increments the counter and sets
    `self.early_stop` once the counter reaches `self.patience`. Otherwise the
    loss becomes the new best one and a deep copy of the model's state dict
    is kept in `self.best_model`.

    :param val_loss: validation loss of the current epoch (lower is better)
    :param model: model whose `state_dict()` is copied on improvement
    :param epoch: current epoch; not used by this method
    :return: None
    """
    score = -val_loss
    first_call = self.best_score is None

    if not first_call and score < self.best_score:
        # No improvement: one more strike.
        self.counter += 1
        printd(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True
        return

    # First observation, or a score at least as good as the best one.
    self.best_score = score
    self.best_model = copy.deepcopy(model.state_dict())
    self.val_loss_min = val_loss
    if not first_call:
        self.counter = 0
def study(dataset, model, params, mode, ph, all_feat, patients, combs):
    """
    Train and evaluate the model for every patient, feature combination and seed.

    For each patient the full feature set is preprocessed once; each
    combination is then selected from it and trained with seeds 0 to 4. The
    empty combination is stored under the name "reference", the others under
    their features joined with " + ".

    :param dataset: name of the dataset
    :param model: model name, resolved with `locate_model`
    :param params: parameters name, resolved with `locate_params`
    :param mode: forwarded to `make_predictions` and `ResultsSubject`
    :param ph: prediction horizon, divided by `cs.freq` before use
    :param all_feat: full list of features handed to the preprocessing functions
    :param patients: iterable of patient identifiers
    :param combs: list of feature combinations (lists of feature names)
    :return: None
    """
    # Resolve the parameter set and the model class from their names.
    params = locate_params(params)
    model_class = locate_model(model)

    # Scale the variables in minutes to the benchmark sampling frequency.
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    for i in patients:
        patient_id = str(i)
        patient_dir = os.path.join(cs.path, "study", dataset, model, mode,
                                   "patient " + patient_id)

        # Preprocessing with all the features.
        printd("Preprocessing patient " + patient_id)
        data = preprocessing_full(dataset, patient_id, ph_f, hist_f,
                                  day_len_f, all_feat)

        for ele in combs:
            printd("Preprocessing patient", patient_id,
                   "with features glucose " + " + ".join(ele))
            train, valid, test, scalers = preprocessing_select(
                data, dataset, day_len_f, all_feat, ele)

            # The empty combination is the glucose-only reference.
            comb_label = " + ".join(ele) if ele else "reference"

            for j in range(5):
                torch.manual_seed(j)
                seed_label = "seed " + str(j)

                # Model training & tuning.
                file = os.path.join(patient_dir, comb_label, seed_label,
                                    "weights", "weights")
                raw_results = make_predictions(patient_id, model_class, params,
                                               ph_f, train, valid, test,
                                               mode=mode,
                                               save_model_file=file)

                # Post-processing.
                raw_results = postprocessing(raw_results, scalers, dataset)

                # Evaluation.
                file_save = os.path.join(comb_label, seed_label)
                results = ResultsSubject(model, file_save, ph, dataset,
                                         patient_id, params=params,
                                         results=raw_results, study=True,
                                         mode=mode)
                printd(results.compute_mean_std_results())
def preprocessing_idiab(dataset, subject, ph, hist, day_len, n_days_test):
    """
    Idiab dataset preprocessing pipeline:
    loading -> anomalies removal -> resampling -> last day removal ->
    samples creation -> NaN filling -> columns dropping -> splitting ->
    NaN removal -> standardization.

    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: forwarded to `fill_nans` and `split`
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    printd("Preprocessing " + dataset + subject + "...")

    data = load(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)

    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)

    # Drop every column whose name contains one of these substrings.
    to_drop = ["calories", "heartrate", "mets", "steps"]
    for col in data.columns:
        if any(ele in col for ele in to_drop):
            data = data.drop(col, axis=1)

    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    train, valid, test = [remove_nans(set_) for set_ in (train, valid, test)]

    train, valid, test, scalers = standardize(train, valid, test)

    print(test[0].shape)

    return train, valid, test, scalers
def local_domain_perplexity(self, n_neighbours, reduce_tsne=False,
                            save_file=None):
    """
    Compute the local domain perplexity (LDP) metric for every target
    subject and split.

    :param n_neighbours: size of the neighbourhood
    :param reduce_tsne: if truthy, the features are first passed through `_compute_tsne_features`
    :param save_file: if not None, the list of LDP values is saved there with `np.save`
    :return: mean and std of the LDP values, computed along axis 0
    """
    n_splits = misc.constants.cv
    ldp_arr = []

    for target_subject in self.target_subjects:
        printd("Perplexity " + self.target_dataset + target_subject)

        for split in range(n_splits):
            features, domains = self._compute_features(target_subject, split)
            if reduce_tsne:
                features = self._compute_tsne_features(features)

            ldp = local_domain_perplexity(features, domains, n_neighbours)
            ldp_arr.append(ldp)

    if save_file is not None:
        np.save(save_file, ldp_arr)

    ldp_mean = np.mean(ldp_arr, axis=0)
    ldp_std = np.std(ldp_arr, axis=0)
    return ldp_mean, ldp_std
def remove_anomalies(data, anomalies_threshold=2.5, n_run=5, disp=False):
    """
    Remove the glucose anomalies from the signal by repeated detection passes.

    Each pass drops the rows flagged by `detect_glucose_readings_anomalies`
    and resets the index. The input DataFrame is left untouched.

    :param data: time-series DataFrame
    :param anomalies_threshold: detection threshold handed to the detector
    :param n_run: number of detection passes
    :param disp: if truthy, the number of removed rows is logged at each pass and the result is plotted
    :return: DataFrame without the detected anomalies
    """
    cleaned = data.copy()

    for run in range(n_run):
        flagged = detect_glucose_readings_anomalies(
            cleaned, threshold=anomalies_threshold)
        cleaned = cleaned.drop(flagged, axis=0).reset_index(drop=True)

        if disp:
            printd("[iter {}] Number of anomalies removed : {}".format(
                run, len(flagged)))

    if disp:
        plot(data, cleaned)

    return cleaned
def visualization_old(patients, mode):
    """
    Log the best model per metric (RMSE, MAPE, MASE), per patient then globally.

    For each patient the "results.npy" file is loaded; for the global summary
    the "metrics.npy" file is loaded. The best model for a metric is the key
    whose first entry has the lowest value for that metric.

    :param patients: iterable of patient identifiers
    :param mode: sub-directory of the "idiab"/"lstm" study folder to read from
    :return: None
    """

    def best_key(results, metric):
        # Key of `results` with the lowest value of `metric`.
        values = {key: results[key][0][metric] for key in results.keys()}
        return min(values, key=values.get)

    study_dir = ("study", "idiab", "lstm", mode)

    for i in patients:
        printd("-------------------------------- Patient", str(i),
               "--------------------------------")
        file = os.path.join(cs.path, *study_dir, "patient " + str(i),
                            "results.npy")
        _, results = np.load(file, allow_pickle=True)

        min_rmse = best_key(results, "RMSE")
        reference_rmse = results["reference"][0]["RMSE"]
        printd("Ref", reference_rmse)
        # Ratio of the best RMSE to the reference RMSE.
        printd(results[min_rmse][0]["RMSE"] / results["reference"][0]["RMSE"])
        printd("The best RMSE model for patient", str(i), "is", min_rmse,
               "with ", results[min_rmse])

        min_mape = best_key(results, "MAPE")
        printd("The best MAPE model for patient", str(i), "is", min_mape,
               "with ", results[min_mape])

        min_mase = best_key(results, "MASE")
        printd("The best MASE model for patient", str(i), "is", min_mase,
               "with ", results[min_mase])

    printd(
        "-------------------------------- Global -------------------------------"
    )
    file = os.path.join(cs.path, *study_dir, "metrics.npy")
    _, results = np.load(file, allow_pickle=True)

    min_rmse = best_key(results, "RMSE")
    printd("The best global RMSE model is", min_rmse, "with ",
           results[min_rmse])

    min_mape = best_key(results, "MAPE")
    printd("The best global MAPE model is", min_mape, "with ",
           results[min_mape])

    min_mase = best_key(results, "MASE")
    printd("The best global MASE model is", min_mase, "with ",
           results[min_mase])
def main(dataset, subject, model, params, exp, mode, log, ph, plot,
         save=False):
    """
    Run the full pipeline for one subject, log its metrics and the time spent.

    Steps: preprocessing, optional hyperparameter search, prediction
    (optionally with a weights file), post-processing, evaluation, optional
    plot. The timer starts after preprocessing and stops after evaluation.

    :param dataset: name of the dataset
    :param subject: subject identifier
    :param model: model name, resolved with `locate_model`
    :param params: parameters name, resolved with `locate_params` and `locate_search`
    :param exp: experiment name; also a sub-directory of the weights file when `save` is set
    :param mode: forwarded as `mode` to `make_predictions`
    :param log: only echoed in the first log line by this function
    :param ph: prediction horizon, divided by `cs.freq` before use
    :param plot: when truthy, `results.plot(0)` is called at the end
    :param save: when truthy, a weights file path is passed to `make_predictions`
    :return: None
    """
    printd(dataset, subject, model, params, exp, mode, log, ph, plot)

    # Resolve the search space, parameter set and model class from their names.
    search = locate_search(params)
    params = locate_params(params)
    model_class = locate_model(model)

    # Scale the variables in minutes to the benchmark sampling frequency.
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    # --- preprocessing ---
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f,
                                                hist_f, day_len_f)

    start = time.time()

    # --- model training & tuning ---
    if search:
        params = find_best_hyperparameters(subject, model_class, params,
                                           search, ph_f, train, valid, test)

    file = None
    if save:
        weights_dir = os.path.join(cs.path, "processing", "models", "weights",
                                   model_class.__name__, exp)
        file = os.path.join(weights_dir,
                            model_class.__name__ + "_" + dataset + subject)

    raw_results = make_predictions(subject, model_class, params, ph_f, train,
                                   valid, test, mode=mode,
                                   save_model_file=file)

    # --- post-processing ---
    raw_results = postprocessing(raw_results, scalers, dataset)

    # --- evaluation ---
    results = ResultsSubject(model, exp, ph, dataset, subject, params=params,
                             results=raw_results)
    printd(results.compute_mean_std_results())

    end = time.time()
    elapsed = end - start
    printd("Time elapsed : " + str(elapsed) + " seconds")

    if plot:
        results.plot(0)
def _compute_checkpoint_file(self, model_name):
    """
    Build a checkpoint file path with a random numeric suffix.

    The path is `cs.path`/tmp/checkpoints/<model_name>_<random>.pt and is
    logged before being returned. Nothing is written to disk here.

    :param model_name: prefix of the checkpoint file name
    :return: path of the checkpoint file
    """
    suffix = str(np.random.randint(int(1e7)))
    file_name = model_name + "_" + suffix + ".pt"
    checkpoint_file = os.path.join(cs.path, "tmp", "checkpoints", file_name)
    printd("Saved model's file:", checkpoint_file)
    return checkpoint_file