def get_comparison_csv(
        filters, save_path="../latex/plots/",
):
    """
    write a csv file comparing the mean price (and its std) of several
    filtered subsets of the loaded results and send it as notification

    :param filters: list of dicts, each dict describes one subset: every
            key except "label" is used as a column name of the loaded
            dataframe and the rows where this column equals the dict value
            are kept; the "label" entry is not used for filtering
    :param save_path: str, prefix (directory incl. trailing separator) under
            which "comparison.csv" is written, created if it does not exist
    """
    df = read_data.read_csvs_conv(which=0)
    df = df.drop(columns="duration")

    comp_data = []
    for flt in filters:
        df_ = df
        for key, val in flt.items():
            # the label only names the entry, it is no column to filter on
            if key != "label":
                df_ = df_.loc[df_[key] == val]
        prices = df_["price"].values
        comp_data.append([flt, np.mean(prices), np.std(prices)])

    comp_df = pd.DataFrame(data=comp_data, columns=["desc", "price", "std"])
    comp_df = comp_df.sort_values(by=["price"], ascending=False)

    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(save_path, exist_ok=True)
    save_comp_file = "{}comparison.csv".format(save_path)
    comp_df.to_csv(save_comp_file)
    SBM.send_notification(
        text_for_files='comparison csv',
        chat_id=chat_id,
        files=[save_comp_file],
        text=None,
    )
def main(argv):
    """
    entry point of the evaluation: generate the pdf table, send a
    notification and (unless FLAGS.only_table is set) draw the comparison
    images

    :param argv: unused, deleted immediately
    """
    del argv

    # read the configuration from the flags
    N = FLAGS.N
    K = FLAGS.K
    forced_rank = FLAGS.forced_rank
    sparsity = FLAGS.sparsity
    dist = FLAGS.ldistribution
    l0_available = not FLAGS.eval_market

    # both evaluation steps take exactly the same arguments
    eval_args = (N, K, forced_rank, sparsity, l0_available, dist)

    generate_pdf_table(*eval_args)
    message = "finished evaluation: {}, {}".format(sparsity, dist)
    SBM.send_notification(text=message)

    if not FLAGS.only_table:
        draw_comparison_images(*eval_args)
def plot_loss_and_metric(model_ids=(1, ),
                         save_extras={
                             'bbox_inches': 'tight',
                             'pad_inches': 0.01
                         },
                         file_name="loss_and_metric-id{}.pdf",
                         time_col='epoch',
                         cols=('train_loss', 'eval_loss',
                               'evaluation_mean_diff'),
                         names=('train_loss', 'eval_loss', 'eval_metric')):
    """
    function to plot the losses and metric in one plot with subplots to see
    their joint evolution

    for each model id the file "metric_id-<id>.csv" in the model directory
    "<train.saved_models_path>/id-<id>" is read and the plot is saved in the
    same directory. if SEND is set, the plot is also sent as notification.

    :param model_ids: list of int
    :param save_extras: dict, passed as kwargs to plt.savefig (only read,
            never modified here)
    :param file_name: str including "{}", name of saved file
    :param time_col: str, usually 'epoch'
    :param cols: list of str, the column names to plot
    :param names: None or list of str, names of the y-labels, None: use cols
    """
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']
    if names is None:
        names = cols

    for model_id in model_ids:
        model_dir = os.path.join(
            train.saved_models_path, "id-{}".format(model_id))
        path = os.path.join(model_dir, "metric_id-{}.csv".format(model_id))
        df = pd.read_csv(path)
        t = df[time_col]

        # squeeze=False always returns a 2d array of axes; without it a
        # single column would give a bare Axes object that cannot be indexed
        fig, axes = plt.subplots(len(cols), squeeze=False)
        axes = axes[:, 0]
        for i, col in enumerate(cols):
            # wrap around the color cycle if there are more columns than
            # colors
            axes[i].plot(t, df[col].values, color=colors[i % len(colors)])
            axes[i].set(ylabel=names[i])
        axes[-1].set(xlabel=time_col)

        save_path = os.path.join(model_dir, file_name.format(model_id))
        plt.savefig(save_path, **save_extras)
        plt.close(fig)

        if SEND:
            SBM.send_notification(
                text=None,
                files=[save_path],
                text_for_files="loss and metric plot - id={}".format(model_id))
def main(argv):
    """
    entry point: generate the paths and, if FLAGS.generate_pdf is set, write
    the figures and the pdf. start, end and any raised exception are
    reported via notification if SEND is set; otherwise only an exception is
    printed.

    :param argv: unused, deleted immediately
    """
    del argv
    chat = "-399803347"  # the chat that receives all notifications of this run

    try:
        if SEND:
            start_msg = 'start running AMC2 with config:\n{}'.format(
                FLAGS.configs)
            SBM.send_notification(text=start_msg, chat_id=chat)

        filepath = generate_paths()

        if FLAGS.generate_pdf:
            write_figures.write_figures()
            write_figures.generate_pdf()

        if SEND:
            time.sleep(1)
            SBM.send_notification(
                text='finished', files=[filepath], chat_id=chat)
    except Exception as e:
        # errors are reported and not re-raised
        error_msg = 'ERROR\n{}'.format(e)
        if not SEND:
            print(error_msg)
        else:
            SBM.send_notification(text=error_msg, chat_id=chat)
def parallel_training(params=None, model_ids=None, nb_jobs=1, first_id=None,
                      saved_models_path=train.saved_models_path,
                      overwrite_params=None):
    """
    function for parallel training, based on train.train

    NOTE: a given params list is modified in place, each entry is replaced by
    the param dict that is actually used for the training.

    :param params: a list of param_dicts, each dict corresponding to one model
            that should be trained, can be None if model_ids is given
            (then unused)
            all kwargs needed for train.train have to be in each dict
            -> giving the params together with first_id, they can be used to
                restart parallel training (however, the saved params for all
                models where the model_id already existed will be used instead
                of the params in this list, so that no unwanted errors are
                produced by mismatching. whenever a model_id didn't exist yet
                the params of the list are used to make a new one)
            -> giving params without first_id, all param_dicts will be used to
                initiate new models
            if the first dict has the key 'saved_models_path', its value
            replaces the saved_models_path argument
    :param model_ids: list of ints, the model ids to use (only those for which
            a model was already initiated and its description was saved to the
            model_overview.csv file will be used)
            -> used to restart parallel training of certain model_ids after
                the training was stopped
    :param nb_jobs: int, the number of CPUs to use parallelly
    :param first_id: int or None, the model_id corresponding to the first
            element of params list
    :param saved_models_path: str, path to saved models
    :param overwrite_params: None or dict with key the param name to be
            overwritten and value the new value for this param. can be used to
            continue the training of a stored model, where some params should
            be changed (e.g. the number of epochs to train longer)
    :return: 0 if neither params nor model_ids are given, otherwise None
    """
    if params is not None and 'saved_models_path' in params[0]:
        saved_models_path = params[0]['saved_models_path']
    model_overview_file_name = '{}model_overview.csv'.format(
        saved_models_path)
    train.makedirs(saved_models_path)
    # load the overview of existing models or start an empty one
    if not os.path.exists(model_overview_file_name):
        df_overview = pd.DataFrame(data=None, columns=['id', 'description'])
        max_id = 0
    else:
        df_overview = pd.read_csv(model_overview_file_name, index_col=0)
        max_id = np.max(df_overview['id'].values)

    # get model_id, model params etc. for each param
    if model_ids is None and params is None:
        return 0
    if model_ids is None:
        # consecutive ids are assigned to the params, starting at first_id or
        # at the first unused id
        if first_id is None:
            model_id = max_id + 1
        else:
            model_id = first_id
        for i, param in enumerate(params):
            if model_id in df_overview['id'].values:
                # id exists -> the stored description replaces the given param
                desc = (df_overview['description'].loc[
                    df_overview['id'] == model_id]).values[0]
                params_dict = json.loads(desc)
                params_dict['resume_training'] = True
                params_dict['model_id'] = model_id
                if overwrite_params:
                    for k, v in overwrite_params.items():
                        params_dict[k] = v
                    # the stored description is only rewritten when params
                    # were overwritten
                    desc = json.dumps(params_dict, sort_keys=True)
                    df_overview.loc[
                        df_overview['id'] == model_id, 'description'] = desc
                    df_overview.to_csv(model_overview_file_name)
                params[i] = params_dict
            else:
                # new id -> register the given param in the overview file
                desc = json.dumps(param, sort_keys=True)
                df_ov_app = pd.DataFrame([[model_id, desc]],
                                         columns=['id', 'description'])
                df_overview = pd.concat([df_overview, df_ov_app],
                                        ignore_index=True)
                df_overview.to_csv(model_overview_file_name)
                # json round trip, so the used dict equals what is reloaded
                # from the stored description later
                params_dict = json.loads(desc)
                params_dict['resume_training'] = False
                params_dict['model_id'] = model_id
                params[i] = params_dict
            model_id += 1
    else:
        # model_ids given -> a passed params list is ignored and rebuilt from
        # the stored descriptions
        params = []
        for model_id in model_ids:
            if model_id not in df_overview['id'].values:
                print("model_id={} does not exist yet -> skip".format(model_id))
            else:
                desc = (df_overview['description'].loc[
                    df_overview['id'] == model_id]).values[0]
                params_dict = json.loads(desc)
                params_dict['model_id'] = model_id
                params_dict['resume_training'] = True
                if overwrite_params:
                    for k, v in overwrite_params.items():
                        params_dict[k] = v
                    desc = json.dumps(params_dict, sort_keys=True)
                    df_overview.loc[
                        df_overview['id'] == model_id, 'description'] = desc
                    df_overview.to_csv(model_overview_file_name)
                params.append(params_dict)

    # mark every job as parallel before it is passed to train_switcher
    for param in params:
        param['parallel'] = True

    if SEND:
        SBM.send_notification(
            text='start parallel training - \nparams:'
                 '\n\n{}'.format(params)
        )

    if DEBUG:
        # in debug mode exceptions of the jobs are not caught
        # NOTE: results is collected but neither used nor returned
        results = Parallel(n_jobs=nb_jobs)(delayed(train_switcher)(**param)
                                           for param in params)
        if SEND:
            SBM.send_notification(
                text='finished parallel training - \nparams:'
                     '\n\n{}'.format(params)
            )
    else:
        try:
            results = Parallel(n_jobs=nb_jobs)(delayed(train_switcher)(**param)
                                               for param in params)
            if SEND:
                SBM.send_notification(
                    text='finished parallel training - \nparams:'
                         '\n\n{}'.format(params)
                )
        except Exception as e:
            # the error is reported (notification or print) and not re-raised
            if SEND:
                SBM.send_notification(
                    text='error in parallel training - \nerror:'
                         '\n\n{}'.format(e),
                    chat_id=error_chat_id
                )
            else:
                print('error:\n\n{}'.format(e))
def train(model_id=None, epochs=100, batch_size=50, save_every=1,
          learning_rate=0.001, hidden_size=41, bias=True, dropout_rate=0.1,
          ode_nn=default_ode_nn, readout_nn=default_readout_nn,
          enc_nn=default_enc_nn, use_rnn=False, solver="euler", weight=0.5,
          weight_decay=1., dataset='physionet',
          saved_models_path=saved_models_path, quantization=0.016,
          n_samples=8000, eval_input_prob=None, eval_input_seed=3892,
          **options):
    """
    training function for controlled ODE-RNN model (models.NJODE),
    the model is automatically saved in the model-save-path with the given
    model id, also all evaluations of the model are saved there

    :param model_id: None or int, the id to save (or load if it already exists)
            the model, if None: next biggest unused id will be used
    :param epochs: int, number of epochs to train, each epoch is one cycle
            through all (random) batches of the training data
    :param batch_size: int
    :param save_every: int, defined number of epochs after each of which the
            model is saved and plotted if wanted. whenever the model has a new
            best eval-loss it is also saved, independent of this number (but
            not plotted)
    :param learning_rate: float
    :param hidden_size: see models.NJODE
    :param bias: see models.NJODE
    :param dropout_rate: float
    :param ode_nn: see models.NJODE
    :param readout_nn: see models.NJODE
    :param enc_nn: see models.NJODE
    :param use_rnn: see models.NJODE
    :param solver: see models.NJODE
    :param weight: see models.NJODE
    :param weight_decay: see models.NJODE
    :param dataset: str, only stored in the model description; the data
            loading below always requests the "physionet" dataset
    :param saved_models_path: str, where to save the models
    :param quantization: the time-step size in the physionet dataset
            (1=1h, 0.016~1/60=1min)
    :param n_samples: int, passed as "n" to parse_dataset.parse_datasets
    :param eval_input_prob: None or float in [0,1], the probability for each of
            the datapoints on the left out part of the eval set (second half of
            time points) to be used as input during eval. for the evaluation,
            the predicted value before this input is processed (i.e. before the
            jump) is used.
    :param eval_input_seed: None or int, seed for sampling from distribution,
            when deciding which data points are used from left-out part of
            eval set. If the seed is not None, in each call to the eval
            dataloader, the same points will be included additionally as input.
    :param options: kwargs, used keywords:
            'parallel'      bool, used by parallel_train.parallel_training
            'resume_training'   bool, used by parallel_train.parallel_training
            'which_loss'    'standard' or 'easy', used by models.NJODE
            'residual_enc_dec'  bool, whether resNNs are used for encoder and
                            readout NN, used by models.NJODE, default True
            'delta_t'       float, default equals quantization/48, which is the
                            step size when the time scale is normalized to
                            [0,1], to change stepsize of alg
            'load_best'     bool, whether to load the best checkpoint instead
                            of the last checkpoint when loading the model.
                            Mainly used for evaluating model at the best
                            checkpoint.
            the key 'masked' is always set to True by this function; the key
            'other_model' is not supported here and raises a ValueError
    :return: 0
    """
    initial_print = ""
    options['masked'] = True

    if ANOMALY_DETECTION:
        torch.autograd.set_detect_anomaly(True)
        torch.manual_seed(0)
        np.random.seed(0)
        cudnn.deterministic = True

    # set number of CPUs
    if SERVER:
        torch.set_num_threads(N_CPUS)

    # get the device for torch
    if torch.cuda.is_available():
        gpu_num = 0
        device = torch.device("cuda:{}".format(gpu_num))
        torch.cuda.set_device(gpu_num)
        initial_print += '\nusing GPU'
    else:
        device = torch.device("cpu")
        initial_print += '\nusing CPU'

    # get data
    # an empty argparse namespace is filled by hand, since
    # parse_dataset.parse_datasets is called with an args object
    parser = argparse.ArgumentParser()
    args = parser.parse_args([])
    args_dict = vars(args)
    args_dict["dataset"] = "physionet"
    args_dict["n"] = n_samples
    args_dict["quantization"] = quantization
    args_dict["batch_size"] = batch_size
    args_dict["classif"] = False
    args_dict["eval_input_prob"] = eval_input_prob
    args_dict["eval_input_seed"] = eval_input_seed
    data_objects = parse_dataset.parse_datasets(args, device)
    dl = data_objects["train_dataloader"]
    dl_test = data_objects["test_dataloader"]
    input_size = data_objects["input_dim"]
    output_size = input_size
    T = 1 + 1e-12
    delta_t = quantization / 48.
    if "delta_t" in options:
        delta_t = options['delta_t']

    # get params_dict
    params_dict = {
        'input_size': input_size, 'epochs': epochs,
        'hidden_size': hidden_size, 'output_size': output_size, 'bias': bias,
        'ode_nn': ode_nn, 'readout_nn': readout_nn, 'enc_nn': enc_nn,
        'use_rnn': use_rnn,
        'dropout_rate': dropout_rate, 'batch_size': batch_size,
        'solver': solver, 'dataset': dataset,
        'quantization': quantization, 'n_samples': n_samples,
        'eval_input_prob': eval_input_prob,
        'learning_rate': learning_rate, 'eval_input_seed': eval_input_seed,
        'weight': weight, 'weight_decay': weight_decay,
        'options': options
    }
    desc = json.dumps(params_dict, sort_keys=True)

    # get overview file
    # (skipped if options['parallel'] is set to anything but False)
    resume_training = False
    if ('parallel' in options and options['parallel'] is False) or \
            ('parallel' not in options):
        model_overview_file_name = '{}model_overview.csv'.format(
            saved_models_path)
        makedirs(saved_models_path)
        if not os.path.exists(model_overview_file_name):
            df_overview = pd.DataFrame(data=None,
                                       columns=['id', 'description'])
            max_id = 0
        else:
            df_overview = pd.read_csv(model_overview_file_name, index_col=0)
            max_id = np.max(df_overview['id'].values)

        # get model_id, model params etc.
        if model_id is None:
            model_id = max_id + 1
        if model_id not in df_overview['id'].values:
            initial_print += '\nnew model_id={}'.format(model_id)
            df_ov_app = pd.DataFrame([[model_id, desc]],
                                     columns=['id', 'description'])
            df_overview = pd.concat([df_overview, df_ov_app],
                                    ignore_index=True)
            df_overview.to_csv(model_overview_file_name)
        else:
            # the stored description replaces the params given in this call
            initial_print += '\nmodel_id already exists -> resume training'
            resume_training = True
            desc = (df_overview['description'].loc[
                df_overview['id'] == model_id]).values[0]
            params_dict = json.loads(desc)
            options = params_dict['options']
    initial_print += '\nmodel params:\n{}'.format(desc)
    if 'resume_training' in options and options['resume_training'] is True:
        resume_training = True

    # get all needed paths
    model_path = '{}id-{}/'.format(saved_models_path, model_id)
    makedirs(model_path)
    model_path_save_last = '{}last_checkpoint/'.format(model_path)
    model_path_save_best = '{}best_checkpoint/'.format(model_path)
    makedirs(model_path_save_last)
    makedirs(model_path_save_best)
    model_metric_file = '{}metric_id-{}.csv'.format(model_path, model_id)

    # get the model & optimizer
    if 'other_model' not in options:
        model = models.NJODE(**params_dict)
        model_name = 'NJ-ODE'
    else:
        raise ValueError(
            "Invalid argument for (option) parameter 'other_model'."
            "Please check docstring for correct use.")
    model.to(device)
    # the optimizer uses a fixed weight decay; the weight_decay argument of
    # this function is only passed on to the model via params_dict
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=0.0005)

    # load saved model if wanted/possible
    # np.inf instead of the alias np.infty, which was removed in NumPy 2.0
    best_eval_metric = np.inf
    metr_columns = METR_COLUMNS
    if resume_training:
        initial_print += '\nload saved model ...'
        try:
            if 'load_best' in options and options['load_best']:
                models.get_ckpt_model(model_path_save_best, model, optimizer,
                                      device)
            else:
                models.get_ckpt_model(model_path_save_last, model, optimizer,
                                      device)
            df_metric = pd.read_csv(model_metric_file, index_col=0)
            best_eval_metric = np.min(df_metric['eval_metric'].values)
            model.epoch += 1
            model.weight_decay_step()
            initial_print += '\nepoch: {}, weight: {}'.format(
                model.epoch, model.weight)
        except Exception as e:
            # deliberate best effort: if loading fails, a new model is trained
            initial_print += '\nloading model failed -> initiate new model'
            initial_print += '\nException:\n{}'.format(e)
            resume_training = False
    if not resume_training:
        initial_print += '\ninitiate new model ...'
        df_metric = pd.DataFrame(columns=metr_columns)

    # ---------------- TRAINING ----------------
    skip_training = True
    if model.epoch <= epochs:
        skip_training = False

        # send notification
        if SEND:
            SBM.send_notification(
                text='start training on physionet: {} id={}'.format(
                    model_name, model_id))
        initial_print += '\n\nmodel overview:'
        print(initial_print)
        print(model, '\n')

        # compute number of parameters
        nr_params = 0
        for name, param in model.named_parameters():
            skip = False
            for p_name in ['gru_debug', 'classification_model']:
                if p_name in name:
                    skip = True
            if not skip:
                nr_params += param.nelement()
        print('# parameters={}\n'.format(nr_params))
        print('start training ...')

    metric_app = []
    while model.epoch <= epochs:
        t = time.time()
        model.train()  # set model in train mode (e.g. BatchNorm)
        for i, b in tqdm.tqdm(enumerate(dl)):
            optimizer.zero_grad()
            times = b["times"]
            time_ptr = b["time_ptr"]
            X = b["X"].to(device)
            M = b["M"].to(device)
            obs_idx = b["obs_idx"]
            b_size = b["batch_size"]
            # count how often each index occurs in obs_idx; indices that do
            # not occur keep the count 0
            unique_idx, counts = np.unique(obs_idx.detach().numpy(),
                                           return_counts=True)
            n_obs_ot = np.zeros((b_size))
            n_obs_ot[unique_idx] = counts
            # builtin int instead of the alias np.int, which was removed in
            # NumPy 1.24
            n_obs_ot = n_obs_ot.astype(int)
            n_obs_ot = torch.tensor(n_obs_ot).to(device)
            # the start values are all zero; moved to the device like the
            # other model inputs
            start_X = torch.tensor(np.zeros((b_size, X.size()[1])),
                                   dtype=torch.float32).to(device)

            if 'other_model' not in options:
                hT, loss = model(times, time_ptr, X, obs_idx, delta_t, T,
                                 start_X, n_obs_ot, return_path=False,
                                 get_loss=True, M=M)
            else:
                raise ValueError("the other_model is not defined")
            loss.backward()
            optimizer.step()
        train_time = time.time() - t

        # -------- evaluation --------
        t = time.time()
        loss_val, mse_val, mse_val_2 = evaluate_model(model, dl_test, device,
                                                      options, delta_t, T)
        eval_time = time.time() - t
        # loss of the last training batch; .cpu() is needed since a tensor on
        # the GPU cannot be converted to numpy directly
        train_loss = loss.detach().cpu().numpy()
        print("epoch {}, weight={:.5f}, train-loss={:.5f}, "
              "eval-loss={:.5f}, eval-metric={:.5f}, "
              "eval-metric_2={:.5f}".format(model.epoch, model.weight,
                                            train_loss, loss_val, mse_val,
                                            mse_val_2))
        if mse_val < best_eval_metric:
            print('save new best model: last-best-metric: {:.5f}, '
                  'new-best-metric: {:.5f}, epoch: {}'.format(
                      best_eval_metric, mse_val, model.epoch))
            models.save_checkpoint(model, optimizer, model_path_save_best,
                                   model.epoch)
            best_eval_metric = mse_val
        metric_app.append([
            model.epoch, train_time, eval_time, train_loss, loss_val,
            mse_val, mse_val_2
        ])

        # save model
        if model.epoch % save_every == 0:
            print('save model ...')
            # the metrics collected since the last save are appended to the
            # metric file
            df_m_app = pd.DataFrame(data=metric_app, columns=metr_columns)
            df_metric = pd.concat([df_metric, df_m_app], ignore_index=True)
            df_metric.to_csv(model_metric_file)
            models.save_checkpoint(model, optimizer, model_path_save_last,
                                   model.epoch)
            metric_app = []
            print('saved!')
        model.epoch += 1
        model.weight_decay_step()

    # send notification
    if SEND and not skip_training:
        files_to_send = [model_metric_file]
        caption = "{} - id={}".format(model_name, model_id)
        SBM.send_notification(
            text='finished training on physionet: {}, id={}\n\n{}'.format(
                model_name, model_id, desc),
            files=files_to_send,
            text_for_files=caption)

    # delete model & free memory
    del model, dl, dl_test, data_objects
    gc.collect()

    return 0
def plot_hurst(
        filters, save_path="../latex/plots/",
        save_extras={
            'bbox_inches': 'tight',
            'pad_inches': 0.01
        },
):
    """
    plot the mean price against the hurst value for several filtered subsets
    of the loaded results together with the reference curve (dos_t, dos_p),
    write a csv with the mean squared difference of each subset to the
    reference prices and send both files as notification

    :param filters: list of dicts, each dict describes one curve:
            "label" (optional) is the legend entry, "color" and "linestyle"
            (optional) set the line style, defaults are taken cyclically from
            the matplotlib color cycle and a fixed list of linestyles;
            every other key is used as a column name of the loaded dataframe
            and the rows where this column equals the dict value are kept.
            the dicts are not modified
    :param save_path: str, prefix (directory incl. trailing separator) under
            which "hurst_plot.png" and "hurst_comp.csv" are written, created
            if it does not exist
    :param save_extras: dict, passed as kwargs to plt.savefig (only read,
            never modified here)
    """
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']
    linestyles = ['--', '-.', '-', ':']
    # keys that describe the curve and are no columns to filter on
    style_keys = ("label", "color", "linestyle")

    df = read_data.read_csvs_conv(which=0)
    df = df.drop(columns="duration")
    comp_data = []
    hurst = sorted(set(df["hurst"].values))

    fig = plt.figure()
    for i, flt in enumerate(filters):
        df_ = df
        for key, val in flt.items():
            if key not in style_keys:
                df_ = df_.loc[df_[key] == val]
        price = [
            np.mean(df_.loc[df_["hurst"] == h, "price"].values)
            for h in hurst
        ]

        # the description is a copy completed with the used style, so that
        # the dict of the caller stays untouched
        desc = dict(flt)
        desc.setdefault("color", colors[i % len(colors)])
        desc.setdefault("linestyle", linestyles[i % len(linestyles)])
        comp_data.append(
            [desc, np.mean((np.array(dos_p) - np.array(price))**2)])

        # a missing label gives None (no legend entry) instead of an unbound
        # name or the label of the previous filter
        plt.plot(hurst, price, label=flt.get("label"),
                 color=desc["color"], linestyle=desc["linestyle"])

    comp_df = pd.DataFrame(data=comp_data, columns=["desc", "diff"])
    comp_df = comp_df.sort_values(by=["diff"], ascending=True)

    plt.plot(dos_t, dos_p, label="pathDOS-paper", color="black")
    plt.xlabel("hurst")
    plt.ylabel("price")
    plt.legend()

    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(save_path, exist_ok=True)
    save_file = "{}hurst_plot.png".format(save_path)
    save_comp_file = "{}hurst_comp.csv".format(save_path)
    plt.savefig(save_file, **save_extras)
    # close the figure, otherwise it stays open after every call
    plt.close(fig)
    comp_df.to_csv(save_comp_file)
    SBM.send_notification(
        text_for_files='hurst plot',
        chat_id=chat_id,
        files=[save_file, save_comp_file],
        text=None,
    )
def train( model_id=None, epochs=100, batch_size=100, save_every=1, learning_rate=0.001, test_size=0.2, seed=398, hidden_size=10, bias=True, dropout_rate=0.1, ode_nn=default_ode_nn, readout_nn=default_readout_nn, enc_nn=default_enc_nn, use_rnn=False, solver="euler", weight=0.5, weight_decay=1., dataset='BlackScholes', dataset_id=None, plot=True, paths_to_plot=(0,), saved_models_path=saved_models_path, **options ): """ training function for controlled ODE-RNN model (models.NJODE), the model is automatically saved in the model-save-path with the given model id, also all evaluations of the model are saved there :param model_id: None or int, the id to save (or load if it already exists) the model, if None: next biggest unused id will be used :param epochs: int, number of epochs to train, each epoch is one cycle through all (random) batches of the training data :param batch_size: int :param save_every: int, defined number of epochs after each of which the model is saved and plotted if wanted. whenever the model has a new best eval-loss it is also saved, independent of this number (but not plotted) :param learning_rate: float :param test_size: float in (0,1), the percentage of samples to use for the test set (here there exists only a test set, since there is no extra evaluation) :param seed: int, seed for the random splitting of the dataset into train and test :param hidden_size: see models.NJODE :param bias: see models.NJODE :param dropout_rate: float :param ode_nn: see models.NJODE :param readout_nn: see models.NJODE :param enc_nn: see models.NJODE :param use_rnn: see models.NJODE :param solver: see models.NJODE :param weight: see models.NJODE :param weight_decay: see models.NJODE :param dataset: str, which dataset to use, supported: {'BlackScholes', 'Heston', 'OrnsteinUhlenbeck'}. 
The corresponding dataset already needs to exist (create it first using data_utils.create_dataset) :param dataset_id: int or None, the id of the dataset to be use, if None, the latest generated dataset of the given name will be used :param plot: bool, whethere to plot :param paths_to_plot: list of ints, which paths of the test-set should be plotted :param saved_models_path: str, where to save the models :param options: kwargs, used keywords: 'func_appl_X' list of functions (as str, see data_utils) to apply to X 'plot_variance' bool, whether to plot also variance 'std_factor' float, the factor by which the std is multiplied 'parallel' bool, used by parallel_train.parallel_training 'resume_training' bool, used by parallel_train.parallel_training 'plot_only' bool, whether the model is used only to plot after initiating or loading (i.e. no training) and exit afterwards (used by demo) 'ylabels' list of str, see plot_one_path_with_pred() 'which_loss' 'standard' or 'easy', used by models.NJODE 'residual_enc_dec' bool, whether resNNs are used for encoder and readout NN, used by models.NJODE, default True 'input_current_t' bool, whether to additionally input current time to the ODE function f 'training_size' int, if given and smaller than dataset_size*(1-test_size), then this is the umber of samples used for the training set (randomly selected out of original training set) 'evaluate' bool, whether to evaluate the model in the test set (i.e. not only compute the eval_loss, but also compute the mean difference between the true and the predicted paths comparing at each time point) 'load_best' bool, whether to load the best checkpoint instead of the last checkpoint when loading the model. Mainly used for evaluating model at the best checkpoint. 'other_model' one of {'GRU_ODE_Bayes'}; the specifieed model is trained instead of the controlled ODE-RNN model. Other options/inputs might change or loose their effect. 
The saved_models_path is changed to "{...}<model-name>-saved_models/" instead of "{...}saved_models/". -> 'GRU_ODE_Bayes' has the following extra options with the names 'GRU_ODE_Bayes'+<option_name>, for the following list of possible choices for <options_name>: '-mixing' float, default: 0.0001, weight of the 2nd loss term of GRU-ODE-Bayes '-solver' one of {"euler", "midpoint", "dopri5"}, default: "euler" '-impute' bool, default: False, whether to impute the last parameter estimation of the p_model for the next ode_step as input. the p_model maps (like the readout_map) the hidden state to the parameter estimation of the normal distribution. '-logvar' bool, default: True, wether to use logarithmic (co)variace -> hardcodinng positivity constraint '-full_gru_ode' bool, default: True, whether to use the full GRU cell or a smaller version, see GRU-ODE-Bayes '-p_hidden' int, default: hidden_size, size of the inner hidden layer of the p_model '-prep_hidden' int, default: hidden_size, in the observational cell (i.e. jumps) a prior matrix multiplication transforms the input to have the size prep_hidden * input_size '-cov_hidden' int, default: hidden_size, size of the inner hidden layer of the covariate_map. 
the covariate_map is used as a mapping to get the initial h (for controlled ODE-RNN this is done by the encoder) """ initial_print = "model-id: {}\n".format(model_id) if ANOMALY_DETECTION: torch.autograd.set_detect_anomaly(True) torch.manual_seed(0) np.random.seed(0) cudnn.deterministic = True # set number of CPUs if SERVER: torch.set_num_threads(N_CPUS) # get the device for torch if torch.cuda.is_available(): gpu_num = 0 device = torch.device("cuda:{}".format(gpu_num)) torch.cuda.set_device(gpu_num) initial_print += '\nusing GPU' else: device = torch.device("cpu") initial_print += '\nusing CPU' # load dataset-metadata dataset_id = int(data_utils._get_time_id(stock_model_name=dataset, time_id=dataset_id)) dataset_metadata = data_utils.load_metadata(stock_model_name=dataset, time_id=dataset_id) input_size = dataset_metadata['dimension'] output_size = input_size T = dataset_metadata['maturity'] delta_t = dataset_metadata['dt'] # load raw data train_idx, val_idx = train_test_split( np.arange(dataset_metadata["nb_paths"]), test_size=test_size, random_state=seed ) # -- get subset of training samples if wanted if 'training_size' in options: train_set_size = options['training_size'] if train_set_size < len(train_idx): train_idx = np.random.choice( train_idx, train_set_size, replace=False ) data_train = data_utils.IrregularDataset( model_name=dataset, time_id=dataset_id, idx=train_idx) data_val = data_utils.IrregularDataset( model_name=dataset, time_id=dataset_id, idx=val_idx) # get data-loader for training if 'func_appl_X' in options: functions = options['func_appl_X'] collate_fn, mult = data_utils.CustomCollateFnGen(functions) input_size = input_size * mult output_size = output_size * mult else: functions = None mult = 1 collate_fn = data_utils.custom_collate_fn dl = DataLoader( dataset=data_train, collate_fn=collate_fn, shuffle=True, batch_size=batch_size, num_workers=N_DATASET_WORKERS) dl_val = DataLoader( dataset=data_val, collate_fn=collate_fn, shuffle=False, 
batch_size=len(data_val), num_workers=N_DATASET_WORKERS) # get additional plotting information plot_variance = False std_factor = 1 if functions is not None and mult > 1: if 'plot_variance' in options: plot_variance = options['plot_variance'] if 'std_factor' in options: std_factor = options['std_factor'] ylabels = None if 'ylabels' in options: ylabels = options['ylabels'] # get optimal eval loss # TODO: this is not correct if other functions are applied to X stockmodel = data_utils._STOCK_MODELS[ dataset_metadata['model_name']](**dataset_metadata) opt_eval_loss = compute_optimal_eval_loss( dl_val, stockmodel, delta_t, T) initial_print += '\noptimal eval loss (achieved by true cond exp): ' \ '{:.5f}'.format(opt_eval_loss) if 'other_model' in options: opt_eval_loss = np.nan # get params_dict params_dict = { 'input_size': input_size, 'epochs': epochs, 'hidden_size': hidden_size, 'output_size': output_size, 'bias': bias, 'ode_nn': ode_nn, 'readout_nn': readout_nn, 'enc_nn': enc_nn, 'use_rnn': use_rnn, 'dropout_rate': dropout_rate, 'batch_size': batch_size, 'solver': solver, 'dataset': dataset, 'dataset_id': dataset_id, 'learning_rate': learning_rate, 'test_size': test_size, 'seed': seed, 'weight': weight, 'weight_decay': weight_decay, 'optimal_eval_loss': opt_eval_loss, 'options': options} desc = json.dumps(params_dict, sort_keys=True) # get overview file resume_training = False if ('parallel' in options and options['parallel'] is False) or \ ('parallel' not in options): model_overview_file_name = '{}model_overview.csv'.format( saved_models_path ) makedirs(saved_models_path) if not os.path.exists(model_overview_file_name): df_overview = pd.DataFrame(data=None, columns=['id', 'description']) max_id = 0 else: df_overview = pd.read_csv(model_overview_file_name, index_col=0) max_id = np.max(df_overview['id'].values) # get model_id, model params etc. 
if model_id is None: model_id = max_id + 1 if model_id not in df_overview['id'].values: initial_print += '\nnew model_id={}'.format(model_id) df_ov_app = pd.DataFrame([[model_id, desc]], columns=['id', 'description']) df_overview = pd.concat([df_overview, df_ov_app], ignore_index=True) df_overview.to_csv(model_overview_file_name) else: initial_print += '\nmodel_id already exists -> resume training' resume_training = True desc = (df_overview['description'].loc[ df_overview['id'] == model_id]).values[0] params_dict = json.loads(desc) initial_print += '\nmodel params:\n{}'.format(desc) if 'resume_training' in options and options['resume_training'] is True: resume_training = True # get all needed paths model_path = '{}id-{}/'.format(saved_models_path, model_id) makedirs(model_path) model_path_save_last = '{}last_checkpoint/'.format(model_path) model_path_save_best = '{}best_checkpoint/'.format(model_path) makedirs(model_path_save_last) makedirs(model_path_save_best) model_metric_file = '{}metric_id-{}.csv'.format(model_path, model_id) plot_save_path = '{}plots/'.format(model_path) if 'save_extras' in options: save_extras = options['save_extras'] else: save_extras = {} # get the model & optimizer if 'other_model' not in options: model = models.NJODE(**params_dict) model_name = 'NJODE' elif options['other_model'] == "GRU_ODE_Bayes": model_name = 'GRU-ODE-Bayes' # get parameters for GRU-ODE-Bayes model hidden_size = params_dict['hidden_size'] mixing = 0.0001 if 'GRU_ODE_Bayes-mixing' in options: mixing = options['GRU_ODE_Bayes-mixing'] solver = 'euler' if 'GRU_ODE_Bayes-solver' in options: solver = options['GRU_ODE_Bayes-solver'] impute = False if 'GRU_ODE_Bayes-impute' in options: impute = options['GRU_ODE_Bayes-impute'] logvar = True if 'GRU_ODE_Bayes-logvar' in options: logvar = options['GRU_ODE_Bayes-logvar'] full_gru_ode = True if 'GRU_ODE_Bayes-full_gru_ode' in options: full_gru_ode = options['GRU_ODE_Bayes-full_gru_ode'] p_hidden = hidden_size if 
'GRU_ODE_Bayes-p_hidden' in options: p_hidden = options['GRU_ODE_Bayes-p_hidden'] prep_hidden = hidden_size if 'GRU_ODE_Bayes-prep_hidden' in options: prep_hidden = options['GRU_ODE_Bayes-prep_hidden'] cov_hidden = hidden_size if 'GRU_ODE_Bayes-cov_hidden' in options: cov_hidden = options['GRU_ODE_Bayes-cov_hidden'] model = models_gru_ode_bayes.NNFOwithBayesianJumps( input_size=params_dict['input_size'], hidden_size=params_dict['hidden_size'], p_hidden=p_hidden, prep_hidden=prep_hidden, bias=params_dict['bias'], cov_size=params_dict['input_size'], cov_hidden=cov_hidden, logvar=logvar, mixing=mixing, dropout_rate=params_dict['dropout_rate'], full_gru_ode=full_gru_ode, solver=solver, impute=impute, ) else: raise ValueError("Invalid argument for (option) parameter 'other_model'." "Please check docstring for correct use.") model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0005) # load saved model if wanted/possible best_eval_loss = np.infty if 'evaluate' in options and options['evaluate']: metr_columns = METR_COLUMNS + ['evaluation_mean_diff'] else: metr_columns = METR_COLUMNS if resume_training: initial_print += '\nload saved model ...' try: if 'load_best' in options and options['load_best']: models.get_ckpt_model(model_path_save_best, model, optimizer, device) else: models.get_ckpt_model(model_path_save_last, model, optimizer, device) df_metric = pd.read_csv(model_metric_file, index_col=0) best_eval_loss = np.min(df_metric['eval_loss'].values) model.epoch += 1 model.weight_decay_step() initial_print += '\nepoch: {}, weight: {}'.format( model.epoch, model.weight) except Exception as e: initial_print += '\nloading model failed -> initiate new model' initial_print += '\nException:\n{}'.format(e) resume_training = False if not resume_training: initial_print += '\ninitiate new model ...' 
df_metric = pd.DataFrame(columns=metr_columns) # ---------- plot only option for demo ------------ if 'plot_only' in options and options['plot_only']: for i, b in enumerate(dl_val): batch = b model.epoch -= 1 initial_print += '\nplotting ...' plot_filename = 'demo-plot_epoch-{}'.format(model.epoch) plot_filename = plot_filename + '_path-{}.pdf' curr_opt_loss = plot_one_path_with_pred( device, model, batch, stockmodel, delta_t, T, path_to_plot=paths_to_plot, save_path=plot_save_path, filename=plot_filename, plot_variance=plot_variance, functions=functions, std_factor=std_factor, model_name=model_name, save_extras=save_extras, ylabels=ylabels ) if SEND: files_to_send = [] caption = "{} - id={}".format(model_name, model_id) for i in paths_to_plot: files_to_send.append( os.path.join(plot_save_path, plot_filename.format(i))) SBM.send_notification( text='finished plot-only: {}, id={}\n\n{}'.format( model_name, model_id, desc), files=files_to_send, text_for_files=caption ) initial_print += '\noptimal eval-loss (with current weight={:.5f}): ' \ '{:.5f}'.format(model.weight, curr_opt_loss) print(initial_print) return 0 # ---------------- TRAINING ---------------- skip_training = True if model.epoch <= epochs: skip_training = False # send notification if SEND: SBM.send_notification( text='start training - model id={}'.format(model_id) ) initial_print += '\n\nmodel overview:' print(initial_print) print(model, '\n') # compute number of parameters nr_params = 0 for name, param in model.named_parameters(): skip = False for p_name in ['gru_debug', 'classification_model']: if p_name in name: skip = True if not skip: nr_params += param.nelement() print('# parameters={}\n'.format(nr_params)) print('start training ...') metric_app = [] while model.epoch <= epochs: t = time.time() model.train() # set model in train mode (e.g. 
BatchNorm) for i, b in tqdm.tqdm(enumerate(dl)): optimizer.zero_grad() times = b["times"] time_ptr = b["time_ptr"] X = b["X"].to(device) start_X = b["start_X"].to(device) obs_idx = b["obs_idx"] # n_obs_ot = b["n_obs_ot"].to(device) # n_obs_ot is sommetimes wrong in dataset -> but should not make a # difference. however, as below it is coorrect. b_size = start_X.size()[0] unique_idx, counts = np.unique( obs_idx.detach().numpy(), return_counts=True) n_obs_ot = np.zeros((b_size)) n_obs_ot[unique_idx] = counts n_obs_ot = n_obs_ot.astype(np.int) n_obs_ot = torch.tensor(n_obs_ot).to(device) if 'other_model' not in options: hT, loss = model( times, time_ptr, X, obs_idx, delta_t, T, start_X, n_obs_ot, return_path=False, get_loss=True ) elif options['other_model'] == "GRU_ODE_Bayes": M = torch.ones_like(X) hT, loss, _, _ = model( times, time_ptr, X, M, obs_idx, delta_t, T, start_X, return_path=False, smoother=False ) else: raise ValueError loss.backward() optimizer.step() train_time = time.time() - t # -------- evaluation -------- t = time.time() batch = None with torch.no_grad(): loss_val = 0 num_obs = 0 eval_msd = 0 model.eval() # set model in evaluation mode for i, b in enumerate(dl_val): if plot: batch = b times = b["times"] time_ptr = b["time_ptr"] X = b["X"].to(device) start_X = b["start_X"].to(device) obs_idx = b["obs_idx"] n_obs_ot = b["n_obs_ot"].to(device) if 'other_model' not in options: hT, c_loss = model( times, time_ptr, X, obs_idx, delta_t, T, start_X, n_obs_ot, return_path=False, get_loss=True ) elif options['other_model'] == "GRU_ODE_Bayes": M = torch.ones_like(X) hT, c_loss, _, _ = model( times, time_ptr, X, M, obs_idx, delta_t, T, start_X, return_path=False, smoother=False ) else: raise ValueError loss_val += c_loss.detach().numpy() num_obs += 1 # mean squared difference evaluation if 'evaluate' in options and options['evaluate']: _eval_msd = model.evaluate( times, time_ptr, X, obs_idx, delta_t, T, start_X, n_obs_ot, stockmodel, return_paths=False) 
eval_msd += _eval_msd eval_time = time.time() - t loss_val = loss_val / num_obs eval_msd = eval_msd / num_obs train_loss = loss.detach().numpy() print("epoch {}, weight={:.5f}, train-loss={:.5f}, " "optimal-eval-loss={:.5f}, eval-loss={:.5f}, ".format( model.epoch, model.weight, train_loss, opt_eval_loss, loss_val)) if 'evaluate' in options and options['evaluate']: metric_app.append([model.epoch, train_time, eval_time, train_loss, loss_val, opt_eval_loss, eval_msd]) print("evaluation mean square difference={:.5f}".format( eval_msd)) else: metric_app.append([model.epoch, train_time, eval_time, train_loss, loss_val, opt_eval_loss]) # save model if model.epoch % save_every == 0: if plot: print('plotting ...') plot_filename = 'epoch-{}'.format(model.epoch) plot_filename = plot_filename + '_path-{}.pdf' curr_opt_loss = plot_one_path_with_pred( device, model, batch, stockmodel, delta_t, T, path_to_plot=paths_to_plot, save_path=plot_save_path, filename=plot_filename, plot_variance=plot_variance, functions=functions, std_factor=std_factor, model_name=model_name, save_extras=save_extras, ylabels=ylabels ) print('optimal eval-loss (with current weight={:.5f}): ' '{:.5f}'.format(model.weight, curr_opt_loss)) print('save model ...') df_m_app = pd.DataFrame(data=metric_app, columns=metr_columns) df_metric = pd.concat([df_metric, df_m_app], ignore_index=True) df_metric.to_csv(model_metric_file) models.save_checkpoint(model, optimizer, model_path_save_last, model.epoch) metric_app = [] print('saved!') if loss_val < best_eval_loss: print('save new best model: last-best-loss: {:.5f}, ' 'new-best-loss: {:.5f}, epoch: {}'.format( best_eval_loss, loss_val, model.epoch)) df_m_app = pd.DataFrame(data=metric_app, columns=metr_columns) df_metric = pd.concat([df_metric, df_m_app], ignore_index=True) df_metric.to_csv(model_metric_file) models.save_checkpoint(model, optimizer, model_path_save_last, model.epoch) models.save_checkpoint(model, optimizer, model_path_save_best, model.epoch) 
metric_app = [] best_eval_loss = loss_val print('saved!') model.epoch += 1 model.weight_decay_step() # send notification if SEND and not skip_training: files_to_send = [model_metric_file] caption = "{} - id={}".format(model_name, model_id) if plot: for i in paths_to_plot: files_to_send.append( os.path.join(plot_save_path, plot_filename.format(i))) SBM.send_notification( text='finished training: {}, id={}\n\n{}'.format( model_name, model_id, desc), files=files_to_send, text_for_files=caption ) # delete model & free memory del model, dl, dl_val, data_train, data_val gc.collect() return 0
def get_cross_validation(
        params_extract_desc=('dataset', 'network_size', 'dropout_rate',
                             'hidden_size', 'activation_function_1'),
        val_test_params_extract=(
            ("min", "eval_metric", "test_metric",
             "test_metric_evaluation_min"),
            ("min", "eval_metric", "eval_metric", "eval_metric_min")),
        target_col=('eval_metric_min', 'test_metric_evaluation_min'),
        early_stop_after_epoch=0,
        param_combinations=(
            {'network_size': 50, 'activation_function_1': 'tanh',
             'dropout_rate': 0.1, 'hidden_size': 10, 'dataset': 'climate'},
            {'network_size': 200, 'activation_function_1': 'tanh',
             'dropout_rate': 0.1, 'hidden_size': 10, 'dataset': 'climate'},
            {'network_size': 400, 'activation_function_1': 'tanh',
             'dropout_rate': 0.1, 'hidden_size': 50, 'dataset': 'climate'},
            {'network_size': 50, 'activation_function_1': 'relu',
             'dropout_rate': 0.2, 'hidden_size': 50, 'dataset': 'climate'},
            {'network_size': 100, 'activation_function_1': 'relu',
             'dropout_rate': 0.2, 'hidden_size': 50, 'dataset': 'climate'},
            {'network_size': 400, 'activation_function_1': 'relu',
             'dropout_rate': 0.2, 'hidden_size': 10, 'dataset': 'climate'}),
        save_path="{}climate_cross_val.csv".format(train.saved_models_path),
        path=train.saved_models_path):
    """
    compute the cross validation summary (mean and std of the target columns)
    for a set of hyper-parameter combinations and write it to a csv file

    :param params_extract_desc: list of str, see get_training_overview()
    :param val_test_params_extract: list of list, see get_training_overview()
    :param target_col: list of str, columns (generated by
            get_training_overview()) on which the cross-validation statistics
            are computed
    :param early_stop_after_epoch: int, see get_training_overview()
    :param param_combinations: list of dict, each dict defines one combination
            of params which is used as one sample for the cross-validation.
            all models of the overview that agree with the dict on every
            specified param are selected and mean/std are taken over them
            (i.e. over all params that are not specified)
    :param save_path: str, where to save the output file
    :param path: str, path where models are saved
    :return: pd.DataFrame with one row per param combination, holding the
            json-dump of the combination and mean & std of each target column
    """
    overview = get_training_overview(
        path=path, params_extract_desc=params_extract_desc,
        val_test_params_extract=val_test_params_extract,
        early_stop_after_epoch=early_stop_after_epoch,
        save_file=False)

    # header: identifier column followed by a (mean, std) pair per target
    header = ['param_combination']
    for target in target_col:
        header.append('mean_{}'.format(target))
        header.append('std_{}'.format(target))

    rows = []
    for combination in param_combinations:
        row = [json.dumps(combination, sort_keys=True)]

        # keep only the models matching every param of this combination
        selection = overview.copy()
        for key, wanted in combination.items():
            selection = selection.loc[selection[key] == wanted]

        for target in target_col:
            vals = selection[target]
            row.append(np.mean(vals))
            row.append(np.std(vals))
        rows.append(row)

    df_out = pd.DataFrame(data=rows, columns=header)
    df_out.to_csv(save_path)

    if SEND:
        SBM.send_notification(text=None, files=[save_path],
                              text_for_files="cross validation")
    return df_out
def get_training_overview(
        path=train.saved_models_path, ids_from=None, ids_to=None,
        params_extract_desc=('network_size', 'training_size', 'dataset',
                             'hidden_size'),
        val_test_params_extract=(
            ("max", "epoch", "epoch", "epochs_trained"),
            ("min", "evaluation_mean_diff", "evaluation_mean_diff",
             "eval_metric_min"),
            ("last", "evaluation_mean_diff", "evaluation_mean_diff",
             "eval_metric_last"),
            ("average", "evaluation_mean_diff", "evaluation_mean_diff",
             "eval_metric_average")),
        early_stop_after_epoch=0,
        save_file=None,
):
    """
    collect the important metrics and hyper-params of every model listed in
    the model_overview.csv file into one overview table

    :param path: str, where the saved models are
    :param ids_from: None or int, smallest model id to consider
    :param ids_to: None or int, largest model id to consider
    :param params_extract_desc: list of str, names of params to extract from
            the model description dict (None is stored for a param that can
            not be extracted), special:
            - network_size: gets size of first layer of enc network
            - activation_function_x: gets the activation function of layer x
                of enc network
    :param val_test_params_extract: None or list of list with 4 string
            elements:
            0. "min" or "max" or "last" or "average"
            1. col_name where to look for min/max (validation), or where to
                get last value or average
            2. if 0. is min/max: col_name where to find value in epoch where
                    1. is min/max (test)
               if 0. is last/average: not used
            3. name for this output column in overview file
    :param early_stop_after_epoch: int, epoch after which early stopping is
            allowed (i.e. all epochs until there are not considered)
    :param save_file: False: the overview is not written to disk,
            None: it is written to
            "<path>model_overview-training_results.csv",
            otherwise: the file name to write the overview to
    :return: pd.DataFrame, the overview table
    """
    overview = pd.read_csv("{}model_overview.csv".format(path), index_col=0)
    if ids_from:
        overview = overview.loc[overview["id"] >= ids_from]
    if ids_to:
        overview = overview.loc[overview["id"] <= ids_to]

    # create the (still empty) output columns
    for param in params_extract_desc:
        overview[param] = None
    if val_test_params_extract:
        for spec in val_test_params_extract:
            overview[spec[3]] = None

    for row in overview.index:
        param_dict = json.loads(overview.loc[row, "description"])

        # hyper-params taken from the model description
        extracted = []
        for param in params_extract_desc:
            try:
                if param == 'network_size':
                    value = param_dict["enc_nn"][0][0]
                elif 'activation_function' in param:
                    layer = int(param.split('_')[-1])
                    value = param_dict["enc_nn"][layer - 1][1]
                else:
                    value = param_dict[param]
            except Exception:
                value = None
            extracted.append(value)
        overview.loc[row, params_extract_desc] = extracted

        # metrics taken from the metric file of the model
        model_id = overview.loc[row, "id"]
        metric_file = "{}id-{}/metric_id-{}.csv".format(
            path, model_id, model_id)
        metrics = pd.read_csv(metric_file, index_col=0)
        if early_stop_after_epoch:
            metrics = metrics.loc[metrics['epoch'] > early_stop_after_epoch]

        if not val_test_params_extract:
            continue
        for spec in val_test_params_extract:
            mode, search_col, value_col, out_col = spec[:4]
            if mode in ('min', 'max'):
                extremum = np.nanmax if mode == 'max' else np.nanmin
                try:
                    best = extremum(metrics[search_col])
                    hits = metrics.loc[metrics[search_col] == best]
                    overview.loc[row, out_col] = \
                        metrics.loc[hits.index[0], value_col]
                except Exception:
                    # best effort: the output entry stays None
                    pass
            elif mode == 'last':
                overview.loc[row, out_col] = metrics[search_col].values[-1]
            elif mode == 'average':
                overview.loc[row, out_col] = np.nanmean(metrics[search_col])

    # save (and send) unless explicitly disabled with save_file=False
    if save_file is not False:
        if save_file is None:
            save_file = "{}model_overview-training_results.csv".format(path)
        overview.to_csv(save_file)
        if SEND:
            SBM.send_notification(text=None,
                                  text_for_files='training overview',
                                  files=[save_file])

    return overview
def plot_convergence_study(path=train.saved_models_path, ids_from=None,
                           ids_to=None, x_axis="training_size", x_log=False,
                           y_log=False,
                           save_path="{}plots/".format(train.data_path),
                           save_extras={
                               'bbox_inches': 'tight',
                               'pad_inches': 0.01
                           }):
    """
    function to plot the convergence study when network size and number of
    samples are varied and multiple "identical" models are trained for each
    combination. for every (network_size, training_size) pair the minimal
    "evaluation_mean_diff" of each matching model is collected and mean and
    std over these models are plotted as an errorbar curve

    :param path: str, path where models are saved
    :param ids_from: None or int, which ids to consider start point
    :param ids_to: None or int, which ids to consider end point
    :param x_axis: str, one of {"training_size", "network_size"}, which of
            the two is on the x-axis, any other value is treated as
            "network_size"
    :param x_log: bool, whether x-axis is logarithmic
    :param y_log: bool, whether y-axis is logarithmic
    :param save_path: str, directory of the plot, created if not existent
    :param save_extras: dict, extra arguments for saving the plots (only
            read, never modified)
    """
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']

    filename = "{}model_overview.csv".format(path)
    df = pd.read_csv(filename, index_col=0)
    if ids_from:
        df = df.loc[df["id"] >= ids_from]
    if ids_to:
        df = df.loc[df["id"] <= ids_to]

    # extract network size and number training samples
    df["network_size"] = None
    df["training_size"] = None
    for i in df.index:
        desc = df.loc[i, "description"]
        param_dict = json.loads(desc)
        training_size = param_dict["training_size"]
        network_size = param_dict["enc_nn"][0][0]
        df.loc[i, ["network_size", "training_size"]] = \
            [network_size, training_size]

    # get sets of network size and training size
    n_sizes = sorted(set(df["network_size"].values))
    t_sizes = sorted(set(df["training_size"].values))
    if x_axis == "training_size":
        x_axis_params = t_sizes
        other_param_name = "network_size"
        other_params = n_sizes
    else:
        x_axis = "network_size"
        x_axis_params = n_sizes
        other_param_name = "training_size"
        other_params = t_sizes

    # get means and stds
    means = []
    stds = []
    for val2 in other_params:
        _means = []
        _stds = []
        for val1 in x_axis_params:
            current_losses = []
            model_ids = df.loc[
                (df[x_axis] == val1) & (df[other_param_name] == val2), "id"]
            for model_id in model_ids:
                file_n = "{}id-{}/metric_id-{}.csv".format(
                    path, model_id, model_id)
                df_metric = pd.read_csv(file_n, index_col=0)
                current_losses.append(
                    np.min(df_metric["evaluation_mean_diff"]))
            _means.append(np.mean(current_losses))
            _stds.append(np.std(current_losses))
        means.append(_means)
        stds.append(_stds)

    # plotting
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for i, (mean, std, val2) in enumerate(zip(means, stds, other_params)):
        # wrap around the color cycle, s.t. more curves than available
        #   colors do not raise an IndexError
        ax.errorbar(x_axis_params, mean, yerr=std,
                    label="{}={}".format(other_param_name, val2),
                    ecolor="black", capsize=4, capthick=1, marker=".",
                    color=colors[i % len(colors)])
    plt.xlabel(x_axis)
    plt.ylabel("eval metric")
    plt.legend()
    if x_log:
        ax.set_xscale('log')
    if y_log:
        ax.set_yscale('log')

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_file = "{}convergence_{}.png".format(save_path, x_axis)
    plt.savefig(save_file, **save_extras)
    # release the figure, otherwise it stays registered in pyplot
    plt.close(fig)

    if SEND:
        files_to_send = [save_file]
        SBM.send_notification(text_for_files='convergence plot',
                              files=files_to_send)
def train(model_id=None, epochs=100, batch_size=100, save_every=1,
          learning_rate=0.001, hidden_size=10, bias=True, dropout_rate=0.1,
          ode_nn=default_ode_nn, readout_nn=default_readout_nn,
          enc_nn=default_enc_nn, use_rnn=False, solver="euler", weight=0.5,
          weight_decay=1., data_index=0, dataset='climate',
          saved_models_path=saved_models_path, **options):
    """
    training function for controlled ODE-RNN model (models.NJODE) on the
    climate dataset, the model is automatically saved in the model-save-path
    with the given model id, also all evaluations of the model are saved there

    :param model_id: None or int, the id to save (or load if it already
            exists) the model, if None: next biggest unused id will be used
    :param epochs: int, number of epochs to train, each epoch is one cycle
            through all (random) batches of the training data
    :param batch_size: int
    :param save_every: int, defined number of epochs after each of which the
            model is saved. whenever the model has a new best eval-metric it
            is also saved (as best checkpoint), independent of this number
    :param learning_rate: float
    :param hidden_size: see models.NJODE
    :param bias: see models.NJODE
    :param dropout_rate: float
    :param ode_nn: see models.NJODE
    :param readout_nn: see models.NJODE
    :param enc_nn: see models.NJODE
    :param use_rnn: see models.NJODE
    :param solver: see models.NJODE
    :param weight: see models.NJODE
    :param weight_decay: see models.NJODE
    :param data_index: int in {0,..,4}, which index set to use
    :param dataset: str, not used inside this function (the climate data
            files are always loaded)
    :param saved_models_path: str, where to save the models
    :param options: kwargs, the key 'masked' is always set to True, used
            keywords:
            'parallel'      bool, used by parallel_train.parallel_training
            'resume_training'   bool, used by
                            parallel_train.parallel_training
            'which_loss'    'standard' or 'easy', used by models.NJODE
            'residual_enc_dec'  bool, whether resNNs are used for encoder and
                            readout NN, used by models.NJODE, default True
            'delta_t'       float, default 0.1, to change stepsize of alg
            'load_best'     bool, whether to load the best checkpoint instead
                            of the last checkpoint when loading the model.
                            Mainly used for evaluating model at the best
                            checkpoint.
            'other_model'   one of {'GRU_ODE_Bayes'}; the specified model is
                            trained instead of the controlled ODE-RNN model.
                            Other options/inputs might change or lose their
                            effect. The saved_models_path is changed to
                            "{...}<model-name>-saved_models/" instead of
                            "{...}saved_models/".
                -> 'GRU_ODE_Bayes' has the following extra options with the
                    names 'GRU_ODE_Bayes'+<option_name>, for the following
                    list of possible choices for <options_name>:
                    '-mixing'   float, default: 0.0001, weight of the 2nd
                                loss term of GRU-ODE-Bayes
                    '-solver'   one of {"euler", "midpoint", "dopri5"},
                                default: "euler"
                    '-impute'   bool, default: False, whether to impute the
                                last parameter estimation of the p_model for
                                the next ode_step as input. the p_model maps
                                (like the readout_map) the hidden state to
                                the parameter estimation of the normal
                                distribution.
                    '-logvar'   bool, default: True, whether to use
                                logarithmic (co)variance -> hardcoding
                                positivity constraint
                    '-full_gru_ode'     bool, default: True, whether to use
                                the full GRU cell or a smaller version, see
                                GRU-ODE-Bayes
                    '-p_hidden'         int, default: hidden_size, size of
                                the inner hidden layer of the p_model
                    '-prep_hidden'      int, default: hidden_size, in the
                                observational cell (i.e. jumps) a prior
                                matrix multiplication transforms the input
                                to have the size prep_hidden * input_size
                    '-cov_hidden'       int, default: hidden_size, size of
                                the inner hidden layer of the covariate_map.
                                the covariate_map is used as a mapping to get
                                the initial h (for controlled ODE-RNN this is
                                done by the encoder)
    :return: 0
    :raises ValueError: if options['other_model'] is given but not supported
    """
    initial_print = ""
    options['masked'] = True

    if ANOMALY_DETECTION:
        torch.autograd.set_detect_anomaly(True)
        torch.manual_seed(0)
        np.random.seed(0)
        cudnn.deterministic = True

    # set number of CPUs
    if SERVER:
        torch.set_num_threads(N_CPUS)

    # get the device for torch
    if torch.cuda.is_available():
        gpu_num = 0
        device = torch.device("cuda:{}".format(gpu_num))
        torch.cuda.set_device(gpu_num)
        initial_print += '\nusing GPU'
    else:
        device = torch.device("cpu")
        initial_print += '\nusing CPU'

    # get data
    csv_file_path = os.path.join(train_data_path,
                                 'climate/small_chunked_sporadic.csv')
    train_idx = np.load(os.path.join(
        train_data_path,
        'climate/small_chunk_fold_idx_{}/train_idx.npy'.format(data_index)),
        allow_pickle=True)
    val_idx = np.load(os.path.join(
        train_data_path,
        'climate/small_chunk_fold_idx_{}/val_idx.npy'.format(data_index)),
        allow_pickle=True)
    test_idx = np.load(os.path.join(
        train_data_path,
        'climate/small_chunk_fold_idx_{}/test_idx.npy'.format(data_index)),
        allow_pickle=True)
    validation = True
    val_options = {"T_val": 150, "max_val_samples": 3}

    data_train = data_utils_gru.ODE_Dataset(csv_file=csv_file_path,
                                            label_file=None,
                                            cov_file=None,
                                            idx=train_idx)
    data_val = data_utils_gru.ODE_Dataset(csv_file=csv_file_path,
                                          label_file=None,
                                          cov_file=None,
                                          idx=val_idx,
                                          validation=validation,
                                          val_options=val_options)
    data_test = data_utils_gru.ODE_Dataset(csv_file=csv_file_path,
                                           label_file=None,
                                           cov_file=None,
                                           idx=test_idx,
                                           validation=validation,
                                           val_options=val_options)

    # get data loaders (validation and test set are each one single batch)
    dl = DataLoader(dataset=data_train,
                    collate_fn=data_utils_gru.custom_collate_fn,
                    shuffle=True,
                    batch_size=batch_size,
                    num_workers=N_DATASET_WORKERS)
    dl_val = DataLoader(dataset=data_val,
                        collate_fn=data_utils_gru.custom_collate_fn,
                        shuffle=True,
                        batch_size=len(val_idx),
                        num_workers=N_DATASET_WORKERS)
    dl_test = DataLoader(dataset=data_test,
                         collate_fn=data_utils_gru.custom_collate_fn,
                         shuffle=True,
                         batch_size=len(test_idx),
                         num_workers=N_DATASET_WORKERS)

    input_size = data_train.variable_num
    output_size = input_size
    T = 200
    delta_t = 0.1
    if "delta_t" in options:
        delta_t = options['delta_t']

    # get params_dict
    params_dict = {
        'input_size': input_size,
        'epochs': epochs,
        'hidden_size': hidden_size,
        'output_size': output_size,
        'bias': bias,
        'ode_nn': ode_nn,
        'readout_nn': readout_nn,
        'enc_nn': enc_nn,
        'use_rnn': use_rnn,
        'dropout_rate': dropout_rate,
        'batch_size': batch_size,
        'solver': solver,
        'data_index': data_index,
        'learning_rate': learning_rate,
        'weight': weight,
        'weight_decay': weight_decay,
        'options': options
    }
    desc = json.dumps(params_dict, sort_keys=True)

    # get overview file
    resume_training = False
    if ('parallel' in options and options['parallel'] is False) or \
            ('parallel' not in options):
        model_overview_file_name = '{}model_overview.csv'.format(
            saved_models_path)
        makedirs(saved_models_path)
        if not os.path.exists(model_overview_file_name):
            df_overview = pd.DataFrame(data=None,
                                       columns=['id', 'description'])
            max_id = 0
        else:
            df_overview = pd.read_csv(model_overview_file_name, index_col=0)
            max_id = np.max(df_overview['id'].values)

        # get model_id, model params etc.
        if model_id is None:
            model_id = max_id + 1
        if model_id not in df_overview['id'].values:
            initial_print += '\nnew model_id={}'.format(model_id)
            df_ov_app = pd.DataFrame([[model_id, desc]],
                                     columns=['id', 'description'])
            df_overview = pd.concat([df_overview, df_ov_app],
                                    ignore_index=True)
            df_overview.to_csv(model_overview_file_name)
        else:
            # the stored description overrules the passed params and options
            initial_print += '\nmodel_id already exists -> resume training'
            resume_training = True
            desc = (df_overview['description'].loc[df_overview['id'] ==
                                                   model_id]).values[0]
            params_dict = json.loads(desc)
            options = params_dict['options']
    initial_print += '\nmodel params:\n{}'.format(desc)
    if 'resume_training' in options and options['resume_training'] is True:
        resume_training = True

    # get all needed paths
    model_path = '{}id-{}/'.format(saved_models_path, model_id)
    makedirs(model_path)
    model_path_save_last = '{}last_checkpoint/'.format(model_path)
    model_path_save_best = '{}best_checkpoint/'.format(model_path)
    makedirs(model_path_save_last)
    makedirs(model_path_save_best)
    model_metric_file = '{}metric_id-{}.csv'.format(model_path, model_id)

    # get the model & optimizer
    if 'other_model' not in options:
        model = models.NJODE(**params_dict)
        model_name = 'NJ-ODE'
    elif options['other_model'] == "GRU_ODE_Bayes":
        model_name = 'GRU-ODE-Bayes'
        # get parameters for GRU-ODE-Bayes model
        hidden_size = params_dict['hidden_size']
        mixing = 0.0001
        if 'GRU_ODE_Bayes-mixing' in options:
            mixing = options['GRU_ODE_Bayes-mixing']
        solver = 'euler'
        if 'GRU_ODE_Bayes-solver' in options:
            solver = options['GRU_ODE_Bayes-solver']
        impute = False
        if 'GRU_ODE_Bayes-impute' in options:
            impute = options['GRU_ODE_Bayes-impute']
        logvar = True
        if 'GRU_ODE_Bayes-logvar' in options:
            logvar = options['GRU_ODE_Bayes-logvar']
        full_gru_ode = True
        if 'GRU_ODE_Bayes-full_gru_ode' in options:
            full_gru_ode = options['GRU_ODE_Bayes-full_gru_ode']
        p_hidden = hidden_size
        if 'GRU_ODE_Bayes-p_hidden' in options:
            p_hidden = options['GRU_ODE_Bayes-p_hidden']
        prep_hidden = hidden_size
        if 'GRU_ODE_Bayes-prep_hidden' in options:
            prep_hidden = options['GRU_ODE_Bayes-prep_hidden']
        cov_hidden = hidden_size
        if 'GRU_ODE_Bayes-cov_hidden' in options:
            cov_hidden = options['GRU_ODE_Bayes-cov_hidden']

        model = models_gru_ode_bayes.NNFOwithBayesianJumps(
            input_size=params_dict['input_size'],
            hidden_size=params_dict['hidden_size'],
            p_hidden=p_hidden,
            prep_hidden=prep_hidden,
            bias=params_dict['bias'],
            cov_size=params_dict['input_size'],
            cov_hidden=cov_hidden,
            logvar=logvar,
            mixing=mixing,
            dropout_rate=params_dict['dropout_rate'],
            full_gru_ode=full_gru_ode,
            solver=solver,
            impute=impute,
        )
    else:
        raise ValueError(
            "Invalid argument for (option) parameter 'other_model'."
            "Please check docstring for correct use.")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0.0005)

    # load saved model if wanted/possible
    # np.inf instead of the alias np.infty, which was removed in NumPy 2.0
    best_eval_metric = np.inf
    metr_columns = METR_COLUMNS
    if resume_training:
        initial_print += '\nload saved model ...'
        try:
            if 'load_best' in options and options['load_best']:
                models.get_ckpt_model(model_path_save_best, model, optimizer,
                                      device)
            else:
                models.get_ckpt_model(model_path_save_last, model, optimizer,
                                      device)
            df_metric = pd.read_csv(model_metric_file, index_col=0)
            best_eval_metric = np.min(df_metric['eval_metric'].values)
            model.epoch += 1
            model.weight_decay_step()
            initial_print += '\nepoch: {}, weight: {}'.format(
                model.epoch, model.weight)
        except Exception as e:
            # best effort: fall back to a freshly initiated model
            initial_print += '\nloading model failed -> initiate new model'
            initial_print += '\nException:\n{}'.format(e)
            resume_training = False
    if not resume_training:
        initial_print += '\ninitiate new model ...'
        df_metric = pd.DataFrame(columns=metr_columns)

    # ---------------- TRAINING ----------------
    skip_training = True
    if model.epoch <= epochs:
        skip_training = False

        # send notification
        if SEND:
            SBM.send_notification(
                text='start training climate: {} id={}'.format(
                    model_name, model_id))
        initial_print += '\n\nmodel overview:'
        print(initial_print)
        print(model, '\n')

        # compute number of parameters
        nr_params = 0
        for name, param in model.named_parameters():
            skip = False
            for p_name in ['gru_debug', 'classification_model']:
                if p_name in name:
                    skip = True
            if not skip:
                nr_params += param.nelement()
        print('# parameters={}\n'.format(nr_params))
        print('start training ...')

    metric_app = []
    while model.epoch <= epochs:
        t = time.time()
        model.train()  # set model in train mode (e.g. BatchNorm)
        for i, b in tqdm.tqdm(enumerate(dl)):
            optimizer.zero_grad()
            times = b["times"]
            time_ptr = b["time_ptr"]
            X = b["X"].to(device)
            M = b["M"].to(device)
            obs_idx = b["obs_idx"]

            # count the number of observations of each sample of the batch
            b_size = len(b["pat_idx"])
            unique_idx, counts = np.unique(obs_idx.detach().numpy(),
                                           return_counts=True)
            n_obs_ot = np.zeros((b_size))
            n_obs_ot[unique_idx] = counts
            # builtin int instead of the alias np.int, which was removed in
            #   NumPy 1.24
            n_obs_ot = n_obs_ot.astype(int)
            n_obs_ot = torch.tensor(n_obs_ot).to(device)
            # moved to the device like X and M
            start_X = torch.tensor(np.zeros((b_size, X.size()[1])),
                                   dtype=torch.float32).to(device)

            if 'other_model' not in options:
                hT, loss = model(times, time_ptr, X, obs_idx, delta_t, T,
                                 start_X, n_obs_ot, return_path=False,
                                 get_loss=True, M=M)
            elif options['other_model'] == "GRU_ODE_Bayes":
                hT, loss, _, _ = model(times, time_ptr, X, M, obs_idx,
                                       delta_t, T, start_X,
                                       return_path=False, smoother=False)
            else:
                raise ValueError
            loss.backward()
            optimizer.step()
        train_time = time.time() - t

        # -------- evaluation --------
        t = time.time()
        loss_val, mse_val = evaluate_model(model, dl_val, device, options,
                                           delta_t, T)
        eval_time = time.time() - t
        # .cpu() is a no-op on CPU and needed to convert a CUDA tensor
        train_loss = loss.detach().cpu().numpy()
        print("epoch {}, weight={:.5f}, train-loss={:.5f}, "
              "eval-loss={:.5f}, eval-metric={:.5f}".format(
                  model.epoch, model.weight, train_loss, loss_val, mse_val))
        if mse_val < best_eval_metric:
            print('save new best model: last-best-metric: {:.5f}, '
                  'new-best-metric: {:.5f}, epoch: {}'.format(
                      best_eval_metric, mse_val, model.epoch))
            models.save_checkpoint(model, optimizer, model_path_save_best,
                                   model.epoch)
            best_eval_metric = mse_val
        loss_test, mse_test = evaluate_model(model, dl_test, device, options,
                                             delta_t, T)
        print("test-loss={:.5f}, test-metric={:.5f}".format(
            loss_test, mse_test))

        metric_app.append([
            model.epoch, train_time, eval_time, train_loss, loss_val,
            mse_val, loss_test, mse_test
        ])

        # save model
        if model.epoch % save_every == 0:
            print('save model ...')
            df_m_app = pd.DataFrame(data=metric_app, columns=metr_columns)
            df_metric = pd.concat([df_metric, df_m_app], ignore_index=True)
            df_metric.to_csv(model_metric_file)
            models.save_checkpoint(model, optimizer, model_path_save_last,
                                   model.epoch)
            metric_app = []
            print('saved!')

        model.epoch += 1
        model.weight_decay_step()

    # send notification
    if SEND and not skip_training:
        files_to_send = [model_metric_file]
        caption = "{} - id={}".format(model_name, model_id)
        SBM.send_notification(
            text='finished training on climate: {}, id={}\n\n{}'.format(
                model_name, model_id, desc),
            files=files_to_send,
            text_for_files=caption)

    # delete model & free memory
    del model, dl, dl_val, dl_test, data_train, data_val, data_test
    gc.collect()

    return 0
def plot_convergence_study(
        config: configs._DefaultConfig,
        x_axis="nb_paths",
        x_log=True,
        y_log=False,
        save_path="../latex/plots/",
        save_extras={
            'bbox_inches': 'tight',
            'pad_inches': 0.01
        },
):
    """
    plot mean and std of the "price" column against the number of paths or
    the hidden size, with one errorbar curve for each value of the respective
    other parameter, save the plot and send it as notification

    :param config: configs._DefaultConfig, only the rows whose "algo" is in
            config.algos and whose "model" is in config.stock_models are used
    :param x_axis: str, one of {"nb_paths", "hidden_size"}, which of the two
            is on the x-axis, any other value is treated as "hidden_size"
    :param x_log: bool, whether x-axis is logarithmic
    :param y_log: bool, whether y-axis is logarithmic
    :param save_path: str, directory of the plot, created if not existent
    :param save_extras: dict, extra arguments for saving the plot (only read,
            never modified)
    """
    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']
    linestyles = ['-', '-.', '--', ':']

    df = read_data.read_csvs_conv(which=0)
    df = df.drop(columns="duration")
    df = df.loc[df["algo"].isin(config.algos)]
    df = df.loc[df["model"].isin(config.stock_models)]

    # get sets of network size and training size
    n_sizes = sorted(set(df["hidden_size"].values))
    t_sizes = sorted(set(df["nb_paths"].values))
    if x_axis == "nb_paths":
        x_axis_params = t_sizes
        other_param_name = "hidden_size"
        other_params = n_sizes
        x_axis_name = "number of paths"
    else:
        x_axis = "hidden_size"
        x_axis_params = n_sizes
        other_param_name = "nb_paths"
        other_params = t_sizes
        x_axis_name = "hidden size"

    # get means and stds
    means = []
    stds = []
    for val2 in other_params:
        _means = []
        _stds = []
        for val1 in x_axis_params:
            current_prices = df.loc[(df[x_axis] == val1) &
                                    (df[other_param_name] == val2),
                                    "price"].values
            _means.append(np.mean(current_prices))
            _stds.append(np.std(current_prices))
        means.append(_means)
        stds.append(_stds)

    # plotting
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for i, (mean, std, val2) in enumerate(zip(means, stds, other_params)):
        color = colors[i % len(colors)]
        linestyle = linestyles[i % len(linestyles)]
        # each curve is drawn twice: first with caps and error bars in the
        #   curve color, then (carrying the legend label) with black error
        #   bars without caps on top
        ax.errorbar(
            x_axis_params, mean, yerr=std,
            ecolor=color, capsize=4, capthick=1, marker=".",
            color=color, linestyle=linestyle,
        )
        ax.errorbar(
            x_axis_params, mean, yerr=std,
            label="{}={}".format(other_param_name.replace('_', ' '), val2),
            ecolor="black", capsize=0, capthick=0, marker=".",
            color=color, linestyle=linestyle,
        )
    plt.xlabel(x_axis_name)
    plt.ylabel("price")
    plt.legend()
    if x_log:
        ax.set_xscale('log')
    if y_log:
        ax.set_yscale('log')

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_file = "{}convergence_plot_{}.png".format(save_path, x_axis)
    plt.savefig(save_file, **save_extras)
    # release the figure, otherwise it stays registered in pyplot
    plt.close(fig)

    SBM.send_notification(
        text_for_files='convergence plot',
        chat_id=chat_id,
        files=[save_file],
        text=None,
    )