def __init__(self, AE_settings, expdir, batch_sz=BATCH, model=None, start_epoch=None):
    """Initializes the AE training class.

    ::AE_settings - a settings.config.Config class with the DA settings
    ::expdir - a directory of form `experiments/<possible_path>` to keep logs
    ::batch_sz - training batch size
    ::model - if provided, training resumes from this pre-initialized model
    ::start_epoch - epoch to resume from (required when `model` is given)
    """
    self.settings = AE_settings

    err_msg = "AE_settings must be an AE configuration class"
    assert self.settings.COMPRESSION_METHOD == "AE", err_msg

    if model is not None:  # retraining an existing model
        assert start_epoch is not None, "If you are RE-training a model you must pass start_epoch"
        assert start_epoch >= 0
        self.start_epoch = start_epoch
        self.model = model
        print("Loaded model, ", end="")
    else:
        self.start_epoch = 0
        self.model = ML_utils.load_model_from_settings(AE_settings)
        print("Initialized model, ", end="")

    print("Number of parameters:", sum(p.numel() for p in self.model.parameters()))

    self.batch_sz = batch_sz
    self.settings.batch_sz = batch_sz

    self.expdir = init_expdir(expdir)
    self.settings_fp = self.expdir + "settings.txt"

    if self.settings.SAVE:
        with open(self.settings_fp, "wb") as f:
            pickle.dump(self.settings, f)

    ML_utils.set_seeds()  # set seeds before training
    self.device = ML_utils.get_device()
    self.columns = ["epoch", "reconstruction_err", "DA_MAE",
                    "DA_ratio_improve_MAE", "time_DA(s)", "time_epoch(s)"]
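# Hedged usage sketch (illustrative only): assumes an AE configuration class such
# as CAEConfig (used in the tests below) and the keyword arguments that train()
# receives in main(); the experiment path and epoch counts are placeholders.
#
#   settings = CAEConfig()
#   trainer = TrainAE(settings, "experiments/example/", batch_sz=16)
#   model = trainer.train(150, test_every=10)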
def check_init(config, config_kwargs, prnt, activation=None):
    if not config_kwargs:
        config_kwargs = {}
    assert isinstance(config_kwargs, dict)

    settings = config(**config_kwargs)
    settings.DEBUG = False
    #settings.ACTIVATION = activation
    assert isinstance(settings, Config)

    model = ML_utils.load_model_from_settings(settings)

    print(settings.__class__.__name__)
    if config_kwargs:
        for k, v in config_kwargs.items():
            print("{}: {}".format(k, v), end=", ")
        print()

    if prnt:
        print(model.layers_encode)

    num_params = sum(p.numel() for p in model.parameters())
    print("num params", num_params)
    print("CHANNELS", settings.get_channels())
def DA_AE(self, force_init=False, save_vtu=False):
    if self.data.get("model") is None or force_init:
        self.model = ML_utils.load_model_from_settings(self.settings, self.data.get("device"))
        self.data["model"] = self.model
    else:
        self.model = self.data.get("model")

    self.data["model"].eval()

    if self.settings.REDUCED_SPACE:
        if self.data.get("V_trunc") is None or force_init:  # only init if not already init
            V_red = VDAInit.create_V_red(self.data.get("train_X"),
                                         self.data.get("encoder"),
                                         self.settings)
            self.data["V_trunc"] = V_red.T  # tau x M

            self.data["w_0"] = np.zeros((V_red.shape[0]))
            if self.data["G"] == 1:
                self.data["G_V"] = self.data["V_trunc"]
            else:
                self.data["G_V"] = (self.data["G"] @ self.data["V_trunc"]).astype(float)

        self.data["V_grad"] = None
    else:
        # Now access explicit gradient function
        self.data["V_grad"] = self.__maybe_get_jacobian()

    DA_results = self.perform_VarDA(self.data, self.settings, save_vtu=save_vtu)
    return DA_results
def calc_DA_dir(dir, params, expdir, prnt=True, all_data=True, epoch=None,
                save_vtu=False, gpu_device=0, return_df=False):
    gpu = gpu_device != "CPU"

    model, settings = ML_utils.load_model_and_settings_from_dir(
        dir, device_idx=gpu_device, choose_epoch=epoch, gpu=gpu)

    df = run_DA_batch(settings, model, all_data, expdir, params,
                      save_vtu, gpu_device=gpu_device)
    mse_DA = df["mse_DA"].mean()

    model_data = get_model_specific_data(settings, dir, model=model)
    model_data["num_params"] = sum(p.numel() for p in model.parameters())

    if prnt:
        print(mse_DA, model_data, expdir)
        print(df.tail(5))

    if return_df:
        return mse_DA, model_data, df
    return mse_DA, model_data
def main():
    res_layers = [3, 9, 27]
    cardinalities = [1, 8, 32]

    idx = 0
    layer = 3
    cardinality = 1

    print("Layers", layer)
    print("Cardinality", cardinality)

    kwargs = {"layers": layer, "cardinality": cardinality}

    _, settings = ML_utils.load_model_and_settings_from_dir(exp_load)
    settings.AE_MODEL_FP = model_fp
    settings.GPU_DEVICE = GPU_DEVICE
    settings.export_env_vars()

    expdir = exp_base + str(idx) + "/"

    trainer = TrainAE(settings, expdir, calc_DA_MAE)
    expdir = trainer.expdir  # get full path

    model = trainer.train(EPOCHS, test_every=test_every, num_epochs_cv=num_epochs_cv,
                          learning_rate=LR, print_every=print_every,
                          small_debug=SMALL_DEBUG_DOM)
def train_test_DA_split_maybe_normalize(X, settings):
    """Returns non-overlapping train/test and DA control state data.
    This function also deals with normalization (to ensure that only the
    training data is used to calculate the normalization mean and std)"""

    M, n = SplitData.get_dim_X(X, settings)

    hist_idx = int(M * settings.HIST_FRAC)
    # select historical data (i.e. the training set in ML terminology)
    # that will be used for normalization
    hist_X = X[:hist_idx]

    # use only the training set to calculate mean and std
    mean = np.mean(hist_X, axis=0)
    std = np.std(hist_X, axis=0)

    # Some std values are zero - set the norm to 1 in this case so that
    # the feature is zero post-normalization
    std = np.where(std <= 0., 1, std)

    if settings.NORMALIZE:
        X = (X - mean)
        X = (X / std)

    # Split X into historical and present data. We will assimilate
    # "observations" at a single timestep t_DA which corresponds to the
    # control state u_c. We will take the initial condition u_0 as the
    # mean of the historical data.
    t_DA = M - (settings.TDA_IDX_FROM_END + 1)  # idx of Data Assimilation
    assert t_DA >= hist_idx, (
        "Cannot select observation from historical data. "
        "Reduce HIST_FRAC or reduce TDA_IDX_FROM_END to prevent overlap.\n"
        "t_DA = {} and hist_idx = {}".format(t_DA, hist_idx))
    assert t_DA > hist_idx, "Test set cannot have zero size"

    train_X = X[:hist_idx]
    test_X = X[hist_idx:t_DA]
    u_c = X[t_DA]  # control state (for DA)

    if settings.SHUFFLE_DATA:
        ML_utils.set_seeds()
        np.random.shuffle(train_X)
        np.random.shuffle(test_X)

    return train_X, test_X, u_c, X, mean, std
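# Worked example of the index arithmetic above (illustrative values only):
# with M = 100 snapshots, HIST_FRAC = 0.8 and TDA_IDX_FROM_END = 2:
#   hist_idx = int(100 * 0.8) = 80   -> train_X = X[:80]
#   t_DA     = 100 - (2 + 1)  = 97   -> u_c     = X[97]
#   test_X   = X[80:97]              -> 17 snapshots, no overlap with train_X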
def get_attenuation_from_dir(dir, model=None):
    if not model:
        model, settings = ML_utils.load_model_and_settings_from_dir(dir)

    encode, decode = None, None
    for k, v in model.named_parameters():
        if "attenuate_res" in k:
            if "encode" in k:
                encode = v.item()
            else:
                decode = v.item()

    return encode, decode
def check_DA_dir(dir, kwargs, all_data, expdir, params, prnt):
    try:
        model, settings = ML_utils.load_model_and_settings_from_dir(dir)
        df = run_DA_batch(settings, model, all_data, expdir, params)
        if prnt:
            print(df.tail(10))
    except Exception as e:
        # clean up the experiment directory, then re-raise the original error
        # (even if the cleanup itself fails)
        try:
            shutil.rmtree(expdir, ignore_errors=False, onerror=None)
        finally:
            raise e
def training_loop_AE(self, device=None, print_every=2, test_every=5,
                     save_every=5, model_dir=None):
    """Runs a torch AE model training loop.
    NOTE: Ensure that the loss_fn is in mode "sum"
    """
    model = self.model
    self.model_dir = model_dir

    if device is None:
        device = ML_utils.get_device()
    self.device = device

    ML_utils.set_seeds()
    train_losses = []
    test_losses = []

    self.start = self.num_epochs_cv + self.start_epoch
    self.end = self.start_epoch + self.num_epoch
    epoch = self.end - 1  # for the case where no training occurs

    for epoch in range(self.start, self.end):
        self.epoch = epoch
        train_loss, test_loss = self.train_one_epoch(epoch, print_every, test_every)
        train_losses.append(train_loss)
        if test_loss:
            test_losses.append(test_loss)

        if epoch % save_every != 0 and self.model_dir is not None:
            # Save model (if a new model hasn't just been saved)
            model_fp_new = "{}{}.pth".format(self.model_dir, epoch)
            torch.save(model.state_dict(), model_fp_new)

    return train_losses, test_losses
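# A minimal sketch of the "sum"-mode requirement noted in the docstring above
# (where the loss is actually constructed is assumed to live elsewhere in the class):
#
#   loss_fn = torch.nn.MSELoss(reduction="sum")   # per-batch sums, not per-element means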
def act_constr(activation_fn):
    if activation_fn == "relu":
        activation_constructor = lambda x, y: nn.ReLU()
    elif activation_fn == "lrelu":
        activation_constructor = lambda x, y: nn.LeakyReLU(0.05)
    elif activation_fn == "GDN":
        activation_constructor = lambda x, y: GDN(x, ML_utils.get_device(), y)
    elif callable(activation_fn):
        activation_constructor = lambda x, y: activation_fn
    elif activation_fn == "prelu":  # must be initialized in situ
        activation_constructor = lambda x, y: nn.PReLU(x)
    else:
        raise NotImplementedError("Activation function not implemented")
    return activation_constructor
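# Hedged usage sketch (channel count is arbitrary): the returned constructor is
# always called with (channels, extra_arg) so that channel-aware activations
# (PReLU, GDN) can be built per layer, while the others ignore the arguments:
#
#   constructor = act_constr("prelu")
#   act = constructor(16, None)    # -> nn.PReLU(16)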
def test_CAE_forward_nobatch(self):
    settings = CAEConfig()
    Cin = settings.get_channels()[0]
    size = (Cin,) + settings.get_n()
    device = ML.get_device()
    x = torch.rand(size, requires_grad=True, device=device)

    model = CAE_3D(**settings.get_kwargs())
    model.to(device)

    try:
        y = model(x)
    except Exception:
        pytest.fail("Unable to do forward pass")
def retrain(dir, gpu_device, new_expdir, batch_sz=None):
    """Accepts an expdir and returns an initialized TrainAE instance"""
    model, settings, prev_epoch = ML_utils.load_model_and_settings_from_dir(
        dir, device_idx=gpu_device, return_epoch=True)

    start_epoch = prev_epoch + 1
    batch_sz = batch_sz if batch_sz is not None else settings.batch_sz

    trainer = TrainAE(settings, new_expdir, batch_sz=batch_sz,
                      model=model, start_epoch=start_epoch)
    return trainer
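# Hedged usage sketch (directory names and epoch counts are placeholders):
#
#   trainer = retrain("experiments/prev_run/", gpu_device=0,
#                     new_expdir="experiments/prev_run_continued/")
#   trainer.train(150, test_every=10)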
def test_CAE_linear_latent_nonbatched(self):
    settings = CAEConfig()
    Cin = settings.get_channels()[0]
    size = (Cin,) + settings.get_n()
    device = ML.get_device()
    x = torch.rand(size, requires_grad=True, device=device)

    model = CAE_3D(**settings.get_kwargs())
    model.to(device)
    encode = model.encode

    try:
        w = encode(x)
    except Exception:
        pytest.fail("Unable to do forward pass")

    assert len(w.shape) == 1, "There should only be one dimension"
    assert w.shape[0] == settings.get_number_modes()
def run(self):
    """Generates matrices for VarDA. All returned matrices are in the
    (M x n) or (M x nx x ny x nz) format"""

    data = {}
    loader = self.settings.get_loader()
    splitter = SplitData()
    settings = self.settings

    X = loader.get_X(settings)

    train_X, test_X, u_c_std, X, mean, std = splitter.train_test_DA_split_maybe_normalize(
        X, settings)

    if self.u_c is None:
        self.u_c = u_c_std
        #self.u_c = train_X[62] #good
        #self.u_c = train_X[-1] #bad

    # We will take the initial condition u_0 as the mean of the historical data
    if settings.NORMALIZE:
        u_0 = np.zeros_like(mean)  # since the data is mean centred
    else:
        u_0 = mean

    encoder = None
    decoder = None

    device = ML_utils.get_device()
    model = self.AEmodel
    if model:
        model.to(device)

    if self.settings.COMPRESSION_METHOD == "AE":
        # get encoder
        if model is None:
            model = ML_utils.load_model_from_settings(settings)

        def __create_encoderOrDecoder(fn):
            """Returns a function that deals with encoder/decoder input
            dimensions (e.g. adds a channel dim for the 3D case)"""

            def ret_fn(vec):
                vec = torch.Tensor(vec).to(device)

                # for the 3D case, unsqueeze for channel
                if self.settings.THREE_DIM:
                    dims = len(vec.shape)
                    if dims == 3:
                        vec = vec.unsqueeze(0)
                    elif dims == 4:  # batched input
                        vec = vec.unsqueeze(1)

                with torch.no_grad():
                    res = fn(vec).detach().cpu()

                # for the 3D case, squeeze for channel
                dims = len(res.shape)
                if self.settings.THREE_DIM and dims > 2:
                    if dims == 4:
                        res = res.squeeze(0)
                    elif dims == 5:  # batched input
                        res = res.squeeze(1)

                return res.numpy()

            return ret_fn

        encoder = __create_encoderOrDecoder(model.encode)
        decoder = __create_encoderOrDecoder(model.decode)

    H_0, obs_idx = None, None

    if self.settings.REDUCED_SPACE:
        if self.settings.COMPRESSION_METHOD == "SVD":
            raise NotImplementedError("SVD in reduced space not implemented")

        self.settings.OBS_MODE = "all"

        observations, H_0, w_0, d = self.__get_obs_and_d_reduced_space(
            self.settings, self.u_c, u_0, encoder)
    else:
        observations, w_0, d, obs_idx = self.__get_obs_and_d_not_reduced(
            self.settings, self.u_c, u_0, encoder)

    # TODO - **maybe** get rid of this monstrosity
    # (i.e. you could return a class that has these attributes):
    data = {
        "d": d, "G": H_0,
        "observations": observations,
        "model": model, "obs_idx": obs_idx,
        "encoder": encoder, "decoder": decoder,
        "u_c": self.u_c, "u_0": u_0, "X": X,
        "train_X": train_X, "test_X": test_X,
        "std": std, "mean": mean, "device": device,
    }
    if w_0 is not None:
        data["w_0"] = w_0

    return data
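# Hedged sketch of how the returned dictionary is consumed downstream (the keys
# are as defined above; construction of the class instance itself is assumed and
# may differ):
#
#   data = vda_init.run()
#   encoder, decoder = data["encoder"], data["decoder"]
#   d, G = data["d"], data["G"]          # misfit and (possibly None) obs operator
#   w_0 = data.get("w_0")                # present when the init routine returns one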
def __maybe_cross_val_lr(self, test_every, num_epochs_cv=8):
    if not num_epochs_cv:
        self.num_epochs_cv = 0
        return self.learning_rate
    elif self.num_epoch < num_epochs_cv:
        self.num_epochs_cv = self.num_epoch
    else:
        self.num_epochs_cv = num_epochs_cv

    mult = 1
    if self.settings.BATCH_NORM:  # i.e. generally a larger learning_rate with BN
        mult = 5

    mult *= BATCH_MULT  # linear scaling with batch size: https://arxiv.org/abs/1706.02677

    lrs_base = [0.0001, 0.0003, 0.001]
    lrs = [mult * x for x in lrs_base]

    res = []
    optimizers = []

    for idx, lr in enumerate(lrs):
        ML_utils.set_seeds()  # set seeds before init model
        self.model = ML_utils.load_model_from_settings(self.settings)
        self.optimizer = optim.Adam(self.model.parameters(), lr)

        test_losses = []
        train_losses = []

        print("learning rate:", lr)
        for epoch in range(self.start_epoch, self.num_epochs_cv + self.start_epoch):
            self.epoch = epoch
            train, test = self.train_one_epoch(epoch, self.print_every,
                                               test_every, self.num_epochs_cv)
            if test:
                test_losses.append(test)
            train_losses.append(train)

        df = pd.DataFrame(train_losses, columns=self.columns)
        train_final = df.tail(1).reconstruction_err
        res.append(train_final.values[0])
        optimizers.append(self.optimizer)

        # save model if best so far
        if res[-1] == min(res):
            best_test = test_losses
            best_train = train_losses
            best_idx = idx
            model_fp_new = "{}{}-{}.pth".format(self.model_dir, epoch, lr)
            torch.save(self.model.state_dict(), model_fp_new)
            best_model = self.model

    self.learning_rate = lrs[best_idx] * 0.8
    self.optimizer = optimizers[best_idx]
    self.model = best_model
    test_loss = best_test
    train_loss = best_train
    return self.learning_rate, train_loss, test_loss
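# Worked example of the candidate learning rates above (illustrative values):
# with BATCH_NORM = True (mult = 5) and, say, BATCH_MULT = 2, the grid becomes
#   lrs = [10 * x for x in [0.0001, 0.0003, 0.001]] = [0.001, 0.003, 0.01]
# and the returned learning rate is 0.8 * the best-performing candidate.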
def test_set_seeds_raiseNameError(self):
    env = os.environ
    if env.get("SEED"):
        del env["SEED"]
    with pytest.raises(NameError):
        ML_utils.set_seeds()
def run(self, print_every=10, print_small=True):
    shuffle = self.settings.SHUFFLE_DATA  # save original value
    self.settings.SHUFFLE_DATA = False

    if self.settings.COMPRESSION_METHOD == "SVD":
        if self.settings.REDUCED_SPACE:
            raise NotImplementedError("Cannot have reduced space SVD")

        fp_base = self.settings.get_X_fp().split("/")[-1][1:]

        U = np.load(self.settings.INTERMEDIATE_FP + "U" + fp_base)
        s = np.load(self.settings.INTERMEDIATE_FP + "s" + fp_base)
        W = np.load(self.settings.INTERMEDIATE_FP + "W" + fp_base)

        num_modes = self.settings.get_number_modes()

        V_trunc = SVD.SVD_V_trunc(U, s, W, modes=num_modes)
        V_trunc_plus = SVD.SVD_V_trunc_plus(U, s, W, modes=num_modes)

        self.DA_pipeline = DAPipeline(self.settings)
        DA_data = self.DA_pipeline.data

        DA_data["V_trunc"] = V_trunc
        DA_data["V"] = None
        DA_data["w_0"] = V_trunc_plus @ DA_data.get("u_0").flatten()
        DA_data["V_grad"] = None

    elif self.settings.COMPRESSION_METHOD == "AE":
        if self.model is None:
            raise ValueError("Must provide an AE torch.nn model if "
                             "settings.COMPRESSION_METHOD == 'AE'")

        self.DA_pipeline = DAPipeline(self.settings, self.model)
        DA_data = self.DA_pipeline.data

        if self.reconstruction:
            encoder = DA_data.get("encoder")
            decoder = DA_data.get("decoder")
    else:
        raise ValueError("settings.COMPRESSION_METHOD must be in ['AE', 'SVD']")

    self.settings.SHUFFLE_DATA = shuffle

    if self.reconstruction:
        L1 = torch.nn.L1Loss(reduction='sum')
        L2 = torch.nn.MSELoss(reduction="sum")

    totals = {"percent_improvement": 0, "ref_MAE_mean": 0, "da_MAE_mean": 0,
              "mse_DA": 0, "mse_ref": 0, "counts": 0, "l1_loss": 0,
              "l2_loss": 0, "time": 0, "time_online": 0}

    tot_DA_MAE = np.zeros_like(self.control_states[0]).flatten()
    tot_ref_MAE = np.zeros_like(self.control_states[0]).flatten()

    results = []

    if len(self.control_states.shape) in [1, 3]:
        raise ValueError("This is not batched control_state input")
    else:
        num_states = self.control_states.shape[0]

    for idx in range(num_states):
        u_c = self.control_states[idx]

        if self.settings.REDUCED_SPACE:
            self.DA_pipeline.data = VDAInit.provide_u_c_update_data_reduced_AE(
                DA_data, self.settings, u_c)
        else:
            self.DA_pipeline.data = VDAInit.provide_u_c_update_data_full_space(
                DA_data, self.settings, u_c)

        t1 = time.time()

        if self.settings.COMPRESSION_METHOD == "AE":
            DA_results = self.DA_pipeline.DA_AE(save_vtu=self.save_vtu)
        elif self.settings.COMPRESSION_METHOD == "SVD":
            DA_results = self.DA_pipeline.DA_SVD(save_vtu=self.save_vtu)

        t2 = time.time()
        t_tot = t2 - t1
        #print("time_online {:.4f}s".format(DA_results["time_online"]))

        if self.reconstruction:
            data_tensor = torch.Tensor(u_c)

            if self.settings.COMPRESSION_METHOD == "AE":
                device = ML_utils.get_device()
                #device = ML_utils.get_device(True, 1)
                data_tensor = data_tensor.to(device)

                data_hat = decoder(encoder(u_c))
                data_hat = torch.Tensor(data_hat)
                data_hat = data_hat.to(device)

            elif self.settings.COMPRESSION_METHOD == "SVD":
                data_hat = SVD.SVD_reconstruction_trunc(u_c, U, s, W, num_modes)
                data_hat = torch.Tensor(data_hat)

            with torch.no_grad():
                l1 = L1(data_hat, data_tensor)
                l2 = L2(data_hat, data_tensor)
        else:
            l1, l2 = None, None

        result = {}
        result["percent_improvement"] = DA_results["percent_improvement"]
        result["ref_MAE_mean"] = DA_results["ref_MAE_mean"]
        result["da_MAE_mean"] = DA_results["da_MAE_mean"]
        result["counts"] = DA_results["counts"]
        result["mse_ref"] = DA_results["mse_ref"]
        result["mse_DA"] = DA_results["mse_DA"]
        if self.reconstruction:
            result["l1_loss"] = l1.detach().cpu().numpy()
            result["l2_loss"] = l2.detach().cpu().numpy()
        result["time"] = t_tot
        result["time_online"] = DA_results["time_online"]
DA_results["time_online"] if self.save_vtu: tot_DA_MAE += DA_results.get("da_MAE") tot_ref_MAE += DA_results.get("ref_MAE") #add to results list (that will become a .csv) results.append(result) #add to aggregated dict results totals = self.__add_result_to_totals(result, totals) if idx % print_every == 0 and idx > 0: if not print_small: print("idx:", idx) self.__print_totals(totals, idx + 1, print_small) if not print_small: print("------------") self.__print_totals(totals, num_states, print_small) if not print_small: print("------------") results_df = pd.DataFrame(results) if self.save_vtu: tot_DA_MAE /= num_states tot_ref_MAE /= num_states out_fp_ref = self.save_vtu_fp + "av_ref_MAE.vtu" out_fp_DA = self.save_vtu_fp + "av_da_MAE.vtu" fluidity.utils.save_vtu(self.settings, out_fp_ref, tot_ref_MAE) fluidity.utils.save_vtu(self.settings, out_fp_DA, tot_DA_MAE) #save to csv if self.csv_fp: results_df.to_csv(self.csv_fp) if self.plot: raise NotImplementedError( "plotting functionality not implemented yet") return results_df