def __init__(self, network, generative_model, loss, summary_stats=None, optimizer=None,
             learning_rate=0.0005, checkpoint_path=None, max_to_keep=5, clip_method='global_norm', clip_value=None):
    """
    Creates a trainer instance for performing single-model forward inference and training an
    amortized neural estimator for parameter estimation (BayesFlow). If a checkpoint_path is
    provided, the network's weights will be stored after each training epoch. If the folder
    contains a checkpoint, the trainer will try to load the weights and continue training
    with a pre-trained net.
    ----------
    Arguments:
    network : bayesflow.Amortizer instance -- the neural architecture to be optimized
    generative_model : callable -- a function or an object with n_sim and n_obs mandatory arguments
                       returning randomly sampled parameter vectors and datasets from a process model
    loss : callable with three arguments: (network, m_indices, x) -- the loss function
    ----------
    Keyword arguments:
    summary_stats : callable -- optional summary statistics function
    optimizer : None or tf.keras.optimizers.Optimizer subclass -- if None, a default Adam optimizer
                is created; otherwise the given optimizer class is instantiated with learning_rate
    learning_rate : float -- the learning rate used for the optimizer
    checkpoint_path : string -- optional folder name for storing the trained network
    max_to_keep : int -- optional number of checkpoints to keep
    clip_method : string in ('norm', 'value', 'global_norm') -- optional gradient clipping method
    clip_value : float -- the value used for gradient clipping when clip_method is set to 'value' or 'norm'
    """

    # Basic attributes
    self.network = network
    self.generative_model = generative_model
    self.loss = loss
    self.summary_stats = summary_stats
    self.clip_method = clip_method
    self.clip_value = clip_value
    self.n_obs = None

    # Optimizer settings
    if optimizer is None:
        if tf.__version__.startswith('1'):
            self.optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            self.optimizer = Adam(learning_rate)
    else:
        self.optimizer = optimizer(learning_rate)

    # Checkpoint settings
    if checkpoint_path is not None:
        self.checkpoint = Checkpoint(optimizer=self.optimizer, model=self.network)
        self.manager = CheckpointManager(self.checkpoint, checkpoint_path, max_to_keep=max_to_keep)
        self.checkpoint.restore(self.manager.latest_checkpoint)
        if self.manager.latest_checkpoint:
            print("Networks loaded from {}".format(self.manager.latest_checkpoint))
        else:
            print("Initializing networks from scratch.")
    else:
        self.checkpoint = None
        self.manager = None
    self.checkpoint_path = checkpoint_path
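
# The snippet above only shows the constructor. Below is a minimal configuration sketch, assuming the
# enclosing class is the BayesFlow trainer (here called Trainer) and that my_amortizer, my_simulator and
# my_loss are hypothetical placeholders defined elsewhere. Note that a custom optimizer is passed as a
# class rather than an instance, because the constructor calls optimizer(learning_rate).
import tensorflow as tf

trainer = Trainer(
    network=my_amortizer,                   # a bayesflow.Amortizer instance
    generative_model=my_simulator,          # callable(n_sim, n_obs) returning parameters and data sets
    loss=my_loss,                           # callable(network, m_indices, x)
    optimizer=tf.keras.optimizers.RMSprop,  # the class, not an instance
    learning_rate=1e-4,
    checkpoint_path='checkpoints/run1',     # an existing checkpoint in this folder is restored automatically
    clip_method='value',
    clip_value=1.0,
)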
def train(epochs, batch_size, ckpt_path, imgs_path, lr, out_path):
    tf.keras.backend.clear_session()
    train_data = get_data(imgs_path, batch_size)

    gen = generator()
    disc = discriminator()
    print(gen.summary())
    print(disc.summary())

    gen_opt = Adam(learning_rate=lr, beta_1=0.5)
    disc_opt = Adam(learning_rate=lr, beta_1=0.5)

    ckpt = Checkpoint(disc=disc, gen=gen, disc_opt=disc_opt, gen_opt=gen_opt)
    manager = CheckpointManager(ckpt, ckpt_path, max_to_keep=3)
    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
        ckpt.restore(manager.latest_checkpoint)
    else:
        print("Initializing from scratch.")

    seed = tf.random.normal([16, ENCODING_SIZE], seed=1234)
    generate_and_save_images(gen, 0, seed, out_path)

    for ep in range(epochs):
        gen_loss = []
        disc_loss_real = []
        disc_loss_fake = []
        print('Epoch: %d of %d' % (ep + 1, epochs))
        start = time.time()

        for images in train_data:
            g_loss, d_loss_r, d_loss_f = train_step(images, gen, disc, gen_opt, disc_opt, batch_size)
            gen_loss.append(g_loss)
            disc_loss_real.append(d_loss_r)
            disc_loss_fake.append(d_loss_f)

        gen_loss = np.mean(np.asarray(gen_loss))
        disc_loss_real = np.mean(np.asarray(disc_loss_real))
        disc_loss_fake = np.mean(np.asarray(disc_loss_fake))

        if (np.isnan(gen_loss) or np.isnan(disc_loss_real) or np.isnan(disc_loss_fake)):
            print("Something broke.")
            break

        manager.save()
        generate_and_save_images(gen, ep + 1, seed, out_path)
        print("Time for epoch:", time.time() - start)
        print("Gen loss=", gen_loss)
        print("Disc loss real=", disc_loss_real)
        print("Disc loss fake=", disc_loss_fake)
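
# train_step is called above but not shown. The following is a minimal sketch of what such a step
# might look like, assuming a standard DCGAN-style non-saturating loss, a discriminator that outputs
# raw logits, and a generator fed noise of width ENCODING_SIZE (the value below is an assumption).
import tensorflow as tf

ENCODING_SIZE = 128  # assumed latent dimension; the real value lives elsewhere in the module

cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

@tf.function
def train_step(images, gen, disc, gen_opt, disc_opt, batch_size):
    noise = tf.random.normal([batch_size, ENCODING_SIZE])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        fake_images = gen(noise, training=True)
        real_logits = disc(images, training=True)
        fake_logits = disc(fake_images, training=True)

        # Non-saturating generator loss and the two discriminator terms reported by the loop above
        g_loss = cross_entropy(tf.ones_like(fake_logits), fake_logits)
        d_loss_real = cross_entropy(tf.ones_like(real_logits), real_logits)
        d_loss_fake = cross_entropy(tf.zeros_like(fake_logits), fake_logits)
        d_loss = d_loss_real + d_loss_fake

    gen_grads = gen_tape.gradient(g_loss, gen.trainable_variables)
    disc_grads = disc_tape.gradient(d_loss, disc.trainable_variables)
    gen_opt.apply_gradients(zip(gen_grads, gen.trainable_variables))
    disc_opt.apply_gradients(zip(disc_grads, disc.trainable_variables))
    return g_loss, d_loss_real, d_loss_fake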
def train(args):
    train_ds, test_ds = get_data(args.img_path, args.batch)

    gen = generator()
    disc = discriminator()
    gen_opt = Adam(args.learning_rate, beta_1=0.5, beta_2=0.999)
    disc_opt = Adam(args.learning_rate, beta_1=0.5, beta_2=0.999)
    print(gen.summary())
    print(disc.summary())

    ckpt = Checkpoint(disc=disc, gen=gen, disc_opt=disc_opt, gen_opt=gen_opt)
    manager = CheckpointManager(ckpt, args.ckpt_path, max_to_keep=3)

    # Epoch offset recovered from the checkpoint name (e.g. 'ckpt-12' -> 12) when resuming;
    # defaults to 0 so a fresh run does not fail with an undefined variable.
    off = 0
    if args.continue_training:
        latest = manager.latest_checkpoint
        if latest:
            print("Restored from {}".format(latest))
            ckpt.restore(latest)
            off = int(re.split('-', latest)[-1])
        else:
            print("Initializing from scratch.")

    for ep in range(args.epochs):
        for x, y in test_ds.take(1):
            generate_and_save_imgs(gen, ep + off, x, y, args.out_path)

        gen_loss = []
        disc_loss = []
        print('Epoch: %d of %d' % (ep + 1 + off, args.epochs + off))
        start = time.time()

        for x, y in train_ds:
            g_loss, d_loss = train_step(x, y, gen, disc, gen_opt, disc_opt, args.batch)
            gen_loss.append(g_loss)
            disc_loss.append(d_loss)

        gen_loss = np.mean(np.asarray(gen_loss))
        disc_loss = np.mean(np.asarray(disc_loss))

        manager.save()
        print("Time for epoch:", time.time() - start)
        print("Gen loss=", gen_loss)
        print("Disc loss=", disc_loss)

    # Storing three different outputs after the final epoch
    for x, y in test_ds.take(3):
        generate_and_save_imgs(gen, args.epochs + off, x, y, args.out_path)
        off += 1
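
# A minimal command-line entry point for the function above. The flag names are assumptions; the
# function itself only fixes the attribute names it reads from args (img_path, batch, learning_rate,
# ckpt_path, out_path, epochs, continue_training).
import argparse

def parse_args():
    p = argparse.ArgumentParser(description="Train the image-to-image GAN")
    p.add_argument("--img_path", type=str, required=True)
    p.add_argument("--ckpt_path", type=str, default="./checkpoints")
    p.add_argument("--out_path", type=str, default="./outputs")
    p.add_argument("--batch", type=int, default=16)
    p.add_argument("--epochs", type=int, default=100)
    p.add_argument("--learning_rate", type=float, default=2e-4)
    p.add_argument("--continue_training", action="store_true",
                   help="Resume from the latest checkpoint and offset the epoch counter")
    return p.parse_args()

if __name__ == "__main__":
    train(parse_args())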
def set_checkpoint(opt, G_YtoX, G_XtoY, D_X, D_Y,
                   G_YtoX_optimizer, G_XtoY_optimizer, D_X_optimizer, D_Y_optimizer):
    if opt["use_cycle_consistency_loss"]:
        os.makedirs("./checkpoints/{}/train".format(opt["dataset_name"]), exist_ok=True)
        checkpoint_path = os.path.join("checkpoints", opt["dataset_name"], "train")
    else:
        os.makedirs("./no_cycle/checkpoints/{}/train".format(opt["dataset_name"]), exist_ok=True)
        checkpoint_path = os.path.join("no_cycle", "checkpoints", opt["dataset_name"], "train")

    ckpt = Checkpoint(G_YtoX_optimizer=G_YtoX_optimizer,
                      G_XtoY_optimizer=G_XtoY_optimizer,
                      D_X_optimizer=D_X_optimizer,
                      D_Y_optimizer=D_Y_optimizer,
                      G_YtoX=G_YtoX,
                      G_XtoY=G_XtoY,
                      D_X=D_X,
                      D_Y=D_Y)
    ckpt_manager = CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

    # If a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')

    return ckpt, ckpt_manager
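
# A usage sketch for set_checkpoint. The opt keys mirror the ones read above; the generator/discriminator
# objects, their optimizers, num_epochs and the 5-epoch save interval are assumptions for illustration.
opt = {"use_cycle_consistency_loss": True, "dataset_name": "horse2zebra"}

ckpt, ckpt_manager = set_checkpoint(opt, G_YtoX, G_XtoY, D_X, D_Y,
                                    G_YtoX_optimizer, G_XtoY_optimizer,
                                    D_X_optimizer, D_Y_optimizer)

num_epochs = 40
for epoch in range(num_epochs):
    ...  # one epoch of CycleGAN updates
    if (epoch + 1) % 5 == 0:  # save every 5 epochs (arbitrary choice)
        path = ckpt_manager.save()
        print("Saved checkpoint for epoch {} at {}".format(epoch + 1, path))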
def __init__(self, checkpoint, directory, max_to_keep, after_num_epoch=1,
             keep_checkpoint_every_n_hours=None, checkpoint_name="ckpt",
             step_counter=None, checkpoint_interval=None, init_fn=None):
    super().__init__()
    self.manager = CheckpointManager(checkpoint, directory, max_to_keep,
                                     keep_checkpoint_every_n_hours, checkpoint_name,
                                     step_counter, checkpoint_interval, init_fn)
    self.epoch_counter = 0
    self.after_num_epoch = after_num_epoch
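
# The rest of this wrapper class is not shown. Below is a plausible companion hook, assuming the wrapper
# is used as a Keras-callback-style object and is meant to write a checkpoint only every
# `after_num_epoch` epochs; the method name and behavior are assumptions, not part of the source.
def on_epoch_end(self, epoch, logs=None):
    self.epoch_counter += 1
    if self.epoch_counter % self.after_num_epoch == 0:
        save_path = self.manager.save(checkpoint_number=epoch + 1)
        print("Saved checkpoint for epoch {}: {}".format(epoch + 1, save_path))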
def get_checkpoint_manager(model, optimizer, checkpoints_dir, max_checkpoints=None):
    """Obtains a checkpoint manager to manage model saving and restoring.

    Arguments:
        model (model.ImageCaptionModel): object containing encoder, decoder and tokenizer
        optimizer (tf.optimizers.Optimizer): the optimizer used during the backpropagation step
        checkpoints_dir (str): directory in which checkpoints are stored
        max_checkpoints (int): maximum number of checkpoints to keep
    Returns:
        tf.train.CheckpointManager, tf.train.Checkpoint
    """
    ckpt = Checkpoint(encoder=model.encoder, decoder=model.decoder, optimizer=optimizer)
    ckpt_manager = CheckpointManager(ckpt, checkpoints_dir, max_to_keep=max_checkpoints)
    return ckpt_manager, ckpt
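
# Typical restore-at-startup usage of the helper above. The model and optimizer objects and the
# directory are assumptions; get_checkpoint_manager only requires that model exposes .encoder and .decoder.
ckpt_manager, ckpt = get_checkpoint_manager(model, optimizer, "./checkpoints", max_checkpoints=5)
if ckpt_manager.latest_checkpoint:
    # expect_partial() silences warnings about objects (e.g. optimizer slots) not yet created
    ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
    print("Restored from {}".format(ckpt_manager.latest_checkpoint))
else:
    print("Training from scratch.")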
# Assumed imports for this class (TensorFlow 2.x); clip_gradients is provided elsewhere in the package.
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.train import Checkpoint, CheckpointManager
from tqdm import tqdm


class MetaTrainer:

    def __init__(self, network, generative_model, loss, summary_stats=None, optimizer=None,
                 learning_rate=0.0005, checkpoint_path=None, max_to_keep=5, clip_method='global_norm', clip_value=None):
        """
        Creates a trainer instance for performing multi-model forward inference and training an
        amortized neural estimator for parameter estimation (BayesFlow). If a checkpoint_path is
        provided, the network's weights will be stored after each training epoch. If the folder
        contains a checkpoint, the trainer will try to load the weights and continue training
        with a pre-trained net.
        ----------
        Arguments:
        network : bayesflow.Amortizer instance -- the neural architecture to be optimized
        generative_model : callable -- a function or an object with n_sim and n_obs mandatory arguments
                           returning randomly sampled model indices, parameter vectors and datasets from a process model
        loss : callable with four arguments: (network, m_indices, params, sim_data) -- the loss function
        ----------
        Keyword arguments:
        summary_stats : callable -- optional summary statistics function
        optimizer : None or tf.keras.optimizers.Optimizer subclass -- if None, a default Adam optimizer
                    is created; otherwise the given optimizer class is instantiated with learning_rate
        learning_rate : float -- the learning rate used for the optimizer
        checkpoint_path : string -- optional folder name for storing the trained network
        max_to_keep : int -- optional number of checkpoints to keep
        clip_method : string in ('norm', 'value', 'global_norm') -- optional gradient clipping method
        clip_value : float -- the value used for gradient clipping when clip_method is set to 'value' or 'norm'
        """

        # Basic attributes
        self.network = network
        self.generative_model = generative_model
        self.loss = loss
        self.summary_stats = summary_stats
        self.clip_method = clip_method
        self.clip_value = clip_value
        self.n_obs = None

        # Optimizer settings
        if optimizer is None:
            if tf.__version__.startswith('1'):
                self.optimizer = tf.train.AdamOptimizer(learning_rate)
            else:
                self.optimizer = Adam(learning_rate)
        else:
            self.optimizer = optimizer(learning_rate)

        # Checkpoint settings
        if checkpoint_path is not None:
            self.checkpoint = Checkpoint(optimizer=self.optimizer, model=self.network)
            self.manager = CheckpointManager(self.checkpoint, checkpoint_path, max_to_keep=max_to_keep)
            self.checkpoint.restore(self.manager.latest_checkpoint)
            if self.manager.latest_checkpoint:
                print("Networks loaded from {}".format(self.manager.latest_checkpoint))
            else:
                print("Initializing networks from scratch.")
        else:
            self.checkpoint = None
            self.manager = None
        self.checkpoint_path = checkpoint_path

    def train_online(self, epochs, iterations_per_epoch, batch_size, n_obs, **kwargs):
        """
        Trains the inference network(s) via online learning. Additional keyword arguments
        are passed to the simulators.
        ----------
        Arguments:
        epochs : int -- number of epochs (and number of times a checkpoint is stored)
        iterations_per_epoch : int -- number of batch simulations to perform per epoch
        batch_size : int -- number of simulations to perform at each backprop step
        n_obs : int or callable -- if int, then treated as a fixed number of observations,
                if callable, then treated as a function for sampling N, i.e., N ~ p(N)
        ----------
        Returns:
        losses : dict (ep_num : list_of_losses) -- a dictionary storing the losses across epochs and iterations
        """

        losses = dict()
        for ep in range(1, epochs+1):
            losses[ep] = []
            with tqdm(total=iterations_per_epoch, desc='Training epoch {}'.format(ep)) as p_bar:
                for it in range(1, iterations_per_epoch+1):

                    # Determine n_obs and generate data on-the-fly
                    if type(n_obs) is int:
                        n_obs_it = n_obs
                    else:
                        n_obs_it = n_obs()
                    model_indices, params, sim_data = self._forward_inference(batch_size, n_obs_it, **kwargs)

                    # One step backprop
                    loss = self._train_step(model_indices, params, sim_data)

                    # Store loss into dictionary
                    losses[ep].append(loss)

                    # Update progress bar
                    p_bar.set_postfix_str("Epoch {0}, Iteration {1}, Loss: {2:.3f}, Running Loss: {3:.3f}"
                                          .format(ep, it, loss, np.mean(losses[ep])))
                    p_bar.update(1)

            # Store after each epoch, if specified
            if self.manager is not None:
                self.manager.save()
        return losses

    def train_offline(self, epochs, batch_size, model_indices, params, sim_data):
        """
        Trains the inference network(s) via offline learning. Assumes params and data have
        already been simulated (i.e., forward inference).
        ----------
        Arguments:
        epochs : int -- number of epochs (and number of times a checkpoint is stored)
        batch_size : int -- number of simulations to perform at each backprop step
        model_indices : np.array of shape (n_sim, ) or (n_sim, n_models) -- the true model indices
        params : np.array of shape (n_sim, n_params) -- the true data-generating parameters
        sim_data : np.array of shape (n_sim, n_obs, data_dim) -- the simulated data sets from each model
        ----------
        Returns:
        losses : dict (ep_num : list_of_losses) -- a dictionary storing the losses across epochs and iterations
        """

        # Convert to a data set
        n_sim = int(sim_data.shape[0])

        # Compute summary statistics, if provided
        if self.summary_stats is not None:
            print('Computing hand-crafted summary statistics...')
            sim_data = self.summary_stats(sim_data)

        print('Converting {} simulations to a TensorFlow data set...'.format(n_sim))
        data_set = tf.data.Dataset \
                     .from_tensor_slices((model_indices, params, sim_data)) \
                     .shuffle(n_sim) \
                     .batch(batch_size)

        losses = dict()
        for ep in range(1, epochs+1):
            losses[ep] = []
            with tqdm(total=int(np.ceil(n_sim / batch_size)), desc='Training epoch {}'.format(ep)) as p_bar:

                # Loop through dataset
                for bi, batch in enumerate(data_set):

                    # Extract params from batch
                    model_indices_b, params_b, sim_data_b = batch[0], batch[1], batch[2]

                    # One step backprop
                    loss = self._train_step(model_indices_b, params_b, sim_data_b)

                    # Store loss and update progress bar
                    losses[ep].append(loss)
                    p_bar.set_postfix_str("Epoch {0}, Batch {1}, Loss: {2:.3f}, Running Loss: {3:.3f}"
                                          .format(ep, bi+1, loss, np.mean(losses[ep])))
                    p_bar.update(1)

            # Store after each epoch, if specified
            if self.manager is not None:
                self.manager.save()
        return losses

    def train_rounds(self, epochs, rounds, sim_per_round, batch_size, n_obs, **kwargs):
        """
        Trains the inference network(s) via round-based learning. Additional arguments
        are passed to the simulator.
        ----------
        Arguments:
        epochs : int -- number of epochs (and number of times a checkpoint is stored)
        rounds : int -- number of rounds to perform
        sim_per_round : int -- number of simulations per round
        batch_size : int -- number of simulations to perform at each backprop step
        n_obs : int -- number of observations (fixed) for each data set
        ----------
        Returns:
        losses : nested dict with each (ep_num : list_of_losses) -- a dictionary storing the losses across rounds, epochs and iterations
        """

        # Make sure n_obs is fixed, otherwise not working
        assert type(n_obs) is int, \
            'Round-based training currently only works with fixed n_obs. Use online learning for variable n_obs or fix n_obs to an integer value.'

        losses = dict()
        for r in range(1, rounds+1):

            # Data generation step
            if r == 1:
                # Simulate initial data
                print('Simulating initial {} data sets...'.format(sim_per_round))
                model_indices, params, sim_data = self._forward_inference(sim_per_round, n_obs, **kwargs)
            else:
                # Simulate further data
                print('Simulating new {} data sets and appending to previous...'.format(sim_per_round))
                print('New total number of simulated data sets: {}'.format(sim_per_round * r))
                model_indices_r, params_r, sim_data_r = self._forward_inference(sim_per_round, n_obs, **kwargs)

                # Add new simulations to previous data
                model_indices = np.concatenate((model_indices, model_indices_r), axis=0)
                params = np.concatenate((params, params_r), axis=0)
                sim_data = np.concatenate((sim_data, sim_data_r), axis=0)

            # Train offline on the data accumulated so far
            losses_r = self.train_offline(epochs, batch_size, model_indices, params, sim_data)
            losses[r] = losses_r
        return losses

    def simulate_and_train_offline(self, n_sim, epochs, batch_size, n_obs, **kwargs):
        """
        Simulates n_sim data sets and then trains the inference network(s) via offline learning.
        Additional keyword arguments are passed to the simulator.
        ----------
        Arguments:
        n_sim : int -- total number of simulations to perform
        epochs : int -- number of epochs (and number of times a checkpoint is stored)
        batch_size : int -- number of simulations to perform at each backprop step
        n_obs : int -- number of observations for each dataset
        ----------
        Returns:
        losses : dict (ep_num : list_of_losses) -- a dictionary storing the losses across epochs and iterations
        """

        # Make sure n_obs is fixed, otherwise not working, for now
        assert type(n_obs) is int, \
            'Offline training currently only works with fixed n_obs. Use online learning for variable n_obs or fix n_obs to an integer value.'

        # Simulate data
        print('Simulating {} data sets upfront...'.format(n_sim))
        model_indices, params, sim_data = self._forward_inference(n_sim, n_obs, summarize=False, **kwargs)

        # Train offline
        losses = self.train_offline(epochs, batch_size, model_indices, params, sim_data)
        return losses

    def load_pretrained_network(self):
        """
        Attempts to load a pre-trained network if a checkpoint path is provided and a checkpoint manager exists.
        """

        if self.manager is None or self.checkpoint is None:
            return False
        status = self.checkpoint.restore(self.manager.latest_checkpoint)
        return status

    def _forward_inference(self, n_sim, n_obs, summarize=True, **kwargs):
        """
        Performs one step of multi-model forward inference.
        ----------
        Arguments:
        n_sim : int -- number of simulations to perform at the given step (i.e., batch size)
        n_obs : int or callable -- if int, then treated as a fixed number of observations,
                if callable, then treated as a function for sampling N, i.e., N ~ p(N)
        ----------
        Keyword arguments:
        summarize : bool -- whether to summarize the data if hand-crafted summaries are given

        Returns:
        model_indices : np.array (np.float32) of shape (batch_size, n_models) -- array of sampled model indices
        params : np.array (np.float32) of shape (batch_size, param_dim) -- array of sampled parameters
        sim_data : np.array (np.float32) of shape (batch_size, n_obs, data_dim) -- array of simulated data sets
        """

        # Simulate data with n_sims and n_obs
        # Return shape of params is (batch_size, param_dim)
        # Return shape of data is (batch_size, n_obs, data_dim)
        model_indices, params, sim_data = self.generative_model(n_sim, n_obs, **kwargs)

        # Compute hand-crafted summary stats, if given
        if summarize and self.summary_stats is not None:
            # Return shape in this case is (batch_size, n_sum)
            sim_data = self.summary_stats(sim_data)
        return model_indices, params, sim_data

    def _train_step(self, model_indices, params, sim_data):
        """
        Performs one step of backpropagation with the given model indices and data.
        ----------
        Arguments:
        model_indices : np.array (np.float32) of shape (batch_size, n_models) -- the true model indices
        params : np.array (np.float32) of shape (batch_size, n_params) -- the true data-generating parameters
        sim_data : np.array (np.float32) of shape (batch_size, n_obs, data_dim) or (batch_size, summary_dim)
                   -- array of simulated data sets (or summary statistics thereof)
        ----------
        Returns:
        loss : tf.Tensor of shape (,), i.e., a scalar representing the average loss over the batch of m and x
        """

        # Compute loss and store gradients
        with tf.GradientTape() as tape:
            loss = self.loss(self.network, model_indices, params, sim_data)

        # One step backprop
        gradients = tape.gradient(loss, self.network.trainable_variables)
        self._apply_gradients(gradients, self.network.trainable_variables)

        return loss.numpy()

    def _apply_gradients(self, gradients, tensors):
        """
        Updates each tensor in the 'tensors' list via backpropagation. Operation is performed in-place.
        ----------
        Arguments:
        gradients : list of tf.Tensor -- the list of gradients for all neural network parameters
        tensors : list of tf.Tensor -- the list of all neural network parameters
        """

        # Optional gradient clipping
        if self.clip_value is not None:
            gradients = clip_gradients(gradients, clip_value=self.clip_value, clip_method=self.clip_method)
        self.optimizer.apply_gradients(zip(gradients, tensors))
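

# A usage sketch for MetaTrainer. The toy simulator below only illustrates the calling convention
# (model indices, parameters, data); my_amortizer and my_meta_loss are hypothetical placeholders for a
# bayesflow.Amortizer and a loss callable(network, m_indices, params, sim_data) -> scalar.
import numpy as np

def toy_generative_model(n_sim, n_obs):
    model_indices = np.eye(2, dtype=np.float32)[np.random.randint(0, 2, size=n_sim)]
    params = np.random.normal(size=(n_sim, 3)).astype(np.float32)
    sim_data = np.random.normal(size=(n_sim, n_obs, 1)).astype(np.float32)
    return model_indices, params, sim_data

trainer = MetaTrainer(
    network=my_amortizer,
    generative_model=toy_generative_model,
    loss=my_meta_loss,
    checkpoint_path='checkpoints/meta',   # a checkpoint is written here after every epoch
    clip_method='global_norm',
    clip_value=5.0,
)

# Online learning: fresh batches are simulated on the fly, 1000 iterations per epoch
losses = trainer.train_online(epochs=10, iterations_per_epoch=1000, batch_size=64, n_obs=100)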