def __call__(
    self,
    num_simulations: int,
    proposal: Optional[Any] = None,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    exclude_invalid_x: bool = True,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
) -> DirectPosterior:
    r"""Run SNPE. Return posterior $p(\theta|x)$ after inference.

    Args:
        num_simulations: Number of simulator calls.
        proposal: Distribution that the parameters $\theta$ are drawn from.
            `proposal=None` uses the prior. Setting the proposal to a
            distribution targeted on a specific observation, e.g. a posterior
            $p(\theta|x_o)$ obtained previously, can require fewer
            simulations.
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. If
            None, we train until validation loss increases (see also
            `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order
            to prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect to
            the simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS
            2017.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.

    Returns:
        Posterior $p(\theta|x)$ that can be sampled and evaluated.
    """
    # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
    if calibration_kernel is None:
        calibration_kernel = lambda x: ones([len(x)])

    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    self._check_proposal(proposal)
    self._round = self._round + 1 if proposal is not None else 0

    # If presimulated data was provided from a later round, set self._round to
    # this value. Otherwise, we would rely on the user to _additionally_
    # provide the proposal that the presimulated data was sampled from in
    # order for self._round to become larger than 0.
    if self._data_round_index:
        self._round = max(self._round, max(self._data_round_index))

    # Run simulations for the round.
    theta, x = self._run_simulations(proposal, num_simulations)
    self._append_to_data_bank(theta, x, self._round)

    # Load data from most recent round.
    theta, x, _ = self._get_from_data_bank(self._round, exclude_invalid_x, False)

    # First round or if retraining from scratch:
    # Call `self._build_neural_net` with the round's thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._posterior is None or retrain_from_scratch_each_round:
        x_shape = x_shape_from_simulation(x)
        self._posterior = DirectPosterior(
            method_family="snpe",
            neural_net=self._build_neural_net(theta, x),
            prior=self._prior,
            x_shape=x_shape,
            sample_with_mcmc=self._sample_with_mcmc,
            mcmc_method=self._mcmc_method,
            mcmc_parameters=self._mcmc_parameters,
            get_potential_function=PotentialFunctionProvider(),
        )

    # Fit posterior using newly aggregated data set.
    self._train(
        proposal=proposal,
        training_batch_size=training_batch_size,
        learning_rate=learning_rate,
        validation_fraction=validation_fraction,
        stop_after_epochs=stop_after_epochs,
        max_num_epochs=cast(int, max_num_epochs),
        clip_max_norm=clip_max_norm,
        calibration_kernel=calibration_kernel,
        exclude_invalid_x=exclude_invalid_x,
        discard_prior_samples=discard_prior_samples,
    )

    # Store model at the end of each round.
    self._model_bank.append(deepcopy(self._posterior))
    self._model_bank[-1].net.eval()

    # Make the call to `leakage_correction()` and the update of
    # self._leakage_density_correction_factor explicit here. This ensures the
    # update never gets lost even when we e.g. no longer log to tensorboard.
    # Calling `leakage_correction()` is needed to update the leakage after
    # each round.
    if self._posterior.default_x is None:
        acceptance_rate = torch.tensor(float("nan"))
    else:
        acceptance_rate = self._posterior.leakage_correction(
            x=self._posterior.default_x,
            force_update=True,
            show_progress_bars=self._show_progress_bars,
        )

    # Update tensorboard and summary dict.
    self._summarize(
        round_=self._round,
        x_o=self._posterior.default_x,
        theta_bank=theta,
        x_bank=x,
        posterior_samples_acceptance_rate=acceptance_rate,
    )

    # Update description for progress bar.
    if self._show_round_summary:
        print(self._describe_round(self._round, self._summary))

    self._posterior._num_trained_rounds = self._round + 1

    return deepcopy(self._posterior)
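# Hedged usage sketch for the call above, not part of the class. The names
# `simulator`, `prior`, and `x_o` are assumptions standing in for a user's
# own setup. Round 1 draws parameters from the prior; round 2 passes the
# round-1 posterior (focused on `x_o`) as proposal, which typically requires
# fewer simulations.
#
#     inference = SNPE(simulator, prior)
#     posterior = inference(num_simulations=1000)  # proposal=None -> prior
#     posterior.set_default_x(x_o)
#     posterior = inference(num_simulations=500, proposal=posterior)
#     samples = posterior.sample((1000,))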
def train(
    self,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    exclude_invalid_x: bool = True,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
    show_train_summary: bool = False,
) -> flows.Flow:
    r"""Train the density estimator to learn the distribution $p(x|\theta)$.

    Args:
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and
            validation loss after the training.

    Returns:
        Density estimator that has learned the distribution $p(x|\theta)$.
    """
    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    # Load data from most recent round.
    self._round = max(self._data_round_index)

    # Starting index for the training set (1 = discard round-0 samples).
    start_idx = int(discard_prior_samples and self._round > 0)
    theta, x, _ = self.get_simulations(start_idx, exclude_invalid_x)

    # Get total number of training examples.
    num_examples = len(theta)

    # Select random train and validation splits from (theta, x) pairs.
    permuted_indices = torch.randperm(num_examples)
    num_training_examples = int((1 - validation_fraction) * num_examples)
    num_validation_examples = num_examples - num_training_examples
    train_indices, val_indices = (
        permuted_indices[:num_training_examples],
        permuted_indices[num_training_examples:],
    )

    # Dataset is shared for training and validation loaders.
    dataset = data.TensorDataset(theta, x)

    # Create train and validation loaders using a subset sampler.
    train_loader = data.DataLoader(
        dataset,
        batch_size=min(training_batch_size, num_training_examples),
        drop_last=True,
        sampler=SubsetRandomSampler(train_indices),
    )
    val_loader = data.DataLoader(
        dataset,
        batch_size=min(training_batch_size, num_validation_examples),
        shuffle=False,
        drop_last=False,
        sampler=SubsetRandomSampler(val_indices),
    )

    # First round or if retraining from scratch:
    # Call `self._build_neural_net` with the round's thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._neural_net is None or retrain_from_scratch_each_round:
        self._neural_net = self._build_neural_net(
            theta[train_indices], x[train_indices]
        )
        self._x_shape = x_shape_from_simulation(x)
        assert (
            len(self._x_shape) < 3
        ), "SNLE cannot handle multi-dimensional simulator output."

    self._neural_net.to(self._device)
    optimizer = optim.Adam(
        list(self._neural_net.parameters()),
        lr=learning_rate,
    )

    epoch, self._val_log_prob = 0, float("-Inf")
    while epoch <= max_num_epochs and not self._converged(epoch, stop_after_epochs):

        # Train for a single epoch.
        self._neural_net.train()
        for batch in train_loader:
            optimizer.zero_grad()
            theta_batch, x_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
            )
            # Evaluate on x with theta as context.
            log_prob = self._neural_net.log_prob(x_batch, context=theta_batch)
            loss = -torch.mean(log_prob)
            loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.parameters(),
                    max_norm=clip_max_norm,
                )
            optimizer.step()

        epoch += 1

        # Calculate validation performance.
        self._neural_net.eval()
        log_prob_sum = 0
        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                )
                # Evaluate on x with theta as context.
                log_prob = self._neural_net.log_prob(x_batch, context=theta_batch)
                log_prob_sum += log_prob.sum().item()
        self._val_log_prob = log_prob_sum / num_validation_examples

        # Log validation log prob for every epoch.
        self._summary["validation_log_probs"].append(self._val_log_prob)

        self._maybe_show_progress(self._show_progress_bars, epoch)

    self._report_convergence_at_end(epoch, stop_after_epochs, max_num_epochs)

    # Update summary.
    self._summary["epochs"].append(epoch)
    self._summary["best_validation_log_probs"].append(self._best_val_log_prob)

    # Update TensorBoard and summary dict.
    self._summarize(
        round_=self._round,
        x_o=None,
        theta_bank=theta,
        x_bank=x,
    )

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    return deepcopy(self._neural_net)
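# Minimal, self-contained sketch of the split-and-load pattern used above,
# with toy tensors standing in for simulations (illustrative only; uncommented
# it runs with just `torch` installed).
#
#     import torch
#     from torch.utils import data
#     from torch.utils.data.sampler import SubsetRandomSampler
#
#     theta, x = torch.randn(100, 2), torch.randn(100, 3)
#     perm = torch.randperm(len(theta))
#     n_train = int(0.9 * len(theta))  # validation_fraction = 0.1
#     dataset = data.TensorDataset(theta, x)
#     train_loader = data.DataLoader(
#         dataset, batch_size=50, drop_last=True,
#         sampler=SubsetRandomSampler(perm[:n_train]),
#     )
#     val_loader = data.DataLoader(
#         dataset, batch_size=50,
#         sampler=SubsetRandomSampler(perm[n_train:]),
#     )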
def train(
    self,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    exclude_invalid_x: bool = True,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[dict] = None,
) -> nn.Module:
    r"""Return density estimator that approximates the distribution $p(\theta|x)$.

    Args:
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. If
            None, we train until validation loss increases (see also
            `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order
            to prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect to
            the simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS
            2017.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        resume_training: Can be used in case training time is limited, e.g. on
            a cluster. If `True`, the split between train and validation set,
            the optimizer, the number of epochs, and the best validation
            log-prob will be restored from the last time `.train()` was
            called.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and
            validation loss after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the
            training and validation dataloaders (like, e.g., a collate_fn).

    Returns:
        Density estimator that approximates the distribution $p(\theta|x)$.
    """
    # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
    if calibration_kernel is None:
        calibration_kernel = lambda x: ones([len(x)], device=self._device)

    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    # Starting index for the training set (1 = discard round-0 samples).
    start_idx = int(discard_prior_samples and self._round > 0)

    # For non-atomic loss, we can not reuse samples from previous rounds as of
    # now.
    if self.use_non_atomic_loss:
        start_idx = self._round

    theta, x, prior_masks = self.get_simulations(
        start_idx, exclude_invalid_x, warn_on_invalid=True
    )

    # Dataset is shared for training and validation loaders.
    dataset = data.TensorDataset(theta, x, prior_masks)

    # Set the proposal to the last proposal that was passed by the user. For
    # atomic SNPE, it does not matter what the proposal is. For non-atomic
    # SNPE, we only use the latest data that was passed, i.e. the one from the
    # last proposal.
    proposal = self._proposal_roundwise[-1]

    train_loader, val_loader = self.get_dataloaders(
        dataset,
        training_batch_size,
        validation_fraction,
        resume_training,
        dataloader_kwargs=dataloader_kwargs,
    )

    # First round or if retraining from scratch:
    # Call `self._build_neural_net` with the round's thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._neural_net is None or retrain_from_scratch_each_round:
        self._neural_net = self._build_neural_net(
            theta[self.train_indices], x[self.train_indices]
        )
        test_posterior_net_for_multi_d_x(self._neural_net, theta, x)
        self._x_shape = x_shape_from_simulation(x)

    # Move entire net to device for training.
    self._neural_net.to(self._device)

    if not resume_training:
        self.optimizer = optim.Adam(
            list(self._neural_net.parameters()),
            lr=learning_rate,
        )
        self.epoch, self._val_log_prob = 0, float("-Inf")

    while self.epoch <= max_num_epochs and not self._converged(
        self.epoch, stop_after_epochs
    ):

        # Train for a single epoch.
        self._neural_net.train()
        for batch in train_loader:
            self.optimizer.zero_grad()
            # Get batches on current device.
            theta_batch, x_batch, masks_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
                batch[2].to(self._device),
            )
            loss = self._loss(
                theta_batch,
                x_batch,
                masks_batch,
                proposal,
                calibration_kernel,
            )
            if loss is None:
                continue
            batch_loss = torch.mean(loss)
            batch_loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.parameters(),
                    max_norm=clip_max_norm,
                )
            self.optimizer.step()

        self.epoch += 1

        # Calculate validation performance.
        self._neural_net.eval()
        log_prob_sum = 0
        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch, masks_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                    batch[2].to(self._device),
                )
                loss = self._loss(
                    theta_batch,
                    x_batch,
                    masks_batch,
                    proposal,
                    calibration_kernel,
                )
                if loss is None:
                    continue
                # Take negative loss here to get validation log_prob.
                batch_log_prob = -loss
                log_prob_sum += batch_log_prob.sum().item()

        # Take mean over all validation samples.
        self._val_log_prob = log_prob_sum / (
            len(val_loader) * val_loader.batch_size  # type: ignore
        )

        # Log validation log prob for every epoch.
        self._summary["validation_log_probs"].append(self._val_log_prob)

        self._maybe_show_progress(self._show_progress_bars, self.epoch)

    self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

    # Update summary.
    self._summary["epochs"].append(self.epoch)
    self._summary["best_validation_log_probs"].append(self._best_val_log_prob)

    # Update tensorboard and summary dict.
    self._summarize(
        round_=self._round,
        x_o=None,
        theta_bank=theta,
        x_bank=x,
    )

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    return deepcopy(self._neural_net)
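# Hedged sketch of `resume_training` for time-limited jobs, e.g. a cluster
# with wall-clock limits. `inference` is an assumed, already-constructed SNPE
# object with simulations appended; the per-job epoch budget is illustrative.
#
#     density_estimator = inference.train(max_num_epochs=50)
#     # ... job is resubmitted; the train/validation split, optimizer state,
#     # epoch count, and best validation log-prob are restored:
#     density_estimator = inference.train(resume_training=True)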
def __call__(
    self,
    num_rounds: int,
    num_simulations_per_round: OneOrMore[int],
    x_o: Optional[Tensor] = None,
    num_atoms: int = 10,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    exclude_invalid_x: bool = True,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
) -> NeuralPosterior:
    r"""Run SNRE.

    Return posterior $p(\theta|x)$ after inference (possibly over several
    rounds).

    Args:
        num_atoms: Number of atoms to use for classification.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.

    Returns:
        Posterior $p(\theta|x)$ that can be sampled and evaluated.
    """
    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    num_sims_per_round = self._ensure_list(num_simulations_per_round, num_rounds)

    for round_, num_sims in enumerate(num_sims_per_round):

        # Run simulations for the round.
        theta, x = self._run_simulations(round_, num_sims)
        self._append_to_data_bank(theta, x, round_)

        # Load data from most recent round.
        theta, x, _ = self._get_from_data_bank(round_, exclude_invalid_x, False)

        # First round or if retraining from scratch:
        # Call `self._build_neural_net` with the round's thetas and xs as
        # arguments, which will build the neural network.
        # This is passed into NeuralPosterior, to create a neural posterior
        # which can `sample()` and `log_prob()`. The network is accessible
        # via `.net`.
        if round_ == 0 or retrain_from_scratch_each_round:
            x_shape = x_shape_from_simulation(x)
            self._posterior = NeuralPosterior(
                method_family=self.__class__.__name__.lower(),
                neural_net=self._build_neural_net(theta, x),
                prior=self._prior,
                x_shape=x_shape,
                sample_with_mcmc=self._sample_with_mcmc,
                mcmc_method=self._mcmc_method,
                mcmc_parameters=self._mcmc_parameters,
                get_potential_function=PotentialFunctionProvider(),
            )
            self._handle_x_o_wrt_amortization(x_o, x_shape, num_rounds)

        # Fit posterior using newly aggregated data set.
        self._train(
            round_=round_,
            num_atoms=num_atoms,
            training_batch_size=training_batch_size,
            learning_rate=learning_rate,
            validation_fraction=validation_fraction,
            stop_after_epochs=stop_after_epochs,
            max_num_epochs=max_num_epochs,
            clip_max_norm=clip_max_norm,
            exclude_invalid_x=exclude_invalid_x,
            discard_prior_samples=discard_prior_samples,
        )

        # Update description for progress bar.
        if self._show_round_summary:
            print(self._describe_round(round_, self._summary))

        # Update tensorboard and summary dict.
        self._summarize(
            round_=round_,
            x_o=self._posterior.default_x,
            theta_bank=theta,
            x_bank=x,
        )

    self._posterior._num_trained_rounds = num_rounds

    return self._posterior
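# Hedged usage sketch for the multi-round call above. `simulator`, `prior`,
# and `x_o` are assumed user-provided names; with `num_rounds=2`, rounds
# after the first simulate from the posterior focused on `x_o`.
#
#     inference = SNRE(simulator, prior)
#     posterior = inference(
#         num_rounds=2, num_simulations_per_round=[1000, 500], x_o=x_o
#     )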
def __call__(
    self,
    num_simulations: int,
    proposal: Optional[Any] = None,
    num_atoms: int = 10,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    exclude_invalid_x: bool = True,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
) -> RatioBasedPosterior:
    r"""Run SNRE. Return posterior $p(\theta|x)$ after inference.

    Args:
        num_atoms: Number of atoms to use for classification.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.

    Returns:
        Posterior $p(\theta|x)$ that can be sampled and evaluated.
    """
    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    self._check_proposal(proposal)
    self._round = self._round + 1 if proposal is not None else 0

    # If presimulated data was provided from a later round, set self._round to
    # this value. Otherwise, we would rely on the user to _additionally_
    # provide the proposal that the presimulated data was sampled from in
    # order for self._round to become larger than 0.
    if self._data_round_index:
        self._round = max(self._round, max(self._data_round_index))

    # Run simulations for the round.
    theta, x = self._run_simulations(proposal, num_simulations)
    self._append_to_data_bank(theta, x, self._round)

    # Load data from most recent round.
    theta, x, _ = self._get_from_data_bank(self._round, exclude_invalid_x, False)

    # First round or if retraining from scratch:
    # Call `self._build_neural_net` with the round's thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._posterior is None or retrain_from_scratch_each_round:
        x_shape = x_shape_from_simulation(x)
        self._posterior = RatioBasedPosterior(
            method_family=self.__class__.__name__.lower(),
            neural_net=self._build_neural_net(theta, x),
            prior=self._prior,
            x_shape=x_shape,
            mcmc_method=self._mcmc_method,
            mcmc_parameters=self._mcmc_parameters,
            get_potential_function=PotentialFunctionProvider(),
        )

    # Fit posterior using newly aggregated data set.
    self._train(
        num_atoms=num_atoms,
        training_batch_size=training_batch_size,
        learning_rate=learning_rate,
        validation_fraction=validation_fraction,
        stop_after_epochs=stop_after_epochs,
        max_num_epochs=max_num_epochs,
        clip_max_norm=clip_max_norm,
        exclude_invalid_x=exclude_invalid_x,
        discard_prior_samples=discard_prior_samples,
    )

    # Update description for progress bar.
    if self._show_round_summary:
        print(self._describe_round(self._round, self._summary))

    # Update tensorboard and summary dict.
    self._summarize(
        round_=self._round,
        x_o=self._posterior.default_x,
        theta_bank=theta,
        x_bank=x,
    )

    self._posterior._num_trained_rounds = self._round + 1

    return deepcopy(self._posterior)
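# Hedged sketch of the proposal-based variant above (assumed names as in the
# earlier sketches). Passing the previous posterior as `proposal` increments
# the round counter and focuses the next batch of simulations on `x_o`.
#
#     posterior = inference(num_simulations=1000)                     # round 0
#     posterior.set_default_x(x_o)
#     posterior = inference(num_simulations=500, proposal=posterior)  # round 1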
def train(
    self,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    exclude_invalid_x: bool = True,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
) -> flows.Flow:
    r"""Train the density estimator to learn the distribution $p(x|\theta)$.

    Args:
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        resume_training: Can be used in case training time is limited, e.g. on
            a cluster. If `True`, the split between train and validation set,
            the optimizer, the number of epochs, and the best validation
            log-prob will be restored from the last time `.train()` was
            called.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and
            validation loss after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the
            training and validation dataloaders (like, e.g., a collate_fn).

    Returns:
        Density estimator that has learned the distribution $p(x|\theta)$.
    """
    # Load data from most recent round.
    self._round = max(self._data_round_index)

    # Starting index for the training set (1 = discard round-0 samples).
    start_idx = int(discard_prior_samples and self._round > 0)

    theta, x, _ = self.get_simulations(
        start_idx, exclude_invalid_x, warn_on_invalid=True
    )

    # Dataset is shared for training and validation loaders.
    dataset = data.TensorDataset(theta, x)

    train_loader, val_loader = self.get_dataloaders(
        dataset,
        training_batch_size,
        validation_fraction,
        resume_training,
        dataloader_kwargs=dataloader_kwargs,
    )

    # First round or if retraining from scratch:
    # Call `self._build_neural_net` with the round's thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._neural_net is None or retrain_from_scratch:
        self._neural_net = self._build_neural_net(
            theta[self.train_indices], x[self.train_indices]
        )
        self._x_shape = x_shape_from_simulation(x)
        assert (
            len(self._x_shape) < 3
        ), "SNLE cannot handle multi-dimensional simulator output."

    self._neural_net.to(self._device)

    if not resume_training:
        self.optimizer = optim.Adam(
            list(self._neural_net.parameters()),
            lr=learning_rate,
        )
        self.epoch, self._val_log_prob = 0, float("-Inf")

    while self.epoch <= max_num_epochs and not self._converged(
        self.epoch, stop_after_epochs
    ):

        # Train for a single epoch.
        self._neural_net.train()
        train_log_probs_sum = 0
        for batch in train_loader:
            self.optimizer.zero_grad()
            theta_batch, x_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
            )
            # Evaluate on x with theta as context.
            train_losses = self._loss(theta=theta_batch, x=x_batch)
            train_loss = torch.mean(train_losses)
            train_log_probs_sum -= train_losses.sum().item()

            train_loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.parameters(),
                    max_norm=clip_max_norm,
                )
            self.optimizer.step()

        self.epoch += 1

        train_log_prob_average = train_log_probs_sum / (
            len(train_loader) * train_loader.batch_size  # type: ignore
        )
        self._summary["train_log_probs"].append(train_log_prob_average)

        # Calculate validation performance.
        self._neural_net.eval()
        val_log_prob_sum = 0
        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                )
                # Evaluate on x with theta as context.
                val_losses = self._loss(theta=theta_batch, x=x_batch)
                val_log_prob_sum -= val_losses.sum().item()

        # Take mean over all validation samples.
        self._val_log_prob = val_log_prob_sum / (
            len(val_loader) * val_loader.batch_size  # type: ignore
        )

        # Log validation log prob for every epoch.
        self._summary["validation_log_probs"].append(self._val_log_prob)

        self._maybe_show_progress(self._show_progress_bars, self.epoch)

    self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

    # Update summary.
    self._summary["epochs"].append(self.epoch)
    self._summary["best_validation_log_probs"].append(self._best_val_log_prob)

    # Update TensorBoard and summary dict.
    self._summarize(
        round_=self._round,
        x_o=None,
        theta_bank=theta,
        x_bank=x,
    )

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    # Avoid keeping gradients in the resulting network, which can cause
    # memory leaks when benchmarking.
    self._neural_net.zero_grad(set_to_none=True)

    return deepcopy(self._neural_net)
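# Note on the bookkeeping above (a worked example, not extra functionality):
# `_loss` returns negative log probs, so negating the summed losses recovers
# the summed log probs. With `drop_last=True` every training batch holds
# exactly `batch_size` examples, so `len(train_loader) * batch_size` is the
# number of examples actually seen per epoch. E.g. 207 training examples with
# batch_size=50 give len(train_loader)=4, so the average is over 200
# examples, not 207.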
def train(
    self,
    num_atoms: int = 10,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    exclude_invalid_x: bool = True,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
) -> nn.Module:
    r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.

    Args:
        num_atoms: Number of atoms to use for classification.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        resume_training: Can be used in case training time is limited, e.g. on
            a cluster. If `True`, the split between train and validation set,
            the optimizer, the number of epochs, and the best validation
            log-prob will be restored from the last time `.train()` was
            called.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.
        dataloader_kwargs: Additional or updated kwargs to be passed to the
            training and validation dataloaders (like, e.g., a collate_fn).

    Returns:
        Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
    """
    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    # Load data from most recent round.
    self._round = max(self._data_round_index)

    # Starting index for the training set (1 = discard round-0 samples).
    start_idx = int(discard_prior_samples and self._round > 0)

    theta, x, _ = self.get_simulations(
        start_idx, exclude_invalid_x, warn_on_invalid=True
    )

    # Dataset is shared for training and validation loaders.
    dataset = data.TensorDataset(theta, x)

    train_loader, val_loader = self.get_dataloaders(
        dataset,
        training_batch_size,
        validation_fraction,
        resume_training,
        dataloader_kwargs=dataloader_kwargs,
    )

    # Atoms are drawn from within a batch, so `num_atoms` must not exceed the
    # (possibly clipped) batch size.
    clipped_batch_size = min(training_batch_size, val_loader.batch_size)  # type: ignore
    num_atoms = clamp_and_warn(
        "num_atoms", num_atoms, min_val=2, max_val=clipped_batch_size
    )

    # First round or if retraining from scratch:
    # Call `self._build_neural_net` with the round's thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._neural_net is None or retrain_from_scratch_each_round:
        self._neural_net = self._build_neural_net(
            theta[self.train_indices], x[self.train_indices]
        )
        self._x_shape = x_shape_from_simulation(x)

    self._neural_net.to(self._device)

    if not resume_training:
        self.optimizer = optim.Adam(
            list(self._neural_net.parameters()),
            lr=learning_rate,
        )
        self.epoch, self._val_log_prob = 0, float("-Inf")

    while self.epoch <= max_num_epochs and not self._converged(
        self.epoch, stop_after_epochs
    ):

        # Train for a single epoch.
        self._neural_net.train()
        for batch in train_loader:
            self.optimizer.zero_grad()
            theta_batch, x_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
            )
            loss = self._loss(theta_batch, x_batch, num_atoms)
            loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.parameters(),
                    max_norm=clip_max_norm,
                )
            self.optimizer.step()

        self.epoch += 1

        # Calculate validation performance.
        self._neural_net.eval()
        loss_sum = 0
        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                )
                loss = self._loss(theta_batch, x_batch, num_atoms)
                loss_sum -= loss.sum().item()

        # Take mean over all validation samples.
        self._val_log_prob = loss_sum / (
            len(val_loader) * val_loader.batch_size  # type: ignore
        )

        # Log validation log prob for every epoch.
        self._summary["validation_log_probs"].append(self._val_log_prob)

        self._maybe_show_progress(self._show_progress_bars, self.epoch)

    self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

    # Update summary.
    self._summary["epochs"].append(self.epoch)
    self._summary["best_validation_log_probs"].append(self._best_val_log_prob)

    # Update TensorBoard and summary dict.
    self._summarize(
        round_=self._round,
        x_o=None,
        theta_bank=theta,
        x_bank=x,
    )

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    return deepcopy(self._neural_net)
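# Why `num_atoms` matters above: SNRE trains the classifier contrastively by
# pairing each `x` in a batch with `num_atoms` parameter candidates drawn
# from the same batch, so `num_atoms` can never exceed the batch size. A
# hedged toy illustration of such within-batch atom selection (standalone,
# illustrative shapes only; not the library's internal code):
#
#     import torch
#     batch_theta = torch.randn(50, 2)           # batch of parameters
#     num_atoms = 10
#     # For each item, sample `num_atoms - 1` contrasting thetas from the
#     # rest of the batch (zero probability on the item itself).
#     probs = torch.ones(50, 50) * (1 - torch.eye(50)) / 49
#     choices = torch.multinomial(probs, num_samples=num_atoms - 1)
#     contrastive_theta = batch_theta[choices]   # shape (50, 9, 2)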
def __call__(
    self,
    num_rounds: int,
    num_simulations_per_round: OneOrMore[int],
    x_o: Optional[Tensor] = None,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    exclude_invalid_x: bool = True,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
) -> NeuralPosterior:
    r"""Run SNPE.

    Return posterior $p(\theta|x)$ after inference (possibly over several
    rounds).

    Args:
        num_rounds: Number of rounds to run. Each round consists of a
            simulation and training phase. `num_rounds=1` leads to a posterior
            $p(\theta|x)$ valid for _any_ $x$ ("amortized"), but requires many
            simulations. Alternatively, with `num_rounds>1` the inference
            returns a posterior $p(\theta|x_o)$ focused on a specific
            observation `x_o`, potentially requiring fewer simulations.
        num_simulations_per_round: Number of simulator calls per round.
        x_o: An observation that is only required when doing inference over
            multiple rounds. After the first round, `x_o` is used to guide the
            sampling so that the simulator is run with parameters that are
            likely for that `x_o`, i.e. they are sampled from the posterior
            obtained in the previous round $p(\theta|x_o)$.
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. If
            None, we train until validation loss increases (see also
            `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order
            to prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect to
            the simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS
            2017.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.

    Returns:
        Posterior $p(\theta|x)$ that can be sampled and evaluated.
    """
    self._warn_if_retrain_from_scratch_snpe(retrain_from_scratch_each_round)

    # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
    if calibration_kernel is None:
        calibration_kernel = lambda x: ones([len(x)])

    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    num_sims_per_round = self._ensure_list(num_simulations_per_round, num_rounds)

    for round_, num_sims in enumerate(num_sims_per_round):

        # Run simulations for the round.
        theta, x = self._run_simulations(round_, num_sims)
        self._append_to_data_bank(theta, x, round_)

        # Load data from most recent round.
        theta, x, _ = self._get_from_data_bank(round_, exclude_invalid_x, False)

        # First round or if retraining from scratch:
        # Call `self._build_neural_net` with the round's thetas and xs as
        # arguments, which will build the neural network.
        # This is passed into NeuralPosterior, to create a neural posterior
        # which can `sample()` and `log_prob()`. The network is accessible
        # via `.net`.
        if round_ == 0 or retrain_from_scratch_each_round:
            x_shape = x_shape_from_simulation(x)
            self._posterior = NeuralPosterior(
                method_family="snpe",
                neural_net=self._build_neural_net(theta, x),
                prior=self._prior,
                x_shape=x_shape,
                sample_with_mcmc=self._sample_with_mcmc,
                mcmc_method=self._mcmc_method,
                mcmc_parameters=self._mcmc_parameters,
                get_potential_function=PotentialFunctionProvider(),
            )
            self._handle_x_o_wrt_amortization(x_o, x_shape, num_rounds)

        # Fit posterior using newly aggregated data set.
        self._train(
            round_=round_,
            training_batch_size=training_batch_size,
            learning_rate=learning_rate,
            validation_fraction=validation_fraction,
            stop_after_epochs=stop_after_epochs,
            max_num_epochs=cast(int, max_num_epochs),
            clip_max_norm=clip_max_norm,
            calibration_kernel=calibration_kernel,
            exclude_invalid_x=exclude_invalid_x,
            discard_prior_samples=discard_prior_samples,
        )

        # Store model at the end of each round.
        self._model_bank.append(deepcopy(self._posterior))
        self._model_bank[-1].net.eval()

        # Make the call to `leakage_correction()` and the update of
        # self._leakage_density_correction_factor explicit here. This ensures
        # the update never gets lost even when we e.g. no longer log to
        # tensorboard. Calling `leakage_correction()` is needed to update the
        # leakage after each round.
        if self._posterior.default_x is None:
            acceptance_rate = torch.tensor(float("nan"))
        else:
            acceptance_rate = self._posterior.leakage_correction(
                x=self._posterior.default_x,
                force_update=True,
                show_progress_bars=self._show_progress_bars,
            )

        # Update tensorboard and summary dict.
        self._summarize(
            round_=round_,
            x_o=self._posterior.default_x,
            theta_bank=theta,
            x_bank=x,
            posterior_samples_acceptance_rate=acceptance_rate,
        )

        # Update description for progress bar.
        if self._show_round_summary:
            print(self._describe_round(round_, self._summary))

    self._posterior._num_trained_rounds = num_rounds

    return self._posterior
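# Context for the leakage handling above (explanatory, no new behavior):
# SNPE's density estimator can place probability mass outside the prior
# support ("leakage"); `leakage_correction()` estimates the acceptance rate
# of posterior samples under the prior so that `log_prob()` can be
# renormalized. A hedged sketch of inspecting it after inference, with `x_o`
# an assumed observation:
#
#     acceptance = posterior.leakage_correction(x=x_o)
#     print(f"posterior mass inside prior support: {acceptance.item():.2%}")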
def train(
    self,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    exclude_invalid_x: bool = True,
    resume_training: bool = False,
    force_first_round_loss: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[dict] = None,
) -> nn.Module:
    r"""Return density estimator that approximates the distribution $p(\theta|x)$.

    Args:
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing.
            Otherwise, we train until validation loss increases (see also
            `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order
            to prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect to
            the simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS
            2017.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        resume_training: Can be used in case training time is limited, e.g. on
            a cluster. If `True`, the split between train and validation set,
            the optimizer, the number of epochs, and the best validation
            log-prob will be restored from the last time `.train()` was
            called.
        force_first_round_loss: If `True`, train with maximum likelihood,
            i.e., potentially ignoring the correction for using a proposal
            distribution different from the prior.
        discard_prior_samples: Whether to discard samples simulated in round
            1, i.e. from the prior. Training may be sped up by ignoring such
            less targeted samples.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and
            validation loss after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the
            training and validation dataloaders (like, e.g., a collate_fn).

    Returns:
        Density estimator that approximates the distribution $p(\theta|x)$.
    """
    if self._round == 0 and self._neural_net is not None:
        assert force_first_round_loss, (
            "You have already trained this neural network. After you had "
            "trained the network, you again appended simulations with "
            "`append_simulations(theta, x)`, but you did not provide a "
            "proposal. If the new simulations are sampled from the prior, you "
            "can set `.train(..., force_first_round_loss=True)`. However, if "
            "the new simulations were not sampled from the prior, you should "
            "pass the proposal, i.e. `append_simulations(theta, x, proposal)`. "
            "If your samples are not sampled from the prior and you do not "
            "pass a proposal and you set `force_first_round_loss=True`, the "
            "result of SNPE will not be the true posterior. Instead, it will "
            "be the proposal posterior, which (usually) is more narrow than "
            "the true posterior."
        )

    # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
    if calibration_kernel is None:
        calibration_kernel = lambda x: ones([len(x)], device=self._device)

    # Starting index for the training set (1 = discard round-0 samples).
    start_idx = int(discard_prior_samples and self._round > 0)

    # For non-atomic loss, we can not reuse samples from previous rounds as of
    # now. SNPE-A can, by construction of the algorithm, only use samples from
    # the last round. SNPE-A is the only algorithm that has an attribute
    # `_ran_final_round`, so this is how we check for whether or not we are
    # using SNPE-A.
    if self.use_non_atomic_loss or hasattr(self, "_ran_final_round"):
        start_idx = self._round

    theta, x, prior_masks = self.get_simulations(
        start_idx, exclude_invalid_x, warn_on_invalid=True
    )

    # Dataset is shared for training and validation loaders.
    dataset = data.TensorDataset(theta, x, prior_masks)

    # Set the proposal to the last proposal that was passed by the user. For
    # atomic SNPE, it does not matter what the proposal is. For non-atomic
    # SNPE, we only use the latest data that was passed, i.e. the one from the
    # last proposal.
    proposal = self._proposal_roundwise[-1]

    train_loader, val_loader = self.get_dataloaders(
        dataset,
        training_batch_size,
        validation_fraction,
        resume_training,
        dataloader_kwargs=dataloader_kwargs,
    )

    # First round or if retraining from scratch:
    # Call `self._build_neural_net` with the round's thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._neural_net is None or retrain_from_scratch:
        self._neural_net = self._build_neural_net(
            theta[self.train_indices], x[self.train_indices]
        )
        # If the data is already on the training device, move the net as well.
        if (
            not self._device == "cpu"
            and f"{x.device.type}:{x.device.index}" == self._device
        ):
            self._neural_net.to(self._device)
        test_posterior_net_for_multi_d_x(self._neural_net, theta, x)
        self._x_shape = x_shape_from_simulation(x)

    # Move entire net to device for training.
    self._neural_net.to(self._device)

    if not resume_training:
        self.optimizer = optim.Adam(
            list(self._neural_net.parameters()), lr=learning_rate
        )
        self.epoch, self._val_log_prob = 0, float("-Inf")

    while self.epoch <= max_num_epochs and not self._converged(
        self.epoch, stop_after_epochs
    ):

        # Train for a single epoch.
        self._neural_net.train()
        train_log_probs_sum = 0
        epoch_start_time = time.time()
        for batch in train_loader:
            self.optimizer.zero_grad()
            # Get batches on current device.
            theta_batch, x_batch, masks_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
                batch[2].to(self._device),
            )
            train_losses = self._loss(
                theta_batch, x_batch, masks_batch, proposal, calibration_kernel
            )
            train_loss = torch.mean(train_losses)
            train_log_probs_sum -= train_losses.sum().item()

            train_loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.parameters(), max_norm=clip_max_norm
                )
            self.optimizer.step()

        self.epoch += 1

        train_log_prob_average = train_log_probs_sum / (
            len(train_loader) * train_loader.batch_size  # type: ignore
        )
        self._summary["train_log_probs"].append(train_log_prob_average)

        # Calculate validation performance.
        self._neural_net.eval()
        val_log_prob_sum = 0
        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch, masks_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                    batch[2].to(self._device),
                )
                # Take negative loss here to get validation log_prob.
                val_losses = self._loss(
                    theta_batch,
                    x_batch,
                    masks_batch,
                    proposal,
                    calibration_kernel,
                )
                val_log_prob_sum -= val_losses.sum().item()

        # Take mean over all validation samples.
        self._val_log_prob = val_log_prob_sum / (
            len(val_loader) * val_loader.batch_size  # type: ignore
        )

        # Log validation log prob for every epoch.
        self._summary["validation_log_probs"].append(self._val_log_prob)
        self._summary["epoch_durations_sec"].append(time.time() - epoch_start_time)

        self._maybe_show_progress(self._show_progress_bars, self.epoch)

    self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

    # Update summary.
    self._summary["epochs"].append(self.epoch)
    self._summary["best_validation_log_probs"].append(self._best_val_log_prob)

    # Update tensorboard and summary dict.
    self._summarize(round_=self._round, x_o=None, theta_bank=theta, x_bank=x)

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    # Avoid keeping the gradients in the resulting network, which can cause
    # memory leakage when benchmarking.
    self._neural_net.zero_grad(set_to_none=True)

    return deepcopy(self._neural_net)
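# Hedged end-to-end sketch of the flexible interface this `train()` belongs
# to. `prior`, `theta`, `x`, `theta2`, `x2`, and `x_o` are assumed
# user-provided names; the calls mirror the sbi flexible API at this stage.
#
#     inference = SNPE(prior=prior)
#     density_estimator = inference.append_simulations(theta, x).train()
#     posterior = inference.build_posterior(density_estimator)
#     # Appending new prior samples after training requires either a proposal
#     # or an explicit opt-in, as enforced by the assertion above:
#     inference.append_simulations(theta2, x2)
#     density_estimator = inference.train(force_first_round_loss=True)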