def predict_log_prob_batch(self, state, action):
    data_loader = create_data_loader((state, action), batch_size=32,
                                     shuffle=False, drop_last=False)
    log_probs = []
    for obs, action in data_loader:
        obs = move_tensor_to_gpu(obs)
        action = move_tensor_to_gpu(action)
        action_distribution = self.policy_net.forward_action(obs)
        log_probs.append(action_distribution.log_prob(action))
    log_probs = torch.cat(log_probs, dim=0).cpu().numpy()
    return log_probs
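# The helpers used throughout these methods (create_data_loader, move_tensor_to_gpu) are not
# defined in this excerpt. Below is a minimal sketch of plausible implementations, given only
# as an assumption about their behavior; the project's actual utilities may differ (e.g. in
# dtype handling or device selection).
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset


def create_data_loader(arrays, batch_size=32, shuffle=False, drop_last=False):
    # wrap a tuple of numpy arrays into a DataLoader that yields batches of tensors
    tensors = [torch.as_tensor(np.asarray(a)) for a in arrays]
    return DataLoader(TensorDataset(*tensors), batch_size=batch_size,
                      shuffle=shuffle, drop_last=drop_last)


def move_tensor_to_gpu(data):
    # move a tensor (or a list/tuple of tensors, as yielded by a DataLoader) to the GPU if available
    if isinstance(data, (list, tuple)):
        return type(data)(move_tensor_to_gpu(d) for d in data)
    return data.cuda() if torch.cuda.is_available() else data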
def fit(self, dataset: StateActionPairDataset, epoch=10, batch_size=128, verbose=False):
    t = range(epoch)
    if verbose:
        t = tqdm(t)
    train_data_loader, val_data_loader = dataset.random_iterator(batch_size=batch_size)
    for i in t:
        losses = []
        for history_states, history_actions, states, actions in train_data_loader:
            self.optimizer.zero_grad()
            history_states = move_tensor_to_gpu(history_states)
            history_actions = move_tensor_to_gpu(history_actions)
            states = move_tensor_to_gpu(states)
            actions = move_tensor_to_gpu(actions)
            # normalize states with the dataset statistics
            history_states = (history_states - self.state_mean) / self.state_std
            states = (states - self.state_mean.squeeze(dim=1)) / self.state_std.squeeze(dim=1)
            output = self.model.forward(history_states, history_actions, states)
            loss = self.loss_fn(output, actions)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())
        # validation pass
        self.eval()
        val_losses = []
        with torch.no_grad():
            for history_states, history_actions, states, actions in val_data_loader:
                history_states = move_tensor_to_gpu(history_states)
                history_actions = move_tensor_to_gpu(history_actions)
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                history_states = (history_states - self.state_mean) / self.state_std
                states = (states - self.state_mean.squeeze(dim=1)) / self.state_std.squeeze(dim=1)
                output = self.model.forward(history_states, history_actions, states)
                loss = self.loss_fn(output, actions)
                val_losses.append(loss.item())
        self.train()
        if verbose:
            t.set_description(
                'Epoch {}/{} - Avg policy train loss: {:.4f} - Avg policy val loss: {:.4f}'.format(
                    i + 1, epoch, np.mean(losses), np.mean(val_losses)))
def compute_old_log_prob(self, observation, hidden, actions):
    with torch.no_grad():
        data_loader = create_data_loader((observation, hidden, actions), batch_size=32,
                                         shuffle=False, drop_last=False)
        old_log_prob = []
        for obs, hid, ac in data_loader:
            obs = move_tensor_to_gpu(obs)
            hid = move_tensor_to_gpu(hid)
            ac = move_tensor_to_gpu(ac)
            old_distribution, _, _ = self.policy_net.forward(obs, hid)
            old_log_prob.append(old_distribution.log_prob(ac))
        old_log_prob = torch.cat(old_log_prob, dim=0).cpu()
        return old_log_prob
def fit_dynamic_model(self, dataset: Dataset, epoch=10, batch_size=128, verbose=False):
    t = range(epoch)
    if verbose:
        t = tqdm(t)
    train_data_loader, val_data_loader = dataset.random_iterator(batch_size=batch_size)
    for i in t:
        losses = []
        for states, actions, next_states, _, _ in train_data_loader:
            # convert to tensor
            states = move_tensor_to_gpu(states)
            actions = move_tensor_to_gpu(actions)
            next_states = move_tensor_to_gpu(next_states)
            delta_states = next_states - states
            # calculate loss
            self.optimizer.zero_grad()
            predicted_delta_state_normalized = self.predict_normalized_delta_next_state(states, actions)
            delta_states_normalized = normalize(delta_states, self.delta_state_mean,
                                                self.delta_state_std)
            loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())
        self.eval()
        val_losses = []
        with torch.no_grad():
            for states, actions, next_states, _, _ in val_data_loader:
                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                delta_states = next_states - states
                predicted_delta_state_normalized = self.predict_normalized_delta_next_state(states, actions)
                delta_states_normalized = normalize(delta_states, self.delta_state_mean,
                                                    self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
                val_losses.append(loss.item())
        self.train()
        if verbose:
            t.set_description(
                'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'.format(
                    i + 1, epoch, np.mean(losses), np.mean(val_losses)))
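# `normalize` is another project utility that is not shown here. The dynamics-model losses above
# only require standard z-score normalization with the stored mean/std, so a plausible sketch
# (an assumption about the helper, not its exact implementation) is:
def normalize(data, mean, std, eps=1e-8):
    # z-score normalization with a small epsilon to guard against division by zero
    return (data - mean) / (std + eps)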
def fit_dynamic_model(self, dataset, epoch=10, batch_size=128, verbose=False):
    t = range(epoch)
    if verbose:
        t = tqdm(t)
    train_data_loader, val_data_loader = dataset.random_iterator(batch_size=batch_size)
    for i in t:
        losses = []
        for states, actions, next_states, _, _ in train_data_loader:
            # convert to tensor
            states = move_tensor_to_gpu(states)
            actions = move_tensor_to_gpu(actions)
            next_states = move_tensor_to_gpu(next_states)
            latent_distribution = self.inference_network.forward(next_states)
            z = latent_distribution.sample()
def update_policy(self, data_loader, epoch, logger):
    for epoch_index in range(epoch):
        for batch_sample in data_loader:
            # with torch.autograd.detect_anomaly():
            observation, action, discount_rewards, advantage, old_log_prob = move_tensor_to_gpu(
                batch_sample)
            self.policy_optimizer.zero_grad()
            # update policy
            distribution, raw_baselines = self.policy_net.forward(observation)
            entropy_loss = distribution.entropy().mean()
            log_prob = distribution.log_prob(action)
            assert log_prob.shape == advantage.shape, 'log_prob length {}, advantage length {}'.format(
                log_prob.shape, advantage.shape)
            # if the approximate kl is larger than 1.5 * target_kl, we skip the update for this batch
            negative_approx_kl = log_prob - old_log_prob
            negative_approx_kl_mean = torch.mean(-negative_approx_kl)
            ratio = torch.exp(negative_approx_kl)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                1.0 + self.clip_param) * advantage
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = self.baseline_loss(raw_baselines, discount_rewards)
            loss = policy_loss - entropy_loss * self.entropy_coef + value_loss * self.value_coef
            if negative_approx_kl_mean <= 1.5 * self.target_kl:
                # gradients must exist before clipping, so backward() comes before clip_grad_norm_
                loss.backward()
                nn.utils.clip_grad_norm_(self.policy_net.parameters(),
                                         self.max_grad_norm)
                self.policy_optimizer.step()
            logger.store(PolicyLoss=policy_loss.item())
            logger.store(ValueLoss=value_loss.item())
            logger.store(EntropyLoss=entropy_loss.item())
            logger.store(NegativeAvgKL=negative_approx_kl_mean.item())
def predict_state_value_batch(self, state):
    """ compute the state value using nn baseline

    Args:
        state: (batch_size, ob_dim)

    Returns: (batch_size,)

    """
    data_loader = create_data_loader((state,), batch_size=32, shuffle=False, drop_last=False)
    values = []
    for obs in data_loader:
        obs = move_tensor_to_gpu(obs[0])
        values.append(self.policy_net.forward_value(obs))
    values = torch.cat(values, dim=0).cpu().numpy()
    return values
def fit_dynamic_model(self, dataset, epoch=10, batch_size=128, logger=None):
    t = tqdm(range(epoch))
    train_data_loader, val_data_loader = dataset.random_iterator(batch_size=batch_size)
    for i in t:
        losses = []
        for states, actions, next_states, rewards, _ in train_data_loader:
            # skip batches of size 1, which would crash the batch_norm layer in training mode
            if states.shape[0] == 1:
                continue
            # convert to tensor
            states = move_tensor_to_gpu(states)
            actions = move_tensor_to_gpu(actions)
            next_states = move_tensor_to_gpu(next_states)
            rewards = move_tensor_to_gpu(rewards)
            delta_states = next_states - states
            # calculate loss
            self.optimizer.zero_grad()
            predicted_delta_state_normalized, predicted_reward_normalized = \
                self.predict_normalized_delta_next_state_reward(states, actions)
            delta_states_normalized = normalize(delta_states, self.delta_state_mean,
                                                self.delta_state_std)
            loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
            if self.cost_fn_batch is None:
                # no analytic cost function is provided, so also learn to predict the reward
                rewards_normalized = normalize(rewards, self.reward_mean, self.reward_std)
                loss += F.mse_loss(predicted_reward_normalized, rewards_normalized)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())
        self.eval()
        val_losses = []
        with torch.no_grad():
            for states, actions, next_states, rewards, _ in val_data_loader:
                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                rewards = move_tensor_to_gpu(rewards)
                delta_states = next_states - states
                predicted_delta_state_normalized, predicted_reward_normalized = \
                    self.predict_normalized_delta_next_state_reward(states, actions)
                delta_states_normalized = normalize(delta_states, self.delta_state_mean,
                                                    self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
                if self.cost_fn_batch is None:
                    rewards_normalized = normalize(rewards, self.reward_mean, self.reward_std)
                    loss += F.mse_loss(predicted_reward_normalized, rewards_normalized)
                val_losses.append(loss.item())
        self.train()
        if logger:
            logger.store(ModelTrainLoss=np.mean(losses))
            logger.store(ModelValLoss=np.mean(val_losses))
        t.set_description(
            'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'.format(
                i + 1, epoch, np.mean(losses), np.mean(val_losses)))
def train(self, num_epoch, train_data_loader, checkpoint_path=None, epoch_per_save=5,
          callbacks=(), summary_writer: SummaryWriter = None, verbose=True):
    n_iter = 0
    for epoch in range(num_epoch):
        self._set_to_train()
        negative_log_likelihood_train = 0.
        kl_divergence_train = 0.
        if verbose:
            t = tqdm(train_data_loader, desc='Epoch {}/{}'.format(epoch + 1, num_epoch))
        else:
            t = train_data_loader
        for data_batch in t:
            input = data_batch[0]
            self.optimizer.zero_grad()
            input = move_tensor_to_gpu(input)
            latent_distribution = self.encode(input)
            z = latent_distribution.rsample()
            out = self.decode_distribution(z)
            negative_log_likelihood = -out.log_prob(input).sum()
            kl_divergence = torch.distributions.kl_divergence(latent_distribution, self.prior).sum()
            loss = negative_log_likelihood + kl_divergence
            loss.backward()
            self.optimizer.step()
            negative_log_likelihood_train += negative_log_likelihood.item()
            kl_divergence_train += kl_divergence.item()
            if summary_writer:
                summary_writer.add_scalar('data/nll', negative_log_likelihood.item(), n_iter)
                summary_writer.add_scalar('data/kld', kl_divergence.item(), n_iter)
            n_iter += 1
        if verbose:
            num_dimensions = np.prod(list(train_data_loader.dataset[0][0].shape))
            negative_log_likelihood_train /= len(train_data_loader.dataset)
            negative_log_likelihood_train_bits_per_dim = log_to_log2(
                negative_log_likelihood_train / num_dimensions)
            kl_divergence_train /= len(train_data_loader.dataset)
            kl_divergence_train_bits_per_dim = log_to_log2(kl_divergence_train / num_dimensions)
            total_loss = negative_log_likelihood_train + kl_divergence_train
            total_loss_bits_per_dim = log_to_log2(total_loss / num_dimensions)
            total_loss_message = 'Total loss {:.4f}/{:.4f} (bits/dim)'.format(
                total_loss, total_loss_bits_per_dim)
            nll_message = 'Negative log likelihood {:.4f}/{:.4f} (bits/dim)'.format(
                negative_log_likelihood_train, negative_log_likelihood_train_bits_per_dim)
            kl_message = 'KL divergence {:.4f}/{:.4f} (bits/dim)'.format(
                kl_divergence_train, kl_divergence_train_bits_per_dim)
            print(' - '.join([total_loss_message, nll_message, kl_message]))
        if checkpoint_path is not None and (epoch + 1) % epoch_per_save == 0:
            self.save_checkpoint(checkpoint_path)
        if summary_writer:
            for callback in callbacks:
                callback(epoch, self, summary_writer)
    if checkpoint_path is not None:
        self.save_checkpoint(checkpoint_path)
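# `log_to_log2` is used above to report per-dimension losses in bits rather than nats: the
# log-likelihoods come from `log_prob` (natural log), and converting nats to bits is a division
# by ln 2. This is a minimal sketch assuming that is all the helper does; the project's version
# is not shown in this excerpt.
import numpy as np


def log_to_log2(x):
    # convert a quantity measured in nats (natural log) to bits (log base 2)
    return x / np.log(2)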
def train_on_data_loader(self, train_data_loader, verbose=True):
    """ Train on data_loader for one epoch

    Args:
        train_data_loader: training data loader
        verbose: verbose mode or not

    Returns: training loss list at each step

    """
    self.model.train()
    if verbose:
        t = tqdm(train_data_loader)
    else:
        t = train_data_loader
    train_loss = []
    for data_label in t:
        data, labels = data_label
        data = move_tensor_to_gpu(data)
        labels = move_tensor_to_gpu(labels)
        if not isinstance(labels, list):
            labels = [labels]
        self.optimizer.zero_grad()
        # for compatibility with singular data and labels
        if isinstance(data, list):
            outputs = self.model(*data)
        else:
            outputs = self.model(data)
        if not isinstance(outputs, tuple):
            outputs = [outputs]
        current_loss = []
        for j in range(len(outputs)):
            loss = self.loss[j](outputs[j], labels[j])
            if self.loss_weights is not None:
                loss = loss * self.loss_weights[j]
            current_loss.append(loss)
        loss = sum(current_loss)
        loss.backward()
        self.optimizer.step()
        # gather training statistics
        if verbose:
            stats_str = []
            stats_str.append('Train loss: {:.4f}'.format(loss.item()))
            stats = self._compute_metrics(outputs, labels)
            for i, stat in enumerate(stats):
                for metric, result in stat.items():
                    stats_str.append('Output {} {}: {:.4f}'.format(i, metric, result))
            training_description = " - ".join(stats_str)
            # set log for each batch
            t.set_description(training_description)
        train_loss.append(loss.item())
    return train_loss
def evaluate(self, data_loader, desc=None):
    self.model.eval()
    with torch.no_grad():
        total_loss = 0.0
        total = 0
        all_outputs = []
        all_labels = []
        for data_label in tqdm(data_loader, desc=desc):
            data, labels = data_label
            data = move_tensor_to_gpu(data)
            labels = move_tensor_to_gpu(labels)
            if not isinstance(labels, list):
                labels = [labels]
            if len(all_labels) == 0:
                for label in labels:
                    all_labels.append([label])
            else:
                for i, label in enumerate(labels):
                    all_labels[i].append(label)
            if isinstance(data, list):
                outputs = self.model(*data)
            else:
                outputs = self.model(data)
            if not isinstance(outputs, tuple):
                outputs = [outputs]
            if len(all_outputs) == 0:
                for output in outputs:
                    all_outputs.append([output])
            else:
                for i, output in enumerate(outputs):
                    all_outputs[i].append(output)
            current_loss = []
            for j in range(len(outputs)):
                loss = self.loss[j](outputs[j], labels[j])
                if self.loss_weights is not None:
                    loss = loss * self.loss_weights[j]
                current_loss.append(loss)
            loss = sum(current_loss)
            # calculate stats
            total_loss += loss.item() * labels[0].size(0)
            total += labels[0].size(0)
        for i, output in enumerate(all_outputs):
            all_outputs[i] = torch.cat(output, dim=0)
        for i, label in enumerate(all_labels):
            all_labels[i] = torch.cat(label, dim=0)
        loss = total_loss / total
        stats = self._compute_metrics(all_outputs, all_labels)
        return loss, stats
def compute_reward_to_go_gae(paths, gamma, policy_net, lam, value_mean, value_std):
    rewards = []
    gaes = []
    for path in paths:
        # compute last state value
        if path['mask'][-1] == 1:
            with torch.no_grad():
                last_obs = convert_numpy_to_tensor(
                    np.expand_dims(path['last_obs'], axis=0)).type(FloatTensor)
                last_hidden = convert_numpy_to_tensor(
                    np.expand_dims(path['last_hidden'], axis=0)).type(FloatTensor)
                last_state_value = policy_net.forward(last_obs, last_hidden)[-1].cpu().numpy()[0]
                last_state_value = last_state_value * value_std + value_mean
        else:
            last_state_value = 0.
        # we need to clip last_state_value by (max_abs_value / (1 - gamma)).
        # Otherwise, a large state value would create a positive feedback loop and make the reward explode.
        max_abs_value = np.max(np.abs(path['reward']))
        last_state_value = np.clip(last_state_value,
                                   a_min=-max_abs_value / (1 - gamma),
                                   a_max=max_abs_value / (1 - gamma))
        # calculate reward-to-go
        path['reward'].append(last_state_value)
        current_rewards = discount(path['reward'], gamma).astype(np.float32)
        rewards.append(current_rewards[:-1])
        # compute gae
        with torch.no_grad():
            observation = path['observation']
            hidden = path['hidden']
            data_loader = create_data_loader((observation, hidden), batch_size=32,
                                             shuffle=False, drop_last=False)
            values = []
            for obs, hid in data_loader:
                obs = move_tensor_to_gpu(obs)
                hid = move_tensor_to_gpu(hid)
                values.append(policy_net.forward(obs, hid)[-1])
            values = torch.cat(values, dim=0).cpu().numpy()
            values = values * value_std + value_mean
            # add the value of the last obs for truncated trajectories
            values = np.append(values, last_state_value)
            temporal_difference = path['reward'][:-1] + values[1:] * gamma - values[:-1]
            # GAE: discounted sum of TD errors with factor gamma * lam
            gae = discount(temporal_difference, gamma * lam).astype(np.float32)
            gaes.append(gae)
    rewards = np.concatenate(rewards)
    new_values_mean, new_values_std = np.mean(rewards), np.std(rewards)
    rewards = (rewards - new_values_mean) / (new_values_std + eps)
    gaes = np.concatenate(gaes)
    gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + eps)
    return rewards, gaes, new_values_mean, new_values_std
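# `discount` computes the discounted cumulative sum used above for both the reward-to-go targets
# and the GAE over TD errors. A minimal sketch of the standard right-to-left recursion follows;
# the project's version may be implemented differently (e.g. with scipy.signal.lfilter), so treat
# this as an assumption.
import numpy as np


def discount(rewards, gamma):
    # discounted cumulative sum: out[t] = rewards[t] + gamma * out[t + 1]
    out = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out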
def update_policy(self, dataset, epoch=4):
    # construct a dataset using paths containing (action, observation, old_log_prob)
    if self.recurrent:
        data_loader = create_data_loader(dataset, batch_size=128, shuffle=False, drop_last=False)
    else:
        data_loader = create_data_loader(dataset, batch_size=128, shuffle=True, drop_last=False)
    for epoch_index in range(epoch):
        current_hidden = torch.tensor(np.expand_dims(self.init_hidden_unit, axis=0),
                                      requires_grad=False).type(FloatTensor)
        for batch_sample in data_loader:
            action, advantage, observation, discount_rewards, old_log_prob, mask = \
                move_tensor_to_gpu(batch_sample)
            self.policy_optimizer.zero_grad()
            # update policy
            if not self.recurrent:
                distribution, _, raw_baselines = self.policy_net.forward(observation, None)
                entropy_loss = distribution.entropy().mean()
                log_prob = distribution.log_prob(action)
            else:
                entropy_loss = []
                log_prob = []
                raw_baselines = []
                # episode boundaries: the hidden state is reset wherever mask == 0
                zero_index = np.where(mask.cpu().numpy() == 0)[0] + 1
                zero_index = zero_index.tolist()
                zero_index.insert(0, 0)
                for i in range(len(zero_index) - 1):
                    start_index = zero_index[i]
                    end_index = zero_index[i + 1]
                    current_obs = observation[start_index:end_index]
                    current_actions = action[start_index:end_index]
                    current_dist, _, current_baseline = self.policy_net.forward(
                        current_obs, current_hidden)
                    current_hidden = torch.tensor(
                        np.expand_dims(self.init_hidden_unit, axis=0),
                        requires_grad=False).type(FloatTensor)
                    current_log_prob = current_dist.log_prob(current_actions)
                    log_prob.append(current_log_prob)
                    raw_baselines.append(current_baseline)
                    entropy_loss.append(current_dist.entropy())
                # last (possibly truncated) segment of the batch
                start_index = zero_index[-1]
                if start_index < observation.shape[0]:
                    current_obs = observation[start_index:]
                    current_actions = action[start_index:]
                    current_dist, current_hidden, current_baseline = self.policy_net.forward(
                        current_obs, current_hidden)
                    current_log_prob = current_dist.log_prob(current_actions)
                    log_prob.append(current_log_prob)
                    raw_baselines.append(current_baseline)
                    entropy_loss.append(current_dist.entropy())
                current_hidden = current_hidden.detach()
                log_prob = torch.cat(log_prob, dim=0)
                raw_baselines = torch.cat(raw_baselines, dim=0)
                entropy_loss = torch.cat(entropy_loss, dim=0).mean()
            assert log_prob.shape == advantage.shape, 'log_prob length {}, advantage length {}'.format(
                log_prob.shape, advantage.shape)
            # if the approximate kl is larger than 1.5 * target_kl, we early stop training on this batch
            negative_approx_kl = log_prob - old_log_prob
            negative_approx_kl_mean = torch.mean(-negative_approx_kl).item()
            if negative_approx_kl_mean > 1.5 * self.target_kl:
                # print('Early stopping this iteration. Current kl {:.4f}. Current epoch index {}'.format(
                #     negative_approx_kl_mean, epoch_index))
                continue
            ratio = torch.exp(negative_approx_kl)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantage
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = self.get_baseline_loss(raw_baselines, discount_rewards)
            loss = policy_loss - entropy_loss * self.entropy_coef + self.value_coef * value_loss
            # gradients must exist before clipping, so backward() comes before clip_grad_norm_
            loss.backward()
            nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.max_grad_norm)
            self.policy_optimizer.step()