def predict_normalized_delta_next_state_reward(self, states, actions):
    states_normalized = normalize(states, self.state_mean, self.state_std)
    if not self.dynamics_model.discrete:
        actions = normalize(actions, self.action_mean, self.action_std)
    predicted_delta_state_normalized, predicted_reward_normalized = self.dynamics_model.forward(
        states_normalized, actions)
    return predicted_delta_state_normalized, predicted_reward_normalized
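# A minimal sketch of the `normalize`/`unnormalize` helpers assumed throughout this
# section: plain z-score scaling with a small epsilon for numerical stability. The
# repo's actual helpers may differ (e.g. epsilon value, dtype, broadcasting rules).
def normalize_sketch(x, mean, std, eps=1e-8):
    # scale to zero mean and unit variance
    return (x - mean) / (std + eps)

def unnormalize_sketch(x_normalized, mean, std, eps=1e-8):
    # invert the scaling above
    return x_normalized * (std + eps) + mean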
def fit_dynamic_model(self, dataset: Dataset, epoch=10, batch_size=128, verbose=False):
    t = range(epoch)
    if verbose:
        t = tqdm(t)
    train_data_loader, val_data_loader = dataset.random_iterator(batch_size=batch_size)
    for i in t:
        losses = []
        for states, actions, next_states, _, _ in train_data_loader:
            # convert to tensor
            states = move_tensor_to_gpu(states)
            actions = move_tensor_to_gpu(actions)
            next_states = move_tensor_to_gpu(next_states)
            delta_states = next_states - states
            # calculate loss
            self.optimizer.zero_grad()
            predicted_delta_state_normalized = self.predict_normalized_delta_next_state(
                states, actions)
            delta_states_normalized = normalize(delta_states, self.delta_state_mean,
                                                self.delta_state_std)
            loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())
        self.eval()
        val_losses = []
        with torch.no_grad():
            for states, actions, next_states, _, _ in val_data_loader:
                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                delta_states = next_states - states
                predicted_delta_state_normalized = self.predict_normalized_delta_next_state(
                    states, actions)
                delta_states_normalized = normalize(
                    delta_states, self.delta_state_mean, self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
                val_losses.append(loss.item())
        self.train()
        if verbose:
            t.set_description(
                'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'.format(
                    i + 1, epoch, np.mean(losses), np.mean(val_losses)))
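# A minimal sketch of the `move_tensor_to_gpu` helper used in the training loops above,
# assuming it converts numpy arrays to float tensors and moves them to CUDA when
# available. The repo's real helper may handle dtypes and devices differently.
import numpy as np
import torch

def move_tensor_to_gpu_sketch(x):
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    x = x.float()
    if torch.cuda.is_available():
        x = x.cuda()
    return x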
def predict_next_states(self, states, actions):
    assert self.state_mean is not None, 'Please set statistics before running inference.'
    states_normalized = normalize(states, self.state_mean, self.state_std)
    if not self.dynamics_model.discrete:
        actions = normalize(actions, self.action_mean, self.action_std)
    predicted_delta_state_normalized = self.dynamics_model.forward(
        states_normalized, actions)
    predicted_delta_state = unnormalize(predicted_delta_state_normalized,
                                        self.delta_state_mean, self.delta_state_std)
    # the model predicts a normalized state delta; add it back onto the raw state
    return states + predicted_delta_state
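# Illustration only: a multi-step rollout built on predict_next_states, e.g. for
# model-based planning. `model` and `action_sequence` are names assumed for this
# sketch, not part of the repo; `action_sequence` iterates over the planning horizon.
import torch

def rollout_sketch(model, initial_states, action_sequence):
    states = initial_states
    trajectory = [states]
    with torch.no_grad():
        for actions in action_sequence:
            states = model.predict_next_states(states, actions)
            trajectory.append(states)
    return trajectory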
def predict_next_states(self, states, actions, z=None):
    assert self.state_mean is not None, 'Please set statistics before running inference.'
    states = normalize(states, self.state_mean, self.state_std)
    if not self.dynamics_model.discrete:
        actions = normalize(actions, self.action_mean, self.action_std)
    if z is None:
        # sample a latent code per batch row when none is provided
        z = self._sample_latent_code(states.shape[0])
    predicted_states_normalized = self.dynamics_model.forward(states, actions, z)
    predicted_states = unnormalize(predicted_states_normalized,
                                   self.state_mean, self.state_std)
    return predicted_states
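# A hedged sketch of what `self._sample_latent_code` might do: draw one latent vector
# per batch row from a standard normal. `code_dim` is an assumed parameter name, and
# the actual latent distribution (e.g. categorical) and device handling may differ.
import torch

def sample_latent_code_sketch(batch_size, code_dim):
    z = torch.randn(batch_size, code_dim)
    if torch.cuda.is_available():
        z = z.cuda()
    return z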
def random_iterator(self, batch_size):
    """Create an iterator over the whole dataset and update the running value mean and std.

    Args:
        batch_size: mini-batch size for the returned data loader.

    Returns:
        A shuffled data loader over (states, actions, reward_to_go, gaes, old_log_prob).
    """
    states = np.concatenate(
        [trajectory.state for trajectory in self.memory], axis=0)
    actions = np.concatenate(
        [trajectory.action for trajectory in self.memory], axis=0)
    reward_to_go = np.concatenate(
        [trajectory.reward_to_go for trajectory in self.memory], axis=0)
    gaes = np.concatenate(
        [trajectory.advantage for trajectory in self.memory], axis=0)
    old_log_prob = np.concatenate(
        [trajectory.old_log_prob for trajectory in self.memory], axis=0)
    # normalize reward-to-go targets and keep exponentially weighted running statistics
    value_mean, value_std = np.mean(reward_to_go), np.std(reward_to_go)
    reward_to_go = normalize(reward_to_go, value_mean, value_std)
    self.running_value_mean = self.running_value_mean * self.alpha + value_mean * (1 - self.alpha)
    self.running_value_std = self.running_value_std * self.alpha + value_std * (1 - self.alpha)
    # standardize advantages across the batch
    gaes = normalize(gaes, np.mean(gaes), np.std(gaes))
    batch_size = min(batch_size, states.shape[0])
    data_loader = create_data_loader(
        (states, actions, reward_to_go, gaes, old_log_prob),
        batch_size=batch_size, shuffle=True, drop_last=True)
    return data_loader
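# A minimal sketch of the `create_data_loader` helper assumed above, built on
# torch.utils.data. The repo's own helper may handle dtypes (e.g. integer actions)
# or device placement differently; this is only the basic shape of the idea.
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

def create_data_loader_sketch(arrays, batch_size, shuffle=True, drop_last=True):
    tensors = [torch.as_tensor(np.asarray(a), dtype=torch.float32) for a in arrays]
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)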
def predict_next_states(self, states, actions):
    """Predict the next observation from a window of past states and actions.

    Args:
        states: (batch_size, window_length, 6)
        actions: (batch_size, window_length, 4)

    Returns:
        next obs of shape (batch_size, 6)
    """
    assert self.state_mean is not None, 'Please set statistics before running inference.'
    states_normalized = normalize(states, self.state_mean, self.state_std)
    if not self.dynamics_model.discrete:
        actions = normalize(actions, self.action_mean, self.action_std)
    predicted_delta_state_normalized = self.dynamics_model.forward(
        states_normalized, actions)
    predicted_delta_state = unnormalize(predicted_delta_state_normalized,
                                        self.delta_state_mean, self.delta_state_std)
    # apply the predicted delta to the most recent state in the window
    return states[:, -1, :] + predicted_delta_state
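# Illustration only: feeding a random window of observations and actions through the
# windowed predictor above. Shapes follow its docstring; `model` is assumed to be an
# instance of the class that owns predict_next_states, so this is not runnable on its own.
import torch

def windowed_prediction_example(model, batch_size=32, window_length=5):
    states = torch.randn(batch_size, window_length, 6)
    actions = torch.randn(batch_size, window_length, 4)
    next_states = model.predict_next_states(states, actions)  # expected shape: (batch_size, 6)
    return next_states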
def fit_dynamic_model(self, dataset, epoch=10, batch_size=128, logger=None):
    t = tqdm(range(epoch))
    train_data_loader, val_data_loader = dataset.random_iterator(batch_size=batch_size)
    for i in t:
        losses = []
        for states, actions, next_states, rewards, _ in train_data_loader:
            # skip batches of size 1, which can crash batch_norm layers in training mode
            if states.shape[0] == 1:
                continue
            # convert to tensor
            states = move_tensor_to_gpu(states)
            actions = move_tensor_to_gpu(actions)
            next_states = move_tensor_to_gpu(next_states)
            rewards = move_tensor_to_gpu(rewards)
            delta_states = next_states - states
            # calculate loss
            self.optimizer.zero_grad()
            predicted_delta_state_normalized, predicted_reward_normalized = \
                self.predict_normalized_delta_next_state_reward(states, actions)
            delta_states_normalized = normalize(delta_states, self.delta_state_mean,
                                                self.delta_state_std)
            loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
            if self.cost_fn_batch is None:
                # no hand-coded cost function available: also learn to predict rewards
                rewards_normalized = normalize(rewards, self.reward_mean, self.reward_std)
                loss += F.mse_loss(predicted_reward_normalized, rewards_normalized)
            loss.backward()
            self.optimizer.step()
            losses.append(loss.item())
        self.eval()
        val_losses = []
        with torch.no_grad():
            for states, actions, next_states, rewards, _ in val_data_loader:
                # convert to tensor
                states = move_tensor_to_gpu(states)
                actions = move_tensor_to_gpu(actions)
                next_states = move_tensor_to_gpu(next_states)
                rewards = move_tensor_to_gpu(rewards)
                delta_states = next_states - states
                predicted_delta_state_normalized, predicted_reward_normalized = \
                    self.predict_normalized_delta_next_state_reward(states, actions)
                delta_states_normalized = normalize(
                    delta_states, self.delta_state_mean, self.delta_state_std)
                loss = F.mse_loss(predicted_delta_state_normalized, delta_states_normalized)
                if self.cost_fn_batch is None:
                    rewards_normalized = normalize(rewards, self.reward_mean, self.reward_std)
                    loss += F.mse_loss(predicted_reward_normalized, rewards_normalized)
                val_losses.append(loss.item())
        self.train()
        if logger:
            logger.store(ModelTrainLoss=np.mean(losses))
            logger.store(ModelValLoss=np.mean(val_losses))
        t.set_description(
            'Epoch {}/{} - Avg model train loss: {:.4f} - Avg model val loss: {:.4f}'.format(
                i + 1, epoch, np.mean(losses), np.mean(val_losses)))
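# A hedged stand-in for the `logger` object passed above: all the training loop needs
# is a `store(**kwargs)` method that accumulates scalar diagnostics. The repo's real
# logger is assumed to be richer (epoch dumping, file output, etc.).
from collections import defaultdict

class MinimalLoggerSketch:
    def __init__(self):
        self.buffer = defaultdict(list)

    def store(self, **kwargs):
        # append each scalar diagnostic under its key, e.g. ModelTrainLoss
        for key, value in kwargs.items():
            self.buffer[key].append(value)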