def random_iterator(self, batch_size, train_val_split_ratio=0.2):
    history_states = np.array(self.history_states)
    history_actions = np.array(self.history_actions)
    states = np.array(self.states)
    actions = np.array(self.actions)
    input_tuple = (history_states, history_actions, states, actions)
    output_tuple = train_test_split(*input_tuple, test_size=train_val_split_ratio)
    train_tuple = output_tuple[0::2]
    val_tuple = output_tuple[1::2]
    # in training, we drop the last batch to avoid a batch of size 1 that may crash the batch_norm layer.
    train_data_loader = create_data_loader(train_tuple, batch_size=batch_size,
                                           shuffle=True, drop_last=True)
    val_data_loader = create_data_loader(val_tuple, batch_size=batch_size,
                                         shuffle=False, drop_last=False)
    return train_data_loader, val_data_loader
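# The create_data_loader helper used throughout these snippets is defined elsewhere in the
# project. As a minimal sketch, assuming it simply wraps a tuple of numpy arrays in a
# TensorDataset and returns a torch DataLoader with the given options (an assumption for
# illustration, not the project's actual implementation):
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset


def create_data_loader(numpy_tuple, batch_size=32, shuffle=True, drop_last=False):
    # convert each array to a float tensor and zip them into a single indexable dataset
    tensors = [torch.as_tensor(np.asarray(array), dtype=torch.float32) for array in numpy_tuple]
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)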
def random_iterator(self, batch_size, train_val_split_ratio=0.2, window_length=None):
    """Create an iterator over (s, a, s', r, d) tuples.

    Args:
        batch_size: batch size
        train_val_split_ratio: fraction of the data held out for validation
        window_length: length of the history window passed to the transition builder

    Returns:
        train and validation data loaders
    """
    input_tuple = self._create_state_action_next_state(window_length=window_length)
    output_tuple = train_test_split(*input_tuple, test_size=train_val_split_ratio)
    train_tuple = output_tuple[0::2]
    val_tuple = output_tuple[1::2]
    train_data_loader = create_data_loader(train_tuple, batch_size=batch_size,
                                           shuffle=True, drop_last=False)
    val_data_loader = create_data_loader(val_tuple, batch_size=batch_size,
                                         shuffle=False, drop_last=False)
    return train_data_loader, val_data_loader
def random_iterator(self, batch_size, train_val_split_ratio=0.2): """Create an iterator for the whole transitions Returns: """ input_tuple = (self._obs_storage, self._action_storage, self._next_obs_storage, self._reward_storage, self._done_storage) output_tuple = train_test_split(*input_tuple, test_size=train_val_split_ratio) train_tuple = output_tuple[0::2] val_tuple = output_tuple[1::2] train_data_loader = create_data_loader(train_tuple, batch_size=batch_size, shuffle=True, drop_last=False) val_data_loader = create_data_loader(val_tuple, batch_size=batch_size, shuffle=False, drop_last=False) return train_data_loader, val_data_loader
def predict(self, x, batch_size, verbose=False):
    self.model.eval()
    if not isinstance(x, tuple):
        x = (x,)
    data_loader = create_data_loader(x, batch_size=batch_size, shuffle=False, drop_last=False)
    if verbose:
        data_loader = tqdm(data_loader, desc='Predicting')
    outputs = []
    with torch.no_grad():
        for data in data_loader:
            current_outputs = self.model.forward(*data)
            if not isinstance(current_outputs, tuple):
                current_outputs = [current_outputs]
            if len(outputs) == 0:
                for current_output in current_outputs:
                    outputs.append([current_output])
            else:
                for i, current_output in enumerate(current_outputs):
                    outputs[i].append(current_output)
    for i, output in enumerate(outputs):
        outputs[i] = torch.cat(output, dim=0).cpu().numpy()
    return outputs
def random_iterator(self, batch_size, train_val_split_ratio=0.2):
    states = []
    actions = []
    next_states = []
    rewards = []
    dones = []
    for trajectory in self.memory:
        # build sliding windows of length window_length over each trajectory
        for i in range(self.window_length, trajectory.state.shape[0]):
            states.append(trajectory.state[i - self.window_length:i])
            next_states.append(trajectory.state[i])
            actions.append(trajectory.action[i - self.window_length:i])
        rewards.append(trajectory.reward[self.window_length - 1:])
        done = [False] * (trajectory.action.shape[0] - self.window_length + 1)
        done[-1] = True
        dones.append(np.array(done))
    states = np.stack(states, axis=0)
    actions = np.stack(actions, axis=0)
    next_states = np.stack(next_states, axis=0)
    rewards = np.concatenate(rewards, axis=0)
    dones = np.concatenate(dones, axis=0)
    input_tuple = (states, actions, next_states, rewards, dones)
    output_tuple = train_test_split(*input_tuple, test_size=train_val_split_ratio)
    train_tuple = output_tuple[0::2]
    val_tuple = output_tuple[1::2]
    train_data_loader = create_data_loader(train_tuple, batch_size=batch_size,
                                           shuffle=True, drop_last=False)
    # validation data does not need to be shuffled
    val_data_loader = create_data_loader(val_tuple, batch_size=batch_size,
                                         shuffle=False, drop_last=False)
    return train_data_loader, val_data_loader
def random_iterator(self, batch_size, train_val_split_ratio=0.2):
    states = []
    actions = []
    rewards = []
    next_states = []
    dones = []
    for trajectory in self.memory:
        states.append(trajectory.state[:-1])
        actions.append(trajectory.action)
        next_states.append(trajectory.state[1:])
        rewards.append(trajectory.reward)
        done = [False] * trajectory.action.shape[0]
        done[-1] = True
        dones.append(np.array(done))
    states = np.concatenate(states, axis=0)
    actions = np.concatenate(actions, axis=0)
    next_states = np.concatenate(next_states, axis=0)
    rewards = np.concatenate(rewards, axis=0)
    dones = np.concatenate(dones, axis=0)
    input_tuple = (states, actions, next_states, rewards, dones)
    output_tuple = train_test_split(*input_tuple, test_size=train_val_split_ratio)
    train_tuple = output_tuple[0::2]
    val_tuple = output_tuple[1::2]
    # in training, we drop the last batch to avoid a batch of size 1 that may crash the batch_norm layer.
    train_data_loader = create_data_loader(train_tuple, batch_size=batch_size,
                                           shuffle=True, drop_last=True)
    val_data_loader = create_data_loader(val_tuple, batch_size=batch_size,
                                         shuffle=False, drop_last=False)
    return train_data_loader, val_data_loader
def predict_log_prob_batch(self, state, action):
    data_loader = create_data_loader((state, action), batch_size=32,
                                     shuffle=False, drop_last=False)
    log_probs = []
    for obs, action in data_loader:
        obs = move_tensor_to_gpu(obs)
        action = move_tensor_to_gpu(action)
        action_distribution = self.policy_net.forward_action(obs)
        log_probs.append(action_distribution.log_prob(action))
    log_probs = torch.cat(log_probs, dim=0).cpu().numpy()
    return log_probs
def compute_old_log_prob(self, observation, hidden, actions):
    with torch.no_grad():
        data_loader = create_data_loader((observation, hidden, actions), batch_size=32,
                                         shuffle=False, drop_last=False)
        old_log_prob = []
        for obs, hid, ac in data_loader:
            obs = move_tensor_to_gpu(obs)
            hid = move_tensor_to_gpu(hid)
            ac = move_tensor_to_gpu(ac)
            old_distribution, _, _ = self.policy_net.forward(obs, hid)
            old_log_prob.append(old_distribution.log_prob(ac))
        old_log_prob = torch.cat(old_log_prob, dim=0).cpu()
    return old_log_prob
def predict_state_value_batch(self, state):
    """Compute the state value using the nn baseline.

    Args:
        state: (batch_size, ob_dim)

    Returns:
        (batch_size,)
    """
    data_loader = create_data_loader((state,), batch_size=32, shuffle=False, drop_last=False)
    values = []
    for obs in data_loader:
        obs = move_tensor_to_gpu(obs[0])
        values.append(self.policy_net.forward_value(obs))
    values = torch.cat(values, dim=0).cpu().numpy()
    return values
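# The helpers convert_numpy_to_tensor and move_tensor_to_gpu used in the batched prediction
# methods above are assumed to convert numpy arrays to tensors and to move tensors onto the
# GPU when one is available. A hedged sketch under that assumption, not the project's actual
# implementation:
import numpy as np
import torch


def convert_numpy_to_tensor(array):
    # assumed helper: numpy array -> torch tensor
    return torch.from_numpy(np.asarray(array))


def move_tensor_to_gpu(tensor):
    # assumed helper: move a tensor, or a list/tuple of tensors, to the GPU if present
    if isinstance(tensor, (list, tuple)):
        return [move_tensor_to_gpu(item) for item in tensor]
    return tensor.cuda() if torch.cuda.is_available() else tensor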
def random_iterator(self, batch_size): """Create an iterator of all the dataset and update value mean and std Args: batch_size: Returns: """ states = np.concatenate( [trajectory.state for trajectory in self.memory], axis=0) actions = np.concatenate( [trajectory.action for trajectory in self.memory], axis=0) reward_to_go = np.concatenate( [trajectory.reward_to_go for trajectory in self.memory], axis=0) gaes = np.concatenate( [trajectory.advantage for trajectory in self.memory], axis=0) old_log_prob = np.concatenate( [trajectory.old_log_prob for trajectory in self.memory], axis=0) value_mean, value_std = np.mean(reward_to_go), np.std(reward_to_go) reward_to_go = normalize(reward_to_go, value_mean, value_std) self.running_value_mean = self.running_value_mean * self.alpha + value_mean * ( 1 - self.alpha) self.running_value_std = self.running_value_std * self.alpha + value_std * ( 1 - self.alpha) gaes = normalize(gaes, np.mean(gaes), np.std(gaes)) batch_size = min(batch_size, states.shape[0]) data_loader = create_data_loader( (states, actions, reward_to_go, gaes, old_log_prob), batch_size=batch_size, shuffle=True, drop_last=True) return data_loader
def forward(self, input):
    batch_size = input.shape[0]
    mean = self.model.forward(input)
    dis = torch.distributions.Normal(mean, torch.exp(self.logstd))
    return dis.rsample(torch.Size([batch_size]))


if __name__ == '__main__':
    x_mean = [0., 1.5]
    x_std = [1, 0]
    y_mean = [-1.5, -0.2]
    y_std = [0.5, 0.1]
    x_train, y_train, x_val, y_val = generate_training_data(x_mean, x_std, y_mean, y_std)
    print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)
    model = Policy(2, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()
    regressor = Regressor(model, optimizer, criterion, scheduler=None)
    train_loader = create_data_loader((x_train, y_train))
    val_loader = create_data_loader((x_val, y_val))
    regressor.train(epoch=100, train_data_loader=train_loader, val_data_loader=val_loader,
                    checkpoint_path=None)
def compute_reward_to_go_gae(paths, gamma, policy_net, lam, value_mean, value_std):
    rewards = []
    gaes = []
    for path in paths:
        # compute the value of the last state
        if path['mask'][-1] == 1:
            with torch.no_grad():
                last_obs = convert_numpy_to_tensor(
                    np.expand_dims(path['last_obs'], axis=0)).type(FloatTensor)
                last_hidden = convert_numpy_to_tensor(
                    np.expand_dims(path['last_hidden'], axis=0)).type(FloatTensor)
                last_state_value = policy_net.forward(last_obs, last_hidden)[-1].cpu().numpy()[0]
                last_state_value = last_state_value * value_std + value_mean
        else:
            last_state_value = 0.
        # we need to clip last_state_value by (max_abs_value / (1 - gamma)).
        # Otherwise, a large state value would create a positive feedback loop and make the reward explode.
        max_abs_value = np.max(np.abs(path['reward']))
        last_state_value = np.clip(last_state_value,
                                   a_min=-max_abs_value / (1 - gamma),
                                   a_max=max_abs_value / (1 - gamma))
        # calculate reward-to-go
        path['reward'].append(last_state_value)
        current_rewards = discount(path['reward'], gamma).astype(np.float32)
        rewards.append(current_rewards[:-1])
        # compute gae
        with torch.no_grad():
            observation = path['observation']
            hidden = path['hidden']
            data_loader = create_data_loader((observation, hidden), batch_size=32,
                                             shuffle=False, drop_last=False)
            values = []
            for obs, hid in data_loader:
                obs = move_tensor_to_gpu(obs)
                hid = move_tensor_to_gpu(hid)
                values.append(policy_net.forward(obs, hid)[-1])
            values = torch.cat(values, dim=0).cpu().numpy()
            values = values * value_std + value_mean
            # add the value of the last obs for truncated trajectories
            values = np.append(values, last_state_value)
            temporal_difference = path['reward'][:-1] + values[1:] * gamma - values[:-1]
            # calculate the generalized advantage estimate
            gae = discount(temporal_difference, gamma * lam).astype(np.float32)
            gaes.append(gae)
    rewards = np.concatenate(rewards)
    new_values_mean, new_values_std = np.mean(rewards), np.std(rewards)
    rewards = (rewards - new_values_mean) / (new_values_std + eps)
    gaes = np.concatenate(gaes)
    gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + eps)
    return rewards, gaes, new_values_mean, new_values_std
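# The discount helper used in compute_reward_to_go_gae computes, for every index t, the
# discounted cumulative sum sum_{t' >= t} gamma^(t' - t) * x_{t'}. A common way to do this is
# a single lfilter pass over the reversed sequence; this is an assumed sketch of the helper,
# not necessarily the original implementation.
import numpy as np
import scipy.signal


def discount(x, gamma):
    # discounted cumulative sums, computed right-to-left in one linear-filter pass
    return scipy.signal.lfilter([1], [1, -gamma], np.asarray(x, dtype=np.float64)[::-1])[::-1]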
def update_policy(self, dataset, epoch=4):
    # construct a dataset using paths containing (action, observation, old_log_prob)
    if self.recurrent:
        data_loader = create_data_loader(dataset, batch_size=128, shuffle=False, drop_last=False)
    else:
        data_loader = create_data_loader(dataset, batch_size=128, shuffle=True, drop_last=False)
    for epoch_index in range(epoch):
        current_hidden = torch.tensor(np.expand_dims(self.init_hidden_unit, axis=0),
                                      requires_grad=False).type(FloatTensor)
        for batch_sample in data_loader:
            action, advantage, observation, discount_rewards, old_log_prob, mask = \
                move_tensor_to_gpu(batch_sample)
            self.policy_optimizer.zero_grad()
            # update policy
            if not self.recurrent:
                distribution, _, raw_baselines = self.policy_net.forward(observation, None)
                entropy_loss = distribution.entropy().mean()
                log_prob = distribution.log_prob(action)
            else:
                entropy_loss = []
                log_prob = []
                raw_baselines = []
                # split the batch at episode boundaries (mask == 0) and reset the hidden state
                zero_index = np.where(mask == 0)[0] + 1
                zero_index = zero_index.tolist()
                zero_index.insert(0, 0)
                for i in range(len(zero_index) - 1):
                    start_index = zero_index[i]
                    end_index = zero_index[i + 1]
                    current_obs = observation[start_index:end_index]
                    current_actions = action[start_index:end_index]
                    current_dist, _, current_baseline = self.policy_net.forward(
                        current_obs, current_hidden)
                    current_hidden = torch.tensor(np.expand_dims(self.init_hidden_unit, axis=0),
                                                  requires_grad=False).type(FloatTensor)
                    current_log_prob = current_dist.log_prob(current_actions)
                    log_prob.append(current_log_prob)
                    raw_baselines.append(current_baseline)
                    entropy_loss.append(current_dist.entropy())
                # last segment
                start_index = zero_index[-1]
                if start_index < observation.shape[0]:
                    current_obs = observation[start_index:]
                    current_actions = action[start_index:]
                    current_dist, current_hidden, current_baseline = self.policy_net.forward(
                        current_obs, current_hidden)
                    current_log_prob = current_dist.log_prob(current_actions)
                    log_prob.append(current_log_prob)
                    raw_baselines.append(current_baseline)
                    entropy_loss.append(current_dist.entropy())
                    current_hidden = current_hidden.detach()
                log_prob = torch.cat(log_prob, dim=0)
                raw_baselines = torch.cat(raw_baselines, dim=0)
                entropy_loss = torch.cat(entropy_loss, dim=0).mean()

            assert log_prob.shape == advantage.shape, 'log_prob length {}, advantage length {}'.format(
                log_prob.shape, advantage.shape)

            # if the approximated kl is larger than 1.5 * target_kl, we early stop training on this batch
            negative_approx_kl = log_prob - old_log_prob
            negative_approx_kl_mean = torch.mean(-negative_approx_kl).item()
            if negative_approx_kl_mean > 1.5 * self.target_kl:
                # print('Early stopping this iteration. Current kl {:.4f}. Current epoch index {}'.format(
                #     negative_approx_kl_mean, epoch_index))
                continue

            # PPO clipped surrogate objective plus value and entropy terms
            ratio = torch.exp(negative_approx_kl)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantage
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = self.get_baseline_loss(raw_baselines, discount_rewards)
            loss = policy_loss - entropy_loss * self.entropy_coef + self.value_coef * value_loss
            # clip gradients after backward; clipping before backward has no effect
            loss.backward()
            nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.max_grad_norm)
            self.policy_optimizer.step()
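# get_baseline_loss is not shown above; it is assumed to be a simple regression loss between
# the predicted baselines and the normalized discounted rewards. A speculative sketch under
# that assumption:
import torch.nn.functional as F


def get_baseline_loss(raw_baselines, discount_rewards):
    # assumed helper: mean squared error between value predictions and reward-to-go targets
    return F.mse_loss(raw_baselines, discount_rewards)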