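# Shared imports for the snippets below. FLOAT and device are assumed to be
# provided by the project's utility module (e.g. FLOAT = torch.FloatTensor and
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")); they
# are not defined here.
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset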
def step(self, action):
    with torch.no_grad():
        # roll the learned dynamics model forward one step to predict the next
        # state; move to CPU before converting to numpy
        self.state = self.model.get_next_state(
            FLOAT(self.state).to(device).unsqueeze(0),
            FLOAT(action).to(device).unsqueeze(0)).cpu().numpy()[0]
    self.cur_step += 1
    done = (self.cur_step >= self.max_step)
    reward = self._calc_reward()
    return self.state, reward, done, {}
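# Context (hypothetical usage): driving this model-based environment with an
# agent; `env`, `agent`, and the noise_scale value are placeholders and not
# part of the original code.
state = env.reset()
episode_reward, done = 0.0, False
while not done:
    action = agent.choose_action(state, noise_scale=0.1)
    state, reward, done, _ = env.step(action)
    episode_reward += reward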
def update(self, batch):
    """update the DDPG actor/critic networks from a sampled batch"""
    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_next_state = FLOAT(batch.next_state).to(device)
    batch_mask = FLOAT(batch.mask).to(device)

    # update by DDPG
    ddpg_step(self.policy_net, self.policy_net_target, self.value_net,
              self.value_net_target, self.optimizer_p, self.optimizer_v,
              batch_state, batch_action, batch_reward, batch_next_state,
              batch_mask, self.gamma, self.polyak)
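# For reference, a minimal sketch of what a standard DDPG update of this form
# computes; ddpg_step_sketch is a hypothetical name, the critic is assumed to
# take (state, action), and the project's ddpg_step may differ in details
# (gradient clipping, reward scaling, etc.).
import torch
import torch.nn.functional as F

def ddpg_step_sketch(policy_net, policy_net_target, value_net, value_net_target,
                     optimizer_p, optimizer_v, states, actions, rewards,
                     next_states, masks, gamma, polyak):
    # critic target: r + gamma * mask * Q_target(s', pi_target(s'))
    with torch.no_grad():
        next_actions = policy_net_target(next_states)
        target_q = rewards + gamma * masks * value_net_target(next_states, next_actions)

    # critic update: regress Q(s, a) toward the bootstrapped target
    value_loss = F.mse_loss(value_net(states, actions), target_q)
    optimizer_v.zero_grad()
    value_loss.backward()
    optimizer_v.step()

    # actor update: maximize Q(s, pi(s))
    policy_loss = -value_net(states, policy_net(states)).mean()
    optimizer_p.zero_grad()
    policy_loss.backward()
    optimizer_p.step()

    # polyak averaging: keep a `polyak` fraction of the old target weights
    with torch.no_grad():
        for target, source in ((policy_net_target, policy_net),
                               (value_net_target, value_net)):
            for p_t, p in zip(target.parameters(), source.parameters()):
                p_t.data.mul_(polyak).add_((1.0 - polyak) * p.data)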
def update(self, batch, k_iter):
    """update the SAC networks (with automatic temperature) from a sampled batch"""
    batch_state = FLOAT(batch.state).to(device)
    batch_action = FLOAT(batch.action).to(device)
    batch_reward = FLOAT(batch.reward).to(device)
    batch_next_state = FLOAT(batch.next_state).to(device)
    batch_mask = FLOAT(batch.mask).to(device)

    # update by SAC Alpha
    sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha,
                   self.q_net_target_1, self.q_net_target_2,
                   self.optimizer_p, self.optimizer_q_1, self.optimizer_q_2,
                   self.optimizer_a, batch_state, batch_action, batch_reward,
                   batch_next_state, batch_mask, self.gamma, self.polyak,
                   self.target_entropy,
                   k_iter % self.target_update_delay == 0)
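# For reference, a minimal sketch of the loss terms a SAC update with automatic
# temperature typically computes; sac_alpha_losses_sketch and the log_alpha
# parameterization are assumptions, the twin critics are assumed to take
# (state, action), and the project's sac_alpha_step may differ in details.
import torch
import torch.nn.functional as F

def sac_alpha_losses_sketch(policy_net, q1, q2, q1_target, q2_target, log_alpha,
                            states, actions, rewards, next_states, masks,
                            gamma, target_entropy):
    alpha = log_alpha.exp()

    # soft Bellman target using the minimum of the twin target critics
    with torch.no_grad():
        next_actions, next_log_probs = policy_net.get_action_log_prob(next_states)
        target_q = torch.min(q1_target(next_states, next_actions),
                             q2_target(next_states, next_actions))
        y = rewards + gamma * masks * (target_q - alpha * next_log_probs)

    # twin critic losses
    q1_loss = F.mse_loss(q1(states, actions), y)
    q2_loss = F.mse_loss(q2(states, actions), y)

    # policy loss on freshly sampled actions
    new_actions, log_probs = policy_net.get_action_log_prob(states)
    q_new = torch.min(q1(states, new_actions), q2(states, new_actions))
    policy_loss = (alpha.detach() * log_probs - q_new).mean()

    # temperature loss for automatic entropy tuning
    alpha_loss = -(log_alpha * (log_probs + target_entropy).detach()).mean()
    return q1_loss, q2_loss, policy_loss, alpha_loss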
def choose_action(self, state):
    """select action"""
    state = FLOAT(state).unsqueeze(0).to(device)
    with torch.no_grad():
        action, _ = self.policy_net.get_action_log_prob(state)
    action = action.cpu().numpy()[0]
    return action, None
def choose_action(self, state, noise_scale):
    """select action"""
    self.policy_net.eval()
    state = FLOAT(state).unsqueeze(0).to(device)
    with torch.no_grad():
        action = self.policy_net(state)
    self.policy_net.train()
    action = action.cpu().numpy()[0]
    # add Gaussian exploration noise and clip to the valid action range
    noise = noise_scale * np.random.randn(self.num_actions)
    action += noise
    action = np.clip(action, -self.action_high, self.action_high)
    return action
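# Hypothetical usage: the same policy is used with exploration noise during
# training and deterministically (noise_scale=0.0) at evaluation time; `agent`
# and `state` are placeholders.
train_action = agent.choose_action(state, noise_scale=0.1)
eval_action = agent.choose_action(state, noise_scale=0.0)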
class ExpertDataSet(Dataset):
    def __init__(self, data_set_path, num_states, num_actions):
        self.expert_data = np.array(pd.read_csv(data_set_path))
        self.state = FLOAT(self.expert_data[:, :num_states])
        self.action = FLOAT(self.expert_data[:, num_states:num_states + num_actions])
        self.next_state = FLOAT(self.expert_data[:, num_states + num_actions:])
        self.length = self.state.size(0)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        return self.state[idx], self.action[idx]
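# Hypothetical usage: iterate expert (state, action) pairs in minibatches;
# the CSV path and the state/action dimensions below are placeholders.
from torch.utils.data import DataLoader

expert_dataset = ExpertDataSet("expert_trajectories.csv", num_states=11, num_actions=3)
expert_loader = DataLoader(expert_dataset, batch_size=64, shuffle=True)
for expert_state, expert_action in expert_loader:
    pass  # e.g. feed the batch to a discriminator or a behavioral-cloning loss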
def estimate_advantages(rewards, masks, values, gamma, tau, trajectory_length):
    """
    Generalized advantage estimation (GAE)
    :param rewards: [trajectory length * parallel size, 1]
    :param masks: [trajectory length * parallel size, 1]
    :param values: [trajectory length * parallel size, 1]
    :param gamma: discount factor
    :param tau: GAE lambda parameter
    :param trajectory_length: the length of each trajectory
    :return: advantages and returns, both [trajectory length * parallel size, 1]
    """
    trans_shape_func = lambda x: x.reshape(trajectory_length, -1, 1)
    rewards = trans_shape_func(rewards)  # [trajectory length, parallel size, 1]
    masks = trans_shape_func(masks)      # [trajectory length, parallel size, 1]
    values = trans_shape_func(values)    # [trajectory length, parallel size, 1]

    # allocate buffers (uninitialized; every entry is overwritten below)
    deltas = FLOAT(rewards.size()).to(device)
    advantages = FLOAT(rewards.size()).to(device)

    # calculate advantages for all parallel trajectories at once,
    # iterating backwards in time
    prev_value = torch.zeros((rewards.size(1), 1), device=device)
    prev_advantage = torch.zeros((rewards.size(1), 1), device=device)
    for i in reversed(range(rewards.size(0))):
        deltas[i, ...] = rewards[i, ...] + gamma * prev_value * masks[i, ...] - values[i, ...]
        advantages[i, ...] = deltas[i, ...] + gamma * tau * prev_advantage * masks[i, ...]
        prev_value = values[i, ...]
        prev_advantage = advantages[i, ...]

    returns = values + advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)

    # restore the flattened shape expected by PPO
    return advantages.reshape(-1, 1), returns.reshape(-1, 1)  # [trajectory length * parallel size, 1]
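# For reference, the recursion estimate_advantages implements, iterating
# backwards in time (tau plays the role of GAE's lambda and m_t is the
# non-terminal mask); returns are formed from the advantages before
# normalization:
#   delta_t = r_t + gamma * m_t * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * tau * m_t * A_{t+1}
#   R_t     = A_t + V(s_t)
#
# Hypothetical usage: two parallel trajectories of length three, no terminal
# states; FLOAT and device come from the project's utilities as above.
T, P = 3, 2
dummy_rewards = FLOAT(np.ones((T * P, 1))).to(device)
dummy_masks = FLOAT(np.ones((T * P, 1))).to(device)
dummy_values = FLOAT(np.zeros((T * P, 1))).to(device)
advantages, returns = estimate_advantages(dummy_rewards, dummy_masks, dummy_values,
                                          gamma=0.99, tau=0.95, trajectory_length=T)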