def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)
    print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
          f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
          f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

    # record reward information
    writer.add_scalars("reinforce",
                       {"total reward": log['total_reward'],
                        "average reward": log['avg_reward'],
                        "min reward": log['min_episode_reward'],
                        "max reward": log['max_episode_reward'],
                        "num steps": log['num_steps']}, i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)

    p_loss = torch.empty(1)
    for _ in range(self.reinforce_epochs):
        p_loss = reinforce_step(self.policy_net, self.optimizer_p, batch_state, batch_action,
                                batch_reward, batch_mask, self.gamma)
    return p_loss
def train(memory):
    batch = memory.sample()
    batch_states = DOUBLE(batch.state).to(device)
    batch_actions = DOUBLE(batch.action).to(device)
    batch_log_probs = DOUBLE(batch.log_prob).to(device)
    batch_masks = DOUBLE(batch.mask).to(device)
    batch_rewards = DOUBLE(batch.reward).to(device)
    batch_size = batch_states.shape[0]

    with torch.no_grad():
        batch_values = critic(batch_states)

    batch_advantages, batch_returns = estimate_advantages(batch_rewards, batch_masks,
                                                          batch_values, gamma, tau)

    # mini-batch PPO update
    mini_batch_num = int(math.ceil(batch_size / mini_batch_size))
    for _ in range(ppo_epochs):
        idx = torch.randperm(batch_size)
        for i in range(mini_batch_num):
            mini_batch_idx = idx[i * mini_batch_size: min((i + 1) * mini_batch_size, batch_size)]
            mini_batch_states = batch_states[mini_batch_idx]
            mini_batch_actions = batch_actions[mini_batch_idx]
            mini_batch_log_probs = batch_log_probs[mini_batch_idx]
            mini_batch_returns = batch_returns[mini_batch_idx]
            mini_batch_advantages = batch_advantages[mini_batch_idx]
            ppo_step(actor, critic, opt_p, opt_v, 1, mini_batch_states, mini_batch_actions,
                     mini_batch_returns, mini_batch_advantages, mini_batch_log_probs,
                     epsilon, 1e-3)
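ppo_step itself is not shown in this section. As a reference point, a minimal clipped-surrogate implementation matching the call signature above might look like the following sketch; the parameter name optim_value_iternum (the 1 passed above) and the l2_reg weight-decay term are assumptions inferred from how the function is called, not taken from the source.

import torch
import torch.nn as nn


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, optim_value_iternum,
             states, actions, returns, advantages, old_log_probs, clip_epsilon, l2_reg):
    """One PPO update: fit the critic, then take a clipped policy-gradient step."""
    # update critic: regress V(s) onto the empirical returns
    for _ in range(optim_value_iternum):
        values_pred = value_net(states)
        value_loss = nn.MSELoss()(values_pred, returns)
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg  # weight decay
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # update actor: clipped surrogate objective
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 40)
    optimizer_policy.step()

    return value_loss, policy_loss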
def reinforce_step(policy_net, optimizer_policy, states, actions, rewards, masks, gamma, eps=1e-6):
    """calculate cumulative reward"""
    cum_rewards = DOUBLE(rewards.size(0), 1).to(device)
    pre_value = 0
    for i in reversed(range(rewards.size(0))):
        pre_value = gamma * masks[i] * pre_value + rewards[i, 0]
        cum_rewards[i, 0] = pre_value
    # normalize cumulative rewards
    cum_rewards = (cum_rewards - cum_rewards.mean()) / (cum_rewards.std() + eps)

    # update policy
    log_probs = policy_net.get_log_prob(states, actions)
    policy_loss = -(log_probs * cum_rewards).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 40)
    optimizer_policy.step()
    return policy_loss
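policy_net.get_log_prob(states, actions) is assumed throughout this section but never defined here. One minimal way to provide it for a continuous-action policy is a diagonal Gaussian with a state-independent log-std; this is a sketch, not the repo's actual network:

import torch
import torch.nn as nn


class GaussianPolicy(nn.Module):
    """Diagonal-Gaussian policy with a state-independent log-std parameter."""

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, action_dim),
        )
        self.log_std = nn.Parameter(torch.zeros(1, action_dim))

    def get_log_prob(self, states, actions):
        mean = self.net(states)
        dist = torch.distributions.Normal(mean, self.log_std.exp())
        # sum over action dimensions, keep the batch dimension
        return dist.log_prob(actions).sum(dim=-1, keepdim=True)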
def update(self, batch):
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = LONG(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_next_state = DOUBLE(batch.next_state).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)

    dqn_step(self.value_net, self.optimizer, self.value_net_target, batch_state, batch_action,
             batch_reward, batch_next_state, batch_mask, self.gamma)
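dqn_step is not defined in this section. A minimal sketch consistent with the call above, assuming actions arrive as a 1-D LongTensor and rewards/masks as 1-D tensors:

import torch
import torch.nn as nn


def dqn_step(value_net, optimizer, value_net_target, states, actions, rewards,
             next_states, masks, gamma):
    """One TD(0) update of Q(s, a) toward r + gamma * max_a' Q_target(s', a')."""
    q_values = value_net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        q_next = value_net_target(next_states).max(dim=1)[0]
        q_target = rewards + gamma * masks * q_next  # masks zero out terminal transitions
    loss = nn.MSELoss()(q_values, q_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss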
def update(self, batch, global_steps):
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = LONG(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_next_state = DOUBLE(batch.next_state).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)

    doubledqn_step(self.value_net, self.optimizer, self.value_net_target, batch_state,
                   batch_action, batch_reward, batch_next_state, batch_mask, self.gamma,
                   self.polyak, global_steps % self.update_target_gap == 0)
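doubledqn_step presumably mirrors the dqn_step sketch above, plus a Polyak target-network update gated by the boolean flag, and differs mainly in the TD target: the online net selects the next action while the target net evaluates it. A sketch of that target (the name double_dqn_target is hypothetical):

import torch


def double_dqn_target(value_net, value_net_target, rewards, next_states, masks, gamma):
    """Double-DQN TD target: the online net picks a', the target net evaluates it."""
    with torch.no_grad():
        next_actions = value_net(next_states).argmax(dim=1, keepdim=True)
        q_next = value_net_target(next_states).gather(1, next_actions).squeeze(-1)
        return rewards + gamma * masks * q_next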
def update(self, batch, k_iter):
    """learn model"""
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_next_state = DOUBLE(batch.next_state).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)

    # update by SAC
    sac_step(self.policy_net, self.value_net, self.value_net_target, self.q_net_1, self.q_net_2,
             self.optimizer_p, self.optimizer_v, self.optimizer_q_1, self.optimizer_q_2,
             batch_state, batch_action, batch_reward, batch_next_state, batch_mask,
             self.gamma, self.polyak, k_iter % self.target_update_delay == 0)
def update(self, batch):
    """learn model"""
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_next_state = DOUBLE(batch.next_state).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)

    # update by DDPG
    ddpg_step(self.policy_net, self.policy_net_target, self.value_net, self.value_net_target,
              self.optimizer_p, self.optimizer_v, batch_state, batch_action, batch_reward,
              batch_next_state, batch_mask, self.gamma, self.polyak)
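ddpg_step, like the other *_step functions here that take a polyak coefficient, presumably soft-updates its target networks internally. The standard Polyak averaging step, as a sketch:

import torch


def soft_update(target_net, source_net, polyak):
    """theta_target <- polyak * theta_target + (1 - polyak) * theta_source."""
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.mul_(polyak).add_((1 - polyak) * s_param.data)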
def update(self, batch, k_iter):
    """learn model"""
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_next_state = DOUBLE(batch.next_state).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)

    # update by TD3
    td3_step(self.policy_net, self.policy_net_target, self.value_net_1, self.value_net_target_1,
             self.value_net_2, self.value_net_target_2, self.optimizer_p, self.optimizer_v_1,
             self.optimizer_v_2, batch_state, batch_action, batch_reward, batch_next_state,
             batch_mask, self.gamma, self.polyak, self.target_action_noise_std,
             self.target_action_noise_clip, self.action_high,
             k_iter % self.policy_update_delay == 0)
def choose_action(self, state):
    """select action"""
    state = DOUBLE(state).unsqueeze(0).to(device)
    with torch.no_grad():
        action, _ = self.policy_net.rsample(state)
    action = action.cpu().numpy()[0]
    return action, None
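policy_net.rsample is not shown here; in SAC-style implementations it usually denotes reparameterized sampling through a tanh-squashed Gaussian. A self-contained sketch of that pattern (the function name is hypothetical, and the policy's mean/log-std heads are assumed):

import torch


def tanh_gaussian_rsample(mean, log_std):
    """Reparameterized sample from a tanh-squashed Gaussian, as SAC policies typically use."""
    dist = torch.distributions.Normal(mean, log_std.exp())
    pre_tanh = dist.rsample()          # mean + std * eps, stays differentiable
    action = torch.tanh(pre_tanh)
    # change-of-variables correction for the tanh squashing
    log_prob = dist.log_prob(pre_tanh) - torch.log(1.0 - action.pow(2) + 1e-6)
    return action, log_prob.sum(dim=-1, keepdim=True)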
def estimate_advantages(rewards, masks, values, gamma, tau):
    deltas = DOUBLE(rewards.size(0), 1).to(device)
    advantages = DOUBLE(rewards.size(0), 1).to(device)

    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]
        prev_value = values[i, 0]
        prev_advantage = advantages[i, 0]

    returns = values + advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)
    return advantages, returns
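A toy call to estimate_advantages, with stand-ins for the repo's globals (assumptions: DOUBLE wraps torch.DoubleTensor and device is wherever the networks live):

import torch

DOUBLE = torch.DoubleTensor
device = torch.device("cpu")

rewards = DOUBLE([[1.0], [1.0], [1.0]]).to(device)
masks = DOUBLE([[1.0], [1.0], [0.0]]).to(device)   # mask 0 marks the terminal step
values = DOUBLE([[0.5], [0.5], [0.5]]).to(device)
advantages, returns = estimate_advantages(rewards, masks, values, gamma=0.99, tau=0.95)
print(advantages.shape, returns.shape)  # both torch.Size([3, 1])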
def collect_samples(pid, queue, env, policy, render, running_state, min_batch_size):
    torch.randn(pid)  # draw pid samples so each worker process advances its RNG differently
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0
    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()
            state_tensor = DOUBLE(state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0] if log_prob is not None else None
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            if running_state:
                next_state = running_state(next_state)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1
            if done or num_steps >= min_batch_size:
                break
            state = next_state

        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def choose_action(self, state):
    state = DOUBLE(state).unsqueeze(0).to(device)
    # note: here epsilon is the probability of acting greedily, not of exploring
    if np.random.uniform() <= self.epsilon:
        # exploit: greedy action from the value network
        with torch.no_grad():
            action = self.value_net.get_action(state)
        action = action.cpu().numpy()[0]
    else:
        # explore: uniformly random action
        action = np.random.randint(0, self.num_actions)
    return action
def choose_action(self, state, noise_scale):
    """select action"""
    state = DOUBLE(state).unsqueeze(0).to(device)
    with torch.no_grad():
        action, log_prob = self.policy_net.get_action_log_prob(state)
    action = action.cpu().numpy()[0]
    # add Gaussian exploration noise, then clip to the (symmetric) action bounds
    action += noise_scale * np.random.randn(self.num_actions)
    action = np.clip(action, -self.action_high, self.action_high)
    return action, log_prob
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)
    print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
          f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
          f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

    # record reward information
    writer.add_scalars("ppo",
                       {"total reward": log['total_reward'],
                        "average reward": log['avg_reward'],
                        "min reward": log['min_episode_reward'],
                        "max reward": log['max_episode_reward'],
                        "num steps": log['num_steps']}, i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)
    batch_log_prob = DOUBLE(batch.log_prob).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(batch_reward, batch_mask, batch_value,
                                                        self.gamma, self.tau)

    v_loss, p_loss = torch.empty(1), torch.empty(1)
    for _ in range(self.ppo_epochs):
        v_loss, p_loss = ppo_step(self.policy_net, self.value_net, self.optimizer_p,
                                  self.optimizer_v, 1, batch_state, batch_action, batch_return,
                                  batch_advantage, batch_log_prob, self.clip_epsilon, 1e-3)

    self.policy_net_old.load_state_dict(self.policy_net.state_dict())
    return v_loss, p_loss
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)
    print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
          f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
          f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

    # record reward information
    writer.add_scalars("trpo",
                       {"total reward": log['total_reward'],
                        "average reward": log['avg_reward'],
                        "min reward": log['min_episode_reward'],
                        "max reward": log['max_episode_reward'],
                        "num steps": log['num_steps']}, i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)
    batch_log_prob = DOUBLE(batch.log_prob).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(batch_reward, batch_mask, batch_value,
                                                        self.gamma, self.tau)

    # update by TRPO
    trpo_step(self.policy_net, self.value_net, batch_state, batch_action, batch_return,
              batch_advantage, batch_log_prob, self.max_kl, self.damping, 1e-3, None)
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)
    print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
          f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
          f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

    # record reward information
    writer.add_scalars("a2c",
                       {"total reward": log['total_reward'],
                        "average reward": log['avg_reward'],
                        "min reward": log['min_episode_reward'],
                        "max reward": log['max_episode_reward'],
                        "num steps": log['num_steps']}, i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)

    with torch.no_grad():
        batch_value = self.ac_net.get_value(batch_state)

    batch_advantage, batch_return = estimate_advantages(batch_reward, batch_mask, batch_value,
                                                        self.gamma, self.tau)

    ac_loss = a2c_step(self.ac_net, self.optimizer_ac, batch_state, batch_action, batch_return,
                       batch_advantage, self.value_net_coeff, self.entropy_coeff)
    return ac_loss
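a2c_step is not shown in this section. A sketch of the usual advantage-actor-critic loss matching the call above; ac_net.get_log_prob and ac_net.get_entropy are assumed helpers (get_entropy in particular is hypothetical), and get_value appears in the learn method above:

import torch
import torch.nn as nn


def a2c_step(ac_net, optimizer_ac, states, actions, returns, advantages,
             value_net_coeff, entropy_coeff):
    """One A2C update: policy gradient + weighted value loss - entropy bonus."""
    log_probs = ac_net.get_log_prob(states, actions)
    values = ac_net.get_value(states)
    entropy = ac_net.get_entropy(states)  # assumed helper

    policy_loss = -(log_probs * advantages).mean()
    value_loss = nn.MSELoss()(values, returns)
    loss = policy_loss + value_net_coeff * value_loss - entropy_coeff * entropy.mean()

    optimizer_ac.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(ac_net.parameters(), 40)
    optimizer_ac.step()
    return loss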
def collect(self):
    num_steps = 0
    num_episodes = 0
    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < self.min_batch_size:
        state = self.env.reset()
        episode_reward = 0
        if self.running_state:
            state = self.running_state(state)

        for t in range(10000):
            if self.render:
                self.env.render()
            state_tensor = DOUBLE(state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = self.policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0]
            next_state, reward, done, _ = self.env.step(action)
            episode_reward += reward
            if self.running_state:
                next_state = self.running_state(next_state)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            self.memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1
            if done or num_steps >= self.min_batch_size:
                break
            state = next_state

        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    self.log['num_steps'] = num_steps
    self.log['num_episodes'] = num_episodes
    self.log['total_reward'] = total_reward
    self.log['avg_reward'] = total_reward / num_episodes
    self.log['max_episode_reward'] = max_episode_reward
    self.log['min_episode_reward'] = min_episode_reward
def value_objective_grad_func(value_net_flat_params):
    set_flat_params(value_net, DOUBLE(value_net_flat_params))
    for param in value_net.parameters():
        if param.grad is not None:
            param.grad.data.fill_(0)
    values_pred = value_net(states)
    value_loss = nn.MSELoss()(values_pred, returns)
    # weight decay
    for param in value_net.parameters():
        value_loss += param.pow(2).sum() * l2_reg
    value_loss.backward()  # to get the grad
    objective_value_loss_grad = get_flat_grad_params(value_net).detach().cpu().numpy()
    return objective_value_loss_grad
def value_objective_func(value_net_flat_params):
    """
    get value_net loss
    :param value_net_flat_params: flat parameter vector as a numpy array
    :return: scalar value loss
    """
    set_flat_params(value_net, DOUBLE(value_net_flat_params))
    values_pred = value_net(states)
    value_loss = nn.MSELoss()(values_pred, returns)
    # weight decay
    for param in value_net.parameters():
        value_loss += param.pow(2).sum() * l2_reg
    objective_value_loss = value_loss.item()
    return objective_value_loss
def choose_action(self, state):
    """select action"""
    state = DOUBLE(state).unsqueeze(0).to(device)
    with torch.no_grad():
        action, log_prob = self.policy_net.get_action_log_prob(state)
    return action, log_prob
def sample(self):
    """Update internal state and return it as a noise sample."""
    dx = self.theta * (self.mu - self.state) + self.sigma * np.random.normal(size=self.state.shape)
    self.state += dx
    return DOUBLE(self.state)
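The class around this sample method is not shown. A minimal self-contained Ornstein-Uhlenbeck noise class consistent with it might look as follows; the repo's version wraps the returned state in DOUBLE, while this sketch returns the raw numpy array, and the default theta/sigma values are common choices, not taken from the source:

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = self.mu.copy()

    def sample(self):
        """Update internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.normal(size=self.state.shape)
        self.state += dx
        return self.state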
def learn(self, writer, i_iter):
    """learn model"""
    memory, log = self.collector.collect_samples(self.min_batch_size)
    print(f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
          f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
          f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}")

    # record reward information
    writer.add_scalars("ppo",
                       {"total reward": log['total_reward'],
                        "average reward": log['avg_reward'],
                        "min reward": log['min_episode_reward'],
                        "max reward": log['max_episode_reward'],
                        "num steps": log['num_steps']}, i_iter)

    batch = memory.sample()  # sample all items in memory
    batch_state = DOUBLE(batch.state).to(device)
    batch_action = DOUBLE(batch.action).to(device)
    batch_reward = DOUBLE(batch.reward).to(device)
    batch_mask = DOUBLE(batch.mask).to(device)
    batch_log_prob = DOUBLE(batch.log_prob).to(device)

    with torch.no_grad():
        batch_value = self.value_net(batch_state)

    batch_advantage, batch_return = estimate_advantages(batch_reward, batch_mask, batch_value,
                                                        self.gamma, self.tau)

    v_loss, p_loss = torch.empty(1), torch.empty(1)
    for _ in range(self.ppo_epochs):
        if self.ppo_mini_batch_size:
            # update with shuffled mini-batches
            batch_size = batch_state.shape[0]
            mini_batch_num = int(math.ceil(batch_size / self.ppo_mini_batch_size))
            index = torch.randperm(batch_size)
            for i in range(mini_batch_num):
                ind = index[i * self.ppo_mini_batch_size:
                            min(batch_size, (i + 1) * self.ppo_mini_batch_size)]
                state, action, returns, advantages, old_log_pis = \
                    batch_state[ind], batch_action[ind], batch_return[ind], \
                    batch_advantage[ind], batch_log_prob[ind]
                v_loss, p_loss = ppo_step(self.policy_net, self.value_net, self.optimizer_p,
                                          self.optimizer_v, 1, state, action, returns,
                                          advantages, old_log_pis, self.clip_epsilon, 1e-3)
        else:
            v_loss, p_loss = ppo_step(self.policy_net, self.value_net, self.optimizer_p,
                                      self.optimizer_v, 1, batch_state, batch_action,
                                      batch_return, batch_advantage, batch_log_prob,
                                      self.clip_epsilon, 1e-3)
    return v_loss, p_loss
def select_action(self, states):
    states_tensor = DOUBLE(states).unsqueeze(0).to(device)
    action, log_prob = self.policy.get_action_log_prob(states_tensor)
    return action, log_prob
def trpo_step(policy_net, value_net, states, actions, returns, advantages, old_log_probs,
              max_kl, damping, l2_reg, optimizer_value=None):
    """Update by TRPO algorithm"""

    # update critic
    def value_objective_func(value_net_flat_params):
        """
        get value_net loss
        :param value_net_flat_params: flat parameter vector as a numpy array
        :return: scalar value loss
        """
        set_flat_params(value_net, DOUBLE(value_net_flat_params))
        values_pred = value_net(states)
        value_loss = nn.MSELoss()(values_pred, returns)
        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        return value_loss.item()

    def value_objective_grad_func(value_net_flat_params):
        set_flat_params(value_net, DOUBLE(value_net_flat_params))
        for param in value_net.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)
        values_pred = value_net(states)
        value_loss = nn.MSELoss()(values_pred, returns)
        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        value_loss.backward()  # to get the grad
        return get_flat_grad_params(value_net).detach().cpu().numpy()

    if optimizer_value is None:
        # update the critic by scipy optimization; for details about L-BFGS-B see:
        # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html#optimize-minimize-lbfgsb
        value_net_flat_params_old = get_flat_params(value_net).detach().cpu().numpy()  # initial guess
        res = opt.minimize(value_objective_func, value_net_flat_params_old, method='L-BFGS-B',
                           jac=value_objective_grad_func,
                           options={"maxiter": 30, "disp": False})
        value_net_flat_params_new = res.x
        set_flat_params(value_net, DOUBLE(value_net_flat_params_new))
    else:
        # otherwise update the critic by gradient descent
        for _ in range(10):
            values_pred = value_net(states)
            value_loss = nn.MSELoss()(values_pred, returns)
            # weight decay
            for param in value_net.parameters():
                value_loss += param.pow(2).sum() * l2_reg
            optimizer_value.zero_grad()
            value_loss.backward()
            optimizer_value.step()

    # update policy
    update_policy(policy_net, states, actions, old_log_probs, advantages, max_kl, damping)
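The flat-parameter helpers used above (get_flat_params, set_flat_params, get_flat_grad_params) are not shown in this section. Sketches consistent with how they are called, concatenating parameters (or gradients) into one flat vector and copying a flat vector back slice by slice:

import torch


def get_flat_params(model):
    """Concatenate all model parameters into one flat tensor."""
    return torch.cat([param.view(-1) for param in model.parameters()])


def get_flat_grad_params(model):
    """Concatenate all parameter gradients into one flat tensor (zeros where grad is None)."""
    return torch.cat([param.grad.view(-1) if param.grad is not None
                      else torch.zeros_like(param).view(-1)
                      for param in model.parameters()])


def set_flat_params(model, flat_params):
    """Copy a flat tensor of parameters back into the model, slice by slice."""
    idx = 0
    for param in model.parameters():
        numel = param.numel()
        param.data.copy_(flat_params[idx: idx + numel].view_as(param))
        idx += numel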