import copy
import random
from typing import Callable, List

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam

# Actor, Critic, DoubleCritic, ReplayMemory, freeze, Transition, and
# PrioritizedTransitionReplay are assumed to be defined elsewhere in the
# project.

# ---------------------------------------------------------------------------
# Hyperparameters read at module level by TD3 below. The original defines
# them elsewhere; the values here are the usual defaults from Fujimoto et al.
# (2018) and are assumptions.
# ---------------------------------------------------------------------------
gamma = 0.99          # discount factor
tau = 0.005           # Polyak averaging coefficient
actor_lr = 3e-4
critic_lr = 3e-4
weight_decay = 0.0
policy_noise = 0.2    # std of target-policy smoothing noise
noise_clip = 0.5      # clipping range of the smoothing noise
policy_freq = 2       # delayed policy-update period


class TD3(object):

    def __init__(self, env, writer=None):
        """Twin Delayed Deep Deterministic Policy Gradient (TD3)."""
        self.env = env
        self.writer = writer
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        self.max_action = env.action_space.high[0]

        # Randomly initialize network parameters.
        self.actor = Actor(state_dim, action_dim).to('cuda')
        self.critic = Critic(state_dim, action_dim).to('cuda')

        # Initialize target networks with copies of the online parameters.
        self.target_actor = Actor(state_dim, action_dim).to('cuda')
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic = Critic(state_dim, action_dim).to('cuda')
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Replay memory.
        self.memory = ReplayMemory(state_dim, action_dim)

        self.gamma = gamma
        self.tau = tau

        # Network parameter optimizers.
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)

    def get_action(self, state, initial_act=False):
        if initial_act:
            return self.env.action_space.sample()
        action = self.actor(torch.from_numpy(state).to('cuda', torch.float))
        action = action.detach().cpu().numpy()
        # Per-dimension Gaussian exploration noise (the original added a
        # single scalar sample to every dimension).
        action += np.random.normal(0, 0.1, size=action.shape)
        return np.clip(action, -self.max_action, self.max_action)

    def store_transition(self, state, action, state_, reward, done):
        self.memory.store_transition(state, action, state_, reward, done)

    def soft_update(self, target_net, net):
        """Polyak-average online parameters into the target network."""
        for target_param, param in zip(target_net.parameters(),
                                       net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self, time_step, batch_size=64):
        # `terminals` is assumed to hold the non-terminal mask (1 - done),
        # since it multiplies the bootstrap term below.
        states, actions, states_, rewards, terminals = self.memory.sample(
            batch_size)

        # ---- Critic update ------------------------------------------------
        with torch.no_grad():
            # Target-policy smoothing: clipped Gaussian noise on the target
            # action.
            noise = (torch.randn_like(actions) * policy_noise).clamp(
                -noise_clip, noise_clip)
            actions_ = (self.target_actor(states_) + noise).clamp(
                -self.max_action, self.max_action)
            # Clipped double-Q target: bootstrap from the smaller of the two
            # target critics.
            target_q1, target_q2 = self.target_critic(states_, actions_)
            y = rewards.unsqueeze(1) + terminals.unsqueeze(1) * \
                self.gamma * torch.min(target_q1, target_q2)

        q1, q2 = self.critic(states, actions)
        critic_loss = F.mse_loss(q1, y) + F.mse_loss(q2, y)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        if self.writer and time_step:
            self.writer.add_scalar("loss/critic", critic_loss.item(),
                                   time_step)

        # ---- Delayed policy update ----------------------------------------
        if time_step % policy_freq == 0:
            # Update the actor by ascending Q1 of the online critic.
            actor_loss = -self.critic.Q1(states, self.actor(states)).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            if self.writer:
                self.writer.add_scalar("loss/actor", actor_loss.item(),
                                       time_step)

            # Soft-update both target networks.
            self.soft_update(self.target_actor, self.actor)
            self.soft_update(self.target_critic, self.critic)

    def save_model(self, path='models/'):
        torch.save(self.actor.state_dict(), path + 'actor')
        torch.save(self.critic.state_dict(), path + 'critic')
        torch.save(self.target_actor.state_dict(), path + 'target_actor')
        torch.save(self.target_critic.state_dict(), path + 'target_critic')

    def load_model(self, path='models/'):
        self.actor.load_state_dict(torch.load(path + 'actor'))
        self.critic.load_state_dict(torch.load(path + 'critic'))
        self.target_actor.load_state_dict(torch.load(path + 'target_actor'))
        self.target_critic.load_state_dict(torch.load(path + 'target_critic'))
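# ---------------------------------------------------------------------------
# A minimal training-loop sketch for TD3, not part of the original code.
# Assumptions: the classic Gym API (reset() returns an observation, step()
# returns 4 values); the env id, step budget, and warm-up length are
# illustrative placeholders. Note the class above hardcodes 'cuda', so a GPU
# is required.
# ---------------------------------------------------------------------------
def train_td3_example(env_id='Pendulum-v1',
                      total_steps=100_000,
                      start_steps=1_000):
    import gym

    env = gym.make(env_id)
    agent = TD3(env)
    state = env.reset()
    for t in range(1, total_steps + 1):
        # Uniform random actions during warm-up, noisy policy actions after.
        action = agent.get_action(state, initial_act=(t <= start_steps))
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        state = env.reset() if done else next_state
        if t > start_steps:
            # The critic is updated every step; the actor and the target
            # networks only every `policy_freq` steps inside update().
            agent.update(t)
    agent.save_model()  # assumes the 'models/' directory exists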
class SemiDQNAgent:

    def __init__(self,
                 dimS,
                 nA,
                 action_map: Callable[..., List[int]],
                 gamma,
                 hidden1,
                 hidden2,
                 lr,
                 tau,
                 buffer_size,
                 batch_size,
                 priority_exponent,
                 normalize_weights,
                 uniform_sample_prob,
                 anneal_schedule: Callable,
                 clipped=False,
                 device='cpu',
                 render=False):
        arg_dict = locals()
        print('agent spec')
        print('-' * 80)
        print(arg_dict)
        print('-' * 80)

        self.dimS = dimS
        self.nA = nA
        self.clipped = clipped

        # Set up networks: a clipped double-Q critic or a single critic.
        if clipped:
            self.Q = DoubleCritic(dimS, nA,
                                  hidden_size1=hidden1,
                                  hidden_size2=hidden2).to(device)
        else:
            self.Q = Critic(dimS, nA,
                            hidden_size1=hidden1,
                            hidden_size2=hidden2).to(device)
        self.target_Q = copy.deepcopy(self.Q).to(device)
        freeze(self.target_Q)

        self.optimizer = Adam(self.Q.parameters(), lr=lr)

        # Discount factor & Polyak constant.
        self.gamma = gamma
        self.tau = tau

        self.batch_size = batch_size

        replay_structure = Transition(s_tm1=None,
                                      a_tm1=None,
                                      r_t=None,
                                      s_t=None,
                                      dt=None,
                                      d=None)
        # Prioritized replay buffer for experience replay in the semi-MDP.
        self.replay = PrioritizedTransitionReplay(
            capacity=buffer_size,
            structure=replay_structure,
            priority_exponent=priority_exponent,
            importance_sampling_exponent=anneal_schedule,
            uniform_sample_probability=uniform_sample_prob,
            normalize_weights=normalize_weights,
            random_state=np.random.RandomState(1),
            encoder=None,
            decoder=None)
        self.max_seen_priority = 1.
        self.schedule = anneal_schedule

        # Function returning the set of executable actions at a given state.
        # Expected return type: numpy array when 2nd arg = True / list when
        # False.
        self.action_map = action_map
        self.render = render

        self.device = device

    def target_update(self):
        """Polyak-average online parameters into the target network."""
        for p, target_p in zip(self.Q.parameters(),
                               self.target_Q.parameters()):
            target_p.data.copy_(self.tau * p.data +
                                (1.0 - self.tau) * target_p.data)

    def get_action(self, state, eps):
        dimS = self.dimS
        # action_map returns a set of indices rather than a mask vector.
        possible_actions = self.action_map(state)
        u = np.random.rand()
        if u < eps:
            # Random selection among executable actions.
            a = random.choice(possible_actions)
        else:
            m = mask(possible_actions)
            # Greedy selection among executable actions; non-admissible
            # actions are never chosen since their masked value is -inf.
            s = torch.tensor(state,
                             dtype=torch.float).view(1, dimS).to(self.device)
            if self.clipped:
                q = self.Q.Q1(s)
            else:
                q = self.Q(s)
            a = np.argmax(q.cpu().data.numpy() + m)
        return a

    def train(self):
        device = self.device
        gamma = self.gamma

        # Transition samples with importance-sampling weights.
        transitions, indices, weights = self.replay.sample(self.batch_size)

        # Unroll the batch; each sample is (s, a, r, s', \Delta t, d),
        # following the field order of the Transition structure.
        # The admissibility mask must be built from the *next* states, since
        # the maximum below is taken over actions at s' (the original built
        # it from transitions[0], the current states).
        next_state = transitions[3]
        m = np.vstack([
            mask(self.action_map(next_state[i]))
            for i in range(self.batch_size)
        ])

        with torch.no_grad():
            s = torch.tensor(transitions[0], dtype=torch.float).to(device)
            a = torch.unsqueeze(
                torch.tensor(transitions[1], dtype=torch.long).to(device),
                1)  # action type : discrete
            r = torch.tensor(transitions[2], dtype=torch.float).to(device)
            s_next = torch.tensor(transitions[3],
                                  dtype=torch.float).to(device)
            dt = torch.tensor(transitions[4], dtype=torch.float).to(device)
            d = torch.tensor(transitions[5], dtype=torch.float).to(device)
            m = torch.tensor(m, dtype=torch.float).to(device)
            w = torch.tensor(weights, dtype=torch.float).to(device)

            # Compute $\max_{a^\prime} Q(s^\prime, a^\prime)$.
            # Note that the maximum MUST be taken over the set of admissible
            # actions; this is done by masking invalid entries with -inf.
            # Double DQN: the online network selects the action, the target
            # network evaluates it. Be careful with the shape of each tensor!
            if self.clipped:
                a_inner = torch.unsqueeze(
                    torch.max(self.Q.Q1(s_next) + m, 1)[1], 1)
                q_next1, q_next2 = self.target_Q(s_next)
                q_next = torch.min(
                    torch.squeeze(q_next1.gather(1, a_inner)),
                    torch.squeeze(q_next2.gather(1, a_inner)))
            else:
                a_inner = torch.unsqueeze(
                    torch.max(self.Q(s_next) + m, 1)[1], 1)
                q_next = torch.squeeze(
                    self.target_Q(s_next).gather(1, a_inner))

            # Target construction in the semi-MDP case; see [Puterman, 1994]
            # for an introduction to the theory of semi-MDPs:
            # $r + \gamma^{\Delta t} \max_{a^\prime} Q(s^\prime, a^\prime)$.
            target = r + gamma**dt * (1. - d) * q_next

        # Loss construction & parameter update.
        if self.clipped:
            q1_vector, q2_vector = self.Q(s)
            out1 = torch.squeeze(q1_vector.gather(1, a))
            out2 = torch.squeeze(q2_vector.gather(1, a))
            td_errors = target - out1
            loss = .5 * ((w * (target - out1))**2).sum() + \
                   .5 * ((w * (target - out2))**2).sum()
        else:
            out = torch.squeeze(self.Q(s).gather(1, a))
            td_errors = target - out
            loss = .5 * ((w * td_errors)**2).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Priority update.
        new_priorities = np.abs(np.squeeze(td_errors.cpu().data.numpy()))
        max_priority = np.max(new_priorities)
        self.max_seen_priority = max(self.max_seen_priority, max_priority)
        self.replay.update_priorities(indices=indices,
                                      priorities=new_priorities)

        # Soft target update.
        self.target_update()

    def eval(self, test_env, T=14400, eval_num=3):
        """Evaluate the agent; during evaluation the agent executes
        noiseless (greedy) actions."""
        print('evaluating on 24 hrs data...', end=' ')
        reward_log = np.zeros(eval_num)
        num_log = np.zeros(eval_num, dtype=int)

        for ep in range(eval_num):
            state = test_env.reset()
            step_count = 0
            ep_reward = 0.
            t = 0.
            info = None

            # Run until the time horizon T is reached.
            while t < T:
                if self.render and ep == 0:
                    test_env.render()
                action = self.get_action(state, 0.)  # noiseless evaluation
                next_state, reward, done, info = test_env.step(action)
                step_count += 1
                state = next_state
                ep_reward += self.gamma**t * reward
                t = info['elapsed_time']

            # Save the carried quantity at the end of the episode.
            carried = test_env.operation_log['carried']
            reward_log[ep] = ep_reward
            num_log[ep] = carried
            if self.render and ep == 0:
                test_env.close()

        avg = np.mean(reward_log)
        num_avg = np.mean(num_log)
        print('average reward : {:.4f} | carried : {}'.format(avg, num_avg))
        return [avg, num_avg]
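# ---------------------------------------------------------------------------
# SemiDQNAgent calls a mask() helper that is not shown in this file. From its
# use above (q + m followed by argmax / max), it must map a list of
# admissible action indices to a length-nA vector that is 0 at admissible
# entries and -inf elsewhere. Below is a minimal sketch consistent with that
# contract; the module-level N_ACTIONS constant is an assumption, and the
# original helper presumably obtains the action-space size some other way.
# ---------------------------------------------------------------------------
N_ACTIONS = 10  # illustrative placeholder for the action-space size


def mask(possible_actions):
    m = np.full(N_ACTIONS, -np.inf, dtype=np.float32)
    m[np.asarray(possible_actions, dtype=np.int64)] = 0.0
    return m


# The replay buffer also expects anneal_schedule to be a callable mapping the
# step count to an importance-sampling exponent. A simple linear schedule of
# that kind (an assumption; the original schedule is defined elsewhere, and
# the endpoints and horizon here are illustrative):
def linear_anneal(step, start=0.4, end=1.0, horizon=100_000):
    return min(end, start + (end - start) * step / horizon)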