class Agent(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        # self.sess = tf.Session()
        self.P_online = Actor(s_dim, a_dim)
        self.P_target = Actor(s_dim, a_dim)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(s_dim, a_dim)
        self.Q_target = Critic(s_dim, a_dim)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(), lr=LR_C)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(), lr=LR_A)
        self.loss_td = nn.MSELoss()
        self.replay_buffer = ReplayBuffer()
        self.batch_size = 32
        self.discrete = False
        self.ep_step = 0
        # exploration noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        self.ou_level = 0.
        self.action_low = -2
        self.action_high = 2

    def act(self, state, test=False):
        if not test:
            with torch.no_grad():
                # cast the numpy state to a (1, s_dim) float tensor
                state = torch.from_numpy(state).unsqueeze(0).float().to('cpu')
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    # add Ornstein-Uhlenbeck exploration noise early in the episode
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low, self.action_high)
                    return torch.from_numpy(action).view(-1)

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float().unsqueeze(0),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("A circular queue does not need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device='cpu')

        # =============================== Critic Update ===============================
        with torch.no_grad():
            target = rewards + GAMMA * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        Q = self.Q_online((states, actions))
        td_error = self.loss_td(target, Q)
        self.q_optimizer.zero_grad()
        td_error.backward()
        self.q_optimizer.step()

        # =============================== Actor Update ===============================
        q = self.Q_online((states, self.P_online(states)))
        loss_a = -torch.mean(q)
        self.p_optimizer.zero_grad()
        loss_a.backward()
        self.p_optimizer.step()

        # =============================== Target Update ===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-2)
        soft_update(self.P_target, self.P_online, tau=1e-2)
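

# Both Agent classes in this section (above and below) call a soft_update()
# helper that is not shown. A minimal sketch, assuming the standard
# Polyak-averaging target update
#   theta_target <- tau * theta_online + (1 - tau) * theta_target;
# the name and signature follow the soft_update(target, online, tau=...) call
# sites, not a known original definition.
import torch


def soft_update(target_net, online_net, tau=1e-2):
    # blend the online parameters into the target network in place
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(),
                                    online_net.parameters()):
            t_param.data.mul_(1.0 - tau)
            t_param.data.add_(tau * o_param.data)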


class Agent():
    def __init__(self, test=False):
        # device
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        # hand-tuned config (for development)
        self.discrete = False
        self.action_dim = 1
        self.state_dim = 3
        self.batch_size = 100
        self.action_low = -2
        self.action_high = 2

        self.P_online = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target = Actor(state_dim=self.state_dim,
                              action_size=self.action_dim).to(self.device)
        self.P_target.load_state_dict(self.P_online.state_dict())
        self.Q_online = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target = Critic(state_size=self.state_dim,
                               action_size=self.action_dim).to(self.device)
        self.Q_target.load_state_dict(self.Q_online.state_dict())
        # discount factor and exploration epsilon
        self.gamma = 0.99
        self.eps = 0.25
        # optimizers
        self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(), lr=1e-3)
        self.p_optimizer = torch.optim.Adam(self.P_online.parameters(), lr=1e-3)
        # experience replay
        self.replay_buffer = ReplayBuffer()
        # exploration noise
        self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        self.ou_level = 0.
        self.ep_step = 0

    def act(self, state, test=False):
        if not test:
            with torch.no_grad():
                # cast the numpy state to a (1, state_dim) float tensor
                state = torch.from_numpy(state).unsqueeze(0).float().to(self.device)
                action = self.P_online(state)  # continuous output
                a = action.data.cpu().numpy()
                # if self.ep_step < 200:
                #     self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                #     a = a + self.ou_level
                if self.discrete:
                    action = np.argmax(a)
                    return a, action
                else:
                    # add Ornstein-Uhlenbeck exploration noise early in the episode
                    if self.ep_step < 200:
                        self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
                    action = np.clip(a + self.ou_level, self.action_low, self.action_high)
                    return action, action

    def collect_data(self, state, action, reward, next_state, done):
        self.replay_buffer.push(
            torch.from_numpy(state).float().unsqueeze(0),
            torch.from_numpy(action).float(),
            torch.tensor([reward]).float().unsqueeze(0),
            torch.from_numpy(next_state).float().unsqueeze(0),
            torch.tensor([done]).float().unsqueeze(0))

    def clear_data(self):
        raise NotImplementedError("A circular queue does not need this function")

    def update(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size=self.batch_size, device=self.device)
        # discounted rewards
        # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device)

        # =============================== Critic Update ===============================
        self.Q_online.train()
        Q = self.Q_online((states, actions))
        with torch.no_grad():  # no backprop through the target value
            self.Q_target.eval()
            self.P_target.eval()
            target = rewards + self.gamma * (1 - dones) * self.Q_target(
                (next_states, self.P_target(next_states)))
        critic_loss_fn = torch.nn.MSELoss()
        critic_loss = critic_loss_fn(Q, target).mean()
        self.q_optimizer.zero_grad()
        critic_loss.backward()
        self.q_optimizer.step()
        # print("critic loss", critic_loss.item())

        # =============================== Actor Update ===============================
        # freeze the online critic, update only the online actor
        self.Q_online.eval()
        for p in self.Q_online.parameters():
            p.requires_grad = False
        for p in self.P_online.parameters():
            p.requires_grad = True
        policy_loss = -self.Q_online((states, self.P_online(states)))
        policy_loss = policy_loss.mean()
        self.p_optimizer.zero_grad()
        policy_loss.backward()
        self.p_optimizer.step()
        # print("policy loss", policy_loss.item())
        for p in self.Q_online.parameters():
            p.requires_grad = True

        # =============================== Target Update ===============================
        soft_update(self.Q_target, self.Q_online, tau=1e-3)
        soft_update(self.P_target, self.P_online, tau=1e-3)

        self.eps -= EPSILON_DECAY
        if self.eps <= 0:
            self.eps = 0
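

# Both Agent classes above store transitions in a ReplayBuffer whose definition
# is not shown. A minimal sketch, assuming a fixed-capacity circular buffer (as
# the clear_data() message suggests) that holds the pre-batched (1, ...) tensors
# pushed by collect_data() and concatenates them on sampling; the capacity value
# here is an assumption, not taken from the original code.
import random

import torch


class ReplayBuffer:
    def __init__(self, capacity=1000000):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # next slot to overwrite once the buffer is full

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size, device='cpu'):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (torch.cat(states).to(device),
                torch.cat(actions).to(device),
                torch.cat(rewards).to(device),
                torch.cat(next_states).to(device),
                torch.cat(dones).to(device))

    def __len__(self):
        return len(self.buffer)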


class DDPG:
    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects, self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3 * self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # for her transitions
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        #       format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma * self.critic.predict_target(nx, nu) * (1 - t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", g)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
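

# All three agents draw exploration noise from a Noise class exposing
# ornstein_uhlenbeck_level(), which is not shown. A minimal sketch, assuming a
# discretized Ornstein-Uhlenbeck process; the parameter names (delta, sigma,
# ou_a, ou_mu) mirror the Noise(DELTA, SIGMA, OU_A, OU_MU) call sites above, but
# the exact parameterization of the original class is an assumption.
import numpy as np


class Noise:
    def __init__(self, delta, sigma, ou_a, ou_mu):
        self.delta = delta  # time-step size of the discretized process
        self.sigma = sigma  # scale of the Gaussian increments
        self.ou_a = ou_a    # mean-reversion rate (pull toward ou_mu)
        self.ou_mu = ou_mu  # long-run mean of the process

    def ornstein_uhlenbeck_level(self, prev_ou_level):
        # deterministic drift back toward the mean, plus a random increment
        drift = self.ou_a * (self.ou_mu - prev_ou_level) * self.delta
        randomness = np.random.normal(loc=0.0,
                                      scale=self.sigma * np.sqrt(self.delta),
                                      size=np.shape(prev_ou_level))
        return prev_ou_level + drift + randomness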