import tensorflow as tf

# Actor, Critic, ReplayBuffer and random_string are assumed to be defined
# elsewhere in this repo.


class ActorCritic(object):
    def __init__(self, state_dim, action_dim, final_activation=tf.nn.tanh,
                 action_bound=0.4, training_batch_size=32, GAMMA=0.95,
                 lr=0.001, replay_buffer_size=1024):
        self.ID = random_string(10)
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.final_activation = final_activation
        self.action_bound = action_bound
        self.GAMMA = GAMMA
        self.lr = lr
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.training_batch_size = training_batch_size
        with tf.variable_scope(self.ID):
            self.actor = Actor(self.state_dim, self.action_dim,
                               self.action_bound, self.lr,
                               self.final_activation)
            self.critic = Critic(self.state_dim, self.action_dim, self.lr)

    def add_to_replay_buffer(self, state, action, reward, resulting_state):
        self.replay_buffer.add(state, action, reward, resulting_state)

    def add_batch_to_replay_buffer(self, states, actions, rewards,
                                   resulting_states):
        for s, a, r, rs in zip(states, actions, rewards, resulting_states):
            self.replay_buffer.add(s, a, r, rs)

    def get_batch(self, training_batch_size=None):
        if not training_batch_size:
            training_batch_size = self.training_batch_size
        return self.replay_buffer.sample_batch(training_batch_size)

    def train_from_replay_buffer(self, should_print=False):
        # Known limitation: terminal transitions are not masked, so the
        # bootstrap term is applied even when the episode is done.
        # Procedure: take the resulting states, have the actor predict actions,
        # have the critic predict those pairs' Q values, form the target
        # reward + GAMMA * next_q_val, then train the critic via optimize_q_val
        # and the actor via the critic's action gradients.
        if not self.replay_buffer.size():
            print('buffer empty!')
            return 0
        states, actions, rewards, resulting_states = self.replay_buffer.sample_batch(
            self.training_batch_size)
        predicted_action = self.actor.get_actions(resulting_states)
        predicted_vals = self.critic.predict_q_val(resulting_states,
                                                   predicted_action)
        true_vals = rewards + (self.GAMMA * predicted_vals)
        losses = self.critic.optimize_q_val(states, actions, true_vals)
        grads = self.critic.get_action_grads(states, actions)
        self.actor.train_from_batch(states, grads)
        if should_print:
            actual_q, out = self.critic.return_q_and_out(states, actions,
                                                         true_vals)
            print('ACTUAL_Q: {}\n\n'.format(actual_q))
            print('OUT: {}'.format(out))
        return losses

    def get_actions(self, states):
        return self.actor.get_actions(states)
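# ActorCritic only touches its buffer through add(), size() and sample_batch(),
# so the minimal in-memory buffer below is enough to exercise the class. This
# is a sketch of the assumed interface, not the repo's actual ReplayBuffer; the
# reshape of rewards to (batch, 1) assumes the critic's predicted Q values have
# that shape so the GAMMA * next_q_val term broadcasts correctly.

import numpy as np
from collections import deque


class ReplayBuffer:
    """Minimal replay-buffer sketch matching the interface used above."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, resulting_state):
        self.buffer.append((state, action, reward, resulting_state))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # sample without replacement, capped at the current buffer size
        idxs = np.random.choice(len(self.buffer),
                                size=min(batch_size, len(self.buffer)),
                                replace=False)
        batch = [self.buffer[i] for i in idxs]
        states, actions, rewards, resulting_states = map(np.asarray, zip(*batch))
        return states, actions, rewards.reshape(-1, 1), resulting_states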
import numpy as np
import tensorflow as tf

# Actor, Critic, Noise and Memory are assumed to be defined elsewhere
# in this repo.


class DDPG:
    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        # noise params arrive as comma-separated strings; parse them into arrays
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects, self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            # temporally correlated exploration noise (Ornstein-Uhlenbeck)
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check that the memory contains enough experiences
        if self.memory.size < 3 * self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # HER: relabel ~80% of the batch with the achieved goal, marking those
        # transitions as successful and terminal
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        #       format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        # goal-conditioned inputs: concatenate state and goal
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        # Bellman target; the bootstrap term is masked for terminal transitions
        tq = r + self.gamma * self.critic.predict_target(nx, nu) * (1 - t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", grad)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
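# DDPG.step drives exploration through Noise.ornstein_uhlenbeck_level, which
# advances an Ornstein-Uhlenbeck process whose state (self.ou_level) is carried
# across calls, giving temporally correlated action noise. The sketch below
# shows one such process; the constructor keywords (mu, theta, sigma, dt) are
# assumptions about what noise_params supplies and must match the actual
# config, since the repo's real Noise class is defined elsewhere.

import numpy as np


class Noise:
    """Sketch of an Ornstein-Uhlenbeck exploration-noise process."""

    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt

    def ornstein_uhlenbeck_level(self, level):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        level = np.asarray(level, dtype=np.float64)
        return (level
                + self.theta * (self.mu - level) * self.dt
                + self.sigma * np.sqrt(self.dt)
                * np.random.standard_normal(level.shape))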