def test_critic_B():
    length = 100000
    val_1 = 0.3
    val_2 = 0.5
    critic = Critic.init_model(2, 2, 100)
    critic.model.compile(optimizer='adam',
                         loss=tf.keras.losses.MeanSquaredError())
    x_train = []
    y_train = []
    for _ in range(length):
        if random() > 0.5:
            x_train.append([1, 0])
            y_train.append(np.random.normal(val_1, 0.5, 1)[0])
        else:
            x_train.append([0, 1])
            y_train.append(np.random.normal(val_2, 0.5, 1)[0])
    pairs = list(zip(x_train, y_train))
    shuffle(pairs)
    x_train = np.array([a for a, _ in pairs])
    y_train = np.array([b for _, b in pairs])
    critic.model.fit(x_train, y_train, epochs=5)
    r = critic.get_q(np.array([1, 0]))
    print('estimate:', r.numpy()[0][0],
          'abs error:', abs(r - val_1).numpy()[0][0])
    r = critic.get_q(np.array([0, 1]))
    print('estimate:', r.numpy()[0][0],
          'abs error:', abs(r - val_2).numpy()[0][0])
def test_critic_A():
    length = 200000
    val = 0.3
    critic = Critic.init_model(2, 4, 100)
    critic.model.compile(optimizer='adam',
                         loss=tf.keras.losses.MeanSquaredError())
    x_train = np.array([[0.7, 0.1, 0, 0.5]] * length)
    y_train = np.random.normal(val, 1, length)
    critic.model.fit(x_train, y_train, epochs=5)
    r = critic.get_q(np.array([0.7, 0.1, 0, 0.5]))
    print('estimate:', r.numpy()[0][0])
    assert abs(r - val) < 0.01
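# Illustrative only: a small driver for the two critic sanity checks above.
# The seeds and the __main__ guard are not part of the original tests; they
# are added here so the learned estimates are roughly reproducible.
if __name__ == '__main__':
    np.random.seed(0)
    tf.random.set_seed(0)
    test_critic_A()
    test_critic_B()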
class Trainer:

    def __init__(self, critic=None):
        self.env = gym.make('CartPole-v0')
        self.Q = Critic(model=critic) if critic else \
            Critic.init_model(2,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.n,
                              64)
        self.opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.actions_dim = self.env.action_space.n
        self.variables = self.Q.model.trainable_variables
        self.discount_factor = 0.99
        self.e = 0.01
        self.episode_length = 0
        self.states = []
        self.rewards = []
        self.gradients = []
        self.actions = []

    def record_episode(self, iterations):
        done = False
        self.episode_length = 0
        state = self.env.reset()
        iterations = 200 if iterations is None else iterations
        while not done and self.episode_length < iterations:
            self.episode_length += 1
            with tf.GradientTape() as tape:
                # greedy action from the current Q estimates
                action_Qs = self.get_action_vals(state)
                action = np.argmax(action_Qs)
                Q_estimate = action_Qs[action]
                state, reward, done, _ = self.env.step(action)
                reward = 1 if not done else -1
                # one-step TD target; future value is cut off at terminal states
                future_Q = max(tf.stop_gradient(self.get_action_vals(state)))
                final_state = 0 if done else 1
                target = reward + self.discount_factor * future_Q * final_state
                q_loss = tf.math.pow(Q_estimate - target, 2)
            grads = tape.gradient(q_loss, self.variables)
            self.opt.apply_gradients(zip(grads, self.variables))
        self.env.close()
        return self.episode_length

    def get_action_vals(self, state):
        Q = []
        all_actions_one_hot = tf.one_hot(list(range(self.actions_dim)),
                                         self.actions_dim)
        for one_hot_action in all_actions_one_hot:
            Q_a = self.Q.get_Q(np.array(state), one_hot_action)
            Q.append(Q_a)
        return Q
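# A minimal training-loop sketch (not part of the original listing): it drives
# the Q-learning Trainer above by calling record_episode repeatedly and prints
# a running average of episode lengths. The episode count and the averaging
# window are illustrative values.
def run_q_training(num_episodes=500):
    trainer = Trainer()
    lengths = []
    for episode in range(num_episodes):
        # record_episode both collects experience and applies the TD updates
        lengths.append(trainer.record_episode(iterations=200))
        if (episode + 1) % 20 == 0:
            recent = lengths[-20:]
            print('episode', episode + 1,
                  'mean length over last 20 episodes:',
                  sum(recent) / len(recent))
    return trainer, lengths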
def test_critic():
    critic = Critic.init_model(2, 4, 100)
    one_hot_action = tf.one_hot([1], 2)
    Q = critic.get_V(np.array([0, 0]), one_hot_action)
    print(Q)
class Trainer:

    def __init__(self, critic=None, actor=None):
        self.env = gym.make('CartPole-v0')
        self.actor = Actor(model=actor) if actor else \
            Actor.init_model(2,
                             self.env.observation_space.shape[0],
                             64,
                             self.env.action_space.n)
        self.critic = Critic(model=critic) if critic else \
            Critic.init_model(2,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.n,
                              64)
        self.actor_variables = self.actor.model.trainable_variables
        self.critic_variables = self.critic.model.trainable_variables
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.critic_opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.actions_dim = self.env.action_space.n
        self.discount_factor = 0.99
        self.e = 0.01
        self.episode_length = 0
        self.states = []
        self.rewards = []
        self.gradients = []
        self.actions = []

    def record_episode(self, iterations):
        done = False
        self.episode_length = 0
        state = self.env.reset()
        iterations = 200 if iterations is None else iterations
        while not done and self.episode_length < iterations:
            self.episode_length += 1
            with tf.GradientTape() as actor_tape, \
                    tf.GradientTape() as critic_tape:
                policy = self.actor.get_policy(state)
                action = self.sample_action(policy)
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=policy, labels=np.array([action]))
                # quality of this action under the current critic
                q_val = self.get_action_q_value(state, action)
                state, reward, done, _ = self.env.step(action)
                reward = 1 if not done else -1
                # one-step TD target for the critic
                future_Q = tf.stop_gradient(self.get_future_Q(state))
                finished = 0 if done else 1
                target = reward + self.discount_factor * future_Q * finished
                temp_diff = q_val - target
            actor_grads = actor_tape.gradient(loss, self.actor_variables)
            critic_grads = critic_tape.gradient(q_val, self.critic_variables)
            # first update the actor, weighting the policy gradient
            # by the critic's value of the chosen action
            actor_grads = [grad * q_val[0] for grad in actor_grads]
            self.actor_opt \
                .apply_gradients(zip(actor_grads, self.actor_variables))
            # then update the critic towards the TD target, weighting
            # the gradient of Q by the temporal-difference error
            critic_grads = [grad * temp_diff[0] for grad in critic_grads]
            self.critic_opt \
                .apply_gradients(zip(critic_grads, self.critic_variables))
        self.env.close()
        return self.episode_length

    def sample_action(self, policy):
        if np.random.rand(1) < self.e:
            action = self.env.action_space.sample()
        else:
            soft_max_prob = tf.nn.softmax(policy)
            action = np.random \
                .choice([0, 1], p=soft_max_prob.numpy()[0])
        return action

    def get_action_q_value(self, state, action):
        one_hot_action = tf.one_hot(action, self.actions_dim)
        return self.critic.get_Q(np.array(state), one_hot_action)

    def get_q_values(self, state):
        action_Qs = []
        all_actions_one_hot = tf.one_hot(list(range(self.actions_dim)),
                                         self.actions_dim)
        for one_hot_action in all_actions_one_hot:
            Q_a = self.critic.get_Q(np.array(state), one_hot_action)
            action_Qs.append(Q_a)
        return action_Qs

    def get_future_Q(self, state):
        return max(self.get_q_values(state))
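# A hedged sketch of how the actor=/critic= constructor arguments above could
# be used to resume training from previously saved Keras models. The file
# names are placeholders; it assumes the Actor/Critic wrappers accept a
# tf.keras model through their model= argument, as the constructor suggests.
def resume_actor_critic_training(actor_path='actor.h5',
                                 critic_path='critic.h5',
                                 num_episodes=100):
    actor_model = tf.keras.models.load_model(actor_path)
    critic_model = tf.keras.models.load_model(critic_path)
    trainer = Trainer(critic=critic_model, actor=actor_model)
    for _ in range(num_episodes):
        trainer.record_episode(iterations=200)
    # persist the updated weights again using the standard Keras API
    trainer.actor.model.save(actor_path)
    trainer.critic.model.save(critic_path)
    return trainer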
class Trainer:

    def __init__(self, tau=0.05, burn_in_eps=30, critic=None, actor=None):
        self.env = gym.make('LunarLanderContinuous-v2')
        self.memory = Memory(batch_size=120)
        self.tau = tau
        self.burn_in_eps = burn_in_eps
        self.eps = 0
        self.actions_dim = self.env.action_space.shape[0]
        self.high_action = 1
        self.low_action = -1
        self.discount_factor = 0.99
        self.episode_length = 0
        self.actor_learning_rate = 0.00001
        self.critic_learning_rate = 0.00001
        self.exploration_value = 0.2
        self.actor = ContinuousActor(model=actor) if actor else \
            ContinuousActor.init_model(2,
                                       self.env.observation_space.shape[0],
                                       400,
                                       self.env.action_space.shape[0])
        self.critic = Critic(model=critic) if critic else \
            Critic.init_model(2,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0],
                              400)
        if actor:
            self.target_actor = ContinuousActor(model=actor)
        else:
            self.target_actor = ContinuousActor.init_model(
                2,
                self.env.observation_space.shape[0],
                400,
                self.env.action_space.shape[0])
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        if critic:
            self.target_critic = Critic(model=critic)
        else:
            self.target_critic = Critic.init_model(
                2,
                self.env.observation_space.shape[0]
                + self.env.action_space.shape[0],
                400)
        self.target_critic.model\
            .set_weights(self.critic.model.get_weights())
        self.actor_variables = self.actor.model.trainable_variables
        self.critic_variables = self.critic.model.trainable_variables
        self.actor_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.actor_learning_rate)
        self.critic_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.critic_learning_rate)

    def run_episode(self):
        done = False
        self.eps = self.eps + 1
        self.episode_length = 0
        state = self.env.reset()
        reward_sum = 0
        success_count = 0
        while not done:
            self.episode_length += 1
            action = self.actor.get_action(state)
            # Gaussian exploration noise, clipped to the valid action range
            action = action + tf.random\
                .normal([2], mean=0.0,
                        stddev=self.exploration_value,
                        dtype=tf.dtypes.float64)
            action = tf.clip_by_value(action,
                                      clip_value_min=self.low_action,
                                      clip_value_max=self.high_action)
            next_state, reward, done, _ = self.env.step(action)
            self.memory.remember(state, action, reward, done, next_state)
            reward_sum = reward_sum + reward
            success = self.train()
            if success:
                success_count = success_count + 1
            state = next_state
        self.env.close()
        self.memory.check()
        return reward_sum/self.episode_length, \
            success_count/self.episode_length, \
            self.episode_length

    def train(self):
        success = True
        if self.memory.full():
            success = False
            states, actions, rewards, done, next_states = self.memory.sample()
            with tf.GradientTape() as actor_tape, \
                    tf.GradientTape() as critic_tape:
                Q_loss = self\
                    .Q_loss(states, actions, rewards, next_states, done)
                if self.eps > self.burn_in_eps:
                    action_loss = tf.math.negative(self.action_loss(states))
            if self.eps > self.burn_in_eps:
                actor_grads = actor_tape\
                    .gradient(action_loss, self.actor_variables)
                self.actor_opt \
                    .apply_gradients(zip(actor_grads, self.actor_variables))
            critic_grads = critic_tape\
                .gradient(Q_loss, self.critic_variables)
            self.critic_opt \
                .apply_gradients(zip(critic_grads, self.critic_variables))
            if self.eps > self.burn_in_eps:
                # re-evaluate the actor objective after the update
                updated_action_loss = tf.math\
                    .negative(self.action_loss(states)).numpy()
                if updated_action_loss > action_loss:
                    success = True
            # slowly track the online networks with the target networks
            self.target_actor.track_weights(self.tau, self.actor.model)
            self.target_critic.track_weights(self.tau, self.critic.model)
        return success

    def Q_loss(self, states, actions, rewards, next_states, done):
        next_actions = self.target_actor.model(next_states)
        Q_input = tf.concat([next_states, next_actions], axis=1)
        y = rewards[:, None] + self.discount_factor*(1-done)*self\
            .target_critic.model(Q_input)
        Q_input = tf.concat([states, actions], axis=1)
        td_error = tf.stop_gradient(y) - self.critic.model(Q_input)
        squared_error = tf.pow(td_error, 2)
        return tf.reduce_mean(squared_error)

    def action_loss(self, states):
        actions = self.actor.model(states)
        Q_input = tf.concat([states, actions], axis=1)
        mean = tf.reduce_mean(self.critic.model(Q_input))
        return mean
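# A minimal driver sketch for the DDPG-style Trainer above (not part of the
# original listing). run_episode returns the average reward per step, the
# fraction of training steps counted as successful, and the episode length;
# the episode count below is illustrative.
def run_ddpg_training(num_episodes=300):
    trainer = Trainer(tau=0.05, burn_in_eps=30)
    history = []
    for episode in range(num_episodes):
        avg_reward, success_rate, length = trainer.run_episode()
        history.append((avg_reward, success_rate, length))
        print('episode', episode + 1,
              'avg reward/step:', round(float(avg_reward), 3),
              'success rate:', round(float(success_rate), 2),
              'length:', length)
    return trainer, history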
class Trainer:

    def __init__(self, tau=0.005, burn_in_eps=0,
                 critics=(None, None), actor=None):
        self.env = gym.make('LunarLanderContinuous-v2')
        self.memory = Memory(batch_size=64)
        self.tau = tau
        self.burn_in_eps = burn_in_eps
        self.eps = 0
        self.actions_dim = self.env.action_space.shape[0]
        self.discount_factor = 0.999
        self.episode_length = 0
        self.actor_learning_rate = 0.00001
        self.critic_learning_rate = 0.0001
        self.exploration_value = 0.2
        self.smoothing_var = 0.05
        self.clipping_val = 0.4
        self.low_action = -1
        self.high_action = 1
        self.policy_freq = 4
        self.actor_hidden_layers = 2
        self.critic_hidden_layers = 2
        self.layer_units = 200
        self.max_ep_steps = 300
        self.actor = ContinuousActor(model=actor) if actor else \
            ContinuousActor.init_model(self.actor_hidden_layers,
                                       self.env.observation_space.shape[0],
                                       self.layer_units,
                                       self.env.action_space.shape[0])
        self.critic_1 = Critic(model=critics[0]) if all(critics) else \
            Critic.init_model(self.critic_hidden_layers,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0],
                              self.layer_units)
        self.critic_2 = Critic(model=critics[1]) if all(critics) else \
            Critic.init_model(self.critic_hidden_layers,
                              self.env.observation_space.shape[0]
                              + self.env.action_space.shape[0],
                              self.layer_units)
        self.target_actor = ContinuousActor \
            .init_model(self.actor_hidden_layers,
                        self.env.observation_space.shape[0],
                        self.layer_units,
                        self.env.action_space.shape[0])
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic_1 = Critic \
            .init_model(self.critic_hidden_layers,
                        self.env.observation_space.shape[0]
                        + self.env.action_space.shape[0],
                        self.layer_units)
        self.target_critic_1.model\
            .set_weights(self.critic_1.model.get_weights())
        self.target_critic_2 = Critic \
            .init_model(self.critic_hidden_layers,
                        self.env.observation_space.shape[0]
                        + self.env.action_space.shape[0],
                        self.layer_units)
        self.target_critic_2.model\
            .set_weights(self.critic_2.model.get_weights())
        self.actor_variables = self.actor.model.trainable_variables
        self.critic_1_variables = self.critic_1.model.trainable_variables
        self.critic_2_variables = self.critic_2.model.trainable_variables
        self.actor_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.actor_learning_rate)
        self.critic_opt = tf.keras.optimizers\
            .Adam(learning_rate=self.critic_learning_rate)

    def run_episode(self):
        done = False
        self.eps = self.eps + 1
        self.episode_length = 0
        state = self.env.reset()
        reward_sum = 0
        while not done and self.episode_length < self.max_ep_steps:
            self.episode_length += 1
            action = self.actor.get_action(state)
            # Gaussian exploration noise, clipped to the valid action range
            action = action + tf.random\
                .normal([2], mean=0.0,
                        stddev=self.exploration_value,
                        dtype=tf.dtypes.float64)
            action = tf.clip_by_value(action,
                                      clip_value_min=self.low_action,
                                      clip_value_max=self.high_action)
            next_state, reward, done, _ = self.env.step(action)
            self.memory.remember(state, action, reward, done, next_state)
            reward_sum = reward_sum + reward
            self.train()
            state = next_state
        self.env.close()
        return reward_sum, self.episode_length

    def train(self):
        if self.memory.full():
            states, actions, rewards, done, next_states = self.memory.sample()
            with tf.GradientTape() as actor_tape, \
                    tf.GradientTape() as critic_tape_1, \
                    tf.GradientTape() as critic_tape_2:
                y = self.compute_target(states, actions, rewards,
                                        next_states, done)
                Q_input = tf.concat([states, actions], axis=1)
                td_err_1 = tf.stop_gradient(y) - self.critic_1.model(Q_input)
                squared_error_1 = tf.pow(td_err_1, 2)
                Q_loss_1 = tf.reduce_mean(squared_error_1)
                td_err_2 = tf.stop_gradient(y) - self.critic_2.model(Q_input)
                squared_error_2 = tf.pow(td_err_2, 2)
                Q_loss_2 = tf.reduce_mean(squared_error_2)
                if self.update_policy:
                    action_loss = tf.math.negative(self.action_loss(states))
            critic_1_grads = critic_tape_1\
                .gradient(Q_loss_1, self.critic_1_variables)
            critic_2_grads = critic_tape_2\
                .gradient(Q_loss_2, self.critic_2_variables)
            self.critic_opt \
                .apply_gradients(zip(critic_1_grads, self.critic_1_variables))
            self.critic_opt \
                .apply_gradients(zip(critic_2_grads, self.critic_2_variables))
            if self.update_policy:
                # delayed policy and target-network updates
                actor_grads = actor_tape\
                    .gradient(action_loss, self.actor_variables)
                self.actor_opt \
                    .apply_gradients(zip(actor_grads, self.actor_variables))
                self.target_actor.track_weights(self.tau, self.actor.model)
                self.target_critic_1 \
                    .track_weights(self.tau, self.critic_1.model)
                self.target_critic_2 \
                    .track_weights(self.tau, self.critic_2.model)
            if self.update_policy:
                return Q_loss_1.numpy(), Q_loss_2.numpy(), action_loss
            else:
                return Q_loss_1.numpy(), Q_loss_2.numpy(), 0
        return 0, 0, 0

    @property
    def update_policy(self):
        return self.eps > self.burn_in_eps and \
            self.episode_length % self.policy_freq == 0

    def compute_target(self, states, actions, rewards, next_states, done):
        next_actions = self.target_actor.model(next_states)
        # target policy smoothing: clipped noise added to the target action
        smoothing_noise = tf.random\
            .normal(actions.shape, mean=0.0,
                    stddev=self.smoothing_var,
                    dtype=tf.dtypes.float64)
        clipped_smoothing_noise = \
            tf.clip_by_value(smoothing_noise,
                             clip_value_min=-self.clipping_val,
                             clip_value_max=self.clipping_val)
        next_actions = tf.clip_by_value(clipped_smoothing_noise
                                        + next_actions,
                                        clip_value_min=self.low_action,
                                        clip_value_max=self.high_action)
        Q_input = tf.concat([next_states, next_actions], axis=1)
        # clipped double-Q: take the smaller of the two target critics
        Q_1_val = self.target_critic_1.model(Q_input)
        Q_2_val = self.target_critic_2.model(Q_input)
        Q_val = tf.math.minimum(Q_1_val, Q_2_val)
        y = rewards[:, None] + self.discount_factor * (1 - done) * Q_val
        return y

    def action_loss(self, states):
        actions = self.actor.model(states)
        Q_input = tf.concat([states, actions], axis=1)
        mean = tf.reduce_mean(self.critic_1.model(Q_input))
        return mean
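# A hedged evaluation sketch for the TD3-style Trainer above: it rolls out the
# learned actor without exploration noise so the return reflects the policy
# itself. It relies only on trainer.actor.get_action and the environment the
# trainer already holds; the episode cap mirrors max_ep_steps.
def evaluate_policy(trainer, episodes=5):
    returns = []
    for _ in range(episodes):
        state = trainer.env.reset()
        done = False
        total = 0
        steps = 0
        while not done and steps < trainer.max_ep_steps:
            # deterministic action, still clipped to the valid range
            action = tf.clip_by_value(trainer.actor.get_action(state),
                                      clip_value_min=trainer.low_action,
                                      clip_value_max=trainer.high_action)
            state, reward, done, _ = trainer.env.step(action)
            total += reward
            steps += 1
        returns.append(total)
    return sum(returns) / len(returns)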