def train_network(config: MuZeroConfig, storage: SharedStorage,
                  replay_buffer: ReplayBuffer):
    # Recover the latest network to be updated.
    network = storage.latest_network()
    # Exponential learning-rate decay based on how many steps the network
    # has already been trained for.
    learning_rate = config.lr_init * config.lr_decay_rate**(
        network.training_steps() / config.lr_decay_steps)
    network.optimiser.learning_rate = learning_rate

    for i in range(config.training_steps + 1):
        if i % config.checkpoint_interval == 0:
            storage.save_network(network.training_steps(), network)
        batch = replay_buffer.sample_batch(config.num_unroll_steps,
                                           config.td_steps,
                                           config.prediction_interval)
        loss = network.update_weights(batch, config.weight_decay,
                                      config.hidden_state_dampen)
        if i % 100 == 0:
            print((i, loss))

    storage.save_network(network.training_steps(), network)
    return i
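# train_network() above assumes a SharedStorage object exposing
# latest_network() and save_network(). A minimal sketch of such a store is
# given below; make_uniform_network() is a hypothetical placeholder for
# however this codebase constructs a fresh, untrained network.
class SharedStorage:
    def __init__(self):
        self._networks = {}  # maps training step -> network checkpoint

    def latest_network(self):
        if self._networks:
            # Return the checkpoint saved at the highest training step.
            return self._networks[max(self._networks.keys())]
        return make_uniform_network()  # hypothetical fallback

    def save_network(self, step, network):
        self._networks[step] = network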
class DrlAgent:
    def __init__(self, sess, is_train, dim_state, dim_action, num_paths,
                 actor_learn_rate, critic_learn_rate, tau, buffer_size,
                 mini_batch, ep_begin, epsilon_end, gamma, max_epoch,
                 seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch
        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)
        self.__replay = ReplayBuffer(buffer_size, seed)
        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)
        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(np.ones(dim_action))
        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original
        action = self.__explorer.get_act(action_original)
        # Store the transition produced by the previous action.
        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action
        if len(self.__replay) > self.__mini_batch:
            self.train()
        self.__step += 1
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)
        return action

    def train(self):
        batch_state, batch_action, batch_reward, batch_state_next = \
            self.__replay.sample_batch(self.__mini_batch)
        # Uniform importance weights (no prioritised replay).
        weights = np.expand_dims([1.0] * self.__mini_batch, axis=1)
        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)
        batch_y = []
        batch_error = []
        for k in range(len(batch_reward)):
            # One-step TD target; this buffer stores no terminal flags.
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)
        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)
        # Deterministic policy gradient: push the actor along dQ/da.
        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
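# DrlAgent only needs add(), __len__() and a sample_batch() that returns
# (states, actions, rewards, next_states). A minimal uniform-sampling buffer
# with that interface might look like the sketch below; this is an assumed
# stand-in, not the project's actual ReplayBuffer.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, buffer_size, seed=66):
        self._buffer = deque(maxlen=int(buffer_size))
        self._rng = random.Random(seed)

    def add(self, state, action, reward, next_state):
        self._buffer.append((state, action, reward, next_state))

    def __len__(self):
        return len(self._buffer)

    def sample_batch(self, batch_size):
        batch = self._rng.sample(self._buffer, batch_size)
        states, actions, rewards, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, next_states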
import os

import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Actor, Critic, OUNoise, ReplayBuffer, Monitor and Loop_handler are assumed
# to be project-local modules.


class DDPG:
    def __init__(self,
                 env=gym.make('Pendulum-v0'),
                 s_dim=3,  # Pendulum-v0 observes (cos(theta), sin(theta), theta_dot)
                 a_dim=1,
                 gamma=0.99,
                 episodes=100,
                 tau=0.001,
                 buffer_size=1e06,
                 minibatch_size=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(int(self.buffer_size))

    def update_target(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        self.target_actor.set_weights(
            np.array(self.actor.get_weights()) * self.TAU +
            np.array(self.target_actor.get_weights()) * (1 - self.TAU))
        self.target_critic.set_weights(
            np.array(self.critic.get_weights()) * self.TAU +
            np.array(self.target_critic.get_weights()) * (1 - self.TAU))
        # Equivalent variable-by-variable update, kept for reference:
        # new_weights = []
        # target_variables = self.target_critic.weights
        # for i, variable in enumerate(self.critic.weights):
        #     new_weights.append(variable * self.TAU +
        #                        target_variables[i] * (1 - self.TAU))
        # self.target_critic.set_weights(new_weights)
        # new_weights = []
        # target_variables = self.target_actor.weights
        # for i, variable in enumerate(self.actor.weights):
        #     new_weights.append(variable * self.TAU +
        #                        target_variables[i] * (1 - self.TAU))
        # self.target_actor.set_weights(new_weights)

    def train_step(self):
        s_batch, a_batch, r_batch, d_batch, s2_batch = \
            self.replay_buffer.sample_batch(self.minibatch_size)
        """
        # Alternative, more explicit version kept for reference:
        mu_prime = self.target_actor(s2_batch)              # target actor's actions
        Q_prime = self.target_critic([s2_batch, mu_prime])  # target critic's values
        y = np.zeros_like(Q_prime)
        for k in range(self.minibatch_size):
            if d_batch[k]:
                y[k] = r_batch[k]
            else:
                y[k] = r_batch[k] + self.GAMMA * Q_prime[k]
        # y = r_batch + gamma * Q_prime

        checkpoint_path = "training/cp_critic.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)
        # Callback that saves the critic's weights
        cp_callback1 = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_dir, save_weights_only=True, verbose=1)
        self.critic.train_on_batch([s_batch, a_batch], y)
        # self.critic.fit([s_batch, a_batch], y, verbose=0,
        #                 steps_per_epoch=8, callbacks=[cp_callback1])

        with tf.GradientTape(persistent=True) as tape:
            a = self.actor(s_batch)
            tape.watch(a)
            theta = self.actor.trainable_variables
            q = self.critic([s_batch, a])
        dq_da = tape.gradient(q, a)
        da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da)
        self.actor_opt.apply_gradients(zip(da_dtheta,
                                           self.actor.trainable_variables))
        """
        # Critic update: regress Q(s, a) towards the one-step TD target.
        # Note the terminal flags d_batch are not masked out here; for
        # Pendulum-v0 (fixed 200-step episodes) this has little effect.
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(s2_batch)
            y = r_batch + self.GAMMA * self.target_critic(
                [s2_batch, target_actions])
            critic_value = self.critic([s_batch, a_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_opt.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        # Actor update: maximize the value the critic assigns to the actor's
        # actions, i.e. minimize its negative.
        with tf.GradientTape() as tape:
            actions = self.actor(s_batch)
            q = self.critic([s_batch, actions])
            actor_loss = -tf.math.reduce_mean(q)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        self.update_target()
        return np.mean(q)

    def policy(self, s):
        # The actor's tanh output lies in [-1, 1], so scale it by the action
        # bound, then add exploration noise.
        if s.ndim == 1:
            s = s[None, :]
        action = self.actor(s) * self.upper_bound + self.ou_noise()
        action = np.clip(action, self.lower_bound, self.upper_bound)
        return action

    def train(self):
        ep_reward_list = []   # reward history of each episode
        avg_reward_list = []  # running average over the last few episodes
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        q = 0  # last mean Q-value, in case training has not started yet
        # Loop_handler lets us save weights cleanly even if Ctrl+C is pressed.
        with Loop_handler() as interruption:
            for eps in range(self.EPISODES):
                episode_reward = 0
                s = self.env.reset()
                # An env created with gym.make terminates after 200 steps anyway.
                for t in range(self.MAX_TIME_STEPS):
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim,)),
                                           np.reshape(a, (self.a_dim,)), r,
                                           done, np.reshape(s_, (self.s_dim,)))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of the last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(
                    eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

            self.save_weights(save_name=self.save_name)
            self.plot_results(avg_reward=avg_reward_list, train=True)

    def save_weights(self, save_name='final_weights'):
        self.actor.save_weights("training/%s_actor.h5" % save_name)
        self.critic.save_weights("training/%s_critic.h5" % save_name)
        self.target_actor.save_weights("training/%s_target_actor.h5" % save_name)
        self.target_critic.save_weights("training/%s_target_critic.h5" % save_name)
        # Also save in TensorFlow's native checkpoint format.
        self.target_actor.save_weights('training/%s_actor_weights' % save_name,
                                       save_format='tf')
        self.target_critic.save_weights('training/%s_critic_weights' % save_name,
                                        save_format='tf')
        print('Training completed and network weights saved')

    # For evaluation of the learned policy
    def collect_data(self, act_net, iterations=1000):
        a_all, states_all = [], []
        obs = self.env.reset()
        for t in range(iterations):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = self.env.step(a)
            states_all.append(obs)
            a_all.append(a)
            # self.env.render()  # Uncomment to watch the actor (not in a notebook)
            # if done:
            #     break
        states = np.squeeze(np.array(states_all))  # cos(theta), sin(theta), theta_dot
        a_all = np.squeeze(np.array(a_all))
        return states, a_all

    def plot_results(self, avg_reward=None, actions=None, states=None,
                     train=False, title=None):
        if train:
            # Visualize the average episodic rewards over training.
            plt.figure()
            plt.plot(avg_reward)
            plt.xlabel("Episode")
            plt.ylabel("Avg. Episodic Reward")
            plt.show()
        else:
            # Works only for the Pendulum-v0 environment.
            fig, ax = plt.subplots(3, sharex=True)
            theta = np.arctan2(states[:, 1], states[:, 0])
            ax[0].set_ylabel('u')
            ax[0].plot(np.squeeze(actions))
            ax[1].set_ylabel(r'$\theta$')
            ax[1].plot(theta)
            # ax[1].plot(states[:, 0])
            ax[2].set_ylabel(r'$\omega$')
            ax[2].plot(states[:, 2])  # angular velocity
            fig.canvas.set_window_title(title)
def train(sess, env, actor, critic):
    # Set up summary ops for TensorBoard logging.
    summary_ops, summary_vars = build_summaries()

    # Initialize TensorFlow variables.
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights.
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory.
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    for i in range(MAX_EPISODES):
        s = env.reset()
        episode_reward = 0
        episode_ave_max_q = 0

        # Ornstein-Uhlenbeck exploration noise, decayed exponentially
        # from 1 to 0 over EXPLORATION_TIME episodes.
        noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA,
                                          MAX_STEPS_EPISODE)
        noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME)

        for j in range(MAX_STEPS_EPISODE):
            if RENDER_ENV:
                env.render()

            # Add exploratory noise to the action during the exploration phase.
            if i < EXPLORATION_TIME:
                a = actor.predict(
                    np.reshape(s, (1, env.observation_space.shape[0]))) + noise[j]
            else:
                a = actor.predict(
                    np.reshape(s, (1, env.observation_space.shape[0])))

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, actor.state_dim),
                              np.reshape(a, actor.action_dim), r, terminal,
                              np.reshape(s2, actor.state_dim))

            # Keep adding experience to the memory until there are at least
            # minibatch-size samples.
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets.
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        # Terminal state: the target is the reward alone.
                        y_i.append(r_batch[k])
                    else:
                        # Otherwise: reward plus discounted target Q.
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                episode_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient.
                a_outs = actor.predict(s_batch)
                a_grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, a_grads[0])

                # Update target networks.
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            episode_reward += r

            if terminal or j == MAX_STEPS_EPISODE - 1:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward,
                                           summary_vars[1]: episode_ave_max_q
                                       })
                writer.add_summary(summary_str, i)
                writer.flush()
                # j + 1 steps were taken; this avoids dividing by zero when
                # an episode terminates on its first step.
                print('Reward: %.2i' % int(episode_reward), '| Episode', i,
                      '| Qmax: %.4f' % (episode_ave_max_q / float(j + 1)))
                break
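# The training loop above expects build_summaries() to return a merged
# summary op plus the two variables fed at episode end. A TF1-style sketch
# consistent with that usage (assumed, since the original helper is not
# shown here):
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)
    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars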
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import backend as K

# Actor, Critic and ReplayBuffer are assumed to be project-local modules.


class DDPGagent(object):
    def __init__(self, env):
        self.sess = tf.Session()
        K.set_session(self.sess)

        ## hyperparameters
        self.GAMMA = 0.95
        self.BATCH_SIZE = 64
        self.BUFFER_SIZE = 20000
        self.ACTOR_LEARNING_RATE = 0.0001
        self.CRITIC_LEARNING_RATE = 0.001
        self.TAU = 0.001

        self.env = env
        # state and action dimensions, and the action bound, from the env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        ## create actor and critic networks
        self.actor = Actor(self.sess, self.state_dim, self.action_dim,
                           self.action_bound, self.TAU,
                           self.ACTOR_LEARNING_RATE)
        self.critic = Critic(self.sess, self.state_dim, self.action_dim,
                             self.TAU, self.CRITIC_LEARNING_RATE)

        ## initialize for later gradient calculation
        self.sess.run(tf.global_variables_initializer())

        ## initialize replay buffer
        self.buffer = ReplayBuffer(self.BUFFER_SIZE)

        # episode rewards, saved for plotting
        self.save_epi_reward = []

    ## Ornstein-Uhlenbeck noise
    def ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
        return x + rho * (mu - x) * dt + \
            sigma * np.sqrt(dt) * np.random.normal(size=dim)

    ## computing TD target: y_k = r_k + gamma * Q(s_k+1, a_k+1)
    def td_target(self, rewards, q_values, dones):
        y_k = np.asarray(q_values)
        for i in range(q_values.shape[0]):  # iterate over the batch
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.GAMMA * q_values[i]
        return y_k

    ## train the agent
    def train(self, max_episode_num):
        # initially transfer model weights to the target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        for ep in range(int(max_episode_num)):
            # reset OU noise
            pre_noise = np.zeros(self.action_dim)
            # reset episode
            time, episode_reward, done = 0, 0, False
            # reset the environment and observe the first state
            state = self.env.reset()

            while not done:
                # visualize the environment
                # self.env.render()

                # pick an action: shape = (1,)
                action = self.actor.predict(state)
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                # clip the continuous action to stay within action_bound
                action = np.clip(action + noise, -self.action_bound,
                                 self.action_bound)
                # observe reward and new state
                next_state, reward, done, _ = self.env.step(action)
                # reshape Pendulum's reward from [-16, 0] to roughly [-1, 1]
                train_reward = (reward + 8) / 8
                # add the transition to the replay buffer
                self.buffer.add_buffer(state, action, train_reward,
                                       next_state, done)

                # start training once the buffer holds some transitions
                if self.buffer.buffer_size > 1000:
                    # sample transitions from the replay buffer
                    states, actions, rewards, next_states, dones = \
                        self.buffer.sample_batch(self.BATCH_SIZE)
                    # predict target Q-values
                    target_qs = self.critic.target_predict(
                        [next_states, self.actor.target_predict(next_states)])
                    # compute TD targets
                    y_i = self.td_target(rewards, target_qs, dones)
                    # train the critic on the sampled batch
                    self.critic.train_on_batch(states, actions, y_i)
                    # Q gradient w.r.t. the current policy's actions.
                    # Caution: use self.actor.model.predict, NOT
                    # self.actor.predict:
                    #   self.actor.model.predict(state) -> shape (1, 1)
                    #   self.actor.predict(state)       -> shape (1,), a gym action
                    s_actions = self.actor.model.predict(states)  # shape (batch, 1)
                    s_grads = self.critic.dq_da(states, s_actions)
                    dq_das = np.array(s_grads).reshape((-1, self.action_dim))
                    # train the actor
                    self.actor.train(states, dq_das)
                    # update both target networks
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                # update the current state
                pre_noise = noise
                state = next_state
                episode_reward += reward
                time += 1

            ## display rewards every episode
            print('Episode: ', ep + 1, 'Time: ', time, 'Reward: ',
                  episode_reward)
            self.save_epi_reward.append(episode_reward)

            ## save weights every episode
            self.actor.save_weights("./save_weights/pendulum_actor.h5")
            self.critic.save_weights("./save_weights/pendulum_critic.h5")

        ## save the reward history to file when done
        np.savetxt('./save_weights/pendulum_epi_reward.txt',
                   self.save_epi_reward)
        print(self.save_epi_reward)

    ## plot episode rewards
    def plot_result(self):
        plt.plot(self.save_epi_reward)
        plt.show()
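# A minimal driver for DDPGagent above; the environment choice and episode
# count are assumptions for illustration.
if __name__ == '__main__':
    import gym
    env = gym.make('Pendulum-v0')
    agent = DDPGagent(env)
    agent.train(max_episode_num=200)
    agent.plot_result()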
import numpy as np
import tensorflow as tf

# Actor, Critic, ReplayBuffer and OrnsteinUhlenbeckProcess are assumed to be
# project-local modules.


class DDPG:
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001  # TODO move these to configs
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 1e-6
        self.gamma = 0.99
        self.tau = 0.001
        self.buffer_size = 1000000
        self.batch_size = 128
        self.theta = 0.15
        self.ou = 0
        self.sigma = 0.3

        self.state_dim = self.env.observation_space.shape[0]
        # TODO derive this from self.env.action_space instead
        self.action_dim = len(self.low_action_bound_list)
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Create the ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        # dQ/da is fed in here and chained through the actor's weights;
        # the negation turns gradient descent into ascent on Q.
        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])
        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Create the CRITIC model
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = \
            critic_.create_critic_model()
        _, _, self.target_critic_model = critic_.create_critic_model()
        # dQ/da, used to train the actor.
        self.critic_grads = tf.gradients(self.critic_model.output,
                                         self.critic_action_input)

        self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
        self.noise.reset()

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'DDPG_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self, samples):
        current_states, actions, rewards, next_states, dones = samples
        predicted_actions = self.actor_model.predict(current_states)
        # dQ/da at the actor's own actions...
        grads = self.sess.run(self.critic_grads,
                              feed_dict={
                                  self.critic_state_input: current_states,
                                  self.critic_action_input: predicted_actions
                              })[0]
        # ...fed back through the actor's weights.
        self.sess.run(self.optimize,
                      feed_dict={
                          self.actor_state_input: current_states,
                          self.actor_critic_grad: grads
                      })

        if self.epsilon - self.epsilon_decay > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        self.noise.reset()

    def train_critic(self, samples):
        current_states, actions, rewards, next_states, dones = samples
        target_actions = self.target_actor_model.predict(next_states)
        target_q_values = self.target_critic_model.predict(
            [next_states, target_actions])
        # One-step TD targets, with terminal states masked out.
        targets = rewards + self.gamma * target_q_values * (1 - dones)
        self.critic_model.fit([current_states, actions], targets, verbose=0)

    def train(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            self.train_actor(samples)
            self.train_critic(samples)

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()
        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = (
                actor_model_weights[i] * self.tau +
                target_actor_model_weights[i] * (1.0 - self.tau))
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()
        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = (
                critic_model_weights[i] * self.tau +
                target_critic_model_weights[i] * (1.0 - self.tau))
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION
    def act(self, current_episode, current_state):
        noise = self.epsilon * self.noise.generate()
        # TODO add a linear mapping for asymmetric action spaces
        action = self.actor_model.predict(
            current_state) * self.high_action_bound_list + noise
        return np.clip(action, self.low_action_bound_list,
                       self.high_action_bound_list)
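# The agent above constructs OrnsteinUhlenbeckProcess(size=...) and calls
# reset() / generate(). A minimal sketch with that interface is given below;
# the default theta and sigma mirror the hyperparameters stored on the class,
# while dt is an assumption.
class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.3, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = \
            size, theta, mu, sigma, dt
        self.reset()

    def reset(self):
        # Restart the process at its long-run mean.
        self.x = np.ones(self.size) * self.mu

    def generate(self):
        # Mean-reverting random walk: drift toward mu plus Gaussian diffusion.
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt +
                  self.sigma * np.sqrt(self.dt) *
                  np.random.normal(size=self.size))
        return self.x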
import numpy as np
import torch

# Actor, Critic and ReplayBuffer are assumed to be project-local modules.


class Agent:
    def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], tau)
        self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.save_epi_reward = []

    def ou_noise(self, x, rho=0.15, mu=0., dt=1e-1, sigma=0.2, dim=1):
        # Ornstein-Uhlenbeck exploration noise.
        rho = torch.FloatTensor([rho])
        mu = torch.FloatTensor([mu])
        dt = torch.FloatTensor([dt])
        return x + rho * (mu - x) * dt + torch.sqrt(dt) * torch.normal(
            0., sigma, size=(dim,))

    def td_target(self, rewards, q_values, dones):
        y_k = torch.zeros(q_values.shape)
        for i in range(q_values.shape[0]):
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.gamma * q_values[i]
        return y_k

    def train(self, max_episode_num, save_path, save_names):
        self.actor.update_target_network()
        self.critic.update_target_network()

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)
            pre_noise = torch.zeros(self.action_dim)

            while not done:
                # self.env.render()
                action = self.actor.predict(state)[0]
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                action = np.array([action.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)
                next_state, reward, done, _ = self.env.step(action)

                next_state = torch.from_numpy(next_state).type(
                    torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                # reshape Pendulum's reward from [-16, 0] to roughly [-1, 1]
                train_reward = torch.FloatTensor([(reward + 8) / 8])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                # Keep the shaped reward (the original reassigned it from the
                # raw reward here, discarding the shaping).
                train_reward = train_reward.view(1, 1)

                self.buffer.add_buffer(state, action, train_reward,
                                       next_state, done)

                # start training once the buffer holds some transitions
                if self.buffer.buffer_size > 1000:
                    states, actions, rewards, next_states, dones = \
                        self.buffer.sample_batch(self.batch_size)

                    # TD targets from the target actor/critic.
                    actions_ = self.actor.target_predict(next_states)
                    actions_ = actions_.view(next_states.shape[0],
                                             self.action_dim)
                    target_qs = self.critic.target_predict(next_states,
                                                           actions_)
                    y_i = self.td_target(rewards, target_qs, dones)
                    self.critic.train(states, actions, y_i)

                    # The critic's Q-values of the actor's own actions serve
                    # as the (negated) policy objective inside actor.train.
                    s_actions = self.actor.predict(states)
                    policy_loss = self.critic.predict(states, s_actions)
                    self.actor.train(policy_loss)

                    self.actor.update_target_network()
                    self.critic.update_target_network()

                pre_noise = noise
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())
            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward(ave of recent20):',
                      np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
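# All the agents in this section rely on Polyak-averaged target networks. In
# PyTorch, the update_target_network() methods assumed above typically reduce
# to a helper like this sketch (the function name is illustrative):
def soft_update(net, target_net, tau):
    # target <- tau * online + (1 - tau) * target, parameter by parameter.
    with torch.no_grad():
        for param, target_param in zip(net.parameters(),
                                       target_net.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * param)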
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Actor, Critic and ReplayBuffer are assumed to be project-local modules.


class TD3:
    def __init__(self, env, sess, low_action_bound_list,
                 high_action_bound_list):
        self.env = env
        self.sess = sess
        self.low_action_bound_list = low_action_bound_list  # depends on the env
        self.high_action_bound_list = high_action_bound_list
        self.action_range_bound = [
            hi - lo for hi, lo in zip(self.high_action_bound_list,
                                      self.low_action_bound_list)
        ]
        self.learning_rate = 0.0001
        self.exploration_noise = 0.1
        self.gamma = 0.90
        self.tau = 0.01
        self.buffer_size = 10000
        self.batch_size = 128
        self.policy_noise = 0.1
        self.noise_clip = 0.05
        self.exploration_episodes = 10
        # self.policy_freq = 2

        self.state_dim = self.env.observation_space.shape[0]
        # TODO derive this from self.env.action_space instead
        self.action_dim = len(self.low_action_bound_list)
        self.continuous_action_space = True

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(self.buffer_size)

        # Create the ACTOR model
        actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
        self.actor_state_input, self.actor_model = actor_.create_actor_model()
        _, self.target_actor_model = actor_.create_actor_model()

        self.actor_critic_grad = tf.placeholder(tf.float32,
                                                [None, self.action_dim])
        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(self.actor_model.output,
                                        actor_model_weights,
                                        -self.actor_critic_grad)
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # Create the twin CRITIC model; the first head is the one we
        # train/optimize the actor against. loss=None assumes the model
        # defines its loss internally (e.g. via add_loss) from the target-Q
        # input.
        critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
        self.critic_state_input, self.critic_action_input, self.critic_model = \
            critic_.create_critic_model()
        self.critic_model.compile(optimizer=Adam(lr=critic_.learning_rate),
                                  loss=None)
        _, _, self.target_critic_model = critic_.create_critic_model()
        self.target_critic_model.compile(
            optimizer=Adam(lr=critic_.learning_rate), loss=None)
        self.critic_grads = tf.gradients(self.critic_model.output[0],
                                         self.critic_action_input)

        self.sess.run(tf.global_variables_initializer())

    def __repr__(self):
        return 'TD3_gamma{}_tau{}'.format(self.gamma, self.tau)

    # TRAINING FUNCTIONS
    def train_actor(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            current_states, actions, rewards, next_states, dones = samples
            # TODO create a linear mapping for asymmetric action spaces
            predicted_actions = self.actor_model.predict(
                current_states) * self.high_action_bound_list
            grads = self.sess.run(self.critic_grads,
                                  feed_dict={
                                      self.critic_state_input: current_states,
                                      self.critic_action_input: predicted_actions
                                  })[0]
            self.sess.run(self.optimize,
                          feed_dict={
                              self.actor_state_input: current_states,
                              self.actor_critic_grad: grads
                          })

    def train_critic(self):
        if self.replay_buffer.size() > self.batch_size:
            samples = self.replay_buffer.sample_batch(self.batch_size)
            current_states, actions, rewards, next_states, dones = samples
            target_actions = self.target_actor_model.predict(
                next_states) * self.high_action_bound_list

            # Target policy smoothing: perturb the target actions with
            # clipped noise, then clip back into the valid action range.
            noise = np.random.normal(
                size=len(self.action_range_bound)) * self.policy_noise
            clipped_noise = np.clip(noise, -self.noise_clip, self.noise_clip)
            target_actions = np.clip(target_actions + clipped_noise,
                                     self.low_action_bound_list,
                                     self.high_action_bound_list)

            # Clipped double-Q learning: take the minimum of the twin target
            # critics. The third input is the model's target-Q placeholder
            # and is fed dummy values here, since only the outputs are needed.
            target_q1_values, target_q2_values = \
                self.target_critic_model.predict([
                    next_states, target_actions,
                    np.random.rand(self.batch_size, 1)
                ])
            target_q_values = np.minimum(target_q1_values, target_q2_values)
            target_q = rewards + self.gamma * target_q_values * (1 - dones)

            # current_q1, current_q2 = self.critic_model.predict(
            #     [current_states, actions, np.random.rand(self.batch_size, 1)])
            history = self.critic_model.fit(
                [current_states, actions, target_q], verbose=0)
            # print('Loss: ', history.history['loss'])

    def train(self):
        # train_actor and train_critic each draw their own batch.
        if self.replay_buffer.size() > self.batch_size:
            self.train_actor()
            self.train_critic()

    # TARGET MODEL UPDATES
    def update_actor_target(self):
        actor_model_weights = self.actor_model.get_weights()
        target_actor_model_weights = self.target_actor_model.get_weights()
        for i in range(len(target_actor_model_weights)):
            target_actor_model_weights[i] = (
                actor_model_weights[i] * self.tau +
                target_actor_model_weights[i] * (1.0 - self.tau))
        self.target_actor_model.set_weights(target_actor_model_weights)

    def update_critic_target(self):
        critic_model_weights = self.critic_model.get_weights()
        target_critic_model_weights = self.target_critic_model.get_weights()
        for i in range(len(target_critic_model_weights)):
            target_critic_model_weights[i] = (
                critic_model_weights[i] * self.tau +
                target_critic_model_weights[i] * (1.0 - self.tau))
        self.target_critic_model.set_weights(target_critic_model_weights)

    def update_target_models(self):
        self.update_actor_target()
        self.update_critic_target()

    # ACTING FUNCTION with exploration
    def act(self, current_episode, current_state):
        if current_episode < self.exploration_episodes:
            # Pure random exploration during the first episodes.
            return np.random.uniform(
                self.low_action_bound_list,
                self.high_action_bound_list) * self.high_action_bound_list
        else:
            action = self.actor_model.predict(
                current_state) * self.high_action_bound_list + \
                np.random.normal(0, [
                    self.exploration_noise * hi
                    for hi in self.high_action_bound_list
                ])
            return np.clip(action, self.low_action_bound_list,
                           self.high_action_bound_list)
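# A minimal sketch of a driver loop for the TD3 agent above. The environment,
# the episode count, the action bounds and the ReplayBuffer.add signature are
# all assumptions for illustration only.
if __name__ == '__main__':
    import gym
    env = gym.make('Pendulum-v0')
    sess = tf.Session()
    agent = TD3(env, sess, low_action_bound_list=[-2.0],
                high_action_bound_list=[2.0])
    for episode in range(200):
        state = env.reset().reshape(1, -1)
        done = False
        while not done:
            action = agent.act(episode, state)
            next_state, reward, done, _ = env.step(action.flatten())
            # assumed signature: add(s, a, r, s2, done)
            agent.replay_buffer.add(state.flatten(), action.flatten(),
                                    reward, next_state, done)
            state = next_state.reshape(1, -1)
            agent.train()
            agent.update_target_models()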