class Actor_Critic():
    def __init__(self, env, sess):
        self.env = env
        self.sess = sess
        self.memory_buffer = Replay_Buffer(BUFFER_SIZE, BATCH_SIZE)
        self.learning_rate = LR
        self.tau = TAU
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.discount = 0.99
        self.Actor = Actor(self.env, self.sess, self.learning_rate, self.tau, self.discount)
        self.Critic = Critic(self.env, self.sess, self.learning_rate, self.tau, self.discount)

    def update_target(self):
        self.Actor.actor_target_update()
        self.Critic.critic_target_update()

    # def train(self):

    def save(self, prefixe):
        self.Actor.save(prefixe)
        self.Critic.save(prefixe)
        self.memory_buffer.save()
class DDPG(object):
    """Deep Deterministic Policy Gradient."""

    def __init__(self, n_state, n_action, a_bound, gamma=0.99, tau=0.01,
                 actor_lr=0.0005, critic_lr=0.001, noise_std=0.1,
                 noise_decay=0.9995, noise_decay_steps=1000, buffer_size=20000,
                 save_interval=5000, assess_interval=10, logger=None,
                 checkpoint_queen=None):
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.pointer = 0
        self.buffer_size = buffer_size
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state, self.n_action, gamma=gamma, lr=actor_lr, tau=tau, l2_reg=0)
        self.critic = Critic(self.n_state, self.n_action, gamma=gamma, lr=critic_lr, tau=tau, l2_reg=0)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__.lower()

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1', self.critic.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1', self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2', self.critic.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2', self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def policy_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """Use the Bellman equation to compute the critic target."""
        q_target = np.zeros_like(rewards)  # np.asarray(copy=False) vs np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.critic.gamma * q_nexts[i]
        return q_target

    def update_model(self, states, actions, q_values):
        # Train critic
        loss_names, loss_values = self.critic.train_on_batch(states, actions, q_values)
        # Train actor with the critic's action gradients dQ/da
        # p_actions = self.actor.predict(states)  # actions with no noise
        grad_ys = self.critic.gradients(states, self.actor.predict(states))  # (batch, n_action)
        actor_output = self.actor.train(states, self.actor.predict(states), grad_ys)
        # Soft-update the target networks
        self.actor.copy_weights()
        self.critic.copy_weights()
        return loss_names, loss_values, grad_ys, actor_output

    def save_weights(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_model(self, path, file):
        self.actor.model.save(os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic.model.save(os.path.join(path, self.prefix + '_critic_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4f}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            actor = os.path.join(path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            critic = os.path.join(path, self.prefix + '_critic_' + to_delete[1] + '.h5')
            os.remove(actor)
            os.remove(critic)
        if need_save:
            self.save_model(path, signature)

    def train(self, args, summary_writer, train_data=None, val_data=None, test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # note: None becomes array(None)
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs), desc='score', leave=True, unit=" epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir, args.reg_pattern, chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
            # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, args.batch_size, shuffle=True)

        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)
            a = self.policy_action(states)  # (batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            # a = np.clip(np.random.normal(a, self.noise_std), self.a_bound[0], self.a_bound[1])
            # a = np.clip(a + noise.generate(time, a.shape[0]), self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            r = np.where(labels == 1, llr.ravel(), -llr.ravel())  # (batch,)
            # q_nexts = self.critic.target_predict(new_states, self.actor.target_predict(new_states))
            q_ = self.bellman_q_value(rewards=r, q_nexts=0, dones=[True] * r.shape[0])  # (batch,)
            loss_names, loss_values, grad_ys, actor_output = self.update_model(states, a, q_.reshape(-1, 1))
            score = r.mean()

            if ((e + 1) % self.noise_decay_steps - 1) == 0:
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)

            if e % self.assess_interval == 0 or e == args.batchs - 1:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate

                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            summary_writer.add_summary(tf_summary(['mean-reward'], [score]), global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]), global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic.model.input[0]: states,
                    self.critic.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            self.logger.log_tabular('dQ/da', '%.4f+%.4f' % (grad_ys.mean(), grad_ys.std()))  # grad_ys: (batch, act_dim)
            self.logger.log_tabular('aout', '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, r.std()))
            self.logger.dump_tabular()

            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4f}'.format(self.noise_std),
                               max_val_rate='{:.4f}'.format(max_val_rate),
                               val_rate='{:.4f}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()
        return results
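# The per-sample loop in bellman_q_value above can be collapsed into a single
# vectorized NumPy expression. A minimal equivalent sketch (the function name
# below is illustrative, not part of the original class):
import numpy as np


def bellman_q_value_vectorized(rewards, q_nexts, dones, gamma):
    """q_target[i] = r[i] if done, else r[i] + gamma * q_next[i] (vectorized form)."""
    rewards = np.asarray(rewards, dtype=np.float64)
    q_nexts = np.broadcast_to(np.asarray(q_nexts, dtype=np.float64), rewards.shape)
    dones = np.asarray(dones, dtype=bool)
    return np.where(dones, rewards, rewards + gamma * q_nexts)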
class DDPG(object):
    """Deep Deterministic Policy Gradient (DDPG) helper class."""

    def __init__(self, action_dim, state_dim, batch_size, step, buffer_size,
                 train_indicator, episode, gamma, lra, lrc, tau, load_weight=True):
        """Initialization."""
        # Environment and algorithm parameters
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.step = step
        self.gamma = gamma
        self.lra = lra
        self.lrc = lrc
        self.tau = tau
        self.episode = episode
        self.train_indicator = train_indicator
        # Create actor and critic networks
        self.actor = Actor(state_dim, action_dim, batch_size, lra, tau)
        self.critic = Critic(state_dim, action_dim, batch_size, lrc, tau)
        self.buffer = MemoryBuffer(buffer_size)
        # NOTE: the weights folder must be specified and should contain only one set of actor/critic weights
        self.weights_dir_path = os.getcwd() + r"\saved_model\*.h5"
        if load_weight:
            try:
                weights_actor_path = ""
                weights_critic_path = ""
                weights_file_path = glob.glob(self.weights_dir_path)
                for file_path in weights_file_path:
                    if "actor" in file_path:
                        weights_actor_path = file_path
                    elif "critic" in file_path:
                        weights_critic_path = file_path
                self.load_weights(weights_actor_path, weights_critic_path)
                print("")
                print("Actor-Critic models are loaded with weights...")
                print("")
            except Exception:
                print("")
                print("Failed to load weights, please check the weights loading path...")
                print("")

    def policy_action(self, s):
        """Use the actor to predict an action."""
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """Use the Bellman equation to compute the critic target (one action only)."""
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state_old, action, reward, done, state_new):
        """Store experience in the memory buffer."""
        self.buffer.memorize(state_old, action, reward, done, state_new)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """Update actor and critic networks from sampled experience."""
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-value gradients under the current policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.action_dim)))
        # Transfer weights to the target networks at rate tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def run(self, env):
        # First, gather experience
        for e in range(self.episode):
            # Reset the episode and get the initial state
            loss, cumul_reward, cumul_loss = 0, 0, 0
            done = False
            state_old = env.get_vissim_state(1, 180 * 5, [45, 55, 60, 65, 70, 75, 80])  # TODO: make sure states are received correctly
            actions, states, rewards = [], [], []
            print("Episode: ", e, " ========================:")
            for t in range(self.step):
                action_original = self.policy_action(state_old)
                # TODO: OU function params?
                noise = OrnsteinUhlenbeckProcess(x0=action_original, size=self.action_dim)
                # action = action_orig + noise
                action = noise.apply_ou(t)
                # Clip actions that are too low or too high
                adj_action = np.zeros(len(action))
                for index, value in enumerate(action):
                    adj_action[index] = np.clip(value, -1, 1)
                # Map the clipped actions into the environment's action range
                transformed_action = Transformation.convert_actions(adj_action)
                reward, state_new = env.get_vissim_reward(180 * 5, transformed_action)
                # TODO: if the optimal discharging rate were known, it could define the terminal condition
                if t == self.step - 1:
                    # treat the manually set last step as done
                    done = True

                # ========================================================= training section
                if self.train_indicator:
                    # Add outputs to the memory buffer
                    self.memorize(state_old, adj_action, reward, done, state_new)
                    # Sample experience from the buffer
                    states_old, actions, rewards, dones, states_new = self.sample_batch(self.batch_size)
                    # Predict target q-values using the target networks
                    q_values = self.critic.target_predict([states_new, self.actor.target_predict(states_new)])
                    # Compute the critic target
                    critic_target = self.bellman(rewards, q_values, dones)
                    # Train both networks on the sampled batch and update the target networks
                    self.update_models(states_old, actions, critic_target)
                    # Calculate the loss (this trains the critic on the same batch a second time)
                    loss = self.critic.train_on_batch(states_old, actions, critic_target)
                    state_old = state_new
                    cumul_reward += reward
                    cumul_loss += loss
                # =========================================================

                # ========================================================= report
                print("|---> Step: ", t, " | Action: ", transformed_action,
                      " | Reward: ", reward, " | Loss: ", loss)
                # =========================================================

            # ========================================================= save model
            if np.mod(e, 10) == 0:
                print("====================> Saving model...")
                self.save_weights("./saved_model/")
                """
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                """
            # ========================================================= save model

            print("")
            print("*-------------------------------------------------*")
            print("Average Accumulated Reward: " + str(cumul_reward / self.step))
            print("Average Accumulated Loss: " + str(cumul_loss / self.step))
            print("*-------------------------------------------------*")
            print("")
            # garbage collection
            gc.collect()

    def save_weights(self, path):
        t = datetime.datetime.now()
        time = "_" + str(t.date()) + "_" + str(t.hour) + "h-" + str(t.minute) + "m"
        path_actor = path + '_LR_{}'.format(self.lra) + time
        path_critic = path + '_LR_{}'.format(self.lrc) + time
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load(path_actor)
        self.critic.load(path_critic)
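# The transfer_weights() calls above apply the soft target update "at rate tau",
# but their implementation is not shown in this snippet. A minimal Keras-style
# sketch of the conventional Polyak update they presumably perform (the helper
# name and arguments are illustrative assumptions, not the repo's actual API):
def soft_update(model, target_model, tau):
    """Polyak averaging: target <- tau * online + (1 - tau) * target."""
    new_weights = [tau * w + (1.0 - tau) * tw
                   for w, tw in zip(model.get_weights(), target_model.get_weights())]
    target_model.set_weights(new_weights)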
class TD3(object):
    """Twin Delayed Deep Deterministic Policy Gradient (TD3)."""

    def __init__(self, n_state, n_action, a_bound, discount=0.99, tau=0.05,
                 actor_lr=0.001, critic_lr=0.001, policy_freq=2,
                 exp_noise_std=0.1, noise_decay=0.9995, noise_decay_steps=1000,
                 smooth_noise_std=0.1, clip=0.2, buffer_size=20000,
                 save_interval=5000, assess_interval=20, logger=None,
                 checkpoint_queen=None):
        # self.__dict__.update(locals())
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = exp_noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.policy_freq = policy_freq
        self.smooth_noise_std = smooth_noise_std
        self.clip = clip
        self.discount = discount
        self.pointer = 0
        self.buffer = MemoryBuffer(buffer_size, with_per=True)
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state, self.n_action, gamma=discount, lr=actor_lr, tau=tau)
        self.critic1 = Critic(self.n_state, self.n_action, gamma=discount, lr=critic_lr, tau=tau)
        self.critic2 = Critic(self.n_state, self.n_action, gamma=discount, lr=critic_lr, tau=tau)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic1.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1', self.critic1.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1', self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2', self.critic1.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2', self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def select_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """Use the Bellman equation to compute the critic target."""
        q_target = np.zeros_like(rewards)  # np.asarray(copy=False) vs np.array(copy=True)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.discount * q_nexts[i]
        return q_target

    def memorize(self, state, action, reward, done, new_state):
        """Store experience in the memory buffer (with TD-error priority if enabled)."""
        if self.buffer.with_per:
            q_val = reward
            q_val_t = self.critic1.target_predict(state, action)
            td_error = abs(q_val_t - q_val)[0]
        else:
            td_error = 0
        state = state.reshape(-1)
        action = action.reshape(-1)
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_actor(self, states):
        actions = self.actor.predict(states)
        grad_ys = self.critic1.gradients(states, actions)
        actor_output = self.actor.train(states, actions, grad_ys)
        self.actor.copy_weights()
        self.critic1.copy_weights()
        self.critic2.copy_weights()
        return grad_ys, actor_output

    def update_critic(self, states, actions, q_values):
        loss_names, loss_values = self.critic1.train_on_batch(states, actions, q_values)
        self.critic2.train_on_batch(states, actions, q_values)
        return loss_names, loss_values

    def save_weights(self, path):
        self.actor.save(path)
        self.critic1.save(path)
        self.critic2.save(path)

    def save_model(self, path, file):
        self.actor.model.save(os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic1.model.save(os.path.join(path, self.prefix + '_critic1_' + file + '.h5'))
        self.critic2.model.save(os.path.join(path, self.prefix + '_critic2_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            delete_actor = os.path.join(path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            delete_critic1 = os.path.join(path, self.prefix + '_critic1_' + to_delete[1] + '.h5')
            delete_critic2 = os.path.join(path, self.prefix + '_critic2_' + to_delete[1] + '.h5')
            os.remove(delete_actor)
            os.remove(delete_critic1)
            os.remove(delete_critic2)
        if need_save:
            self.save_model(path, signature)

    def train(self, args, summary_writer, train_data=None, val_data=None, test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # note: None becomes array(None)
        # First, gather experience
        tqdm_e = tqdm(range(args.batchs), desc='score', leave=True, unit="epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir, args.reg_pattern, chunksize=args.batch_size)  # 100*(20+1)
            assert dataset.is_buffer_available, 'neither train_data nor csv buffer is available'
            # noise = OrnsteinUhlenbeckProcess(size=self.n_action)
        else:
            dataset = Dataset(train_data, 1, shuffle=True)
        warm_up = 20 * args.batch_size

        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)
            a = self.select_action(states)  # (batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            # rewards = np.where(labels == 1, llr.ravel(), -llr.ravel())  # (batch,)
            rewards = np.where(labels == 1,
                               np.where(llr > 0, llr.ravel(), 2 * llr.ravel()),
                               np.where(llr < 0, -llr.ravel(), -2 * llr.ravel()))  # (batch,)

            # a_ = self.actor.target_predict(next_states)
            # noise = np.clip(np.random.normal(0, self.smooth_noise_std), 0, self.clip)
            # a_ = a_ + noise
            # q_next1 = self.critic1.target_predict(new_states, a_)
            # q_next2 = self.critic2.target_predict(new_states, a_)
            # q_nexts = np.where(q_next1 < q_next2, q_next1, q_next2)

            self.memorize(states, a, rewards, True, None)
            if e < warm_up:
                continue
            states, a, rewards, _, _, _ = self.sample_batch(args.batch_size)
            q_ = self.bellman_q_value(rewards=rewards, q_nexts=0, dones=[True] * rewards.shape[0])  # (batch,)
            loss_names, loss_values = self.update_critic(states, a, q_.reshape(-1, 1))
            # Delayed policy update: train the actor only every policy_freq steps
            if e % self.policy_freq == 0 or e == warm_up:
                grad_ys, actor_output = self.update_actor(states)
            if ((e + 1) % self.noise_decay_steps - 1) == 0 or e == warm_up:
                self.noise_std *= self.noise_decay
                self.logger.log_tabular('noise', self.noise_std)

            if e % self.assess_interval == 0 or e == args.batchs - 1 or e == warm_up:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(val_y.ravel(), val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = val_rate if val_rate > max_val_rate else max_val_rate

                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(test_y, test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            score = rewards.mean()
            summary_writer.add_summary(tf_summary(['mean-reward'], [score]), global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]), global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic1.model.input[0]: states,
                    self.critic1.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            self.logger.log_tabular('dQ/da', '%.4f+%.4f' % (grad_ys.mean(), grad_ys.std()))  # grad_ys: (batch, act_dim)
            self.logger.log_tabular('aout', '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, rewards.std()))
            self.logger.dump_tabular()

            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4}'.format(self.noise_std),
                               max_val_rate='{:.4}'.format(max_val_rate),
                               val_rate='{:.4}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()
        return results
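# The commented-out block in train() above outlines TD3's clipped double-Q
# target, which this snippet never actually uses (it bootstraps with q_nexts=0).
# For reference, a minimal sketch of that target in the standard TD3 form, with
# symmetric noise clipping rather than the [0, clip] range in the comment; it
# assumes numpy imported as np and the target_predict methods used elsewhere in
# the class (the helper name below is illustrative):
def td3_target_q(actor, critic1, critic2, rewards, next_states, dones,
                 discount, smooth_noise_std, noise_clip, a_bound):
    """Clipped double-Q target with target-policy smoothing (illustrative sketch)."""
    a_next = actor.target_predict(next_states)
    # Target-policy smoothing: perturb the target action with clipped Gaussian noise
    noise = np.clip(np.random.normal(0, smooth_noise_std, size=a_next.shape),
                    -noise_clip, noise_clip)
    a_next = np.clip(a_next + noise, a_bound[0], a_bound[1])
    # Clipped double-Q: bootstrap from the smaller of the two target critics
    q_next = np.minimum(critic1.target_predict(next_states, a_next),
                        critic2.target_predict(next_states, a_next))
    not_done = 1.0 - np.asarray(dones, dtype=np.float64)
    return rewards + discount * not_done * q_next.ravel()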
class DDPGAgent:
    def __init__(self, state_space_dim, action_space_dim, min_action_val, max_action_val,
                 hidden_layer_size=512, gamma=0.99, tau=0.0001, path_to_load=None):
        self.gamma = gamma
        self.tau = tau
        self.min_action_val = min_action_val
        self.max_action_val = max_action_val
        self.buffer = Buffer(state_space_dim, action_space_dim)
        self.noise_generator = GaussianNoise(0., 0.2, action_space_dim)

        self.actor = Actor(state_space_dim, action_space_dim, max_action_val, hidden_layer_size)
        self.critic = Critic(state_space_dim, action_space_dim, hidden_layer_size)
        if path_to_load is not None:
            if os.path.exists(path_to_load + "_actor.h5") and \
                    os.path.exists(path_to_load + "_critic.h5"):
                self.load(path_to_load)

        self.target_actor = Actor(state_space_dim, action_space_dim, max_action_val, hidden_layer_size)
        self.target_critic = Critic(state_space_dim, action_space_dim, hidden_layer_size)
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())

        critic_lr = 0.002
        actor_lr = 0.001
        self.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
        self.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

    @tf.function
    def _apply_gradients(self, states, actions, next_states, rewards):
        # Critic update: minimize the TD error against the target networks
        with tf.GradientTape() as tape:
            target_actions = self.target_actor.forward(next_states)
            y = tf.cast(rewards, tf.float32) + self.gamma * self.target_critic.forward([next_states, target_actions])
            critic_value = self.critic.forward([states, actions])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.model.trainable_variables))

        # Actor update: maximize the critic's value of the actor's actions
        with tf.GradientTape() as tape:
            actions = self.actor.forward(states)
            critic_value = self.critic.forward([states, actions])
            actor_loss = -tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.model.trainable_variables))

    def learn(self):
        states, actions, next_states, rewards = self.buffer.sample()
        self._apply_gradients(states, actions, next_states, rewards)

    def remember_step(self, info):
        self.buffer.remember(info)

    def update_targets(self):
        # Soft update: target <- tau * online + (1 - tau) * target
        new_weights = []
        target_variables = self.target_critic.model.weights
        for i, variable in enumerate(self.critic.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] * (1 - self.tau))
        self.target_critic.model.set_weights(new_weights)

        new_weights = []
        target_variables = self.target_actor.model.weights
        for i, variable in enumerate(self.actor.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] * (1 - self.tau))
        self.target_actor.model.set_weights(new_weights)

    def get_best_action(self, state):
        tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        return tf.squeeze(self.actor.forward(tf_state)).numpy()

    def get_action(self, state):
        actions = self.get_best_action(state) + self.noise_generator.get_noise()
        return np.clip(actions, self.min_action_val, self.max_action_val)

    def save(self, path):
        print(f"Model has been saved as '{path}'")
        self.actor.save(path)
        self.critic.save(path)

    def load(self, path):
        print(f"Model has been loaded from '{path}'")
        self.actor.load(path)
        self.critic.load(path)
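# A minimal sketch of how DDPGAgent could be driven against a Gym-style
# environment (classic pre-0.26 Gym API). The environment id, episode count,
# and the (state, action, next_state, reward) tuple layout passed to
# remember_step are assumptions for illustration, not part of the class above:
if __name__ == "__main__":
    import gym

    env = gym.make("Pendulum-v1")  # placeholder environment
    agent = DDPGAgent(state_space_dim=env.observation_space.shape[0],
                      action_space_dim=env.action_space.shape[0],
                      min_action_val=env.action_space.low[0],
                      max_action_val=env.action_space.high[0])

    for episode in range(100):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)               # noisy action for exploration
            next_state, reward, done, _ = env.step(action)
            agent.remember_step((state, action, next_state, reward))
            agent.learn()                                  # one gradient step on a sampled batch
            agent.update_targets()                         # Polyak-average the target networks
            state = next_state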
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)
        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)
        # Hard-copy the online weights into the targets on construction
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation, dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        # Add noise to the action for exploration
        mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation, dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # Not enough data in the replay buffer yet
            return
        # Sample a random batch of transitions
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state, dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        # Bellman target; for this to be correct at terminal states, `done` must
        # already be stored as a 0/1 mask that zeroes the bootstrap term.
        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = torch.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        # Critic update
        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        # Actor update
        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        # Soft update: target <- tau * online + (1 - tau) * target
        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                                      (1 - tau) * target_critic_dict[name].clone()
        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                                     (1 - tau) * target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic, fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
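# A minimal sketch of a training loop for the PyTorch Agent above, again with a
# Gym-style environment (classic pre-0.26 API). The environment id, layer sizes,
# learning rates, and episode count are placeholder assumptions for illustration:
if __name__ == "__main__":
    import gym

    env = gym.make("Pendulum-v1")  # placeholder environment
    agent = Agent(n_states=env.observation_space.shape[0],
                  n_actions=env.action_space.shape[0],
                  lr_actor=1e-4, lr_critic=1e-3, tau=0.001, gamma=0.99,
                  mem_size=100000, actor_l1_size=400, actor_l2_size=300,
                  critic_l1_size=400, critic_l2_size=300, batch_size=64)

    for episode in range(500):
        state = env.reset()
        done, score = False, 0.0
        while not done:
            action = agent.choose_action(state)            # OU-noise-perturbed action
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            agent.learn()                                  # no-op until the buffer holds a full batch
            score += reward
            state = next_state
        print(f"episode {episode}: score {score:.1f}")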