import os

import numpy as np
import tensorflow as tf

# Project-local helpers assumed below: Buffer, GaussianNoise, Actor, Critic.


class DDPGAgent:
    def __init__(self, state_space_dim, action_space_dim, min_action_val,
                 max_action_val, hidden_layer_size=512, gamma=0.99,
                 tau=0.0001, path_to_load=None):
        self.gamma = gamma
        self.tau = tau
        self.min_action_val = min_action_val
        self.max_action_val = max_action_val

        self.buffer = Buffer(state_space_dim, action_space_dim)
        self.noise_generator = GaussianNoise(0., 0.2, action_space_dim)

        self.actor = Actor(state_space_dim, action_space_dim,
                           max_action_val, hidden_layer_size)
        self.critic = Critic(state_space_dim, action_space_dim, hidden_layer_size)

        if path_to_load is not None:
            if os.path.exists(path_to_load + "_actor.h5") and \
                    os.path.exists(path_to_load + "_critic.h5"):
                self.load(path_to_load)

        # Target networks start as exact copies of the online networks.
        self.target_actor = Actor(state_space_dim, action_space_dim,
                                  max_action_val, hidden_layer_size)
        self.target_critic = Critic(state_space_dim, action_space_dim,
                                    hidden_layer_size)
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())

        critic_lr = 0.002
        actor_lr = 0.001
        self.critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
        self.actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

    @tf.function
    def _apply_gradients(self, states, actions, next_states, rewards):
        # Critic update: minimize the TD error against the bootstrapped
        # target (the buffer stores no done flag, so there is no terminal mask).
        with tf.GradientTape() as tape:
            target_actions = self.target_actor.forward(next_states)
            y = tf.cast(rewards, tf.float32) + self.gamma * self.target_critic.forward(
                [next_states, target_actions])
            critic_value = self.critic.forward([states, actions])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.model.trainable_variables))

        # Actor update: ascend the critic's valuation of the actor's actions.
        with tf.GradientTape() as tape:
            actions = self.actor.forward(states)
            critic_value = self.critic.forward([states, actions])
            actor_loss = -tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.model.trainable_variables))

    def learn(self):
        states, actions, next_states, rewards = self.buffer.sample()
        self._apply_gradients(states, actions, next_states, rewards)

    def remember_step(self, info):
        self.buffer.remember(info)

    def update_targets(self):
        # Polyak-average the online weights into the target networks:
        # target <- tau * online + (1 - tau) * target.
        new_weights = []
        target_variables = self.target_critic.model.weights
        for i, variable in enumerate(self.critic.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] * (1 - self.tau))
        self.target_critic.model.set_weights(new_weights)

        new_weights = []
        target_variables = self.target_actor.model.weights
        for i, variable in enumerate(self.actor.model.weights):
            new_weights.append(variable * self.tau + target_variables[i] * (1 - self.tau))
        self.target_actor.model.set_weights(new_weights)

    def get_best_action(self, state):
        tf_state = tf.expand_dims(tf.convert_to_tensor(state), 0)
        return tf.squeeze(self.actor.forward(tf_state)).numpy()

    def get_action(self, state):
        # Exploration: perturb the deterministic action with Gaussian noise,
        # then clip back into the valid action range.
        actions = self.get_best_action(state) + self.noise_generator.get_noise()
        return np.clip(actions, self.min_action_val, self.max_action_val)

    def save(self, path):
        print(f"Model has been saved as '{path}'")
        self.actor.save(path)
        self.critic.save(path)

    def load(self, path):
        print(f"Model has been loaded from '{path}'")
        self.actor.load(path)
        self.critic.load(path)
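
# --- Usage sketch (illustrative, not from the original file) ---
# A minimal training loop for DDPGAgent, assuming a classic Gym-style
# environment (reset() -> state, step() -> (state, reward, done, info))
# and a Buffer.remember() that accepts a (state, action, next_state, reward)
# tuple; the environment name and episode count are placeholders.
import gym

if __name__ == "__main__":
    env = gym.make("Pendulum-v1")
    agent = DDPGAgent(state_space_dim=env.observation_space.shape[0],
                      action_space_dim=env.action_space.shape[0],
                      min_action_val=float(env.action_space.low[0]),
                      max_action_val=float(env.action_space.high[0]))
    for episode in range(100):
        state, episode_reward, done = env.reset(), 0.0, False
        while not done:
            action = agent.get_action(state)  # noisy action for exploration
            next_state, reward, done, _ = env.step(action)
            agent.remember_step((state, action, next_state, reward))
            agent.learn()             # one gradient step per environment step
            agent.update_targets()    # soft-update the target networks
            state = next_state
            episode_reward += reward
        print(f"episode {episode}: reward {episode_reward:.1f}")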

import datetime
import gc
import glob
import os

import numpy as np

# Project-local helpers assumed below: Actor, Critic, MemoryBuffer,
# OrnsteinUhlenbeckProcess, Transformation.


class DDPG(object):
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class """

    def __init__(self, action_dim, state_dim, batch_size, step, buffer_size,
                 train_indicator, episode, gamma, lra, lrc, tau,
                 load_weight=True):
        """ Initialization """
        # Environment and DDPG parameters
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.step = step
        self.gamma = gamma
        self.lra = lra
        self.lrc = lrc
        self.tau = tau
        self.episode = episode
        self.train_indicator = train_indicator

        # Create actor and critic networks
        self.actor = Actor(state_dim, action_dim, batch_size, lra, tau)
        self.critic = Critic(state_dim, action_dim, batch_size, lrc, tau)
        self.buffer = MemoryBuffer(buffer_size)

        # NOTE: the weights folder must exist and must contain exactly one
        # actor/critic pair of .h5 files.
        self.weights_dir_path = os.path.join(os.getcwd(), "saved_model", "*.h5")
        if load_weight:
            try:
                weights_actor_path = ""
                weights_critic_path = ""
                weights_file_paths = glob.glob(self.weights_dir_path)
                for file_path in weights_file_paths:
                    # A file whose name lacks "actor" is taken to be the
                    # critic weights, and vice versa.
                    if file_path.find("actor") < 0:
                        weights_critic_path = file_path
                    if file_path.find("critic") < 0:
                        weights_actor_path = file_path
                self.load_weights(weights_actor_path, weights_critic_path)
                print("")
                print("Actor-Critic models loaded with saved weights...")
                print("")
            except Exception:
                print("")
                print("Failed to load weights; please check the weights loading path...")
                print("")

    def policy_action(self, s):
        """ Use the actor to predict an action """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman equation to compute the critic target (one action only) """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state_old, action, reward, done, state_new):
        """ Store experience in memory buffer """
        self.buffer.memorize(state_old, action, reward, done, state_new)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience;
        returns the critic loss. """
        # Train critic
        critic_loss = self.critic.train_on_batch(states, actions, critic_target)
        # Q-value gradients under the current policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions,
                         np.array(grads).reshape((-1, self.action_dim)))
        # Transfer weights to target networks at rate tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()
        return critic_loss

    def run(self, env):
        # First, gather experience
        for e in range(self.episode):
            # Reset episode: set the initial state
            loss, cumul_reward, cumul_loss = 0, 0, 0
            done = False
            # TODO: make sure states are received correctly
            state_old = env.get_vissim_state(1, 180 * 5,
                                             [45, 55, 60, 65, 70, 75, 80])
            actions, states, rewards = [], [], []
            print("Episode: ", e, " ========================:")
            for t in range(self.step):
                action_original = self.policy_action(state_old)
                # TODO: OU function params?
                # action = action_original + temporally correlated OU noise
                noise = OrnsteinUhlenbeckProcess(x0=action_original,
                                                 size=self.action_dim)
                action = noise.apply_ou(t)

                # Clip too-low or too-high actions into [-1, 1]
                adj_action = np.clip(action, -1, 1)

                # Map the clipped actions into the environment's control range
                transformed_action = Transformation.convert_actions(adj_action)
                reward, state_new = env.get_vissim_reward(180 * 5,
                                                          transformed_action)

                # TODO: if we knew the optimal discharging rate, we could use
                # reaching it as the `done` signal
                if t == self.step - 1:
                    # Treat the manually set last step as done
                    done = True

                # ------------------------------------------ training section
                if self.train_indicator:
                    # Add outputs to memory buffer
                    self.memorize(state_old, adj_action, reward, done, state_new)
                    # Sample experience from buffer
                    states_old, actions, rewards, dones, states_new = \
                        self.sample_batch(self.batch_size)
                    # Predict target q-values using target networks
                    q_values = self.critic.target_predict(
                        [states_new, self.actor.target_predict(states_new)])
                    # Compute critic target
                    critic_target = self.bellman(rewards, q_values, dones)
                    # Train both networks on the sampled batch and update the
                    # target networks; update_models returns the critic loss
                    # (training a second time just to read the loss would
                    # apply a duplicate critic update).
                    loss = self.update_models(states_old, actions, critic_target)

                state_old = state_new
                cumul_reward += reward
                cumul_loss += loss

                # ------------------------------------------ report
                print("|---> Step: ", t, " | Action: ", transformed_action,
                      " | Reward: ", reward, " | Loss: ", loss)

            # ------------------------------------------ save model
            if np.mod(e, 10) == 0:
                print("====================> Saving model...")
                self.save_weights("./saved_model/")
                """
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                """

            print("")
            print("*-------------------------------------------------*")
            print("Average Accumulated Reward: " + str(cumul_reward / self.step))
            print("Average Accumulated Loss: " + str(cumul_loss / self.step))
            print("*-------------------------------------------------*")
            print("")

            # Garbage recycling
            gc.collect()

    def save_weights(self, path):
        t = datetime.datetime.now()
        timestamp = "_" + str(t.date()) + "_" + str(t.hour) + "h-" + \
            str(t.minute) + "m"
        path_actor = path + '_LR_{}'.format(self.lra) + timestamp
        path_critic = path + '_LR_{}'.format(self.lrc) + timestamp
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load(path_actor)
        self.critic.load(path_critic)
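
# --- Sketch of the OrnsteinUhlenbeckProcess helper called in run() above ---
# The real class lives elsewhere in this project; the version below is a
# minimal sketch matching the constructor and apply_ou(t) calls above.
# theta, mu, sigma, and dt are illustrative defaults, not the project's values.
class OrnsteinUhlenbeckProcess:
    def __init__(self, x0, size, theta=0.15, mu=0.0, sigma=0.2, dt=1.0):
        self.x = np.asarray(x0, dtype=float).reshape(size)
        self.size = size
        self.theta, self.mu, self.sigma, self.dt = theta, mu, sigma, dt

    def apply_ou(self, t):
        # One Euler-Maruyama step of dx = theta * (mu - x) * dt + sigma * dW.
        # Starting from x0 = the actor's action, the returned value is the
        # action plus temporally correlated noise, which explores more
        # smoothly than independent Gaussian noise; t is unused here, though
        # the project's helper may use it to anneal sigma over the episode.
        dx = self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x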

import copy
import gc
import os
import random
import time
from itertools import count

import numpy as np
import pandas as pd
import torch
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

# Project-local helpers assumed below: HedgingEnv, Actor, Critic,
# PrioritizedReplayBuffer, LinearSchedule, EGreedyExpStrategy,
# GreedyStrategy, and the LEAVE_PRINT_EVERY_N_SECS / ERASE_LINE constants.


class DDPG:
    """ DDPG agent for option hedging, with a twin critic (Q1 for the
    expected return, Q2 for the expected squared return) and a
    prioritized replay buffer. """

    def __init__(self, seed):
        self.writer = SummaryWriter("logdir")
        # self.writer = SummaryWriter("logs/" + ps["name"] + str(ps[ps["name"]]))
        self.evaluation_step = 0

        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # Trading environment
        self.env = HedgingEnv(init_price=100, mu=0.05, sigma=0.2,
                              strike_price=100, r=0, q=0, trading_freq=1,
                              maturity=1 / 12, trading_cost=0.01)
        self.env.seed(seed)
        self.env.action_space.seed(seed)
        action_bounds = self.env.action_space.low, self.env.action_space.high
        state_space, action_space = 3, 1

        # Policy model - actor
        self.actor = Actor(state_dim=state_space, action_dim=action_space,
                           action_bounds=action_bounds)
        self.actor_target = copy.deepcopy(self.actor)

        # Value model - critic
        self.critic = Critic(state_dim=state_space, action_dim=action_space)
        self.critic_target = copy.deepcopy(self.critic)

        # Clipping the gradient of the squared TD error makes it Huber-like:
        # a threshold near 0 behaves like MAE, inf leaves it pure MSE.
        self.actor_max_grad_norm = float("inf")
        self.critic_max_grad_norm = float("inf")

        # Use Polyak averaging - mix a fraction of the online network into the target network
        self.tau = 0.0001
        self.update_target_every_steps = 1

        # Optimizers
        self.actor_optimizer = Adam(params=self.actor.parameters(),
                                    lr=1e-4, eps=1e-7)
        self.critic_q1_optimizer = Adam(params=self.critic.q1.parameters(),
                                        lr=0.0025, eps=1e-7)
        self.critic_q2_optimizer = Adam(params=self.critic.q2.parameters(),
                                        lr=0.0025, eps=1e-7)

        # Use Prioritized Experience Replay (PER) as the replay buffer
        self.replay_buffer = PrioritizedReplayBuffer(size=600_000, alpha=0.6)
        self.per_beta_schedule = LinearSchedule(schedule_timesteps=50_000,
                                                final_p=1.0, initial_p=0.4)

        # Training strategy
        self.training_strategy = EGreedyExpStrategy(epsilon=1, min_epsilon=0.1,
                                                    epsilon_decay=0.9999)
        self.evaluation_strategy = GreedyStrategy()

        self.batch_size = 128
        self.gamma = 1  # undiscounted returns over the short hedging horizon

        # Total iteration counters
        self.total_optimizations = 0
        self.total_steps = 0
        self.total_ev_interactions = 0

        self.q1_loss = []
        self.q2_loss = []
        self.actor_loss = []
        self.mean_a_grad = 0
        self.std_a_grad = 0
        self.mean_weights = 0
        self.std_weights = 0

    def optimize_model(self, experiences, weights, idxs):
        self.total_optimizations += 1
        self.optimize_critic(experiences, weights, idxs)
        self.optimize_actor(experiences)

    def optimize_critic(self, experiences, weights, idxs):
        states, actions, rewards, next_states, is_terminals = experiences
        weights = torch.tensor(weights, dtype=torch.float32,
                               device=self.critic.device).unsqueeze(1)

        next_actions = self.actor_target(next_states)
        next_values_1 = self.critic_target.Q1(next_states, next_actions)
        next_values_2 = self.critic_target.Q2(next_states, next_actions)
        done_mask = 1 - is_terminals
        # Q1 target is the usual Bellman backup; Q2 learns the expected
        # squared return, so its target expands (r + gamma * G')^2.
        target_1 = rewards + self.gamma * next_values_1 * done_mask
        target_2 = rewards ** 2 \
            + (self.gamma ** 2 * next_values_2) * done_mask \
            + (2 * self.gamma * rewards * next_values_1) * done_mask

        td_error_1 = self.critic.Q1(states, actions) - target_1.detach()
        critic_q1_loss = (weights * td_error_1 ** 2).mean()

        # Optimize critic Q1
        self.critic_q1_optimizer.zero_grad()
        critic_q1_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.q1.parameters(),
                                       self.critic_max_grad_norm)
        self.critic_q1_optimizer.step()

        td_error_2 = self.critic.Q2(states, actions) - target_2.detach()
        critic_q2_loss = (weights * td_error_2 ** 2).mean()

        # Optimize critic Q2
        self.critic_q2_optimizer.zero_grad()
        critic_q2_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.q2.parameters(),
                                       self.critic_max_grad_norm)
        self.critic_q2_optimizer.step()

        # Update priorities in the replay buffer (1e-10 avoids zero priority)
        priorities = (np.abs(td_error_2.detach().cpu().numpy()) + 1e-10).flatten()
        self.replay_buffer.update_priorities(idxs, priorities)

        self.q1_loss.append(td_error_1.detach().pow(2).cpu().numpy().mean())
        self.q2_loss.append(td_error_2.detach().pow(2).cpu().numpy().mean())
        # self.writer.add_scalar("critic_q1_loss", critic_q1_loss.detach().cpu().numpy(), self.total_optimizations)
        # self.writer.add_scalar("critic_q2_loss", critic_q2_loss.detach().cpu().numpy(), self.total_optimizations)

    def optimize_actor(self, experiences):
        states, actions, rewards, next_states, is_terminals = experiences

        chosen_actions = self.actor(states)
        chosen_actions.retain_grad()  # keep the action gradient for diagnostics
        expected_reward = self.critic(states, chosen_actions)
        actor_loss = -expected_reward.mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                       self.actor_max_grad_norm)
        self.actor_optimizer.step()

        action_grad = chosen_actions.grad.detach().cpu().numpy()
        self.mean_a_grad = np.mean(action_grad)
        self.std_a_grad = np.std(action_grad)
        self.actor_loss.append(float(actor_loss.detach().cpu()))
        # self.writer.add_scalar("actor_loss", actor_loss.detach().cpu().numpy(), self.total_optimizations)

    def interaction_step(self, state):
        self.total_steps += 1
        action, is_exploratory = self.training_strategy.select_action(
            self.actor, state, self.env)
        new_state, reward, is_terminal, info = self.env.step(action)
        self.replay_buffer.add(state, action, reward, new_state, is_terminal)
        self.episode_reward[-1] += reward
        self.episode_exploration[-1] += int(is_exploratory)
        return new_state, is_terminal

    def update_networks(self):
        self.mix_weights(target_model=self.critic_target.q1,
                         online_model=self.critic.q1)
        self.mix_weights(target_model=self.critic_target.q2,
                         online_model=self.critic.q2)
        self.mix_weights(target_model=self.actor_target,
                         online_model=self.actor)

    def mix_weights(self, target_model, online_model):
        # Polyak averaging: target <- tau * online + (1 - tau) * target
        for target_param, online_param in zip(target_model.parameters(),
                                              online_model.parameters()):
            target_param.data.copy_(self.tau * online_param.data +
                                    (1 - self.tau) * target_param.data)

    def train(self, episodes):
        training_start, last_debug_time = time.time(), float('-inf')

        self.episode_reward = []
        self.episode_exploration = []
        self.episode_seconds = []

        result = np.empty((episodes, 4))
        result[:] = np.nan
        training_time = 0
        for episode in range(1, episodes + 1):
            episode_start = time.time()

            state, is_terminal = self.env.reset(), False
            self.path_length = self.env.simulator.days_to_maturity()
            self.episode_reward.append(0.0)
            self.episode_exploration.append(0.0)

            for step in count():
                state, is_terminal = self.interaction_step(state)

                if len(self.replay_buffer) > self.batch_size:
                    *experiences, weights, idxs = self.replay_buffer.sample(
                        self.batch_size,
                        beta=self.per_beta_schedule.value(episode))
                    self.mean_weights = np.mean(weights)
                    self.std_weights = np.std(weights)
                    # Convert the sampled batch to tensors on the critic's device
                    experiences = self.critic.load(experiences)
                    self.optimize_model(experiences, weights, idxs)

                if step % self.update_target_every_steps == 0:
                    self.update_networks()

                if is_terminal:
                    gc.collect()
                    break

            self.training_strategy.epsilon_update()

            # Stats: elapsed time
            episode_elapsed = time.time() - episode_start
            self.episode_seconds.append(episode_elapsed)
            training_time += episode_elapsed
            wallclock_elapsed = time.time() - training_start
            reached_debug_time = time.time() - last_debug_time >= LEAVE_PRINT_EVERY_N_SECS

            if len(self.q1_loss) >= 100:
                elapsed_str = time.strftime(
                    "%H:%M:%S", time.gmtime(time.time() - training_start))
                msg = 'el {}, ep {:>5}, Q1 lst {:>5.0f}, 100 {:>5.0f}\u00B1{:04.0f}, ' \
                    + 'Q2 lst {:>10.0f}, 100 {:>10.0f}\u00B1{:09.0f}, ' \
                    + 'A lst {:05.1f}, 100 {:05.1f}\u00B1{:05.1f}'
                msg = msg.format(elapsed_str, episode,
                                 self.q1_loss[-1],
                                 np.mean(self.q1_loss[-100:]),
                                 np.std(self.q1_loss[-100:]),
                                 self.q2_loss[-1],
                                 np.mean(self.q2_loss[-100:]),
                                 np.std(self.q2_loss[-100:]),
                                 self.actor_loss[-1],
                                 np.mean(self.actor_loss[-100:]),
                                 np.std(self.actor_loss[-100:]))
                print(msg, end='\r', flush=True)

                if reached_debug_time or episode >= episodes:
                    print(ERASE_LINE + msg, flush=True)
                    last_debug_time = time.time()

            if episode % 50 == 0:
                hist = {
                    "episode": [episode],
                    "last_q1_loss": [self.q1_loss[-1]],
                    "mean_q1_loss": [np.mean(self.q1_loss)],
                    "std_q1_loss": [np.std(self.q1_loss)],
                    "last_q2_loss": [self.q2_loss[-1]],
                    "mean_q2_loss": [np.mean(self.q2_loss)],
                    "std_q2_loss": [np.std(self.q2_loss)],
                    "last_actor_loss": [self.actor_loss[-1]],
                    "mean_actor_loss": [np.mean(self.actor_loss)],
                    "std_actor_loss": [np.std(self.actor_loss)],
                    "mean_weights": [self.mean_weights],
                    "std_weights": [self.std_weights],
                    "mean_a_grad": [self.mean_a_grad],
                    "std_a_grad": [self.std_a_grad],
                }
                hist_path = "history/metrics_hist.csv"
                if not os.path.exists(hist_path):
                    pd.DataFrame.from_dict(hist).to_csv(hist_path, index=False,
                                                        encoding='utf-8')
                else:
                    pd.DataFrame.from_dict(hist).to_csv(hist_path, mode='a',
                                                        index=False,
                                                        header=False,
                                                        encoding='utf-8')

            if episode % 300 == 0:
                # Keep only the most recent losses to bound memory use
                self.q1_loss = self.q1_loss[-100:]
                self.q2_loss = self.q2_loss[-100:]
                self.actor_loss = self.actor_loss[-100:]

            # Tensorboard metrics
            # self.writer.add_scalar("epsilon", self.training_strategy.epsilon, episode)
            # if episode % 10 == 0 and episode != 0:
            #     self.evaluate(self.actor, self.env)

            if episode % 100 == 0:
                filename = 'model/ddpg_' + str(int(episode / 100)) + ".pt"
                self.save(episode, filename)

    def save(self, episode, filename):
        torch.save(
            {
                'episode': episode,
                'actor': self.actor.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'actor_optimizer': self.actor_optimizer.state_dict(),
                'critic_q1': self.critic.q1.state_dict(),
                'critic_target_q1': self.critic_target.q1.state_dict(),
                'critic_q1_optimizer': self.critic_q1_optimizer.state_dict(),
                'critic_q2': self.critic.q2.state_dict(),
                'critic_target_q2': self.critic_target.q2.state_dict(),
                'critic_q2_optimizer': self.critic_q2_optimizer.state_dict(),
            }, filename)

    def load(self, filename):
        saved = torch.load(filename)
        self.actor.load_state_dict(saved['actor'])
        self.actor_target.load_state_dict(saved['actor_target'])
        self.actor_optimizer.load_state_dict(saved['actor_optimizer'])
        self.critic.q1.load_state_dict(saved['critic_q1'])
        self.critic_target.q1.load_state_dict(saved['critic_target_q1'])
        self.critic_q1_optimizer.load_state_dict(saved['critic_q1_optimizer'])
        self.critic.q2.load_state_dict(saved['critic_q2'])
        self.critic_target.q2.load_state_dict(saved['critic_target_q2'])
        self.critic_q2_optimizer.load_state_dict(saved['critic_q2_optimizer'])

    def test(self, episodes):
        model_actions = []
        model_rewards = []
        model_final_rewards = []
        delta_actions = []
        delta_rewards = []
        delta_final_rewards = []

        for i in range(1, episodes + 1):
            state, done = self.env.reset(), False
            while not done:
                action = self.evaluation_strategy.select_action(self.actor, state)
                state, reward, done, info = self.env.step(action)
                model_actions.append(action)
                model_rewards.append(reward)
                delta_actions.append(info["delta_action"])
                delta_rewards.append(info["delta_reward"])
            model_final_rewards.append(np.sum(model_rewards))
            delta_final_rewards.append(np.sum(delta_rewards))
            model_rewards = []
            delta_rewards = []
            if i % 1000 == 0:
                print("{:0>5}: model {:.2f} {:.2f} delta {:.2f} {:.2f}".format(
                    i, np.mean(model_final_rewards),
                    np.std(model_final_rewards),
                    np.mean(delta_final_rewards),
                    np.std(delta_final_rewards)))

    def evaluate(self, eval_policy_model, eval_env, n_episodes=1):
        actions = []
        rewards = []
        delta_actions = []
        delta_rewards = []
        for _ in range(n_episodes):
            self.evaluation_step += 1
            s, d = eval_env.reset(), False
            for _ in count():
                self.total_ev_interactions += 1
                a = self.evaluation_strategy.select_action(eval_policy_model, s)
                s, r, d, i = eval_env.step(a)
                actions.append(a)
                rewards.append(r)
                delta_actions.append(i["delta_action"])
                delta_rewards.append(i["delta_reward"])
                self.writer.add_scalars("ev_actions", {"actor": a},
                                        self.total_ev_interactions)
                self.writer.add_scalars("ev_actions",
                                        {"delta": i["delta_action"]},
                                        self.total_ev_interactions)
                if d:
                    break

        diffs = np.array(actions) - np.array(delta_actions)
        diffs_mean = np.mean(diffs)
        diffs_std = np.std(diffs)
        self.writer.add_scalars("ev", {"actor_reward": np.sum(rewards)},
                                self.evaluation_step)
        self.writer.add_scalars("ev", {"delta_reward": np.sum(delta_rewards)},
                                self.evaluation_step)
        self.writer.add_scalars("ev_diff", {"mean": diffs_mean},
                                self.evaluation_step)
        self.writer.add_scalars("ev_diff", {"std": diffs_std},
                                self.evaluation_step)
        self.writer.flush()
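
# --- Usage sketch (illustrative) ---
# With the project-local modules above importable, a typical run is just
# the following; the episode counts are placeholders.
if __name__ == "__main__":
    agent = DDPG(seed=42)
    agent.train(episodes=5_000)   # checkpoints are written to model/ddpg_*.pt
    agent.test(episodes=10_000)   # compares the learned policy to delta hedging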