import torch
import torch.nn.functional as F

# The Actor and Critic network classes are assumed to be defined elsewhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class TD3(object):
    # Twin Delayed DDPG
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
              tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        for it in range(iterations):
            # Step 4: sample a batch of transitions (s, s', a, r) from the memory
            batch_state, batch_next_state, batch_actions, batch_rewards, batch_dones = \
                replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_state).to(device)
            next_state = torch.Tensor(batch_next_state).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # Step 5: from the next state s', the actor target plays the next action a'
            next_action = self.actor_target(next_state)

            # Step 6: add Gaussian noise to the next action a' and clamp it to the
            # range of values supported by the environment
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

            # Step 7: the two critic targets each take the pair (s', a') as input and
            # return two Q-values, Qt1(s', a') and Qt2(s', a')
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # Step 8: keep the minimum of these Q-values: min(Qt1, Qt2)
            target_Q = torch.min(target_Q1, target_Q2)

            # Step 9: the final target of the two critic models is
            # Qt = r + gamma * min(Qt1, Qt2); detach it so no gradient flows
            # into the target networks
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            # Step 10: the two critic models each take the pair (s, a) as input and
            # return two Q-values, Q1(s, a) and Q2(s, a)
            current_Q1, current_Q2 = self.critic(state, action)

            # Step 11: compute the loss of the two critic models:
            # critic_loss = mse_loss(Q1(s, a), Qt) + mse_loss(Q2(s, a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: backpropagate the critic loss and update the parameters of the
            # two critic models with the optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: once every policy_freq iterations, update the actor model by
            # performing gradient ascent on the output of the first critic model
            if it % policy_freq == 0:
                # deterministic policy gradient (DPG)
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Delay
                # Step 14: still once every policy_freq iterations, update the weights
                # of the actor target by Polyak averaging
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data
                                            + (1 - tau) * target_param.data)

                # Step 15: still once every policy_freq iterations, update the weights
                # of the critic target by Polyak averaging
                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data
                                            + (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
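# TD3.train above assumes a replay buffer whose sample(batch_size) returns
# batches in the order (state, next_state, action, reward, done). The buffer
# class itself is not shown in this file; the following is a minimal uniform
# replay buffer sketched under that assumption (the class name and the
# max_size default are hypothetical, not taken from the original code).

import numpy as np


class ReplayBuffer(object):
    """Minimal uniform replay buffer matching the sample() order TD3.train expects."""

    def __init__(self, max_size=1_000_000):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        # transition = (state, next_state, action, reward, done)
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        idxs = np.random.randint(0, len(self.storage), size=batch_size)
        states, next_states, actions, rewards, dones = [], [], [], [], []
        for i in idxs:
            s, s2, a, r, d = self.storage[i]
            states.append(np.asarray(s))
            next_states.append(np.asarray(s2))
            actions.append(np.asarray(a))
            rewards.append(r)
            dones.append(d)
        return (np.array(states), np.array(next_states), np.array(actions),
                np.array(rewards).reshape(-1, 1), np.array(dones).reshape(-1, 1))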
import copy
import gc
import os
import random
import time
from itertools import count

import numpy as np
import pandas as pd
import torch
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

# Actor, Critic, HedgingEnv, PrioritizedReplayBuffer, LinearSchedule,
# EGreedyExpStrategy, GreedyStrategy, LEAVE_PRINT_EVERY_N_SECS and ERASE_LINE
# are assumed to be defined elsewhere in the project.


class DDPG():
    def __init__(self, seed):
        self.writer = SummaryWriter("logdir")
        # self.writer = SummaryWriter("logs/" + ps["name"] + str(ps[ps["name"]]))
        self.evaluation_step = 0
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # Trading environment
        self.env = HedgingEnv(init_price=100, mu=0.05, sigma=0.2,
                              strike_price=100, r=0, q=0, trading_freq=1,
                              maturity=1 / 12, trading_cost=0.01)
        self.env.seed(seed)
        self.env.action_space.seed(seed)
        action_bounds = self.env.action_space.low, self.env.action_space.high
        state_space, action_space = 3, 1

        # Policy model - actor
        self.actor = Actor(state_dim=state_space, action_dim=action_space,
                           action_bounds=action_bounds)
        self.actor_target = copy.deepcopy(self.actor)

        # Value model - critic
        self.critic = Critic(state_dim=state_space, action_dim=action_space)
        self.critic_target = copy.deepcopy(self.critic)

        # Gradient-norm clipping thresholds (float("inf") disables clipping)
        self.actor_max_grad_norm = float("inf")
        self.critic_max_grad_norm = float("inf")

        # Polyak averaging - mix a fraction of the online network into the target network
        self.tau = 0.0001
        self.update_target_every_steps = 1

        # Optimizers
        self.actor_optimizer = Adam(params=self.actor.parameters(), lr=1e-4, eps=1e-7)
        self.critic_q1_optimizer = Adam(params=self.critic.q1.parameters(),
                                        lr=0.0025, eps=1e-7)
        self.critic_q2_optimizer = Adam(params=self.critic.q2.parameters(),
                                        lr=0.0025, eps=1e-7)

        # Prioritized Experience Replay (PER) as the replay buffer
        self.replay_buffer = PrioritizedReplayBuffer(size=600_000, alpha=0.6)
        self.per_beta_schedule = LinearSchedule(schedule_timesteps=50_000,
                                                final_p=1.0, initial_p=0.4)

        # Exploration strategies for training and evaluation
        self.training_strategy = EGreedyExpStrategy(epsilon=1, min_epsilon=0.1,
                                                    epsilon_decay=0.9999)
        self.evaluation_strategy = GreedyStrategy()
        self.batch_size = 128
        self.gamma = 1

        # Counters and running statistics
        self.total_optimizations = 0
        self.total_steps = 0
        self.total_ev_interactions = 0
        self.q1_loss = []
        self.q2_loss = []
        self.actor_loss = []
        self.mean_a_grad = 0
        self.std_a_grad = 0
        self.mean_weights = 0
        self.std_weights = 0

    def optimize_model(self, experiences, weights, idxs):
        self.total_optimizations += 1
        self.optimize_critic(experiences, weights, idxs)
        self.optimize_actor(experiences)

    def optimize_critic(self, experiences, weights, idxs):
        states, actions, rewards, next_states, is_terminals = experiences
        weights = torch.tensor(weights, dtype=torch.float32,
                               device=self.critic.device).unsqueeze(1)
        next_actions = self.actor_target(next_states)
        next_values_1 = self.critic_target.Q1(next_states, next_actions)
        next_values_2 = self.critic_target.Q2(next_states, next_actions)
        done_mask = 1 - is_terminals
        # Q1 learns the first moment of the return; Q2 learns the second moment:
        # E[G^2] = r^2 + gamma^2 * Q2(s', a') + 2 * gamma * r * Q1(s', a')
        target_1 = rewards + self.gamma * next_values_1 * done_mask
        target_2 = rewards ** 2 \
            + (self.gamma ** 2 * next_values_2) * done_mask \
            + (2 * self.gamma * rewards * next_values_1) * done_mask

        td_error_1 = self.critic.Q1(states, actions) - target_1.detach()
        critic_q1_loss = (weights * td_error_1 ** 2).mean()
        # Optimize critic Q1
        self.critic_q1_optimizer.zero_grad()
        critic_q1_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.q1.parameters(),
                                       self.critic_max_grad_norm)
        self.critic_q1_optimizer.step()

        td_error_2 = self.critic.Q2(states, actions) - target_2.detach()
        critic_q2_loss = (weights * td_error_2 ** 2).mean()
        # Optimize critic Q2
        self.critic_q2_optimizer.zero_grad()
        critic_q2_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.q2.parameters(),
                                       self.critic_max_grad_norm)
        self.critic_q2_optimizer.step()

        # Update priorities in the replay buffer (1e-10 avoids zero priority)
        priorities = (np.abs(td_error_2.detach().cpu().numpy()) + 1e-10).flatten()
        self.replay_buffer.update_priorities(idxs, priorities)

        self.q1_loss.append(td_error_1.detach().pow(2).cpu().numpy().mean())
        self.q2_loss.append(td_error_2.detach().pow(2).cpu().numpy().mean())
        # self.writer.add_scalar("critic_q1_loss", critic_q1_loss.detach().cpu().numpy(), self.total_optimizations)
        # self.writer.add_scalar("critic_q2_loss", critic_q2_loss.detach().cpu().numpy(), self.total_optimizations)

    def optimize_actor(self, experiences):
        states, actions, rewards, next_states, is_terminals = experiences
        chosen_actions = self.actor(states)
        chosen_actions.retain_grad()
        expected_reward = self.critic(states, chosen_actions)
        actor_loss = -expected_reward.mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                       self.actor_max_grad_norm)
        self.actor_optimizer.step()

        action_grads = chosen_actions.grad.detach().cpu().numpy()
        self.mean_a_grad = np.mean(action_grads)
        self.std_a_grad = np.std(action_grads)
        self.actor_loss.append(float(actor_loss.detach().cpu()))
        # self.writer.add_scalar("actor_loss", actor_loss.detach().cpu().numpy(), self.total_optimizations)

    def interaction_step(self, state):
        self.total_steps += 1
        action, is_exploratory = self.training_strategy.select_action(
            self.actor, state, self.env)
        new_state, reward, is_terminal, info = self.env.step(action)
        self.replay_buffer.add(state, action, reward, new_state, is_terminal)
        self.episode_reward[-1] += reward
        self.episode_exploration[-1] += int(is_exploratory)
        return new_state, is_terminal

    def update_networks(self):
        self.mix_weights(target_model=self.critic_target.q1, online_model=self.critic.q1)
        self.mix_weights(target_model=self.critic_target.q2, online_model=self.critic.q2)
        self.mix_weights(target_model=self.actor_target, online_model=self.actor)

    def mix_weights(self, target_model, online_model):
        for target_param, online_param in zip(target_model.parameters(),
                                              online_model.parameters()):
            target_param.data.copy_(self.tau * online_param.data
                                    + (1 - self.tau) * target_param.data)

    def train(self, episodes):
        training_start, last_debug_time = time.time(), float('-inf')
        self.episode_reward = []
        self.episode_exploration = []
        self.episode_seconds = []
        result = np.empty((episodes, 4))
        result[:] = np.nan
        training_time = 0
        for episode in range(1, episodes + 1):
            episode_start = time.time()
            state, is_terminal = self.env.reset(), False
            self.path_length = self.env.simulator.days_to_maturity()
            self.episode_reward.append(0.0)
            self.episode_exploration.append(0.0)

            for step in count():
                state, is_terminal = self.interaction_step(state)
                if len(self.replay_buffer) > self.batch_size:
                    *experiences, weights, idxs = self.replay_buffer.sample(
                        self.batch_size, beta=self.per_beta_schedule.value(episode))
                    self.mean_weights = np.mean(weights)
                    self.std_weights = np.std(weights)
                    experiences = self.critic.load(experiences)
                    self.optimize_model(experiences, weights, idxs)
                if step % self.update_target_every_steps == 0:
                    self.update_networks()
                if is_terminal:
                    gc.collect()
                    break

            self.training_strategy.epsilon_update()

            # Stats
            # Elapsed time
            episode_elapsed = time.time() - episode_start
            self.episode_seconds.append(episode_elapsed)
            training_time += episode_elapsed
            wallclock_elapsed = time.time() - training_start
            reached_debug_time = time.time() - last_debug_time >= LEAVE_PRINT_EVERY_N_SECS

            if len(self.q1_loss) >= 100:
                elapsed_str = time.strftime(
                    "%H:%M:%S", time.gmtime(time.time() - training_start))
                msg = 'el {}, ep {:>5}, Q1 lst {:>5.0f}, 100 {:>5.0f}\u00B1{:04.0f}, ' \
                    + 'Q2 lst {:>10.0f}, 100 {:>10.0f}\u00B1{:09.0f}, ' \
                    + 'A lst {:05.1f}, 100 {:05.1f}\u00B1{:05.1f}'
                msg = msg.format(elapsed_str, episode,
                                 self.q1_loss[-1],
                                 np.mean(self.q1_loss[-100:]),
                                 np.std(self.q1_loss[-100:]),
                                 self.q2_loss[-1],
                                 np.mean(self.q2_loss[-100:]),
                                 np.std(self.q2_loss[-100:]),
                                 self.actor_loss[-1],
                                 np.mean(self.actor_loss[-100:]),
                                 np.std(self.actor_loss[-100:]))
                print(msg, end='\r', flush=True)
                if reached_debug_time or episode >= episodes:
                    print(ERASE_LINE + msg, flush=True)
                    last_debug_time = time.time()

                if episode % 50 == 0:
                    hist = {
                        "episode": [episode],
                        "last_q1_loss": [self.q1_loss[-1]],
                        "mean_q1_loss": [np.mean(self.q1_loss)],
                        "std_q1_loss": [np.std(self.q1_loss)],
                        "last_q2_loss": [self.q2_loss[-1]],
                        "mean_q2_loss": [np.mean(self.q2_loss)],
                        "std_q2_loss": [np.std(self.q2_loss)],
                        "last_actor_loss": [self.actor_loss[-1]],
                        "mean_actor_loss": [np.mean(self.actor_loss)],
                        "std_actor_loss": [np.std(self.actor_loss)],
                        "mean_weights": [self.mean_weights],
                        "std_weights": [self.std_weights],
                        "mean_a_grad": [self.mean_a_grad],
                        "std_a_grad": [self.std_a_grad],
                    }
                    hist_path = "history/metrics_hist.csv"
                    if not os.path.exists(hist_path):
                        pd.DataFrame.from_dict(hist).to_csv(
                            hist_path, index=False, encoding='utf-8')
                    else:
                        pd.DataFrame.from_dict(hist).to_csv(
                            hist_path, mode='a', index=False,
                            header=False, encoding='utf-8')

                if episode % 300 == 0:
                    self.q1_loss = self.q1_loss[-100:]
                    self.q2_loss = self.q2_loss[-100:]
                    self.actor_loss = self.actor_loss[-100:]

            # Tensorboard metrics
            # self.writer.add_scalar("epsilon", self.training_strategy.epsilon, episode)
            # if episode % 10 == 0 and episode != 0:
            #     self.evaluate(self.actor, self.env)

            if episode % 100 == 0:
                filename = 'model/ddpg_' + str(int(episode / 100)) + ".pt"
                self.save(episode, filename)

    def save(self, episode, filename):
        torch.save(
            {
                'episode': episode,
                'actor': self.actor.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'actor_optimizer': self.actor_optimizer.state_dict(),
                'critic_q1': self.critic.q1.state_dict(),
                'critic_target_q1': self.critic_target.q1.state_dict(),
                'critic_q1_optimizer': self.critic_q1_optimizer.state_dict(),
                'critic_q2': self.critic.q2.state_dict(),
                'critic_target_q2': self.critic_target.q2.state_dict(),
                'critic_q2_optimizer': self.critic_q2_optimizer.state_dict(),
            }, filename)

    def load(self, filename):
        saved = torch.load(filename)
        self.actor.load_state_dict(saved['actor'])
        self.actor_target.load_state_dict(saved['actor_target'])
        self.actor_optimizer.load_state_dict(saved['actor_optimizer'])
        self.critic.q1.load_state_dict(saved['critic_q1'])
        self.critic_target.q1.load_state_dict(saved['critic_target_q1'])
        self.critic_q1_optimizer.load_state_dict(saved['critic_q1_optimizer'])
        self.critic.q2.load_state_dict(saved['critic_q2'])
        self.critic_target.q2.load_state_dict(saved['critic_target_q2'])
        self.critic_q2_optimizer.load_state_dict(saved['critic_q2_optimizer'])

    def test(self, episodes):
        model_actions = []
        model_rewards = []
        model_final_rewards = []
        delta_actions = []
        delta_rewards = []
        delta_final_rewards = []
        for i in range(1, episodes + 1):
            state, done = self.env.reset(), False
            while not done:
                action = self.evaluation_strategy.select_action(self.actor, state)
                state, reward, done, info = self.env.step(action)
                model_actions.append(action)
                model_rewards.append(reward)
                delta_actions.append(info["delta_action"])
                delta_rewards.append(info["delta_reward"])
            model_final_rewards.append(np.sum(model_rewards))
            delta_final_rewards.append(np.sum(delta_rewards))
            model_rewards = []
            delta_rewards = []
            if i % 1000 == 0:
                print("{:0>5}: model {:.2f} {:.2f} delta {:.2f} {:.2f}".format(
                    i, np.mean(model_final_rewards), np.std(model_final_rewards),
                    np.mean(delta_final_rewards), np.std(delta_final_rewards)))

    def evaluate(self, eval_policy_model, eval_env, n_episodes=1):
        actions = []
        rewards = []
        delta_actions = []
        delta_rewards = []
        for _ in range(n_episodes):
            self.evaluation_step += 1
            s, d = eval_env.reset(), False
            for _ in count():
                self.total_ev_interactions += 1
                a = self.evaluation_strategy.select_action(eval_policy_model, s)
                s, r, d, i = eval_env.step(a)
                actions.append(a)
                rewards.append(r)
                delta_actions.append(i["delta_action"])
                delta_rewards.append(i["delta_reward"])
                self.writer.add_scalars("ev_actions", {"actor": a},
                                        self.total_ev_interactions)
                self.writer.add_scalars("ev_actions", {"delta": i["delta_action"]},
                                        self.total_ev_interactions)
                if d:
                    break
        diffs = np.array(actions) - np.array(delta_actions)
        diffs_mean = np.mean(diffs)
        diffs_std = np.std(diffs)
        self.writer.add_scalars("ev", {"actor_reward": np.sum(rewards)},
                                self.evaluation_step)
        self.writer.add_scalars("ev", {"delta_reward": np.sum(delta_rewards)},
                                self.evaluation_step)
        self.writer.add_scalars("ev_diff", {"mean": diffs_mean}, self.evaluation_step)
        self.writer.add_scalars("ev_diff", {"std": diffs_std}, self.evaluation_step)
        self.writer.flush()
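# A minimal usage sketch for the DDPG agent above (the seed and episode counts
# are hypothetical; HedgingEnv, Actor, Critic and the strategy/schedule helpers
# referenced in DDPG.__init__ are assumed to be importable).
if __name__ == "__main__":
    agent = DDPG(seed=42)        # builds the hedging env, actor/critic pairs and the PER buffer
    agent.train(episodes=5_000)  # checkpoints to model/ddpg_<n>.pt every 100 episodes
    agent.test(episodes=10_000)  # compares the learned policy against the delta-hedging baseline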