def test_generalized_advantage(self):
    vector = th.randn(VECTOR_SIZE)
    for i in range(500):
        self.replay.append(vector, vector, random.random(), vector, False)
    # Mark the last transition as terminal
    self.replay.done()[-1] += 1
    values = th.randn_like(self.replay.reward())
    rewards = self.replay.reward().view(-1).tolist()
    dones = self.replay.done().view(-1).tolist()
    next_value = random.random()
    ref = generalized_advantage_estimate(GAMMA,
                                         TAU,
                                         rewards,
                                         dones,
                                         values,
                                         next_value)
    advantages = generalized_advantage(GAMMA,
                                       TAU,
                                       self.replay.reward(),
                                       self.replay.done(),
                                       values,
                                       next_value + th.zeros(1))
    ref = th.Tensor(ref).view(advantages.size())
    self.assertTrue(close(ref, advantages))

    # Overlapping episodes
    overlap = self.replay[2:] + self.replay[:3]
    overlap_values = th.cat((values[2:], values[:3]), dim=0)
    overlap_next_value = th.randn(1)
    overlap_adv = generalized_advantage(GAMMA,
                                        TAU,
                                        overlap.reward().double(),
                                        overlap.done().double(),
                                        overlap_values.double(),
                                        overlap_next_value.double())
    values = overlap_values.view(-1).tolist()
    rewards = overlap.reward().view(-1).tolist()
    dones = overlap.done().view(-1).tolist()
    ref = generalized_advantage_estimate(GAMMA,
                                         TAU,
                                         rewards,
                                         dones,
                                         values,
                                         overlap_next_value.item())
    ref = th.Tensor(ref).view(overlap_adv.size()).double()
    self.assertTrue(close(overlap_adv, ref))
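# --- Illustrative sketch (not part of the test above) ---
# A plain-Python GAE reference of the kind the test compares against, assuming
# the standard definition: delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t
# and A_t = delta_t + gamma * tau * (1 - done_t) * A_{t+1}. The name
# `gae_reference` is hypothetical; it is not the test's helper.
def gae_reference(gamma, tau, rewards, dones, values, next_value):
    advantages = []
    gae = 0.0
    next_v = float(next_value)
    # Walk the trajectory backwards, accumulating the exponentially
    # weighted sum of TD errors.
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])
        delta = float(rewards[t]) + gamma * next_v * mask - float(values[t])
        gae = delta + gamma * tau * mask * gae
        advantages.insert(0, gae)
        next_v = float(values[t])
    return advantages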
def update(replay, optimizer, policy, env, lr_schedule):
    _, next_state_value = policy(replay[-1].next_state())
    advantages = pg.generalized_advantage(GAMMA,
                                          TAU,
                                          replay.reward(),
                                          replay.done(),
                                          replay.value(),
                                          next_state_value)
    advantages = ch.utils.normalize(advantages, epsilon=1e-5).view(-1, 1)
    rewards = [a + v for a, v in zip(advantages, replay.value())]
    for i, sars in enumerate(replay):
        sars.reward = rewards[i].detach()
        sars.advantage = advantages[i].detach()

    # Logging
    policy_losses = []
    entropies = []
    value_losses = []
    mean = lambda a: sum(a) / len(a)

    # Perform some optimization steps
    for step in range(PPO_EPOCHS * PPO_NUM_BATCHES):
        batch = replay.sample(PPO_BSZ)
        masses, values = policy(batch.state())

        # Compute losses
        new_log_probs = masses.log_prob(batch.action()).sum(-1, keepdim=True)
        entropy = masses.entropy().sum(-1).mean()
        policy_loss = ppo.policy_loss(new_log_probs,
                                      batch.log_prob(),
                                      batch.advantage(),
                                      clip=PPO_CLIP)
        value_loss = ppo.state_value_loss(values,
                                          batch.value().detach(),
                                          batch.reward(),
                                          clip=PPO_CLIP)
        loss = policy_loss - ENT_WEIGHT * entropy + V_WEIGHT * value_loss

        # Take optimization step
        optimizer.zero_grad()
        loss.backward()
        th.nn.utils.clip_grad_norm_(policy.parameters(), GRAD_NORM)
        optimizer.step()

        policy_losses.append(policy_loss)
        entropies.append(entropy)
        value_losses.append(value_loss)

    # Log metrics
    if dist.get_rank() == 0:
        env.log('policy loss', mean(policy_losses).item())
        env.log('policy entropy', mean(entropies).item())
        env.log('value loss', mean(value_losses).item())
        ppt.plot(mean(env.all_rewards[-10000:]), 'PPO results')

    # Update the parameters on schedule
    if LINEAR_SCHEDULE:
        lr_schedule.step()
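# --- Reference sketch only ---
# The clipped surrogate that the PPO update above relies on, written out under
# the usual PPO-Clip definition. This is an assumption about what a policy loss
# of this kind computes, not cherry's exact implementation; `clipped_policy_loss`
# is a hypothetical name.
import torch as th

def clipped_policy_loss(new_log_probs, old_log_probs, advantages, clip=0.2):
    # Importance ratio between the current and the data-collecting policy.
    ratio = th.exp(new_log_probs - old_log_probs.detach())
    unclipped = ratio * advantages
    clipped = th.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantages
    # Negated because optimizers minimise while PPO maximises the surrogate.
    return -th.min(unclipped, clipped).mean()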
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(replay.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
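# --- Reference sketch only ---
# The discounted returns used as value-regression targets above follow the
# standard recursion R_t = r_t + DISCOUNT * R_{t+1} * (1 - done_t). A plain
# Python version for a flat list of transitions (hypothetical helper, not
# cherry's td.discount):
def discounted_returns(gamma, rewards, dones):
    returns = []
    running = 0.0
    # Accumulate backwards, resetting at episode boundaries.
    for r, d in zip(reversed(rewards), reversed(dones)):
        running = float(r) + gamma * running * (1.0 - float(d))
        returns.insert(0, running)
    return returns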
def compute_advantages(baseline, tau, gamma, rewards, dones, states, next_states, update_vf=True):
    # Fit the value baseline on discounted returns, then compute GAE.
    returns = ch.td.discount(gamma, rewards, dones)
    if update_vf:
        baseline.fit(states, returns)
    values = baseline(states)
    next_values = baseline(next_states)
    # At terminal steps, substitute the value of the next state before bootstrapping.
    bootstraps = values * (1.0 - dones) + next_values * dones
    next_value = torch.zeros(1, device=values.device)
    return generalized_advantage(tau=tau,
                                 gamma=gamma,
                                 rewards=rewards,
                                 dones=dones,
                                 values=bootstraps,
                                 next_value=next_value)
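# --- Hypothetical baseline, for illustration only ---
# compute_advantages above assumes a value baseline exposing fit(states, returns)
# and __call__(states). A minimal ridge-regression stand-in with that interface
# (the original baseline object is not shown here):
import torch

class LinearBaseline:

    def __init__(self, reg=1e-5):
        self.linear = None
        self.reg = reg

    def _features(self, states):
        # Append a bias feature to the raw state features.
        ones = torch.ones(states.size(0), 1, device=states.device)
        return torch.cat([states, ones], dim=1)

    def fit(self, states, returns):
        # Closed-form ridge regression of returns on state features.
        features = self._features(states)
        A = features.t() @ features \
            + self.reg * torch.eye(features.size(1), device=states.device)
        b = features.t() @ returns
        self.linear = torch.linalg.solve(A, b)

    def __call__(self, states):
        return self._features(states) @ self.linear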
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE).to(device)
    agent.apply(weights_init)
    actor_optimizer = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimizer = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    actor_scheduler = torch.optim.lr_scheduler.StepLR(actor_optimizer,
                                                      step_size=2000,
                                                      gamma=0.5)
    critic_scheduler = torch.optim.lr_scheduler.StepLR(critic_optimizer,
                                                       step_size=2000,
                                                       gamma=0.5)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    def get_action(state):
        return agent(state.to(device))

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            # batch = replay.sample(BATCH_SIZE).to(device)
            batch = replay.to(device)
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      batch.reward(),
                                                      batch.done(),
                                                      batch.value(),
                                                      torch.zeros(1).to(device))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      batch.reward(),
                                      batch.done())
                old_log_probs = batch.log_prob()

            new_values = batch.value()
            new_log_probs = batch.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(batch.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(batch.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimizer.zero_grad()
                policy_loss.backward()
                # nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
                actor_optimizer.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimizer.zero_grad()
                value_loss.backward()
                # nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
                critic_optimizer.step()

            actor_scheduler.step()
            critic_scheduler.step()
            replay.empty()
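# --- Hypothetical helper ---
# The weights_init applied above is not shown in this snippet; orthogonal
# initialisation of linear layers is a common choice for PPO, so one plausible
# sketch (an assumption, not the original helper) is:
import torch.nn as nn

def weights_init(module):
    if isinstance(module, nn.Linear):
        nn.init.orthogonal_(module.weight, gain=2 ** 0.5)
        nn.init.zeros_(module.bias)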
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # Create the network
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size,
                        hidden_size=HIDDEN_SIZE,
                        action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            # Reuse the stored outputs on the first epoch, for readability
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
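# --- Reference sketch only ---
# The advantage normalisation used above is assumed to be the usual
# standardisation (a - mean) / (std + epsilon); a minimal stand-in for
# illustration, not necessarily cherry's ch.normalize:
def normalize_advantages(advantages, epsilon=1e-8):
    return (advantages - advantages.mean()) / (advantages.std() + epsilon)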
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    masses, new_values = agent(replay.state())
                    new_log_probs = masses.log_prob(replay.action())
                    new_values = new_values.view(-1, 1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()
                result['policy_losses'].append(policy_loss.item())

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()
                result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result
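# --- Hypothetical model ---
# get_action above expects the agent to return an action distribution and a
# state value. The original ActorCritic is not shown in this snippet, so this
# is only a plausible sketch for Pendulum-v0 (3-dimensional state, 1-dimensional
# action), not the author's model:
import torch
from torch import nn
from torch.distributions import Normal

class ActorCritic(nn.Module):

    def __init__(self, hidden_size, state_size=3, action_size=1):
        super().__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_size, hidden_size), nn.Tanh(),
            nn.Linear(hidden_size, action_size))
        self.critic = nn.Sequential(
            nn.Linear(state_size, hidden_size), nn.Tanh(),
            nn.Linear(hidden_size, 1))
        # Learnable, state-independent log standard deviation for the policy.
        self.log_std = nn.Parameter(torch.zeros(action_size))

    def forward(self, state):
        mass = Normal(self.actor(state), self.log_std.exp())
        value = self.critic(state)
        return mass, value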