def trpo_update(episodes, policy, baseline, prms):
    """ Inspired by cherry-rl/examples/bsuite/trpo_v_random.py """
    new_loss = 0.0
    old_policy = deepcopy(policy)
    for step in range(prms['trpo_steps']):
        states = episodes.state()
        actions = episodes.action()
        rewards = episodes.reward()
        dones = episodes.done()
        next_states = episodes.next_state()
        returns = ch.td.discount(prms['gamma'], rewards, dones)
        baseline.fit(states, returns)
        values = baseline(states)
        next_values = baseline(next_states)

        # Compute KL
        with torch.no_grad():
            old_density = old_policy.density(states)
        new_density = policy.density(states)
        kl = torch.distributions.kl_divergence(old_density, new_density).mean()

        # Compute surrogate loss
        old_log_probs = old_density.log_prob(actions).mean(dim=1, keepdim=True)
        new_log_probs = new_density.log_prob(actions).mean(dim=1, keepdim=True)
        bootstraps = values * (1.0 - dones) + next_values * dones
        advantages = ch.pg.generalized_advantage(prms['gamma'], prms['tau'], rewards,
                                                 dones, bootstraps, torch.zeros(1))
        advantages = ch.normalize(advantages).detach()
        surr_loss = ch.algorithms.trpo.policy_loss(new_log_probs, old_log_probs, advantages)

        # Compute the update
        grad = torch.autograd.grad(surr_loss, policy.parameters(), retain_graph=True)
        fvp = ch.algorithms.trpo.hessian_vector_product(kl, policy.parameters())
        grad = torch.nn.utils.parameters_to_vector(grad).detach()
        step = ch.algorithms.trpo.conjugate_gradient(fvp, grad)
        lagrange_mult = 0.5 * torch.dot(step, fvp(step)) / prms['max_kl']
        step = step / lagrange_mult
        step_ = [torch.zeros_like(p.data) for p in policy.parameters()]
        torch.nn.utils.vector_to_parameters(step, step_)
        step = step_

        # Line-search
        for ls_step in range(prms['ls_max_steps']):
            stepsize = prms['backtrack_factor'] ** ls_step
            clone = deepcopy(policy)
            for c, u in zip(clone.parameters(), step):
                c.data.add_(u.data, alpha=-stepsize)
            new_density = clone.density(states)
            new_kl = torch.distributions.kl_divergence(old_density, new_density).mean()
            new_log_probs = new_density.log_prob(actions).mean(dim=1, keepdim=True)
            new_loss = ch.algorithms.trpo.policy_loss(new_log_probs, old_log_probs, advantages)
            if new_loss < surr_loss and new_kl < prms['max_kl']:
                for p, c in zip(policy.parameters(), clone.parameters()):
                    p.data[:] = c.data[:]
                break
    return new_loss

def ppo_update(episodes, policy, optimizer, baseline, prms):
    # Get values to device
    states, actions, rewards, dones, next_states = get_episode_values(episodes)

    # Update value function & compute advantages
    returns = ch.td.discount(prms['gamma'], rewards, dones)
    advantages = compute_advantages(baseline, prms['tau'], prms['gamma'],
                                    rewards, dones, states, next_states)
    advantages = ch.normalize(advantages, epsilon=1e-8).detach()

    # Log-probabilities of the stored actions under the pre-update policy
    with torch.no_grad():
        old_log_probs = policy.log_prob(states, actions)

    # Accumulate the PPO loss over the inner epochs
    av_loss = 0.0
    for ppo_epoch in range(prms['ppo_epochs']):
        new_log_probs = policy.log_prob(states, actions)

        # Compute the policy loss
        policy_loss = ppo.policy_loss(new_log_probs, old_log_probs, advantages,
                                      clip=prms['ppo_clip_ratio'])

        # Adapt model based on the loss
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        baseline.fit(states, returns)
        av_loss += policy_loss.item()

    return av_loss / prms['ppo_epochs']

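# --- Hedged sketch (not part of the original sources) ---
# Several snippets in this collection call get_episode_values() and
# compute_advantages() without defining them. The versions below are
# assumptions, reconstructed from how the TRPO snippets compute GAE inline:
# unpack an ExperienceReplay into tensors, optionally refit the baseline on
# discounted returns, bootstrap with next-state values, then call cherry's
# generalized_advantage. The exact signatures and the update_vf flag are assumed.
def get_episode_values(episodes):
    # Unpack the replay buffer into the tensors used by the losses
    return (episodes.state(), episodes.action(), episodes.reward(),
            episodes.done(), episodes.next_state())


def compute_advantages(baseline, tau, gamma, rewards, dones, states, next_states, update_vf=True):
    # Optionally refit the linear value baseline on the discounted returns
    returns = ch.td.discount(gamma, rewards, dones)
    if update_vf:
        baseline.fit(states, returns)
    values = baseline(states)
    next_values = baseline(next_states)
    # Bootstrap terminal transitions with the next-state value estimate
    bootstraps = values * (1.0 - dones) + next_values * dones
    return ch.pg.generalized_advantage(gamma, tau, rewards, dones,
                                       bootstraps, torch.zeros(1))
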
def trpo_update(replay, policy, baseline):
    gamma = 0.99
    tau = 0.95
    max_kl = 0.01
    ls_max_steps = 15
    backtrack_factor = 0.5
    old_policy = deepcopy(policy)
    for step in range(10):
        states = replay.state()
        actions = replay.action()
        rewards = replay.reward()
        dones = replay.done()
        next_states = replay.next_state()
        returns = ch.td.discount(gamma, rewards, dones)
        baseline.fit(states, returns)
        values = baseline(states)
        next_values = baseline(next_states)

        # Compute KL
        with th.no_grad():
            old_density = old_policy.density(states)
        new_density = policy.density(states)
        kl = kl_divergence(old_density, new_density).mean()

        # Compute surrogate loss
        old_log_probs = old_density.log_prob(actions).mean(dim=1, keepdim=True)
        new_log_probs = new_density.log_prob(actions).mean(dim=1, keepdim=True)
        bootstraps = values * (1.0 - dones) + next_values * dones
        advantages = ch.pg.generalized_advantage(gamma, tau, rewards, dones,
                                                 bootstraps, th.zeros(1))
        advantages = ch.normalize(advantages).detach()
        surr_loss = trpo.policy_loss(new_log_probs, old_log_probs, advantages)

        # Compute the update
        grad = autograd.grad(surr_loss, policy.parameters(), retain_graph=True)
        Fvp = trpo.hessian_vector_product(kl, policy.parameters())
        grad = parameters_to_vector(grad).detach()
        step = trpo.conjugate_gradient(Fvp, grad)
        lagrange_mult = 0.5 * th.dot(step, Fvp(step)) / max_kl
        step = step / lagrange_mult
        step_ = [th.zeros_like(p.data) for p in policy.parameters()]
        vector_to_parameters(step, step_)
        step = step_

        # Line-search
        for ls_step in range(ls_max_steps):
            stepsize = backtrack_factor ** ls_step
            clone = deepcopy(policy)
            for c, u in zip(clone.parameters(), step):
                c.data.add_(u.data, alpha=-stepsize)
            new_density = clone.density(states)
            new_kl = kl_divergence(old_density, new_density).mean()
            new_log_probs = new_density.log_prob(actions).mean(dim=1, keepdim=True)
            new_loss = trpo.policy_loss(new_log_probs, old_log_probs, advantages)
            if new_loss < surr_loss and new_kl < max_kl:
                for p, c in zip(policy.parameters(), clone.parameters()):
                    p.data[:] = c.data[:]
                break

def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(replay.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()

def update(replay, optimizer, policy, env, lr_schedule):
    _, next_state_value = policy(replay[-1].next_state)
    advantages = pg.generalized_advantage(GAMMA,
                                          TAU,
                                          replay.reward(),
                                          replay.done(),
                                          replay.value(),
                                          next_state_value)
    advantages = ch.normalize(advantages, epsilon=1e-5).view(-1, 1)
    rewards = [a + v for a, v in zip(advantages, replay.value())]
    for i, sars in enumerate(replay):
        sars.reward = rewards[i].detach()
        sars.advantage = advantages[i].detach()

    # Logging
    policy_losses = []
    entropies = []
    value_losses = []
    mean = lambda a: sum(a) / len(a)

    # Perform some optimization steps
    for step in range(PPO_EPOCHS * PPO_NUM_BATCHES):
        batch = replay.sample(PPO_BSZ)
        masses, values = policy(batch.state())

        # Compute losses
        new_log_probs = masses.log_prob(batch.action()).sum(-1, keepdim=True)
        entropy = masses.entropy().sum(-1).mean()
        policy_loss = ppo.policy_loss(new_log_probs,
                                      batch.log_prob(),
                                      batch.advantage(),
                                      clip=PPO_CLIP)
        value_loss = ppo.state_value_loss(values,
                                          batch.value().detach(),
                                          batch.reward(),
                                          clip=PPO_CLIP)
        loss = policy_loss - ENT_WEIGHT * entropy + V_WEIGHT * value_loss

        # Take optimization step
        optimizer.zero_grad()
        loss.backward()
        th.nn.utils.clip_grad_norm_(policy.parameters(), GRAD_NORM)
        optimizer.step()

        policy_losses.append(policy_loss)
        entropies.append(entropy)
        value_losses.append(value_loss)

    # Log metrics
    env.log('policy loss', mean(policy_losses).item())
    env.log('policy entropy', mean(entropies).item())
    env.log('value loss', mean(value_losses).item())

    # Update the parameters on schedule
    if LINEAR_SCHEDULE:
        lr_schedule.step()

def maml_a2c_loss(train_episodes, learner, baseline, gamma, tau):
    # Update policy and baseline
    states = train_episodes.state()
    actions = train_episodes.action()
    rewards = train_episodes.reward()
    dones = train_episodes.done()
    next_states = train_episodes.next_state()
    log_probs = learner.log_prob(states, actions)
    advantages = compute_advantages(baseline, tau, gamma, rewards,
                                    dones, states, next_states)
    advantages = ch.normalize(advantages).detach()
    return a2c.policy_loss(log_probs, advantages)

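# --- Hedged sketch (not part of the original sources) ---
# fast_adapt_a2c() is used by the MAML/ProMP snippets below but never shown.
# One plausible implementation, assuming torch and learn2learn (l2l) are
# imported, the learner is a MAML clone, and reusing maml_a2c_loss() above,
# is a single inner-loop gradient step via learn2learn's maml_update helper:
def fast_adapt_a2c(learner, train_episodes, adapt_lr, baseline, gamma, tau, first_order=False):
    second_order = not first_order
    loss = maml_a2c_loss(train_episodes, learner, baseline, gamma, tau)
    # Keep the graph when second-order (full MAML) gradients are required
    gradients = torch.autograd.grad(loss,
                                    learner.parameters(),
                                    retain_graph=second_order,
                                    create_graph=second_order)
    return l2l.algorithms.maml_update(learner, adapt_lr, gradients)
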
def update(replay, optimizer):
    policy_loss = []
    value_loss = []

    # Discount and normalize rewards
    rewards = discount(GAMMA, replay.reward(), replay.done())
    rewards = ch.normalize(rewards)

    # Actor-critic losses: advantage-weighted log-probs and value regression
    for sars, reward in zip(replay, rewards):
        log_prob = sars.log_prob
        value = sars.value
        policy_loss.append(-log_prob * (reward - value.item()))
        value_loss.append(F.mse_loss(value, reward.detach()))

    # Take optimization step
    optimizer.zero_grad()
    loss = th.stack(policy_loss).sum() + V_WEIGHT * th.stack(value_loss).sum()
    loss.backward()
    optimizer.step()

def meta_surrogate_loss(iteration_replays, iteration_policies, policy, baseline, tau, gamma, adapt_lr):
    mean_loss = 0.0
    mean_kl = 0.0
    for task_replays, old_policy in tqdm(zip(iteration_replays, iteration_policies),
                                         total=len(iteration_replays),
                                         desc='Surrogate Loss',
                                         leave=False):
        policy.reset_context()
        train_replays = task_replays[:-1]
        valid_episodes = task_replays[-1]
        new_policy = l2l.clone_module(policy)

        # Fast Adapt
        for train_episodes in train_replays:
            new_policy = fast_adapt_a2c(new_policy, train_episodes, adapt_lr,
                                        baseline, gamma, tau, first_order=False)

        # Useful values
        states = valid_episodes.state()
        actions = valid_episodes.action()
        next_states = valid_episodes.next_state()
        rewards = valid_episodes.reward()
        dones = valid_episodes.done()

        # Compute KL
        old_densities = old_policy.density(states)
        new_densities = new_policy.density(states)
        kl = kl_divergence(new_densities, old_densities).mean()
        mean_kl += kl

        # Compute Surrogate Loss
        advantages = compute_advantages(baseline, tau, gamma, rewards,
                                        dones, states, next_states)
        advantages = ch.normalize(advantages).detach()
        old_log_probs = old_densities.log_prob(actions).mean(dim=1, keepdim=True).detach()
        new_log_probs = new_densities.log_prob(actions).mean(dim=1, keepdim=True)
        mean_loss += trpo.policy_loss(new_log_probs, old_log_probs, advantages)

    mean_kl /= len(iteration_replays)
    mean_loss /= len(iteration_replays)
    return mean_loss, mean_kl

def update(replay):
    policy_loss = []

    # Discount and normalize rewards
    rewards = ch.td.discount(GAMMA, replay.reward(), replay.done())
    rewards = ch.normalize(rewards)

    # Compute loss
    for sars, reward in zip(replay, rewards):
        log_prob = sars.log_prob
        policy_loss.append(-log_prob * reward)

    # Take optimization step
    optimizer.zero_grad()
    policy_loss = th.stack(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

def meta_surrogate_loss(iter_replays, iter_policies, policy, baseline, params, anil):
    mean_loss = 0.0
    mean_kl = 0.0
    for task_replays, old_policy in zip(iter_replays, iter_policies):
        train_replays = task_replays[:-1]
        valid_episodes = task_replays[-1]
        new_policy = clone_module(policy)

        # Fast Adapt to the training episodes
        for train_episodes in train_replays:
            new_policy = trpo_update(train_episodes, new_policy, baseline,
                                     params['inner_lr'], params['gamma'], params['tau'],
                                     anil=anil, first_order=False)

        # Calculate KL from the validation episodes
        states, actions, rewards, dones, next_states = get_episode_values(valid_episodes)

        # Compute KL
        old_densities = old_policy.density(states)
        new_densities = new_policy.density(states)
        kl = kl_divergence(new_densities, old_densities).mean()
        mean_kl += kl

        # Compute Surrogate Loss
        advantages = compute_advantages(baseline, params['tau'], params['gamma'],
                                        rewards, dones, states, next_states)
        advantages = ch.normalize(advantages).detach()
        old_log_probs = old_densities.log_prob(actions).mean(dim=1, keepdim=True).detach()
        new_log_probs = new_densities.log_prob(actions).mean(dim=1, keepdim=True)
        mean_loss += trpo.policy_loss(new_log_probs, old_log_probs, advantages)

    mean_kl /= len(iter_replays)
    mean_loss /= len(iter_replays)
    return mean_loss, mean_kl

def trpo_a2c_loss(episodes, learner, baseline, gamma, tau, update_vf=True):
    # Get values to device
    states, actions, rewards, dones, next_states = get_episode_values(episodes)

    # Log-probabilities of the stored actions under the current policy
    log_probs = learner.log_prob(states, actions)

    # Compute advantages & normalize
    advantages = compute_advantages(baseline, tau, gamma, rewards, dones,
                                    states, next_states, update_vf=update_vf)
    advantages = ch.normalize(advantages).detach()

    # Compute the policy loss
    return a2c.policy_loss(log_probs, advantages)

def single_ppo_update(episodes, learner, baseline, params, anil=False):
    # Get values to device
    states, actions, rewards, dones, next_states = get_episode_values(episodes)

    # Update value function & compute advantages
    advantages = compute_advantages(baseline, params['tau'], params['gamma'],
                                    rewards, dones, states, next_states)
    advantages = ch.normalize(advantages, epsilon=1e-8).detach()

    # Log-probabilities of the stored actions under the pre-update policy
    with torch.no_grad():
        old_log_probs = learner.log_prob(states, actions)

    # Single PPO step: recompute the log-probabilities with gradients enabled
    new_log_probs = learner.log_prob(states, actions)

    # Compute the policy loss
    loss = ppo.policy_loss(new_log_probs, old_log_probs, advantages,
                           clip=params['ppo_clip_ratio'])

    # Adapt model based on the loss
    learner.adapt(loss, allow_unused=anil)
    return loss

def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    masses, new_values = agent(replay.state())
                    new_log_probs = masses.log_prob(replay.action())
                    new_values = new_values.view(-1, 1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()
                result['policy_losses'].append(policy_loss.item())

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()
                result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result

def main(
        experiment='dev',
        env_name='2DNavigation-v0',
        adapt_lr=0.1,
        meta_lr=0.01,
        adapt_steps=1,
        num_iterations=20,
        meta_bsz=10,
        adapt_bsz=10,
        tau=1.00,
        gamma=0.99,
        num_workers=1,
        seed=42,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        return gym.make(env_name)

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    meta_learner = l2l.MAML(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(meta_learner.parameters(), lr=meta_lr)
    all_rewards = []

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []
        policy.to('cpu')
        baseline.to('cpu')

        for task_config in tqdm(env.sample_tasks(meta_bsz), leave=False, desc='Data'):  # Samples a new config
            learner = meta_learner.clone()
            env.reset_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(learner, episodes=adapt_bsz)
                learner = fast_adapt_a2c(learner, train_episodes, adapt_lr,
                                         baseline, gamma, tau, first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            valid_episodes = task.run(learner, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(learner)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        all_rewards.append(adaptation_reward)
        print('adaptation_reward', adaptation_reward)

        # PPO meta-optimization
        for ppo_step in tqdm(range(10), leave=False, desc='Optim'):
            ppo_loss = 0.0
            for task_replays, old_policy in zip(iteration_replays, iteration_policies):
                train_replays = task_replays[:-1]
                valid_replay = task_replays[-1]

                # Fast adapt new policy, starting from the current init
                new_policy = meta_learner.clone()
                for train_episodes in train_replays:
                    new_policy = fast_adapt_a2c(new_policy, train_episodes,
                                                adapt_lr, baseline, gamma, tau)

                # Compute PPO loss between old and new clones
                states = valid_replay.state()
                actions = valid_replay.action()
                rewards = valid_replay.reward()
                dones = valid_replay.done()
                next_states = valid_replay.next_state()
                old_log_probs = old_policy.log_prob(states, actions).detach()
                new_log_probs = new_policy.log_prob(states, actions)
                advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                dones, states, next_states)
                advantages = ch.normalize(advantages).detach()
                ppo_loss += ppo.policy_loss(new_log_probs, old_log_probs,
                                            advantages, clip=0.1)

            ppo_loss /= meta_bsz
            opt.zero_grad()
            ppo_loss.backward()
            opt.step()

def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE).to(device)
    agent.apply(weights_init)
    actor_optimizer = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimizer = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    actor_scheduler = torch.optim.lr_scheduler.StepLR(actor_optimizer, step_size=2000, gamma=0.5)
    critic_scheduler = torch.optim.lr_scheduler.StepLR(critic_optimizer, step_size=2000, gamma=0.5)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    def get_action(state):
        return agent(state.to(device))

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            # batch = replay.sample(BATCH_SIZE).to(device)
            batch = replay.to(device)
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      batch.reward(),
                                                      batch.done(),
                                                      batch.value(),
                                                      torch.zeros(1).to(device))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, batch.reward(), batch.done())
                old_log_probs = batch.log_prob()

            new_values = batch.value()
            new_log_probs = batch.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(batch.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(batch.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimizer.zero_grad()
                policy_loss.backward()
                # nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
                actor_optimizer.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values, returns)
                critic_optimizer.zero_grad()
                value_loss.backward()
                # nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
                critic_optimizer.step()

            actor_scheduler.step()
            critic_scheduler.step()
            replay.empty()

def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size,
                        hidden_size=HIDDEN_SIZE,
                        action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            # Aliases kept for readability
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()

def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    actor = Actor(input_size=input_size, hidden_size=50, action_size=action_size)
    critic = Critic(input_size=input_size, hidden_size=50, action_size=action_size)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE_ACTOR)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE_CRITIC)
    replay = ch.ExperienceReplay()
    ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_size))

    def get_action(state):
        action = actor(state)
        action = action + ou_noise()[0]
        return action

    def get_random_action(state):
        action = torch.softmax(torch.randn(action_size), dim=0)
        return action

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
        replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            rewards = ch.normalize(batch.reward())
            # rewards = batch.reward() / 100.0  # changes convergence a lot
            value_loss = ch.algorithms.ddpg.state_value_loss(values,
                                                             next_values.detach(),
                                                             rewards,
                                                             batch.done(),
                                                             DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)

def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) > BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = ch.pg.generalized_advantage(DISCOUNT,
                                                         TRACE_DECAY,
                                                         replay.reward(),
                                                         replay.done(),
                                                         replay.value(),
                                                         torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = ch.td.discount(DISCOUNT, replay.reward(), replay.done())

            # Policy loss
            log_probs = replay.log_prob()
            policy_loss = ch.algorithms.a2c.policy_loss(log_probs, advantages)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['policy_losses'].append(policy_loss.item())

            # Value loss
            value_loss = ch.algorithms.a2c.state_value_loss(replay.value(), returns)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()
            result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result

def fast_adapt_ppo(task, learner, baseline, params, anil=False, render=False):
    # During inner-loop adaptation we do not store gradients for the network body
    if anil:
        learner.module.turn_off_body_grads()

    for step in range(params['adapt_steps']):
        # Collect adaptation / support episodes
        support_episodes = task.run(learner, episodes=params['adapt_batch_size'], render=render)

        # Get values to device
        states, actions, rewards, dones, next_states = get_episode_values(support_episodes)

        # Update value function & compute advantages
        advantages = compute_advantages(baseline, params['tau'], params['gamma'],
                                        rewards, dones, states, next_states)
        advantages = ch.normalize(advantages, epsilon=1e-8).detach()

        # Log-probabilities of the support actions under the pre-update policy
        with torch.no_grad():
            old_log_probs = learner.log_prob(states, actions)

        # Accumulate the inner-loop PPO loss over epochs
        av_loss = 0.0
        for ppo_epoch in range(params['ppo_epochs']):
            new_log_probs = learner.log_prob(states, actions)

            # Compute the policy loss
            loss = ppo.policy_loss(new_log_probs, old_log_probs, advantages,
                                   clip=params['ppo_clip_ratio'])

            # Adapt model based on the loss
            learner.adapt(loss, allow_unused=anil)
            av_loss += loss

    # We need to include the body network parameters for the query set
    if anil:
        learner.module.turn_on_body_grads()

    # Collect evaluation / query episodes
    query_episodes = task.run(learner, episodes=params['adapt_batch_size'])

    # Get values to device
    states, actions, rewards, dones, next_states = get_episode_values(query_episodes)

    # Update value function & compute advantages
    advantages = compute_advantages(baseline, params['tau'], params['gamma'],
                                    rewards, dones, states, next_states)
    advantages = ch.normalize(advantages, epsilon=1e-8).detach()

    # Log-probabilities on the query episodes before and after adaptation
    with torch.no_grad():
        old_log_probs = learner.log_prob(states, actions)
    new_log_probs = learner.log_prob(states, actions)

    # Compute the policy loss
    valid_loss = ppo.policy_loss(new_log_probs, old_log_probs, advantages,
                                 clip=params['ppo_clip_ratio'])

    # Calculate the average reward and success rate of the evaluation episodes
    query_rew = query_episodes.reward().sum().item() / params['adapt_batch_size']
    query_success_rate = get_ep_successes(query_episodes, params['max_path_length']) / params['adapt_batch_size']

    return valid_loss, query_rew, query_success_rate

def main(
        env_name='AntDirection-v1',
        adapt_lr=0.1,
        meta_lr=3e-4,
        adapt_steps=3,
        num_iterations=1000,
        meta_bsz=40,
        adapt_bsz=20,
        ppo_clip=0.3,
        ppo_steps=5,
        tau=1.00,
        gamma=0.99,
        eta=0.0005,
        adaptive_penalty=False,
        kl_target=0.01,
        num_workers=4,
        seed=421,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        env = gym.make(env_name)
        env = ch.envs.ActionSpaceScaler(env)
        return env

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.ActionSpaceScaler(env)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(input_size=env.state_size,
                              output_size=env.action_size,
                              hiddens=[64, 64],
                              activation='tanh')
    meta_learner = l2l.algorithms.MAML(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(meta_learner.parameters(), lr=meta_lr)

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []

        # Sample Trajectories
        for task_config in tqdm(env.sample_tasks(meta_bsz), leave=False, desc='Data'):
            clone = deepcopy(meta_learner)
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []
            task_policies = []

            # Fast Adapt
            for step in range(adapt_steps):
                for p in clone.parameters():
                    p.detach_().requires_grad_()
                task_policies.append(deepcopy(clone))
                train_episodes = task.run(clone, episodes=adapt_bsz)
                clone = fast_adapt_a2c(clone, train_episodes, adapt_lr,
                                       baseline, gamma, tau, first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            for p in clone.parameters():
                p.detach_().requires_grad_()
            task_policies.append(deepcopy(clone))
            valid_episodes = task.run(clone, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(task_policies)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        print('adaptation_reward', adaptation_reward)

        # ProMP meta-optimization
        for ppo_step in tqdm(range(ppo_steps), leave=False, desc='Optim'):
            promp_loss = 0.0
            kl_total = 0.0
            for task_replays, old_policies in zip(iteration_replays, iteration_policies):
                new_policy = meta_learner.clone()
                states = task_replays[0].state()
                actions = task_replays[0].action()
                rewards = task_replays[0].reward()
                dones = task_replays[0].done()
                next_states = task_replays[0].next_state()
                old_policy = old_policies[0]
                (old_density,
                 new_density,
                 old_log_probs,
                 new_log_probs) = precompute_quantities(states,
                                                        actions,
                                                        old_policy,
                                                        new_policy)
                advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                dones, states, next_states)
                advantages = ch.normalize(advantages).detach()
                for step in range(adapt_steps):
                    # Compute KL penalty
                    kl_pen = kl_divergence(old_density, new_density).mean()
                    kl_total += kl_pen.item()

                    # Update the clone
                    surr_loss = trpo.policy_loss(new_log_probs, old_log_probs, advantages)
                    new_policy.adapt(surr_loss)

                    # Move to next adaptation step
                    states = task_replays[step + 1].state()
                    actions = task_replays[step + 1].action()
                    rewards = task_replays[step + 1].reward()
                    dones = task_replays[step + 1].done()
                    next_states = task_replays[step + 1].next_state()
                    old_policy = old_policies[step + 1]
                    (old_density,
                     new_density,
                     old_log_probs,
                     new_log_probs) = precompute_quantities(states,
                                                            actions,
                                                            old_policy,
                                                            new_policy)

                    # Compute clip loss
                    advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                    dones, states, next_states)
                    advantages = ch.normalize(advantages).detach()
                    clip_loss = ppo.policy_loss(new_log_probs,
                                                old_log_probs,
                                                advantages,
                                                clip=ppo_clip)

                    # Combine into ProMP loss
                    promp_loss += clip_loss + eta * kl_pen

            kl_total /= meta_bsz * adapt_steps
            promp_loss /= meta_bsz * adapt_steps
            opt.zero_grad()
            promp_loss.backward(retain_graph=True)
            opt.step()

            # Adapt KL penalty based on desired target
            if adaptive_penalty:
                if kl_total < kl_target / 1.5:
                    eta /= 2.0
                elif kl_total > kl_target * 1.5:
                    eta *= 2.0

env = ch.envs.Torch(env)
policy = PolicyNet()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
replay = ch.ExperienceReplay()  # Manage transitions

for step in range(1000):
    state = env.reset()
    while True:
        mass = Categorical(policy(state))
        action = mass.sample()
        log_prob = mass.log_prob(action)
        next_state, reward, done, _ = env.step(action)

        # Build the ExperienceReplay
        replay.append(state, action, reward, next_state, done, log_prob=log_prob)
        if done:
            break
        else:
            state = next_state

    # Discounting and normalizing rewards
    rewards = ch.td.discount(0.99, replay.reward(), replay.done())
    rewards = ch.normalize(rewards)

    loss = -th.sum(replay.log_prob() * rewards)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    replay.empty()