def main(env='PongNoFrameskip-v4'):
    num_steps = 10000000
    th.set_num_threads(1)
    random.seed(SEED)
    th.manual_seed(SEED)
    np.random.seed(SEED)
    env = gym.make(env)
    env = envs.Logger(env, interval=1000)
    env = envs.OpenAIAtari(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(SEED)

    num_updates = num_steps // A2C_STEPS + 1
    policy = NatureCNN(env)
    optimizer = optim.RMSprop(policy.parameters(), lr=LR, alpha=0.99, eps=1e-5)
    # lr_schedule = optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1 - step / num_updates)
    get_action = lambda state: get_action_value(state, policy)

    for updt in range(num_updates):
        # Sample some transitions
        replay = env.run(get_action, steps=A2C_STEPS)

        # Update policy
        update(replay, optimizer, policy, env=env)
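# The A2C main() above delegates the learning step to an update() helper that
# is not shown in this snippet.  Below is a minimal sketch of what such a step
# could look like with cherry's built-in A2C losses, assuming the runner cached
# 'log_prob' and 'value' in the info dict (as get_action_value does further
# down); GAMMA and TAU are illustrative constants, not necessarily the
# example's hyper-parameters, and this is not the exact helper used here.
def a2c_update_sketch(replay, optimizer, policy, env=None):
    # Bootstrap from the value of the last next-state so truncated rollouts
    # are handled.
    with th.no_grad():
        next_state_value = policy(replay[-1].next_state)[1]
    advantages = ch.pg.generalized_advantage(GAMMA,
                                             TAU,
                                             replay.reward(),
                                             replay.done(),
                                             replay.value(),
                                             next_state_value)
    returns = ch.td.discount(GAMMA, replay.reward(), replay.done())
    policy_loss = ch.algorithms.a2c.policy_loss(replay.log_prob(),
                                                advantages.detach())
    value_loss = ch.algorithms.a2c.state_value_loss(replay.value(), returns)
    loss = policy_loss + 0.5 * value_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()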
def main(
        num_steps=10000000,
        env_name='PongNoFrameskip-v4',
        # env_name='BreakoutNoFrameskip-v4',
        seed=42):
    th.set_num_threads(1)
    random.seed(seed)
    th.manual_seed(seed)
    np.random.seed(seed)
    env = gym.make(env_name)
    env = envs.VisdomLogger(env, interval=10)
    env = envs.OpenAIAtari(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(seed)

    policy = NatureCNN(env)
    optimizer = optim.RMSprop(policy.parameters(), lr=LR, alpha=0.99, eps=1e-5)
    get_action = lambda state: get_action_value(state, policy)

    for step in range(num_steps // A2C_STEPS + 1):
        # Sample some transitions
        replay = env.run(get_action, steps=A2C_STEPS)
        env.log('random', random.random())
def main(env='PongNoFrameskip-v4'):
    random.seed(SEED)
    np.random.seed(SEED)
    th.manual_seed(SEED)
    env = gym.make(env)
    env = envs.OpenAIAtari(env)
    env = envs.Logger(env, interval=PPO_STEPS)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(SEED)

    policy = NatureCNN(env).to('cuda:0')
    optimizer = optim.Adam(policy.parameters(), lr=LR, eps=1e-5)
    num_updates = TOTAL_STEPS // PPO_STEPS + 1
    lr_schedule = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: 1 - epoch / num_updates)
    get_action = lambda state: get_action_value(state, policy)

    for epoch in range(num_updates):
        policy.cpu()
        replay = env.run(get_action, steps=PPO_STEPS, render=RENDER)
        replay = replay.cuda()
        policy.cuda()
        update(replay, optimizer, policy, env, lr_schedule)
def test_config(n_envs, base_env, use_torch, use_logger, return_info):
    config = 'n_envs' + str(n_envs) + '-base_env' + str(base_env) \
             + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
             + '-info' + str(return_info)
    if isinstance(base_env, str):
        env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
    else:
        def make_env():
            env = base_env()
            return env
        env_fns = [make_env for _ in range(n_envs)]
        env = vec_env = AsyncVectorEnv(env_fns)
    if use_logger:
        env = envs.Logger(env, interval=5, logger=self.logger)
    if use_torch:
        env = envs.Torch(env)
        policy = lambda x: ch.totensor(vec_env.action_space.sample())
    else:
        policy = lambda x: vec_env.action_space.sample()
    if return_info:
        agent = lambda x: (policy(x), {'policy': policy(x)[0]})
    else:
        agent = policy

    # Gather experience
    env = envs.Runner(env)
    replay = env.run(agent, steps=NUM_STEPS)

    # Pre-compute some shapes
    shape = (NUM_STEPS, n_envs)
    state_shape = vec_env.observation_space.sample()[0]
    if isinstance(state_shape, (int, float)):
        state_shape = tuple()
    else:
        state_shape = state_shape.shape
    action_shape = vec_env.action_space.sample()[0]
    if isinstance(action_shape, (int, float)):
        action_shape = (1, )
    else:
        action_shape = action_shape.shape
    done_shape = tuple()

    # Check shapes
    states = replay.state()
    self.assertEqual(states.shape, shape + state_shape, config)
    actions = replay.action()
    self.assertEqual(actions.shape, shape + action_shape, config)
    dones = replay.done()
    self.assertEqual(dones.shape, shape + done_shape, config)
    if return_info:
        policies = replay.policy()
        self.assertEqual(policies.shape, (NUM_STEPS, ) + action_shape, config)
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)
        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(replay.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
def main(env):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.ActionLambda(env, convert_discrete_to_continuous_action)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()
    agent = DQN(HIDDEN_SIZE, ACTION_DISCRETISATION)
    target_agent = create_target_network(agent)
    optimiser = optim.Adam(agent.parameters(), lr=LEARNING_RATE)

    def get_random_action(state):
        action = torch.tensor([[random.randint(0, ACTION_DISCRETISATION - 1)]])
        return action

    def get_action(state):
        # Original sampling (for unit test)
        # if random.random() < EPSILON:
        #     action = torch.tensor([[random.randint(0, ACTION_DISCRETISATION - 1)]])
        # else:
        #     action = agent(state)[1].argmax(dim=1, keepdim=True)
        # return action
        return agent(state)[0]

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            # Randomly sample a batch of experience
            batch = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(batch)

            # Compute targets
            target_values = target_agent(batch.next_state())[1].max(
                dim=1, keepdim=True)[0]
            target_values = batch.reward() + DISCOUNT * (
                1 - batch.done()) * target_values

            # Update Q-function by one step of gradient descent
            pred_values = agent(batch.state())[1].gather(1, batch.action())
            value_loss = F.mse_loss(pred_values, target_values)
            optimiser.zero_grad()
            value_loss.backward()
            optimiser.step()

        if step > UPDATE_START and step % TARGET_UPDATE_INTERVAL == 0:
            # Update target network
            target_agent = create_target_network(agent)
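# The discretised-DQN example above relies on two helpers defined elsewhere in
# the script: create_target_network() and convert_discrete_to_continuous_action().
# A minimal sketch of plausible implementations, assuming the usual imports
# (torch, copy) and that discrete indices should be mapped uniformly onto
# [-1, 1]; the exact mapping used by the original script may differ.
def create_target_network_sketch(network):
    # Frozen deep copy used as a slowly-updated bootstrapping target.
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False
    return target


def convert_discrete_to_continuous_action_sketch(action):
    # Map an integer action in {0, ..., ACTION_DISCRETISATION - 1}
    # uniformly onto the continuous range [-1, 1].
    return action.float() * 2 / (ACTION_DISCRETISATION - 1) - 1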
def main(env='Pendulum-v0'):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    actor = Actor(HIDDEN_SIZE, stochastic=False, layer_norm=True)
    critic = Critic(HIDDEN_SIZE, state_action=True, layer_norm=True)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()
    get_action = lambda s: (actor(s) + ACTION_NOISE * torch.randn(1, 1)).clamp(-1, 1)

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            value_loss = ch.algorithms.ddpg.state_value_loss(values,
                                                             next_values.detach(),
                                                             batch.reward(),
                                                             batch.done(),
                                                             DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
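# The DDPG main() above uses get_random_action() for warm-up exploration before
# UPDATE_START, but the helper is defined at module level and not shown here.
# The benchmark version of this script (further down) samples a uniform action
# in Pendulum's normalised [-1, 1] range; a matching one-line definition:
def get_random_action(state):
    # Uniform exploration over the normalised action range.
    return torch.tensor([[2 * random.random() - 1]])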
def main(env='HalfCheetahBulletEnv-v0'):
    random.seed(SEED)
    np.random.seed(SEED)
    th.manual_seed(SEED)
    env = gym.make(env)
    env = envs.VisdomLogger(env, interval=1000)
    env = envs.ActionSpaceScaler(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(SEED)

    log_alpha = th.zeros(1, requires_grad=True)
    if USE_AUTOMATIC_ENTROPY_TUNING:
        # Heuristic target entropy
        target_entropy = -np.prod(env.action_space.shape).item()
    else:
        target_entropy = TARGET_ENTROPY

    state_size = env.state_size
    action_size = env.action_size
    policy = Policy(input_size=state_size, output_size=action_size)
    critic_qf1 = MLP(input_size=state_size + action_size, output_size=1)
    critic_qf2 = MLP(input_size=state_size + action_size, output_size=1)
    target_qf1 = copy.deepcopy(critic_qf1)
    target_qf2 = copy.deepcopy(critic_qf2)

    policy_opt = optim.Adam(policy.parameters(), lr=ALL_LR)
    qf1_opt = optim.Adam(critic_qf1.parameters(), lr=ALL_LR)
    qf2_opt = optim.Adam(critic_qf2.parameters(), lr=ALL_LR)
    alpha_opt = optim.Adam([log_alpha], lr=ALL_LR)

    replay = ch.ExperienceReplay()
    get_action = lambda state: policy(state).rsample()

    for step in range(TOTAL_STEPS):
        # Collect next step
        ep_replay = env.run(get_action, steps=1, render=RENDER)

        # Update policy
        replay += ep_replay
        replay = replay[-REPLAY_SIZE:]
        if len(replay) > MIN_REPLAY:
            update(env, replay, policy, critic_qf1, critic_qf2, target_qf1,
                   target_qf2, log_alpha, policy_opt, qf1_opt, qf2_opt,
                   alpha_opt, target_entropy)
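# The SAC main() above passes log_alpha and target_entropy into an update()
# helper that is not shown in this snippet.  A minimal sketch of the automatic
# temperature update implied by USE_AUTOMATIC_ENTROPY_TUNING, assuming
# `log_probs` are the log-probabilities of freshly sampled actions for a batch;
# this is the standard SAC temperature loss, not necessarily the exact code in
# the example's update().
def temperature_update_sketch(log_alpha, alpha_opt, log_probs, target_entropy):
    # Adjust alpha so the policy's entropy tracks the heuristic target.
    alpha_loss = -(log_alpha * (log_probs + target_entropy).detach()).mean()
    alpha_opt.zero_grad()
    alpha_loss.backward()
    alpha_opt.step()
    return log_alpha.exp()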
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int)
    args = parser.parse_args()
    dist.init_process_group('gloo',
                            init_method='file:///home/seba-1511/.dist_init',
                            rank=args.local_rank,
                            world_size=16)
    rank = dist.get_rank()
    th.set_num_threads(1)
    random.seed(SEED + rank)
    th.manual_seed(SEED + rank)
    np.random.seed(SEED + rank)

    # env_name = 'CartPoleBulletEnv-v0'
    env_name = 'AntBulletEnv-v0'
    # env_name = 'RoboschoolAnt-v1'
    env = gym.make(env_name)
    env = envs.AddTimestep(env)
    if rank == 0:
        env = envs.Logger(env, interval=PPO_STEPS)
    env = envs.Normalizer(env, states=True, rewards=True)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(SEED)

    th.set_num_threads(1)
    policy = ActorCriticNet(env)
    optimizer = optim.Adam(policy.parameters(), lr=LR, eps=1e-5)
    num_updates = TOTAL_STEPS // PPO_STEPS + 1
    lr_schedule = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: 1 - epoch / num_updates)
    optimizer = Distributed(policy.parameters(), optimizer)
    get_action = lambda state: get_action_value(state, policy)

    for epoch in range(num_updates):
        # We use the Runner collector, but could've written our own
        replay = env.run(get_action, steps=PPO_STEPS, render=False)

        # Update policy
        update(replay, optimizer, policy, env, lr_schedule)
def main(num_steps=10000000,
         env_name='PongNoFrameskip-v4',
         # env_name='BreakoutNoFrameskip-v4',
         seed=42):
    th.set_num_threads(1)
    random.seed(seed)
    th.manual_seed(seed)
    np.random.seed(seed)
    env = gym.make(env_name)
    env = envs.Logger(env, interval=1000)
    env = envs.OpenAIAtari(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(seed)

    dqn = DQN(env)
    target_dqn = copy.deepcopy(dqn)
    optimizer = optim.RMSprop(dqn.parameters(), lr=LR, alpha=0.95,
                              eps=0.01, centered=True)
    replay = ch.ExperienceReplay()
    epsilon = EPSILON
    get_action = lambda state: epsilon_greedy(dqn(state), epsilon)

    for step in range(num_steps // UPDATE_FREQ + 1):
        # Sample some transitions
        ep_replay = env.run(get_action, steps=UPDATE_FREQ)
        replay += ep_replay

        if step * UPDATE_FREQ < 1e6:
            # Update epsilon
            epsilon -= 9.9e-7 * UPDATE_FREQ

        if step * UPDATE_FREQ > EXPLORATION_STEPS:
            # Only keep the last 1M transitions
            replay = replay[-REPLAY_SIZE:]

            # Update Q-function
            update(replay, optimizer, dqn, target_dqn, env=env)

            if step % TARGET_UPDATE_FREQ == 0:
                target_dqn.load_state_dict(dqn.state_dict())
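# The Atari DQN main() above relies on an epsilon_greedy() helper that is not
# shown in this snippet.  A minimal sketch, assuming dqn(state) returns a batch
# of Q-values and actions should come back as a (batch, 1) tensor of indices;
# the real helper may differ (e.g. it may also return info for the Runner).
def epsilon_greedy_sketch(q_values, epsilon):
    # With probability epsilon pick a uniformly random action,
    # otherwise pick the greedy (argmax) action.
    if random.random() < epsilon:
        return th.randint(q_values.size(1), (q_values.size(0), 1))
    return q_values.argmax(dim=1, keepdim=True)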
def test_training(self):
    """
    Issue: Depending on the computer architecture, PyTorch will represent
    floating-point numbers differently. For example, the above is the output
    from Seb's MacBook, but it doesn't exactly match the output on his desktop
    after episode 109. Saving weights / initializing using numpy didn't work
    either. Is there a workaround?

    To be more specific, it seems to be the way PyTorch stores FP, since when
    calling .tolist() on the weights, all decimals match. Or, it's an
    out-of-order execution issue. (We did try to use a single MKL/OMP thread.)
    """
    th.set_num_threads(1)
    random.seed(SEED)
    np.random.seed(SEED)
    th.manual_seed(SEED)
    env = gym.make('CartPole-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    policy = ActorCriticNet(env)
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    running_reward = 10.0
    get_action = lambda state: get_action_value(state, policy)
    best_running = 0.0

    for episode in range(0, 99):  # >100 breaks at episode 109 for torch == 1.2.0
        replay = env.run(get_action, episodes=1)
        update(replay, optimizer)
        running_reward = running_reward * 0.99 + len(replay) * 0.01
        if running_reward >= best_running:
            best_running = running_reward
        if (episode + 1) % 10 == 0:
            # print('ref:', GROUND_TRUTHS[episode // 10], 'curr:', running_reward)
            self.assertTrue(
                (GROUND_TRUTHS[episode // 10] - running_reward)**2 <= 1e-4)
def main(env='CliffWalking-v0'):
    env = gym.make(env)
    env = envs.Logger(env, interval=1000)
    env = envs.Torch(env)
    env = envs.Runner(env)

    agent = Agent(env)
    discount = 1.00
    optimizer = optim.SGD(agent.parameters(), lr=0.5, momentum=0.0)

    for t in range(1, 10000):
        transition = env.run(agent, steps=1)[0]

        curr_q = transition.q_action
        next_state = ch.onehot(transition.next_state, dim=env.state_size)
        next_q = agent.qf(next_state).max().detach()
        td_error = ch.temporal_difference(discount,
                                          transition.reward,
                                          transition.done,
                                          curr_q,
                                          next_q)

        optimizer.zero_grad()
        loss = td_error.pow(2).mul(0.5)
        loss.backward()
        optimizer.step()
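# The Q-learning loop above assumes an Agent whose __call__ returns an action
# plus an info dict containing 'q_action' (the Q-value of the chosen action),
# which the Runner then exposes as transition.q_action, and whose .qf maps a
# one-hot state to one Q-value per action.  A minimal sketch of such an agent
# with epsilon-greedy exploration, assuming torch is imported as th as in the
# other scripts; the exploration scheme and layer shapes are assumptions, not
# necessarily those of the original script.
class AgentSketch(th.nn.Module):

    def __init__(self, env, epsilon=0.1):
        super().__init__()
        self.epsilon = epsilon
        self.env = env
        # Tabular Q-function: one weight per (state, action) pair.
        self.qf = th.nn.Linear(env.state_size, env.action_size, bias=False)

    def forward(self, state):
        state = ch.onehot(state, dim=self.env.state_size)
        q_values = self.qf(state)
        if random.random() < self.epsilon:
            action = th.randint(self.env.action_size, (1,))
        else:
            action = q_values.argmax(dim=1)
        # Cache the chosen action's Q-value so it appears as transition.q_action.
        return action, {'q_action': q_values[0, action]}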
def main(env='PongNoFrameskip-v4'):
    num_steps = 5000000
    seed = 42
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int)
    args = parser.parse_args()
    dist.init_process_group('gloo',
                            init_method='file:///home/seba-1511/.dist_init_' + env,
                            rank=args.local_rank,
                            world_size=16)
    rank = dist.get_rank()
    th.set_num_threads(1)
    random.seed(seed + rank)
    th.manual_seed(seed + rank)
    np.random.seed(seed + rank)

    env = gym.make(env)
    if rank == 0:
        env = envs.Logger(env, interval=1000)
    env = envs.OpenAIAtari(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(seed + rank)

    policy = NatureCNN(env)
    optimizer = optim.RMSprop(policy.parameters(), lr=LR, alpha=0.99, eps=1e-5)
    optimizer = Distributed(policy.parameters(), optimizer)
    get_action = lambda state: get_action_value(state, policy)

    for step in range(num_steps // A2C_STEPS + 1):
        # Sample some transitions
        replay = env.run(get_action, steps=A2C_STEPS)

        # Update policy
        update(replay, optimizer, policy, env=env)
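# Both distributed scripts above read --local_rank and hard-code world_size=16,
# which matches the calling convention of PyTorch's torch.distributed.launch
# helper (one process per rank, each receiving its own --local_rank).
# A plausible invocation, assuming the script is saved as dist_a2c_atari.py
# (the filename is illustrative):
#
#   python -m torch.distributed.launch --nproc_per_node=16 dist_a2c_atari.py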
def main(env='MinitaurTrottingEnv-v0'):
    env = gym.make(env)
    env = envs.AddTimestep(env)
    env = envs.Logger(env, interval=PPO_STEPS)
    env = envs.Normalizer(env, states=True, rewards=True)
    env = envs.Torch(env)
    # env = envs.Recorder(env)
    env = envs.Runner(env)
    env.seed(SEED)

    th.set_num_threads(1)
    policy = ActorCriticNet(env)
    optimizer = optim.Adam(policy.parameters(), lr=LR, eps=1e-5)
    num_updates = TOTAL_STEPS // PPO_STEPS + 1
    lr_schedule = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: 1 - epoch / num_updates)
    get_action = lambda state: get_action_value(state, policy)

    for epoch in range(num_updates):
        # We use the Runner collector, but could've written our own
        replay = env.run(get_action, steps=PPO_STEPS, render=RENDER)

        # Update policy
        update(replay, optimizer, policy, env, lr_schedule)
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE).to(device)
    agent.apply(weights_init)
    actor_optimizer = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimizer = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    actor_scheduler = torch.optim.lr_scheduler.StepLR(actor_optimizer,
                                                      step_size=2000,
                                                      gamma=0.5)
    critic_scheduler = torch.optim.lr_scheduler.StepLR(critic_optimizer,
                                                       step_size=2000,
                                                       gamma=0.5)
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    def get_action(state):
        return agent(state.to(device))

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)
        if len(replay) >= BATCH_SIZE:
            # batch = replay.sample(BATCH_SIZE).to(device)
            batch = replay.to(device)
            with torch.no_grad():
                advantages = pg.generalized_advantage(
                    DISCOUNT,
                    TRACE_DECAY,
                    batch.reward(),
                    batch.done(),
                    batch.value(),
                    torch.zeros(1).to(device))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, batch.reward(), batch.done())
                old_log_probs = batch.log_prob()

            new_values = batch.value()
            new_log_probs = batch.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(batch.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(batch.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs, old_log_probs, advantages, clip=PPO_CLIP_RATIO)
                actor_optimizer.zero_grad()
                policy_loss.backward()
                # nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
                actor_optimizer.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimizer.zero_grad()
                value_loss.backward()
                # nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
                critic_optimizer.step()

            actor_scheduler.step()
            critic_scheduler.step()
            replay.empty()
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)
        if len(replay) >= BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    masses, new_values = agent(replay.state())
                    new_log_probs = masses.log_prob(replay.action())
                    new_values = new_values.view(-1, 1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()
                result['policy_losses'].append(policy_loss.item())

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()
                result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)
        if len(replay) > BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = ch.pg.generalized_advantage(DISCOUNT,
                                                         TRACE_DECAY,
                                                         replay.reward(),
                                                         replay.done(),
                                                         replay.value(),
                                                         torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = ch.td.discount(DISCOUNT, replay.reward(), replay.done())

            # Policy loss
            log_probs = replay.log_prob()
            policy_loss = ch.algorithms.a2c.policy_loss(log_probs, advantages)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['policy_losses'].append(policy_loss.item())

            # Value loss
            value_loss = ch.algorithms.a2c.state_value_loss(replay.value(), returns)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()
            result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result
def get_action_value(state, policy):
    mass, value = policy(state)
    action = mass.sample()
    info = {
        'log_prob': mass.log_prob(action),  # Cache log_prob for later
        'value': value,
    }
    return action, info


if __name__ == '__main__':
    env = gym.vector.make('CartPole-v0', num_envs=1)
    env = envs.Logger(env, interval=1000)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(SEED)

    policy = ActorCriticNet(env)
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    running_reward = 10.0
    get_action = lambda state: get_action_value(state, policy)

    for episode in count(1):
        # We use the Runner collector, but could've written our own
        replay = env.run(get_action, episodes=1)

        # Update policy
        update(replay, optimizer)

        # Compute termination criterion
def test_config(n_envs, n_episodes, base_env, use_torch, use_logger,
                return_info, retry):
    config = 'n_envs' + str(n_envs) + '-n_eps' + str(n_episodes) \
             + '-base_env' + str(base_env) \
             + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
             + '-info' + str(return_info)
    if isinstance(base_env, str):
        env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
    else:
        def make_env():
            env = base_env()
            return env
        env_fns = [make_env for _ in range(n_envs)]
        env = vec_env = AsyncVectorEnv(env_fns)
    if use_logger:
        env = envs.Logger(env, interval=5, logger=self.logger)
    if use_torch:
        env = envs.Torch(env)
        policy = lambda x: ch.totensor(vec_env.action_space.sample())
    else:
        policy = lambda x: vec_env.action_space.sample()
    if return_info:
        agent = lambda x: (policy(x), {
            'policy': policy(x)[0],
            'act': policy(x),
        })
    else:
        agent = policy

    # Gather experience
    env = envs.Runner(env)
    replay = env.run(agent, episodes=n_episodes)
    if retry:
        replay = env.run(agent, episodes=n_episodes)

    # Pre-compute some shapes
    shape = (len(replay), )
    state_shape = vec_env.observation_space.sample().shape[1:]
    action_shape = np.array(vec_env.action_space.sample())[0].shape
    if len(action_shape) == 0:
        action_shape = (1, )
    done_shape = (1, )

    # Check shapes
    states = replay.state()
    self.assertEqual(states.shape, shape + state_shape, config)
    actions = replay.action()
    self.assertEqual(actions.shape, shape + action_shape, config)
    dones = replay.done()
    self.assertEqual(dones.shape, shape + done_shape, config)
    if return_info:
        policies = replay.policy()
        self.assertEqual(policies.shape, shape + action_shape, config)
        acts = replay.act()
        self.assertEqual(acts.shape, (len(replay), n_envs) + action_shape, config)
def main(env='Pendulum-v0'):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    actor = SoftActor(HIDDEN_SIZE)
    critic_1 = Critic(HIDDEN_SIZE, state_action=True)
    critic_2 = Critic(HIDDEN_SIZE, state_action=True)
    value_critic = Critic(HIDDEN_SIZE)
    target_value_critic = create_target_network(value_critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(
        list(critic_1.parameters()) + list(critic_2.parameters()),
        lr=LEARNING_RATE)
    value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE)
    get_action = lambda state: actor(state).sample()

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            # Pre-compute some quantities
            states = batch.state()
            rewards = batch.reward()
            old_actions = batch.action()
            dones = batch.done()
            masses = actor(states)
            actions = masses.rsample()
            log_probs = masses.log_prob(actions)
            q_values = torch.min(critic_1(states, actions.detach()),
                                 critic_2(states, actions.detach())).view(-1, 1)

            # Compute Q losses
            v_next = target_value_critic(batch.next_state()).view(-1, 1)
            q_old_pred1 = critic_1(states, old_actions.detach()).view(-1, 1)
            q_old_pred2 = critic_2(states, old_actions.detach()).view(-1, 1)
            qloss1 = ch.algorithms.sac.action_value_loss(q_old_pred1,
                                                         v_next.detach(),
                                                         rewards,
                                                         dones,
                                                         DISCOUNT)
            qloss2 = ch.algorithms.sac.action_value_loss(q_old_pred2,
                                                         v_next.detach(),
                                                         rewards,
                                                         dones,
                                                         DISCOUNT)

            # Update Q-functions by one step of gradient descent
            qloss = qloss1 + qloss2
            critics_optimiser.zero_grad()
            qloss.backward()
            critics_optimiser.step()

            # Update V-function by one step of gradient descent
            v_pred = value_critic(batch.state()).view(-1, 1)
            vloss = ch.algorithms.sac.state_value_loss(v_pred,
                                                       log_probs.detach(),
                                                       q_values.detach(),
                                                       alpha=ENTROPY_WEIGHT)
            value_critic_optimiser.zero_grad()
            vloss.backward()
            value_critic_optimiser.step()

            # Update policy by one step of gradient ascent
            q_actions = critic_1(batch.state(), actions).view(-1, 1)
            policy_loss = ch.algorithms.sac.policy_loss(log_probs,
                                                        q_actions,
                                                        alpha=ENTROPY_WEIGHT)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target value network
            ch.models.polyak_average(target_value_critic,
                                     value_critic,
                                     POLYAK_FACTOR)
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    result = {
        'rewards': [],
        'plosses': [],
        'vlosses': [],
        'pweights': [],
        'vweights': [],
        'target_vweights': [],
        'target_pweights': [],
    }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)

    actor = Actor(HIDDEN_SIZE, stochastic=False, layer_norm=True)
    critic = Critic(HIDDEN_SIZE, state_action=True, layer_norm=True)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    def get_random_action(state):
        return torch.tensor([[2 * random.random() - 1]])

    def get_action(state):
        action = actor(state) + ACTION_NOISE * torch.randn(1, 1)
        return torch.clamp(action, min=-1, max=1)

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            result['rewards'].append(replay.reward()[-1].item())
            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            value_loss = ch.algorithms.ddpg.state_value_loss(values,
                                                             next_values.detach(),
                                                             batch.reward(),
                                                             batch.done(),
                                                             DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()
            result['vlosses'].append(value_loss.item())

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['plosses'].append(policy_loss.item())

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)

    result['pweights'] = list(actor.parameters())
    result['target_pweights'] = list(target_actor.parameters())
    result['vweights'] = list(critic.parameters())
    result['target_vweights'] = list(target_critic.parameters())
    return result
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    result = {
        'rewards': [],
        'plosses': [],
        'vlosses': [],
        'qlosses': [],
        'pweights': [],
        'vweights': [],
        'vweights_target': [],
        'qweights1': [],
        'qweights2': [],
    }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    actor = SoftActor(HIDDEN_SIZE)
    critic_1 = Critic(HIDDEN_SIZE, state_action=True)
    critic_2 = Critic(HIDDEN_SIZE, state_action=True)
    value_critic = Critic(HIDDEN_SIZE)
    target_value_critic = create_target_network(value_critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(list(critic_1.parameters()) +
                                   list(critic_2.parameters()),
                                   lr=LEARNING_RATE)
    value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE)

    def get_random_action(state):
        return torch.tensor([[2 * random.random() - 1]])

    def get_action(state):
        return actor(state).sample()

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
            replay = replay[-REPLAY_SIZE:]
            result['rewards'].append(replay.reward()[-1].item())

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            # Pre-compute some quantities
            masses = actor(batch.state())
            actions = masses.rsample()
            log_probs = masses.log_prob(actions)
            q_values = torch.min(critic_1(batch.state(), actions.detach()),
                                 critic_2(batch.state(), actions.detach())).view(-1, 1)

            # Compute Q losses
            v_next = target_value_critic(batch.next_state()).view(-1, 1)
            q_old_pred1 = critic_1(batch.state(), batch.action().detach()).view(-1, 1)
            q_old_pred2 = critic_2(batch.state(), batch.action().detach()).view(-1, 1)
            qloss1 = ch.algorithms.sac.action_value_loss(q_old_pred1,
                                                         v_next.detach(),
                                                         batch.reward(),
                                                         batch.done(),
                                                         DISCOUNT)
            qloss2 = ch.algorithms.sac.action_value_loss(q_old_pred2,
                                                         v_next.detach(),
                                                         batch.reward(),
                                                         batch.done(),
                                                         DISCOUNT)

            # Update Q-functions by one step of gradient descent
            qloss = qloss1 + qloss2
            critics_optimiser.zero_grad()
            qloss.backward()
            critics_optimiser.step()
            result['qlosses'].append(qloss.item())

            # Update V-function by one step of gradient descent
            v_pred = value_critic(batch.state()).view(-1, 1)
            vloss = ch.algorithms.sac.state_value_loss(v_pred,
                                                       log_probs.detach(),
                                                       q_values.detach(),
                                                       alpha=ENTROPY_WEIGHT)
            value_critic_optimiser.zero_grad()
            vloss.backward()
            value_critic_optimiser.step()
            result['vlosses'].append(vloss.item())

            # Update policy by one step of gradient ascent
            q_actions = critic_1(batch.state(), actions).view(-1, 1)
            policy_loss = ch.algorithms.sac.policy_loss(log_probs,
                                                        q_actions,
                                                        alpha=ENTROPY_WEIGHT)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['plosses'].append(policy_loss.item())

            # Update target value network
            ch.models.polyak_average(target_value_critic,
                                     value_critic,
                                     POLYAK_FACTOR)

    result['pweights'] = list(actor.parameters())
    result['vweights'] = list(value_critic.parameters())
    result['vweights_target'] = list(target_value_critic.parameters())
    result['qweights1'] = list(critic_1.parameters())
    result['qweights2'] = list(critic_2.parameters())
    return result