import numpy as np

# CartPoleEnv is assumed to be importable from the project's environment module.


class CartPole:
    """Black-box fitness wrapper around CartPoleEnv for a 5-dimensional linear policy."""

    def __init__(self, gravity):
        self.dim = 5
        self.env = CartPoleEnv()
        self.env.gravity = gravity

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def action(self, observation, x):
        # Map the unit-cube parameters to [-5, 5]: four weights and one bias
        x = x * 10 - 5
        w = x[:4]
        b = x[4]
        return int(self.sigmoid(np.sum(observation * w) + b) > 0.5)

    def fitness(self, x):
        # Negated episode return, so that minimising this value maximises reward
        fitness = 0
        observation = self.env.reset()
        for t in range(200):
            action = self.action(observation, x)
            observation, reward, done, info = self.env.step(action)
            fitness += reward
            if done:
                break
        return -fitness

    def __del__(self):
        self.env.close()
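# Minimal usage sketch (not part of the original code): random search over the
# 5-dimensional unit cube expected by CartPole.fitness. The gravity value, the
# sample budget, and the search strategy are illustrative assumptions.
if __name__ == '__main__':
    problem = CartPole(gravity=9.8)
    best_x, best_f = None, float('inf')
    for _ in range(100):
        x = np.random.rand(problem.dim)  # Candidate parameters in [0, 1]^5
        f = problem.fitness(x)           # Negated episode return (lower is better)
        if f < best_f:
            best_x, best_f = x, f
    print('Best episode return:', -best_f)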
import unittest

# CartPoleEnv is assumed to be importable from the project's environment module.


class TestCartPoleEnv(unittest.TestCase):

    def setUp(self):
        self.env = CartPoleEnv()

    @unittest.skip('Skipping due to cartpole.out')
    def test_reset_return_type(self):
        """Check if a list is returned"""
        self.assertIsInstance(self.env.reset(), list)

    @unittest.skip('Skipping due to cartpole.out')
    def test_step_return_type(self):
        """Check if a 3-tuple of list, float, and bool is returned"""
        env = CartPoleEnv()
        env.reset()
        obs, reward, done = env.step(action=-1)
        self.assertIsInstance(obs, list)
        self.assertIsInstance(reward, float)
        self.assertIsInstance(done, bool)

    @unittest.skip('Skipping due to cartpole.out')
    def test_obs_dim_return_type(self):
        """Check if an integer is returned"""
        self.assertIsInstance(self.env.obs_dim(), int)

    @unittest.skip('Skipping due to cartpole.out')
    def test_reset_return_dims(self):
        """Check if a 4-dimensional list is returned"""
        self.assertEqual(len(self.env.reset()), 4)

    @unittest.skip('Skipping due to cartpole.out')
    def test_obs_dim_return_value(self):
        """Check if 4 is returned"""
        env = CartPoleEnv()
        env.reset()
        self.assertEqual(env.obs_dim(), 4)

    def test_step_wrong_input(self):
        """Check if assertion is raised with wrong input"""
        with self.assertRaises(AssertionError):
            self.env.step(43892.42)

    @unittest.skip('Skipping due to cartpole.out')
    def test_done_signal_per_episode(self):
        """Check if done signal is triggered at the end of the episode"""
        env = CartPoleEnv()
        env.reset()
        for t in range(500):
            _, _, done = env.step(action=-1)
            if t != 499:
                # Must be False before the final step of the 500-step episode
                self.assertFalse(done)
        # Must be True at the end of the episode
        self.assertTrue(done)
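# Optional runner (not in the original fragment): lets the test module be executed
# directly, e.g. `python test_cartpole_env.py`; the module filename is an assumption.
# The suite can equally be run with `python -m unittest`.
if __name__ == '__main__':
    unittest.main()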
import csv
import random
import sys

# build_parser, CrossEntropyMethod, CartPoleEnv, LinearModel, noisy_evaluation,
# run_episode and update_model are assumed to be defined elsewhere in the project.


def main():
    # Build parser
    parser = build_parser()
    options = parser.parse_args()

    # Set random seed
    random.seed(options.random_seed)

    # Get CEM methods
    cem = CrossEntropyMethod(N=options.n, p=options.p)

    # Create environment object
    env = CartPoleEnv()

    # Create linear model
    model = LinearModel(dims=env.obs_dim())

    # Initialize parameters
    params = model.params

    # Episode scores
    win_ratio_list = []
    successful_episodes = 0

    for i_episode in range(options.episodes):
        sys.stderr.write('\n###### Episode {} of {} ###### \n'.format(i_episode + 1, options.episodes))

        # Sample N parameter vectors
        noisy_params = cem.sample_parameters(params)

        # Evaluate the sampled vectors
        rewards = [noisy_evaluation(model, env, options.step_size, i) for i in noisy_params]

        # Get elite parameters based on reward
        elite_params = cem.get_elite_parameters(noisy_params, rewards)

        # Update parameters
        params = cem.get_parameter_mean(elite_params)

        episode_reward = run_episode(model=update_model(model, params), env=env,
                                     steps=options.step_size, print_step=options.print_step)
        win_ratio = episode_reward / options.step_size
        sys.stderr.write('Episode reward: {} ({:.2f}%)\n'.format(episode_reward, 100 * win_ratio))

        # Save win_ratio
        win_ratio_list.append(win_ratio)
        if episode_reward >= options.step_size:
            successful_episodes += 1

    sys.stderr.write('\nFinal params: {}'.format(model.params))
    sys.stderr.write('\nRun finished. {} out of {} episodes ({:.2f}%) have a reward of at least {}\n'.format(
        successful_episodes, options.episodes, 100 * successful_episodes / options.episodes, options.step_size))

    # If output_file is given, write scores to disk
    if options.output_file:
        sys.stderr.write('\nWriting scores to file: {}.csv...\n'.format(options.output_file))
        with open(options.output_file + '.csv', 'w', newline='') as f:
            wr = csv.writer(f)
            wr.writerow(win_ratio_list)
        sys.stderr.write('Done!\n')

    # Terminate the host program
    env.terminate()
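# Illustrative sketch (not the project's implementation) of a cross-entropy-method
# helper matching the interface used in main(): sample_parameters() draws N
# Gaussian-perturbed copies of the current mean, get_elite_parameters() keeps the
# top fraction p by reward, and get_parameter_mean() averages the elites. The
# noise scale sigma is an assumption.
import numpy as np


class CrossEntropyMethodSketch:
    def __init__(self, N, p, sigma=0.5):
        self.N = N          # Number of candidate parameter vectors per episode
        self.p = p          # Fraction of candidates kept as the elite set
        self.sigma = sigma  # Standard deviation of the sampling noise

    def sample_parameters(self, params):
        params = np.asarray(params, dtype=float)
        return [params + self.sigma * np.random.randn(*params.shape) for _ in range(self.N)]

    def get_elite_parameters(self, noisy_params, rewards):
        n_elite = max(1, int(self.N * self.p))
        order = np.argsort(rewards)[::-1]  # Highest reward first
        return [noisy_params[i] for i in order[:n_elite]]

    def get_parameter_mean(self, elite_params):
        return np.mean(elite_params, axis=0)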
import torch

# CartPoleEnv is assumed to be importable from the project's environment module.


def evaluate_agent(agent, episodes, return_trajectories=False, seed=1):
    env = CartPoleEnv()
    env.seed(seed)

    returns, trajectories = [], []
    for _ in range(episodes):
        states, actions, rewards = [], [], []
        state, terminal = env.reset(), False
        while not terminal:
            with torch.no_grad():
                policy, _ = agent(state)
                action = policy.logits.argmax(dim=-1)  # Pick action greedily
                state, reward, terminal = env.step(action)
                if return_trajectories:
                    states.append(state)
                    actions.append(action)
                rewards.append(reward)
        returns.append(sum(rewards))
        if return_trajectories:
            # Collect trajectory data (including terminal signal, which may be needed for offline learning)
            terminals = torch.cat([torch.ones(len(rewards) - 1), torch.zeros(1)])
            trajectories.append(dict(states=torch.cat(states),
                                     actions=torch.cat(actions),
                                     rewards=torch.tensor(rewards, dtype=torch.float32),
                                     terminals=terminals))
    return (returns, trajectories) if return_trajectories else returns
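# Hypothetical usage (assumptions: a trained `agent` is available and the file name
# matches what the imitation-learning script below expects): evaluate the greedy
# policy and save its trajectories in the list-of-dicts format loaded as expert data.
#
# returns, trajectories = evaluate_agent(agent, episodes=10, return_trajectories=True)
# print('Mean return:', sum(returns) / len(returns))
# torch.save(trajectories, 'expert_trajectories.pth')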
parser.add_argument('--imitation-epochs', type=int, default=5, metavar='IE',
                    help='Imitation learning epochs')
parser.add_argument('--imitation-replay-size', type=int, default=1, metavar='IRS',
                    help='Imitation learning trajectory replay size')
args = parser.parse_args()
torch.manual_seed(args.seed)
os.makedirs('results', exist_ok=True)

# Set up environment and models
env = CartPoleEnv()
env.seed(args.seed)
agent = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.hidden_size)
agent_optimiser = optim.RMSprop(agent.parameters(), lr=args.learning_rate)
if args.imitation:
    # Set up expert trajectories dataset
    expert_trajectories = torch.load('expert_trajectories.pth')
    expert_trajectories = {k: torch.cat([trajectory[k] for trajectory in expert_trajectories], dim=0)
                           for k in expert_trajectories[0].keys()}  # Flatten expert trajectories
    expert_trajectories = TransitionDataset(expert_trajectories)
    # Set up discriminator
    if args.imitation in ['AIRL', 'GAIL']:
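# Illustrative sketch (not the repository's actual class) of what a dataset over the
# flattened expert tensors could look like: each item is one transition, indexed into
# the concatenated states/actions/rewards/terminals tensors. Everything beyond the
# keys used above is an assumption.
from torch.utils.data import Dataset


class TransitionDatasetSketch(Dataset):
    def __init__(self, transitions):
        self.states = transitions['states']
        self.actions = transitions['actions']
        self.rewards = transitions['rewards']
        self.terminals = transitions['terminals']

    def __len__(self):
        return self.states.size(0)

    def __getitem__(self, idx):
        return dict(states=self.states[idx], actions=self.actions[idx],
                    rewards=self.rewards[idx], terminals=self.terminals[idx])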