def compatible_2():
    # FruitAPI environment
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    fruit_env = GymEnvironment(env_name='CartPole-v1')
    state = fruit_env.get_state_space()
    print(state.get_range())
    print(tuple(state.get_shape()))
    print(fruit_env.get_action_space().get_range())
    print(fruit_env.reset())
    print(fruit_env.get_state())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')

    # Tensorforce OpenAIGym environment
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    env = OpenAIGym(level='CartPole-v1')
    state = env.states()
    print(state)
    print(env.actions())
    print(env.reset())
    print(env.execute(0))
    print(env.max_episode_timesteps())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')

    # TFEnvironment wrapper around the FruitAPI environment, probed with the same
    # sequence of calls as the OpenAIGym block above
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    env = TFEnvironment(fruit_environment=fruit_env)
    print(env.states())
    print(env.actions())
    print(env.reset())
    print(env.execute(0))
    print(env.max_episode_timesteps())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
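# The calls above assume GymEnvironment, OpenAIGym and TFEnvironment are already
# imported. A minimal sketch of the imports and an entry point follows; the module
# paths are assumptions that depend on the installed FruitAPI and Tensorforce
# versions and may need adjusting.
from fruit.envs.gym import GymEnvironment           # assumed FruitAPI path
from tensorforce.environments import OpenAIGym      # assumed Tensorforce path
from fruit.plugins.tf_plugin import TFEnvironment   # assumed FruitAPI plugin path

if __name__ == '__main__':
    compatible_2()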
def test_quickstart(self):
    sys.stdout.write('\nQuickstart:\n')
    sys.stdout.flush()

    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(), actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33, optimization_steps=10,
        # MLP baseline
        baseline_mode='states', baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step', optimizer=dict(type='adam', learning_rate=1e-4), num_steps=5
        ),
        # Other parameters
        discount=0.99, entropy_regularization=1e-2, gae_lambda=None,
        likelihood_ratio_clipping=0.2
    )

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Function handle called after each finished episode; returning False stops the
    # run (here: once the mean reward over the last 100 episodes exceeds 180)
    def callback(r):
        return float(np.mean(r.episode_rewards[-100:])) <= 180.0

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200, callback=callback)
    runner.close()

    if float(np.mean(runner.episode_rewards[-100:])) <= 180.0:
        sys.stdout.write('Test failed, exceeding {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=False)
    else:
        sys.stdout.write('Test passed after {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=True)
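# A minimal harness, included as an illustrative sketch, for running the test above
# with the standard unittest runner. It assumes test_quickstart is available at module
# level as shown; the class name QuickstartTest is hypothetical, not part of the
# original suite.
import unittest

class QuickstartTest(unittest.TestCase):
    # Reuse the function defined above as a test method of this class
    test_quickstart = test_quickstart

if __name__ == '__main__':
    unittest.main()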
def main():
    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(), actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33, optimization_steps=10,
        # MLP baseline
        baseline_mode='states', baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step', optimizer=dict(type='adam', learning_rate=1e-4), num_steps=5
        ),
        # Other parameters
        discount=0.99, entropy_regularization=1e-2, gae_lambda=None,
        likelihood_ratio_clipping=0.2
    )

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200)
    runner.close()
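# Optional post-training summary, an illustrative sketch rather than part of the
# original script: the runner exposes per-episode statistics (episode_rewards,
# episode_timesteps, episode) that the test and CLI examples in this section also
# use, so a brief report can be printed before closing the runner.
import numpy as np

def report(runner):
    # Mean reward over the last 100 episodes, mirroring the 180-reward success
    # criterion used in the quickstart test above
    mean_reward = float(np.mean(runner.episode_rewards[-100:]))
    print('Episodes run: {}'.format(runner.episode))
    print('Mean reward over last 100 episodes: {:.2f}'.format(mean_reward))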
def main():
    parser = argparse.ArgumentParser()

    # Gym arguments
    parser.add_argument('-g', '--gym', help="Gym environment id")
    parser.add_argument('-i', '--import-modules', help="Import module(s) required for gym environment")
    parser.add_argument('--monitor', type=str, default=None, help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization")

    # Agent arguments
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")

    # Runner arguments
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically")

    args = parser.parse_args()

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    environment = OpenAIGym(
        gym_id=args.gym,
        monitor=args.monitor,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video,
        visualize=args.visualize
    )

    agent = Agent.from_spec(
        spec=args.agent,
        states=environment.states(),
        actions=environment.actions(),
        network=args.network
    )

    runner = Runner(agent=agent, environment=environment)

    def callback(r):
        if r.episode % 100 == 0:
            print("================================================\n"
                  "Average secs/episode over 100 episodes: {time:0.2f}\n"
                  "Average steps/sec over 100 episodes: {timestep:0.2f}\n"
                  "Average reward over 100 episodes: {reward100:0.2f}\n"
                  "Average reward over 500 episodes: {reward500:0.2f}".format(
                      time=(sum(r.episode_times[-100:]) / 100.0),
                      timestep=(sum(r.episode_timesteps[-100:]) / sum(r.episode_times[-100:])),
                      reward100=(sum(r.episode_rewards[-100:]) / min(100.0, r.episode)),
                      reward500=(sum(r.episode_rewards[-500:]) / min(500.0, r.episode))))
        return True

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        callback=callback
    )
    runner.close()
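# Entry-point guard plus an illustrative invocation; the script file name and the
# agent/network specification paths in the example command are hypothetical.
if __name__ == '__main__':
    # Example (hypothetical paths):
    #   python openai_gym_runner.py -g CartPole-v1 -a configs/ppo.json \
    #       -n configs/mlp2_network.json -e 1000 -m 200
    main()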