def __init__(self, fruit_environment, **kwargs):
    super().__init__()
    if isinstance(fruit_environment, BaseEnvironment):
        self.environment = fruit_environment
        if self.environment.get_processor() is not None:
            raise ValueError('Do not use a state processor with Tensorforce!')
    else:
        raise ValueError('Environment must be from FruitAPI!')

    self.__max_episode_timesteps = False
    state_space = self.environment.get_state_space()
    self.states_spec = dict(type='float', shape=tuple(state_space.get_shape()))
    action_ranges, _ = self.environment.get_action_space().get_range()
    self.actions_spec = dict(type='int', num_values=len(action_ranges))
    self.__timesteps = 0

    if isinstance(fruit_environment, GymEnvironment):
        self.__max_episode_timesteps = None
        _, self.__max_episode_timesteps = OpenAIGym.create_level(
            level=self.environment.env_name,
            max_episode_timesteps=self.__max_episode_timesteps,
            reward_threshold=None, tags=None, **kwargs)
        self.states_spec = OpenAIGym.specs_from_gym_space(
            space=self.environment.env.observation_space, ignore_value_bounds=True)
        self.actions_spec = OpenAIGym.specs_from_gym_space(
            space=self.environment.env.action_space, ignore_value_bounds=False)
def test_quickstart(self):
    sys.stdout.write('\nQuickstart:\n')
    sys.stdout.flush()

    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(), actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33, optimization_steps=10,
        # MLP baseline
        baseline_mode='states', baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step', optimizer=dict(type='adam', learning_rate=1e-4), num_steps=5),
        # Other parameters
        discount=0.99, entropy_regularization=1e-2, gae_lambda=None,
        likelihood_ratio_clipping=0.2)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Function handle called after each finished episode
    def callback(r):
        return float(np.mean(r.episode_rewards[-100:])) <= 180.0

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200, callback=callback)
    runner.close()

    if float(np.mean(runner.episode_rewards[-100:])) <= 180.0:
        sys.stdout.write('Test failed, exceeding {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=False)
    else:
        sys.stdout.write('Test passed after {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=True)
def get_states(self):
    state = self.environment.get_state()
    if isinstance(self.environment, GymEnvironment):
        state = OpenAIGym.flatten_state(state=state, states_spec=self.states_spec)
    else:
        # Non-Gym FruitAPI environments return raw pixel states; scale them to [0, 1]
        state = state.astype(dtype=np.float32) / 255.0
    return state
def main():
    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(), actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33, optimization_steps=10,
        # MLP baseline
        baseline_mode='states', baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step', optimizer=dict(type='adam', learning_rate=1e-4), num_steps=5),
        # Other parameters
        discount=0.99, entropy_regularization=1e-2, gae_lambda=None,
        likelihood_ratio_clipping=0.2)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200)
    runner.close()
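# Hypothetical entry-point guard (not part of the original snippet), so the quickstart main()
# above can be run directly as a script:
if __name__ == '__main__':
    main()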
def __init__(self, level, visualize=False, monitor_directory=None, **kwargs):
    import retro

    self._max_episode_timesteps = False
    self.level = level
    self.visualize = visualize

    self.environment = retro.make(game=self.level, **kwargs)
    if monitor_directory is not None:
        self.environment = gym.wrappers.Monitor(
            env=self.environment, directory=monitor_directory)

    self.states_spec = OpenAIGym.specs_from_gym_space(
        space=self.environment.observation_space,
        ignore_value_bounds=True  # TODO: not ignore?
    )
    self.actions_spec = OpenAIGym.specs_from_gym_space(
        space=self.environment.action_space, ignore_value_bounds=False)
def execute(self, actions):
    if isinstance(self.environment, GymEnvironment):
        actions = OpenAIGym.unflatten_action(action=actions)

    reward = self.environment.step(actions)
    terminal = self.environment.is_terminal()
    self.__timesteps += 1

    if self.__max_episode_timesteps is not None:
        if self.__timesteps > self.__max_episode_timesteps:
            # terminal == 2 signals an episode aborted by the timestep limit
            # (not a true terminal state)
            terminal = 2
        elif terminal:
            terminal = 1
        else:
            terminal = 0

    states = self.get_states()
    return states, terminal, reward
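# Usage sketch (not part of the original source): a minimal random-action rollout through the
# wrapper defined above, assuming these methods belong to the TFEnvironment class used in
# compatible_2() below and that the wrapper also exposes the standard Tensorforce reset() method.
import numpy as np

def random_rollout_sketch():
    fruit_env = GymEnvironment(env_name='CartPole-v1')   # FruitAPI environment
    env = TFEnvironment(fruit_environment=fruit_env)     # Tensorforce-compatible wrapper

    states = env.reset()
    terminal = 0
    total_reward = 0.0
    while not terminal:
        # actions_spec is dict(type='int', num_values=...) as built in __init__ above
        action = np.random.randint(env.actions_spec['num_values'])
        states, terminal, reward = env.execute(action)
        total_reward += reward
    print('Random rollout reward:', total_reward)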
def main():
    # Create a log for MineRL
    # logging.basicConfig(level=logging.DEBUG)

    # Create the environment
    ENV_NAME = "MineRLTreechop-v0"  # Pre-defined or custom environment
    env = gym.make(ENV_NAME)
    environment = OpenAIGym(env)

    agent = Agent.create(
        agent='ac', environment=environment, max_episode_timesteps=8000,
        exploration=0.03, critic_optimizer='evolutionary')

    sum_rewards = 0.0
    rewards_by_episode = []
    for episode in range(200):
        states = environment.reset()
        terminal = False
        print("Training episode " + str(episode))
        while not terminal:
            # evaluation=True: act deterministically without storing experience for training
            actions = agent.act(states=states, evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
            # print(actions)
        print("Sum reward so far: " + str(sum_rewards))
        rewards_by_episode.append((episode, sum_rewards))
        print("Ending episode ", episode)

    print(rewards_by_episode)
    print('Mean episode reward:', sum_rewards / 200)
    agent.close()
    environment.close()
def main():
    parser = argparse.ArgumentParser()

    # Gym arguments
    parser.add_argument('-g', '--gym', help="Gym environment id")
    parser.add_argument('-i', '--import-modules', help="Import module(s) required for gym environment")
    parser.add_argument('--monitor', type=str, default=None, help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization")

    # Agent arguments
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")

    # Runner arguments
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically")

    args = parser.parse_args()

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    environment = OpenAIGym(
        gym_id=args.gym, monitor=args.monitor, monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video, visualize=args.visualize)

    agent = Agent.from_spec(
        spec=args.agent, states=environment.states(), actions=environment.actions(),
        network=args.network)

    runner = Runner(agent=agent, environment=environment)

    def callback(r):
        if r.episode % 100 == 0:
            print("================================================\n"
                  "Average secs/episode over 100 episodes: {time:0.2f}\n"
                  "Average steps/sec over 100 episodes: {timestep:0.2f}\n"
                  "Average reward over 100 episodes: {reward100:0.2f}\n"
                  "Average reward over 500 episodes: {reward500:0.2f}".format(
                      time=(sum(r.episode_times[-100:]) / 100.0),
                      timestep=(sum(r.episode_timesteps[-100:]) / sum(r.episode_times[-100:])),
                      reward100=(sum(r.episode_rewards[-100:]) / min(100.0, r.episode)),
                      reward500=(sum(r.episode_rewards[-500:]) / min(500.0, r.episode))))
        return True

    runner.run(
        num_timesteps=args.timesteps, num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic, callback=callback)
    runner.close()
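# Hypothetical invocation of the runner script above (the file name and config paths are
# placeholders, not from the original source); only flags defined by the parser are used:
#
#   python openai_gym_runner.py -g CartPole-v1 -a configs/ppo.json -n configs/mlp2_network.json \
#       -e 1000 -m 200 --monitor results/ --visualize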
def conversion_2():
    env = OpenAIGym(level='FrozenLake8x8-v0', visualize=False)
    create_random_agent(env)
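# create_random_agent() is not defined in this snippet. A minimal sketch of what such a helper
# might look like, assuming the standard Tensorforce environment interface (reset/execute) and a
# discrete action space exposed as dict(type='int', num_values=...); the helper name and the
# episodes parameter are assumptions:
import numpy as np

def create_random_agent_sketch(env, episodes=5):
    for _ in range(episodes):
        states = env.reset()
        terminal = False
        episode_reward = 0.0
        while not terminal:
            # Sample a uniformly random discrete action
            action = np.random.randint(env.actions()['num_values'])
            states, terminal, reward = env.execute(actions=action)
            episode_reward += reward
        print('Episode reward:', episode_reward)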
def compatible_2():
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    fruit_env = GymEnvironment(env_name='CartPole-v1')
    state = fruit_env.get_state_space()
    print(state.get_range())
    print(tuple(state.get_shape()))
    print(fruit_env.get_action_space().get_range())
    print(fruit_env.reset())
    print(fruit_env.get_state())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')

    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    env = OpenAIGym(level='CartPole-v1')
    state = env.states()
    print(state)
    print(env.actions())
    print(env.reset())
    print(env.execute(0))
    print(env.max_episode_timesteps())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')

    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    env = TFEnvironment(fruit_environment=fruit_env)
    print(env.states())
    print(env.actions())
    print(env.get_states())
    print(env.execute(0))
    print(env.max_episode_timesteps())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
from tensorforce import Agent, Environment
from tensorforce.agents import PPOAgent
from tensorforce.environments import OpenAIGym

# Pre-defined or custom environment
# environment = Environment.create(
#     environment='gym', level='CartPole', max_episode_timesteps=500
# )
# environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500)
environment = OpenAIGym('LunarLanderContinuous-v2', visualize=True, max_episode_steps=500)
# environment = OpenAIGym('BipedalWalker-v3', visualize=False, max_episode_steps=500)

agent = Agent.create(
    agent='ppo', environment=environment, batch_size=10,
    network=[
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ],
    learning_rate=1e-3,
    name='agent_loader'
)
# import ipdb; ipdb.set_trace()
agent = agent.load()

running_score = 0.0

# Train for up to 50000 episodes
for i_epoch in range(50000):
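    # The original loop body is cut off above; the lines below are a hedged sketch of the
    # standard Tensorforce act/execute/observe training loop such a script usually continues
    # with, not the original author's code.
    states = environment.reset()
    terminal = False
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        running_score += reward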
from tensorforce import Agent, Environment
from tensorforce.agents import PPOAgent
from tensorforce.environments import OpenAIGym

# Pre-defined or custom environment
# environment = Environment.create(
#     environment='gym', level='CartPole', max_episode_timesteps=500
# )

# Network as list of layers
network_spec = [
    dict(type='dense', size=32, activation='tanh'),
    dict(type='dense', size=32, activation='tanh')
]

environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500)

# Instantiate a Tensorforce agent
# agent = Agent.create(
#     agent='tensorforce',
#     environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
#     memory=10000,
#     update=dict(unit='timesteps', batch_size=64),
#     optimizer=dict(type='adam', learning_rate=3e-4),
#     policy=dict(network='auto'),
#     objective='policy_gradient',
#     reward_estimation=dict(horizon=20)
# )
agent = Agent.create(agent='ppo', environment=environment,
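# The Agent.create(...) call above is truncated in the source. A hedged, commented-out sketch of
# how the PPO agent might be created with the network_spec defined above; the hyperparameter
# values mirror the companion LunarLander script earlier in this section and are assumptions here:
# agent = Agent.create(
#     agent='ppo', environment=environment,
#     network=network_spec,
#     batch_size=10,
#     learning_rate=1e-3,
# )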
import gym
from tensorforce import Agent, Runner
from tensorforce.environments import OpenAIGym, Environment
from gym.wrappers import Monitor
from matplotlib import animation
import matplotlib.pyplot as plt
import random
import time

random.seed(time.time_ns())

EPISODES = 1

environment = OpenAIGym(
    level='CartPole-v1',
    visualize=True,
    visualize_directory=f'visualization_{random.randint(0, 1000)}',
)
agent = Agent.load(directory='model', environment=environment)

# runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500)
# runner.run(num_episodes=EPISODES, evaluation=True)
# runner.close()

sum_rewards = 0.0
for _ in range(EPISODES):
    states = environment.reset()
    internals = agent.initial_internals()
    terminal = False
    while not terminal:
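        # The original loop body is cut off here; below is a hedged sketch of the standard
        # Tensorforce evaluation step (act deterministically without training, then step the
        # environment), not the original script's code.
        actions, internals = agent.act(
            states=states, internals=internals, independent=True, deterministic=True)
        states, terminal, reward = environment.execute(actions=actions)
        sum_rewards += reward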