def test_environment(self):
    self.start_tests(name='getting-started-environment')

    environment = Environment.create(
        environment='gym', level='CartPole', max_episode_timesteps=50)
    self.finished_test()

    environment = Environment.create(environment='gym', level='CartPole-v1')
    self.finished_test()

    environment = Environment.create(
        environment='test/data/environment.json', max_episode_timesteps=50)
    self.finished_test()

    environment = Environment.create(
        environment='test.data.custom_env.CustomEnvironment', max_episode_timesteps=10)
    self.finished_test()

    from test.data.custom_env import CustomEnvironment
    environment = Environment.create(environment=CustomEnvironment, max_episode_timesteps=10)
    self.finished_test()
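# For reference, a minimal sketch of what a custom environment such as
# test.data.custom_env.CustomEnvironment could look like. The specs and the
# random dynamics below are illustrative assumptions, not the actual test
# fixture; only the interface (states, actions, reset, execute) is the one
# Tensorforce prescribes for Environment subclasses.
import numpy as np

from tensorforce import Environment


class CustomEnvironment(Environment):

    def states(self):
        return dict(type='float', shape=(8,))

    def actions(self):
        return dict(type='int', num_values=4)

    def reset(self):
        # Initial state observation
        return np.random.random(size=(8,))

    def execute(self, actions):
        # Toy dynamics: random next state, termination and reward
        next_state = np.random.random(size=(8,))
        terminal = np.random.random() < 0.05
        reward = np.random.random()
        return next_state, terminal, reward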
def server(port):
    Environment.create(
        environment=environment, max_episode_timesteps=5, remote='socket-server',
        port=port, states=self.__class__.states, actions=self.__class__.actions,
        min_timesteps=self.__class__.min_timesteps)
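# The closure above runs the blocking server side of Tensorforce's socket
# remote mode. For context, a hedged sketch of the matching client side --
# host and port are placeholder values: as in Runner's socket-client mode,
# no environment specification is passed, and the returned proxy is then used
# like any local environment.
from tensorforce import Environment

client = Environment.create(
    environment=None, remote='socket-client', host='127.0.0.1', port=65432)
states = client.reset()
client.close()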
def prepare(
    self,
    # general environment
    environment=None, max_episode_timesteps=None,
    # unit-test environment
    min_timesteps=None, states=None, actions=None,
    # exclude action types
    exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False,
    exclude_bounded_action=False,
    # agent
    require_observe=False, require_all=False, **agent
):
    """
    Generic unit-test preparation.
    """
    Layer.layers = None

    if environment is None:
        environment = self.environment_spec(
            max_episode_timesteps=max_episode_timesteps, min_timesteps=min_timesteps,
            states=states, actions=actions, exclude_bool_action=exclude_bool_action,
            exclude_int_action=exclude_int_action, exclude_float_action=exclude_float_action,
            exclude_bounded_action=exclude_bounded_action)
        environment = Environment.create(environment=environment)

    elif min_timesteps is None:
        if max_episode_timesteps is None:
            max_episode_timesteps = self.__class__.max_episode_timesteps
        environment = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps)

    else:
        raise TensorforceError.unexpected()

    agent = self.agent_spec(require_observe=require_observe, require_all=require_all, **agent)
    agent = Agent.create(agent=agent, environment=environment)

    return agent, environment
def setup(self, dbars: Any) -> Any:
    trainingEnvironment = Environment.create(
        environment=TradingEnvironment(dbars),
    )
    self.agent = Agent.create(
        agent=PPOAgent,
        environment=trainingEnvironment,  # alternatively: states, actions, (max_episode_timesteps)
        update=dict(unit='timesteps', batch_size=64),
        network="auto",
        ## exploration=?,
        reward_estimation=dict(
            horizon=20
            # discount=?,
        ),
        learning_rate=3e-4,
        # likelihood_ratio_clipping=?,
        # subsampling_fraction=?,
        # multi_step=?
        summarizer=dict(directory='./tensorboard/')
    )
    self.agent.save(directory='model-numpy', format='checkpoint', append='episodes')

    ## Train!
    runner = Runner(self.agent, environment=trainingEnvironment)
    runner.run(num_episodes=10000, save_best_agent='./best-agent/')
    trainingEnvironment.close()

    ## Prepare agent for trading
    self.internal_state = self.agent.initial_internals()
def test_agent(self):
    self.start_tests(name='getting-started-agent')

    environment = Environment.create(
        environment='gym', level='CartPole', max_episode_timesteps=50)
    self.finished_test()

    agent = Agent.create(
        agent='tensorforce', environment=environment, update=64,
        optimizer=dict(optimizer='adam', learning_rate=1e-3),
        objective='policy_gradient', reward_estimation=dict(horizon=20))
    self.finished_test()

    agent = Agent.create(agent='ppo', environment=environment, batch_size=10, learning_rate=1e-3)
    self.finished_test()

    agent = Agent.create(agent='test/data/agent.json', environment=environment)
    self.finished_test()
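# 'test/data/agent.json' above is a JSON agent specification. The actual
# fixture is not shown here; a plausible minimal example of the format
# (values assumed, not the real file) would be:
#
#     {
#         "agent": "ppo",
#         "network": "auto",
#         "batch_size": 10,
#         "learning_rate": 1e-3
#     }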
from tensorforce import Agent, Environment, Runner


def main():
    # Record experience traces
    record_ppo_config(directory='ppo-traces')
    # Alternatively:
    # record_custom_act_function(directory='ppo-traces')
    # write_custom_recording_file(directory='ppo-traces')

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)
    agent.pretrain(directory='ppo-traces', num_iterations=30, num_traces=1, num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
from tensorforce import Agent, Environment, Runner


def main():
    # Start recording traces after the first 80 episodes -- by then, the agent
    # has solved the environment
    runner = Runner(
        agent=dict(
            agent='benchmarks/configs/ppo.json',
            recorder=dict(directory='ppo-traces', start=80)),
        environment='benchmarks/configs/cartpole.json')
    runner.run(num_episodes=100)
    runner.close()

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)
    agent.pretrain(directory='ppo-traces', num_iterations=30, num_traces=1, num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
import gym
from gym.wrappers import TimeLimit

from tensorforce import Agent, Environment, Runner


def get_agent_and_runner(max_timesteps=None):
    max_timesteps = EPISODE_MAX_LENGTH if max_timesteps is None else max_timesteps

    # OpenAI-Gym environment specification
    gym_environment = gym.make(LEVEL, render=True)
    gym_environment = TimeLimit(gym_environment.unwrapped, max_episode_steps=max_timesteps)
    # gym_environment = Monitor(gym_environment, RECORD_DICT, force=True)
    environment = Environment.create(
        environment=gym_environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
    )

    agent = Agent.create(
        agent='a2c', environment=environment,
        # parallel_interactions=PARALLEL,
        # Automatically configured network
        # network='auto',
        network=[
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
        ],
        # A2C optimization parameters
        batch_size=256, update_frequency=2, learning_rate=0.001,
        # Reward estimation
        discount=0.99, predict_terminal_values=False,
        # Regularization
        l2_regularization=1.0, entropy_regularization=0.0,
        # Preprocessing
        state_preprocessing='linear_normalization', reward_preprocessing=None,
        # Exploration
        exploration=0.3, variable_noise=0.2,
        # Default additional config values
        config=None,
        # Save agent every 10 updates and keep the 5 most recent checkpoints
        saver=dict(directory=MODEL_DICT, frequency=10, max_checkpoints=5),
        # Log all available Tensorboard summaries
        summarizer=dict(directory=SUMMARY_DICT, summaries='all'),
        # Do not record agent-environment interaction trace
        recorder=None  # RECORD_DICT
    )

    # Initialize the runner
    runner = Runner(
        agent=agent, environment=environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
        # num_parallel=PARALLEL, # remote="multiprocessing"
    )

    return agent, runner
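# The module-level constants referenced above (LEVEL, EPISODE_MAX_LENGTH,
# PARALLEL, MODEL_DICT, SUMMARY_DICT, RECORD_DICT) are defined elsewhere in
# the original script; plausible placeholder definitions, purely illustrative:
LEVEL = 'CartPole-v1'        # Gym environment id (assumed)
EPISODE_MAX_LENGTH = 500     # maximum timesteps per episode (assumed)
PARALLEL = 4                 # number of parallel environments (assumed)
MODEL_DICT = 'model/'        # checkpoint directory (assumed)
SUMMARY_DICT = 'summaries/'  # TensorBoard summary directory (assumed)
RECORD_DICT = 'recordings/'  # trace recording directory (assumed)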
def test_quickstart(self):
    self.start_tests(name='quickstart')

    # ====================

    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a PPO agent
    agent = Agent.create(
        agent='ppo', environment=environment,
        # Automatically configured network
        network='auto',
        # Optimization
        batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
        optimization_steps=5,
        # Reward estimation
        likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
        # Critic
        critic_network='auto',
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=0.0, variable_noise=0.0,
        # Regularization
        l2_regularization=0.0, entropy_regularization=0.0,
        # TensorFlow etc
        name='agent', device=None, parallel_interactions=1, seed=None, execution=None,
        saver=None, summarizer=None, recorder=None
    )

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=50)
    runner.close()

    # ====================

    self.finished_test()
def prepare(self, environment=None, states=None, actions=None, **agent):
    """
    Generic unit-test preparation.
    """
    if environment is None:
        environment = self.environment_spec(states=states, actions=actions)
        environment = Environment.create(environment=environment)

    else:
        environment = Environment.create(
            environment=environment,
            max_episode_timesteps=self.__class__.max_episode_timesteps)

    agent = self.agent_spec(**agent)
    agent = Agent.create(agent=agent, environment=environment)

    return agent, environment
def test_load_performance(self):
    self.start_tests(name='load-performance')

    environment = Environment.create(environment='CartPole-v1')

    agent = dict(directory='test/data', filename='ppo-checkpoint', format='checkpoint')
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(
        all(episode_return == 500.0 for episode_return in runner.episode_returns))
    runner.close()
    self.finished_test()

    agent = dict(directory='test/data', filename='ppo-checkpoint', format='numpy')
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(
        all(episode_return == 500.0 for episode_return in runner.episode_returns))
    runner.close()
    self.finished_test()

    agent = dict(directory='test/data', filename='ppo-checkpoint', format='hdf5')
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(
        all(episode_return == 500.0 for episode_return in runner.episode_returns))
    runner.close()
    self.finished_test()

    agent = tf.saved_model.load(export_dir='test/data/ppo-checkpoint')

    # 10 episodes
    for _ in range(10):
        states = environment.reset()

        terminal = False
        episode_return = 0.0
        while not terminal:
            states = np.expand_dims(states, axis=0)
            auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
            actions = agent.act(states, auxiliaries, True)
            actions = actions.numpy().item()
            states, terminal, reward = environment.execute(actions=actions)
            episode_return += reward

        self.assertEqual(episode_return, 500.0)

    environment.close()
    self.finished_test()
def prepare(self, environment=None, states=None, actions=None, **agent):
    """
    Generic unit-test preparation.
    """
    if environment is None:
        environment = self.environment_spec(states=states, actions=actions)
        environment = Environment.create(environment=environment)

    else:
        environment = Environment.create(
            environment=environment,
            max_episode_timesteps=self.__class__.max_episode_timesteps)

    agent = self.agent_spec(**agent)
    agent = Agent.create(agent=agent, environment=environment)

    assert agent.__class__.__name__ in ('ConstantAgent', 'RandomAgent') or \
        isinstance(agent.model.get_architecture(), str)

    return agent, environment
from tensorforce import Agent, Environment


def main():
    num_parallel = 8
    environment = Environment.create(environment='custom_cartpole', max_episode_timesteps=500)
    agent = Agent.create(
        agent='benchmarks/configs/ppo.json', environment=environment,
        parallel_interactions=num_parallel)

    # Train for 100 episodes
    for episode in range(0, 100, num_parallel):

        # Episode using act and observe
        parallel, states = environment.reset(num_parallel=num_parallel)
        terminal = (parallel < 0)  # all false
        sum_rewards = 0.0
        num_updates = 0
        while not terminal.all():
            actions = agent.act(states=states, parallel=parallel)
            next_parallel, states, terminal, reward = environment.execute(actions=actions)
            num_updates += agent.observe(terminal=terminal, reward=reward, parallel=parallel)
            parallel = next_parallel
            sum_rewards += reward.sum()
        print('Episode {}: return={} updates={}'.format(
            episode, sum_rewards / num_parallel, num_updates))

    # Evaluate for 100 episodes
    num_parallel = 4
    num_episodes = 100
    sum_rewards = 0.0
    for _ in range(0, num_episodes, num_parallel):
        parallel, states = environment.reset(num_parallel=num_parallel)
        internals = agent.initial_internals()
        internals = [internals for _ in range(num_parallel)]
        terminal = (parallel < 0)  # all false
        while not terminal.all():
            actions, internals = agent.act(
                states=states, internals=internals, independent=True, deterministic=True)
            _, states, terminal, reward = environment.execute(actions=actions)
            internals = [
                internal for internal, term in zip(internals, terminal) if not term]
            sum_rewards += reward.sum()
    print('Mean evaluation return:', sum_rewards / num_episodes)

    # Close agent and environment
    agent.close()
    environment.close()
import numpy as np
import tensorflow as tf

from tensorforce import Environment, Runner


def main():
    # Train agent
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    runner = Runner(agent='benchmarks/configs/ppo.json', environment=environment)
    runner.run(num_episodes=100)

    # Save agent SavedModel
    runner.agent.save(directory='saved-model', format='saved-model')
    runner.close()

    # Model serving, potentially using different programming language etc
    # (For regular model saving and loading within Python, see save_load_agent.py example)

    # Load agent SavedModel
    agent = tf.saved_model.load(export_dir='saved-model')

    # Evaluate for 100 episodes
    sum_rewards = 0.0
    for _ in range(100):
        states = environment.reset()

        # Required in case of internal states:
        # internals = agent.initial_internals()
        # internals = recursive_map(batch, internals)

        terminal = False
        while not terminal:
            states = batch(states)
            # Required in case of nested states:
            # states = recursive_map(batch, states)

            auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
            deterministic = True

            actions = agent.act(states, auxiliaries, deterministic)
            # Required in case of internal states:
            # actions_internals = agent.act(states, internals, auxiliaries, deterministic)
            # actions, internals = actions_internals['actions'], actions_internals['internals']

            actions = unbatch(actions)
            # Required in case of nested actions:
            # actions = recursive_map(unbatch, actions)

            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward

    print('Mean evaluation return:', sum_rewards / 100.0)
    environment.close()
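# The batch/unbatch/recursive_map helpers used above are defined elsewhere in
# the original example. A minimal sketch of plausible definitions, assuming
# non-nested NumPy states and scalar actions:
import numpy as np
import tensorflow as tf


def batch(x):
    # Add a leading batch dimension of size 1
    return np.expand_dims(x, axis=0)


def unbatch(x):
    # Convert TensorFlow output back to NumPy and strip the batch dimension
    if isinstance(x, tf.Tensor):
        x = x.numpy()
    if x.shape == (1,):
        return x.item()
    return np.squeeze(x, axis=0)


def recursive_map(function, dictionary):
    # Apply function to every leaf value of a possibly nested dict
    mapped = dict()
    for key, value in dictionary.items():
        if isinstance(value, dict):
            mapped[key] = recursive_map(function, value)
        else:
            mapped[key] = function(value)
    return mapped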
def test_record_and_pretrain(self):
    self.start_tests(name='record-and-pretrain')

    with TemporaryDirectory() as directory:

        # ====================

        # Start recording traces after the first 8 episodes -- by then, the agent
        # has solved the environment
        runner = Runner(
            agent=dict(
                agent='benchmarks/configs/ppo.json',
                recorder=dict(directory=directory, start=8)),
            environment='benchmarks/configs/cartpole.json')
        runner.run(num_episodes=10)
        runner.close()

        # Pretrain a new agent on recorded traces (here the pre-recorded traces
        # shipped in test/data): for 30 iterations, feed the experience of one
        # episode to the agent and subsequently perform one update
        environment = Environment.create(environment='benchmarks/configs/cartpole.json')
        agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)
        agent.pretrain(
            directory='test/data/ppo-traces', num_iterations=30, num_traces=1, num_updates=1)

        # Evaluate the pretrained agent
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, evaluation=True)
        self.assertTrue(
            all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
        runner.close()

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        files = sorted(os.listdir(path=directory))
        self.assertEqual(len(files), 2)
        self.assertTrue(all(
            file.startswith('trace-') and file.endswith('0000000{}.npz'.format(n))
            for n, file in enumerate(files, start=8)))

    self.finished_test()
def test_act_observe(self):
    self.start_tests(name='act-observe')

    # ====================

    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)

    # Train for 10 episodes
    for episode in range(10):

        # Episode using act and observe
        states = environment.reset()
        terminal = False
        sum_reward = 0.0
        num_updates = 0
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            num_updates += agent.observe(terminal=terminal, reward=reward)
            sum_reward += reward
        print('Episode {}: return={} updates={}'.format(episode, sum_reward, num_updates))

    # Evaluate for 10 episodes
    sum_rewards = 0.0
    for _ in range(10):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(
                states=states, internals=internals, independent=True, deterministic=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
    print('Mean evaluation return:', sum_rewards / 10.0)

    # Close agent and environment
    agent.close()
    environment.close()

    # ====================

    self.finished_test()
def test_execution(self):
    self.start_tests(name='getting-started-execution')

    runner = Runner(
        agent='test/data/agent.json',
        environment=dict(environment='gym', level='CartPole'),
        max_episode_timesteps=10)
    runner.run(num_episodes=10)
    runner.run(num_episodes=5, evaluation=True)
    runner.close()
    self.finished_test()

    # Create agent and environment
    environment = Environment.create(
        environment='test/data/environment.json', max_episode_timesteps=10)
    agent = Agent.create(agent='test/data/agent.json', environment=environment)

    # Train for 10 episodes
    for _ in range(10):
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Evaluate for 5 episodes
    sum_rewards = 0.0
    for _ in range(5):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(states=states, internals=internals, evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
    print('Mean evaluation return:', sum_rewards / 5)

    # Close agent and environment
    agent.close()
    environment.close()

    self.finished_test()
def test_readme(self):
    self.start_tests(name='readme')

    # ====================

    from tensorforce import Agent, Environment

    # Pre-defined or custom environment
    environment = Environment.create(
        environment='gym', level='CartPole', max_episode_timesteps=500)

    # Instantiate a Tensorforce agent
    agent = Agent.create(
        agent='tensorforce',
        environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
        memory=1000,
        update=dict(unit='timesteps', batch_size=64),
        optimizer=dict(type='adam', learning_rate=3e-4),
        policy=dict(network='auto'),
        objective='policy_gradient',
        reward_estimation=dict(horizon=20)
    )

    # Train for one episode
    for _ in range(1):

        # Initialize episode
        states = environment.reset()
        terminal = False

        while not terminal:
            # Episode timestep
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    agent.close()
    environment.close()

    # ====================

    self.finished_test()
from tensorforce import Agent, Environment, Runner


def main():
    # OpenAI-Gym environment initialization
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')

    # PPO agent initialization
    agent = Agent.create(
        agent='benchmarks/configs/ppo.json', environment=environment,
        # Option 1: Saver - save agent periodically every 10 updates
        # and keep the 5 most recent checkpoints
        saver=dict(directory='model-checkpoint', frequency=10, max_checkpoints=5),
    )

    # Runner initialization
    runner = Runner(agent=agent, environment=environment)

    # Training
    runner.run(num_episodes=100)
    runner.close()

    # Option 2: Explicit save
    # (format: 'numpy' or 'hdf5' store only weights, 'checkpoint' stores full TensorFlow model;
    # the agent argument saver, specified above, uses 'checkpoint')
    agent.save(directory='model-numpy', format='numpy', append='episodes')

    # Close agent separately, since created separately
    agent.close()

    # Load agent TensorFlow checkpoint
    agent = Agent.load(directory='model-checkpoint', format='checkpoint', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Load agent NumPy weights
    agent = Agent.load(directory='model-numpy', format='numpy', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Close environment separately, since created separately
    environment.close()
def initialize_agent(self):
    # Set up information about the boost pads now that the game is active and the info is available
    self.boost_pad_tracker.initialize_boosts(self.get_field_info())

    if MODEL is not None:
        max_time = 10
        frames_per_sec = 20
        max_timesteps = RLEnvironment.get_max_timesteps(max_time, frames_per_sec)
        self.env = Environment.create(
            environment=KickoffEnvironment,
            max_episode_timesteps=max_timesteps,
            max_time=max_time,
            message_throttle=20,
            frames_per_sec=frames_per_sec,
            input_exclude=[
                InputOptions.BALL_POSITION_REL,
                InputOptions.BALL_DIRECTION,
                InputOptions.CAR_POSITION_REL,
                InputOptions.CAR_VELOCITY_MAG,
            ],
            output_exclude=[
                OutputOptions.BOOST,
                OutputOptions.STEER,
                OutputOptions.E_BRAKE,
                OutputOptions.THROTTLE,
                OutputOptions.ROLL,
            ]
        )

        directory = '../learning/training/{0}'.format(MODEL)
        filename = 'agent'
        agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json')
        if not os.path.isfile(agent):
            logging_utils.log_warn(os.getcwd(), {})
            raise Exception('Model file doesn\'t exist')

        self.agent = Agent.load(
            directory=os.path.abspath(directory),
            environment=self.env,
            format='checkpoint',
        )
        self.env.reset()
import os

import numpy as np

from tensorforce import Agent, Environment, Runner


def write_custom_recording_file(directory):
    # Train for 80 episodes first -- by then, the environment is solved
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=80)
    runner.close()

    # Record 20 episodes
    for episode in range(20):

        # Record episode experience
        episode_states = list()
        episode_actions = list()
        episode_terminal = list()
        episode_reward = list()

        # Evaluation episode
        states = environment.reset()
        terminal = False
        while not terminal:
            episode_states.append(states)
            actions = agent.act(states=states, independent=True, deterministic=True)
            episode_actions.append(actions)
            states, terminal, reward = environment.execute(actions=actions)
            episode_terminal.append(terminal)
            episode_reward.append(reward)

        # Write recorded episode trace to npz file
        np.savez_compressed(
            file=os.path.join(directory, 'trace-{:09d}.npz'.format(episode)),
            states=np.stack(episode_states, axis=0),
            actions=np.stack(episode_actions, axis=0),
            terminal=np.stack(episode_terminal, axis=0),
            reward=np.stack(episode_reward, axis=0))
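# A quick sanity check on the output format: each trace file written above is
# a standard compressed NumPy archive and can be read back directly (the
# directory name below is a placeholder for the argument passed in):
import numpy as np

with np.load('ppo-traces/trace-000000000.npz') as trace:
    print(trace['states'].shape, trace['actions'].shape,
          trace['terminal'].shape, trace['reward'].shape)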
def test_quickstart(self):
    environment = Environment.create(
        environment='gym', level='CartPole', max_episode_timesteps=500)

    agent = Agent.create(
        agent='tensorforce',
        environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
        memory=1000,
        update=dict(unit='timesteps', batch_size=32),
        optimizer=dict(type='adam', learning_rate=3e-4),
        policy=dict(network='auto'),
        objective='policy_gradient',
        reward_estimation=dict(horizon=1)
    )

    # Run a single act-execute timestep
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)

    self.assertEqual(4, len(states))
    self.assertFalse(terminal)
    self.assertEqual(1, reward)
    dest_folder=data_folder,
)

# download latest version of Oxford dataset
download_csv(LATEST_DATA_URL, "OxCGRT_latest", dest_folder=data_folder)

# number of prescriptions (1 prescription per day)
future_days = 3

# Path to the "standard_predictor/predict.py" from the covid-xprize repo;
# the covid-xprize package needs to be installed: "pip install -e ."
predictor_script_path = "/Users/romainegele/Documents/xPrize/covid-xprize/covid_xprize/standard_predictor/predict.py"

# Instantiate the environment and wrap it in the Tensorforce Environment class
env = Environment.create(
    CovidEnv(future_days, predictor_script_path, OXFORD_CSV_PATH),
    max_episode_timesteps=future_days)

print("ACTION SPACE")
pprint(env.actions())

print("STATE SPACE")
pprint(env.states())

# Create agent
agent = Agent.create(agent='ppo', environment=env, batch_size=10, learning_rate=1e-3)

# Create a runner
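# The snippet breaks off at the runner; a plausible continuation, following
# the same Runner pattern used in the other snippets here (the import and the
# episode count are assumptions):
from tensorforce import Runner

runner = Runner(agent=agent, environment=env)
runner.run(num_episodes=100)
runner.close()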
from tensorforce import Agent, Environment
import matplotlib.pyplot as plt
import numpy as np
import math
import pickle
from tqdm import tqdm

episode_number = 400
average_over = 20

# Pre-defined or custom environment
environment = Environment.create(
    environment='gym', level='CartPole-v1', max_episode_timesteps=1000)

'''
Actions:
    Type: Discrete(2)
    Num   Action
    0     Push cart to the left
    1     Push cart to the right

Observation:
    Type: Box(4)
    Num   Observation             Min                   Max
    0     Cart Position           -4.8                  4.8
    1     Cart Velocity           -Inf                  Inf
    2     Pole Angle              -0.418 rad (-24 deg)  0.418 rad (24 deg)
    3     Pole Angular Velocity   -Inf                  Inf

Terminal State:
    self.theta_threshold_radians = 12 * 2 * math.pi / 360
    self.x_threshold = 2.4
'''

# Initialize reward record and set parameters
# set parameters
num_steps = 1000          # update exploration rate over n steps
initial_value = 0.9       # initial exploration rate
decay_rate = 0.5          # exploration rate decay rate
set_type = 'exponential'  # type of decay: linear, exponential
exploration = dict(
    type=set_type, unit='timesteps', num_steps=num_steps,
    initial_value=initial_value, decay_rate=decay_rate)

episode_number = 10000
evaluation_episode_number = 50
average_over = 100

# Pre-defined or custom environment
environment = Environment.create(environment='gym', level='Walker2d-v3')

'''
For detailed notes on how to interact with the Mujoco environment, please
refer to the notes at https://bailiping.github.io/Mujoco/

Observation:
    def _get_obs(self):
        qpos = self.sim.data.qpos
        qvel = self.sim.data.qvel
        return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel()

    Num   Observation                           Min    Max
          rootx (_get_obs states from root z)   Not Limited
    0     rootz                                 Not Limited
    1     rooty                                 Not Limited
    2     thigh joint                           -150   0
'''
from tensorforce import Agent, Environment


def main():
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)

    # Train for 100 episodes
    for episode in range(100):

        # Record episode experience
        episode_states = list()
        episode_internals = list()
        episode_actions = list()
        episode_terminal = list()
        episode_reward = list()

        # Episode using independent-act and agent.initial_internals()
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        sum_reward = 0.0
        while not terminal:
            episode_states.append(states)
            episode_internals.append(internals)
            actions, internals = agent.act(states=states, internals=internals, independent=True)
            episode_actions.append(actions)
            states, terminal, reward = environment.execute(actions=actions)
            episode_terminal.append(terminal)
            episode_reward.append(reward)
            sum_reward += reward
        print('Episode {}: {}'.format(episode, sum_reward))

        # Feed recorded experience to agent
        agent.experience(
            states=episode_states, internals=episode_internals, actions=episode_actions,
            terminal=episode_terminal, reward=episode_reward)

        # Perform update
        agent.update()

    # Evaluate for 100 episodes
    sum_rewards = 0.0
    for _ in range(100):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(
                states=states, internals=internals, independent=True, deterministic=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
    print('Mean evaluation return:', sum_rewards / 100.0)

    # Close agent and environment
    agent.close()
    environment.close()
# set parameters
num_steps = 500           # update exploration rate over n steps
initial_value = 0.95      # initial exploration rate
decay_rate = 0.5          # exploration rate decay rate
set_type = 'exponential'  # type of decay: linear, exponential
exploration = dict(
    type=set_type, unit='timesteps', num_steps=num_steps,
    initial_value=initial_value, decay_rate=decay_rate)

episode_number = 5000
evaluation_episode_number = 5

# Pre-defined or custom environment
environment = Environment.create(environment='gym', level='Hopper-v3')

length = np.zeros(episode_number)
reward_record_without = []
agent_without = Agent.create(agent='agent.json', environment=environment, exploration=exploration)
states = environment.reset()
terminal = False

print('training agent without boundary')
angle_record = []
for _ in tqdm(range(episode_number)):
    episode_reward = 0
    states = environment.reset()
import argparse
import importlib
import json
import os

import matplotlib.pyplot as plt
import numpy as np

from tensorforce import Environment, Runner


def main():
    parser = argparse.ArgumentParser(description='Tensorforce runner')
    # Agent arguments
    parser.add_argument(
        '-a', '--agent', type=str, default=None,
        help='Agent (name, configuration JSON file, or library module)')
    parser.add_argument(
        '-c', '--checkpoints', type=str, default=None,
        help='TensorFlow checkpoints directory')
    parser.add_argument(
        '-s', '--summaries', type=str, default=None,
        help='TensorBoard summaries directory')
    parser.add_argument(
        '--recordings', type=str, default=None,
        help='Traces recordings directory')
    # Environment arguments
    parser.add_argument(
        '-e', '--environment', type=str, default=None,
        help='Environment (name, configuration JSON file, or library module)')
    parser.add_argument(
        '-l', '--level', type=str, default=None,
        help='Level or game id, like `CartPole-v1`, if supported')
    parser.add_argument(
        '-m', '--max-episode-timesteps', type=int, default=None,
        help='Maximum number of timesteps per episode')
    parser.add_argument(
        '--visualize', action='store_true',
        help='Visualize agent--environment interaction, if supported')
    parser.add_argument(
        '--visualize-directory', type=str, default=None,
        help='Directory to store videos of agent--environment interaction, if supported')
    parser.add_argument(
        '--import-modules', type=str, default=None,
        help='Import comma-separated modules required for environment')
    # Parallel execution arguments
    parser.add_argument(
        '--num-parallel', type=int, default=None,
        help='Number of environment instances to execute in parallel')
    parser.add_argument(
        '--batch-agent-calls', action='store_true',
        help='Batch agent calls for parallel environment execution')
    parser.add_argument(
        '--sync-timesteps', action='store_true',
        help='Synchronize parallel environment execution on timestep-level')
    parser.add_argument(
        '--sync-episodes', action='store_true',
        help='Synchronize parallel environment execution on episode-level')
    parser.add_argument(
        '--remote', type=str, choices=('multiprocessing', 'socket-client', 'socket-server'),
        default=None,
        help='Communication mode for remote execution of parallelized environments')
    parser.add_argument(
        '--blocking', action='store_true', help='Remote environments should be blocking')
    parser.add_argument(
        '--host', type=str, default=None,
        help='Socket server hostname(s) or IP address(es), single value or comma-separated list')
    parser.add_argument(
        '--port', type=str, default=None,
        help='Socket server port(s), single value or comma-separated list, increasing sequence '
             'if single host and port given')
    # Runner arguments
    parser.add_argument(
        '-v', '--evaluation', action='store_true',
        help='Run environment (last if multiple) in evaluation mode')
    parser.add_argument('-n', '--episodes', type=int, default=None, help='Number of episodes')
    parser.add_argument('-t', '--timesteps', type=int, default=None, help='Number of timesteps')
    parser.add_argument('-u', '--updates', type=int, default=None, help='Number of agent updates')
    parser.add_argument(
        '--mean-horizon', type=int, default=1,
        help='Number of episodes progress bar values and evaluation score are averaged over')
    parser.add_argument(
        '--save-best-agent', type=str, default=None,
        help='Directory to save the best version of the agent according to the evaluation score')
    # Logging arguments
    parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of repetitions')
    parser.add_argument(
        '--path', type=str, default=None,
        help='Logging path, directory plus filename without extension')
    parser.add_argument('--seaborn', action='store_true', help='Use seaborn')
    args = parser.parse_args()

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    if args.path is None:
        callback = None
    else:
        assert os.path.splitext(args.path)[1] == ''
        assert args.episodes is not None and args.visualize is not None
        rewards = [list() for _ in range(args.episodes)]
        timesteps = [list() for _ in range(args.episodes)]
        seconds = [list() for _ in range(args.episodes)]
        agent_seconds = [list() for _ in range(args.episodes)]

        def callback(r, p):
            rewards[r.episodes - 1].append(float(r.episode_rewards[-1]))
            timesteps[r.episodes - 1].append(int(r.episode_timesteps[-1]))
            seconds[r.episodes - 1].append(float(r.episode_seconds[-1]))
            agent_seconds[r.episodes - 1].append(float(r.episode_agent_seconds[-1]))
            return True

    if args.environment is None:
        environment = None
    else:
        environment = dict(environment=args.environment)
        if args.level is not None:
            environment['level'] = args.level
        if args.visualize:
            environment['visualize'] = True
        if args.visualize_directory is not None:
            environment['visualize_directory'] = args.visualize_directory

    if args.host is not None and ',' in args.host:
        args.host = args.host.split(',')
    if args.port is not None and ',' in args.port:
        args.port = [int(x) for x in args.port.split(',')]
    elif args.port is not None:
        args.port = int(args.port)

    if args.remote == 'socket-server':
        Environment.create(
            environment=environment, max_episode_timesteps=args.max_episode_timesteps,
            remote=args.remote, port=args.port)
        return

    if args.agent is None:
        assert args.checkpoints is None and args.summaries is None and args.recordings is None
        agent = None
    else:
        agent = dict(agent=args.agent)
        if args.checkpoints is not None:
            assert 'saver' not in agent
            agent['saver'] = args.checkpoints
        if args.summaries is not None:
            assert 'summarizer' not in agent
            agent['summarizer'] = args.summaries
        if args.recordings is not None:
            assert 'recorder' not in agent
            agent['recorder'] = args.recordings

    for _ in range(args.repeat):
        runner = Runner(
            agent=agent, environment=environment,
            max_episode_timesteps=args.max_episode_timesteps, evaluation=args.evaluation,
            num_parallel=args.num_parallel, remote=args.remote, blocking=args.blocking,
            host=args.host, port=args.port)
        runner.run(
            num_episodes=args.episodes, num_timesteps=args.timesteps, num_updates=args.updates,
            batch_agent_calls=args.batch_agent_calls, sync_timesteps=args.sync_timesteps,
            sync_episodes=args.sync_episodes, callback=callback,
            mean_horizon=args.mean_horizon, save_best_agent=args.save_best_agent)
        runner.close()

    if args.path is not None:
        directory = os.path.split(args.path)[0]
        if directory != '' and not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)
        with open(args.path + '.json', 'w') as filehandle:
            filehandle.write(json.dumps(dict(
                rewards=rewards, timesteps=timesteps, seconds=seconds,
                agent_seconds=agent_seconds)))

        if args.seaborn:
            import seaborn as sns
            sns.set()

        xs = np.arange(len(rewards))
        min_rewards = np.amin(rewards, axis=1)
        max_rewards = np.amax(rewards, axis=1)
        median_rewards = np.median(rewards, axis=1)
        plt.plot(xs, median_rewards, color='green', linewidth=2.0)
        plt.fill_between(xs, min_rewards, max_rewards, color='green', alpha=0.4)
        plt.xlabel('episodes')
        plt.ylabel('reward')
        plt.savefig(fname=(args.path + '.png'))
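# Example invocation, assuming the script above is saved as run.py (all flags
# as defined by the parser above; paths are placeholders):
#
#   python run.py --agent benchmarks/configs/ppo.json \
#       --environment gym --level CartPole-v1 \
#       --episodes 100 --path results/cartpole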
def __init__(
    self, agent, environment=None, max_episode_timesteps=None, num_parallel=None,
    environments=None, evaluation=False, remote=None, blocking=False, host=None, port=None
):
    if environment is None and environments is None:
        if remote != 'socket-client':
            raise TensorforceError.required(
                name='Runner', argument='environment or environments')
        if num_parallel is None:
            raise TensorforceError.required(
                name='Runner', argument='num_parallel', condition='socket-client remote mode')
        environments = [None for _ in range(num_parallel)]

    elif environment is None:
        if environments is None:
            raise TensorforceError.required(
                name='Runner', argument='environment or environments')
        if not util.is_iterable(x=environments):
            raise TensorforceError.type(
                name='Runner', argument='environments', value=environments)
        if len(environments) <= 1:
            raise TensorforceError.value(
                name='Runner', argument='len(environments)', value=len(environments))
        if num_parallel is not None and num_parallel != len(environments):
            raise TensorforceError.value(
                name='Runner', argument='num_parallel', value=num_parallel,
                hint='!= len(environments)')
        num_parallel = len(environments)
        environments = list(environments)

    elif num_parallel is None:
        if environments is not None:
            raise TensorforceError.invalid(
                name='Runner', argument='environments', condition='environment is specified')
        if evaluation:
            raise TensorforceError.invalid(
                name='Runner', argument='evaluation', condition='single environment')
        num_parallel = 1
        environments = [environment]

    else:
        if not isinstance(num_parallel, int):
            raise TensorforceError.value(
                name='Runner', argument='num_parallel', dtype=type(num_parallel))
        elif num_parallel < 2:
            raise TensorforceError.value(
                name='Runner', argument='num_parallel', value=num_parallel, hint='< 2')
        if environments is not None:
            raise TensorforceError.invalid(
                name='Runner', argument='environments', condition='environment is specified')
        if isinstance(environment, Environment):
            raise TensorforceError.type(
                name='Runner', argument='environment', dtype=type(environment),
                condition='num_parallel', hint='is not specification')
        environments = [environment for _ in range(num_parallel)]

    if port is None or isinstance(port, int):
        if isinstance(host, str):
            port = [port + n for n in range(num_parallel)]
        else:
            port = [port for _ in range(num_parallel)]
    else:
        if len(port) != num_parallel:
            raise TensorforceError.value(
                name='Runner', argument='len(port)', value=len(port), hint='!= num_parallel')
    if host is None or isinstance(host, str):
        host = [host for _ in range(num_parallel)]
    else:
        if len(host) != num_parallel:
            raise TensorforceError.value(
                name='Runner', argument='len(host)', value=len(host), hint='!= num_parallel')

    self.environments = list()
    self.is_environment_external = isinstance(environments[0], Environment)
    environment = Environment.create(
        environment=environments[0], max_episode_timesteps=max_episode_timesteps,
        remote=remote, blocking=blocking, host=host[0], port=port[0])
    self.is_environment_remote = isinstance(environment, RemoteEnvironment)
    states = environment.states()
    actions = environment.actions()
    self.environments.append(environment)

    for n, environment in enumerate(environments[1:], start=1):
        assert isinstance(environment, Environment) == self.is_environment_external
        environment = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps,
            remote=remote, blocking=blocking, host=host[n], port=port[n])
        assert isinstance(environment, RemoteEnvironment) == self.is_environment_remote
        assert util.is_equal(x=environment.states(), y=states)
        assert util.is_equal(x=environment.actions(), y=actions)
        self.environments.append(environment)

    self.evaluation = evaluation

    self.is_agent_external = isinstance(agent, Agent)
    if num_parallel - int(self.evaluation) > 1:
        self.agent = Agent.create(
            agent=agent, environment=environment,
            parallel_interactions=(num_parallel - int(self.evaluation)))
    else:
        self.agent = Agent.create(agent=agent, environment=environment)
def test_execution(self):
    self.start_tests(name='getting-started-execution')

    runner = Runner(
        agent='test/data/agent.json',
        environment=dict(environment='gym', level='CartPole'),
        max_episode_timesteps=10)
    runner.run(num_episodes=10)
    runner.run(num_episodes=5, evaluation=True)
    runner.close()
    self.finished_test()

    runner = Runner(
        agent='test/data/agent.json',
        environment=dict(environment='gym', level='CartPole'),
        max_episode_timesteps=50, num_parallel=5, remote='multiprocessing')
    runner.run(num_episodes=10)
    runner.close()
    self.finished_test()

    # Create agent and environment
    environment = Environment.create(
        environment='test/data/environment.json', max_episode_timesteps=10)
    agent = Agent.create(agent='test/data/agent.json', environment=environment)

    # Train for 10 episodes using act-observe
    for _ in range(10):
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Train for 10 episodes using experience-update
    for _ in range(10):
        episode_states = list()
        episode_internals = list()
        episode_actions = list()
        episode_terminal = list()
        episode_reward = list()

        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            episode_states.append(states)
            episode_internals.append(internals)
            actions, internals = agent.act(states=states, internals=internals, independent=True)
            episode_actions.append(actions)
            states, terminal, reward = environment.execute(actions=actions)
            episode_terminal.append(terminal)
            episode_reward.append(reward)

        agent.experience(
            states=episode_states, internals=episode_internals, actions=episode_actions,
            terminal=episode_terminal, reward=episode_reward)
        agent.update()

    # Evaluate for 10 episodes
    sum_rewards = 0.0
    for _ in range(10):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(
                states=states, internals=internals, deterministic=True, independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
    print('Mean episode reward:', sum_rewards / 10)

    # Close agent and environment
    agent.close()
    environment.close()

    self.finished_test()