def test_agent(self):
    self.start_tests(name='getting-started-agent')

    environment = Environment.create(
        environment='gym', level='CartPole', max_episode_timesteps=50)
    self.finished_test()

    agent = Agent.create(
        agent='tensorforce', environment=environment, update=64,
        optimizer=dict(optimizer='adam', learning_rate=1e-3),
        objective='policy_gradient', reward_estimation=dict(horizon=20))
    self.finished_test()

    agent = Agent.create(agent='ppo', environment=environment, batch_size=10, learning_rate=1e-3)
    self.finished_test()

    agent = Agent.create(agent='test/data/agent.json', environment=environment)
    self.finished_test()
def test_load_performance(self):
    self.start_tests(name='load-performance')

    environment = Environment.create(environment='CartPole-v1')

    agent = Agent.load(
        directory='test/data', filename='ppo-checkpoint', format='checkpoint',
        environment=environment
    )
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
    runner.close()
    agent.close()
    self.finished_test()

    agent = Agent.load(
        directory='test/data', filename='ppo-checkpoint', format='numpy',
        environment=environment
    )
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
    runner.close()
    agent.close()
    self.finished_test()

    agent = Agent.load(
        directory='test/data', filename='ppo-checkpoint', format='hdf5',
        environment=environment
    )
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
    runner.close()
    agent.close()
    self.finished_test()

    agent = tf.saved_model.load(export_dir='test/data/ppo-checkpoint')

    # 10 episodes
    for _ in range(10):
        states = environment.reset()
        terminal = False
        episode_reward = 0.0
        while not terminal:
            states = np.expand_dims(states, axis=0)
            auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
            actions = agent.act(states, auxiliaries, True)
            actions = actions.numpy().item()
            states, terminal, reward = environment.execute(actions=actions)
            episode_reward += reward
        self.assertEqual(episode_reward, 500.0)

    environment.close()
    self.finished_test()
def create_agent(self, env, n_episodes, save_frequency, load=False):
    ########### WORK NEEDED ###########
    ### You need to tweak the Agent ###
    ###################################
    """
    Agent definition. Tweak the Agent's parameters as you see fit.

    Use any agent from Tensorforce and refer to the documentation for the
    available hyperparameters:
    - Vanilla Policy Gradient: https://tensorforce.readthedocs.io/en/latest/agents/vpg.html
    - Proximal Policy Optimization: https://tensorforce.readthedocs.io/en/latest/agents/ppo.html
    - Trust-Region Policy Optimization: https://tensorforce.readthedocs.io/en/latest/agents/trpo.html
    - Deterministic Policy Gradient: https://tensorforce.readthedocs.io/en/latest/agents/dpg.html
    - Deep Q-Network: https://tensorforce.readthedocs.io/en/latest/agents/dqn.html
    - Double DQN: https://tensorforce.readthedocs.io/en/latest/agents/double_dqn.html
    - Dueling DQN: https://tensorforce.readthedocs.io/en/latest/agents/dueling_dqn.html
    - Actor-Critic: https://tensorforce.readthedocs.io/en/latest/agents/ac.html
    - Advantage Actor-Critic: https://tensorforce.readthedocs.io/en/latest/agents/a2c.html

    For the network parameters:
    https://tensorforce.readthedocs.io/en/latest/modules/networks.html
    """
    ##### Agent definition ########
    if not load:
        agent = Agent.create(
            agent="ppo",
            batch_size=10,
            exploration=0.01,
            learning_rate=0.00001,
            likelihood_ratio_clipping=0.1,
            # etc...
            saver=dict(
                directory="data/checkpoints",
                frequency=10,  # save checkpoint every 10 updates
            ),  # don't change this
            environment=env,
        )
    else:
        agent = Agent.load(directory="data/checkpoints")
    return agent
def test_vpg(self):
    self.start_tests(name='VPG')
    agent, environment = self.prepare(
        agent='vpg', batch_size=2,
        network=dict(type='auto', size=8, depth=1, rnn=2),
        baseline=dict(type='auto', size=7, depth=1, rnn=1),
        baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3))
    self.execute(agent=agent, environment=environment)

    with TemporaryDirectory() as directory:
        agent.save(directory=directory, format='numpy')
        agent = Agent.load(directory=directory)
        states = environment.reset()
        agent.act(states=states)
        agent.close()
        environment.close()
def test_dpg(self):
    self.start_tests(name='DPG')
    actions = dict(
        gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0),
        gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0))
    agent, environment = self.prepare(
        actions=actions, agent='dpg', memory=100, batch_size=4,
        # TODO: no-RNN restriction can be removed
        network=dict(type='auto', size=8, depth=1, rnn=False),
        # TODO: cannot use RNN since value function takes states and actions
        critic=dict(type='auto', size=7, depth=1, rnn=False))
    self.execute(agent=agent, environment=environment)

    with TemporaryDirectory() as directory:
        agent.save(directory=directory, format='numpy')
        agent = Agent.load(directory=directory)
        states = environment.reset()
        agent.act(states=states)
        agent.close()
        environment.close()
def main():
    # Record experience traces
    record_ppo_config(directory='ppo-traces')
    # Alternatively:
    # record_custom_act_function(directory='ppo-traces')
    # write_custom_recording_file(directory='ppo-traces')

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)
    agent.pretrain(directory='ppo-traces', num_iterations=30, num_traces=1, num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
def setup(self, dbars: Any) -> Any:
    trainingEnvironment = Environment.create(
        environment=TradingEnvironment(dbars),
    )
    self.agent = Agent.create(
        agent=PPOAgent,
        environment=trainingEnvironment,  # alternatively: states, actions, (max_episode_timesteps)
        update=dict(
            unit='timesteps',
            batch_size=64
        ),
        network="auto",
        ## exploration=?,
        reward_estimation=dict(
            horizon=20
            # discount=?,
        ),
        learning_rate=3e-4,
        # likelihood_ratio_clipping=?,
        # subsampling_fraction=?,
        # multi_step=?
        summarizer=dict(
            directory='./tensorboard/'
        )
    )
    self.agent.save(directory='model-numpy', format='checkpoint', append='episodes')

    ## Train!
    runner = Runner(self.agent, environment=trainingEnvironment)
    runner.run(
        num_episodes=10000,
        save_best_agent='./best-agent/'
    )
    trainingEnvironment.close()

    ## Prepare agent for trading
    self.internal_state = self.agent.initial_internals()
def createRLagent(self, load=None):
    states_dict = {'type': 'float', 'shape': self.num_states}
    actions_dict = {
        'type': 'float',
        'shape': self.num_actions,
        'min_value': self.input_low,
        'max_value': self.input_high
    }
    agent = Agent.create(
        agent='tensorforce',
        states=states_dict,  # alternatively: states, actions, (max_episode_timesteps)
        actions=actions_dict,
        memory=10000,
        update=dict(unit='timesteps', batch_size=64),
        max_episode_timesteps=self.len_episode,
        optimizer=dict(type='adam', learning_rate=3e-4),
        policy=dict(network='auto'),
        objective='policy_gradient',
        reward_estimation=dict(horizon=20))
    if load is not None:
        agent.restore(directory=load)
    return agent
def main():
    # Start recording traces after the first 80 episodes -- by then, the agent
    # has solved the environment
    runner = Runner(
        agent=dict(agent='benchmarks/configs/ppo.json',
                   recorder=dict(directory='ppo-traces', start=80)),
        environment='benchmarks/configs/cartpole.json')
    runner.run(num_episodes=100)
    runner.close()

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)
    agent.pretrain(directory='ppo-traces', num_iterations=30, num_traces=1, num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
def __init__(self, m: int, n: int, breach_level: float, delta_t: float,
             learning_rate: float, timeout_time: int, save_path: str,
             model: TensorForceModel) -> None:
    super(OurProximalPolicyAgent, self).__init__(m, n)
    self._breach_level = breach_level
    self._delta_t = delta_t
    self.timeout_time = timeout_time
    self._save_path = save_path
    self._model = model
    self._ppo_agent: TensorForceAgent = TensorForceAgent.create(
        agent=OurProximalPolicyAgent._SPECIFICATION_KEY,
        states={
            'type': 'float',
            'shape': (self._m + self._n,),
            'min_value': 0.0,
            'max_value': self._breach_level + self._delta_t
        },
        actions={
            'type': 'int',
            'shape': (self._m + self._n,),
            'num_values': OurProximalPolicyAgent._NUM_ACTIONS
        },
        max_episode_timesteps=self.timeout_time,
        batch_size=OurProximalPolicyAgent._BATCH_SIZE,
        learning_rate=learning_rate,
        network=self._model,
        saver=None if not OurProximalPolicyAgent._SAVE else {
            'directory': self._save_path,
            'filename': OurProximalPolicyAgent._SAVE_NAME,
            'frequency': OurProximalPolicyAgent._SAVING_FREQUENCY
        })
def test_quickstart(self):
    self.start_tests(name='quickstart')

    # ====================

    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a PPO agent
    agent = Agent.create(
        agent='ppo', environment=environment,
        # Automatically configured network
        network='auto',
        # Optimization
        batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
        optimization_steps=5,
        # Reward estimation
        likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
        # Critic
        critic_network='auto',
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=0.0, variable_noise=0.0,
        # Regularization
        l2_regularization=0.0, entropy_regularization=0.0,
        # TensorFlow etc
        name='agent', device=None, parallel_interactions=1, seed=None, execution=None,
        saver=None, summarizer=None, recorder=None)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=50)
    runner.close()

    # ====================

    self.finished_test()
def set_up(
    time_limit=100,
    batch_size=16,
    env_version=1,
    seed_count=10,
    max_count=10,
):
    """
    Set up a rushed CartSeed agent with less time than it needs to complete an episode.

    Parameters
    ----------
    time_limit : int, None
        Turn time limit for episode
    batch_size : int
        Batch size for training
    env_version : int in {1, 2}
        Environment version. 1 being ideal time, 2 being time limited
    seed_count : int
        Number of bad seeds
    max_count : int
        Maximum number of samples/scans needed to saturate a bad_seed

    Returns
    -------
    Environment
    Agent
    """

    def default_score(state, *args):
        return 1

    if env_version == 1:
        environment = CartSeed(
            seed_count=seed_count,
            bad_seed_count=None,
            max_count=max_count,
            sequential=True,
            revisiting=True,
            bad_seed_reward_f=default_score,
            measurement_time=time_limit,
        )
    elif env_version == 2:
        environment = CartSeedCountdown(
            seed_count=seed_count,
            bad_seed_count=None,
            max_count=max_count,
            sequential=True,
            revisiting=True,
            bad_seed_reward_f=default_score,
            measurement_time=time_limit,
        )
    else:
        raise NotImplementedError
    env = Environment.create(environment=environment)
    agent = Agent.create(agent="a2c", batch_size=batch_size, environment=env)
    return env, agent
def main():
    # OpenAI-Gym environment initialization
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')

    # PPO agent initialization
    agent = Agent.create(
        agent='benchmarks/configs/ppo.json', environment=environment,
        # Option 1: Saver - save agent periodically every 10 updates
        # and keep the 5 most recent checkpoints
        saver=dict(directory='model-checkpoint', frequency=10, max_checkpoints=5),
    )

    # Runner initialization
    runner = Runner(agent=agent, environment=environment)

    # Training
    runner.run(num_episodes=100)
    runner.close()

    # Option 2: Explicit save
    # (format: 'numpy' or 'hdf5' store only weights, 'checkpoint' stores full TensorFlow model,
    # agent argument saver, specified above, uses 'checkpoint')
    agent.save(directory='model-numpy', format='numpy', append='episodes')

    # Close agent separately, since created separately
    agent.close()

    # Load agent TensorFlow checkpoint
    agent = Agent.load(directory='model-checkpoint', format='checkpoint', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Load agent NumPy weights
    agent = Agent.load(directory='model-numpy', format='numpy', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Close environment separately, since created separately
    environment.close()
def prepare(
        self,
        # general environment
        environment=None, max_episode_timesteps=None,
        # unit-test environment
        min_timesteps=None, states=None, actions=None,
        # exclude action types
        exclude_bool_action=False, exclude_int_action=False,
        exclude_float_action=False, exclude_bounded_action=False,
        # agent
        require_observe=False, require_all=False, **agent):
    """
    Generic unit-test preparation.
    """
    Layer.layers = None

    if environment is None:
        environment = self.environment_spec(
            max_episode_timesteps=max_episode_timesteps, min_timesteps=min_timesteps,
            states=states, actions=actions, exclude_bool_action=exclude_bool_action,
            exclude_int_action=exclude_int_action, exclude_float_action=exclude_float_action,
            exclude_bounded_action=exclude_bounded_action)
        environment = Environment.create(environment=environment)

    elif min_timesteps is None:
        if max_episode_timesteps is None:
            max_episode_timesteps = self.__class__.max_episode_timesteps
        environment = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps)

    else:
        raise TensorforceError.unexpected()

    agent = self.agent_spec(require_observe=require_observe, require_all=require_all, **agent)
    agent = Agent.create(agent=agent, environment=environment)

    return agent, environment
def main():
    num_parallel = 8
    environment = Environment.create(environment='custom_cartpole', max_episode_timesteps=500)
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment,
                         parallel_interactions=num_parallel)

    # Train for 100 episodes
    for episode in range(0, 100, num_parallel):

        # Episode using act and observe
        parallel, states = environment.reset(num_parallel=num_parallel)
        terminal = (parallel < 0)  # all false
        sum_rewards = 0.0
        num_updates = 0
        while not terminal.all():
            actions = agent.act(states=states, parallel=parallel)
            next_parallel, states, terminal, reward = environment.execute(actions=actions)
            num_updates += agent.observe(terminal=terminal, reward=reward, parallel=parallel)
            parallel = next_parallel
            sum_rewards += reward.sum()
        print('Episode {}: return={} updates={}'.format(
            episode, sum_rewards / num_parallel, num_updates))

    # Evaluate for 100 episodes
    num_parallel = 4
    num_episodes = 100
    sum_rewards = 0.0
    for _ in range(0, num_episodes, num_parallel):
        parallel, states = environment.reset(num_parallel=num_parallel)
        internals = agent.initial_internals()
        internals = [internals for _ in range(num_parallel)]
        terminal = (parallel < 0)  # all false
        while not terminal.all():
            actions, internals = agent.act(
                states=states, internals=internals, independent=True, deterministic=True)
            _, states, terminal, reward = environment.execute(actions=actions)
            internals = [internal for internal, term in zip(internals, terminal) if not term]
            sum_rewards += reward.sum()
    print('Mean evaluation return:', sum_rewards / num_episodes)

    # Close agent and environment
    agent.close()
    environment.close()
def test_readme(self):
    self.start_tests(name='readme')

    environment = UnittestEnvironment(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', shape=(), num_values=5),
        min_timesteps=5
    )

    def get_current_state():
        return environment.reset()

    def execute_decision(x):
        return environment.execute(actions=x)[2]

    # ==========

    from tensorforce import Agent

    # Instantiate a Tensorforce agent
    agent = Agent.create(
        agent='tensorforce',
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5),
        max_episode_timesteps=100,
        memory=10000,
        update=dict(unit='timesteps', batch_size=64),
        optimizer=dict(type='adam', learning_rate=3e-4),
        policy=dict(network='auto'),
        objective='policy_gradient',
        reward_estimation=dict(horizon=20)
    )

    # Retrieve the latest (observable) environment state
    state = get_current_state()  # (float array of shape [10])

    # Query the agent for its action decision
    action = agent.act(states=state)  # (scalar between 0 and 4)

    # Execute the decision and retrieve the current performance score
    reward = execute_decision(action)  # (any scalar float)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(reward=reward, terminal=False)

    # ==========

    agent.close()
    environment.close()
    self.finished_test()
def test_record_and_pretrain(self):
    self.start_tests(name='record-and-pretrain')

    with TemporaryDirectory() as directory:

        # ====================

        # Start recording traces after the first 8 episodes -- by then, the agent
        # has solved the environment
        runner = Runner(
            agent=dict(agent='benchmarks/configs/ppo.json',
                       recorder=dict(directory=directory, start=8)),
            environment='benchmarks/configs/cartpole.json')
        runner.run(num_episodes=10)
        runner.close()

        # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
        # experience of one episode to the agent and subsequently perform one update
        environment = Environment.create(environment='benchmarks/configs/cartpole.json')
        agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)
        agent.pretrain(directory='test/data/ppo-traces', num_iterations=30, num_traces=1,
                       num_updates=1)

        # Evaluate the pretrained agent
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, evaluation=True)
        self.assertTrue(
            all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
        runner.close()

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        files = sorted(os.listdir(path=directory))
        self.assertEqual(len(files), 2)
        self.assertTrue(all(
            file.startswith('trace-') and file.endswith('0000000{}.npz'.format(n))
            for n, file in enumerate(files, start=8)))

    self.finished_test()
def test_act_observe(self):
    self.start_tests(name='act-observe')

    # ====================

    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json', environment=environment)

    # Train for 10 episodes
    for episode in range(10):

        # Episode using act and observe
        states = environment.reset()
        terminal = False
        sum_reward = 0.0
        num_updates = 0
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            num_updates += agent.observe(terminal=terminal, reward=reward)
            sum_reward += reward
        print('Episode {}: return={} updates={}'.format(episode, sum_reward, num_updates))

    # Evaluate for 10 episodes
    sum_rewards = 0.0
    for _ in range(10):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(
                states=states, internals=internals, independent=True, deterministic=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
    print('Mean evaluation return:', sum_rewards / 10.0)

    # Close agent and environment
    agent.close()
    environment.close()

    # ====================

    self.finished_test()
def test_execution(self):
    self.start_tests(name='getting-started-execution')

    runner = Runner(agent='test/data/agent.json',
                    environment=dict(environment='gym', level='CartPole'),
                    max_episode_timesteps=10)
    runner.run(num_episodes=10)
    runner.run(num_episodes=5, evaluation=True)
    runner.close()
    self.finished_test()

    # Create agent and environment
    environment = Environment.create(environment='test/data/environment.json',
                                     max_episode_timesteps=10)
    agent = Agent.create(agent='test/data/agent.json', environment=environment)

    # Train for 10 episodes
    for _ in range(10):
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Evaluate for 5 episodes
    sum_rewards = 0.0
    for _ in range(5):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while not terminal:
            actions, internals = agent.act(states=states, internals=internals, evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
    sum_rewards / 5.0  # mean evaluation return

    # Close agent and environment
    agent.close()
    environment.close()
    self.finished_test()
def test_masking(self):
    # FEATURES.MD
    self.start_tests(name='masking')

    agent = Agent.create(agent=self.agent_spec(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', shape=(), num_values=3)))

    states = dict(
        state=np.random.random_sample(size=(10,)),  # state (default name: "state")
        action_mask=[True, False, True]  # mask as '[ACTION-NAME]_mask' (default name: "action")
    )
    action = agent.act(states=states)
    assert action != 1
def create_agent(param_grid, i, directory, environment):
    return Agent.create(
        agent="ppo",
        environment=environment,
        # Automatically configured network
        network=dict(
            type=param_grid["network"],
            size=param_grid["size"],
            depth=param_grid["depth"],
        ),
        # Optimization
        batch_size=param_grid["batch_size"],
        update_frequency=param_grid["update_frequency"],
        learning_rate=param_grid["learning_rate"],
        subsampling_fraction=param_grid["subsampling_fraction"],
        optimization_steps=param_grid["optimization_steps"],
        # Reward estimation
        likelihood_ratio_clipping=param_grid["likelihood_ratio_clipping"],
        discount=param_grid["discount"],
        estimate_terminal=param_grid["estimate_terminal"],
        # Critic
        critic_network="auto",
        critic_optimizer=dict(
            optimizer="adam",
            multi_step=param_grid["multi_step"],
            learning_rate=param_grid["learning_rate_critic"],
        ),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=param_grid["exploration"],
        variable_noise=param_grid["variable_noise"],
        # Regularization
        l2_regularization=param_grid["l2_regularization"],
        entropy_regularization=param_grid["entropy_regularization"],
        # TensorFlow etc
        name="agent_" + str(i),
        device=None,
        parallel_interactions=5,
        seed=124,
        execution=None,
        recorder=dict(directory=directory, frequency=1000),
        summarizer=None,
        saver=dict(directory=directory, filename="agent_" + str(i)),
    )
def createRLagent(self, load):
    states_dict = {'type': 'float', 'shape': self.num_states}
    actions_dict = {
        'type': 'float',
        'shape': self.num_actions,
        'min_value': self.input_low,
        'max_value': self.input_high
    }
    return Agent.create(
        agent='dqn',
        states=states_dict,  # alternatively: states, actions, (max_episode_timesteps)
        actions=actions_dict,
        memory=10000,
        exploration=0.75,
        max_episode_timesteps=self.len_episode,
    )
def prepare(self, environment=None, states=None, actions=None, **agent):
    """
    Generic unit-test preparation.
    """
    if environment is None:
        environment = self.environment_spec(states=states, actions=actions)
        environment = Environment.create(environment=environment)

    else:
        environment = Environment.create(
            environment=environment,
            max_episode_timesteps=self.__class__.max_episode_timesteps)

    agent = self.agent_spec(**agent)
    agent = Agent.create(agent=agent, environment=environment)

    return agent, environment
def __init__(self, m: int, n: int, breach_level: float, delta_t: float,
             learning_rate: float, use_gradient_clipping: bool, save_path: str,
             model: TensorForceModel) -> None:
    super(OurTensorForceAgent, self).__init__(m, n)
    self._breach_level = breach_level
    self._delta_t = delta_t
    self._save_path = save_path
    self._model = model
    self._tensor_force_agent: TensorForceAgent = TensorForceAgent.create(
        agent=OurTensorForceAgent._SPECIFICATION_KEY,
        states={
            'type': 'float',
            'shape': (self._m + self._n,),
            'min_value': 0.0,
            'max_value': self._breach_level + self._delta_t
        },
        actions={
            'type': 'int',
            'shape': (self._m + self._n,),
            'num_values': OurTensorForceAgent._NUM_ACTIONS
        },
        memory=OurTensorForceAgent._MEMORY,
        update={
            'unit': 'timesteps',
            'batch_size': OurTensorForceAgent._BATCH_SIZE
        },
        optimizer={
            'type': OurTensorForceAgent._OPTIMIZER_NAME,
            'learning_rate': learning_rate,
            'clipnorm': OurTensorForceAgent._OPTIMIZER_GRADIENT_CLIP_THRESHOLD
            if use_gradient_clipping else None
        },
        policy=self._model,
        objective=OurTensorForceAgent._OBJECTIVE,
        exploration=OurTensorForceAgent._EXPLORATION_RATE,
        reward_estimation={'horizon': OurTensorForceAgent._REWARD_HORIZON},
        saver=None if not OurTensorForceAgent._SAVE else {
            'directory': self._save_path,
            'file_name': OurTensorForceAgent._SAVE_NAME,
            'frequency': OurTensorForceAgent._SAVING_FREQUENCY
        })
def test_readme(self):
    self.start_tests(name='readme')

    # ====================

    from tensorforce import Agent, Environment

    # Pre-defined or custom environment
    environment = Environment.create(
        environment='gym', level='CartPole', max_episode_timesteps=500)

    # Instantiate a Tensorforce agent
    agent = Agent.create(
        agent='tensorforce',
        environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
        memory=1000,
        update=dict(unit='timesteps', batch_size=64),
        optimizer=dict(type='adam', learning_rate=3e-4),
        policy=dict(network='auto'),
        objective='policy_gradient',
        reward_estimation=dict(horizon=20))

    # Train for 1 episode
    for _ in range(1):

        # Initialize episode
        states = environment.reset()
        terminal = False

        while not terminal:
            # Episode timestep
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    agent.close()
    environment.close()

    # ====================

    self.finished_test()
def main():
    # Setup
    interactive = 0
    size = 4
    brd = Board(size, graphics=0)
    rand_ag = RandomAgent()

    if interactive == 1:
        brd.start_interactive()

    agent = Agent.create(
        agent='tensorforce', environment=Board, update=64,
        objective='policy_gradient', reward_estimation=dict(horizon=20))
    runner = Runner(agent=agent, environment=Board, max_episode_timesteps=500)
    runner.run(num_episodes=200)
def training_example(num_episodes: int, max_episode_timesteps: int):
    # Instantiate the environment (run the CARLA simulator before doing this!)
    env = CARLAEnvironment(debug=True)

    # Create your own agent (here is just an example)
    agent = Agent.create(agent='ppo', environment=env,
                         max_episode_timesteps=max_episode_timesteps, batch_size=1)

    # Training loop (note: a Runner cannot be used here)
    # `weights_dir` and `record_dir` are `None` to prevent saving and recording
    env.train(agent=agent, num_episodes=num_episodes,
              max_episode_timesteps=max_episode_timesteps,
              weights_dir=None, record_dir=None)

    pygame.quit()
def createRLagent(self, load=None):
    states_dict = {'type': 'float', 'shape': self.inputSize}
    if self.binary:
        outType = 'bool'
    else:
        outType = 'float'
    actions_dict = {'type': outType, 'shape': 1}  # use the action type selected above
    agent = Agent.create(
        agent='dqn',
        states=states_dict,
        actions=actions_dict,
        max_episode_timesteps=1,
        exploration=0.05,
        memory=10000)
    if load is not None:
        agent.restore(directory=load)
    return agent
def initialize_agent(self):
    # Set up information about the boost pads now that the game is active and the info is available
    self.boost_pad_tracker.initialize_boosts(self.get_field_info())

    if MODEL is not None:
        max_time = 10
        frames_per_sec = 20
        max_timesteps = RLEnvironment.get_max_timesteps(max_time, frames_per_sec)
        self.env = Environment.create(
            environment=KickoffEnvironment,
            max_episode_timesteps=max_timesteps,
            max_time=max_time,
            message_throttle=20,
            frames_per_sec=frames_per_sec,
            input_exclude=[
                InputOptions.BALL_POSITION_REL,
                InputOptions.BALL_DIRECTION,
                InputOptions.CAR_POSITION_REL,
                InputOptions.CAR_VELOCITY_MAG,
            ],
            output_exclude=[
                OutputOptions.BOOST,
                OutputOptions.STEER,
                OutputOptions.E_BRAKE,
                OutputOptions.THROTTLE,
                OutputOptions.ROLL,
            ]
        )

        directory = '../learning/training/{0}'.format(MODEL)
        filename = 'agent'
        agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json')
        if not os.path.isfile(agent):
            logging_utils.log_warn(os.getcwd(), {})
            raise Exception('Model file doesn\'t exist')
        self.agent = Agent.load(
            directory=os.path.abspath(directory),
            environment=self.env,
            format='checkpoint',
        )
        self.env.reset()
def prepare(self, environment=None, states=None, actions=None, **agent):
    """
    Generic unit-test preparation.
    """
    if environment is None:
        environment = self.environment_spec(states=states, actions=actions)
        environment = Environment.create(environment=environment)

    else:
        environment = Environment.create(
            environment=environment,
            max_episode_timesteps=self.__class__.max_episode_timesteps)

    agent = self.agent_spec(**agent)
    agent = Agent.create(agent=agent, environment=environment)

    assert agent.__class__.__name__ in ('ConstantAgent', 'RandomAgent') or \
        isinstance(agent.model.get_architecture(), str)

    return agent, environment