def test_continuous(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=20, entropy_penalty=0.01, loss_clipping=0.1, epochs=10,
            optimizer_batch_size=10, learning_rate=0.0005,
            states=environment.states, actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
        )
        agent = PPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('PPO agent (continuous): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('PPO agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_multi(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        return state0 * state1

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2)])
        config = Configuration(
            batch_size=8, keep_last=True, learning_rate=0.001,
            states=environment.states, actions=environment.actions,
            network=network_builder
        )
        agent = DQNNstepAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 15 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-15:], r.episode_lengths[-15:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('DQN Nstep agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('DQN Nstep agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def train(config, network_spec=None):
    data_provider = DataProvider(config.db)
    env = StockEnvironment(data_provider, config, 0)
    agent = overwrite_agent(env, network_spec, config) if config.overwrite_agent \
        else load_agent(config, env, network_spec)

    mlflow.log_param("agent", "tensorforce.agents.DQNAgent")
    for key in config.agent_specs:
        mlflow.log_param(key, config.agent_specs[key])

    runner = Runner(agent=agent, environment=env)
    offset = 20000
    num_episodes = 20
    step = 0
    while data_provider.has_data_key(offset + config.max_step_per_episode):
        runner.run(num_episodes=num_episodes)
        offset = offset + config.max_step_per_episode
        env.offset = offset
        agent.save(config.agent_dir, config.agent_name)
        if step % 10 == 0:
            evaluate(config, data_provider, offset - config.max_step_per_episode, agent)
        step += 1
    return agent, env
def test_quickstart(self):
    sys.stdout.write('\nQuickstart:\n')
    sys.stdout.flush()

    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(), actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33, optimization_steps=10,
        # MLP baseline
        baseline_mode='states', baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step', optimizer=dict(type='adam', learning_rate=1e-4), num_steps=5
        ),
        # Other parameters
        discount=0.99, entropy_regularization=1e-2, gae_lambda=None,
        likelihood_ratio_clipping=0.2
    )

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Function handle called after each finished episode
    def callback(r):
        return float(np.mean(r.episode_rewards[-100:])) <= 180.0

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200, callback=callback)
    runner.close()

    if float(np.mean(runner.episode_rewards[-100:])) <= 180.0:
        sys.stdout.write('Test failed, exceeding {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=False)
    else:
        sys.stdout.write('Test passed after {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=True)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8, learning_rate=0.001,
            states=environment.states, actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('VPG Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('VPG discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_multi(self): passed = 0 def network_builder(inputs): layer = layers['dense'] state0 = layer(x=layer(x=inputs['state0'], size=32), size=32) state1 = layer(x=layer(x=inputs['state1'], size=32), size=32) state2 = layer(x=layer(x=inputs['state2'], size=32), size=32) state3 = layer(x=layer(x=inputs['state3'], size=32), size=32) return state0 * state1 * state2 * state3 for _ in xrange(5): environment = MinimalTest(definition=[ False, (False, 2), (False, (1, 2)), (True, (1, 2)) ]) config = Configuration(batch_size=8, learning_rate=0.001, states=environment.states, actions=environment.actions, network=network_builder) agent = VPGAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 50 or not all( x >= 1.0 for x in r.episode_rewards[-50:]) runner.run(episodes=2000, episode_finished=episode_finished) print('VPG agent (multi-state/action): ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('VPG agent (multi-state/action) passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_save_restore(self):
    environment_spec = {"float": ()}
    environment = create_environment(environment_spec)
    network_spec = [dict(type='dense', size=32)]
    agent = create_agent(environment, network_spec)

    runner = Runner(agent=agent, environment=environment)
    runner.run(episodes=100)

    model_values = agent.model.session.run(
        agent.model.get_variables(include_submodules=True, include_nontrainable=False)
    )
    save_path = agent.model.save(directory=self._tmp_dir_path + "/model")
    print("Saved at: %s" % (save_path,))
    runner.close()

    agent = create_agent(environment, network_spec)
    agent.model.restore(directory="", file=save_path)
    restored_model_values = agent.model.session.run(
        agent.model.get_variables(include_submodules=True, include_nontrainable=False)
    )

    assert len(model_values) == len(restored_model_values)
    assert all([
        np.array_equal(v1, v2)
        for v1, v2 in zip(model_values, restored_model_values)
    ])
    agent.close()
def test_continuous(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8, cg_iterations=20, cg_damping=0.001,
            line_search_steps=20, max_kl_divergence=0.05,
            states=environment.states, actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=10000, episode_finished=episode_finished)
        print('TRPO Agent (continuous): ' + str(runner.episode))
        if runner.episode < 10000:
            passed += 1
            print('passed')
        else:
            print('failed')
    print('TRPO continuous agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8, learning_rate=0.0005, memory_capacity=800,
            first_update=80, target_update_frequency=20,
            memory=dict(type='replay', random_sampling=True),
            states=environment.states, actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
        )
        agent = CategoricalDQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Categorical DQN agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('Categorical DQN agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def main():
    env, agent = set_up()
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=10000)
    agent.save(directory="saved_models")
    agent.close()
    env.close()
def train_and_test(self, agent, early_stop=-1, n_tests=15):
    n_train = TIMESTEPS // n_tests
    i = 0
    runner = Runner(agent=agent, environment=self)

    try:
        while i <= n_tests:
            self.use_dataset(Mode.TRAIN)
            runner.run(timesteps=n_train, max_episode_timesteps=n_train)
            self.use_dataset(Mode.TEST)
            self.run_deterministic(runner, print_results=True)
            if early_stop > 0:
                advantages = np.array(self.acc.episode.advantages[-early_stop:])
                if i >= early_stop and np.all(advantages > 0):
                    i = n_tests
            i += 1
    except KeyboardInterrupt:
        # Lets us kill training with Ctrl-C and skip straight to the final test. This is useful in case you're
        # keeping an eye on terminal and see "there! right there, stop, you found it!" (where early_stop & n_tests
        # are the more methodical approaches)
        pass

    # On last "how would it have done IRL?" run, without getting in the way (no killing on repeats, 0-balance)
    print('Running no-kill test-set')
    self.use_dataset(Mode.TEST, no_kill=True)
    self.run_deterministic(runner, print_results=True)
def test_discrete(self):
    passed = 0
    # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8, learning_rate=0.0001, cg_iterations=20, cg_damping=0.001,
            line_search_steps=20, max_kl_divergence=0.05,
            states=environment.states, actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('TRPO discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_replay(self):
    environment = MinimalTest(definition=[(False, (1, 2))])
    config = Configuration(
        batch_size=8, learning_rate=0.001, memory_capacity=50,
        memory=dict(type='replay', random_sampling=True),
        first_update=20, target_update_frequency=10,
        states=environment.states, actions=environment.actions,
        network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
    )
    agent = DQNAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 100 or not all(
            x / l >= reward_threshold
            for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
        )

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Replay memory DQN: ' + str(runner.episode))
def main():
    bad_seeds_environment, agent = set_up()
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)
    agent.save(directory="saved_models")
    bad_seeds_environment.close()
    agent.close()
def test_discrete_baseline(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=False) config = Configuration(batch_size=8, learning_rate=0.001, states=environment.states, actions=environment.actions, baseline=dict(type="mlp", sizes=[32, 32], epochs=5, update_batch_size=8, learning_rate=0.01), network=layered_network_builder([ dict(type='dense', size=32), dict(type='dense', size=32) ])) agent = VPGAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip( r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=1500, episode_finished=episode_finished) print('VPG agent (discrete): ' + str(runner.episode)) if runner.episode < 1500: passed += 1 print('VPG agent (discrete) passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_multi(self): passed = 0 def network_builder(inputs): layer = layers['dense'] state0 = layer(x=layer(x=inputs['state0'], size=32), size=32) state1 = layer(x=layer(x=inputs['state1'], size=32), size=32) return state0 * state1 for _ in xrange(5): environment = MinimalTest(definition=[False, (False, 2)]) config = Configuration(batch_size=8, learning_rate=0.001, memory_capacity=800, first_update=80, target_update_frequency=20, states=environment.states, actions=environment.actions, network=network_builder) agent = CategoricalDQNAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 15 or not all( x >= 1.0 for x in r.episode_rewards[-15:]) runner.run(episodes=2000, episode_finished=episode_finished) print('Categorical DQN agent (multi-state/action): ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('Categorical DQN agent (multi-state/action) passed = {}'.format( passed)) self.assertTrue(passed >= 2)
def test_beta(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=True) actions = environment.actions actions['min_value'] = -0.5 actions['max_value'] = 1.5 config = Configuration(batch_size=8, learning_rate=0.01, states=environment.states, actions=actions, network=layered_network_builder([ dict(type='dense', size=32), dict(type='dense', size=32) ])) agent = VPGAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip( r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=1500, episode_finished=episode_finished) print('VPG agent (beta): ' + str(runner.episode)) if runner.episode < 1500: passed += 1 print('VPG agent (beta) passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_continuous(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=True) config = Configuration(batch_size=8, learning_rate=0.001, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32), dict(type='dense', size=32) ])) agent = VPGAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=1000, episode_finished=episode_finished) print('VPG agent (continuous): ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('VPG agent (continuous) passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_naf_agent(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=True) config = Configuration(batch_size=8, learning_rate=0.001, exploration=dict(type='ornstein_uhlenbeck'), memory_capacity=800, first_update=80, target_update_frequency=20, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32), dict(type='dense', size=32) ])) agent = NAFAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=1000, episode_finished=episode_finished) print('NAF agent: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('NAF agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_discrete(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=False) config = Configuration(batch_size=8, keep_last=True, learning_rate=0.001, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32), dict(type='dense', size=32) ])) agent = DQNNstepAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x / l >= reward_threshold for x, l in zip( r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=1000, episode_finished=episode_finished) print('DQN Nstep agent: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('DQN Nstep agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_continuous(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=True) config = Configuration( batch_size=8, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh') ]) ) agent = TRPOAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x / l >= reward_threshold for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=1000, episode_finished=episode_finished) print('TRPO agent (continuous): ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('TRPO agent (continuous) passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def main(
    time_limit=None,
    scoring="default",
    batch_size=16,
    gpu_idx=0,
    env_version=1,
    seed_count=9,
    max_count=10,
    out_path=None,
    num_episodes=int(3 * 10**3),
):
    env, agent = set_up(
        time_limit=time_limit, scoring=scoring, batch_size=batch_size, gpu_idx=gpu_idx,
        env_version=env_version, seed_count=seed_count, max_count=max_count, out_path=out_path,
    )
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=num_episodes)

    if out_path is None:
        out_path = Path()
    else:
        out_path = Path(out_path).expanduser()
    agent.save(directory=str(out_path / "saved_models"))
    agent.close()
    env.close()
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8, learning_rate=0.001, memory_capacity=800,
            first_update=80, repeat_update=4, target_update_frequency=20,
            states=environment.states, actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQN Agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('DQN Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def main():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )
    agent = Agent.create(
        agent="random",
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_random_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)
    bad_seeds_environment.close()
    agent.close()
def load_agent(
    time_limit=None,
    scoring="default",
    batch_size=16,
    gpu_idx=0,
    env_version=1,
    seed_count=9,
    max_count=10,
    out_path=None,
):
    env, agent = set_up(
        time_limit=time_limit, scoring=scoring, batch_size=batch_size, gpu_idx=gpu_idx,
        env_version=env_version, seed_count=seed_count, max_count=max_count, out_path=out_path,
    )
    if out_path is None:
        out_path = Path()
    else:
        out_path = Path(out_path).expanduser()
    agent.restore(directory=str(out_path / "saved_models"))
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=20)
    return agent
class Player:
    """Mandatory class with the player methods"""

    def __init__(self, name='ppo_agent', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True
        self.ppo_agent = None
        self.poker_env = Environment.create(environment=env, max_episode_timesteps=100)
        self.runner = None
        if load_model:
            self.load(load_model)

    def load(self, model_name):
        print("Loading model...")
        self.ppo_agent = Agent.load(directory=model_name, format='hdf5')

    def start_step_policy(self, observation):
        log.info("Random action")
        _ = observation
        action = self.poker_env.action_space.sample()
        return action

    def train(self, model_name, num_ep=500):
        print('Training...')
        self.runner = Runner(agent='ppo.json', environment=dict(type=self.poker_env),
                             num_parallel=5, remote='multiprocessing')
        self.runner.run(num_episodes=num_ep)
        self.runner.agent.save(directory=model_name, format='hdf5')
        self.runner.close()

    def play(self, model_name, num_ep=5):
        self.load(model_name)
        print('Evaluating...')
        self.runner = Runner(agent=self.ppo_agent, environment=dict(type=self.poker_env))
        self.runner.run(num_episodes=num_ep, evaluation=True)
        self.runner.close()

    def action(self, action_space, observation, info):
        _ = observation
        _ = info
        this_player_action_space = {
            Action.FOLD, Action.CHECK, Action.CALL,
            Action.RAISE_POT, Action.RAISE_HALF_POT, Action.RAISE_2POT
        }
        action = this_player_action_space.intersection(set(action_space))
        return action
def test_runner_evaluation(self):
    states = dict(type='float', shape=(1,))
    actions = dict(type='int', shape=(), num_values=3)
    agent, environment = self.prepare(name='runner-evaluation', states=states, actions=actions)
    runner = Runner(agent=agent, environment=environment)

    self.num_evaluations = 0
    evaluation_frequency = 3
    max_evaluation_timesteps = 2
    num_evaluation_iterations = 2

    def evaluation_callback(r):
        self.num_evaluations += 1
        self.assertEqual(r.episode, self.num_evaluations * evaluation_frequency)
        self.assertEqual(len(r.evaluation_timesteps), num_evaluation_iterations)
        for num_timesteps in r.evaluation_timesteps:
            self.assertLessEqual(num_timesteps, max_evaluation_timesteps)

    runner.run(
        num_episodes=10, evaluation_callback=evaluation_callback,
        evaluation_frequency=evaluation_frequency,
        max_evaluation_timesteps=max_evaluation_timesteps,
        num_evaluation_iterations=num_evaluation_iterations
    )
    runner.close()

    sys.stdout.flush()
    self.assertTrue(expr=True)
def test_continuous(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=True) config = Configuration( batch_size=8, cg_iterations=20, cg_damping=0.001, line_search_steps=20, max_kl_divergence=0.05, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh') ]) ) agent = TRPOAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=2000, episode_finished=episode_finished) print('TRPO Agent (continuous): ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('TRPO continuous agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_discrete(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration(batch_size=8, learning_rate=0.001, memory_capacity=800, first_update=80, repeat_update=4, target_update_frequency=20, states=environment.states, actions=environment.actions, network=layered_network_builder( [dict(type='dense', size=32)])) agent = DQNAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=5000, episode_finished=episode_finished) print('DQN Agent: ' + str(runner.episode)) if runner.episode < 5000: passed += 1 print('passed') else: print('failed') print('DQN Agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_discrete(self): passed = 0 # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration(batch_size=8, learning_rate=0.0001, cg_iterations=20, cg_damping=0.001, line_search_steps=20, max_kl_divergence=0.05, states=environment.states, actions=environment.actions, network=layered_network_builder( [dict(type='dense', size=32)])) agent = TRPOAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=2000, episode_finished=episode_finished) print('TRPO Agent (discrete): ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('TRPO discrete agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def main():
    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100
    )
    agent = Agent.create(
        agent="a2c",
        batch_size=100,
        horizon=100,           # changed from 20 to 100 for agent_03
        exploration=0.05,      # changed from 0.01 to 0.05 for agent_03
        l2_regularization=0.2, # changed from 0.1 to 0.2 for agent_03
        # entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.1,    # changed from 0.05 to 0.1 for agent_03
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_03_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_03_env_03/checkpoints',
            frequency=600  # save checkpoint every 600 seconds (10 minutes)
        ),
    )
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    for _ in range(10):
        runner.run(num_episodes=10000)
        runner.run(num_episodes=1000, evaluation=True)
    bad_seeds_environment.close()
    agent.close()
def train_and_test(self, agent, n_steps, n_tests, early_stop):
    test_acc = self.acc.tests
    n_steps = n_steps * 10000
    test_acc.n_tests = n_tests
    test_acc.i = 0
    timesteps_each = n_steps // n_tests
    runner = Runner(agent=agent, environment=self)

    try:
        while test_acc.i <= n_tests:
            self.use_dataset(Mode.TRAIN)
            # max_episode_timesteps not required, since we kill on (cash|value)<0 or max_repeats
            runner.run(timesteps=timesteps_each)
            self.use_dataset(Mode.TEST)
            self.run_deterministic(runner, print_results=True)
            if early_stop > 0:
                sharpes = np.array(self.acc.episode.sharpes[-early_stop:])
                if test_acc.i >= early_stop and np.all(sharpes > 0):
                    test_acc.i = n_tests
            test_acc.i += 1
    except KeyboardInterrupt:
        # Lets us kill training with Ctrl-C and skip straight to the final test. This is useful in case you're
        # keeping an eye on terminal and see "there! right there, stop, you found it!" (where early_stop & n_steps
        # are the more methodical approaches)
        pass

    # On last "how would it have done IRL?" run, without getting in the way (no killing on repeats, 0-balance)
    print('Running no-kill test-set')
    self.use_dataset(Mode.TEST, full_set=True)
    self.run_deterministic(runner, print_results=True)
def test_restore_from_checkpoint(self):
    saver_steps = 15
    steps_per_episode = 20
    train_episodes = 2
    assert ((steps_per_episode + 1) * train_episodes % saver_steps) > 0

    environment = DummyEnv()
    network_spec = [dict(type='dense', size=4)]
    model_path = self._tmp_dir_path + "/model_auto_save"
    saver_spec = dict(directory=model_path, steps=saver_steps, load=False)
    agent = create_agent(environment, network_spec, saver_spec)

    runner = Runner(agent=agent, environment=environment)
    runner.run(max_episode_timesteps=steps_per_episode, episodes=train_episodes)

    # Deliberately avoid closing the runner/agent to simulate unexpected shutdown
    agent = create_agent(environment, network_spec)
    agent.restore_model(directory=model_path)
    agent.reset()

    expected_timestep = train_episodes * (steps_per_episode + 1) // saver_steps * saver_steps
    assert agent.episode == train_episodes - 1
    assert agent.timestep == expected_timestep

    runner = Runner(agent=agent, environment=environment)
    runner.run(max_episode_timesteps=steps_per_episode, episodes=train_episodes)
    assert agent.episode == 2 * train_episodes - 1
    runner.close()
def main():
    parser = argparse.ArgumentParser(description="Train an IBM agent")
    parser.add_argument("--render", default=False, action='store_true',
                        help="Whether to render or not. Defaults to False.")
    args = parser.parse_args()

    for n_simple in [3]:  # [1, 2, 3]:
        agent, environment = make_agent_env(1, n_simple, args.render)
        agent = restore_agent(agent)

        # Run
        runner = Runner(agent=agent, environment=environment)
        while True:
            runner.run(episodes=100, max_episode_timesteps=2000)
            ave_reward = np.mean(runner.episode_rewards)
            print("Average reward: %f with %d SimpleAgents" % (ave_reward, n_simple))
            directory = os.path.join(os.getcwd(), "log", "agent")
            runner.agent.save_model(directory=directory)
            if ave_reward > 0 and n_simple < 3:
                break
            if ave_reward > 0.9:
                break

        try:
            runner.close()
        except AttributeError as e:
            pass
def test_naf_agent(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8, learning_rate=0.001,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800, first_update=80, repeat_update=4,
            target_update_frequency=20, clip_gradients=1.0,
            states=environment.states, actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
            # Alternative configuration kept for reference:
            # batch_size=8, learning_rate=0.0025, discount=0.99,
            # exploration="OrnsteinUhlenbeckProcess",
            # exploration_kwargs=dict(sigma=0.1, mu=0, theta=0.1),
            # memory_capacity=800, first_update=80, repeat_update=4, target_update_frequency=20,
            # states=environment.states, actions=environment.actions, clip_gradients=5.0,
            # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('NAF Agent: ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('NAF Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 3)
def test_dqfd_agent(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=16, learning_rate=0.001, memory_capacity=800,
            first_update=80, repeat_update=4, target_update_frequency=20,
            demo_memory_capacity=100, demo_sampling_ratio=0.1,
            states=environment.states, actions=environment.actions,
            network=layered_network_builder(
                layers_config=[dict(type='dense', size=32, l2_regularization=0.0001)]
            )
        )
        agent = DQFDAgent(config=config)

        # First generate demonstration data and pretrain
        demonstrations = list()
        terminal = True
        for n in xrange(50):
            if terminal:
                state = environment.reset()
            action = 1
            state, reward, terminal = environment.execute(action=action)
            demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[])
            demonstrations.append(demonstration)
        agent.import_demonstrations(demonstrations)
        agent.pretrain(steps=1000)

        # Normal training
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQFD Agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('DQFD Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_example(self):
    passed = 0
    for _ in xrange(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(config=Configuration(
            loglevel='info',
            batch_size=100,
            baseline='mlp',
            baseline_args=None,
            baseline_kwargs=dict(size=32, repeat_update=100),
            override_line_search=False,
            generalized_advantage_estimation=True,
            normalize_advantage=False,
            gae_lambda=0.97,
            cg_iterations=20,
            cg_damping=0.01,
            line_search_steps=20,
            max_kl_divergence=0.005,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # A mean reward above 50 over the last 50 episodes should show that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1
    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', default='DQNAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60,
                        help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    args = parser.parse_args()

    env = OpenAIUniverse(args.gym_id)
    env.configure(remotes=1)

    default = dict(
        repeat_actions=1,
        actions=env.actions,
        states=env.states,
        max_episode_length=args.max_timesteps
    )
    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()
    config.default(default)

    if args.network_config:
        network_config = Configuration.from_json(args.network_config).network_layers
    else:
        if config.network_layers:
            network_config = config.network_layers
        else:
            raise TensorForceError("Error: No network configuration provided.")

    if args.debug:
        print("Configuration:")
        print(config)

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[config.loglevel])

    stack = None
    agent = create_agent(args.agent, config, network_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))
        runner.save_model(args.save, args.save_episodes)

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    # The monitor option is commented out above, so guard the attribute access to avoid an AttributeError
    if getattr(args, 'monitor', None):
        env.gym.monitor.close()
    env.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="ID of the gym environment") parser.add_argument('-a', '--agent', help='Agent') parser.add_argument('-c', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-config', help="Network configuration file") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") parser.add_argument('-m', '--monitor', help="Save results to this directory") parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # configurable!!! environment = OpenAIGym(args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video) if args.agent_config: agent_config = Configuration.from_json(args.agent_config) else: agent_config = Configuration() logger.info("No agent configuration provided.") if args.network_config: network = from_json(args.network_config) else: network = None logger.info("No network configuration provided.") agent_config.default(dict(states=environment.states, actions=environment.actions, network=network)) agent = agents[args.agent](config=agent_config) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) agent.load_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_config) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError("Cannot save agent to dir {} ()".format(save_dir)) runner = Runner( agent=agent, environment=environment, repeat_actions=1, save_path=args.save, save_episodes=args.save_episodes ) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500)) logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) return True logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) if args.monitor: environment.gym.monitor.close() environment.close()
    line_search_steps=20,
    max_kl_divergence=0.005,
    gamma=0.97,
    continuous=False,
    preprocessing=None,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ])
))

# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.timestep, reward=r.episode_rewards[-1]))
    return True

# Start learning
runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)

# Print statistics
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
    ep=runner.episode, ar=np.mean(runner.episode_rewards[-100:])))
def main(): parser = argparse.ArgumentParser(description="Playground Flags.") parser.add_argument("--game", default="pommerman", help="Game to choose.") parser.add_argument("--config", default="PommeFFA-v0", help="Configuration to execute. See env_ids in " "configs.py for options.") parser.add_argument("--agents", default="tensorforce::ppo,test::agents.SimpleAgent," "test::agents.SimpleAgent,test::agents.SimpleAgent", help="Comma delineated list of agent types and docker " "locations to run the agents.") parser.add_argument("--agent_env_vars", help="Comma delineated list of agent environment vars " "to pass to Docker. This is only for the Docker Agent." " An example is '0:foo=bar:baz=lar,3:foo=lam', which " "would send two arguments to Docker Agent 0 and one to" " Docker Agent 3.", default="") parser.add_argument("--record_pngs_dir", default=None, help="Directory to record the PNGs of the game. " "Doesn't record if None.") parser.add_argument("--record_json_dir", default=None, help="Directory to record the JSON representations of " "the game. Doesn't record if None.") parser.add_argument("--render", default=True, help="Whether to render or not. Defaults to True.") parser.add_argument("--game_state_file", default=None, help="File from which to load game state. Defaults to " "None.") args = parser.parse_args() config = args.config record_pngs_dir = args.record_pngs_dir record_json_dir = args.record_json_dir agent_env_vars = args.agent_env_vars game_state_file = args.game_state_file # TODO: After https://github.com/MultiAgentLearning/playground/pull/40 # this is still missing the docker_env_dict parsing for the agents. agents = [ helpers.make_agent_from_string(agent_string, agent_id+1000) for agent_id, agent_string in enumerate(args.agents.split(",")) ] env = make(config, agents, game_state_file) training_agent = None for agent in agents: if type(agent) == TensorForceAgent: training_agent = agent env.set_training_agent(agent.agent_id) break if args.record_pngs_dir: assert not os.path.isdir(args.record_pngs_dir) os.makedirs(args.record_pngs_dir) if args.record_json_dir: assert not os.path.isdir(args.record_json_dir) os.makedirs(args.record_json_dir) # Create a Proximal Policy Optimization agent agent = training_agent.initialize(env) atexit.register(functools.partial(clean_up_agents, agents)) wrapped_env = WrappedEnv(env, visualize=args.render) runner = Runner(agent=agent, environment=wrapped_env) runner.run(episodes=10, max_episode_timesteps=2000) print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times) try: runner.close() except AttributeError as e: pass
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-w', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-m', '--monitor', help="Save results to this file")
    parser.add_argument('-M', '--mode', choices=['tmux', 'child'], default='tmux', help="Starter mode")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-C', '--is-child', action='store_true')
    parser.add_argument('-i', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', default=False, help="Kill runners")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    args = parser.parse_args()

    session_name = 'openai_async'
    shell = '/bin/bash'
    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.is_child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir
                )

        def build_cmd(index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=',
                sys.executable, target_script,
                args.gym_id,
                '--is-child',
                '--agent', args.agent,
                '--agent-config', os.path.join(os.getcwd(), args.agent_config),
                '--network-config', os.path.join(os.getcwd(), args.network_config),
                '--num-workers', args.num_workers,
                '--task-index', index
            ]
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = ['mkdir -p {}'.format(args.logdir),
                    'rm -f {}/kill.sh'.format(args.logdir),
                    'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir),
                    'chmod +x {}/kill.sh'.format(args.logdir)]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1)))

        for i in xrange(args.num_workers):
            name = 'w_{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))
        os.system("\n".join(cmds))
        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions,
                              network=from_json(args.network_config)))
    agent_config.default(dict(
        distributed=True,
        cluster_spec=cluster_spec,
        global_model=(args.task_index == -1),
        device=('/job:ps' if args.task_index == -1 else '/job:worker/task:{}/cpu:0'.format(args.task_index))
    ))

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[agent_config.loglevel])

    agent = agents[args.agent](config=agent_config)

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        cluster_spec=cluster_spec,
        task_index=args.task_index
    )

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
def main():
    parser = argparse.ArgumentParser()
    # N.b. if run from within lab, the working directory is something like lab/bazel-out/../../tensorforce.
    # Hence, relative paths will not work without first fetching the path of this run file.
    parser.add_argument('-id', '--level-id', default='tests/demo_map', help="DeepMind Lab level id")
    parser.add_argument('-a', '--agent', default='VPGAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=1000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=200, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=True, help="Show debug outputs")

    # Redirect output to file
    sys.stdout = open('lab_output.txt', 'w')

    args = parser.parse_args()

    environment = DeepMindLab(args.level_id)

    path = os.path.dirname(__file__)
    if args.agent_config:
        # Use absolute path
        agent_config = Configuration.from_json(path + args.agent_config, True)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions,
                              network=from_json(path + args.network_config, True)))

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # configurable!!!

    agent = agents[args.agent](config=agent_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        save_path=args.save,
        save_episodes=args.save_episodes
    )

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))

    report_episodes = args.episodes // 1000

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Lab environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1))
    environment.close()