def main(): # maximum reward is approximately max_episode_length / seed_count = 10 bad_seeds_environment = Environment.create( environment=BadSeeds01, seed_count=10, bad_seed_count=3, max_episode_length=100 ) agent = Agent.create( agent="a2c", batch_size=100, # this seems to help a2c horizon=20, # does this help a2c? exploration=0.01, # tried without this at first l2_regularization=0.1, entropy_regularization=0.2, variable_noise=0.05, environment=bad_seeds_environment, summarizer=dict( directory="training_data/agent_01_env_01/summaries", # list of labels, or 'all' labels=["graph", "entropy", "kl-divergence", "losses", "rewards"], frequency=100, # store values every 100 timesteps ), ) runner = Runner(agent=agent, environment=bad_seeds_environment) runner.run(num_episodes=100000) agent.save(directory="saved_models")
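# A minimal follow-up sketch (not part of the original script): after training, the saved
# agent could be reloaded and evaluated without further updates. The directory name and
# episode count are illustrative assumptions; Agent.load and evaluation-only runs mirror the
# usage in other examples in this collection, and the same imports as the original script
# (Agent, Environment, Runner and the BadSeeds01 environment) are assumed.
def evaluate_saved_agent():
    eval_env = Environment.create(
        environment=BadSeeds01, seed_count=10, bad_seed_count=3, max_episode_length=100
    )
    eval_agent = Agent.load(directory="saved_models")  # assumes the default checkpoint format
    eval_runner = Runner(agent=eval_agent, environment=eval_env)
    eval_runner.run(num_episodes=100, evaluation=True)  # evaluation only, no training updates
    eval_runner.close()
    eval_agent.close()
    eval_env.close()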
def __init__(self, environment: 'TradingEnvironment', agent_spec: any, save_best_agent: bool = True, **kwargs): """ Arguments: environment: A `TradingEnvironment` instance for the agent to trade within. agent_spec: A `Tensorforce` agent or agent specification. save_best_agent (optional): Whether the runner should automatically save the best agent. kwargs (optional): Optional keyword arguments to adjust the strategy. """ self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False) self._environment = Environment.create( environment='gym', level=environment, max_episode_timesteps=self._max_episode_timesteps) self._agent = Agent.create(agent=agent_spec, environment=self._environment) self._runner = Runner(agent=self._agent, environment=self._environment, save_best_agent=save_best_agent)
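# A hedged usage sketch for the constructor above. The owning class name (here TradingStrategy),
# the pre-built trading_environment instance, and the PPO hyperparameters are illustrative
# assumptions, not taken from the original code.
agent_spec = {
    "agent": "ppo",
    "batch_size": 10,
    "learning_rate": 1e-3,
}
strategy = TradingStrategy(              # hypothetical class that defines this __init__
    environment=trading_environment,     # a previously constructed TradingEnvironment
    agent_spec=agent_spec,
    save_best_agent=True,
    max_episode_timesteps=2000,          # picked up from **kwargs and passed to Environment.create
)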
def test_discrete(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration( batch_size=8, learning_rate=0.001, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh') ]) ) agent = VPGAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=2000, episode_finished=episode_finished) print('VPG Agent (discrete): ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('VPG discrete agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_multi(self): passed = 0 def network_builder(inputs, **kwargs): layer = layers['dense'] state0 = layer(x=layer(x=inputs['state0'], size=32), size=32) state1 = layer(x=layer(x=inputs['state1'], size=32), size=32) return state0 * state1 for _ in xrange(5): environment = MinimalTest(definition=[True, (True, 2)]) config = Configuration(batch_size=16, learning_rate=0.00025, exploration=dict(type='ornstein_uhlenbeck'), memory_capacity=800, first_update=80, target_update_frequency=20, states=environment.states, actions=environment.actions, network=network_builder) agent = NAFAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 20 or not all( x / l >= reward_threshold for x, l in zip( r.episode_rewards[-20:], r.episode_lengths[-20:])) runner.run(episodes=10000, episode_finished=episode_finished) print('NAF agent (multi-state/action): ' + str(runner.episode)) if runner.episode < 10000: passed += 1 print('NAF agent (multi-state/action) passed = {}'.format(passed)) self.assertTrue(passed >= 0)
def test_discrete(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration(batch_size=8, learning_rate=0.001, memory_capacity=50, memory='replay', first_update=20, repeat_update=4, target_update_frequency=10, states=environment.states, actions=environment.actions, network=layered_network_builder( [dict(type='dense', size=32)])) agent = DQNAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=1000, episode_finished=episode_finished) print('Replay DQN: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('Replay DQN passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_discrete(self): passed = 0 # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration( batch_size=8, learning_rate=0.0001, cg_iterations=20, cg_damping=0.001, line_search_steps=20, max_kl_divergence=0.05, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh') ]) ) agent = TRPOAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=2000, episode_finished=episode_finished) print('TRPO Agent (discrete): ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('TRPO discrete agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_naf_agent(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=True) config = Configuration(batch_size=8, learning_rate=0.001, exploration=dict(type='ornstein_uhlenbeck'), memory_capacity=800, first_update=80, target_update_frequency=20, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32), dict(type='dense', size=32) ])) agent = NAFAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x / l >= reward_threshold for x, l in zip( r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=1000, episode_finished=episode_finished) print('NAF agent: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('NAF agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_discrete(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration( batch_size=8, learning_rate=0.001, memory_capacity=800, first_update=80, repeat_update=4, target_update_frequency=20, states=environment.states, actions=environment.actions, network=layered_network_builder([dict(type='dense', size=32)]) ) agent = DQNAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=1000, episode_finished=episode_finished) print('DQN Agent: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('DQN Agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def main(): gym_id = 'CartPole-v0' logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) max_episodes = 10000 max_timesteps = 1000 env = OpenAIGymEnvironment(gym_id, monitor=False, monitor_video=False) config = Config({ 'repeat_actions': 1, 'actions': env.actions, 'action_shape': env.action_shape, 'state_shape': env.state_shape, 'exploration': 'constant', 'exploration_args': [0.1] }) agent = SimpleQAgent(config, "simpleq") runner = Runner(agent, env) def episode_finished(r): if r.episode % 10 == 0: logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 10 rewards: {}".format(np.mean(r.episode_rewards[-10:]))) return True logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env)) runner.run(max_episodes, max_timesteps, episode_finished=episode_finished) logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1))
def train_and_test(self, agent, early_stop=-1, n_tests=15): n_train = TIMESTEPS // n_tests i = 0 runner = Runner(agent=agent, environment=self) try: while i <= n_tests: self.use_dataset(Mode.TRAIN) runner.run(timesteps=n_train, max_episode_timesteps=n_train) self.use_dataset(Mode.TEST) self.run_deterministic(runner, print_results=True) if early_stop > 0: advantages = np.array( self.acc.episode.advantages[-early_stop:]) if i >= early_stop and np.all(advantages > 0): i = n_tests i += 1 except KeyboardInterrupt: # Lets us kill training with Ctrl-C and skip straight to the final test. This is useful in case you're # keeping an eye on terminal and see "there! right there, stop you found it!" (where early_stop & n_tests # are the more methodical approaches) pass # On last "how would it have done IRL?" run, without getting in the way (no killing on repeats, 0-balance) print('Running no-kill test-set') self.use_dataset(Mode.TEST, no_kill=True) self.run_deterministic(runner, print_results=True)
def main(): env, agent = set_up() runner = Runner(agent=agent, environment=env) runner.run(num_episodes=10000) agent.save(directory="saved_models") agent.close() env.close()
def test_continuous(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=True) config = Configuration( batch_size=8, cg_iterations=20, cg_damping=0.001, line_search_steps=20, max_kl_divergence=0.05, states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh') ]) ) agent = TRPOAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=2000, episode_finished=episode_finished) print('TRPO Agent (continuous): ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('TRPO continuous agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def play(self, model_name, num_ep=5): self.load(model_name) print('Evaluating...') self.runner = Runner(agent=self.ppo_agent, environment=dict(type=self.poker_env)) self.runner.run(num_episodes=num_ep, evaluation=True) self.runner.close()
def test_multi(self): passed = 0 def network_builder(inputs): layer = layers['dense'] state0 = layer(x=layer(x=inputs['state0'], size=32), size=32) state1 = layer(x=layer(x=inputs['state1'], size=32), size=32) state2 = layer(x=layer(x=inputs['state2'], size=32), size=32) return state0 * state1 * state2 for _ in xrange(5): environment = MinimalTest( definition=[False, (False, 2), (False, (1, 2))]) config = Configuration(batch_size=8, learning_rate=0.001, memory_capacity=800, first_update=80, target_update_frequency=20, demo_memory_capacity=100, demo_sampling_ratio=0.2, states=environment.states, actions=environment.actions, network=network_builder) agent = DQFDAgent(config=config) # First generate demonstration data and pretrain demonstrations = list() terminal = True for n in xrange(50): if terminal: state = environment.reset() action = dict(action0=1, action1=(1, 1), action2=((1, 1), )) state, reward, terminal = environment.execute(action=action) demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[]) demonstrations.append(demonstration) agent.import_demonstrations(demonstrations) agent.pretrain(steps=1000) # Normal training runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 50 or not all( x >= 1.0 for x in r.episode_rewards[-50:]) runner.run(episodes=1000, episode_finished=episode_finished) print('DQFD agent (multi-state/action): ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('DQFD agent (multi-state/action) passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def unittest(self, num_updates=None, num_episodes=None, num_timesteps=None, environment=None, min_timesteps=None, states=None, actions=None, exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False, exclude_bounded_action=False, require_observe=False, require_all=False, **agent): """ Generic unit-test. """ agent, environment = self.prepare( environment=environment, min_timesteps=min_timesteps, states=states, actions=actions, exclude_bool_action=exclude_bool_action, exclude_int_action=exclude_int_action, exclude_float_action=exclude_float_action, exclude_bounded_action=exclude_bounded_action, require_observe=require_observe, require_all=require_all, **agent) self.runner = Runner(agent=agent, environment=environment) assert (num_updates is not None) + (num_episodes is not None) + \ (num_timesteps is not None) <= 1 if num_updates is None and num_episodes is None and num_timesteps is None: num_updates = self.__class__.num_updates num_episodes = self.__class__.num_episodes num_timesteps = self.__class__.num_timesteps if num_updates is None and num_episodes is None and num_timesteps is None: num_updates = 2 assert (num_updates is not None) + (num_episodes is not None) + \ (num_timesteps is not None) == 1 evaluation = not any([ require_all, require_observe, self.__class__.require_all, self.__class__.require_observe ]) self.runner.run(num_episodes=num_episodes, num_timesteps=num_timesteps, num_updates=num_updates, use_tqdm=False, evaluation=evaluation) self.runner.close() agent.close() environment.close() self.finished_test()
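# A minimal sketch of how a concrete test case might call the generic helper above. The agent
# type and hyperparameters are assumptions for illustration; only the keyword names visible in
# the unittest() signature are taken from the original.
def test_agent_sketch(self):
    self.unittest(
        num_updates=2,                 # at most one of num_updates/num_episodes/num_timesteps
        exclude_bounded_action=True,   # skip bounded-float actions for this hypothetical agent
        agent='ppo', batch_size=4, learning_rate=1e-3
    )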
def environment(self, environment: 'TradingEnvironment'): self._environment = Environment.create( environment='gym', level=environment, max_episode_timesteps=self._max_episode_timesteps) self._runner = Runner(agent=self._agent, environment=self._environment, save_best_agent=self._save_best_agent)
def main(): #tensorforce env = OpenAIGym('JacoArm-v0') agent = TRPOAgent(states_spec=env.states, actions_spec=env.actions, network_spec=network_spec, batch_size=512) # agent = PPOAgent( # states_spec=env.states, # actions_spec=env.actions, # network_spec=network_spec, # batch_size=512, # step_optimizer=dict( # type='adam', # learning_rate=1e-4 # ) # ) runner = Runner(agent=agent, environment=env) raw_input("hit enter when gazebo is loaded...") print() env.gym.unpause() env.gym.hold_init_robot_pos([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]) runner.run(episodes=1500, max_episode_timesteps=1000, episode_finished=episode_finished) #old-fashioned way # env = gym.make('JacoArm-v0') # print "launching the world..." # #gz loaing issues, let user start the learning # raw_input("hit enter when gazebo is loaded...") # env.set_physics_update(0.0001, 10000) # raw_input("hit enter when gazebo is loaded...") # # env.set_goal([0.167840578046, 0.297489331432, 0.857454500127]) # total_episodes = 100 # action = [1,1,1,1,1,1,1,1,1,1] # x = 0 # # for x in range(total_episodes): # while True: # # if x % 10 is 0: # action = numpy.random.rand(1, 10)[0] # # print 'new action is', action # state, reward, done, _ = env.step(action) # print reward # time.sleep(0.2) # x += 1 write_to_csv(train_data, 'test.csv') env.close()
def test_discrete(self): passed = 0 for _ in xrange(5): environment = MinimalTest(definition=False) config = Configuration(batch_size=8, learning_rate=0.001, memory_capacity=800, first_update=80, target_update_frequency=20, demo_memory_capacity=100, demo_sampling_ratio=0.2, memory=dict(type='replay', random_sampling=True), states=environment.states, actions=environment.actions, network=layered_network_builder([ dict(type='dense', size=32), dict(type='dense', size=32) ])) agent = DQFDAgent(config=config) # First generate demonstration data and pretrain demonstrations = list() terminal = True for n in xrange(50): if terminal: state = environment.reset() action = 1 state, reward, terminal = environment.execute(action=action) demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[]) demonstrations.append(demonstration) agent.import_demonstrations(demonstrations) agent.pretrain(steps=1000) # Normal training runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x / l >= reward_threshold for x, l in zip( r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=1000, episode_finished=episode_finished) print('DQFD agent: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('DQFD agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def train(self, model_name, num_ep=500): print('Training...') self.runner = Runner(agent='ppo.json', environment=dict(type=self.poker_env), num_parallel=5, remote='multiprocessing') self.runner.run(num_episodes=num_ep) self.runner.agent.save(directory=model_name, format='hdf5') self.runner.close()
def main( *, time_limit=None, scoring="default", batch_size=16, gpu_idx=0, env_version=2, out_path=None, num_episodes=int(3 * 10 ** 3), ): """ A self-contained set-up of the environment and run. Can be used to create all of the figures associated with the reference for variable batch size and variable time limit. All experiments use 10 'seeds'. Parameters ---------- time_limit : int, None Turn time limit for episode scoring : str in {'t22', 'tt5', 'monotonic', 'linear', 'square', 'default'} Name of reward function batch_size : int Batch size for training gpu_idx : int Optional index for GPU env_version : int in {1, 2} Environment version. 1 being ideal time, 2 being time limited out_path : path Top-level directory for output of models and checkpoints num_episodes : int Number of episodes to learn over Returns ------- None """ env, agent = set_up( time_limit=time_limit, scoring=scoring, batch_size=batch_size, gpu_idx=gpu_idx, env_version=env_version, out_path=out_path, ) runner = Runner(agent=agent, environment=env) runner.run(num_episodes=num_episodes) if out_path is None: out_path = Path() else: out_path = Path(out_path).expanduser() agent.save(directory=str(out_path / "saved_models")) agent.close() env.close()
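# Illustrative invocation of main() following its docstring; the argument values below are
# assumptions chosen for a short smoke run, not settings used in the original experiments.
main(
    time_limit=50,                # per-episode turn limit, only meaningful for env_version=2
    scoring="default",
    batch_size=16,
    gpu_idx=0,
    env_version=2,
    out_path="~/badseeds_runs",   # hypothetical output directory, expanded via Path().expanduser()
    num_episodes=1000,
)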
def test_dqfd_agent(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration( batch_size=16, learning_rate=0.001, memory_capacity=800, first_update=80, repeat_update=4, target_update_frequency=20, demo_memory_capacity=100, demo_sampling_ratio=0.1, states=environment.states, actions=environment.actions, network=layered_network_builder(layers_config=[ dict(type='dense', size=32, l2_regularization=0.0001) ])) agent = DQFDAgent(config=config) # First generate demonstration data and pretrain demonstrations = list() terminal = True for n in xrange(50): if terminal: state = environment.reset() action = 1 state, reward, terminal = environment.execute(action=action) demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[]) demonstrations.append(demonstration) agent.import_demonstrations(demonstrations) agent.pretrain(steps=1000) # Normal training runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=1000, episode_finished=episode_finished) print('DQFD Agent: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('DQFD Agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def restore_agent(self, directory: str, filename: str = None): """Deserialize the strategy's learning agent from a file. Arguments: directory: The `str` path of the directory the agent checkpoint is stored in. filename (optional): The `str` path of the file the agent specification is stored in. The `.json` file extension will be automatically appended if not provided. """ self._agent = Agent.load(directory, filename=filename) self._runner = Runner(agent=self._agent, environment=self._environment)
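# A hedged example of restoring a previously saved agent with the method above; the directory
# and filename are placeholders, and `strategy` is assumed to be an instance of the class that
# defines restore_agent.
strategy.restore_agent(directory="saved_agents/best", filename="agent")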
def test_example(self): sys.stdout.write('\nQuickstart:\n') sys.stdout.flush() passed = 0 for _ in xrange(3): # Create an OpenAI-Gym environment environment = OpenAIGym('CartPole-v0') # Network specification for the model network_spec = [ dict(type='dense', size=32), dict(type='dense', size=32) ] # Create the agent agent = PPOAgent(states_spec=environment.states, actions_spec=environment.actions, network_spec=network_spec, batch_size=4000, step_optimizer=dict(type='adam', learning_rate=1e-2), optimization_steps=5, discount=0.99, normalize_rewards=False, entropy_regularization=0.01, likelihood_ratio_clipping=0.2) # Initialize the runner runner = Runner(agent=agent, environment=environment) # Function handle called after each finished episode def episode_finished(r): # Test if mean reward over 50 should ensure that learning took off mean_reward = np.mean(r.episode_rewards[-50:]) return r.episode < 100 or mean_reward < 50.0 # Start the runner runner.run(episodes=2000, max_episode_timesteps=200, episode_finished=episode_finished) sys.stdout.write('episodes: {}\n'.format(runner.episode)) sys.stdout.flush() # Test passed if episode_finished handle evaluated to False if runner.episode < 2000: passed += 1 sys.stdout.write('==> passed: {}\n'.format(passed)) sys.stdout.flush() self.assertTrue(passed >= 2)
def test_runner_callback(self): states = dict(type='float', shape=(1,)) actions = dict(type='int', shape=(), num_values=3) agent, environment = self.prepare(name='runner-callback', states=states, actions=actions) environment.timestep_range = (6, 10) runner = Runner(agent=agent, environment=environment) callback_episode_frequency = 2 self.num_callbacks = 0 def callback(r): self.num_callbacks += 1 self.assertEqual(r.episode, self.num_callbacks * callback_episode_frequency) runner.run( num_episodes=10, callback=callback, callback_episode_frequency=callback_episode_frequency ) callback_timestep_frequency = 3 self.num_callbacks = 0 def callback(r): self.num_callbacks += 1 self.assertEqual(r.episode_timestep, self.num_callbacks * callback_timestep_frequency) runner.run( num_episodes=11, callback=callback, callback_timestep_frequency=callback_timestep_frequency ) self.is_callback1 = False self.is_callback2 = False def callback1(r): self.is_callback1 = True def callback2(r): self.is_callback2 = True runner.run( num_episodes=12, callback=[callback1, callback2], callback_timestep_frequency=callback_timestep_frequency ) self.assertTrue(expr=(self.is_callback1 and self.is_callback2)) runner.close() sys.stdout.flush() self.assertTrue(expr=True)
def main(): bad_seeds_environment = Environment.create(environment=Bollux, seed_count=10, bad_seed_count=3, max_episode_length=100) # 20200820-223031 # 20200820-233243 # batch_size 1000 does not get smarter or dumber # batch_size 100 20200821-095410 gets dumber # try batch size 10000 ! agent = Agent.create( agent="a2c", batch_size=10000, # changed for 04 but was this a mistake? no horizon=50, # changed from 100 to 50 for agent_04 discount=0.97, # new for agent_04 #exploration=0.05, # turned off for agent_04 - turn on for 05? l2_regularization=0.1, #entropy_regularization=0.2, # turned off for agent_03 variable_noise=0.5, # changed from 0.1 to 0.5 for agent_04 environment=bad_seeds_environment, summarizer=dict( directory="training_data/agent_04_bollux_1000000/summaries", # list of labels, or 'all' labels=["graph", "entropy", "kl-divergence", "losses", "rewards"], frequency=100, # store values every 100 timesteps ), saver=dict( directory='saved_models/agent_04_bollux_1000000/checkpoints', frequency=6000 # save checkpoint every 6000 seconds (100 minutes) ), ) # this is the batch_size = 10000 version # I hope it is the last env 04 runner = Runner(agent=agent, environment=bad_seeds_environment) runner.run(num_episodes=1000000) #for i in range(100): # print("running 10000 episodes") # runner.run(num_episodes=10000) # print("saving the agent") # directory = Path(f"saved_models/agent_04_env_04_1000000/10000_{i}/checkpoints") # if directory.exists(): # directory.rmdir() # directory.mkdir(parents=True, exist_ok=True) # agent.save(directory=str(directory), format="numpy") bad_seeds_environment.close() agent.close()
def test_continuous(self): environment = MinimalTest(definition=True) config = Configuration(states=environment.states, actions=environment.actions) agent = RandomAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=1000, episode_finished=episode_finished) print('Random agent (continuous): ' + str(runner.episode)) self.assertTrue(runner.episode == 1000)
def test_example(self): passed = 0 for _ in xrange(3): # Create an OpenAIgym environment env = OpenAIGym('CartPole-v0') # Create a Trust Region Policy Optimization agent agent = PPOAgent(config=Configuration( log_level='info', batch_size=256, memory=dict( type='prioritized_replay', ), update_frequency=256, first_update=512, learning_rate=0.0001, optimizer_batch_size=64, normalize_rewards=False, gae_rewards=False, baseline=dict( type="mlp", sizes=[32, 32], epochs=1, update_batch_size=64, learning_rate=0.001 ), states=env.states, actions=env.actions, network=layered_network_builder([ dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh') ]) )) runner = Runner(agent=agent, environment=env) def episode_finished(r): # Test if mean reward over 50 should ensure that learning took off avg_reward = np.mean(r.episode_rewards[-50:]) return r.episode < 100 or avg_reward < 50.0 runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished) if runner.episode < 2000: passed += 1 print('Quick start example passed = {}'.format(passed)) self.assertTrue(passed >= 2)
class Player: """Mandatory class with the player methods""" def __init__(self, name='ppo_agent', load_model=None, env=None): """Initialization of an agent""" self.equity_alive = 0 self.actions = [] self.last_action_in_stage = '' self.temp_stack = [] self.name = name self.autoplay = True self.ppo_agent = None self.poker_env = Environment.create(environment=env, max_episode_timesteps=100) self.runner = None if load_model: self.load(load_model) def load(self, model_name): print("Loading model...") self.ppo_agent = Agent.load(directory=model_name, format='hdf5') def start_step_policy(self, observation): log.info("Random action") _ = observation action = self.poker_env.action_space.sample() return action def train(self, model_name, num_ep=500): print('Training...') self.runner = Runner(agent='ppo.json', environment=dict(type=self.poker_env), num_parallel=5, remote='multiprocessing') self.runner.run(num_episodes=num_ep) self.runner.agent.save(directory=model_name, format='hdf5') self.runner.close() def play(self, model_name, num_ep=5): self.load(model_name) print('Evaluating...') self.runner = Runner(agent=self.ppo_agent, environment=dict(type=self.poker_env)) self.runner.run(num_episodes=num_ep, evaluation=True) self.runner.close() def action(self, action_space, observation, info): _ = observation _ = info this_player_action_space = { Action.FOLD, Action.CHECK, Action.CALL, Action.RAISE_POT, Action.RAISE_HALF_POT, Action.RAISE_2POT } action = this_player_action_space.intersection(set(action_space)) return action
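# A minimal usage sketch for the Player class above; the environment identifier, model
# directory, and episode counts are illustrative assumptions (env is assumed to be anything
# Environment.create accepts, such as a gym id string or an environment object).
player = Player(name='ppo_agent', env='neuron_poker-v0')    # hypothetical gym environment id
player.train(model_name='ppo_poker_model', num_ep=500)      # parallel training via Runner
player.play(model_name='ppo_poker_model', num_ep=5)         # evaluation-only episodes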
def test_multi_baseline(self): passed = 0 def network_builder(inputs, **kwargs): layer = layers['dense'] state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'), size=32, scope='state0-2') state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'), size=32, scope='state1-2') state2 = layer(x=layer(x=inputs['state2'], size=32, scope='state2-1'), size=32, scope='state2-2') return state0 * state1 * state2 for _ in xrange(5): environment = MinimalTest( definition=[False, (False, 2), (True, 2)]) config = Configuration(batch_size=8, learning_rate=0.001, baseline=dict(type="mlp", sizes=[32, 32], epochs=5, update_batch_size=8, learning_rate=0.01), states=environment.states, actions=environment.actions, network=network_builder) agent = VPGAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x / l >= reward_threshold for x, l in zip( r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=4000, episode_finished=episode_finished) print('VPG agent (multi-state/action): ' + str(runner.episode)) if runner.episode < 4000: passed += 1 print('VPG agent (multi-state/action) passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_discrete(self): environment = MinimalTest(definition=False) config = Configuration(states=environment.states, actions=environment.actions) agent = RandomAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip( r.episode_rewards[-100:], r.episode_lengths[-100:])) runner.run(episodes=1000, episode_finished=episode_finished) print('Random agent (discrete): ' + str(runner.episode)) self.assertTrue(runner.episode == 1000)
def test_naf_agent(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=True) config = Configuration( batch_size=8, learning_rate=0.001, exploration=dict(type='ornstein_uhlenbeck'), memory_capacity=800, first_update=80, repeat_update=4, target_update_frequency=20, clip_gradients=1.0, states=environment.states, actions=environment.actions, network=layered_network_builder([dict(type='dense', size=32)]) # batch_size=8, # learning_rate=0.0025, # # exploration="OrnsteinUhlenbeckProcess", # # exploration_kwargs=dict( # # sigma=0.1, # # mu=0, # # theta=0.1 # # ), # discount=0.99, # memory_capacity=800, # first_update=80, # repeat_update=4, # target_update_frequency=20, # states=environment.states, # actions=environment.actions, # clip_gradients=5.0, # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)]) ) agent = NAFAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all( x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=2000, episode_finished=episode_finished) print('NAF Agent: ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('NAF Agent passed = {}'.format(passed)) self.assertTrue(passed >= 3)
def base_test_pass(self, name, environment, network_spec, **kwargs): """ Basic test loop, requires an Agent to achieve a certain performance on an environment. """ sys.stdout.write('\n{} ({}):'.format(self.__class__.agent.__name__, name)) sys.stdout.flush() passed = 0 for _ in xrange(3): if self.__class__.requires_network: agent = self.__class__.agent( states_spec=environment.states, actions_spec=environment.actions, network_spec=network_spec, **kwargs ) else: agent = self.__class__.agent( states_spec=environment.states, actions_spec=environment.actions, **kwargs ) runner = Runner(agent=agent, environment=environment) self.pre_run(agent=agent, environment=environment) def episode_finished(r): episodes_passed = [ rw / ln >= self.__class__.pass_threshold for rw, ln in zip(r.episode_rewards[-100:], r.episode_timesteps[-100:]) ] return r.episode < 100 or not all(episodes_passed) runner.run(episodes=3000, deterministic=self.__class__.deterministic, episode_finished=episode_finished) sys.stdout.write(' ' + str(runner.episode)) sys.stdout.flush() if all(rw / ln >= self.__class__.pass_threshold for rw, ln in zip(runner.episode_rewards[-100:], runner.episode_timesteps[-100:])): passed += 1 if passed == 2: break sys.stdout.write(' ==> {} passed\n'.format(passed)) sys.stdout.flush() self.assertTrue(passed >= 2)
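# A hypothetical subclass sketch showing how the class-level hooks used by base_test_pass above
# (agent, requires_network, deterministic, pass_threshold) might be wired up; the base class
# name and the chosen values are assumptions for illustration only.
class TestVPGSketch(BaseAgentTest, unittest.TestCase):   # BaseAgentTest assumed to define base_test_pass
    agent = VPGAgent
    requires_network = True
    deterministic = False
    pass_threshold = 0.8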
def test_multi(self): environment = MinimalTest( definition=[False, (False, 2), (False, (1, 2)), (True, (1, 2))]) config = Configuration(states=environment.states, actions=environment.actions) agent = RandomAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 20 or not all(x >= 1.0 for x in r.episode_rewards[-20:]) runner.run(episodes=1000, episode_finished=episode_finished) print('Random agent (multi-state/action): ' + str(runner.episode)) self.assertTrue(runner.episode == 1000)
def test_dqfd_agent(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=False) config = Configuration( batch_size=16, learning_rate=0.001, memory_capacity=800, first_update=80, repeat_update=4, target_update_frequency=20, demo_memory_capacity=100, demo_sampling_ratio=0.1, states=environment.states, actions=environment.actions, network=layered_network_builder(layers_config=[dict(type='dense', size=32, l2_regularization=0.0001)]) ) agent = DQFDAgent(config=config) # First generate demonstration data and pretrain demonstrations = list() terminal = True for n in xrange(50): if terminal: state = environment.reset() action = 1 state, reward, terminal = environment.execute(action=action) demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[]) demonstrations.append(demonstration) agent.import_demonstrations(demonstrations) agent.pretrain(steps=1000) # Normal training runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=1000, episode_finished=episode_finished) print('DQFD Agent: ' + str(runner.episode)) if runner.episode < 1000: passed += 1 print('DQFD Agent passed = {}'.format(passed)) self.assertTrue(passed >= 4)
def test_naf_agent(self): passed = 0 for _ in xrange(5): environment = MinimalTest(continuous=True) config = Configuration( batch_size=8, learning_rate=0.001, exploration=dict(type='ornstein_uhlenbeck'), memory_capacity=800, first_update=80, repeat_update=4, target_update_frequency=20, clip_gradients=1.0, states=environment.states, actions=environment.actions, network=layered_network_builder([dict(type='dense', size=32)]) # batch_size=8, # learning_rate=0.0025, # # exploration="OrnsteinUhlenbeckProcess", # # exploration_kwargs=dict( # # sigma=0.1, # # mu=0, # # theta=0.1 # # ), # discount=0.99, # memory_capacity=800, # first_update=80, # repeat_update=4, # target_update_frequency=20, # states=environment.states, # actions=environment.actions, # clip_gradients=5.0, # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)]) ) agent = NAFAgent(config=config) runner = Runner(agent=agent, environment=environment) def episode_finished(r): return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:]) runner.run(episodes=2000, episode_finished=episode_finished) print('NAF Agent: ' + str(runner.episode)) if runner.episode < 2000: passed += 1 print('NAF Agent passed = {}'.format(passed)) self.assertTrue(passed >= 3)
def run_experiment(self, environment, experiment_num=0): config = copy(self.config) max_episodes = config.pop('max_episodes') max_episode_timesteps = config.pop('max_episode_timesteps') network_spec = config.pop('network') agent = Agent.from_spec( spec=config, kwargs=dict( states_spec=environment.states, actions_spec=environment.actions, network_spec=network_spec ) ) if experiment_num == 0 and self.history_data: logging.info("Attaching history data to runner") history_data = self.history_data else: history_data = None if experiment_num == 0 and self.load_model_file: logging.info("Loading model data from file: {}".format(self.load_model)) agent.load_model(self.load_model_file) runner = Runner( agent=agent, environment=environment, repeat_actions=1, history=history_data # save_path=args.model, # save_episodes=args.save_model ) environment.reset() agent.reset() runner.run(episodes=max_episodes, max_episode_timesteps=max_episode_timesteps, episode_finished=self.episode_finished) return dict( initial_reset_time=0, episode_rewards=runner.episode_rewards, episode_timesteps=runner.episode_timesteps, episode_end_times=runner.episode_times )
def test_example(self): passed = 0 for _ in xrange(3): # Create an OpenAIgym environment env = OpenAIGym('CartPole-v0') # Create a Trust Region Policy Optimization agent agent = TRPOAgent(config=Configuration( loglevel='info', batch_size=100, baseline='mlp', baseline_args=None, baseline_kwargs=dict( size=32, repeat_update=100 ), override_line_search=False, generalized_advantage_estimation=True, normalize_advantage=False, gae_lambda=0.97, cg_iterations=20, cg_damping=0.01, line_search_steps=20, max_kl_divergence=0.005, states=env.states, actions=env.actions, network=layered_network_builder([ dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh') ]) )) runner = Runner(agent=agent, environment=env) def episode_finished(r): # Test if mean reward over 50 should ensure that learning took off avg_reward = np.mean(r.episode_rewards[-50:]) return r.episode < 100 or avg_reward < 50.0 runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished) if runner.episode < 2000: passed += 1 print('Quick start example passed = {}'.format(passed)) self.assertTrue(passed >= 2)
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="ID of the gym environment") parser.add_argument('-a', '--agent', help='Agent') parser.add_argument('-c', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-config', help="Network configuration file") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") parser.add_argument('-w', '--num-workers', type=int, default=1, help="Number of worker agents") parser.add_argument('-m', '--monitor', help="Save results to this file") parser.add_argument('-M', '--mode', choices=['tmux', 'child'], default='tmux', help="Starter mode") parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory") parser.add_argument('-C', '--is-child', action='store_true') parser.add_argument('-i', '--task-index', type=int, default=0, help="Task index") parser.add_argument('-K', '--kill', action='store_true', default=False, help="Kill runners") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() session_name = 'openai_async' shell = '/bin/bash' kill_cmds = [ "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers), "tmux kill-session -t {}".format(session_name), ] if args.kill: os.system("\n".join(kill_cmds)) return 0 if not args.is_child: # start up child processes target_script = os.path.abspath(inspect.stack()[0][1]) def wrap_cmd(session, name, cmd): if isinstance(cmd, list): cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd) if args.mode == 'tmux': return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd)) elif args.mode == 'child': return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format( cmd, args.logdir, session, name, args.logdir ) def build_cmd(index): cmd_args = [ 'CUDA_VISIBLE_DEVICES=', sys.executable, target_script, args.gym_id, '--is-child', '--agent', args.agent, '--agent-config', os.path.join(os.getcwd(), args.agent_config), '--network-config', os.path.join(os.getcwd(), args.network_config), '--num-workers', args.num_workers, '--task-index', index ] if args.debug: cmd_args.append('--debug') return cmd_args if args.mode == 'tmux': cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)] elif args.mode == 'child': cmds = ['mkdir -p {}'.format(args.logdir), 'rm -f {}/kill.sh'.format(args.logdir), 'echo "#!/bin/bash" > {}/kill.sh'.format(args.logdir), 'chmod +x {}/kill.sh'.format(args.logdir)] cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1))) for i in xrange(args.num_workers): name = 'w_{}'.format(i) if args.mode == 'tmux': cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell)) cmds.append(wrap_cmd(session_name, name, build_cmd(i))) # add one PS call # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell)) print("\n".join(cmds)) os.system("\n".join(cmds)) return 0 ps_hosts = ['127.0.0.1:{}'.format(12222)] worker_hosts = [] port = 12223 for _ in range(args.num_workers): worker_hosts.append('127.0.0.1:{}'.format(port)) port += 1 cluster = {'ps': ps_hosts, 'worker': worker_hosts} cluster_spec = tf.train.ClusterSpec(cluster) environment = OpenAIGym(args.gym_id) if args.agent_config: agent_config = Configuration.from_json(args.agent_config) else: raise TensorForceError("No agent configuration provided.") if not args.network_config: raise TensorForceError("No network configuration provided.") agent_config.default(dict(states=environment.states, actions=environment.actions, network=from_json(args.network_config))) agent_config.default(dict(distributed=True, cluster_spec=cluster_spec, global_model=(args.task_index == -1), device=('/job:ps' if args.task_index == -1 else '/job:worker/task:{}/cpu:0'.format(args.task_index)))) logger = logging.getLogger(__name__) logger.setLevel(log_levels[agent_config.loglevel]) agent = agents[args.agent](config=agent_config) logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id)) logger.info("Config:") logger.info(agent_config) runner = Runner( agent=agent, environment=environment, repeat_actions=1, cluster_spec=cluster_spec, task_index=args.task_index ) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500)) logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) return True runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
def main(): parser = argparse.ArgumentParser(description="Playground Flags.") parser.add_argument("--game", default="pommerman", help="Game to choose.") parser.add_argument("--config", default="PommeFFA-v0", help="Configuration to execute. See env_ids in " "configs.py for options.") parser.add_argument("--agents", default="tensorforce::ppo,test::agents.SimpleAgent," "test::agents.SimpleAgent,test::agents.SimpleAgent", help="Comma delineated list of agent types and docker " "locations to run the agents.") parser.add_argument("--agent_env_vars", help="Comma delineated list of agent environment vars " "to pass to Docker. This is only for the Docker Agent." " An example is '0:foo=bar:baz=lar,3:foo=lam', which " "would send two arguments to Docker Agent 0 and one to" " Docker Agent 3.", default="") parser.add_argument("--record_pngs_dir", default=None, help="Directory to record the PNGs of the game. " "Doesn't record if None.") parser.add_argument("--record_json_dir", default=None, help="Directory to record the JSON representations of " "the game. Doesn't record if None.") parser.add_argument("--render", default=True, help="Whether to render or not. Defaults to True.") parser.add_argument("--game_state_file", default=None, help="File from which to load game state. Defaults to " "None.") args = parser.parse_args() config = args.config record_pngs_dir = args.record_pngs_dir record_json_dir = args.record_json_dir agent_env_vars = args.agent_env_vars game_state_file = args.game_state_file # TODO: After https://github.com/MultiAgentLearning/playground/pull/40 # this is still missing the docker_env_dict parsing for the agents. agents = [ helpers.make_agent_from_string(agent_string, agent_id+1000) for agent_id, agent_string in enumerate(args.agents.split(",")) ] env = make(config, agents, game_state_file) training_agent = None for agent in agents: if type(agent) == TensorForceAgent: training_agent = agent env.set_training_agent(agent.agent_id) break if args.record_pngs_dir: assert not os.path.isdir(args.record_pngs_dir) os.makedirs(args.record_pngs_dir) if args.record_json_dir: assert not os.path.isdir(args.record_json_dir) os.makedirs(args.record_json_dir) # Create a Proximal Policy Optimization agent agent = training_agent.initialize(env) atexit.register(functools.partial(clean_up_agents, agents)) wrapped_env = WrappedEnv(env, visualize=args.render) runner = Runner(agent=agent, environment=wrapped_env) runner.run(episodes=10, max_episode_timesteps=2000) print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times) try: runner.close() except AttributeError as e: pass
gae_lambda=0.97, cg_iterations=20, cg_damping=0.01, line_search_steps=20, max_kl_divergence=0.005, gamma=0.97, continuous=False, preprocessing=None, states=env.states, actions=env.actions, network=layered_network_builder([dict(type='dense', size=32, activation='tanh'), dict(type='dense', size=32, activation='tanh')]) )) # Create the runner runner = Runner(agent=agent, environment=env) # Callback function printing episode statistics def episode_finished(r): print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.timestep, reward=r.episode_rewards[-1])) return True # Start learning runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished) # Print statistics print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(ep=runner.episode, ar=np.mean(runner.episode_rewards[-100:])))
def main(): parser = argparse.ArgumentParser() # N.b. if ran from within lab, the working directory is something like lab/bazel-out/../../tensorforce # Hence, relative paths will not work without first fetching the path of this run file parser.add_argument('-id', '--level-id', default='tests/demo_map',help="DeepMind Lab level id") parser.add_argument('-a', '--agent', default='VPGAgent') parser.add_argument('-c', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-config', help="Network configuration file") parser.add_argument('-e', '--episodes', type=int, default=1000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=200, help="Maximum number of timesteps per episode") parser.add_argument('-m', '--monitor', help="Save results to this directory") parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=True, help="Show debug outputs") # Redirect output to file sys.stdout = open('lab_output.txt', 'w') args = parser.parse_args() environment = DeepMindLab(args.level_id) path = os.path.dirname(__file__) if args.agent_config: # Use absolute path agent_config = Configuration.from_json(path + args.agent_config, True) else: raise TensorForceError("No agent configuration provided.") if not args.network_config: raise TensorForceError("No network configuration provided.") agent_config.default(dict(states=environment.states, actions=environment.actions, network=from_json(path + args.network_config, True))) logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) # configurable!!! 
agent = agents[args.agent](config=agent_config) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) agent.load_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_config) runner = Runner( agent=agent, environment=environment, repeat_actions=1, save_path=args.save, save_episodes=args.save_episodes ) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError("Cannot save agent to dir {} ()".format(save_dir)) report_episodes = args.episodes // 1000 def episode_finished(r): if r.episode % report_episodes == 0: logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:]))) logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:]))) return True logger.info("Starting {agent} for Lab environment '{env}'".format(agent=agent, env=environment)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1)) environment.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="ID of the gym environment") parser.add_argument('-a', '--agent', default='DQNAgent') parser.add_argument('-c', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-config', help="Network configuration file") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60, help="Maximum number of timesteps per episode") # parser.add_argument('-m', '--monitor', help="Save results to this directory") # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() env = OpenAIUniverse(args.gym_id) env.configure(remotes=1) default = dict( repeat_actions=1, actions=env.actions, states=env.states, max_episode_length=args.max_timesteps ) if args.agent_config: config = Configuration.from_json(args.agent_config) else: config = Configuration() config.default(default) if args.network_config: network_config = Configuration.from_json(args.network_config).network_layers else: if config.network_layers: network_config = config.network_layers else: raise TensorForceError("Error: No network configuration provided.") if args.debug: print("Configuration:") print(config) logger = logging.getLogger(__name__) logger.setLevel(log_levels[config.loglevel]) stack = None agent = create_agent(args.agent, config, network_config) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) agent.load_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(config) runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError("Cannot save agent to dir {} ()".format(save_dir)) runner.save_model(args.save, args.save_episodes) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:]))) logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:]))) return True logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) if getattr(args, 'monitor', None): # guard: the --monitor flag is commented out above env.gym.monitor.close() env.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="ID of the gym environment") parser.add_argument('-a', '--agent', help='Agent') parser.add_argument('-c', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-config', help="Network configuration file") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") parser.add_argument('-m', '--monitor', help="Save results to this directory") parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # configurable!!! environment = OpenAIGym(args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video) if args.agent_config: agent_config = Configuration.from_json(args.agent_config) else: agent_config = Configuration() logger.info("No agent configuration provided.") if args.network_config: network = from_json(args.network_config) else: network = None logger.info("No network configuration provided.") agent_config.default(dict(states=environment.states, actions=environment.actions, network=network)) agent = agents[args.agent](config=agent_config) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) agent.load_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_config) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError("Cannot save agent to dir {} ()".format(save_dir)) runner = Runner( agent=agent, environment=environment, repeat_actions=1, save_path=args.save, save_episodes=args.save_episodes ) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500)) logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) return True logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode)) if args.monitor: environment.gym.monitor.close() environment.close()