def main():
    bad_seeds_environment, agent = set_up()
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)
    agent.save(directory="saved_models")
    bad_seeds_environment.close()
    agent.close()
def main(
    time_limit=None,
    scoring="default",
    batch_size=16,
    gpu_idx=0,
    env_version=1,
    seed_count=9,
    max_count=10,
    out_path=None,
    num_episodes=int(3 * 10**3),
):
    env, agent = set_up(
        time_limit=time_limit,
        scoring=scoring,
        batch_size=batch_size,
        gpu_idx=gpu_idx,
        env_version=env_version,
        seed_count=seed_count,
        max_count=max_count,
        out_path=out_path,
    )
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=num_episodes)
    if out_path is None:
        out_path = Path()
    else:
        out_path = Path(out_path).expanduser()
    agent.save(directory=str(out_path / "saved_models"))
    agent.close()
    env.close()
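# A hypothetical command-line wrapper for the parameterized main() above; the
# flag names are illustrative assumptions, not part of the original module:
#
#     if __name__ == "__main__":
#         import argparse
#         parser = argparse.ArgumentParser(description="Train an agent and save it")
#         parser.add_argument("--num-episodes", type=int, default=3000)
#         parser.add_argument("--out-path", default=None)
#         cli = parser.parse_args()
#         main(num_episodes=cli.num_episodes, out_path=cli.out_path)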
def test_lstm(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32),
                dict(type='lstm')
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('VPG agent (LSTM): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('VPG agent (LSTM) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def main():
    env, agent = set_up()
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=10000)
    agent.save(directory="saved_models")
    agent.close()
    env.close()
def test_save_restore(self):
    environment_spec = {"float": ()}
    environment = create_environment(environment_spec)
    network_spec = [dict(type='dense', size=32)]
    agent = create_agent(environment, network_spec)
    runner = Runner(agent=agent, environment=environment)
    runner.run(episodes=100)

    model_values = agent.model.session.run(agent.model.get_variables(
        include_submodules=True, include_nontrainable=False))
    save_path = agent.model.save(directory=self._tmp_dir_path + "/model")
    print("Saved at: %s" % (save_path,))
    runner.close()

    agent = create_agent(environment, network_spec)
    agent.model.restore(directory="", file=save_path)
    restored_model_values = agent.model.session.run(agent.model.get_variables(
        include_submodules=True, include_nontrainable=False))

    assert len(model_values) == len(restored_model_values)
    assert all(
        np.array_equal(v1, v2)
        for v1, v2 in zip(model_values, restored_model_values)
    )
    agent.close()
def train_and_test(self, agent, early_stop=-1, n_tests=15):
    n_train = TIMESTEPS // n_tests
    i = 0
    runner = Runner(agent=agent, environment=self)

    try:
        while i <= n_tests:
            self.use_dataset(Mode.TRAIN)
            runner.run(timesteps=n_train, max_episode_timesteps=n_train)
            self.use_dataset(Mode.TEST)
            self.run_deterministic(runner, print_results=True)
            if early_stop > 0:
                advantages = np.array(self.acc.episode.advantages[-early_stop:])
                if i >= early_stop and np.all(advantages > 0):
                    i = n_tests
            i += 1
    except KeyboardInterrupt:
        # Lets us kill training with Ctrl-C and skip straight to the final test. This is
        # useful in case you're keeping an eye on the terminal and see "there! right there,
        # stop, you found it!" (where early_stop & n_tests are the more methodical approaches).
        pass

    # On the last "how would it have done IRL?" run, stay out of the way: no killing on
    # repeats or on a zero balance.
    print('Running no-kill test-set')
    self.use_dataset(Mode.TEST, no_kill=True)
    self.run_deterministic(runner, print_results=True)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.0005,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            memory=dict(type='replay', random_sampling=True),
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = CategoricalDQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Categorical DQN agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('Categorical DQN agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_replay(self):
    environment = MinimalTest(definition=[(False, (1, 2))])
    config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=50,
        memory=dict(type='replay', random_sampling=True),
        first_update=20,
        target_update_frequency=10,
        states=environment.states,
        actions=environment.actions,
        network=layered_network_builder([
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ])
    )
    agent = DQNAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 100 or not all(
            x / l >= reward_threshold
            for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
        )

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Replay memory DQN: ' + str(runner.episode))
def test_continuous(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('TRPO agent (continuous): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('TRPO agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete_baseline(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            baseline=dict(
                type="mlp",
                sizes=[32, 32],
                epochs=5,
                update_batch_size=8,
                learning_rate=0.01
            ),
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1500, episode_finished=episode_finished)
        print('VPG agent (discrete): ' + str(runner.episode))
        if runner.episode < 1500:
            passed += 1
    print('VPG agent (discrete) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            keep_last=True,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = DQNNstepAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQN Nstep agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('DQN Nstep agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def __init__(self,
             environment: 'TradingEnvironment',
             agent_spec: any,
             save_best_agent: bool = False,
             **kwargs):
    """
    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A `Tensorforce` agent or agent specification.
        save_best_agent (optional): Whether the runner should automatically save the best agent.
        kwargs (optional): Optional keyword arguments to adjust the strategy.
    """
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False)

    self._environment = Environment.create(
        environment='gym',
        level=environment,
        max_episode_timesteps=self._max_episode_timesteps)

    self._agent = Agent.create(agent=agent_spec, environment=self._environment)

    self._runner = Runner(agent=self._agent,
                          environment=self._environment,
                          save_best_agent=save_best_agent)
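# Illustrative construction of the strategy defined by the __init__ above
# (assuming it belongs to a trading-strategy class such as
# TensorforceTradingStrategy); the agent-spec values are placeholder
# assumptions, not recommendations:
#
#     strategy = TensorforceTradingStrategy(
#         environment=my_trading_environment,
#         agent_spec=dict(agent='ppo', batch_size=10, learning_rate=1e-3),
#         save_best_agent=True,
#         max_episode_timesteps=2000,
#     )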
def test_multi(self):
    passed = 0

    def network_builder(inputs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        return state0 * state1

    for _ in xrange(5):
        environment = MinimalTest(definition=[True, (True, 2)])
        config = Configuration(
            batch_size=16,
            learning_rate=0.00025,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 20 or not all(
                x >= 1.0 for x in r.episode_rewards[-20:])

        runner.run(episodes=10000, episode_finished=episode_finished)
        print('NAF agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 10000:
            passed += 1
    print('NAF agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 0)
def test_naf_agent(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('NAF agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1
    print('NAF agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def main(): parser = argparse.ArgumentParser(description="Train an IBM agent") parser.add_argument("--render", default=False, action='store_true', help="Whether to render or not. Defaults to False.") args = parser.parse_args() for n_simple in [3]: #[1, 2, 3]: agent, environment = make_agent_env(1, n_simple, args.render) agent = restore_agent(agent) # Run runner = Runner(agent=agent, environment=environment) while True: runner.run(episodes=100, max_episode_timesteps=2000) ave_reward = np.mean(runner.episode_rewards) print("Average reward: %f with %d SimpleAgents" % (ave_reward, n_simple)) directory = os.path.join(os.getcwd(), "log", "agent") runner.agent.save_model(directory=directory) if ave_reward > 0 and n_simple < 3: break if ave_reward > 0.9: break try: runner.close() except AttributeError as e: pass
def test_runner_evaluation(self):
    states = dict(type='float', shape=(1,))
    actions = dict(type='int', shape=(), num_values=3)
    agent, environment = self.prepare(
        name='runner-evaluation', states=states, actions=actions)
    runner = Runner(agent=agent, environment=environment)

    self.num_evaluations = 0
    evaluation_frequency = 3
    max_evaluation_timesteps = 2
    num_evaluation_iterations = 2

    def evaluation_callback(r):
        self.num_evaluations += 1
        self.assertEqual(r.episode, self.num_evaluations * evaluation_frequency)
        self.assertEqual(len(r.evaluation_timesteps), num_evaluation_iterations)
        for num_timesteps in r.evaluation_timesteps:
            self.assertLessEqual(num_timesteps, max_evaluation_timesteps)

    runner.run(
        num_episodes=10,
        evaluation_callback=evaluation_callback,
        evaluation_frequency=evaluation_frequency,
        max_evaluation_timesteps=max_evaluation_timesteps,
        num_evaluation_iterations=num_evaluation_iterations
    )
    runner.close()

    sys.stdout.flush()
    self.assertTrue(expr=True)
def main():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="random",
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_random_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)

    bad_seeds_environment.close()
    agent.close()
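# The summarizer above writes TensorFlow event files; assuming a standard
# TensorBoard installation, they can be inspected with (illustrative command):
#
#     tensorboard --logdir training_data/agent_random_env_02/summaries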
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=5000, episode_finished=episode_finished)
        print('DQN Agent: ' + str(runner.episode))
        if runner.episode < 5000:
            passed += 1
            print('passed')
        else:
            print('failed')
    print('DQN Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_multi(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        return state0 * state1

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2)])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = CategoricalDQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 15 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-15:], r.episode_lengths[-15:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('Categorical DQN agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('Categorical DQN agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def main():
    bad_seeds_environment = Environment.create(
        environment=BadSeeds03,
        seed_count=10,
        bad_seed_count=3,
        max_episode_length=100,
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,
        horizon=100,        # changed from 20 to 100 for agent_03
        exploration=0.05,   # changed from 0.01 to 0.05 for agent_03
        l2_regularization=0.2,  # changed from 0.1 to 0.2 for agent_03
        # entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.1,  # changed from 0.05 to 0.1 for agent_03
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_03_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_03_env_03/checkpoints',
            frequency=600,  # save checkpoint every 600 seconds (10 minutes)
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    for _ in range(10):
        runner.run(num_episodes=10000)
        runner.run(num_episodes=1000, evaluation=True)

    bad_seeds_environment.close()
    agent.close()
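# A minimal sketch of reloading the checkpoints written by the saver config
# above for later evaluation (assuming Tensorforce >= 0.5, whose Agent.load
# accepts a checkpoint directory):
#
#     from tensorforce import Agent
#     agent = Agent.load(directory="saved_models/agent_03_env_03/checkpoints")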
def test_discrete(self):
    passed = 0
    # TRPO can occasionally have numerical issues, so we allow for 1 in 5 to fail on Travis.
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.0001,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('TRPO discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def __init__(self,
             environment: TradingEnvironment,
             agent_spec: Dict = None,
             network_spec: Dict = None,
             **kwargs):
    """
    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A specification dictionary for the `Tensorforce` agent.
        network_spec: A specification dictionary for the `Tensorforce` agent's model network.
        kwargs (optional): Optional keyword arguments to adjust the strategy.
    """
    self._environment = environment
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', None)

    if agent_spec and network_spec:
        self._agent_spec = agent_spec
        self._network_spec = network_spec
        self._agent = Agent.from_spec(
            spec=agent_spec,
            kwargs=dict(network=network_spec,
                        states=environment.states,
                        actions=environment.actions))
        self._runner = Runner(agent=self._agent, environment=environment)
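# Sketch of calling this legacy two-spec constructor; the spec dictionaries
# follow the old Tensorforce Agent.from_spec format, and the values shown are
# illustrative assumptions only:
#
#     strategy = TensorforceTradingStrategy(
#         environment=env,
#         agent_spec={'type': 'ppo_agent',
#                     'step_optimizer': {'type': 'adam', 'learning_rate': 1e-3}},
#         network_spec=[dict(type='dense', size=64), dict(type='dense', size=32)],
#     )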
def train_and_test(self, agent, n_steps, n_tests, early_stop):
    test_acc = self.acc.tests
    n_steps = n_steps * 10000
    test_acc.n_tests = n_tests
    test_acc.i = 0
    timesteps_each = n_steps // n_tests
    runner = Runner(agent=agent, environment=self)

    try:
        while test_acc.i <= n_tests:
            self.use_dataset(Mode.TRAIN)
            # max_episode_timesteps not required, since we kill on (cash|value)<0 or max_repeats
            runner.run(timesteps=timesteps_each)
            self.use_dataset(Mode.TEST)
            self.run_deterministic(runner, print_results=True)
            if early_stop > 0:
                sharpes = np.array(self.acc.episode.sharpes[-early_stop:])
                if test_acc.i >= early_stop and np.all(sharpes > 0):
                    test_acc.i = n_tests
            test_acc.i += 1
    except KeyboardInterrupt:
        # Lets us kill training with Ctrl-C and skip straight to the final test. This is
        # useful in case you're keeping an eye on the terminal and see "there! right there,
        # stop, you found it!" (where early_stop & n_steps are the more methodical approaches).
        pass

    # On the last "how would it have done IRL?" run, stay out of the way: no killing on
    # repeats or on a zero balance.
    print('Running no-kill test-set')
    self.use_dataset(Mode.TEST, full_set=True)
    self.run_deterministic(runner, print_results=True)
def restore_agent(self, path: str, model_path: str = None):
    """Deserialize the strategy's learning agent from a file.

    Arguments:
        path: The `str` path of the file the agent specification is stored in.
            The `.json` file extension will be automatically appended if not provided.
        model_path (optional): The `str` path of the file or directory the agent
            checkpoint is stored in. If not provided, the `model_path` will default
            to `{path_without_dot_json}/agent`.
    """
    path_with_ext = path if path.endswith('.json') else f'{path}.json'

    with open(path_with_ext) as json_file:
        spec = json.load(json_file)

    # json.load returns a plain dict, so the specs must be read by key.
    self._agent_spec = spec['agent']
    self._network_spec = spec['network']

    self._agent = Agent.from_spec(
        spec=self._agent_spec,
        kwargs=dict(network=self._network_spec,
                    states=self._environment.states,
                    actions=self._environment.actions))

    path_without_ext = path_with_ext.replace('.json', '')
    model_path = model_path or f'{path_without_ext}/agent'
    self._agent.restore_model(file=model_path)

    self._runner = Runner(agent=self._agent, environment=self._environment)
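# Example calls for restore_agent above (the paths are hypothetical): given
# "specs/ppo" or "specs/ppo.json", the spec is read from specs/ppo.json and,
# unless model_path is supplied, the checkpoint is restored from specs/ppo/agent.
#
#     strategy.restore_agent(path="specs/ppo")
#     strategy.restore_agent(path="specs/ppo.json", model_path="checkpoints/ppo-best")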
def test_continuous(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=10000, episode_finished=episode_finished)
        print('TRPO Agent (continuous): ' + str(runner.episode))
        if runner.episode < 10000:
            passed += 1
            print('passed')
        else:
            print('failed')
    print('TRPO continuous agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_continuous(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=20,
            entropy_penalty=0.01,
            loss_clipping=0.1,
            epochs=10,
            optimizer_batch_size=10,
            learning_rate=0.0005,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = PPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('PPO agent (continuous): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1
    print('PPO agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def load_agent(
    time_limit=None,
    scoring="default",
    batch_size=16,
    gpu_idx=0,
    env_version=1,
    seed_count=9,
    max_count=10,
    out_path=None,
):
    env, agent = set_up(
        time_limit=time_limit,
        scoring=scoring,
        batch_size=batch_size,
        gpu_idx=gpu_idx,
        env_version=env_version,
        seed_count=seed_count,
        max_count=max_count,
        out_path=out_path,
    )
    if out_path is None:
        out_path = Path()
    else:
        out_path = Path(out_path).expanduser()
    agent.restore(directory=str(out_path / "saved_models"))

    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=20)
    return agent
def test_multi(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
        return state0 * state1 * state2

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2), (True, 2)])
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=4000, episode_finished=episode_finished)
        print('VPG agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 4000:
            passed += 1
    print('VPG agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_beta(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        actions = environment.actions
        actions['min_value'] = -0.5
        actions['max_value'] = 1.5
        config = Configuration(
            batch_size=8,
            learning_rate=0.01,
            states=environment.states,
            actions=actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1500, episode_finished=episode_finished)
        print('VPG agent (beta): ' + str(runner.episode))
        if runner.episode < 1500:
            passed += 1
    print('VPG agent (beta) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def train(config, network_spec=None):
    data_provider = DataProvider(config.db)
    env = StockEnvironment(data_provider, config, 0)
    agent = (overwrite_agent(env, network_spec, config)
             if config.overwrite_agent
             else load_agent(config, env, network_spec))

    mlflow.log_param("agent", "tensorforce.agents.DQNAgent")
    for key in config.agent_specs:
        mlflow.log_param(key, config.agent_specs[key])

    runner = Runner(agent=agent, environment=env)
    offset = 20000
    num_episodes = 20
    step = 0
    while data_provider.has_data_key(offset + config.max_step_per_episode):
        runner.run(num_episodes=num_episodes)
        offset = offset + config.max_step_per_episode
        env.offset = offset
        agent.save(config.agent_dir, config.agent_name)
        if step % 10 == 0:
            evaluate(config, data_provider,
                     offset - config.max_step_per_episode, agent)
        step += 1
    return agent, env