def test_load_performance(self):
    self.start_tests(name='load-performance')

    environment = Environment.create(environment='CartPole-v1')

    agent = Agent.load(
        directory='test/data', filename='ppo-checkpoint', format='checkpoint',
        environment=environment
    )
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
    runner.close()
    agent.close()
    self.finished_test()

    agent = Agent.load(
        directory='test/data', filename='ppo-checkpoint', format='numpy',
        environment=environment
    )
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
    runner.close()
    agent.close()
    self.finished_test()

    agent = Agent.load(
        directory='test/data', filename='ppo-checkpoint', format='hdf5',
        environment=environment
    )
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
    self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
    runner.close()
    agent.close()
    self.finished_test()

    agent = tf.saved_model.load(export_dir='test/data/ppo-checkpoint')

    # 10 episodes
    for _ in range(10):
        states = environment.reset()
        terminal = False
        episode_reward = 0.0
        while not terminal:
            states = np.expand_dims(states, axis=0)
            auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))
            actions = agent.act(states, auxiliaries, True)
            actions = actions.numpy().item()
            states, terminal, reward = environment.execute(actions=actions)
            episode_reward += reward
        self.assertEqual(episode_reward, 500.0)

    environment.close()
    self.finished_test()
def test_dpg(self):
    self.start_tests(name='DPG')
    actions = dict(
        gaussian_action1=dict(type='float', shape=(1, 2), min_value=1.0, max_value=2.0),
        gaussian_action2=dict(type='float', shape=(1,), min_value=-2.0, max_value=1.0)
    )
    agent, environment = self.prepare(
        actions=actions, agent='dpg', memory=100, batch_size=4,
        # TODO: no-RNN restriction can be removed
        network=dict(type='auto', size=8, depth=1, rnn=False),
        # TODO: cannot use RNN since value function takes states and actions
        critic=dict(type='auto', size=7, depth=1, rnn=False)
    )
    self.execute(agent=agent, environment=environment)

    with TemporaryDirectory() as directory:
        agent.save(directory=directory, format='numpy')
        agent = Agent.load(directory=directory)
        states = environment.reset()
        agent.act(states=states)
        agent.close()
        environment.close()
def test_vpg(self):
    self.start_tests(name='VPG')
    agent, environment = self.prepare(
        agent='vpg', batch_size=2,
        network=dict(type='auto', size=8, depth=1, rnn=2),
        baseline=dict(type='auto', size=7, depth=1, rnn=1),
        baseline_optimizer=dict(optimizer='adam', learning_rate=1e-3)
    )
    self.execute(agent=agent, environment=environment)

    with TemporaryDirectory() as directory:
        agent.save(directory=directory, format='numpy')
        agent = Agent.load(directory=directory)
        states = environment.reset()
        agent.act(states=states)
        agent.close()
        environment.close()
def main():
    # OpenAI-Gym environment initialization
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')

    # PPO agent initialization
    agent = Agent.create(
        agent='benchmarks/configs/ppo.json', environment=environment,
        # Option 1: Saver - save agent periodically, every 10 updates,
        # and keep the 5 most recent checkpoints
        saver=dict(directory='model-checkpoint', frequency=10, max_checkpoints=5),
    )

    # Runner initialization
    runner = Runner(agent=agent, environment=environment)

    # Training
    runner.run(num_episodes=100)
    runner.close()

    # Option 2: Explicit save
    # (formats 'numpy' and 'hdf5' store only weights, 'checkpoint' stores the full
    # TensorFlow model; the saver argument specified above uses 'checkpoint')
    agent.save(directory='model-numpy', format='numpy', append='episodes')

    # Close agent separately, since created separately
    agent.close()

    # Load agent TensorFlow checkpoint
    agent = Agent.load(directory='model-checkpoint', format='checkpoint', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Load agent NumPy weights
    agent = Agent.load(directory='model-numpy', format='numpy', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Close environment separately, since created separately
    environment.close()
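# Editor's sketch (not part of the original example): the Runner-based evaluation above can
# also be written as an explicit act/execute loop, following the independent/deterministic
# act pattern used by other snippets in this collection. Assumes the 'model-numpy' directory
# from main() already exists; evaluate_manually is a hypothetical helper name.
def evaluate_manually(num_episodes=10):
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')
    agent = Agent.load(directory='model-numpy', format='numpy', environment=environment)
    for _ in range(num_episodes):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        episode_reward = 0.0
        while not terminal:
            # independent=True: act without a subsequent observe; deterministic=True: greedy
            actions, internals = agent.act(
                states=states, internals=internals, independent=True, deterministic=True
            )
            states, terminal, reward = environment.execute(actions=actions)
            episode_reward += reward
        print(episode_reward)
    agent.close()
    environment.close()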
def create_agent(self, env, n_episodes, save_frequency, load=False):
    ########### WORK NEEDED ###########
    ### You need to tweak the Agent ###
    ###################################
    """
    Agent definition. Tweak the Agent's parameters to your convenience.

    Use any agent from tensorforce and refer to the documentation for the
    available hyperparameters:
    - Vanilla Policy Gradient: https://tensorforce.readthedocs.io/en/latest/agents/vpg.html
    - Proximal Policy Optimization: https://tensorforce.readthedocs.io/en/latest/agents/ppo.html
    - Trust-Region Policy Optimization: https://tensorforce.readthedocs.io/en/latest/agents/trpo.html
    - Deterministic Policy Gradient: https://tensorforce.readthedocs.io/en/latest/agents/dpg.html
    - Deep Q-Network: https://tensorforce.readthedocs.io/en/latest/agents/dqn.html
    - Double DQN: https://tensorforce.readthedocs.io/en/latest/agents/double_dqn.html
    - Dueling DQN: https://tensorforce.readthedocs.io/en/latest/agents/dueling_dqn.html
    - Actor-Critic: https://tensorforce.readthedocs.io/en/latest/agents/ac.html
    - Advantage Actor-Critic: https://tensorforce.readthedocs.io/en/latest/agents/a2c.html

    For the network parameters:
    https://tensorforce.readthedocs.io/en/latest/modules/networks.html
    """
    ##### Agent definition ########
    if not load:
        agent = Agent.create(
            agent="ppo",
            batch_size=10,
            exploration=0.01,
            learning_rate=0.00001,
            likelihood_ratio_clipping=0.1,
            # etc...
            saver=dict(
                directory="data/checkpoints",
                frequency=10,  # save checkpoint every 10 updates
            ),  # don't change this
            environment=env,
        )
    else:
        agent = Agent.load(directory="data/checkpoints")
    return agent
def initialize_agent(self):
    # Set up information about the boost pads now that the game is active and the info is available
    self.boost_pad_tracker.initialize_boosts(self.get_field_info())

    if MODEL is not None:
        max_time = 10
        frames_per_sec = 20
        max_timesteps = RLEnvironment.get_max_timesteps(max_time, frames_per_sec)
        self.env = Environment.create(
            environment=KickoffEnvironment,
            max_episode_timesteps=max_timesteps,
            max_time=max_time,
            message_throttle=20,
            frames_per_sec=frames_per_sec,
            input_exclude=[
                InputOptions.BALL_POSITION_REL,
                InputOptions.BALL_DIRECTION,
                InputOptions.CAR_POSITION_REL,
                InputOptions.CAR_VELOCITY_MAG,
            ],
            output_exclude=[
                OutputOptions.BOOST,
                OutputOptions.STEER,
                OutputOptions.E_BRAKE,
                OutputOptions.THROTTLE,
                OutputOptions.ROLL,
            ]
        )

        directory = '../learning/training/{0}'.format(MODEL)
        filename = 'agent'
        agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json')
        if not os.path.isfile(agent):
            logging_utils.log_warn(os.getcwd(), {})
            raise Exception("Model file doesn't exist")

        self.agent = Agent.load(
            directory=os.path.abspath(directory),
            environment=self.env,
            format='checkpoint',
        )
        self.env.reset()
def main():
    agent_type = 'dqn'
    agent_dir = f'data/{agent_type}'
    agent_name = 'counting'
    model_format = 'tensorflow'
    num_episodes = 1000000
    debug = False

    environment = TFBlackjackEnvironment(
        CountDeck(), SimpleDealer(),
        Player(PassPlayerHandAgent(), ConstantBettingAgent()),
        debug=debug
    )
    agent = Agent.load(
        directory=agent_dir,
        name=agent_name,
        format=model_format,
        environment=environment,
    )

    for _ in range(num_episodes):
        if debug:
            print()
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states, evaluation=True)
            if debug:
                print(f"ACTION TAKEN: {actions}")
            states, terminal, _ = environment.execute(actions=actions)

    environment.get_stats()
    environment.print_stats()

    agent.close()
    environment.close()
def test_dueling_dqn(self):
    self.start_tests(name='DuelingDQN')
    agent, environment = self.prepare(
        actions=dict(type='int', shape=(2,), num_values=4),
        agent='dueling_dqn', memory=100, batch_size=4,
        network=dict(type='auto', size=8, depth=1, rnn=2)
    )
    self.execute(agent=agent, environment=environment)

    with TemporaryDirectory() as directory:
        agent.save(directory=directory, format='numpy')
        agent = Agent.load(directory=directory)
        states = environment.reset()
        agent.act(states=states)
        agent.close()
        environment.close()
def test_ac(self):
    self.start_tests(name='AC')
    # TODO: baseline horizon has to be equal to policy horizon
    agent, environment = self.prepare(
        agent='ac', batch_size=4,
        network=dict(type='auto', size=8, depth=1, rnn=2),
        critic=dict(type='auto', size=7, depth=1, rnn=2)
    )
    self.execute(agent=agent, environment=environment)

    with TemporaryDirectory() as directory:
        agent.save(directory=directory, format='numpy')
        agent = Agent.load(directory=directory)
        states = environment.reset()
        agent.act(states=states)
        agent.close()
        environment.close()
def test_tensorforce(self):
    self.start_tests(name='Tensorforce')

    # Explicit, singleton state/action
    self.unittest(
        states=dict(type='float', shape=(), min_value=1.0, max_value=2.0),
        actions=dict(type='int', shape=(), num_values=4),
        agent='tensorforce', **UnittestBase.agent
    )

    # Implicit
    agent, environment = self.prepare(**UnittestBase.agent)
    self.execute(agent=agent, environment=environment)

    with TemporaryDirectory() as directory:
        agent.save(directory=directory, format='numpy')
        agent = Agent.load(directory=directory)
        states = environment.reset()
        agent.act(states=states)
        agent.close()
        environment.close()
def test_config(self):
    # FEATURES.MD
    self.start_tests(name='config')

    with TemporaryDirectory() as directory:
        # save: before first timestep
        update = dict(unit='episodes', batch_size=1)
        saver = dict(directory=directory, frequency=1)
        agent, environment = self.prepare(
            update=update, saver=saver,
            config=dict(eager_mode=False, create_debug_assertions=True, tf_log_level=20)
        )
        weights0 = agent.model.policy.network.layers[1].weights.numpy()
        states = environment.reset()
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        updated = agent.observe(terminal=terminal, reward=reward)
        agent.close()
        self.finished_test()

        # load: from given directory
        agent = Agent.load(directory=directory, environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights0))
        self.assertEqual(agent.timesteps, 0)
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            updated = agent.observe(terminal=terminal, reward=reward)
        self.assertTrue(updated)
        weights1 = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(not np.allclose(weights1, weights0))
        timesteps = agent.timesteps
        agent.close()
        self.finished_test()

        # load: from given directory
        agent = Agent.load(directory=directory, environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights1))
        self.assertEqual(agent.timesteps, timesteps)
        agent.close()
        environment.close()
        self.finished_test()

        # create, not load
        agent, environment = self.prepare(
            update=update, saver=saver,
            config=dict(eager_mode=False, create_debug_assertions=True, tf_log_level=20)
        )
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(not np.allclose(x, weights0))
        self.assertTrue(not np.allclose(x, weights1))
        self.assertEqual(agent.timesteps, 0)
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            updated = agent.observe(terminal=terminal, reward=reward)
        self.assertTrue(updated)
        weights2 = agent.model.policy.network.layers[1].weights.numpy()
        agent.close()
        self.finished_test()

        # load: from given directory
        agent = Agent.load(directory=directory, environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights2))
        agent.close()
        environment.close()
        self.finished_test()

        files = set(os.listdir(path=directory))
        self.assertTrue(files == {
            'agent.json', 'agent-0.data-00000-of-00001', 'agent-0.index',
            'agent-1.data-00000-of-00001', 'agent-1.index', 'checkpoint'
        })

    self.finished_test()
def main():
    agent_type = 'dqn'
    agent_dir = f'data/{agent_type}'
    agent_name = 'counting'
    model_format = 'tensorflow'
    tensorboard_dir = f'data/summaries/{agent_type}'
    tensorboard_labels = ['graph', 'entropy', 'kl-divergence', 'losses', 'rewards']
    tensorboard_freq = 20
    batch_size = 20
    memory = 10000
    num_episodes = 50000
    learning_rate = 3e-4
    exploration = 0.0
    summarizer = dict(
        directory=tensorboard_dir,
        labels=tensorboard_labels,
        frequency=tensorboard_freq,
    )
    should_load = True
    debug = False

    environment = TFBlackjackEnvironment(
        CountDeck(), SimpleDealer(),
        Player(PassPlayerHandAgent(), ConstantBettingAgent()),
        debug=debug
    )

    if should_load:
        agent = Agent.load(
            name=agent_name,
            directory=agent_dir,
            format=model_format,
            batch_size=batch_size,
            environment=environment,
            exploration=exploration,
            summarizer=summarizer,
            memory=memory,
            learning_rate=learning_rate,
        )
        print("Loading existing agent for training")
    else:
        agent = Agent.create(
            name=agent_name,
            agent=agent_type,
            environment=environment,
            batch_size=batch_size,
            exploration=exploration,
            summarizer=summarizer,
            memory=memory,
            learning_rate=learning_rate,
        )
        print("Creating new agent")

    # Train the agent on the number of episodes specified
    for _ in range(num_episodes):
        if debug:
            print()
        states = environment.reset()
        terminal = False
        while not terminal:
            # Episode timestep
            actions = agent.act(states=states)
            if debug:
                print(f"ACTION TAKEN: {actions}")
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Save the agent once training is done; appends the number of episodes
    # trained on to the agent name
    agent.save(directory=agent_dir, append='episodes')
    agent.close()
    environment.close()
def __init__(self, agent, environment=None, max_episode_timesteps=None, num_parallel=None,
             environments=None, evaluation=False, remote=None, blocking=False, host=None,
             port=None):
    if environment is None and environments is None:
        if remote != 'socket-client':
            raise TensorforceError.required(
                name='Runner', argument='environment or environments')
        if num_parallel is None:
            raise TensorforceError.required(
                name='Runner', argument='num_parallel', condition='socket-client remote mode')
        environments = [None for _ in range(num_parallel)]

    elif environment is None:
        if environments is None:
            raise TensorforceError.required(
                name='Runner', argument='environment or environments')
        if not util.is_iterable(x=environments):
            raise TensorforceError.type(
                name='Runner', argument='environments', value=environments)
        if len(environments) == 0:
            raise TensorforceError.value(
                name='Runner', argument='len(environments)', value=len(environments))
        if num_parallel is not None and num_parallel != len(environments):
            raise TensorforceError.value(
                name='Runner', argument='num_parallel', value=num_parallel,
                hint='!= len(environments)')
        num_parallel = len(environments)
        environments = list(environments)

    elif num_parallel is None:
        if environments is not None:
            raise TensorforceError.invalid(
                name='Runner', argument='environments', condition='environment is specified')
        if evaluation:
            raise TensorforceError.invalid(
                name='Runner', argument='evaluation', condition='single environment')
        num_parallel = 1
        environments = [environment]

    else:
        if not isinstance(num_parallel, int):
            raise TensorforceError.value(
                name='Runner', argument='num_parallel', dtype=type(num_parallel))
        elif num_parallel < 2:
            raise TensorforceError.value(
                name='Runner', argument='num_parallel', value=num_parallel, hint='< 2')
        if environments is not None:
            raise TensorforceError.invalid(
                name='Runner', argument='environments', condition='environment is specified')
        if isinstance(environment, Environment):
            raise TensorforceError.value(
                name='Runner', argument='environment', value=environment,
                condition='num_parallel',
                hint='is Environment instance, but specification dict is required')
        environments = [environment for _ in range(num_parallel)]

    if port is None or isinstance(port, int):
        if isinstance(host, str):
            port = [port + n for n in range(num_parallel)]
        else:
            port = [port for _ in range(num_parallel)]
    else:
        if len(port) != num_parallel:
            raise TensorforceError.value(
                name='Runner', argument='len(port)', value=len(port), hint='!= num_parallel')
    if host is None or isinstance(host, str):
        host = [host for _ in range(num_parallel)]
    else:
        if len(host) != num_parallel:
            raise TensorforceError.value(
                name='Runner', argument='len(host)', value=len(host), hint='!= num_parallel')

    self.environments = list()
    self.is_environment_external = isinstance(environments[0], Environment)
    environment = Environment.create(
        environment=environments[0], max_episode_timesteps=max_episode_timesteps,
        remote=remote, blocking=blocking, host=host[0], port=port[0])
    self.is_environment_remote = isinstance(environment, RemoteEnvironment)
    states = environment.states()
    actions = environment.actions()
    self.environments.append(environment)

    if remote is None and num_parallel > 1 and environment.is_vectorizable():
        self.num_vectorized = num_parallel
        environments = environments[:1]
        if evaluation:
            raise TensorforceError.invalid(
                name='Runner', argument='evaluation', condition='vectorized environment')
    elif environment.num_actors() > 1:
        assert num_parallel == 1
        num_parallel = environment.num_actors()
        self.num_vectorized = environment.num_actors()
    else:
        self.num_vectorized = None

    for n, environment in enumerate(environments[1:], start=1):
        assert isinstance(environment, Environment) == self.is_environment_external
        environment = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps,
            remote=remote, blocking=blocking, host=host[n], port=port[n])
        assert isinstance(environment, RemoteEnvironment) == self.is_environment_remote
        assert util.is_equal(x=environment.states(), y=states)
        assert util.is_equal(x=environment.actions(), y=actions)
        self.environments.append(environment)

    self.evaluation = evaluation

    self.is_agent_external = isinstance(agent, Agent)
    if not self.is_agent_external and 'directory' in agent:
        self.agent = Agent.load(
            **agent, environment=environment,
            parallel_interactions=(num_parallel - int(self.evaluation)))
    elif num_parallel - int(self.evaluation) > 1:
        self.agent = Agent.create(
            agent=agent, environment=environment,
            parallel_interactions=(num_parallel - int(self.evaluation)))
    else:
        self.agent = Agent.create(agent=agent, environment=environment)
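# Editor's sketch (hypothetical agent and environment specifications, not from the original
# file): minimal usage of the constructor above. With num_parallel, the environment must be
# passed as a specification rather than an Environment instance, as the validation above
# enforces; whether the parallel copies are vectorized or stepped individually depends on
# environment.is_vectorizable(), also checked above.
from tensorforce import Runner

runner = Runner(
    agent=dict(agent='ppo', batch_size=10),
    environment=dict(environment='gym', level='CartPole-v1'),
    max_episode_timesteps=500,
    num_parallel=4,
)
runner.run(num_episodes=100)
runner.close()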
# Test the agent with RandomAgent opponents
test_agents = []
for agent_id in range(3):
    test_agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

# Add TensorforceAgent
agent_id += 1
test_agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
env.set_agents(test_agents)

test_agent = Agent.load(directory="C:\\Users\\ali_k\\Desktop\\my_model", format='checkpoint')
wrapped_env = WrappedEnv(env, env.observation_space, env.action_space, True, 3000)
test_runner = Runner(agent=test_agent, environment=wrapped_env, max_episode_timesteps=2000)

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

test_runner.run(num_episodes=100, evaluation=True, evaluation_callback=tensorboard_callback)
def test_explicit_extended(self):
    self.start_tests(name='explicit extended')

    # filename
    agent, environment = self.prepare()
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.save(directory=self.__class__.directory, filename='test')
    agent.close()

    agent = Agent.load(directory=self.__class__.directory, filename='test', environment=environment)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'test.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-1.meta'))
    os.rmdir(path=self.__class__.directory)
    self.finished_test()

    # no timestep
    agent, environment = self.prepare()
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.save(directory=self.__class__.directory, append_timestep=False)
    agent.close()

    agent = Agent.load(directory=self.__class__.directory, environment=environment)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent.meta'))
    os.rmdir(path=self.__class__.directory)
    self.finished_test()
def test_explicit(self):
    self.start_tests(name='explicit')

    # default
    agent, environment = self.prepare()
    states = environment.reset()
    agent.save(directory=self.__class__.directory)
    agent.close()

    agent = Agent.load(directory=self.__class__.directory, environment=environment)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.rmdir(path=self.__class__.directory)
    self.finished_test()

    # single then parallel and different episode length
    agent, environment = self.prepare(memory=50, update=dict(unit='episodes', batch_size=1))
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.save(directory=self.__class__.directory)
    agent.close()
    environment.close()

    agent, environment = self.prepare(
        update=dict(unit='episodes', batch_size=1),
        max_episode_timesteps=7, parallel_interactions=2
    )
    agent.restore(directory=self.__class__.directory)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    os.rmdir(path=self.__class__.directory)
    self.finished_test()
from tensorforce import Agent, Environment
import matplotlib.pyplot as plt
import numpy as np
import math
import pickle
from tqdm import tqdm
import gym
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

environment = Environment.create(environment='gym', level='InvertedDoublePendulum-v2')

# polynomial regression
coach = Agent.load(directory='Double_Model', format='numpy')
internals = coach.initial_internals()
actions_record = []
theta_states = []
theta1_record = []
theta2_record = []
theta1_integral_record = []
theta2_integral_record = []

for k in range(20):
    states = environment.reset()
    terminal = False
    theta1_integral = 0
    theta2_integral = 0
def test_explicit(self):
    # FEATURES.MD
    self.start_tests(name='explicit')

    with TemporaryDirectory() as directory:
        policy = dict(network=dict(type='auto', size=8, depth=1, rnn=False))
        update = dict(unit='episodes', batch_size=1)  # TODO: no
        agent, environment = self.prepare(
            policy=policy, memory=50, update=update,
            config=dict(eager_mode=False, create_debug_assertions=True)
        )
        states = environment.reset()

        # save: default checkpoint format
        weights0 = agent.model.policy.network.layers[1].weights.numpy()
        agent.save(directory=directory)
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        self.assertEqual(agent.timesteps, 1)
        agent.close()
        self.finished_test()

        # load: only directory
        agent = Agent.load(directory=directory, environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue((x == weights0).all())
        self.assertEqual(agent.timesteps, 0)
        self.finished_test()

        # one timestep
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

        # save: numpy format, append timesteps
        agent.save(directory=directory, format='numpy', append='timesteps')
        agent.close()
        self.finished_test()

        # load: numpy format and directory
        agent = Agent.load(directory=directory, format='numpy', environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue((x == weights0).all())
        self.assertEqual(agent.timesteps, 1)
        self.finished_test()

        # one timestep
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

        # save: numpy format, append timesteps
        agent.save(directory=directory, format='numpy', append='timesteps')
        agent.close()
        self.finished_test()

        # load: numpy format and directory
        agent = Agent.load(directory=directory, format='numpy', environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue((x == weights0).all())
        self.assertEqual(agent.timesteps, 2)
        self.finished_test()

        # one episode
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        # save: hdf5 format, filename, append episodes
        weights1 = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue((weights1 != weights0).any())
        self.assertEqual(agent.episodes, 1)
        agent.save(directory=directory, filename='agent2', format='hdf5', append='episodes')
        agent.close()
        self.finished_test()

        # env close
        environment.close()

        # differing agent config: update, parallel_interactions
        # TODO: episode length, others?
        environment = Environment.create(environment=self.environment_spec())

        # load: filename (hdf5 format implicit)
        update['batch_size'] = 2
        agent = Agent.load(
            directory=directory, filename='agent2', environment=environment,
            policy=policy, update=update, parallel_interactions=2
        )
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue((x == weights1).all())
        self.assertEqual(agent.episodes, 1)
        agent.close()
        self.finished_test()

        # load: tensorflow format (filename explicit)
        # TODO: parallel_interactions=2 should be possible, but problematic if all variables
        # are saved in checkpoint format
        agent = Agent.load(
            directory=directory, format='checkpoint', environment=environment,
            policy=policy, update=update, parallel_interactions=1
        )
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue((x == weights0).all())
        self.assertEqual(agent.timesteps, 0)
        self.assertEqual(agent.episodes, 0)
        agent.close()
        self.finished_test()

        # load: numpy format, full filename including timesteps suffix
        agent = Agent.load(
            directory=directory, filename='agent-1', format='numpy',
            environment=environment, policy=policy, update=update, parallel_interactions=2
        )
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue((x == weights0).all())
        self.assertEqual(agent.timesteps, 1)
        self.assertEqual(agent.episodes, 0)
        self.finished_test()

        # three episodes (due to batch_size change, mismatch with loaded internal last_update)
        for _ in range(3):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)
        self.assertEqual(agent.updates, 1)

        # save: saved-model format, append updates
        agent.save(directory=directory, format='saved-model', append='updates')
        agent.close()

        # load: saved-model format
        import tensorflow as tf
        agent = tf.saved_model.load(export_dir=os.path.join(directory, 'agent-1'))
        act = next(iter(agent._independent_act_graphs.values()))

        # one episode
        states = environment.reset()
        terminal = False
        while not terminal:
            # Turn dicts into lists and batch inputs
            auxiliaries = [[np.expand_dims(states.pop('int_action_mask'), axis=0)]]
            states = [np.expand_dims(state, axis=0) for state in states.values()]
            actions = act(states, auxiliaries)
            # Split result dict and unbatch values
            actions = {
                name: value.numpy().item() if value.shape == (1,) else value.numpy()[0]
                for name, value in actions.items()
            }
            states, terminal, _ = environment.execute(actions=actions)
        # agent.close()
        environment.close()

        files = set(os.listdir(path=directory))
        self.assertTrue(files == {
            'agent.json', 'agent-1', 'agent-1.data-00000-of-00001', 'agent-1.index',
            'agent-1.npz', 'agent2.json', 'agent-2.npz', 'agent2-1.hdf5', 'checkpoint'
        })
        files = set(os.listdir(path=os.path.join(directory, 'agent-1')))
        self.assertTrue(files == {'assets', 'saved_model.pb', 'variables'})
        files = set(os.listdir(path=os.path.join(directory, 'agent-1', 'variables')))
        self.assertTrue(files == {'variables.data-00000-of-00001', 'variables.index'})

    self.finished_test()
def test_save_load_agent(self):
    self.start_tests(name='save-load-agent')

    with TemporaryDirectory() as checkpoint_directory, TemporaryDirectory() as numpy_directory:
        # ====================

        # OpenAI-Gym environment initialization
        environment = Environment.create(environment='benchmarks/configs/cartpole.json')

        # PPO agent initialization
        agent = Agent.create(
            agent='benchmarks/configs/ppo.json', environment=environment,
            # Option 1: Saver - save agent periodically (here after every update)
            # and keep the 5 most recent checkpoints
            saver=dict(directory=checkpoint_directory, frequency=1, max_checkpoints=5),
        )

        # Runner initialization
        runner = Runner(agent=agent, environment=environment)

        # Training
        runner.run(num_episodes=10)
        runner.close()

        # Option 2: Explicit save
        # (formats 'numpy' and 'hdf5' store only weights, 'checkpoint' stores the full
        # TensorFlow model; the saver argument specified above uses 'checkpoint')
        agent.save(directory=numpy_directory, format='numpy', append='episodes')

        # Close agent separately, since created separately
        agent.close()

        # Load agent TensorFlow checkpoint
        agent = Agent.load(
            directory=checkpoint_directory, format='checkpoint', environment=environment
        )
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, evaluation=True)
        runner.close()
        agent.close()

        # Load agent NumPy weights
        agent = Agent.load(directory=numpy_directory, format='numpy', environment=environment)
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, evaluation=True)
        runner.close()
        agent.close()

        # Close environment separately, since created separately
        environment.close()

        # ====================

    self.finished_test()
from tensorforce import Agent, Environment
import matplotlib.pyplot as plt
import numpy as np
import math
import pickle
from tqdm import tqdm
import gym
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

environment = Environment.create(environment='gym', level='Hopper-v3')

# polynomial regression
coach = Agent.load(directory='Hopper_RL', format='numpy')
internals = coach.initial_internals()
actions_record = []
theta_states = []

for k in range(1):
    states = environment.reset()
    terminal = False
    while not terminal:
        print('y_position %s y_velocity %s' % (states[1], states[7]))
        theta_states.append(states)
        actions, internals = coach.act(
            states=states, internals=internals, independent=True, deterministic=True
        )
        states, terminal, reward = environment.execute(actions=actions)
        actions_record.append(actions)
# Imports assumed by this snippet (not shown in the original excerpt)
import random

import gym
from tensorforce import Agent, Environment, Runner

LEVEL = 'PandaReach-v1'
EPISODES = 1
EPISODE_MAX_LENGTH = 500
MODEL_DICT = f'{LEVEL}/model'
VISUALIZE_DICT = f'{LEVEL}/visualize/{random.randint(0, 1000)}'

gym_environment = gym.make(LEVEL)
environment = Environment.create(
    environment=gym_environment,
    max_episode_timesteps=EPISODE_MAX_LENGTH,
    visualize=True,
    visualize_directory=VISUALIZE_DICT,
)
agent = Agent.load(directory=MODEL_DICT, environment=environment)

runner = Runner(agent=agent, environment=environment, max_episode_timesteps=EPISODE_MAX_LENGTH)
runner.run(num_episodes=EPISODES, evaluation=True)
runner.close()

# sum_rewards = 0.0
# for _ in range(EPISODES):
#     states = environment.reset()
#     internals = agent.initial_internals()
#     terminal = False
#     while not terminal:
#         actions, internals = agent.act(
#             states=states, internals=internals, independent=True, deterministic=True
#         )
#         states, terminal, reward = environment.execute(actions=actions)
def test_explicit(self):
    # FEATURES.MD
    self.start_tests(name='explicit')

    with TemporaryDirectory() as directory:
        update = dict(unit='episodes', batch_size=1)
        agent, environment = self.prepare(
            memory=50, update=update,
            config=dict(eager_mode=False, create_debug_assertions=True, tf_log_level=20)
        )
        states = environment.reset()

        # save: default checkpoint format
        weights0 = agent.model.policy.network.layers[1].weights.numpy()
        agent.save(directory=directory)
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        self.assertEqual(agent.timesteps, 1)
        agent.close()
        self.finished_test()

        # load: only directory
        agent = Agent.load(directory=directory, environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights0))
        self.assertEqual(agent.timesteps, 0)
        self.finished_test()

        # one timestep
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)

        # save: numpy format, append timesteps
        agent.save(directory=directory, format='numpy', append='timesteps')
        agent.close()
        self.finished_test()

        # load: numpy format and directory
        agent = Agent.load(directory=directory, format='numpy', environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights0))
        self.assertEqual(agent.timesteps, 1)
        self.finished_test()

        # one timestep
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

        # save: numpy format, append timesteps
        agent.save(directory=directory, format='numpy', append='timesteps')
        agent.close()
        self.finished_test()

        # load: numpy format and directory
        agent = Agent.load(directory=directory, format='numpy', environment=environment)
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights0))
        self.assertEqual(agent.timesteps, 2)
        self.finished_test()

        # one episode
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        # save: hdf5 format, filename, append episodes
        weights1 = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(not np.allclose(weights1, weights0))
        self.assertEqual(agent.episodes, 1)
        agent.save(directory=directory, filename='agent2', format='hdf5', append='episodes')
        agent.close()
        self.finished_test()

        # env close
        environment.close()

        # differing agent config: update, parallel_interactions
        # TODO: episode length, others?
        environment = Environment.create(environment=self.environment_spec())

        # load: filename (hdf5 format implicit)
        update['batch_size'] = 2
        agent = Agent.load(
            directory=directory, filename='agent2', environment=environment,
            update=update, parallel_interactions=2
        )
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights1))
        self.assertEqual(agent.episodes, 1)
        agent.close()
        self.finished_test()

        # load: tensorflow format (filename explicit)
        # TODO: parallel_interactions=2 should be possible, but problematic if all variables
        # are saved in checkpoint format
        agent = Agent.load(
            directory=directory, format='checkpoint', environment=environment,
            update=update, parallel_interactions=1
        )
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights0))
        self.assertEqual(agent.timesteps, 0)
        self.assertEqual(agent.episodes, 0)
        agent.close()
        self.finished_test()

        # load: numpy format, full filename including timesteps suffix
        agent = Agent.load(
            directory=directory, filename='agent-1', format='numpy',
            environment=environment, update=update, parallel_interactions=2
        )
        x = agent.model.policy.network.layers[1].weights.numpy()
        self.assertTrue(np.allclose(x, weights0))
        self.assertEqual(agent.timesteps, 1)
        self.assertEqual(agent.episodes, 0)
        self.finished_test()

        # three episodes (due to batch_size change, mismatch with loaded internal last_update)
        for _ in range(3):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)
        self.assertEqual(agent.updates, 1)

        # save: saved-model format, append updates
        agent.save(directory=directory, format='saved-model', append='updates')
        agent.close()

        # saved-model functions
        def batch(x):
            return np.expand_dims(x, axis=0)

        def unbatch(x):
            if isinstance(x, tf.Tensor):
                x = x.numpy()
            if x.shape == (1,):
                return x.item()
            else:
                return np.squeeze(x, axis=0)

        def recursive_map(function, dictionary):
            mapped = dict()
            for key, value in dictionary.items():
                if isinstance(value, dict):
                    mapped[key] = recursive_map(function, value)
                else:
                    mapped[key] = function(value)
            return mapped

        # load: saved-model format
        agent = tf.saved_model.load(export_dir=os.path.join(directory, 'agent-1'))

        # one episode
        states = environment.reset()
        internals = agent.initial_internals()
        internals = recursive_map(batch, internals)
        terminal = False
        while not terminal:
            auxiliaries = dict(int_action=dict(mask=batch(states.pop('int_action_mask'))))
            states = recursive_map(batch, states)
            actions_internals = agent.act(states, internals, auxiliaries, False)
            actions = actions_internals['actions']
            internals = actions_internals['internals']
            actions = recursive_map(unbatch, actions)
            states, terminal, _ = environment.execute(actions=actions)
        environment.close()

        # saved-model format with singleton state/action, no internals, no masking
        policy = dict(network=dict(type='auto', size=8, depth=1, rnn=False))
        update = dict(unit='episodes', batch_size=1)
        baseline = dict(network=dict(type='auto', size=7, depth=1, rnn=False))
        agent, environment = self.prepare(
            states=dict(type='float', shape=(), min_value=1.0, max_value=2.0),
            actions=dict(type='float', shape=(), min_value=1.0, max_value=2.0),
            policy=policy, update=update, baseline=baseline,
            config=dict(eager_mode=False, create_debug_assertions=True, tf_log_level=20)
        )

        # one episode
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
        self.assertEqual(agent.updates, 1)

        # save: saved-model format, append updates
        agent.save(directory=directory, format='saved-model', append='updates')
        agent.close()

        # load: saved-model format
        agent = tf.saved_model.load(export_dir=os.path.join(directory, 'agent-1'))

        # one episode
        states = environment.reset()
        terminal = False
        while not terminal:
            states = batch(states)
            actions = agent.act(states, True)
            actions = unbatch(actions)
            states, terminal, _ = environment.execute(actions=actions)
        environment.close()

        files = set(os.listdir(path=directory))
        self.assertTrue(files == {
            'agent.json', 'agent-1', 'agent-1.data-00000-of-00001', 'agent-1.index',
            'agent-1.npz', 'agent2.json', 'agent-2.npz', 'agent2-1.hdf5', 'checkpoint'
        })
        files = set(os.listdir(path=os.path.join(directory, 'agent-1')))
        self.assertTrue(files == {'assets', 'saved_model.pb', 'variables'})
        files = set(os.listdir(path=os.path.join(directory, 'agent-1', 'variables')))
        self.assertTrue(files == {'variables.data-00000-of-00001', 'variables.index'})

    self.finished_test()
def test_config_extended(self):
    self.start_tests(name='config extended')

    # filename
    saver = dict(directory=self.__class__.directory, filename='test')
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()

    agent = Agent.load(directory=self.__class__.directory, filename='test', environment=environment)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'test.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-1.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-2.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-2.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'test-2.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.')
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()

    # frequency
    saver = dict(directory=self.__class__.directory, frequency=1)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    time.sleep(1)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    time.sleep(1)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-2.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-2.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-2.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.'), filename
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()

    # load filename
    saver = dict(directory=self.__class__.directory)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    saver = dict(directory=self.__class__.directory, load='agent-0')
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.')
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()
def create_agent(self, env, n_episodes, save_frequency, load=False):
    ########### WORK NEEDED ###########
    ### You need to tweak the Agent ###
    ###################################
    """
    Agent definition. Tweak the Agent's parameters to your convenience.

    Use any agent from tensorforce and refer to the documentation for the
    available hyperparameters:
    - Vanilla Policy Gradient: https://tensorforce.readthedocs.io/en/latest/agents/vpg.html
    - Proximal Policy Optimization: https://tensorforce.readthedocs.io/en/latest/agents/ppo.html
    - Trust-Region Policy Optimization: https://tensorforce.readthedocs.io/en/latest/agents/trpo.html
    - Deterministic Policy Gradient: https://tensorforce.readthedocs.io/en/latest/agents/dpg.html
    - Deep Q-Network: https://tensorforce.readthedocs.io/en/latest/agents/dqn.html
    - Double DQN: https://tensorforce.readthedocs.io/en/latest/agents/double_dqn.html
    - Dueling DQN: https://tensorforce.readthedocs.io/en/latest/agents/dueling_dqn.html
    - Actor-Critic: https://tensorforce.readthedocs.io/en/latest/agents/ac.html
    - Advantage Actor-Critic: https://tensorforce.readthedocs.io/en/latest/agents/a2c.html

    For the network parameters:
    https://tensorforce.readthedocs.io/en/latest/modules/networks.html
    """
    ##### Agent definition ########
    if not load:
        print("INIT AGENT.")
        agent = Agent.create(
            agent="ppo",
            states={
                'type': 'float',
                'shape': (10,),
                'min_value': [
                    -1.000e+00, -2.000e+00, -1.000e+00, -1.000e+00, -1.000e+00,
                    -1.280e+00, -3.400e+00, -9.999e+03, -9.999e+03, -9.999e+03
                ],
                'max_value': [
                    1.000e+00, 2.000e+00, 1.000e+00, 1.000e+00, 1.000e+00,
                    1.000e+00, 3.600e+00, 9.999e+03, 9.999e+03, 9.999e+03
                ]
            },
            actions=dict(
                gimbal=dict(type='int', shape=1, num_values=5),
                throttle=dict(type='int', shape=1, num_values=5),
                side_booster=dict(type='int', shape=1, num_values=5),
            ),
            max_episode_timesteps=100000,
            batch_size=8,
            discount=0.99,
            exploration=0.01,
            # entropy_regularization=1e-3,
            # l2_regularization=1e-3,
            learning_rate=5e-4,
            config=dict(name="ppo_agent_V3"),
            saver=dict(
                directory="data/checkpoints",
                frequency=10,  # save checkpoint every 10 updates
            ),  # don't change this
            # environment=env,
        )
    else:
        print("RELOADING AGENT.")
        agent = Agent.load(directory="data/checkpoints", filename="ppo_agent_V3")
    return agent
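# Editor's sketch (hypothetical caller, not from the original file): a minimal training
# loop driving create_agent above, following the act/execute/observe pattern used by the
# other training snippets in this collection. The env API (reset/execute) and the
# surrounding instance are assumptions.
def train(self, env, n_episodes):
    agent = self.create_agent(env, n_episodes, save_frequency=10, load=False)
    for _ in range(n_episodes):
        states = env.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = env.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
    # checkpoints are written by the saver configured in create_agent
    agent.close()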
def test_config(self):
    # FEATURES.MD
    self.start_tests(name='config')

    # default
    saver = dict(directory=self.__class__.directory)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    agent.close()

    agent = Agent.load(directory=self.__class__.directory, environment=environment)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.')
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()

    # single then parallel and different episode length
    saver = dict(directory=self.__class__.directory)
    agent, environment = self.prepare(
        memory=50, update=dict(unit='episodes', batch_size=1), saver=saver
    )
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    agent, environment = self.prepare(
        update=dict(unit='episodes', batch_size=1), saver=saver,
        max_episode_timesteps=7, parallel_interactions=2
    )
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-2.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-2.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-2.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.')
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()

    # no load
    saver = dict(directory=self.__class__.directory)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    saver = dict(directory=self.__class__.directory, load=False)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.')
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()
def test_explicit(self):
    # FEATURES.MD
    self.start_tests(name='explicit')

    # Remove directory if exists
    if os.path.exists(path=self.__class__.directory):
        for filename in os.listdir(path=self.__class__.directory):
            os.remove(path=os.path.join(self.__class__.directory, filename))
        os.rmdir(path=self.__class__.directory)

    agent, environment = self.prepare(memory=50, update=dict(unit='episodes', batch_size=1))
    states = environment.reset()

    # save: default tensorflow format
    weights0 = agent.get_variable(variable='policy/policy-network/dense0/weights')
    agent.save(directory=self.__class__.directory)
    agent.close()
    self.finished_test()

    # load: only directory
    agent = Agent.load(directory=self.__class__.directory, environment=environment)
    x = agent.get_variable(variable='policy/policy-network/dense0/weights')
    self.assertTrue((x == weights0).all())
    self.assertEqual(agent.timesteps, 0)
    self.finished_test()

    # one timestep
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)

    # save: numpy format, append timesteps
    weights1 = agent.get_variable(variable='policy/policy-network/dense0/weights')
    agent.save(directory=self.__class__.directory, format='numpy', append='timesteps')
    agent.close()
    self.finished_test()

    # load: numpy format and directory
    agent = Agent.load(
        directory=self.__class__.directory, format='numpy', environment=environment
    )
    x = agent.get_variable(variable='policy/policy-network/dense0/weights')
    self.assertTrue((x == weights1).all())
    self.assertEqual(agent.timesteps, 1)
    self.finished_test()

    # one timestep
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)

    # save: numpy format, append timesteps
    weights2 = agent.get_variable(variable='policy/policy-network/dense0/weights')
    agent.save(directory=self.__class__.directory, format='numpy', append='timesteps')
    agent.close()
    self.finished_test()

    # load: numpy format and directory
    agent = Agent.load(
        directory=self.__class__.directory, format='numpy', environment=environment
    )
    x = agent.get_variable(variable='policy/policy-network/dense0/weights')
    self.assertTrue((x == weights2).all())
    self.assertEqual(agent.timesteps, 2)
    self.finished_test()

    # one episode
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

    # save: hdf5 format, filename, append episodes
    weights3 = agent.get_variable(variable='policy/policy-network/dense0/weights')
    self.assertFalse((weights3 == weights2).all())
    agent.save(
        directory=self.__class__.directory, filename='agent2', format='hdf5', append='episodes'
    )
    agent.close()
    self.finished_test()

    # env close
    environment.close()

    # differing agent config: episode length, update, parallel_interactions
    environment = Environment.create(environment=self.environment_spec(max_episode_timesteps=7))

    # load: filename (hdf5 format implicit)
    agent = Agent.load(
        directory=self.__class__.directory, filename='agent2', environment=environment,
        update=dict(unit='episodes', batch_size=2), parallel_interactions=2
    )
    x = agent.get_variable(variable='policy/policy-network/dense0/weights')
    self.assertTrue((x == weights3).all())
    self.assertEqual(agent.episodes, 1)
    agent.close()
    self.finished_test()

    # load: tensorflow format (filename explicit)
    agent = Agent.load(
        directory=self.__class__.directory, format='tensorflow', environment=environment,
        update=dict(unit='episodes', batch_size=2), parallel_interactions=2
    )
    x = agent.get_variable(variable='policy/policy-network/dense0/weights')
    self.assertTrue((x == weights0).all())
    self.assertEqual(agent.timesteps, 0)
    self.assertEqual(agent.episodes, 0)
    agent.close()
    self.finished_test()

    # load: numpy format, full filename including timesteps suffix
    agent = Agent.load(
        directory=self.__class__.directory, filename='agent-1', format='numpy',
        environment=environment, update=dict(unit='episodes', batch_size=2),
        parallel_interactions=2
    )
    x = agent.get_variable(variable='policy/policy-network/dense0/weights')
    self.assertTrue((x == weights1).all())
    self.assertEqual(agent.timesteps, 1)
    self.assertEqual(agent.episodes, 0)
    agent.close()
    self.finished_test()

    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.npz'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-2.npz'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent2.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent2-1.hdf5'))
    os.rmdir(path=self.__class__.directory)
    self.finished_test()
import numpy as np
from tqdm import tqdm
from tensorforce import Agent

from Normal import moving_average
from Normal import prohibition_parameter
from Normal import prohibition_position
from Normal import environment
# NOTE: episode_number, measure_length and evaluation_episode_number are used
# below but not defined in this fragment; they presumably come from the same
# experiment configuration.

# training and evaluation with boundary
reward_record_average = np.zeros(
    (len(prohibition_position), len(prohibition_parameter), len(measure_length)))
reward_record = np.zeros(
    (len(prohibition_position), len(prohibition_parameter), episode_number))
evaluation_reward_record = np.zeros(
    (len(prohibition_position), len(prohibition_parameter), evaluation_episode_number))

coach = Agent.load(directory='Walker_RL', format='numpy')
internals = coach.initial_internals()

for k in range(len(prohibition_position)):
    # training
    for i in range(len(prohibition_parameter)):
        record = []
        agent = Agent.create(agent='agent.json', environment=environment)
        print(
            'training agent with boundary position at %s and prohibitive parameter %s'
            % (prohibition_position[k], prohibition_parameter[i]))
        for _ in tqdm(range(episode_number)):
            episode_reward = 0
            states = environment.reset()
            terminal = False
            while not terminal:
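                # NOTE: the original fragment breaks off here. The loop body below
                # is a minimal sketch of a plain Tensorforce act/observe training
                # step, assuming environment follows the Tensorforce execute() API;
                # the boundary/coach intervention this experiment presumably applies
                # (coach is loaded above but unused in the fragment) is not
                # reconstructed.
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                episode_reward += reward
                agent.observe(terminal=terminal, reward=reward)
            record.append(episode_reward)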
def test_config(self):
    # FEATURES.MD
    self.start_tests(name='config')

    # Remove directory if exists
    if os.path.exists(path=self.__class__.directory):
        for filename in os.listdir(path=self.__class__.directory):
            os.remove(path=os.path.join(self.__class__.directory, filename))
        os.rmdir(path=self.__class__.directory)

    # default
    saver = dict(directory=self.__class__.directory)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    agent.close()
    agent = Agent.load(directory=self.__class__.directory, environment=environment)
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.')
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()

    # no load
    saver = dict(directory=self.__class__.directory)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    saver = dict(directory=self.__class__.directory, load=False)
    agent, environment = self.prepare(saver=saver)
    states = environment.reset()
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()

    os.remove(path=os.path.join(self.__class__.directory, 'agent.json'))
    os.remove(path=os.path.join(self.__class__.directory, 'checkpoint'))
    os.remove(path=os.path.join(self.__class__.directory, 'graph.pbtxt'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-0.meta'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.data-00000-of-00001'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.index'))
    os.remove(path=os.path.join(self.__class__.directory, 'agent-1.meta'))
    for filename in os.listdir(path=self.__class__.directory):
        os.remove(path=os.path.join(self.__class__.directory, filename))
        assert filename.startswith('events.out.tfevents.')
        break
    os.rmdir(path=self.__class__.directory)
    self.finished_test()
from tensorforce import Agent, Environment
import matplotlib.pyplot as plt
import numpy as np
import math
import pickle
from tqdm import tqdm
import gym
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from gym.wrappers import Monitor

# environment = Environment.create(environment='gym', level='InvertedPendulum-v2')
environment = gym.make('InvertedPendulum-v2')

RL = Agent.load(directory='model5', format='numpy')
internals = RL.initial_internals()

actions_record = []
theta_states = []

for k in range(1):
    states = environment.reset()
    terminal = False
    integrals = 0
    while not terminal:
        # environment.render()
        # InvertedPendulum-v2 observation: [x, theta, x_dot, theta_dot];
        # record pole angle, its running sum and angular velocity
        integrals += states[1]
        temp = [states[1], integrals, states[3]]
        theta_states.append(temp)
        actions, internals = RL.act(
            states=states, internals=internals, independent=True, deterministic=True)
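        # NOTE: the fragment breaks off after act(). A minimal continuation,
        # assuming the plain gym step API used elsewhere in these scripts,
        # would record the action and advance the environment:
        actions_record.append(actions)
        states, reward, terminal, info = environment.step(actions)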
from tensorforce import Agent, Environment
import matplotlib.pyplot as plt
import numpy as np
import math
import pickle
from tqdm import tqdm
import gym

test_episodes = 10
ip_pid_episode_record = []
ip_rl_episode_record = []

ip_rl = Agent.load(directory='Inverted_Pendulum_RL', format='numpy')
internals = ip_rl.initial_internals()

environment = gym.make('InvertedPendulum-v2')
environment_rl = Environment.create(environment='gym', level='InvertedPendulum-v2')

# PD gains for the baseline controller on pole angle and angular velocity
kp = 25
kd = 2.3

# PID baseline evaluation
for i in range(test_episodes):
    episode_reward = 0
    states = environment.reset()
    terminal = False
    while not terminal:
        actions = kp * states[1] + kd * states[3]
        states, reward, terminal, info = environment.step(actions)
        episode_reward += reward
    ip_pid_episode_record.append(episode_reward)
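# NOTE: the fragment ends after the PID baseline loop. A sketch of the matching
# RL evaluation, assuming the Tensorforce wrapper environment_rl set up above
# and the act() call pattern used in the other evaluation scripts:
for i in range(test_episodes):
    episode_reward = 0
    states = environment_rl.reset()
    terminal = False
    while not terminal:
        actions, internals = ip_rl.act(
            states=states, internals=internals, independent=True, deterministic=True)
        states, terminal, reward = environment_rl.execute(actions=actions)
        episode_reward += reward
    ip_rl_episode_record.append(episode_reward)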