def test_reinforceio_homepage(self):
    """
    Code example from the homepage and README.md.
    """
    from tensorforce import Configuration
    from tensorforce.agents import TRPOAgent

    config = Configuration(batch_size=100)

    # Create a Trust Region Policy Optimization agent
    agent = TRPOAgent(
        states_spec=dict(shape=(10,), type='float'),
        actions_spec=dict(type='int', num_actions=2),
        network_spec=[
            dict(type='dense', size=50),
            dict(type='dense', size=50)
        ],
        config=config
    )

    # Get new data from somewhere, e.g. a client to a web app
    client = TestTutorialCode.MyClient('http://127.0.0.1', 8080)

    # Poll new state from client
    state = client.get_state()

    # Get prediction from agent, execute
    action = agent.act(states=state)
    reward = client.execute(action)

    # Add experience, agent automatically updates model according to batch size
    agent.observe(reward=reward, terminal=False)
def test_continuous(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            # reward_threshold is assumed to be defined elsewhere in the original test module
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('TRPO agent (continuous): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('TRPO agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            max_kl_divergence=0.01,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('TRPO agent (discrete): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('TRPO agent (discrete) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_multi(self):
    passed = 0

    def network_builder(inputs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
        state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
        state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
        state3 = layer(x=layer(x=inputs['state3'], size=32), size=32)
        return state0 * state1 * state2 * state3

    for _ in xrange(5):
        environment = MinimalTest(definition=[
            False, (False, 2), (False, (1, 2)), (True, (1, 2))
        ])
        config = Configuration(
            batch_size=8,
            max_kl_divergence=0.01,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 15 or not all(x >= 1.0 for x in r.episode_rewards[-15:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('TRPO agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_discrete(self):
    passed = 0
    # TRPO can occasionally run into numerical issues, so we allow 1 in 5 runs to fail on Travis
    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.0001,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('TRPO discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_multi(self):
    passed = 0

    def network_builder(inputs, **kwargs):
        layer = layers['dense']
        state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'), size=32, scope='state0-2')
        state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'), size=32, scope='state1-2')
        state2 = layer(x=layer(x=inputs['state2'], size=32, scope='state2-1'), size=32, scope='state2-2')
        return state0 * state1 * state2

    for _ in xrange(5):
        environment = MinimalTest(definition=[False, (False, 2), (True, 2)])
        config = Configuration(
            batch_size=8,
            states=environment.states,
            actions=environment.actions,
            network=network_builder
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            # reward_threshold is assumed to be defined elsewhere in the original test module
            return r.episode < 15 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-15:], r.episode_lengths[-15:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO agent (multi-state/action): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('TRPO agent (multi-state/action) passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_continuous(self):
    passed = 0
    for _ in xrange(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=10000, episode_finished=episode_finished)
        print('TRPO Agent (continuous): ' + str(runner.episode))
        if runner.episode < 10000:
            passed += 1
            print('passed')
        else:
            print('failed')

    print('TRPO continuous agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_trpo_agent(self):
    config = {
        'batch_size': 16,
        'override_line_search': False,
        'cg_iterations': 20,
        'use_gae': False,
        'normalize_advantage': False,
        'gae_lambda': 0.97,
        'cg_damping': 0.001,
        'line_search_steps': 20,
        'max_kl_divergence': 0.05,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2,
        'gamma': 0.99
    }
    config = create_config(config)
    tf.reset_default_graph()

    network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 8}])
    agent = TRPOAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100
    for n in xrange(10000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            print('Steps until passed = {:d}'.format(n))
            return

    print('sum = {:f}'.format(sum(rewards)))
def get_agent(agentType):
    if agentType == "dqn":
        agent = DQNAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources))
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            memory=1000,
            network="auto",
        )
    elif agentType == "vpg":
        agent = VPGAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources))
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    elif agentType == "trpo":
        agent = TRPOAgent(
            states={
                "type": 'float',
                "shape": (int(args.population), 1, int(args.resources))
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources),),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    else:
        # Without this branch, an unknown type would raise UnboundLocalError below
        raise ValueError("Unknown agent type: {}".format(agentType))
    return agent
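# Hedged usage sketch (not in the original source): `args` is assumed to be an
# argparse namespace providing --population and --resources options, as the
# state shape in get_agent() implies.
agent = get_agent("trpo")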
def test_reinforceio_homepage(self):
    """
    Code example from the homepage and README.md.
    """
    from tensorforce.agents import TRPOAgent

    # Create a Trust Region Policy Optimization agent
    agent = TRPOAgent(
        states=dict(shape=(10,), type='float'),
        actions=dict(type='int', num_actions=2),
        network=[dict(type='dense', size=50), dict(type='dense', size=50)],
        update_mode=dict(unit='episodes', batch_size=1, frequency=1),
        memory=dict(type='latest', include_next_states=False, capacity=100)
    )

    # Get new data from somewhere, e.g. a client to a web app
    client = TestTutorialCode.MyClient('http://127.0.0.1', 8080)

    # Poll new state from client
    state = client.get_state()

    # Get prediction from agent, execute
    action = agent.act(states=state)
    reward = client.execute(action)

    # Add experience, agent automatically updates model according to batch size
    agent.observe(reward=reward, terminal=False)
    agent.close()
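# Hedged sketch (not part of the README example above): the single act/observe
# step typically runs in a loop before agent.close(); this reuses `agent` and
# `client` from the example, and `num_steps` is an assumed name.
num_steps = 1000
for _ in range(num_steps):
    state = client.get_state()
    action = agent.act(states=state)
    reward = client.execute(action)
    agent.observe(reward=reward, terminal=False)
agent.close()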
def __init__(self, state_size, env=None, is_eval=False):
    self.state_size = state_size
    self.action_size = 3
    self._memory_size = 1000
    self._memory = deque(maxlen=1000)
    self.is_eval = is_eval
    self.env = env

    TRPOAgent.__init__(
        self,
        states=dict(type='float', shape=self.state_size.shape),
        actions=dict(type='int', num_actions=self.action_size),
        network=env.get_network(),
        discount=env.hyperparameters['gamma'],
        batching_capacity=env.batch_size * 100,
        learning_rate=env.hyperparameters['learning_rate']
    )
    self._load_model()
def test_reinforceio_homepage(self):
    """
    Code example from the homepage and README.md.
    """
    class MyClient(object):

        def __init__(self, *args, **kwargs):
            pass

        def get_state(self):
            import numpy as np
            return np.random.rand(10)

        def execute(self, action):
            pass

    from tensorforce import Configuration
    from tensorforce.agents import TRPOAgent
    from tensorforce.core.networks import layered_network_builder

    config = Configuration(
        batch_size=100,
        states=dict(shape=(10,), type='float'),
        actions=dict(continuous=False, num_actions=2),
        network=layered_network_builder([
            dict(type='dense', size=50),
            dict(type='dense', size=50)
        ])
    )

    # Create a Trust Region Policy Optimization agent
    agent = TRPOAgent(config=config)

    # Get new data from somewhere, e.g. a client to a web app
    client = MyClient('http://127.0.0.1', 8080)

    # Poll new state from client
    state = client.get_state()

    # Get prediction from agent, execute
    action = agent.act(state=state)
    reward = client.execute(action)

    # Add experience, agent automatically updates model according to batch size
    agent.observe(reward=reward, terminal=False)
def test_trpo_agent(self):
    config = {
        'batch_size': 8,
        'cg_iterations': 20,
        'cg_damping': 0.001,
        'line_search_steps': 20,
        'max_kl_divergence': 0.01,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2
    }
    tf.reset_default_graph()
    config = create_config(config)

    network_builder = NeuralNetwork.layered_network(layers=[{'type': 'dense', 'num_outputs': 32}])
    agent = TRPOAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100
    for n in range(100):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward
        if sum(rewards) == 100.0:
            return
def test_example(self):
    passed = 0
    for _ in xrange(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(config=Configuration(
            log_level='info',
            batch_size=100,
            baseline=dict(
                type='mlp',
                size=32,
                hidden_layers=1,
                epochs=20,
                update_batch_size=32
            ),
            generalized_advantage_estimation=True,
            normalize_advantage=False,
            gae_lambda=0.97,
            max_kl_divergence=0.005,
            cg_iterations=20,
            cg_damping=0.01,
            ls_max_backtracks=20,
            ls_override=False,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # Keep running until the mean reward over the last 50 episodes reaches 50.0,
            # which indicates that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def run(self):
    agent = TRPOAgent(
        states=dict(type='float', shape=(30, 1000)),
        # states={"type": 'float', "shape": (20, 6561)},
        actions={
            "user": dict(type="int", num_values=G.graph.shape[0]),
            "item": dict(type="int", num_values=G.graph.shape[1])
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
        ],
        memory=1000,
    )
    # agent, environment = self.saving_prepare(name='explicit-default')
    # restored_agent = copy.deepcopy(agent)

    agent.initialize()
    agent.save(directory='/Users/lbarberiscanoni/Lorenzo/Github/bubble-poppers/user-based/aggregate/saved', filename=None)
    agent.restore(directory='/Users/lbarberiscanoni/Lorenzo/Github/bubble-poppers/user-based/aggregate/saved', filename=None)
    print("Restored -------")
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAI Gym environment
env = OpenAIGym('CartPole-v0')

# Create a Trust Region Policy Optimization agent
agent = TRPOAgent(config=Configuration(
    loglevel='info',
    batch_size=100,
    baseline=dict(
        type='mlp',
        size=32,
        hidden_layers=1,
        epochs=20,
        update_batch_size=32
    ),
    generalized_advantage_estimation=True,
    normalize_advantage=False,
    gae_lambda=0.97,
    override_line_search=False,
    cg_iterations=20,
    cg_damping=0.01,
    line_search_steps=20,
    max_kl_divergence=0.005,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ])
))

# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
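# Hedged continuation (assumed; the original snippet breaks off at the comment
# above): the callback and run call mirror the other CartPole examples in this
# collection, and the episode count of 3000 is an assumed value.
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.timestep, reward=r.episode_rewards[-1]))
    return True

runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)
print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))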
def main(max_timesteps):
    max_episodes = None
    # max_timesteps = 86400000000 * days

    network_spec = [
        # dict(type='flatten'),
        dict(type='dense', size=11, activation='tanh'),
        # dict(type='dense', size=20, activation='tanh'),
        # dict(type='dense', size=32, activation='tanh'),
    ]

    exploration = dict(type='epsilon_decay', timesteps=max_timesteps)

    summarizer = dict(
        directory="./models/" + str(datetime.now()).replace(' ', ''),
        steps=10000,
        seconds=None,
        labels=[
            # 'rewards',
            # 'actions',
            'inputs',
            'gradients',
            'configuration',
        ],
        meta_dict=dict(
            description='July 2: Trying 11 node hidden layer.',
            layers=str(network_spec),
            timesteps=max_timesteps,
            exploration=exploration,
        ),
    )

    agent = TRPOAgent(
        states=env.states,
        actions=env.actions,
        network=network_spec,
        actions_exploration=exploration,
        # summarizer=summarizer,
        # batch_size=64
    )

    runner = Runner(agent, env)
    report_episodes = 1

    global prev
    prev = 0

    def episode_finished(r):
        global prev
        if r.episode % report_episodes == 0:
            # print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep - prev))
            # print("Episode reward: {}".format(r.episode_rewards[-1]))
            print(r.episode_rewards[-1])
            prev = r.timestep
            # print("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(
        num_episodes=max_episodes,
        num_timesteps=max_timesteps,
        max_episode_timesteps=None,
        episode_finished=episode_finished
    )
    agent.save_model(directory='./results/TRPO/' + str(datetime.now()).replace(' ', '') + '/model')
    runner.close()
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
# Create an environment
env = DotaEnvironment()

network_spec = [
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
    dict(type='dense', size=172, activation='tanh'),
]

agent = TRPOAgent(
    actions=env.actions,
    states=env.states,
    discount=0.99,
    network=network_spec,
)

# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
    agent.save_model('./saved_model/')
    return True

def main():
    global agent
    # Network as list of layers
numberOfPlayers = 10

# Training phase
Market = ClearingHouse(numberOfPlayers, 4, .7, .3)
initialState = Market.get_state()

agent = TRPOAgent(
    states=dict(type='float', shape=(len(initialState), len(initialState[0]))),
    actions=dict(type='int', shape=(Market.numOfResources,), num_actions=3),
    network=[
        dict(type="flatten"),
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ],
    batching_capacity=10,
)

playerList = [agent for i in range(numberOfPlayers)]

training_size = 1000000
for i in tqdm(range(training_size)):
    # print("iteration #", i, "/", training_size)
    state = Market.get_state()
from tensorforce import Configuration
from tensorforce.agents import TRPOAgent
from tensorforce.environments.openai_gym import OpenAIGym
from tensorforce.execution import Runner
from tensorforce.core.networks import layered_network_builder
import numpy as np

# Create an OpenAI Gym environment
env = OpenAIGym('CartPole-v0')

# Create a Trust Region Policy Optimization agent
agent = TRPOAgent(config=Configuration(
    batch_size=200,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([dict(type='dense', size=10)])
))

# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.timestep, reward=r.episode_rewards[-1]))
    return True
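# Hedged continuation (assumed; the snippet above defines the callback but the
# run call is missing): start the run loop, following the Runner usage shown in
# the other examples here; the episode count of 3000 is an assumed value.
runner.run(episodes=3000, episode_finished=episode_finished)
print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))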
env = RunEnv(args.visualize)
env = TensorForceEnv(env)

trpo_agent = TRPOAgent(config=Configuration(
    log_level='debug',
    batch_size=64,
    baseline=dict(
        type='mlp',
        size=64,
        hidden_layers=2,
        epochs=5,
        update_batch_size=64,
    ),
    generalized_advantage_estimation=True,
    normalize_advantage=False,
    gae_lambda=0.97,
    max_kl_divergence=0.005,
    cg_iterations=20,
    cg_damping=0.01,
    ls_max_backtracks=20,
    ls_override=False,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([
        dict(type='dense', size=64),
        dict(type='dense', size=64),
    ])
))

ppo_agent = PPOAgent(config=Configuration(
    log_level='debug',
    batch_size=64,
    entropy_penalty=0.01,
# Create an OpenAI Gym environment
env = OpenAIGym('CartPole-v0')

# Create a Trust Region Policy Optimization agent
agent = TRPOAgent(config=Configuration(
    loglevel="info",
    batch_size=100,
    baseline="mlp",
    baseline_args=None,
    baseline_kwargs=dict(size=32, repeat_update=100),
    override_line_search=False,
    generalized_advantage_estimation=True,
    normalize_advantage=False,
    gae_lambda=0.97,
    cg_iterations=20,
    cg_damping=0.01,
    line_search_steps=20,
    max_kl_divergence=0.005,
    gamma=0.97,
    continuous=False,
    preprocessing=None,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ])
))

# Create the runner
runner = Runner(agent=agent, environment=env)
env = Pomme(**config["env_kwargs"])
env.seed(0)

# Create a Proximal Policy Optimization agent
agentPPO = PPOAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[dict(type='dense', size=64), dict(type='dense', size=64)],
    batching_capacity=1000,
    step_optimizer=dict(type='adam', learning_rate=1e-4)
)

# Create a Trust Region Policy Optimization agent
agentTRPO = TRPOAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[dict(type='dense', size=64), dict(type='dense', size=64)]
)

# Create a Vanilla Policy Gradient agent
agentVPG = VPGAgent(
    states=dict(type='float', shape=env.observation_space.shape),
    actions=dict(type='int', num_actions=env.action_space.n),
    network=[dict(type='dense', size=64), dict(type='dense', size=64)]
)

# Add 3 random agents
agents = []
for agent_id in range(3):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
        ],
        memory=10000,
    )
elif args.agent == "trpo":
    agent = TRPOAgent(
        states={
            "type": 'float',
            "shape": G.graph.shape
        },
        actions={
            "user": dict(type="int", num_values=G.graph.shape[0]),
            "item": dict(type="int", num_values=G.graph.shape[1])
        },
        network=[
            dict(type='flatten'),
            dict(type="dense", size=32),
        ],
        memory=10000,
    )

print("agent ready", agent)

if args.process == "train":
    new_agent = copy.deepcopy(agent)
    agent.initialize()