def test_introduction_dqnagent(self):
    from tensorforce import Configuration
    from tensorforce.agents import DQNAgent
    from tensorforce.core.networks import layered_network_builder

    # Define a network builder from an ordered list of layers
    layers = [
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]
    network = layered_network_builder(layers_config=layers)

    # Define a state
    states = dict(shape=(10,), type='float')

    # Define an action (models internally assert whether
    # they support continuous and/or discrete control)
    actions = dict(continuous=False, num_actions=5)

    # The agent is configured with a single configuration object
    agent_config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=800,
        first_update=80,
        repeat_update=4,
        target_update_frequency=20,
        states=states,
        actions=actions,
        network=network
    )
    agent = DQNAgent(config=agent_config)
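Once configured, the agent is driven through the same act/observe interface used in the homepage example further below. A minimal interaction sketch, assuming an `environment` object that follows the reset()/execute() interface of the MinimalTest environments used throughout these tests:

# Minimal act/observe loop (sketch; `environment` is an assumed object
# exposing reset() and execute(action) as in the test environments)
state = environment.reset()
terminal = False
while not terminal:
    action = agent.act(state=state)
    state, reward, terminal = environment.execute(action=action)
    agent.observe(reward=reward, terminal=terminal)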
def test_discrete(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            keep_last=True,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = DQNNstepAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQN Nstep agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQN Nstep agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_continuous(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('TRPO agent (continuous): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('TRPO agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.0005,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            memory=dict(type='replay', random_sampling=True),
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = CategoricalDQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Categorical DQN agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('Categorical DQN agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_beta(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=True)
        actions = environment.actions
        actions['min_value'] = -0.5
        actions['max_value'] = 1.5
        config = Configuration(
            batch_size=8,
            learning_rate=0.01,
            states=environment.states,
            actions=actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1500, episode_finished=episode_finished)
        print('VPG agent (beta): ' + str(runner.episode))
        if runner.episode < 1500:
            passed += 1

    print('VPG agent (beta) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_reinforceio_homepage(self):
    """
    Code example from the homepage and README.md.
    """
    from tensorforce import Configuration
    from tensorforce.agents import TRPOAgent
    from tensorforce.core.networks import layered_network_builder

    config = Configuration(
        batch_size=100,
        states=dict(shape=(10,), type='float'),
        actions=dict(continuous=False, num_actions=2),
        network=layered_network_builder([
            dict(type='dense', size=50),
            dict(type='dense', size=50)
        ])
    )

    # Create a Trust Region Policy Optimization agent
    agent = TRPOAgent(config=config)

    # Get new data from somewhere, e.g. a client to a web app
    client = TestTutorialCode.MyClient('http://127.0.0.1', 8080)

    # Poll new state from client
    state = client.get_state()

    # Get prediction from agent, execute action
    action = agent.act(state=state)
    reward = client.execute(action)

    # Add experience; agent automatically updates model according to batch size
    agent.observe(reward=reward, terminal=False)
def test_baseline(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001,
        states=dict(shape=(1,)),
        actions=dict(continuous=True),
        network=layered_network_builder(())
    )
    agent = VPGAgent(config=config)

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]
    discounted_rewards = np.array([
        0.75 + 0.75 ** 4, 1.0 + 0.75 ** 3, 0.75 ** 2, 0.75, 1.0,
        1.0 + 0.75 ** 2, 0.75, 1.0, 0.0
    ])
    baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])

    agent.model.baseline = dict(state=Baseline())
    agent.model.baseline['state'].predict = lambda states: baseline

    result, _ = agent.model.reward_estimation(
        states=dict(state=states), rewards=rewards, terminals=terminals)
    expected = discounted_rewards - baseline
    print(result)
    print(expected)
    self.assertTrue((result == expected).all())
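The discounted_rewards array above can be reproduced by accumulating rewards backwards through the batch and resetting the running sum at terminal steps. A small standalone sketch (not part of the test itself; the helper name is hypothetical):

import numpy as np

def discounted_returns(rewards, terminals, discount):
    # Accumulate discounted returns backwards, resetting the running
    # sum at episode boundaries (terminal steps).
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            running = 0.0
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# With the test data above, discounted_returns(rewards, terminals, 0.75)
# yields [0.75 + 0.75**4, 1.0 + 0.75**3, 0.75**2, 0.75, 1.0,
#         1.0 + 0.75**2, 0.75, 1.0, 0.0]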
def test_discrete(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQN Agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQN Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_continuous(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=20,
            entropy_penalty=0.01,
            loss_clipping=0.1,
            epochs=10,
            optimizer_batch_size=10,
            learning_rate=0.0005,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = PPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('PPO agent (continuous): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('PPO agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_continuous(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO Agent (continuous): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('TRPO continuous agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_continuous(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=10000, episode_finished=episode_finished)
        print('TRPO Agent (continuous): ' + str(runner.episode))
        if runner.episode < 10000:
            passed += 1
            print('passed')
        else:
            print('failed')

    print('TRPO continuous agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=5000, episode_finished=episode_finished)
        print('DQN Agent: ' + str(runner.episode))
        if runner.episode < 5000:
            passed += 1
            print('passed')
        else:
            print('failed')

    print('DQN Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('VPG Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('VPG discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def create_tf_operations(self, config):
    if len(config.states) > 1:
        raise Exception('MLP value function supports a single state only.')

    with tf.variable_scope('mlp_value_function'):
        self.state = tf.placeholder(
            dtype=tf.float32,
            shape=(None, util.prod(next(iter(config.states))[1].shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

        network_builder = layered_network_builder((
            {'type': 'dense', 'size': self.size},
            {'type': 'dense', 'size': 1}
        ))
        network = NeuralNetwork(network_builder=network_builder,
                                inputs=dict(state=self.state))
        self.prediction = network.output

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_tf_operations(self, state, batch_size, scope='cnn_baseline'):
    with tf.variable_scope(scope):
        # Conv2d layers require the unflattened state shape, e.g. (batch, height, width, channels)
        self.state = tf.placeholder(dtype=tf.float32, shape=(None,) + tuple(state.shape))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
        self.updates = int(batch_size / self.update_batch_size) * self.epochs
        self.batch_size = batch_size

        layers = []
        for size in self.sizes:
            layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})
        # First layer has larger window
        layers[0]['window'] = 5
        # TODO append maxpooling
        # Flatten the conv output before the final linear layer
        layers.append({'type': 'flatten'})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                inputs=dict(state=self.state))
        self.prediction = network.output

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_tf_operations(self, state, scope='cnn_baseline'):
    with tf.variable_scope(scope) as scope:
        self.state = tf.placeholder(dtype=tf.float32, shape=(None,) + tuple(state.shape))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

        layers = []
        for size in self.cnn_sizes:
            layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})
        # First layer has larger window
        layers[0]['window'] = 5
        layers.append({'type': 'flatten'})
        for size in self.dense_sizes:
            layers.append({'type': 'dense', 'size': size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                inputs=dict(state=self.state))
        self.prediction = tf.squeeze(input=network.output, axis=1)

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        variables = tf.contrib.framework.get_variables(scope=scope)
        self.optimize = optimizer.minimize(loss, var_list=variables)
def create_tf_operations(self, config):
    if len(config.states) > 1:
        raise Exception('MLP value function supports a single state only.')

    with tf.variable_scope('mlp_value_function'):
        self.state = tf.placeholder(
            dtype=tf.float32,
            shape=(None, util.prod(next(iter(config.states))[1].shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
        self.updates = int(config.batch_size / self.update_batch_size) * self.epochs
        self.batch_size = config.batch_size

        layers = []
        for _ in range(self.hidden_layers):
            layers.append({'type': 'dense', 'size': self.size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                inputs=dict(state=self.state))
        self.prediction = network.output

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_tf_operations(self, state, batch_size, scope='mlp_baseline'):
    with tf.variable_scope(scope):
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
        self.updates = int(batch_size / self.update_batch_size) * self.epochs
        self.batch_size = batch_size

        layers = []
        for size in self.sizes:
            layers.append({'type': 'dense', 'size': size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                inputs=dict(state=self.state))
        self.prediction = network.output

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optimize = optimizer.minimize(loss)
def test_discrete(self):
    passed = 0
    # TRPO can occasionally have numerical issues, so we allow 1 in 5 runs to fail on Travis
    for _ in range(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.0001,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('TRPO discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_naf_agent(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('NAF agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('NAF agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_continuous(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=True)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('VPG agent (continuous): ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('VPG agent (continuous) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    # TRPO can occasionally have numerical issues, so we allow 1 in 5 runs to fail on Travis
    for _ in range(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.0001,
            cg_iterations=20,
            cg_damping=0.001,
            line_search_steps=20,
            max_kl_divergence=0.05,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = TRPOAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('TRPO Agent (discrete): ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('TRPO discrete agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_replay(self):
    environment = MinimalTest(definition=[(False, (1, 2))])
    config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=50,
        memory=dict(type='replay', random_sampling=True),
        first_update=20,
        target_update_frequency=10,
        states=environment.states,
        actions=environment.actions,
        network=layered_network_builder([
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ])
    )
    agent = DQNAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 100 or not all(
            x / l >= reward_threshold
            for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
        )

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Replay memory DQN: ' + str(runner.episode))
def test_discrete_baseline(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            states=environment.states,
            actions=environment.actions,
            baseline=dict(
                type="mlp",
                sizes=[32, 32],
                epochs=5,
                update_batch_size=8,
                learning_rate=0.01
            ),
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = VPGAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= 0.9
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1500, episode_finished=episode_finished)
        print('VPG agent (discrete): ' + str(runner.episode))
        if runner.episode < 1500:
            passed += 1

    print('VPG agent (discrete) passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_discrete(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(definition=False)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            target_update_frequency=20,
            demo_memory_capacity=100,
            demo_sampling_ratio=0.2,
            memory=dict(type='replay', random_sampling=True),
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ])
        )
        agent = DQFDAgent(config=config)

        # First generate demonstration data and pretrain
        demonstrations = list()
        terminal = True
        for _ in range(50):
            if terminal:
                state = environment.reset()
            action = 1
            state, reward, terminal = environment.execute(action=action)
            demonstration = dict(state=state, action=action, reward=reward,
                                 terminal=terminal, internal=[])
            demonstrations.append(demonstration)

        agent.import_demonstrations(demonstrations)
        agent.pretrain(steps=1000)

        # Normal training
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold
                for x, l in zip(r.episode_rewards[-100:], r.episode_lengths[-100:])
            )

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQFD agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQFD agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_dqfd_agent(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=16,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            demo_memory_capacity=100,
            demo_sampling_ratio=0.1,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder(layers_config=[
                dict(type='dense', size=32, l2_regularization=0.0001)
            ])
        )
        agent = DQFDAgent(config=config)

        # First generate demonstration data and pretrain
        demonstrations = list()
        terminal = True
        for _ in range(50):
            if terminal:
                state = environment.reset()
            action = 1
            state, reward, terminal = environment.execute(action=action)
            demonstration = dict(state=state, action=action, reward=reward,
                                 terminal=terminal, internal=[])
            demonstrations.append(demonstration)

        agent.import_demonstrations(demonstrations)
        agent.pretrain(steps=1000)

        # Normal training
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('DQFD Agent: ' + str(runner.episode))
        if runner.episode < 1000:
            passed += 1

    print('DQFD Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
def test_example(self):
    passed = 0
    for _ in range(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Proximal Policy Optimization agent
        agent = PPOAgent(config=Configuration(
            log_level='info',
            batch_size=256,
            memory=dict(type='prioritized_replay'),
            update_frequency=256,
            first_update=512,
            learning_rate=0.0001,
            optimizer_batch_size=64,
            normalize_rewards=False,
            gae_rewards=False,
            baseline=dict(
                type="mlp",
                sizes=[32, 32],
                epochs=1,
                update_batch_size=64,
                learning_rate=0.001
            ),
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # The mean reward over the last 50 episodes should show that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_naf_agent(self):
    passed = 0
    for _ in range(5):
        environment = MinimalTest(continuous=True)
        config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            exploration=dict(type='ornstein_uhlenbeck'),
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            clip_gradients=1.0,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder([dict(type='dense', size=32)])
        )
        agent = NAFAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=2000, episode_finished=episode_finished)
        print('NAF Agent: ' + str(runner.episode))
        if runner.episode < 2000:
            passed += 1

    print('NAF Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 3)
def test_example(self):
    passed = 0
    for _ in range(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(config=Configuration(
            log_level='info',
            batch_size=100,
            baseline=dict(
                type='mlp',
                size=32,
                hidden_layers=1,
                epochs=20,
                update_batch_size=32
            ),
            generalized_advantage_estimation=True,
            normalize_advantage=False,
            gae_lambda=0.97,
            max_kl_divergence=0.005,
            cg_iterations=20,
            cg_damping=0.01,
            ls_max_backtracks=20,
            ls_override=False,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # The mean reward over the last 50 episodes should show that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_example(self):
    passed = 0
    for _ in range(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(config=Configuration(
            loglevel='info',
            batch_size=100,
            baseline='mlp',
            baseline_args=None,
            baseline_kwargs=dict(
                size=32,
                repeat_update=100
            ),
            override_line_search=False,
            generalized_advantage_estimation=True,
            normalize_advantage=False,
            gae_lambda=0.97,
            cg_iterations=20,
            cg_damping=0.01,
            line_search_steps=20,
            max_kl_divergence=0.005,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # The mean reward over the last 50 episodes should show that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_gae(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001,
        gae_rewards=True,
        gae_lambda=0.5,
        states=dict(shape=(1,)),
        actions=dict(continuous=True),
        network=layered_network_builder(())
    )
    agent = VPGAgent(config=config)

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]
    baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])

    agent.model.baseline = dict(state=Baseline())
    agent.model.baseline['state'].predict = lambda states: baseline

    td_residuals = np.array([
        0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.75 * 0.25, 0.75 * 0.5 - 0.25, 1.0,
        1.0 + 0.75 * 0.25 - 0.5, 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.0
    ])

    result, _ = agent.model.reward_estimation(
        states=dict(state=states), rewards=rewards, terminals=terminals)
    expected = np.array([
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3, 4])) * td_residuals[:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[1:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[2:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[3:5]),
        np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[4:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[5:]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[6:]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[7:]),
        np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[8:])
    ])
    self.assertTrue((result == expected).all())
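The expected values above follow the generalized advantage estimation recursion: with TD residuals delta_t = r_t + discount * V(s_{t+1}) - V(s_t) (and, mirroring the td_residuals array in this test, delta_t = r_t at terminal steps), the advantage is A_t = sum_k (discount * gae_lambda)^k * delta_{t+k} within an episode. A standalone sketch of that computation (not part of the test; the helper name is hypothetical, and the trajectory is assumed to end on a terminal step):

import numpy as np

def gae_advantages(rewards, terminals, values, discount=0.75, gae_lambda=0.5):
    # TD residuals; terminal steps use the raw reward, matching the
    # td_residuals array in test_gae above.
    deltas = [
        r if term else r + discount * values[t + 1] - values[t]
        for t, (r, term) in enumerate(zip(rewards, terminals))
    ]
    # Accumulate (discount * gae_lambda)-weighted residuals backwards,
    # resetting at episode boundaries.
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            running = 0.0
        running = deltas[t] + discount * gae_lambda * running
        advantages[t] = running
    return advantages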
def test_example(self):
    passed = 0
    for _ in range(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Proximal Policy Optimization agent
        agent = PPOAgent(config=Configuration(
            log_level='info',
            batch_size=4096,
            gae_lambda=0.97,
            learning_rate=0.001,
            entropy_penalty=0.01,
            epochs=5,
            optimizer_batch_size=512,
            loss_clipping=0.2,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # The mean reward over the last 50 episodes should show that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def test_prioritized_replay(self):
    environment = MinimalTest(definition=[(False, (1, 2))])
    config = Configuration(
        batch_size=8,
        learning_rate=0.001,
        memory_capacity=50,
        memory='prioritized_replay',
        first_update=20,
        target_update_frequency=10,
        states=environment.states,
        actions=environment.actions,
        network=layered_network_builder([
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ])
    )
    agent = DQNAgent(config=config)
    runner = Runner(agent=agent, environment=environment)

    def episode_finished(r):
        return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

    runner.run(episodes=1000, episode_finished=episode_finished)
    print('Prioritized replay memory DQN: ' + str(runner.episode))
def create_tf_operations(self, state, scope='mlp_baseline'):
    with tf.variable_scope(scope) as scope:
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

        layers = []
        for size in self.sizes:
            layers.append({'type': 'dense', 'size': size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                inputs=dict(state=self.state))
        self.prediction = tf.squeeze(input=network.output, axis=1)

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        variables = tf.contrib.framework.get_variables(scope=scope)
        self.optimize = optimizer.minimize(loss, var_list=variables)
        repeat_update=100
    ),
    override_line_search=False,
    generalized_advantage_estimation=True,
    normalize_advantage=False,
    gae_lambda=0.97,
    cg_iterations=20,
    cg_damping=0.01,
    line_search_steps=20,
    max_kl_divergence=0.005,
    gamma=0.97,
    continuous=False,
    preprocessing=None,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ])
))

# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.timestep, reward=r.episode_rewards[-1]))
    return True

# Start learning
runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)
from tensorforce import Configuration
from tensorforce.agents import TRPOAgent
from tensorforce.environments.openai_gym import OpenAIGym
from tensorforce.execution import Runner
from tensorforce.core.networks import layered_network_builder

import numpy as np

# Create an OpenAI Gym environment
env = OpenAIGym('CartPole-v0')

# Create a Trust Region Policy Optimization agent
agent = TRPOAgent(config=Configuration(
    batch_size=200,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([dict(type='dense', size=10)])
))

# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode, ts=r.timestep, reward=r.episode_rewards[-1]))
    return True

# Start learning
runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)