def test_dqn_agent(self):
    """Train a DQN agent on a trivial 2-state/2-action problem.

    Action 1 always yields reward 1.0, action 0 yields 0.0; the test
    passes once the agent collects 100 consecutive unit rewards
    (tracked in a rolling window of the last 100 steps).
    """
    config = {
        'seed': 10,
        'batch_size': 16,
        'state_shape': (2,),
        'actions': 2,
        'action_shape': (),
        'update_rate': 1,
        'update_repeat': 4,
        'min_replay_size': 50,
        'memory_capacity': 50,
        'exploration': 'epsilon_decay',
        'exploration_param': {
            'epsilon': 1,
            'epsilon_final': 0,
            'epsilon_states': 50
        },
        'target_network_update_rate': 1.0,
        'use_target_network': True,
        'alpha': 0.0005,
        'gamma': 0.99,
        'tau': 1.0
    }

    # Fresh graph + fixed seed for reproducibility across test runs.
    tf.reset_default_graph()
    tf.set_random_seed(10)

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[
        {'type': 'dense', 'num_outputs': 16},
        {'type': 'linear', 'num_outputs': 2}
    ])
    agent = DQNAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100  # rolling window of the last 100 rewards
    # Fix: `xrange` is Python 2 only; the sibling test already uses `range`.
    for n in range(10000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
        else:
            state = (0, 1)
            reward = 1.0
        terminal = False  # both branches are non-terminal in this test
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward
        # Early exit: 100 consecutive unit rewards means the agent learned.
        if sum(rewards) == 100.0:
            return
    # Reaching here means learning never converged within 10000 steps.
    assert sum(rewards) == 100.0
def test_trpo_agent(self):
    """Train a TRPO agent on a trivial 2-state/2-action problem.

    Action 1 yields reward 1.0 and ends the episode; action 0 yields
    0.0. The test passes once the last 100 rewards are all 1.0.
    """
    config = {
        'batch_size': 8,
        'max_episode_length': 4,
        'continuous': False,
        'state_shape': (2,),
        'actions': 2,
        'action_shape': ()
    }

    tf.reset_default_graph()

    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[
        {'type': 'dense', 'num_outputs': 32},
        {'type': 'linear', 'num_outputs': 2}
    ])
    # Fix: this TRPO test constructed a DQNAgent (copy-paste from the DQN
    # test); the config keys (max_episode_length, continuous) are TRPO's.
    # NOTE(review): assumes TRPOAgent is imported at file level alongside
    # DQNAgent — confirm against the file's import block.
    agent = TRPOAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100  # rolling window of the last 100 rewards
    for n in range(1000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = True  # rewarding action ends the episode here
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward
        # Early exit: 100 consecutive unit rewards means the agent learned.
        if sum(rewards) == 100.0:
            return
    # Fix: the original had no final assertion, so this test could never
    # fail — it silently passed even when learning did not converge.
    assert sum(rewards) == 100.0