def test_dqfd_agent(self):
    config = {
        'expert_sampling_ratio': 0.01,
        'supervised_weight': 0.5,
        'expert_margin': 1,
        'batch_size': 8,
        'state_shape': (2,),
        'actions': 2,
        'action_shape': (),
        'update_rate': 1,
        'update_repeat': 4,
        'min_replay_size': 20,
        'memory_capacity': 20,
        'exploration': 'epsilon_decay',
        'exploration_param': {
            'epsilon': 0,
            'epsilon_final': 0,
            'epsilon_states': 0
        },
        'target_network_update_rate': 1.0,
        'use_target_network': True,
        'alpha': 0.00004,
        'gamma': 1,
        'tau': 1.0
    }

    tf.reset_default_graph()
    config = create_config(config)
    network_builder = NeuralNetwork.layered_network(layers=[
        {
            'type': 'dense',
            'num_outputs': 16,
            'weights_regularizer': 'tensorflow.contrib.layers.python.layers.regularizers.l2_regularizer',
            'weights_regularizer_kwargs': {
                'scale': 0.01
            }
        },
        {'type': 'linear', 'num_outputs': 2}
    ])
    agent = DQFDAgent(config=config, network_builder=network_builder)

    state = (1, 0)
    rewards = [0.0] * 100

    # First: add to demo memory
    for n in xrange(50):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = False
        agent.add_demo_observation(state=state, action=action, reward=reward, terminal=terminal)

    # Pre-train from demo data
    agent.pre_train(10000)

    # If pretraining worked, we should not need much more training
    for n in xrange(1000):
        action = agent.get_action(state=state)
        if action == 0:
            state = (1, 0)
            reward = 0.0
            terminal = False
        else:
            state = (0, 1)
            reward = 1.0
            terminal = False
        agent.add_observation(state=state, action=action, reward=reward, terminal=terminal)
        rewards[n % 100] = reward

        if sum(rewards) == 100.0:
            print('Passed after steps = {:d}'.format(n))
            return

    print('sum = {:f}'.format(sum(rewards)))
def test_dqfd_agent(self):
    passed = 0

    for _ in xrange(5):
        environment = MinimalTest(continuous=False)
        config = Configuration(
            batch_size=8,
            memory_capacity=100,
            first_update=20,
            repeat_update=4,
            target_update_frequency=1,
            discount=1,
            learning_rate=0.001,
            expert_sampling_ratio=0.1,
            supervised_weight=1,
            expert_margin=1,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder(layers_config=[
                dict(type='dense', size=32, l2_regularization=0.0001)
            ])
        )
        agent = DQFDAgent(config=config)

        # First: generate some data to add to demo memory
        state = environment.reset()
        agent.reset()
        for n in xrange(50):
            action = agent.act(state=state)
            # Override with correct action
            action = 1
            state, step_reward, terminal = environment.execute(action=action)
            agent.add_demo_observation(state=state, action=action, reward=step_reward, terminal=terminal)
            if terminal:
                state = environment.reset()
                agent.reset()

        # Pre-train from demo data
        agent.pre_train(10000)

        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

        # Fewer than in DQN test
        runner.run(episodes=1000, episode_finished=episode_finished)
        if runner.episode < 1000:
            passed += 1
            print('passed')
        else:
            print('failed')

    print('DQFD Agent passed = {}'.format(passed))
    self.assertTrue(passed >= 4)
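
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the test suite and not TensorForce's
# internal implementation: the `supervised_weight` and `expert_margin`
# settings above configure DQFD's supervised large-margin term,
#     J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E),
# where l(a_E, a) is 0 for the expert action a_E and a positive margin
# otherwise. The helper name `large_margin_loss` below is hypothetical; it
# only demonstrates that term for a single demonstration transition.

def large_margin_loss(q_values, expert_action, margin=1.0):
    # Add the margin penalty to every non-expert action's Q-value ...
    augmented = [
        q + (0.0 if a == expert_action else margin)
        for a, q in enumerate(q_values)
    ]
    # ... so the loss is positive unless the expert action's Q-value exceeds
    # all other Q-values by at least `margin`.
    return max(augmented) - q_values[expert_action]


# Example: the expert chose action 1; action 0's Q-value is within the
# margin of the expert's, so the loss is positive (1.2 - 0.5 = 0.7).
assert abs(large_margin_loss([0.2, 0.5], expert_action=1, margin=1.0) - 0.7) < 1e-9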