Code Example #1
import numpy as np

# GreedyPolicy and LinearDecayGreedyEpsilonPolicy are assumed to be importable
# from the surrounding project; see the sketch after the test for one possible
# implementation.

def testPolicy():
	num_actions = 6
	start_value = 1
	end_value = 0.1
	num_steps = 1000
	
	policy1 = GreedyPolicy()
	policy2 = LinearDecayGreedyEpsilonPolicy(num_actions, start_value, end_value, num_steps)

	# Index 3 holds the largest Q-value (1.5), so the greedy policy must pick it.
	q_values = np.array([1.0, 1.3, 1.2, 1.5, 1.1, 1.4])
	assert policy1.select_action(q_values) == 3

	# Epsilon starts at start_value and decays by (1 - 0.1) / 1000 = 0.0009 per call.
	assert policy2.epsilon == 1

	policy2.select_action(q_values)
	assert np.isclose(policy2.epsilon, 0.9991)

	policy2.select_action(q_values)
	policy2.select_action(q_values)

	assert np.isclose(policy2.epsilon, 0.9973)

	# After num_steps total decays the schedule bottoms out at end_value.
	for _ in range(num_steps):
		policy2.select_action(q_values)

	assert np.isclose(policy2.epsilon, end_value)

	# Further calls must not push epsilon below end_value.
	policy2.select_action(q_values)

	assert np.isclose(policy2.epsilon, end_value)
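
The test above fully pins down the decay schedule: epsilon must fall by
(start_value - end_value) / num_steps = 0.0009 on every call to select_action,
then stay clamped at end_value. A minimal sketch of the two policy classes
that satisfies these assertions; the class and method names come from the
test, while everything inside them is an assumption:

class GreedyPolicy:
	"""Always pick the action with the highest Q-value."""
	def select_action(self, q_values):
		return int(np.argmax(q_values))

class LinearDecayGreedyEpsilonPolicy:
	"""Epsilon-greedy selection with epsilon decayed linearly from
	start_value to end_value over num_steps calls, then held there."""
	def __init__(self, num_actions, start_value, end_value, num_steps):
		self.num_actions = num_actions
		self.start_value = start_value
		self.end_value = end_value
		self.num_steps = num_steps
		self.epsilon = start_value
		self.steps_taken = 0

	def select_action(self, q_values):
		# Explore with probability epsilon, otherwise act greedily.
		if np.random.rand() < self.epsilon:
			action = np.random.randint(self.num_actions)
		else:
			action = int(np.argmax(q_values))
		# Linear decay, clamped at end_value once num_steps is reached.
		self.steps_taken += 1
		frac = min(self.steps_taken / self.num_steps, 1.0)
		self.epsilon = self.start_value + frac * (self.end_value - self.start_value)
		return action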
Code Example #2
    # Start a fresh episode: reset the frame history, preprocess the first
    # observation, and build the (1, H, W, HISTORY_LENGTH) network input.
    history_store.reset()
    state = atari_processor.state_for_nn(observation)
    history_store.add_history(state)
    nn_tmp = history_store.get_history()
    nn_input = np.zeros((1, IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH),
                        dtype=float)
    nn_input[0, :] = nn_tmp

    episode_interaction_cnt = 0
    first_step = True
    done = False
    while not done:
        # Interact with environment and store into memory.

        q_values = model_online.predict(nn_input)
        action = greedy_epsilon_linear_decay_selector.select_action(q_values)
        observation, reward, done, info = env.step(action)
        # Detect an episode boundary by watching for a change in `info`
        # (e.g. a remaining-lives counter reported by the environment).
        if first_step:
            info_prev = info
            episode_end = False
            first_step = False
        else:
            episode_end = info != info_prev
            info_prev = info

        # Clip/transform the raw reward and push the next frame into history.
        reward = atari_processor.process_reward(reward)
        state_next = atari_processor.state_for_nn(observation)
        history_store.add_history(state_next)
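
The loop's opening comment promises to store the interaction into memory, but
the snippet breaks off before that happens. A hedged sketch of how the body
might continue, reusing the input-building pattern from the top of the
example; replay_memory and its append() tuple layout are assumptions, not
part of the original code:

        # Rebuild the stacked-frame network input from the updated history.
        nn_input_next = np.zeros((1, IMAGE_SIZE[0], IMAGE_SIZE[1], HISTORY_LENGTH),
                                 dtype=float)
        nn_input_next[0, :] = history_store.get_history()

        # Store the transition; replay_memory is a hypothetical buffer here.
        replay_memory.append((nn_input[0], action, reward,
                              nn_input_next[0], done or episode_end))

        nn_input = nn_input_next
        episode_interaction_cnt += 1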