Code example #1

for i in range(NUM_EPISODES):

    state = env.reset()
    #state = convert(env.agentPos)
    steps = 0

    while True:

        # ε-greedy action selection
        if np.random.sample() < EPSILON:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(Q[state])

        next_state, reward, done, _ = env.step(action)

        #next_state = convert(env.agentPos)
        #if done and reward > 0:
        #    next_state = 15
        #env.render()
        #print(M)

        steps += 1
        stats.episode_rewards[i] += reward
        stats.episode_lengths[i] = steps

        # SR-based action value: value of the successor state under the current weights w
        Q[state][action] = np.dot(M[next_state, :], w)

        # compute TD error in V
        V_target = reward + np.dot(M[next_state, :], w)
        V_error = V_target - np.dot(M[state, :], w)
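
The excerpt stops right after the value TD error is computed. Judging from the dot products, M acts as a successor-representation (SR) matrix and w as a reward-weight vector, so the error would normally drive updates to both. The continuation below is a hedged sketch rather than the original code; the step sizes ALPHA_W and ALPHA_SR and the discount GAMMA are assumed names that do not appear in the excerpt.

        # --- hypothetical continuation, not part of the original excerpt ---
        # gradient-style update of the reward weights (V(s) = M[s, :] . w)
        w = w + ALPHA_W * V_error * M[state, :]

        # one-step TD update of the successor representation itself
        onehot = np.zeros(M.shape[0])
        onehot[state] = 1.0
        M[state, :] = M[state, :] + ALPHA_SR * (onehot + GAMMA * M[next_state, :] - M[state, :])

        state = next_state
        if done:
            break
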
Code example #2
                           regularize=False,
                           randomization_space=[rp],
                           goal_reward=GOAL_REWARD,
                           lava_reward=LAVA_REWARD,
                           step_reward=STEP_REWARD,
                           out_of_grid=OUT_OF_GRID_REWARD,
                           max_episode_steps=10)

        obs, _ = env.reset()
        score = 0

        for i in range(10000):
            action, _ = agent.act(torch.FloatTensor(obs).to(device))
            if i == 0:
                first_action = action
            obs, rew, done, info = env.step(action)
            score += rew
            if done:
                obs, _ = env.reset()
                break

        first_actions.append(first_action)  # record the first action of this rollout
        scores.append(score)
    print(scores, first_actions)

    if all(np.array(scores) >= 0):
        same_actions.append(
            int(all(elem == first_actions[0] for elem in first_actions)))

np.save('results/' + str(NAME) + '_vpg.npy', np.array(same_actions))
plt.hist(same_actions)
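
A possible follow-up, not present in the original, is to summarize the agreement statistic and label the histogram before writing it to disk; the axis labels and the output filename below are assumptions.

# hypothetical summary and figure output (filename and labels are assumptions)
same_actions = np.array(same_actions)
if same_actions.size > 0:
    print('fraction of runs with identical first actions:', same_actions.mean())
plt.xlabel('all first actions identical (0 = no, 1 = yes)')
plt.ylabel('count')
plt.savefig('results/' + str(NAME) + '_vpg_hist.png')
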
Code example #3
        agent.reset()

        agent.append_trajectory(t_step=0,
                                prev_action=None,
                                observation=obs,
                                reward=None,
                                done=None)

        prev_action = agent.pick_action(obs)

        while True:

            #      --- time step rolls here ---
            #print('----  time step {0}  ----'.format(env.t_step))

            obs, reward, done = env.step(prev_action)

            #print(agent.V)

            agent.append_trajectory(t_step=env.t_step,
                                    prev_action=prev_action,
                                    observation=obs,
                                    reward=reward,
                                    done=done)

            agent.eval_td()  # learn from history

            #print(agent.V)

            if done:
                break

            # choose the action to execute at the next time step
            prev_action = agent.pick_action(obs)
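
The learning step itself is hidden inside agent.eval_td() ("learn from history"). For orientation only, a minimal tabular TD(0) backup over the most recent transition could look like the sketch below; the data layout (a list of (observation, reward, done) tuples) and the names V, alpha, gamma are assumptions, not this agent's actual internals.

# hedged sketch of a one-step TD(0) backup under the assumed trajectory layout
def td0_update(V, trajectory, alpha=0.1, gamma=1.0):
    (s, _, _), (s_next, r, done) = trajectory[-2], trajectory[-1]
    target = r if done else r + gamma * V[s_next]
    V[s] = V[s] + alpha * (target - V[s])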