for i in range(NUM_EPISODES):
    state = env.reset()
    #state = convert(env.agentPos)
    steps = 0
    while True:
        # epsilon-greedy action selection
        if np.random.sample() < EPSILON:
            action = np.random.choice(env.action_space.n)
        else:
            action = np.argmax(Q[state])
        next_state, reward, done, _ = env.step(action)
        #next_state = convert(env.agentPos)
        #if done and reward > 0:
        #    next_state = 15
        #env.render()
        #print(M)
        stats.episode_rewards[i] += reward
        stats.episode_lengths[i] = steps
        # value of the taken action, read off the successor matrix M
        # and the reward weights w
        Q[state][action] = np.dot(M[next_state, :], w)
        # compute TD error in V (note: target is undiscounted as written)
        V_target = reward + np.dot(M[next_state, :], w)
        V_error = V_target - np.dot(M[state, :], w)
        # (the M and w updates follow from V_error; see the sketch below)
        steps += 1
        state = next_state
        if done:
            break
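# The loop above reads values from a successor representation, V(s) = M[s, :] . w,
# but stops right after computing V_error. A minimal sketch of the TD updates
# such an error typically drives, assuming one-hot state features; the helper
# name sr_td_update and the alpha/gamma defaults are illustrative assumptions,
# not taken from the original code:
import numpy as np

def sr_td_update(M, w, state, next_state, reward, done,
                 alpha=0.1, gamma=0.99):
    """One TD(0) update of the successor matrix M and reward weights w."""
    n_states = M.shape[0]
    indicator = np.zeros(n_states)
    indicator[state] = 1.0
    # SR TD target: each row of M predicts discounted future state occupancies
    m_target = indicator if done else indicator + gamma * M[next_state, :]
    M[state, :] += alpha * (m_target - M[state, :])
    # regress the observed reward onto the successor state's weight
    # (conventions differ on whether reward attaches to state or next_state)
    w[next_state] += alpha * (reward - w[next_state])
    return M, w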
    regularize=False, randomization_space=[rp],
    goal_reward=GOAL_REWARD, lava_reward=LAVA_REWARD,
    step_reward=STEP_REWARD, out_of_grid=OUT_OF_GRID_REWARD,
    max_episode_steps=10)
obs, _ = env.reset()
score = 0
for i in range(10000):
    # act with the trained policy; remember the episode's first action
    action, _ = agent.act(torch.FloatTensor(obs).to(device))
    if i == 0:
        first_action = action
    obs, rew, done, info = env.step(action)
    score += rew
    if done:
        obs, _ = env.reset()
        break
first_actions.append(first_action)
scores.append(score)
print(scores, first_actions)
# only count runs where every rollout scored non-negatively; record whether
# all rollouts opened with the same first action
if all(np.array(scores) >= 0):
    same_actions.append(
        int(all(elem == first_actions[0] for elem in first_actions)))
np.save('results/' + str(NAME) + '_vpg.npy', np.array(same_actions))
plt.hist(same_actions)
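# What the bookkeeping above measures: each entry of same_actions is 1 when
# every qualifying rollout opened with the identical first action, else 0.
# A quick way to summarize the saved array; the path mirrors the np.save call
# above, and NAME is whatever the surrounding script defines:
import numpy as np

same_actions = np.load('results/' + str(NAME) + '_vpg.npy')
print('fraction of runs with a consistent first action:',
      same_actions.mean())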
agent.reset()
# seed the trajectory with the initial observation (no action or reward yet)
agent.append_trajectory(t_step=0,
                        prev_action=None,
                        observation=obs,
                        reward=None,
                        done=None)
prev_action = agent.pick_action(obs)

while True:
    # --- time step rolls here ---
    #print('---- time step {0} ----'.format(env.t_step))
    obs, reward, done = env.step(prev_action)
    #print(agent.V)
    agent.append_trajectory(t_step=env.t_step,
                            prev_action=prev_action,
                            observation=obs,
                            reward=reward,
                            done=done)
    agent.eval_td()  # learn from history
    #print(agent.V)
    if done:
        break
    prev_action = agent.pick_action(obs)  # choose the next action
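# agent.eval_td() is opaque in this fragment. A minimal sketch of the TD(0)
# state-value update such a call commonly performs on the latest transition;
# the trajectory layout, the V indexing, and the alpha/gamma defaults are
# assumptions, not the original agent's internals:
def eval_td_sketch(V, trajectory, alpha=0.1, gamma=0.95):
    """TD(0) update of V from the last transition in the trajectory.

    trajectory is assumed to be a list of dicts with keys
    'observation', 'reward', 'done' (matching append_trajectory above),
    and V to be indexable by observation (e.g. a dict or array of
    discrete states).
    """
    prev, curr = trajectory[-2], trajectory[-1]
    s, s_next = prev['observation'], curr['observation']
    # bootstrap from the next state's value unless the episode ended
    target = curr['reward'] + (0.0 if curr['done'] else gamma * V[s_next])
    V[s] += alpha * (target - V[s])
    return V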