def markovDecision(layout, circle):
    # Run a fixed number of episodes on the Snakes and Ladders MDP, letting the
    # agent interact with the environment and update itself from every
    # (state, action, reward, next_state) transition.
    env = SnakesAndLadder(layout, circle)
    agent = RandomAgent(env.action_space)
    n_episodes = 50
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state
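
# The loop above only requires that the agent expose select_action and update.
# Below is a minimal illustrative sketch of such an agent; the project's actual
# RandomAgent may differ (its constructor signature and the no-op update are
# assumptions, not taken from the original code).
import random

class _IllustrativeRandomAgent:
    def __init__(self, action_space):
        self.action_space = action_space  # assumed: a sequence of legal actions

    def select_action(self, state):
        # choose uniformly at random, ignoring the state
        return random.choice(self.action_space)

    def update(self, state, action, reward, next_state):
        # a purely random policy learns nothing from the transition
        pass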
import numpy as np  # used for batching the states and for the tolerant float check

def collect_random_data(agent):
    # Play one full episode with a random policy (Env, RandomAgent and agent.net
    # come from the project's own modules), then compute the discounted return
    # and advantage of every visited state.
    env = Env()
    random_agent = RandomAgent()
    end = False
    states = []
    actions = []
    rewards = []
    data = []
    discount_G = 1.0
    G = 0.
    t = 0
    while not end:
        states.append(env.state)
        action = random_agent.select_action(env.feasible_actions)
        action_index = 4 * action[0] + action[1]  # flatten the action pair into a single index
        actions.append(action_index)
        reward, _, end = env.step(action)
        rewards.append(reward)
        # discount = gamma
        # for s in range(t):
        #     values[t-s-1] += discount * reward
        #     discount = discount * gamma
        t += 1
        G += discount_G * reward  # forward-accumulated discounted return of the episode
        discount_G = discount_G * agent.gamma

    R = 0.
    # evaluate state values of all states encountered in a batch to save time
    state_values = agent.net.get_value(
        np.array(states).reshape(-1, 7, 7, agent.state_channels)).reshape(-1)
    # walk the episode backwards: R becomes the discounted return from each step,
    # and the advantage is that return minus the critic's value estimate
    for s in range(t):
        R = rewards[t - s - 1] + agent.gamma * R
        advantage = R - state_values[t - s - 1]
        data = [
            dict({
                "state": states[t - s - 1],
                "advantage": advantage,
                "action": actions[t - s - 1],
                "critic_target": R
            })
        ] + data
    # sanity checks: the backward-built return must match the forward-accumulated one
    # (np.isclose instead of == to tolerate floating-point rounding), and every buffer
    # must hold one entry per time step
    assert np.isclose(G, R)
    assert len(state_values) == len(states) == len(actions) == len(rewards) == t
    # data = []
    # for s in range(len(states)-1):
    #     advantage = rewards[s] + values[s+1] - values[s]
    #     data.append(dict({"state": states[s],
    #                       "advantage": advantage,
    #                       "critic_target": values[s],
    #                       "action": actions[s]}))
    # T = len(states)-1
    # advantage = rewards[T] - values[T]  # next state value is 0 because it is terminal
    # data.append(dict({"state": states[T],
    #                   "advantage": advantage,
    #                   "critic_target": values[T],
    #                   "action": actions[T]}))
    return data
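
# Why the np.isclose check above should hold: the forward accumulation
# G += discount * r and the backward fold R = r + gamma * R compute the same
# discounted return of the episode, only in a different order of operations.
# A small self-contained illustration (gamma and the reward sequence here are
# made up, not taken from the environment):
def _return_recursion_demo(gamma=0.95, rewards=(1.0, 0.0, -0.5, 2.0)):
    # forward accumulation, as in the while loop of collect_random_data
    G, discount = 0.0, 1.0
    for r in rewards:
        G += discount * r
        discount *= gamma
    # backward recursion, as in the for loop that builds the data list
    R = 0.0
    for r in reversed(rewards):
        R = r + gamma * R
    return G, R  # the two values agree up to floating-point rounding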