Example #1
File: main.py  Project: radia408/MDP
def markovDecision(layout, circle):
    # Build the Snakes and Ladders environment and a uniformly random agent.
    env = SnakesAndLadder(layout, circle)
    agent = RandomAgent(env.action_space)

    n_episodes = 50

    for episode in range(n_episodes):
        # Start each episode from the initial state.
        state = env.reset()
        done = False
        while not done:
            # Let the agent choose an action and advance the environment.
            action = agent.select_action(state)
            next_state, reward, done = env.step(action)

            # Pass the transition to the agent's update hook.
            agent.update(state, action, reward, next_state)

            state = next_state
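
The loop above depends on SnakesAndLadder and RandomAgent from the radia408/MDP project, whose definitions are not shown on this page. A minimal sketch of the agent interface the loop assumes (a constructor taking an action space, select_action, update) could look like the following; the method bodies are illustrative assumptions, not the project's actual implementation:

import random


class RandomAgent:
    # Illustrative stand-in (assumption): choose uniformly among the
    # available actions and ignore the learning signal entirely.

    def __init__(self, action_space):
        self.action_space = action_space  # assumed to be a list of playable actions

    def select_action(self, state):
        # A random policy does not condition on the state.
        return random.choice(self.action_space)

    def update(self, state, action, reward, next_state):
        # No learning for a purely random baseline.
        pass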
Example #2
import numpy as np


def collect_random_data(agent):
    # Roll out one full episode with a random policy and build training
    # tuples (state, action, advantage, critic target) for the agent.
    env = Env()
    random_agent = RandomAgent()
    end = False
    states = []
    actions = []
    rewards = []
    data = []
    discount_G = 1.0  # running discount factor for the forward return G
    G = 0.  # discounted return of the episode, used as a sanity check below
    t = 0  # number of steps taken
    while not end:
        states.append(env.state)
        # Sample a random feasible action and encode the two-component
        # action as a single flat index (4 * first + second).
        action = random_agent.select_action(env.feasible_actions)
        action_index = 4 * action[0] + action[1]
        actions.append(action_index)
        reward, _, end = env.step(action)
        rewards.append(reward)
        # discount = gamma
        # for s in range(t):
        # 	values[t-s-1] += discount * reward
        # 	discount = discount * gamma
        t += 1
        # Accumulate the discounted return forward in time.
        G += discount_G * reward
        discount_G = discount_G * agent.gamma

    R = 0.  # discounted return, recomputed backwards from the terminal step

    # Evaluate the critic's value for every visited state in a single
    # batched forward pass to save time.
    state_values = agent.net.get_value(
        np.array(states).reshape(-1, 7, 7, agent.state_channels)).reshape(-1)

    for s in range(t):
        # Walk the episode backwards: R is the discounted return from step
        # t - s - 1 onwards; the advantage is that return minus the critic's
        # value estimate for the state.
        R = rewards[t - s - 1] + agent.gamma * R
        advantage = R - state_values[t - s - 1]
        # Prepend so that data ends up in chronological order.
        data = [{
            "state": states[t - s - 1],
            "advantage": advantage,
            "action": actions[t - s - 1],
            "critic_target": R
        }] + data

    # Sanity checks: the forward and backward return computations must agree
    # (up to floating-point error), and each per-step list must have length t.
    assert np.isclose(G, R)
    assert len(state_values) == len(states) == len(actions) == len(rewards) == t

    # data = []
    # for s in range(len(states)-1):
    # 	advantage = rewards[s] + values[s+1] - values[s]
    # 	data.append(dict({"state" : states[s],
    # 					  "advantage" : advantage,
    # 					  "critic_target" : values[s],
    # 					  "action" : actions[s]}))

    # T = len(states)-1
    # advantage = rewards[T] - values[T] # next state value is 0 because it is terminal
    # data.append(dict({"state" : states[T],
    # 				  "advantage" : advantage,
    # 				  "critic_target" : values[T],
    # 				  "action" : actions[T]}))

    return data
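
collect_random_data returns the episode as a chronologically ordered list of dicts keyed by "state", "advantage", "action", and "critic_target". As a hedged usage sketch, such a batch could be stacked into arrays before a single actor-critic gradient step; the helper name batch_from_episode and its state_channels argument are hypothetical, chosen only to mirror the shapes used above:

import numpy as np


def batch_from_episode(data, state_channels):
    # Stack the per-step dicts produced by collect_random_data into arrays
    # shaped like the inputs used elsewhere in this example (7x7 boards).
    states = np.array([d["state"] for d in data]).reshape(-1, 7, 7, state_channels)
    actions = np.array([d["action"] for d in data], dtype=np.int64)
    advantages = np.array([d["advantage"] for d in data], dtype=np.float32)
    critic_targets = np.array([d["critic_target"] for d in data], dtype=np.float32)
    return states, actions, advantages, critic_targets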