Example #1
    def sarsa(self, num_iter, alpha, epsilon):
        # Define the action-value function as a dict initialized to 0.
        qfunc = dict()
        for s in self.states:
            for a in self.actions:
                qfunc['%d_%s' % (s, a)] = 0.0
        # Explore the environment over num_iter episodes.
        for _ in range(num_iter):
            # Start each episode from a random state-action pair.
            state = self.states[int(random.random() * len(self.states))]
            action = self.actions[int(random.random() * len(self.actions))]

            is_terminal, count = False, 0
            while not is_terminal and count < 100:
                key = "%d_%s" % (state, action)
                is_terminal, next_state, reward = self.env.transform1(
                    state, action)
                # The next action is selected with epsilon_greedy; this is where
                # SARSA differs from Q-learning, which bootstraps on the greedy
                # (max) action instead.
                next_action = epsilon_greedy(qfunc, next_state, self.actions,
                                             epsilon)
                next_key = "%d_%s" % (next_state, next_action)
                # On-policy SARSA update: the target uses the Q value of the
                # action the epsilon-greedy behaviour policy actually takes next.
                qfunc[key] = qfunc[key] + alpha * (
                    reward + self.gamma * qfunc[next_key] - qfunc[key])
                # Move to the next state and take exactly the action used in
                # the update (re-sampling here would make the update off-policy).
                state, action = next_state, next_action
                count += 1
        return qfunc
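The epsilon_greedy helper used above is not included in this snippet; a minimal sketch that matches the '%d_%s' key scheme (the tie-breaking and random-choice details are assumptions):

import random

def epsilon_greedy(qfunc, state, actions, epsilon):
    # With probability epsilon take a uniformly random action; otherwise take
    # the action with the highest estimated value for this state.
    if random.random() < epsilon:
        return actions[int(random.random() * len(actions))]
    return max(actions, key=lambda a: qfunc['%d_%s' % (state, a)])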
Example #2
def SARSA(env, num_episodes, gamma, lr, e):
    """
    Implement the SARSA algorithm following epsilon-greedy exploration.

    Inputs:
    env: OpenAI Gym environment 
            env.P: dictionary
                    P[state][action] is a tuple of tuples, each (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                    number of states
            env.nA: int
                    number of actions
    num_episodes: int
            Number of episodes of training
    gamma: float
            Discount factor.
    lr: float
            Learning rate.
    e: float
            Epsilon value used in the epsilon-greedy method.

    Outputs:
    Q: numpy.ndarray
            State-action values
    """

    Q = np.zeros((env.nS, env.nA))

    # TIPS: Call the epsilon_greedy function without setting the seed.
    #       Choose the first state of each episode randomly for exploration.
    ############################
    # YOUR CODE STARTS HERE
    for i in range(num_episodes):
        # Choose the first state of each episode randomly for exploration by
        # overwriting the environment's initial state distribution with a
        # one-hot vector over that state.
        state = np.random.randint(0, env.nS)
        env.isd = [1 if j == state else 0 for j in range(env.nS)]
        env.reset()
        terminal = False
        action = epsilon_greedy(Q[state], e)
        while not terminal:
            next_state, reward, terminal, info = env.step(action)
            next_action = epsilon_greedy(Q[next_state], e)
            # On-policy TD update: bootstrap on the action actually selected
            # by the epsilon-greedy behaviour policy.
            Q[state, action] += lr * (
                reward + gamma * Q[next_state, next_action] - Q[state, action])
            state, action = next_state, next_action
    # YOUR CODE ENDS HERE
    ############################

    return Q
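The epsilon_greedy function called on a single row of Q is assumed to be provided elsewhere in the assignment; a minimal sketch consistent with how it is used here (signature and tie-breaking are assumptions):

import numpy as np

def epsilon_greedy(q_row, e):
    # q_row: 1-D array of action values for one state.
    # Explore with probability e, otherwise return the greedy action index.
    if np.random.rand() < e:
        return np.random.randint(len(q_row))
    return int(np.argmax(q_row))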
Example #3
def q_learning(env,
               nE,
               min_epsilon=0.01,
               alpha=0.01,
               gamma=0.9,
               lamb=0.2,
               debug=False,
               render=False):
    """
    Updates the state-action values with the Q-learning target plus
    eligibility traces (a Watkins-style Q(lambda) update):

    Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s', pi_target(s')) - Q(s,a))
    """

    q_pi = defaultdict(lambda: defaultdict(float))
    E = defaultdict(lambda: defaultdict(float))

    for e in range(nE):

        s = env.reset()
        epsilon = (1.0-min_epsilon) * \
            (1.0-float(e)/float(nE)) + min_epsilon
        done = False

        if debug:
            if e % 1000 == 0:
                print("Completed {} episodes".format(e))
                print("Epsilon {}".format(epsilon))

        while not done:
            if render:
                env.render()
                time.sleep(0.25)
            a = utils.epsilon_greedy(q_pi, env, epsilon)(s)
            s_prime, r, done, _ = env.step(a)
            a_target = utils.epsilon_greedy(q_pi, env, 0)(s_prime)

            delta = r + gamma * q_pi[s_prime][a_target] - q_pi[s][a]
            E[s][a] += 1.0

            # Update every recorded state-action pair in proportion to its
            # eligibility trace, then decay the traces.
            for s_i in q_pi:
                for a_i in q_pi[s_i]:
                    q_pi[s_i][a_i] += alpha * delta * E[s_i][a_i]
                    E[s_i][a_i] *= gamma * lamb

            s = s_prime

    return q_pi, utils.epsilon_greedy(q_pi, env, min_epsilon)
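In this example utils.epsilon_greedy returns a policy function rather than an action, as the trailing (s) call shows. A minimal sketch under that assumption (the handling of states with no recorded values is also an assumption):

import random

def epsilon_greedy(q_pi, env, epsilon):
    # Build a policy pi(s) that explores with probability epsilon and is
    # otherwise greedy with respect to the nested-dict value table q_pi.
    def pi(s):
        if random.random() < epsilon or not q_pi[s]:
            return env.action_space.sample()
        return max(q_pi[s], key=q_pi[s].get)
    return pi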
Example #4
def test_procedure(shared_actor, env):
    num_actions = env.action_space.n
    local_actor = nets.Actor(num_actions=num_actions)
    # load parameters from shared models
    begin_time = time.time()
    while True:
        replay_buffer = utils.ReplayBuffer(size=4, frame_history_len=4)
        local_actor.load_state_dict(shared_actor.state_dict())
        obs = env.reset()
        rewards = []
        while True:
            replay_buffer.store_frame(obs)
            states = replay_buffer.encode_recent_observation()

            states = np.expand_dims(states, axis=0) / 255.0 - .5
            logits = local_actor(
                Variable(torch.FloatTensor(states.astype(np.float32))))
            action = utils.epsilon_greedy(logits,
                                          num_actions=env.action_space.n,
                                          epsilon=-1.)
            obs, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                print("Time:{}, computer:{}, agent:{}".format(
                    time.time() - begin_time, sum(np.array(rewards) == -1),
                    sum(np.array(rewards) == 1)))
                break
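Here utils.epsilon_greedy operates on actor logits, and epsilon=-1. makes the random branch unreachable, so the test agent always acts greedily. A minimal sketch matching that usage (the exact signature is an assumption):

import random

def epsilon_greedy(logits, num_actions, epsilon):
    # logits: torch Variable/tensor of shape (1, num_actions).
    # Explore with probability epsilon, otherwise take the argmax of the logits.
    if random.random() < epsilon:
        return random.randrange(num_actions)
    return int(logits.data.max(dim=1)[1][0])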
Example #5
 def decide(self, state):
     eps = max(
         ConfigBrain.BASE_EPSILON,
         1 - (self._age /
              (self.learning_frequency() * ConfigBiology.MATURITY_AGE)))
     brain_actions_prob = self.brain().think(state)
     action_prob = utils.normalize_dist(self.fitrah() + brain_actions_prob)
     decision = utils.epsilon_greedy(eps, action_prob)
     return decision
Example #6
    def act(self, state):
        """ Choose an action according to the behavior policy. Returns the action values, the state as a torch.Tensor, the chosen action, and the action probabilities.

        Params
        ======
            state(array): current state
        """
        q_values, state_t = self.decisionQValues(state)
        action, probs = utils.epsilon_greedy(q_values, self.epsilon)
        return q_values, state_t, action, probs
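This variant of utils.epsilon_greedy returns both the sampled action and the full behaviour-policy probabilities. A minimal sketch, assuming q_values behaves like a 1-D array of action values:

import numpy as np

def epsilon_greedy(q_values, epsilon):
    # Epsilon-greedy distribution: epsilon spread uniformly, the rest on the argmax.
    n = len(q_values)
    probs = np.full(n, epsilon / n)
    probs[int(np.argmax(q_values))] += 1.0 - epsilon
    action = int(np.random.choice(n, p=probs))
    return action, probs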
Example #7
def sarsa(env, pi, nE, alpha=0.01, gamma=1.0, lamb=0.2, min_epsilon=0.01, debug=False, render=False):
    """Calculates Q using the on-policy SARSA(lambda) method with eligibility
    traces.
    """
    q_pi = defaultdict(lambda: defaultdict(float))
    E = defaultdict(lambda: defaultdict(float))

    for e in range(nE):
        done = False
        s = env.reset()
        a = pi(s)
        epsilon = (1.0 - min_epsilon) * \
            (1.0 - float(e) / float(nE)) + min_epsilon

        if debug:
            if e % 1000 == 0:
                print("Completed {} episodes".format(e))
                print("Epsilon: {}".format(epsilon))

        while not done:
            if render:
                env.render()
                time.sleep(0.25)
            s_prime, r, done, _ = env.step(a)
            a_prime = pi(s_prime)

            delta = r + gamma * q_pi[s_prime][a_prime] - q_pi[s][a]
            E[s][a] += 1.0

            # SARSA(lambda) update over all recorded state-action pairs.
            for s_i in q_pi:
                for a_i in q_pi[s_i]:
                    q_pi[s_i][a_i] += alpha * delta * E[s_i][a_i]
                    E[s_i][a_i] *= gamma * lamb

            s = s_prime
            a = a_prime
            pi = utils.epsilon_greedy(q_pi, env, epsilon)

    return q_pi, pi
Example #8
def monte_carlo_control(pi,
                        env,
                        n,
                        gamma=1.0,
                        min_epsilon=0.01,
                        debug=False,
                        render=False):
    """
    Takes an initial policy and performs epsilon-greedy Monte Carlo control,
    decaying epsilon linearly towards min_epsilon over the n episodes.
    """
    q_pi = defaultdict(lambda: defaultdict(float))

    for i in range(n):
        epsilon = (1.0 - min_epsilon) * \
            (1 - float(i) / float(n)) + min_epsilon
        if debug:
            if i % 1000 == 0:
                print("Finished {} episodes".format(i))
                print("Epsilon: {}".format(epsilon))
        G, N = monte_carlo_episode(pi, env, gamma, render)
        q_pi = monte_carlo_step(q_pi, N, G)
        pi = utils.epsilon_greedy(q_pi, env, epsilon)

    return q_pi, pi
Example #9
 def decide(self, state):
     brain_actions_prob = self._brain.think(state)
     action_prob = utils.normalize_dist(
         brain_actions_prob)  # + self.fitrah()
     decision = utils.epsilon_greedy(0, dist=action_prob)
     return decision
Example #10
def learning_thread(shared_actor,
                    shared_critic,
                    shared_actor_optim,
                    shared_critic_optim,
                    exploration=LinearSchedule(1000000, 0.1),
                    gamma=0.99,
                    frame_history_len=4):
    ####
    # 1. build a local model
    # 2. synchronize the shared model parameters and local model
    # 3. choose an action based on observation
    # 4. take an action and get the reward and the next observation
    # 5. calculate the target, and accumulate the gradient
    # 6. update the global model
    ####

    # prepare environment
    env = get_env()
    obs = env.reset()
    num_actions = env.action_space.n
    # prepare local model
    local_actor = nets.Actor(num_actions=num_actions)
    local_critic = nets.Critic()

    # criterion
    criterion = nn.MSELoss(size_average=False)

    # load parameters from shared models
    local_actor.load_state_dict(shared_actor.state_dict())
    local_critic.load_state_dict(shared_critic.state_dict())

    replay_buffer = utils.ReplayBuffer(size=4,
                                       frame_history_len=frame_history_len)

    # Seed the replay buffer with the first observation.
    idx = replay_buffer.store_frame(obs)

    num_n_steps = 4
    for i in itertools.count():
        states = []
        actions = []
        next_states = []
        dones = []
        rewards = []
        # Collect an n-step rollout; the exploration schedule is driven by the
        # outer iteration counter i, so the rollout uses its own loop variable.
        for step in range(num_n_steps):
            replay_buffer.store_frame(obs)
            state_np = replay_buffer.encode_recent_observation()
            state = np.expand_dims(state_np, axis=0) / 255.0 - .5

            state = Variable(torch.from_numpy(state.astype(np.float32)),
                             volatile=True)
            logits = local_actor(state)
            action = utils.epsilon_greedy(logits,
                                          num_actions=num_actions,
                                          epsilon=exploration(i))
            next_obs, reward, done, info = env.step(action)

            replay_buffer.store_frame(next_obs)
            # Store the raw frame stacks so the batched update below can
            # normalize them in one place and compute the gradients.
            states.append(state_np)
            actions.append(action)
            dones.append(done)
            rewards.append(reward)
            next_states.append(replay_buffer.encode_recent_observation())
            obs = next_obs

            if done:
                break
        # compute targets and compute the critic's gradient
        # from numpy to torch.Variable
        cur_states = np.array(states) / 255.0 - .5
        cur_states = Variable(torch.FloatTensor(cur_states.astype(np.float32)))
        next_states = np.array(next_states) / 255.0 - .5
        next_states = Variable(torch.FloatTensor(next_states.astype(
            np.float32)),
                               volatile=True)
        not_done_mask = torch.FloatTensor(1 - np.array(dones).astype(
            dtype=np.float32)).view(-1, 1)
        rewards = torch.FloatTensor(np.array(rewards).astype(
            np.float32)).view(-1, 1)
        values = local_critic(next_states)
        targets = values.data.mul_(not_done_mask).mul_(gamma)
        targets = targets.add_(rewards)
Example #11
                              num_actions=num_actions,
                              name='target_dqn',
                              learning_rate=LR)
    buf = MemoryBuffer(memory_size=BUFFER_SIZE)

    total_episode_rewards = []
    step = 0
    for episode in range(MAX_EPISODE + 1):
        frame = env.reset()  # LazyFrames
        state = np.array(frame)  # narray (84, 84, 4)
        done = False
        cur_episode_reward = 0
        while not done:  # end the episode once done is True
            if step % C == 0:
                target_dqn.copy_from(dqn)  # copy parameters into the target network
            if epsilon_greedy(step):
                action = env.action_space.sample()
            else:
                action = dqn.get_action(state / 255.0)
            # env.render()
            next_frame, reward, done, _ = env.step(action)
            next_state = np.array(next_frame)
            buf.push(state, action, reward, next_state, done)
            state = next_state
            cur_episode_reward += reward

            if buf.size() > MIN_BUFFER:
                states, actions, rewards, next_states, dones = buf.sample(
                    MINI_BATCH)
                next_state_action_values = np.max(target_dqn.predict(
                    next_states / 255.0),
Example #12
    env = GridWorld(path=args.input)
    num_states = env.get_num_states()
    num_actions = len(env.get_action_set())
    num_rows, num_cols = env.get_grid_dimensions()

    # Sarsa(0):
    gamma = 0.95
    step_size = 0.1
    num_steps_episode = []
    for seed in range(args.num_seeds):
        random.seed(seed)
        num_steps_episode.append([])
        q_values = np.zeros((num_states, num_actions))
        for i in range(args.num_episodes):
            s = env.get_current_state()
            a = utils.epsilon_greedy(q_values[s])
            num_steps = 0
            while num_steps < args.max_length_ep and not env.is_terminal():
                r = env.act(env.get_action_set()[a])
                next_s = env.get_current_state()
                next_a = utils.epsilon_greedy(q_values[next_s])

                td_error = r + gamma * q_values[next_s][next_a] - q_values[s][a]
                q_values[s][a] = q_values[s][a] + step_size * td_error

                s = next_s
                a = next_a
                num_steps += 1
            env.reset()
            num_steps_episode[seed].append(num_steps)
Example #13
def double_q_learning(env,
                      nE,
                      alpha=0.01,
                      gamma=0.9,
                      lamb=0.2,
                      min_epsilon=0.01,
                      debug=False):
    """Learns Q using the double Q-learning algorithm:

     choose A using Q_1 + Q_2

     with p=.5 make update to Q_1 using Q_2:

         Q_1(S, A) = Q_1(S, A) + alpha(r + gamma * Q_2(S', pi_t(S')) - Q_1(S, A))

     with p=.5 make update to Q_2 using Q_1:

         Q_2(S, A) = Q_2(S, A) + alpha(r + gamma * Q_1(S', pi_t(S')) - Q_2(S, A))
     """

    q_1 = defaultdict(lambda: defaultdict(float))
    q_2 = defaultdict(lambda: defaultdict(float))
    E = defaultdict(lambda: defaultdict(float))
    flag = True

    for e in range(nE):

        epsilon = (1.0 - min_epsilon) * \
            (1.0 - float(e)/float(nE)) + min_epsilon

        if debug:
            if e % 1000 == 0:
                print("Completed {} episodes".format(e))
                print("Epsilon: {}".format(epsilon))
        done = False
        s = env.reset()

        while not done:
            a = utils.epsilon_greedy({s: Counter(q_1[s]) + Counter(q_2[s])},
                                     env, epsilon)(s)
            s_prime, r, done, _ = env.step(a)

            if np.random.rand() < 0.5:
                a_prime = utils.epsilon_greedy(q_2, env, epsilon=0)(s_prime)
                delta = r + gamma * q_2[s_prime][a_prime] - q_1[s][a]
                flag = True
            else:
                a_prime = utils.epsilon_greedy(q_1, env, epsilon=0)(s_prime)
                delta = r + gamma * q_1[s_prime][a_prime] - q_2[s][a]
                flag = False
            E[s][a] += 1

            # Apply the eligibility-trace update to whichever table was chosen.
            for s_i in q_1:
                for a_i in q_1[s_i]:
                    if flag:
                        q_1[s_i][a_i] += alpha * delta * E[s_i][a_i]
                    else:
                        q_2[s_i][a_i] += alpha * delta * E[s_i][a_i]
                    E[s_i][a_i] *= gamma * lamb

            s = s_prime

    q_pi = utils.combine_nested_dicts(q_1, q_2)
    return q_pi, utils.epsilon_greedy(q_pi, env, epsilon=min_epsilon)
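utils.combine_nested_dicts is not shown either; double Q-learning acts on Q_1 + Q_2, so a plausible, purely hypothetical implementation sums the two nested tables:

from collections import defaultdict

def combine_nested_dicts(q_1, q_2):
    # Hypothetical helper: element-wise sum of two {state: {action: value}} tables.
    q = defaultdict(lambda: defaultdict(float))
    for s in set(q_1) | set(q_2):
        row_1, row_2 = q_1.get(s, {}), q_2.get(s, {})
        for a in set(row_1) | set(row_2):
            q[s][a] = row_1.get(a, 0.0) + row_2.get(a, 0.0)
    return q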
lamb = float(input("Lambda: "))
n_episodes = int(input("Num Episodes: "))
n_episodes_watch = int(input("Num of episodes to watch: "))
should_render = "y" == input("Render (y/n): ")
should_debug = "y" == input("Debug (y/n): ")

algo = int(
    input(
        "Choose algorithm:\n(1)Monte-Carlo\n(2)SARSA\n(3)Q-learning\n(4)Double Q-learning\n(5)Value Iteration\n"
    ))

pi = utils.initial_pi(env)
v_pi = utils_model.value_iteration(env.P, gamma=gamma)
q_v_pi = utils.state_values_to_action_values(v_pi, env)

pi_opt = utils.epsilon_greedy(q_v_pi, env, epsilon=min_epsilon)

if algo == 1:
    q_pi, pi_opt = utils_monte_carlo.monte_carlo_control(
        pi,
        env,
        n_episodes,
        gamma=gamma,
        min_epsilon=min_epsilon,
        debug=should_debug)

elif algo == 2:
    q_pi, pi_opt = utils_sarsa.sarsa(env,
                                     pi,
                                     n_episodes,
                                     gamma=gamma,