Example #1
# Assumed imports for this snippet (TF 1.x graph-mode API). ActorCritic and
# CliffWalkingEnv are project-local classes defined elsewhere.
import os
from datetime import datetime

import numpy as np
import tensorflow as tf

# The bare call below refers to TF 1.x's global-step helper.
get_or_create_global_step = tf.train.get_or_create_global_step


def main():

    env = CliffWalkingEnv()
    sess = tf.Session()
    ac = ActorCritic(env.nA, env.nS)
    sess.run(tf.global_variables_initializer())

    date_str = datetime.now().strftime("%m%d_%H%M%S")
    summaries_dir = os.path.abspath("./summary/ac/" + date_str)
    if not os.path.exists(summaries_dir):
        os.makedirs(summaries_dir)
    summary_writer = tf.summary.FileWriter(summaries_dir,
                                           graph=tf.get_default_graph())

    state = env.reset()
    episode_cnt = 0
    episode_step = 0
    episode_reward = 0.
    while True:

        probs, value = sess.run([ac.probs, ac.value],
                                feed_dict={ac.state: state})
        action = np.random.choice(env.nA, p=probs)
        next_state, reward, done, _ = env.step(action)

        episode_step += 1
        episode_reward += reward

        # One-step TD target r + gamma * V(s') and the advantage used by the actor
        value_next = sess.run(ac.value, feed_dict={ac.state: next_state})
        td_target = reward + 0.99 * value_next
        td_adv = td_target - value

        summary, global_step, _, _ = \
            sess.run([ac.summary, get_or_create_global_step(), ac.train_p, ac.train_v],
                     feed_dict={ac.state: state, ac.action: action,
                                ac.adv: td_adv, ac.target: td_target})

        summary_writer.add_summary(summary, global_step)

        if done or episode_step > 1000:
            print('episode cnt:', episode_cnt, 'episode step:', episode_step,
                  'reward:', episode_reward)
            episode_step = 0
            episode_reward = 0.
            episode_cnt += 1
            state = env.reset()
        else:
            state = next_state
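
Example #1 trains the actor and critic on one-step TD errors: the critic's target is r + 0.99 * V(s') and the actor's advantage is that target minus V(s). A small sketch of the same computation with plain numbers (no TensorFlow; the values are made up):

gamma = 0.99
reward = -1.0          # CliffWalking gives -1 per step
value = -12.4          # V(s) from the critic (made-up)
value_next = -11.0     # V(s') from the critic (made-up)

td_target = reward + gamma * value_next   # -11.89
td_adv = td_target - value                # +0.51: this transition was better than expected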
Example #2
from cliff_walking import CliffWalkingEnv
import sys
from collections import defaultdict
import itertools
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from matplotlib.font_manager import FontProperties
font_set = FontProperties(fname=r"c://windows//fonts//simsun.ttc", size=15)
env = CliffWalkingEnv()


def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype="float") * epsilon / nA
        best_action = np.argmax(Q[observation])  # pick the action with the highest Q-value for this state
        A[best_action] += (1 - epsilon)
        return A

    return policy_fn


def make_twice_epsilon_greedy_policy(Q1, Q2, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype="float") * epsilon / nA
        best_action = np.argmax(Q1[observation] +
                                Q2[observation])  # pick the action with the highest combined Q-value
        A[best_action] += (1 - epsilon)
        return A

    return policy_fn
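
Both factories return a closure that maps a state to a length-nA probability vector; the second greedifies over the sum of two Q-tables, as in Double Q-learning. A minimal usage sketch, assuming the env created above and hypothetical defaultdict Q-tables:

from collections import defaultdict

Q1 = defaultdict(lambda: np.zeros(env.nA))
Q2 = defaultdict(lambda: np.zeros(env.nA))
policy = make_twice_epsilon_greedy_policy(Q1, Q2, epsilon=0.1, nA=env.nA)

state = env.reset()
action_probs = policy(state)                                   # length-nA, sums to 1
action = np.random.choice(np.arange(env.nA), p=action_probs)   # sample an action
next_state, reward, done, _ = env.step(action)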
Example #3
if "../" not in sys.path:
  sys.path.append("../") 

from collections import defaultdict
from cliff_walking import CliffWalkingEnv
import plotting

matplotlib.style.use('ggplot')


origin1 = (0, 0)
des = (49, 36)
env = CliffWalkingEnv(origin1, des)


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.
    
    Args:
        Q: A dictionary that maps from state -> action-values.
            Each value is a numpy array of length nA (see below)
        epsilon: The probability to select a random action. Float between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that takes an observation as its argument and returns the
        probabilities for each action as a numpy array of length nA.
    """
    def policy_fn(observation):
        A = np.ones(nA, dtype="float") * epsilon / nA
        best_action = np.argmax(Q[observation])  # pick the action with the highest Q-value
        A[best_action] += (1 - epsilon)
        return A

    return policy_fn
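
As a quick sanity check on the probabilities this policy returns: with nA = 4 and epsilon = 0.1, every action starts at 0.1 / 4 = 0.025 and the greedy action ends up with 0.025 + 0.9 = 0.925, so the vector sums to 1. A tiny standalone check (the greedy index and values are illustrative):

import numpy as np

probs = np.ones(4) * 0.1 / 4   # [0.025, 0.025, 0.025, 0.025]
probs[2] += 1 - 0.1            # suppose action 2 is greedy -> 0.925
assert np.isclose(probs.sum(), 1.0)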
Example #4
    @classmethod
    def setUpClass(cls):
        np.random.seed(0)
        env = CliffWalkingEnv()
        cls.Q, cls.stats = q_learning(env, 500)
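
This test fixture calls a q_learning(env, num_episodes) helper that returns a Q-table and episode statistics. A hedged sketch of what a helper with that call signature might look like, assuming a defaultdict Q-table, epsilon-greedy exploration, and per-episode rewards as the stats (these defaults and names are assumptions, not the implementation under test):

from collections import defaultdict
import itertools
import numpy as np

def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """Tabular Q-learning; returns the Q-table and per-episode rewards."""
    Q = defaultdict(lambda: np.zeros(env.nA))
    episode_rewards = np.zeros(num_episodes)

    for i in range(num_episodes):
        state = env.reset()
        for t in itertools.count():
            # Epsilon-greedy action selection over the current Q estimates.
            probs = np.ones(env.nA) * epsilon / env.nA
            probs[np.argmax(Q[state])] += 1 - epsilon
            action = np.random.choice(np.arange(env.nA), p=probs)

            next_state, reward, done, _ = env.step(action)
            episode_rewards[i] += reward

            # Off-policy TD(0) update toward the greedy bootstrap value.
            td_target = reward + discount_factor * np.max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])

            if done:
                break
            state = next_state

    return Q, episode_rewards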
Example #5
        # Decay agent epsilon
        # agent.epsilon = ?
        s = new_s
    return total_reward


def getActionRange(state):
    return range(env.nA)


if __name__ == '__main__':
    max_iterations = 100000000000
    visualize = True
    # Create Taxi-v2 env
    # env = gym.make('Taxi-v2')
    env = CliffWalkingEnv()
    env.reset()
    env.render()

    n_states = env.nS
    n_actions = env.nA

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create a Q-learning agent with these hyperparameters
    alpha = 0.5
    get_legal_actions = lambda s: range(n_actions)
    epsilon = 0.2
    discount = 0.99

    agent = QLearningAgent(alpha, epsilon, discount, getActionRange)
Example #6
    for step in range(t_max):
        a = agent.get_action(state)
        new_state, reward, done, _ = env.step(a)
        agent.update(state, a, new_state, reward)
        total_reward += reward
        state = new_state
        if done:
            break

    return total_reward


if __name__ == '__main__':
    max_iterations = 5000
    visualize = False
    env = CliffWalkingEnv()
    env.reset()
    # env.render()

    n_states = env.nS
    n_actions = env.nA

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create a Q-learning agent with these hyperparameters
    alpha = 0.5
    epsilon = 0.2
    epsilon_threshold = 0.1
    discount = 0.99
    get_legal_actions = lambda s: range(n_actions)
    epsilon_ratio = 0.99
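
The snippet sets epsilon_threshold and epsilon_ratio but ends before they are used; together with the "# Decay agent epsilon" placeholder in Example #5, the usual pattern is a multiplicative decay clamped from below. A hedged sketch of that training loop, where play_and_train stands in for the episode helper shown only as fragments above:

    agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)

    for i in range(max_iterations):
        total_reward = play_and_train(env, agent)   # assumed episode helper
        # Multiplicative epsilon decay, never dropping below the floor.
        agent.epsilon = max(epsilon_threshold, agent.epsilon * epsilon_ratio)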
Example #7
        a = agent.get_action(s)
        new_s, r, is_done, _ = env.step(a)
        # SARSA-style update: pass the action sampled for the next state as well
        agent.update(s, a, new_s, r, agent.get_action(new_s))
        total_reward += r
        s = new_s
        if is_done:
            break
    return total_reward


if __name__ == '__main__':
    max_iterations = 1000
    visualize = True
    # Create Taxi-v2 env
    env = CliffWalkingEnv()

    n_states = env.nS
    n_actions = env.nA

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create Q-learning and SARSA agents with these hyperparameters
    alpha = 0.1
    get_legal_actions = lambda s: range(n_actions)
    epsilon = 0.5
    discount = 0.99

    agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)
    s_agent = SarsaAgent(alpha, epsilon, discount, get_legal_actions)
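
Example #7 builds both a QLearningAgent and a SarsaAgent with the same hyperparameters; the difference lies only in the bootstrap term of the TD update. A hedged sketch of the two update rules as they might look inside such agents (GAMMA, ALPHA, and the function names are illustrative, not the repository's actual agent API):

from collections import defaultdict
import numpy as np

GAMMA = 0.99
ALPHA = 0.1

def q_learning_update(Q, s, a, r, s_next):
    # Off-policy: bootstrap on the best action in the next state.
    Q[s][a] += ALPHA * (r + GAMMA * np.max(Q[s_next]) - Q[s][a])

def sarsa_update(Q, s, a, r, s_next, a_next):
    # On-policy: bootstrap on the action the behaviour policy actually takes next.
    Q[s][a] += ALPHA * (r + GAMMA * Q[s_next][a_next] - Q[s][a])

Q = defaultdict(lambda: np.zeros(4))        # toy Q-table with 4 actions
q_learning_update(Q, s=36, a=0, r=-1.0, s_next=24)
sarsa_update(Q, s=36, a=0, r=-1.0, s_next=24, a_next=1)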