def main():
    """Train an actor-critic agent on CliffWalking, logging to TensorBoard.

    Runs forever: samples actions from the actor's softmax policy, computes
    a one-step TD target from the critic, and applies one policy update and
    one value update per environment step. Episodes are also truncated after
    1000 steps to avoid unbounded rollouts.
    """
    env = CliffWalkingEnv()
    sess = tf.Session()
    ac = ActorCritic(env.nA, env.nS)
    sess.run(tf.global_variables_initializer())

    # One timestamped summary directory per run so runs don't overwrite.
    date_str = datetime.now().strftime("%m%d_%H%M%S")
    summaries_dir = os.path.abspath("./summary/ac/" + date_str)
    if not os.path.exists(summaries_dir):
        os.makedirs(summaries_dir)
    summary_writer = tf.summary.FileWriter(summaries_dir,
                                           graph=tf.get_default_graph())

    gamma = 0.99  # discount factor for the one-step TD target
    state = env.reset()
    episode_cnt = 0
    episode_step = 0
    episode_reward = 0.
    while 1:
        # Sample an action from the actor's current policy distribution.
        probs, value = sess.run([ac.probs, ac.value],
                                feed_dict={ac.state: state})
        action = np.random.choice(env.nA, p=probs)
        next_state, reward, done, _ = env.step(action)
        episode_step += 1
        episode_reward += reward

        value_next = sess.run(ac.value, feed_dict={ac.state: next_state})
        # Bug fix: do not bootstrap past a terminal state -- the value of a
        # terminal successor is 0 by definition, so the target is just the
        # reward when the episode actually ended.
        td_target = reward if done else reward + gamma * value_next
        td_adv = td_target - value

        summary, global_step, _, _ = \
            sess.run([ac.summary, get_or_create_global_step(),
                      ac.train_p, ac.train_v],
                     feed_dict={ac.state: state, ac.action: action,
                                ac.adv: td_adv, ac.target: td_target})
        summary_writer.add_summary(summary, global_step)

        if done or episode_step > 1000:
            # Bug fix: typo 'eoisode step' -> 'episode step' in the log line.
            print('episode cnt:', episode_cnt,
                  'episode step:', episode_step,
                  'reward:', episode_reward)
            episode_step = 0  # bug fix: step counter is an int, not 0.
            episode_reward = 0.
            episode_cnt += 1
            state = env.reset()
        else:
            state = next_state
from cliff_walking import CliffWalkingEnv
import sys
from collections import defaultdict
import itertools
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from matplotlib.font_manager import FontProperties

# Font used for CJK labels in matplotlib plots (Windows SimSun).
font_set = FontProperties(fname=r"c://windows//fonts//simsun.ttc", size=15)

env = CliffWalkingEnv()


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Build an epsilon-greedy policy from a single action-value table.

    Returns a function mapping an observation to a length-nA probability
    vector: the greedy action receives 1 - epsilon + epsilon/nA, every
    other action receives epsilon/nA.
    """
    def policy_fn(observation):
        probs = np.full(nA, epsilon / nA)
        # Select the action with the largest estimated value in the Q table.
        greedy = np.argmax(Q[observation])
        probs[greedy] += (1 - epsilon)
        return probs
    return policy_fn


def make_twice_epsilon_greedy_policy(Q1, Q2, epsilon, nA):
    """Build an epsilon-greedy policy from the SUM of two action-value
    tables (as used by double Q-learning).

    Same distribution shape as make_epsilon_greedy_policy, but the greedy
    action maximizes Q1[s] + Q2[s].
    """
    def policy_fn(observation):
        probs = np.full(nA, epsilon / nA)
        # Greedy action with respect to the combined Q estimates.
        greedy = np.argmax(Q1[observation] + Q2[observation])
        probs[greedy] += (1 - epsilon)
        return probs
    return policy_fn
# Notebook-style path hack: make the parent directory importable.
if "../" not in sys.path:
    sys.path.append("../")
from collections import defaultdict
from cliff_walking import CliffWalkingEnv
import plotting

# NOTE(review): bare `matplotlib` is used here but not imported in this
# chunk -- presumably imported earlier in the file; verify.
matplotlib.style.use('ggplot')


# In[15]:

origin1 = (0, 0)  # start cell of the grid
des=(49, 36)      # destination cell -- TODO confirm the grid is large enough
env = CliffWalkingEnv(origin1, des)


# In[16]:

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.

    Args:
        Q: A dictionary that maps from state -> action-values.
            Each value is a numpy array of length nA (see below)
        epsilon: The probability to select a random action. Float between 0 and 1.
        nA: Number of actions in the environment.
def setUpClass(cls):
    """Run Q-learning once for the whole test class.

    Seeds NumPy so the learned table is reproducible, trains for 500
    episodes on a fresh CliffWalking environment, and stores the resulting
    Q table and training stats on the class for the tests to inspect.
    """
    np.random.seed(0)
    environment = CliffWalkingEnv()
    cls.Q, cls.stats = q_learning(environment, 500)
# Decay agent epsilon # agent.epsilon = ? s = new_s return total_reward def getActionRange(state): return range(env.nA) if __name__ == '__main__': max_iterations = 100000000000 visualize = True # Create Taxi-v2 env # env = gym.make('Taxi-v2') env = CliffWalkingEnv() env.reset() env.render() n_states = env.nS n_actions = env.nA print('States number = %i, Actions number = %i' % (n_states, n_actions)) # create q learning agent with alpha = 0.5 get_legal_actions = lambda s: range(n_actions) epsilon = 0.2 discount = 0.99 agent = QLearningAgent(alpha, epsilon, discount, getActionRange)
# NOTE(review): this is the tail of a play/train episode function whose
# header lies outside this chunk; `agent`, `env`, `state`, `total_reward`
# and `t_max` are bound earlier in that function.
    for step in range(t_max):
        # Let the agent pick an action from its current policy.
        a = agent.get_action(state)
        new_state, reward, done, _ = env.step(a)
        # Q-learning style update on the (s, a, s', r) transition.
        agent.update(state, a, new_state, reward)
        total_reward += reward
        state = new_state
        if done:
            break
    return total_reward


if __name__ == '__main__':
    max_iterations = 5000
    visualize = False

    env = CliffWalkingEnv()
    env.reset()
    # env.render()
    n_states = env.nS
    n_actions = env.nA
    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create q learning agent with alpha = 0.5
    epsilon = 0.2
    epsilon_threshold = 0.1  # presumably the floor for epsilon decay -- verify against the loop below
    discount = 0.99
    get_legal_actions = lambda s: range(n_actions)
    epsilon_ratio = 0.99  # presumably a per-episode multiplicative decay factor -- verify
a = agent.get_action(s) new_s, r, is_done, _ = env.step(a) # Update rewards agent.update(s, a, new_s, r, agent.get_action(new_s)) total_reward += r s = new_s if is_done: break return total_reward if __name__ == '__main__': max_iterations = 1000 visualize = True # Create Taxi-v2 env env = CliffWalkingEnv() n_states = env.nS n_actions = env.nA print('States number = %i, Actions number = %i' % (n_states, n_actions)) # create q learning agent with alpha = 0.1 get_legal_actions = lambda s: range(n_actions) epsilon = 0.5 discount = 0.99 agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions) s_agent = SarsaAgent(alpha, epsilon, discount, get_legal_actions)