def sarsa(self, num_iter, alpha, epsilon):
    # Define the action-value function as a dict and initialize it to 0.
    qfunc = dict()
    for s in self.states:
        for a in self.actions:
            qfunc['%d_%s' % (s, a)] = 0.0
    # Explore the environment for num_iter episodes.
    for _ in range(num_iter):
        # Randomly pick an initial state and action.
        state = self.states[int(random.random() * len(self.states))]
        action = self.actions[int(random.random() * len(self.actions))]
        is_terminal, count = False, 0
        while not is_terminal and count < 100:
            policy = "%d_%s" % (state, action)
            is_terminal, next_state, reward = self.env.transform1(state, action)
            # The next action is chosen by epsilon_greedy; unlike Q-learning,
            # SARSA evaluates the action the behavior policy actually takes.
            next_action = epsilon_greedy(qfunc, next_state, self.actions, epsilon)
            next_policy = "%d_%s" % (next_state, next_action)
            # On-policy SARSA update of the action-value function.
            qfunc[policy] = qfunc[policy] + alpha * (
                reward + self.gamma * qfunc[next_policy] - qfunc[policy])
            # Move to the next state and reuse next_action so the action taken
            # matches the one used in the update (behavior policy: epsilon-greedy).
            state, action = next_state, next_action
            count += 1
    return qfunc
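# NOTE: The sarsa() method above relies on an epsilon_greedy helper that is not
# shown. The following is a minimal sketch of such a helper, assuming the same
# '%d_%s' key convention; the body is an illustrative assumption, not the
# original implementation.
import random

def epsilon_greedy(qfunc, state, actions, epsilon):
    # Hypothetical helper: with probability epsilon take a random action,
    # otherwise take the action with the highest Q-value for this state.
    if random.random() < epsilon:
        return actions[int(random.random() * len(actions))]
    return max(actions, key=lambda a: qfunc['%d_%s' % (state, a)])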
def SARSA(env, num_episodes, gamma, lr, e):
    """
    Implement the SARSA algorithm with epsilon-greedy exploration.

    Inputs:
        env: OpenAI Gym environment
            env.P: dictionary
                P[state][action] is a list of tuples
                (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                Number of states.
            env.nA: int
                Number of actions.
        num_episodes: int
            Number of episodes of training.
        gamma: float
            Discount factor.
        lr: float
            Learning rate.
        e: float
            Epsilon value used in the epsilon-greedy method.

    Outputs:
        Q: numpy.ndarray
            State-action values.
    """
    Q = np.zeros((env.nS, env.nA))
    # TIP: Call epsilon_greedy without setting the seed.
    # Choose the first state of each episode randomly for exploration.
    ############################
    # YOUR CODE STARTS HERE
    for i in range(num_episodes):
        # Force the episode to start from a uniformly chosen state by
        # overwriting the environment's initial state distribution.
        state = np.random.randint(0, env.nS)
        env.isd = [1 if j == state else 0 for j in range(env.nS)]
        env.reset()

        terminal = False
        action = epsilon_greedy(Q[state], e)
        while not terminal:
            next_state, reward, terminal, prob = env.step(action)
            new_action = epsilon_greedy(Q[next_state], e)
            # On-policy TD(0) update using the action actually selected next.
            Q[state, action] += lr * (
                reward + gamma * Q[next_state, new_action] - Q[state, action])
            state, action = next_state, new_action
    # YOUR CODE ENDS HERE
    ############################
    return Q
def q_learning(env, nE, min_epsilon=0.01, alpha=0.01, gamma=0.9, lamb=0.2,
               debug=False, render=False):
    """
    Updates the state-action values using the Q-learning algorithm with
    eligibility traces:
        Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s', pi_target(s')) - Q(s,a))
    """
    q_pi = defaultdict(lambda: defaultdict(float))
    E = defaultdict(lambda: defaultdict(float))
    for e in range(nE):
        s = env.reset()
        # Linearly anneal epsilon from 1.0 down to min_epsilon.
        epsilon = (1.0 - min_epsilon) * \
            (1.0 - float(e) / float(nE)) + min_epsilon
        done = False
        if debug:
            if e % 1000 == 0:
                print("Completed {} episodes".format(e))
                print("Epsilon: {}".format(epsilon))
        while not done:
            if render:
                env.render()
                time.sleep(0.25)
            # Behavior policy: epsilon-greedy; target policy: greedy (epsilon=0).
            a = utils.epsilon_greedy(q_pi, env, epsilon)(s)
            s_prime, r, done, _ = env.step(a)
            a_target = utils.epsilon_greedy(q_pi, env, 0)(s_prime)
            delta = r + gamma * q_pi[s_prime][a_target] - q_pi[s][a]
            E[s][a] += 1.0
            # Propagate the TD error along the eligibility traces.
            for s_ in q_pi:
                for a_ in q_pi[s_]:
                    q_pi[s_][a_] += alpha * delta * E[s_][a_]
                    E[s_][a_] *= gamma * lamb
            s = s_prime
    return q_pi, utils.epsilon_greedy(q_pi, env, min_epsilon)
def test_procedure(shared_actor, env):
    num_actions = env.action_space.n
    local_actor = nets.Actor(num_actions=num_actions)
    begin_time = time.time()
    while True:
        replay_buffer = utils.ReplayBuffer(size=4, frame_history_len=4)
        # load parameters from the shared model
        local_actor.load_state_dict(shared_actor.state_dict())
        obs = env.reset()
        rewards = []
        while True:
            replay_buffer.store_frame(obs)
            states = replay_buffer.encode_recent_observation()
            states = np.expand_dims(states, axis=0) / 255.0 - .5
            logits = local_actor(
                Variable(torch.FloatTensor(states.astype(np.float32))))
            # epsilon=-1 presumably disables exploration, so the action is
            # always the greedy one under the current actor.
            action = utils.epsilon_greedy(logits,
                                          num_actions=env.action_space.n,
                                          epsilon=-1.)
            obs, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                print("Time:{}, computer:{}, agent:{}".format(
                    time.time() - begin_time,
                    sum(np.array(rewards) == -1),
                    sum(np.array(rewards) == 1)))
                break
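# NOTE: This snippet (and the learning thread further below) calls
# utils.epsilon_greedy with actor logits, an action count, and an epsilon;
# a negative epsilon, as used here, would make the choice always greedy.
# A minimal sketch under those assumptions; it is not the original utils code.
import numpy as np

def epsilon_greedy(logits, num_actions, epsilon):
    # Random action with probability epsilon, otherwise the argmax of the logits.
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)
    return int(logits.data.cpu().numpy().argmax())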
def decide(self, state):
    eps = max(
        ConfigBrain.BASE_EPSILON,
        1 - (self._age / (self.learning_frequency() * ConfigBiology.MATURITY_AGE)))
    brain_actions_prob = self.brain().think(state)
    action_prob = utils.normalize_dist(self.fitrah() + brain_actions_prob)
    decision = utils.epsilon_greedy(eps, action_prob)
    return decision
def act(self, state):
    """
    Choose an action according to the behavior policy.
    Returns the action values, the state as a torch.Tensor, the chosen action,
    and the action probabilities.

    Params
    ======
        state (array): current state
    """
    q_values, state_t = self.decisionQValues(state)
    action, probs = utils.epsilon_greedy(q_values, self.epsilon)
    return q_values, state_t, action, probs
def sarsa(env, pi, nE, alpha=0.01, gamma=1.0, lamb=0.2, min_epsilon=0.01,
          debug=False, render=False):
    """Calculates Q using the on-policy SARSA(lambda) method."""
    q_pi = defaultdict(lambda: defaultdict(float))
    E = defaultdict(lambda: defaultdict(float))
    for e in range(nE):
        done = False
        s = env.reset()
        a = pi(s)
        # Linearly anneal epsilon from 1.0 down to min_epsilon.
        epsilon = (1.0 - min_epsilon) * \
            (1.0 - float(e) / float(nE)) + min_epsilon
        if debug:
            if e % 1000 == 0:
                print("Completed {} episodes".format(e))
                print("Epsilon: {}".format(epsilon))
        while not done:
            if render:
                env.render()
                time.sleep(0.25)
            s_prime, r, done, _ = env.step(a)
            a_prime = pi(s_prime)
            delta = r + gamma * q_pi[s_prime][a_prime] - q_pi[s][a]
            E[s][a] += 1.0
            # Propagate the TD error along the eligibility traces.
            for s_ in q_pi:
                for a_ in q_pi[s_]:
                    q_pi[s_][a_] += alpha * delta * E[s_][a_]
                    E[s_][a_] *= gamma * lamb
            s = s_prime
            a = a_prime
            # Re-derive the epsilon-greedy behavior policy from the updated Q.
            pi = utils.epsilon_greedy(q_pi, env, epsilon)
    return q_pi, pi
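# NOTE: The tabular snippets above and below call utils.epsilon_greedy(q_pi, env,
# epsilon) and then invoke the result on a state, so the helper presumably
# returns a policy function. A minimal sketch under that assumption; the body is
# illustrative, not the project's actual utils implementation.
import numpy as np

def epsilon_greedy(q_pi, env, epsilon):
    def pi(s):
        # Explore with probability epsilon (or when the state is unseen),
        # otherwise act greedily with respect to the nested-dict Q-values.
        if np.random.rand() < epsilon or not q_pi[s]:
            return env.action_space.sample()
        return max(q_pi[s], key=q_pi[s].get)
    return pi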
def monte_carlo_control(pi, env, n, gamma=1.0, min_epsilon=0.01, debug=False,
                        render=False):
    """
    Takes a policy and performs the Monte Carlo control update.
    """
    q_pi = defaultdict(lambda: defaultdict(float))
    for i in range(n):
        # Linearly anneal epsilon from 1.0 down to min_epsilon.
        epsilon = (1.0 - min_epsilon) * \
            (1 - float(i) / float(n)) + min_epsilon
        if debug:
            if i % 1000 == 0:
                print("Finished {} episodes".format(i))
                print("Epsilon: {}".format(epsilon))
        G, N = monte_carlo_episode(pi, env, gamma, render)
        q_pi = monte_carlo_step(q_pi, N, G)
        # Policy improvement: epsilon-greedy with respect to the updated Q.
        pi = utils.epsilon_greedy(q_pi, env, epsilon)
    return q_pi, pi
def decide(self, state):
    brain_actions_prob = self._brain.think(state)
    action_prob = utils.normalize_dist(brain_actions_prob)  # + self.fitrah()
    decision = utils.epsilon_greedy(0, dist=action_prob)
    return decision
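# NOTE: In the decide() methods above, utils.epsilon_greedy takes an epsilon and
# a probability distribution over actions. A minimal sketch consistent with
# those calls; the sampling behavior is an assumption, not the original helper.
import numpy as np

def epsilon_greedy(epsilon, dist):
    # With probability epsilon choose a uniformly random action, otherwise
    # choose the most probable action under the given distribution.
    dist = np.asarray(dist)
    if np.random.rand() < epsilon:
        return np.random.randint(len(dist))
    return int(np.argmax(dist))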
def learning_thread(shared_actor, shared_critic, shared_actor_optim,
                    shared_critic_optim,
                    exploration=LinearSchedule(1000000, 0.1),
                    gamma=0.99,
                    frame_history_len=4):
    ####
    # 1. build a local model
    # 2. synchronize the shared model parameters and the local model
    # 3. choose an action based on the observation
    # 4. take the action and get the reward and the next observation
    # 5. calculate the target and accumulate the gradient
    # 6. update the global model
    ####
    # prepare environment
    env = get_env()
    obs = env.reset()
    num_actions = env.action_space.n

    # prepare local models
    local_actor = nets.Actor(num_actions=num_actions)
    local_critic = nets.Critic()

    # criterion
    criterion = nn.MSELoss(size_average=False)

    # load parameters from the shared models
    local_actor.load_state_dict(shared_actor.state_dict())
    local_critic.load_state_dict(shared_critic.state_dict())

    replay_buffer = utils.ReplayBuffer(size=4, frame_history_len=frame_history_len)
    # Store the initial frame once; every subsequent frame is stored right
    # after env.step() below, so each frame enters the buffer exactly once.
    replay_buffer.store_frame(obs)
    num_n_steps = 4

    for i in itertools.count():
        states = []
        actions = []
        next_states = []
        dones = []
        rewards = []
        # Collect an n-step rollout.
        for _ in range(num_n_steps):
            state = replay_buffer.encode_recent_observation()
            # Keep the raw stacked frames; they are normalized in the batched
            # update below.
            states.append(state)
            state_in = np.expand_dims(state, axis=0) / 255.0 - .5
            state_in = Variable(torch.from_numpy(state_in.astype(np.float32)),
                                volatile=True)
            logits = local_actor(state_in)
            # Use the outer iteration counter for the exploration schedule so
            # epsilon keeps annealing over the whole run.
            action = utils.epsilon_greedy(logits,
                                          num_actions=num_actions,
                                          epsilon=exploration(i))
            obs, reward, done, info = env.step(action)
            replay_buffer.store_frame(obs)
            # store the transition pieces needed for the gradient computation
            actions.append(action)
            dones.append(done)
            rewards.append(reward)
            next_states.append(replay_buffer.encode_recent_observation())
            if done:
                # Start a fresh episode for the next rollout.
                obs = env.reset()
                replay_buffer.store_frame(obs)
                break

        # compute targets for the critic's update
        # from numpy to torch.Variable
        cur_states = np.array(states) / 255.0 - .5
        cur_states = Variable(torch.FloatTensor(cur_states.astype(np.float32)))
        next_states = np.array(next_states) / 255.0 - .5
        next_states = Variable(torch.FloatTensor(next_states.astype(np.float32)),
                               volatile=True)
        not_done_mask = torch.FloatTensor(
            1 - np.array(dones).astype(np.float32)).view(-1, 1)
        rewards = torch.FloatTensor(
            np.array(rewards).astype(np.float32)).view(-1, 1)
        values = local_critic(next_states)
        # Bootstrap targets: r + gamma * V(s') for non-terminal transitions.
        targets = values.data.mul_(not_done_mask).mul_(gamma)
        targets = targets.add_(rewards)
num_actions=num_actions, name='target_dqn', learning_rate=LR)
buf = MemoryBuffer(memory_size=BUFFER_SIZE)

total_episode_rewards = []
step = 0
for episode in range(MAX_EPISODE + 1):
    frame = env.reset()      # LazyFrames
    state = np.array(frame)  # ndarray of shape (84, 84, 4)
    done = False
    cur_episode_reward = 0
    while not done:  # end the episode when done
        if step % C == 0:
            target_dqn.copy_from(dqn)  # copy parameters to the target network
        if epsilon_greedy(step):
            action = env.action_space.sample()
        else:
            action = dqn.get_action(state / 255.0)
        # env.render()
        next_frame, reward, done, _ = env.step(action)
        next_state = np.array(next_frame)
        buf.push(state, action, reward, next_state, done)
        state = next_state
        cur_episode_reward += reward
        if buf.size() > MIN_BUFFER:
            states, actions, rewards, next_states, dones = buf.sample(MINI_BATCH)
            next_state_action_values = np.max(target_dqn.predict(next_states / 255.0),
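# NOTE: In the loop above, epsilon_greedy(step) is used as a boolean test, so it
# presumably decides whether to take a random action given the current step of
# an annealing schedule. A minimal sketch under that assumption; the schedule
# constants are hypothetical, not taken from the original script.
import random

def epsilon_greedy(step, eps_start=1.0, eps_end=0.1, anneal_steps=1000000):
    # Linearly anneal epsilon from eps_start to eps_end over anneal_steps,
    # then return True when a random exploratory action should be taken.
    frac = min(1.0, step / float(anneal_steps))
    epsilon = eps_start + frac * (eps_end - eps_start)
    return random.random() < epsilon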
env = GridWorld(path=args.input)
num_states = env.get_num_states()
num_actions = len(env.get_action_set())
num_rows, num_cols = env.get_grid_dimensions()

# Sarsa(0):
gamma = 0.95
step_size = 0.1
num_steps_episode = []
for seed in range(args.num_seeds):
    random.seed(seed)
    num_steps_episode.append([])
    q_values = np.zeros((num_states, num_actions))
    for i in range(args.num_episodes):
        s = env.get_current_state()
        a = utils.epsilon_greedy(q_values[s])
        num_steps = 0
        while num_steps < args.max_length_ep and not env.is_terminal():
            r = env.act(env.get_action_set()[a])
            next_s = env.get_current_state()
            next_a = utils.epsilon_greedy(q_values[next_s])
            td_error = r + gamma * q_values[next_s][next_a] - q_values[s][a]
            q_values[s][a] = q_values[s][a] + step_size * td_error
            s = next_s
            a = next_a
            num_steps += 1
        env.reset()
        num_steps_episode[seed].append(num_steps)
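# NOTE: Here utils.epsilon_greedy is called with a single row of Q-values and no
# explicit epsilon, so the exploration rate is presumably fixed inside the
# helper. A minimal sketch under that assumption; the epsilon value and the
# tie-breaking rule are guesses, not the original utils code.
import numpy as np

def epsilon_greedy(q_values_for_state, epsilon=0.1):
    # With probability epsilon pick a uniformly random action index, otherwise
    # pick one of the greedy action indices, breaking ties at random.
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values_for_state))
    best = np.flatnonzero(q_values_for_state == q_values_for_state.max())
    return int(np.random.choice(best))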
def double_q_learning(env, nE, alpha=0.01, gamma=0.9, lamb=0.2,
                      min_epsilon=0.01, debug=False):
    """Learns Q using the double Q-learning algorithm:
        choose A using Q_1 + Q_2
        with p=.5 update Q_1 using Q_2:
            Q_1(S, A) += alpha * (r + gamma * Q_2(S', pi_t(S')) - Q_1(S, A))
        with p=.5 update Q_2 using Q_1:
            Q_2(S, A) += alpha * (r + gamma * Q_1(S', pi_t(S')) - Q_2(S, A))
    """
    q_1 = defaultdict(lambda: defaultdict(float))
    q_2 = defaultdict(lambda: defaultdict(float))
    E = defaultdict(lambda: defaultdict(float))
    flag = True
    for e in range(nE):
        # Linearly anneal epsilon from 1.0 down to min_epsilon.
        epsilon = (1.0 - min_epsilon) * \
            (1.0 - float(e) / float(nE)) + min_epsilon
        if debug:
            if e % 1000 == 0:
                print("Completed {} episodes".format(e))
                print("Epsilon: {}".format(epsilon))
        done = False
        s = env.reset()
        while not done:
            # Behave epsilon-greedily with respect to Q_1 + Q_2.
            a = utils.epsilon_greedy({s: Counter(q_1[s]) + Counter(q_2[s])},
                                     env, epsilon)(s)
            s_prime, r, done, _ = env.step(a)
            if np.random.rand() < 0.5:
                # Update Q_1: pick a' greedily from Q_1 and evaluate it with Q_2
                # (the double Q-learning decoupling).
                a_prime = utils.epsilon_greedy(q_1, env, epsilon=0)(s_prime)
                delta = r + gamma * q_2[s_prime][a_prime] - q_1[s][a]
                flag = True
            else:
                # Update Q_2: pick a' greedily from Q_2 and evaluate it with Q_1.
                a_prime = utils.epsilon_greedy(q_2, env, epsilon=0)(s_prime)
                delta = r + gamma * q_1[s_prime][a_prime] - q_2[s][a]
                flag = False
            E[s][a] += 1
            for s_ in q_1:
                for a_ in q_1[s_]:
                    if flag:
                        q_1[s_][a_] += alpha * delta * E[s_][a_]
                    else:
                        q_2[s_][a_] += alpha * delta * E[s_][a_]
                    E[s_][a_] *= gamma * lamb
            s = s_prime
    q_pi = utils.combine_nested_dicts(q_1, q_2)
    return q_pi, utils.epsilon_greedy(q_pi, env, epsilon=min_epsilon)
lamb = float(input("Lambda: "))
n_episodes = int(input("Num Episodes: "))
n_episodes_watch = int(input("Num of episodes to watch: "))
should_render = "y" == input("Render (y/n): ")
should_debug = "y" == input("Debug (y/n): ")
algo = int(
    input(
        "Choose algorithm:\n(1)Monte-Carlo\n(2)SARSA\n(3)Q-learning\n(4)Double Q-learning\n(5)Value Iteration\n"
    ))

pi = utils.initial_pi(env)
v_pi = utils_model.value_iteration(env.P, gamma=gamma)
q_v_pi = utils.state_values_to_action_values(v_pi, env)
pi_opt = utils.epsilon_greedy(q_v_pi, env, epsilon=min_epsilon)

if algo == 1:
    q_pi, pi_opt = utils_monte_carlo.monte_carlo_control(
        pi,
        env,
        n_episodes,
        gamma=gamma,
        min_epsilon=min_epsilon,
        debug=should_debug)
elif algo == 2:
    q_pi, pi_opt = utils_sarsa.sarsa(env,
                                     pi,
                                     n_episodes,
                                     gamma=gamma,