def policy_mc_iterate(n_iter=10, start_from=None, cross_entropy_call=None):
    """Off-policy Monte Carlo control with weighted importance sampling for the gambler's problem."""
    # init random policy pi[a, s]: uniform over the legal bets 1..s in each state s
    pi = np.zeros((n + 1, n + 1))
    for s in range(1, n + 1):
        pi[1:s + 1, s] = np.ones(s) * (1. / s)
    if cross_entropy_call is None:
        # init uniform policy mu[a, s] that will be the behavior policy
        mu = np.zeros((n + 1, n + 1))
        for s in range(1, n + 1):
            mu[1:s + 1, s] = np.ones(s) * (1. / s)
    else:
        mu = cross_entropy_call
    # TODO: assert that pi and mu sum to 1 over actions
    q = np.zeros((n, n + 1))
    C = np.zeros((n, n + 1))  # cumulative importance-sampling weights
    for i in range(1, n_iter):
        if cross_entropy_call is None:
            print("EPISODE", i)
            if i % 1000 == 0:
                # periodically refresh the behavior policy from the current greedy policy
                _, mu = policy_mc_iterate(n_iter=1000, start_from=None, cross_entropy_call=mu)
                mu = epsilon_greedify(mu, 0.05)
            states, actions, rewards = generate_episode(mu, p_h, n, start_from=start_from)
        else:
            start = ((i + 1) % n + 1, None)
            states, actions, rewards = generate_episode(mu, p_h, n, start)
        # print("states", states)
        # print("actions", actions)
        # print("rewards:", rewards)
        G = 0
        W = 1
        # walk the episode backwards, updating q with weighted importance sampling
        for t in reversed(range(0, len(states) - 1)):
            a = actions[t]
            s = states[t]
            G = G + rewards[t + 1]
            C[a, s] = C[a, s] + W
            q[a, s] = q[a, s] + (W / C[a, s]) * (G - q[a, s])
            # greedify: make pi deterministic on the best bet
            pi[:, s] = np.zeros(n + 1)
            action_max = np.argmax(q[1:, s]) + 1  # add 1 because we throw away bet 0
            pi[action_max, s] = 1
            if action_max != a:
                # pi[a, s] = 0 for any non-greedy action, so the importance weight would drop to 0
                break
            W = W / mu[a, s]
    return q, pi
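Both gambler's-problem snippets in this collection (the one above and the epsilon-greedy variant further below) call generate_episode and epsilon_greedify, which are not included here. Below is a minimal sketch of what they might look like, inferred from the call sites; the terminal-reward convention, the bet range 1..s, and treating start_from as a starting capital are assumptions (the cross-entropy branch above passes a tuple, so the real helper likely handles more).

import numpy as np

def generate_episode(mu, p_h, n, start_from=None):
    """Roll out one gambler's-problem episode under behavior policy mu[a, s].

    Returns (states, actions, rewards) aligned so that actions[t] is the bet made
    in states[t] and rewards[t + 1] is the reward that followed, matching the
    indexing used by the Monte Carlo updates above.
    """
    s = np.random.randint(1, n) if start_from is None else start_from
    states, actions, rewards = [s], [], [0]
    while 0 < s < n:
        a = np.random.choice(n + 1, p=mu[:, s])   # sample a bet from mu(.|s)
        actions.append(a)
        if np.random.rand() < p_h:                # coin comes up heads
            s = min(s + a, n)                     # cap at the goal
        else:
            s = s - a
        rewards.append(1 if s == n else 0)        # +1 only for reaching the goal
        states.append(s)
    return states, actions, rewards

def epsilon_greedify(pi, epsilon):
    """Behavior policy that follows pi's greedy bet with probability 1 - epsilon
    and otherwise picks uniformly among the legal bets 1..s."""
    n = pi.shape[1] - 1
    mu = np.zeros_like(pi, dtype=float)
    for s in range(1, n + 1):
        mu[1:s + 1, s] = epsilon / s
        greedy = np.argmax(pi[1:s + 1, s]) + 1
        mu[greedy, s] += 1.0 - epsilon
    return mu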
def train(sess, env, agent, gamma=0.99):
    """Run one episode, then sweep it backwards accumulating the discounted
    return G and calling agent.train_step on each (s, a, G)."""
    episode = generate_episode(sess, env, agent)
    loss = 0
    G = 0
    for s, a, r in reversed(episode[:-1]):  # dropping the last transition with [:-1] converges faster
        G = G * gamma + r
        loss = agent.train_step(sess, [s, a, G])
    return loss, len(episode) - 1
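The explicit sess argument implies a TF1-style setup whose surrounding code is not shown; a hypothetical driver for train(), where env, agent, and the episode budget are assumptions made only for illustration:

import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for ep in range(500):                       # hypothetical episode budget
        loss, ep_len = train(sess, env, agent)  # env and agent constructed elsewhere
        if ep % 50 == 0:
            print('episode {:4d}  loss {}  length {}'.format(ep, loss, ep_len))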
def policy_mc_iterate(epsilon, max_iter=10, delta=10**-6, start_from=None):
    """Off-policy MC control where the behavior policy mu is an epsilon-greedy copy of the greedy pi."""
    # init random policy pi[a, s]
    pi = np.zeros((n + 1, n + 1))
    for s in range(1, n + 1):
        pi[1:s + 1, s] = np.ones(s) * (1. / s)
    # init uniform policy mu[a, s] that will be the behavior policy
    mu = np.zeros((n + 1, n + 1))
    for s in range(1, n + 1):
        mu[1:s + 1, s] = np.ones(s) * (1. / s)
    # TODO: assert that pi and mu sum to 1 over actions
    q = np.zeros((n, n + 1))
    C = np.zeros((n, n + 1))  # cumulative importance-sampling weights
    for i in range(max_iter):
        print("EPISODE", i)
        states, actions, rewards = generate_episode(mu, p_h, n, start_from=start_from)
        # print("states", states)
        # print("actions", actions)
        # print("rewards:", rewards)
        G = 0
        W = 1
        for t in reversed(range(0, len(states) - 1)):
            a = actions[t]
            s = states[t]
            G = G + rewards[t + 1]
            C[a, s] = C[a, s] + W
            q[a, s] = q[a, s] + (W / C[a, s]) * (G - q[a, s])
            # greedify: make pi deterministic on the best bet
            pi[:, s] = np.zeros(n + 1)
            action_max = np.argmax(q[1:, s]) + 1  # add 1 because we throw away bet 0
            pi[action_max, s] = 1
            if action_max != a:
                # pi[a, s] = 0 for any non-greedy action, so the importance weight would drop to 0
                break
            W = W / mu[a, s]
        # if np.linalg.norm(old_q - q) < delta:
        #     print("break at iteration", i)
        #     break
        mu = epsilon_greedify(pi, epsilon)
    return q, pi
def run_single(self, i, results):
    algorithm = self.algorithm_factory.create(*self.algo_params)
    for episode in range(self.n_episodes):
        steps = generate_episode(self.env, algorithm, render=False)
        results[episode] += steps
        print('Run: {:2}, params: {}, ep: {:3}, steps: {:4}'.format(
            i, self.algo_params, episode, steps))
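run_single accumulates per-episode step counts into a shared results buffer; a hypothetical driver that repeats it and averages over runs, where run_all, experiment, and n_runs are names invented for illustration and not part of the original experiment class:

import numpy as np

def run_all(experiment, n_runs=10):
    # each run adds its step count per episode; divide at the end to get the average curve
    results = np.zeros(experiment.n_episodes)
    for i in range(n_runs):
        experiment.run_single(i, results)
    return results / n_runs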
        if done:
            self._reset()


if __name__ == '__main__':
    n_servers = 10
    env = AccessControlQueueTimeLimit(max_episode_steps=int(1e6), free_prob=0.06, n_servers=n_servers)
    value_function = ValueFunction(8, 2048, len(AccessControlQueue.PRIORITIES), n_servers)
    algorithm = DifferentialSemiGradientSarsa(env, value_function, alpha=0.01 / value_function.n_tilings)
    generate_episode(env, algorithm, print_step=True)

    policy = value_function.to_policy(algorithm.actions, AccessControlQueue.PRIORITIES, np.arange(n_servers + 1))
    values = value_function.to_value(algorithm.actions, AccessControlQueue.PRIORITIES, np.arange(n_servers + 1))

    fig = tools.make_subplots(rows=1, cols=2)
    fig.append_trace(
        go.Heatmap(z=policy, x=np.arange(n_servers + 1), y=AccessControlQueue.REWARDS, name='Policy'),
        1, 1)
    for i, row in enumerate(values):
import numpy as np
import torch

state_space = env.observation_space.shape[0]
action_space = env.action_space.n
alpha = 0.001

policy_network = Policynetwork(state_space, action_space)
optimizer = torch.optim.Adam(policy_network.parameters(), lr=alpha)

## TRAIN
episodes = 1000
rewards = []
for episode in range(episodes):
    # returns (s, a, r, s') tuples for the episode and the list of per-step rewards
    episode_experience, reward_list = generate_episode(env, policy_network)
    total_reward = np.sum(reward_list)  # total reward of the episode
    rewards.append(total_reward)        # store total reward of each episode

    loss = 0
    for i, sars in enumerate(episode_experience):
        state, action, _, nextstate = sars
        # Monte Carlo reward-to-go from step i, with the mean episode reward as a baseline;
        # summing only from i onward is what incorporates causality (earlier rewards
        # do not weight later actions)
        target_reward = np.sum(reward_list[i:]) - np.mean(reward_list)
        reward_weight = torch.tensor(target_reward)
        neg_logprob = -torch.log(policy_network(state))[action]
        loss += neg_logprob * reward_weight

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
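The REINFORCE-style script above relies on Policynetwork and generate_episode, neither of which is shown; below is a minimal sketch consistent with the call sites (policy_network(state)[action] must be a probability, and the episode helper must return (s, a, r, s') tuples plus the reward list). The hidden size and the classic 4-tuple gym step API are assumptions.

import torch
import torch.nn as nn

class Policynetwork(nn.Module):
    """Small MLP mapping a state vector to a probability distribution over discrete actions."""
    def __init__(self, state_space, action_space, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_space, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_space),
            nn.Softmax(dim=-1),
        )

    def forward(self, state):
        return self.net(state)

def generate_episode(env, policy_network):
    """Roll out one episode, returning the (s, a, r, s') tuples and the list of rewards."""
    episode, reward_list = [], []
    state = torch.as_tensor(env.reset(), dtype=torch.float32)
    done = False
    while not done:
        with torch.no_grad():
            probs = policy_network(state)                 # action probabilities for the current state
        action = torch.multinomial(probs, 1).item()       # sample an action index
        next_obs, reward, done, _ = env.step(action)
        next_state = torch.as_tensor(next_obs, dtype=torch.float32)
        episode.append((state, action, reward, next_state))
        reward_list.append(reward)
        state = next_state
    return episode, reward_list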