Code Example #1
def policy_mc_iterate(n_iter=10, start_from=None, cross_entropy_call=None):
    # init random policy pi[a,s]
    pi = np.zeros((n + 1, n + 1))
    for s in range(1, n + 1):
        pi[1:s + 1, s] = np.ones(s) * (1. / s)

    if cross_entropy_call is None:
        # init uniform policy mu[a,s] that will be the behavior policy
        mu = np.zeros((n + 1, n + 1))
        for s in range(1, n + 1):
            mu[1:s + 1, s] = np.ones(s) * (1. / s)
    else:
        mu = cross_entropy_call

    # TODO: assert that pi and mu sum to 1 over actions

    q = np.zeros((n, n + 1))
    C = np.zeros((n, n + 1))
    for i in range(1, n_iter):
        if cross_entropy_call is None:
            print("EPISODE", i)
            if i % 1000 == 0:  # update behavior
                _, mu = policy_mc_iterate(n_iter=1000,
                                          start_from=None,
                                          cross_entropy_call=mu)
                mu = epsilon_greedify(mu, 0.05)

            states, actions, rewards = generate_episode(mu,
                                                        p_h,
                                                        n,
                                                        start_from=start_from)
        else:
            start = ((i + 1) % n + 1, None)
            states, actions, rewards = generate_episode(mu, p_h, n, start)
        #print "states", states
        #print "actions", actions
        #print "rewards:", rewards
        G = 0
        W = 1
        for t in reversed(range(0, len(states) - 1)):
            a = actions[t]
            s = states[t]
            G = G + rewards[t + 1]
            C[a, s] = C[a, s] + W
            q[a, s] = q[a, s] + (W / C[a, s]) * (G - q[a, s])
            # greedify:
            pi[:, s] = np.zeros(n + 1)
            action_max = np.argmax(q[1:, s]) + 1
            # add 1 to action_max because we throw away bet 0
            pi[action_max, s] = 1
            if action_max != a:  # i.e. pi[a, s] = 0, so the importance weight becomes 0
                break
            W = W / mu[a, s]

    return q, pi
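
The listing calls an epsilon_greedify helper that is not shown. As a rough sketch (an assumption, not the original project's code), keeping the same pi[a, s] layout in which state s allows bets 1..s, it could mix the greedy policy with a uniform distribution over legal bets:

import numpy as np

def epsilon_greedify(pi, epsilon):
    """Return an epsilon-soft copy of pi (assumed layout: pi[a, s], legal bets
    in state s are 1..s): with probability epsilon pick a legal bet uniformly,
    otherwise follow pi."""
    n = pi.shape[1] - 1
    mu = np.zeros_like(pi)
    for s in range(1, n + 1):
        uniform = np.zeros(n + 1)
        uniform[1:s + 1] = 1. / s
        mu[:, s] = (1 - epsilon) * pi[:, s] + epsilon * uniform
    return mu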
Code Example #2
def train(sess, env, agent, gamma=0.99):
    episode = generate_episode(sess, env, agent)

    loss = 0
    G = 0
    for s, a, r in reversed(episode[:-1]):  # [:-1] skips the last entry; converges faster
        G = G * gamma + r
        loss = agent.train_step(sess, [s, a, G])

    return loss, len(episode) - 1
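
The generate_episode helper and the agent interface used by this loop are not shown. A minimal sketch of an episode generator compatible with it (the agent.act method and the classic 4-tuple gym step API are assumptions):

def generate_episode(sess, env, agent, max_steps=10000):
    """Roll out one episode; return [(state, action, reward), ...] plus a
    terminal entry, so that episode[:-1] holds exactly the real transitions."""
    episode = []
    state = env.reset()
    for _ in range(max_steps):
        action = agent.act(sess, state)            # hypothetical action-selection method
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    episode.append((state, None, 0.0))             # terminal marker skipped by episode[:-1]
    return episode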
Code Example #3
def policy_mc_iterate(epsilon, max_iter=10, delta=10**-6, start_from=None):
    # init random policy pi[a,s]
    pi = np.zeros((n + 1, n + 1))
    for s in range(1, n + 1):
        pi[1:s + 1, s] = np.ones(s) * (1. / s)

    # init uniform policy mu[a,s] that will be the behavior policy
    mu = np.zeros((n + 1, n + 1))
    for s in range(1, n + 1):
        mu[1:s + 1, s] = np.ones(s) * (1. / s)

    # TODO: assert that pi and mu sum to 1 over actions

    q = np.zeros((n, n + 1))
    C = np.zeros((n, n + 1))
    for i in range(max_iter):
        print "EPISODE", i
        states, actions, rewards = generate_episode(mu,
                                                    p_h,
                                                    n,
                                                    start_from=start_from)
        #print "states", states
        #print "actions", actions
        #print "rewards:", rewards
        G = 0
        W = 1
        for t in reversed(range(0, len(states) - 1)):
            a = actions[t]
            s = states[t]
            G = G + rewards[t + 1]
            C[a, s] = C[a, s] + W
            q[a, s] = q[a, s] + (W / C[a, s]) * (G - q[a, s])
            # greedify:
            pi[:, s] = np.zeros(n + 1)
            action_max = np.argmax(q[1:, s]) + 1
            # add 1 to action_max because we throw away bet 0
            pi[action_max, s] = 1
            if action_max != a:  # i.e. pi[a, s] = 0, so the importance weight becomes 0
                break
            W = W / mu[a, s]

        #if np.linalg.norm(old_q - q) < delta:
        #    print "break at iteration", i
        #    break
        mu = epsilon_greedify(pi, epsilon)
    return q, pi
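
Both Monte Carlo listings also rely on generate_episode, the goal capital n, and the heads probability p_h, none of which are shown. A sketch of a gambler's-problem rollout compatible with the indexing above (rewards[t + 1] is the reward for leaving states[t]; the tuple-valued start used in the cross-entropy branch of code example #1 is not handled here) could be:

import numpy as np

n = 100      # goal capital (assumption; defined globally in the original)
p_h = 0.4    # probability the coin lands heads (assumption)

def generate_episode(mu, p_h, n, start_from=None):
    """Play one gambler's-problem episode under the behavior policy mu[a, s]."""
    s = np.random.randint(1, n) if start_from is None else start_from
    states, actions, rewards = [s], [], [0.0]
    while 0 < s < n:
        a = np.random.choice(n + 1, p=mu[:, s])    # bet sampled from mu(. | s)
        actions.append(a)
        s = s + a if np.random.rand() < p_h else s - a
        s = min(s, n)                              # cap at the goal (the init above allows bets above n - s)
        states.append(s)
        rewards.append(1.0 if s == n else 0.0)     # +1 only when the goal is reached
    return states, actions, rewards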
Code Example #4
def run_single(self, i, results):
    algorithm = self.algorithm_factory.create(*self.algo_params)
    for episode in range(self.n_episodes):
        steps = generate_episode(self.env, algorithm, render=False)
        results[episode] += steps
        print('Run: {:2}, params: {}, ep: {:3}, steps: {:4}'.format(i, self.algo_params, episode, steps))


if __name__ == '__main__':
    n_servers = 10
    env = AccessControlQueueTimeLimit(max_episode_steps=int(1e6),
                                      free_prob=0.06,
                                      n_servers=n_servers)
    value_function = ValueFunction(8, 2048, len(AccessControlQueue.PRIORITIES),
                                   n_servers)
    algorithm = DifferentialSemiGradientSarsa(env,
                                              value_function,
                                              alpha=0.01 / value_function.n_tilings)
    generate_episode(env, algorithm, print_step=True)

    policy = value_function.to_policy(algorithm.actions,
                                      AccessControlQueue.PRIORITIES,
                                      np.arange(n_servers + 1))
    values = value_function.to_value(algorithm.actions,
                                     AccessControlQueue.PRIORITIES,
                                     np.arange(n_servers + 1))

    fig = tools.make_subplots(rows=1, cols=2)
    fig.append_trace(
        go.Heatmap(z=policy,
                   x=np.arange(n_servers + 1),
                   y=AccessControlQueue.REWARDS,
                   name='Policy'), 1, 1)
    for i, row in enumerate(values):
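
The excerpt ends in the middle of this loop over values. A plausible continuation (purely a guess, not the project's code) would add one value curve per priority level in the second subplot and then render the figure, assuming plotly.offline is also imported:

for i, row in enumerate(values):
    fig.append_trace(
        go.Scatter(x=np.arange(n_servers + 1),
                   y=row,
                   name='Priority {}'.format(AccessControlQueue.PRIORITIES[i])),
        1, 2)

plotly.offline.plot(fig, filename='access_control.html')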
Code Example #6
File: REINFORCE.py Project: jayeshk7/RL-Algorithms
state_space = env.observation_space.shape[0]
action_space = env.action_space.n

alpha = 0.001
policy_network = Policynetwork(state_space, action_space)
optimizer = torch.optim.Adam(policy_network.parameters(), lr = alpha)

## TRAIN

episodes = 1000
rewards = []

for episode in range(episodes):
    
    episode_experience, reward_list = generate_episode(env, policy_network)          # RETURNS (S,A,R,S') TUPLES OF THE EPISODE AND LIST OF REWARDS OBTAINED AT EACH STEP
    total_reward = np.sum(reward_list)                  # TOTAL REWARD OF THE EPISODE
    rewards.append(total_reward)                        # STORING TOTAL REWARD OF EACH EPISODE
    loss = 0

    for i,sars in enumerate(episode_experience):
        
        state, action, _, nextstate = sars
        target_reward = np.sum(reward_list[i:]) - np.mean(reward_list)     # reward-to-go from step i (causality) minus a mean-reward baseline
        reward_weight = torch.tensor(target_reward)                        # Monte Carlo return estimate used to weight the log-probability
        action_logprob = -torch.log(policy_network(state))[action]
        loss += action_logprob*reward_weight

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
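
The snippet leaves out env (presumably created with gym.make beforehand), Policynetwork, and generate_episode. A minimal sketch of the two helpers it appears to assume (the network architecture and the classic 4-tuple gym step API are guesses, not the project's code):

import torch
import torch.nn as nn

class Policynetwork(nn.Module):
    """Small MLP mapping a state to a softmax distribution over actions."""
    def __init__(self, state_space, action_space, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_space, hidden),
                                 nn.ReLU(),
                                 nn.Linear(hidden, action_space),
                                 nn.Softmax(dim=-1))

    def forward(self, state):
        return self.net(state)

def generate_episode(env, policy_network):
    """Roll out one episode; return the list of (s, a, r, s') tuples and the
    per-step reward list, with states stored as torch tensors so that
    policy_network(state) works in the training loop above."""
    experience, reward_list = [], []
    state = torch.as_tensor(env.reset(), dtype=torch.float32)
    done = False
    while not done:
        with torch.no_grad():
            probs = policy_network(state)
        action = torch.multinomial(probs, 1).item()    # sample an action from the policy
        next_obs, reward, done, _ = env.step(action)
        next_state = torch.as_tensor(next_obs, dtype=torch.float32)
        experience.append((state, action, reward, next_state))
        reward_list.append(reward)
        state = next_state
    return experience, reward_list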