def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(5),
            GridworldEnv(4), GridworldEnv(6)], batch_size=128, gamma=0.999, alpha=0.8,
            beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5,
            is_plot=False, num_episodes=200,
            max_num_steps_per_episode=1000, learning_rate=0.001,
            memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size, num_actions) for _ in range(num_envs)]  # TODO: consider wrapping in torch.nn.ModuleList
    memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(num_envs)]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                    for model in models]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    distilled_logits_magnitude = np.zeros((num_episodes, num_envs))
    policy_logits_magnitude = np.zeros((num_episodes, num_envs))
    # keep track of num of times a random action is picked
    num_rand = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(torch.from_numpy(env.reset()).type(torch.FloatTensor).view(-1, input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization follows an alternating minimization scheme:
        #   1. take one step in each environment
        #   2. do one soft Q-learning optimization step for each task model
        #   3. do one optimization step for the distilled policy

        for i_env, env in enumerate(list_of_envs):
        
            # select an action
            action, pi_0_norm, pi_i_norm = select_action(states[i_env], policy, models[i_env], num_actions,
                                    eps_start, eps_end, eps_decay,
                                    episodes_done[i_env], alpha, beta)

            if episodes_done[i_env] < num_episodes:
                if pi_0_norm + pi_i_norm == 0:
                    num_rand[i_env] += 1
                else:
                    distilled_logits_magnitude[int(episodes_done[i_env]), i_env] += pi_0_norm
                    policy_logits_magnitude[int(episodes_done[i_env]), i_env] += pi_i_norm

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward, time)

            # Perform one soft Q-learning optimization step on this task's model
            optimize_model(policy, models[i_env], optimizers[i_env],
                            memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print("ENV:", i_env, "iter:", episodes_done[i_env],
                        "\treward:{0:.2f}".format(env.episode_total_reward),
                        "\tit:", current_time[i_env], "\texp_factor:", eps_end +
                        (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))


                if episodes_done[i_env] < num_episodes:
                    # average the cumulative norms
                    distilled_logits_magnitude[int(episodes_done[i_env]), i_env] /= (current_time[i_env] - num_rand[i_env])
                    policy_logits_magnitude[int(episodes_done[i_env]), i_env] /= (current_time[i_env] - num_rand[i_env])
                    num_rand[i_env] = 0

                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)


        optimize_policy(policy, policy_optimizer, memories, batch_size,
                    num_envs, gamma, alpha, beta)

    print('Complete')
    for env in list_of_envs:
        env.render(close=True)
        env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)
    np.save(file_name + '-beta-distilled_logit_norms', distilled_logits_magnitude)
    np.save(file_name + '-beta-policy_logit_norms', policy_logits_magnitude)

    return models, policy, episode_rewards, episode_durations
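
A minimal usage sketch for trainD, assuming GridworldEnv, the network classes, and the
helper functions used above are importable from the surrounding module; the keyword
values below are illustrative rather than tuned:

if __name__ == "__main__":
    # Train a Distral agent on three related gridworld tasks (illustrative settings).
    models, policy, rewards, durations = trainD(
        file_name="Distral_example",
        list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6)],
        num_episodes=100,
        is_plot=False,
    )
    # rewards[i] and durations[i] hold the per-episode logs of environment i.
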
def trainSQL(file_name="SQL",
             env=GridworldEnv(1),
             batch_size=128,
             gamma=0.999,
             beta=5,
             eps_start=0.9,
             eps_end=0.05,
             eps_decay=1000,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.001,
             memory_replay_size=10000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0
    # plt.ion()
    for i_episode in range(num_episodes):
        print("Cur episode:", i_episode, "steps done:", t,
                "exploration factor:", eps_end + (eps_start - eps_end) * \
                math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one soft Q-learning optimization step on the model
            optimize_model(model, optimizer, memory, batch_size, gamma, beta)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-sql-rewards', episode_rewards)
    np.save(file_name + '-sql-durations', episode_durations)

    return model, episode_rewards, episode_durations
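
The "exploration factor" printed above follows an exponential decay in steps_done; a
small standalone sketch of that schedule, using the same default values as trainSQL:

import math

def exploration_factor(steps_done, eps_start=0.9, eps_end=0.05, eps_decay=1000):
    # Anneal epsilon exponentially from eps_start toward eps_end.
    return eps_end + (eps_start - eps_end) * math.exp(-1.0 * steps_done / eps_decay)

print(exploration_factor(0))      # 0.9 at the start of training
print(exploration_factor(1000))   # ~0.36 after eps_decay steps
print(exploration_factor(5000))   # ~0.056, close to eps_end
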
Example 3
    # initialize the TD learner
    td_learner = TDLearner(env, **mc_params)

    # train the TD learner
    t0 = time()
    episode_rewards = []
    for idx in range(n_epochs):
        total_reward = td_learner.run_episode(env)

        print('Total reward on epoch {}/{}:\t{}'.format(
            idx + 1, n_epochs, total_reward))

        episode_rewards.append(total_reward)
    print('\nTraining took {} mins'.format((time() - t0) / 60.))

    env_name = env.spec.id
    policy = 'off' if off_policy else 'on'
    param_str = '{}Policy_lr{:.3E}_ntilings{}_griddim{}x{}'\
                    .format(policy, learning_rate, n_tilings, *grid_dims)
    save_path = 'plots/td_learner_{}.png'.format(param_str)
    plot_rewards(episode_rewards, save_path, env_name)

    print('Executing greedy policy\n')
    td_learner.epsilon = 0
    for idx in range(10):
        total_reward = td_learner.run_episode(env, render=True)

        print('Total reward on greedy epoch {}/{}:\t{}\n'.format(
            idx + 1, 10, total_reward))
        # Move to the next state
        state = next_state
    cumulative_rewards.append(cum_reward)
    writer.add_scalar('Training ' + env_name, cum_reward, ep)
    # Update the target network, copying all weights and biases in DQN
    # Uncomment for Task 4
    if ep % TARGET_UPDATE == 0:
        agent.update_target_network()
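    # A typical implementation of update_target_network() (not shown in this snippet)
    # simply copies the online network's parameters into the target network, e.g.:
    #     self.target_net.load_state_dict(self.policy_net.state_dict())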

    # Save the policy
    # Uncomment for Task 4
    if ep % 1000 == 0:
        torch.save(agent.policy_net.state_dict(),
                   "weights_%s_%d.mdl" % (env_name, ep))

plot_rewards(cumulative_rewards)
plt.savefig("plots/task-4b.png")
print('Complete')
plt.ioff()
plt.show()

# Task 3 - plot the policy

# Values used for the discretization
discr = 16
x_min, x_max = -2.4, 2.4
th_min, th_max = -0.3, 0.3

# Fixed values
v = 0
av = 0
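
The grid above fixes the cart velocity and pole angular velocity and discretizes cart
position and pole angle; a sketch of how the greedy action could be evaluated on that
grid, assuming numpy, torch, and matplotlib are imported as np/torch/plt in the full
script and that agent.policy_net from the snippet above maps a 4-dimensional CartPole
state [position, velocity, angle, angular velocity] to Q-values (the output file name
is illustrative):

x_grid = np.linspace(x_min, x_max, discr)
th_grid = np.linspace(th_min, th_max, discr)
greedy_actions = np.zeros((discr, discr))

for i, x in enumerate(x_grid):
    for j, th in enumerate(th_grid):
        state = torch.tensor([x, v, th, av], dtype=torch.float32)
        with torch.no_grad():
            greedy_actions[i, j] = agent.policy_net(state).argmax().item()

plt.imshow(greedy_actions.T, origin="lower",
           extent=[x_min, x_max, th_min, th_max], aspect="auto")
plt.xlabel("cart position")
plt.ylabel("pole angle")
plt.savefig("plots/task-3-policy.png")
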
Example 5
def main():
    np.random.seed(2)
    tf.set_random_seed(2)  # reproducible

    sess = tf.Session()
    hp = Hyperparameters()

    env = gym.make('CartPole-v0')
    env.seed(1)  # reproducible
    env = env.unwrapped

    actor = Actor(sess, n_features=hp.N_F, n_actions=hp.N_A, lr=hp.LR_A)
    # we need a good teacher, so the teacher should learn faster than the actor
    critic = Critic(sess, n_features=hp.N_F, lr=hp.LR_C, discount=hp.GAMMA)

    sess.run(tf.global_variables_initializer())

    if hp.OUTPUT_GRAPH:
        tf.summary.FileWriter("./logs/", sess.graph)

    running_rewards = []

    for i_episode in range(hp.MAX_EPISODE):
        s = env.reset()
        # sanity check: the observation should not contain NaNs
        assert not np.any(np.isnan(s))
        t = 0
        track_r = []
        while True:
            if hp.RENDER:
                env.render()

            a, probs = actor.choose_action(s)
            if i_episode == 0:
                write_file('./logs/probs.txt', probs, True)
            else:
                write_file('./logs/probs.txt', probs, False)
            # print('------------------------------------', probs)

            s_, r, done, info = env.step(a)

            assert not np.any(np.isnan(s_))

            if done:
                r = -20

            track_r.append(r)

            td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
            exp_v = actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]
            # # debug mode # #
            # exp_v, act_prob, log_prob, l1 = actor.learn(s, a, td_error)
            # # debug mode # #

            s = s_
            t += 1

            if done or t >= hp.MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)

                if 'running_reward' not in globals() and 'running_reward' not in locals():
                    running_reward = ep_rs_sum
                else:
                    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05

                running_rewards.append(running_reward)
                # print(len(running_rewards))
                if len(running_rewards) % 1000 == 0:
                    write_file('./logs/rewards_' + str(i_episode) + '.txt', running_rewards, True)
                    y_axis_ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
                    plot_rewards(running_rewards, y_axis_ticks, './logs/' + str(i_episode) + '/')
                if running_reward > hp.DISPLAY_REWARD_THRESHOLD:
                    hp.RENDER = True  # rendering
                # # debug mode # #
                # print('\naction:', a, 'td_error:', td_error, 'exp_v:', exp_v, 'act_prob:', act_prob, 'log_prob:',
                #       log_prob, 'l1:', l1)
                # # debug mode # #
                print('episode:', i_episode, ' running reward:', int(running_reward),
                      ' episode reward:', ep_rs_sum, ' td_error:', td_error, ' exp_v:', exp_v)
                break
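
The comments above describe the updates performed inside Critic.learn and Actor.learn;
a minimal tabular NumPy sketch of those two rules (the real classes use TensorFlow
networks, so this is only an illustration of the math):

import numpy as np

def actor_critic_step(V, theta, s, a, r, s_, gamma=0.9, lr_v=0.1, lr_pi=0.01):
    # Critic: TD error, delta = r + gamma * V(s') - V(s); move V(s) toward the target.
    td_error = r + gamma * V[s_] - V[s]
    V[s] += lr_v * td_error

    # Actor: ascend grad[log pi(a|s)] * td_error for a softmax policy over logits theta[s].
    probs = np.exp(theta[s] - theta[s].max())
    probs /= probs.sum()
    grad_log_pi = -probs
    grad_log_pi[a] += 1.0
    theta[s] += lr_pi * td_error * grad_log_pi
    return td_error

# Example with 3 states and 2 actions.
V = np.zeros(3)
theta = np.zeros((3, 2))
actor_critic_step(V, theta, s=0, a=1, r=1.0, s_=2)
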
Example 6
def trainSQL0(file_name="SQL0", env=GridworldEnv(1), batch_size=128,
            gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000,
            is_plot=False, num_episodes=500, max_num_steps_per_episode=1000,
            learning_rate=0.001, memory_replay_size=10000):
    """
    Soft Q-learning training routine when observation vector is input
    Retuns rewards and durations logs.
    Plot environment screen
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0
    # plt.ion()
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                    "exploration factor:", eps_end + (eps_start - eps_end) * \
                    math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward)
        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(-1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions,
                                    eps_start, eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one soft Q-learning optimization step on the model
            optimize_model(model, optimizer, memory, batch_size, gamma, beta)  #### Difference w.r.t DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
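
The arrays saved above can be reloaded for offline analysis; a small sketch, assuming
the default file_name="SQL0" and that matplotlib is available:

import numpy as np
import matplotlib.pyplot as plt

rewards = np.load("SQL0-sql0-rewards.npy")
durations = np.load("SQL0-sql0-durations.npy")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(rewards)
ax1.set_xlabel("episode")
ax1.set_ylabel("total reward")
ax2.plot(durations)
ax2.set_xlabel("episode")
ax2.set_ylabel("steps per episode")
fig.tight_layout()
plt.show()
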