Example #1
def run_actions(env: gridworld_env.GridworldEnv, actions):
    env.reset()
    for action in actions:
        print(action)
        _, _, is_done, _ = env.update(action)
        env.display()
        time.sleep(1)
        if is_done:
            break
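
A hypothetical usage sketch for run_actions (the GridworldEnv constructor argument and the action indices are made up for illustration; it assumes the gridworld_env and time imports that run_actions itself relies on):

import time
import gridworld_env

if __name__ == '__main__':
    env = gridworld_env.GridworldEnv(1)
    # Step through a short, hand-picked action sequence and watch the display update.
    run_actions(env, actions=[0, 1, 1, 2, 3])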
Example #2
def trainA3C(file_name="A3C",
             env=GridworldEnv(1),
             update_global_iter=10,
             gamma=0.999,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.0001):
    """
    A3C training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    ns = env.observation_space.shape[0]  ## Line to fix for arbitrary environment
    na = env.action_space.n

    gnet = Net(ns, na)  # global network
    gnet.share_memory()  # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=learning_rate)  # global optimizer
    global_ep = mp.Value('i', 0)     # shared episode counter
    global_ep_r = mp.Value('d', 0.)  # shared running episode reward
    res_queue = mp.Queue()           # queue the workers push per-episode rewards into

    # parallel training
    workers = [
        Worker(gnet, opt, global_ep, global_ep_r, res_queue, i,
               update_global_iter, num_episodes, max_num_steps_per_episode,
               gamma, env, ns, na) for i in range(mp.cpu_count())
    ]

    for w in workers:
        w.start()
    episode_rewards = []  # record episode reward to plot
    while True:
        r = res_queue.get()
        if r is not None:
            episode_rewards.append(r)
        else:
            break
    for w in workers:
        w.join()

    #Store results
    np.save(file_name + '-a3c-rewards', episode_rewards)

    return episode_rewards
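
A minimal invocation sketch in the style of the __main__ blocks used further below (file name and hyperparameters are hypothetical; it assumes the Net, Worker, SharedAdam, and GridworldEnv definitions used by trainA3C are in scope):

if __name__ == '__main__':
    # The __main__ guard matters here because trainA3C spawns one Worker process per CPU core.
    rewards = trainA3C(file_name="A3C-grid1",
                       env=GridworldEnv(1),
                       num_episodes=500,
                       learning_rate=0.0001)
    print("episodes finished:", len(rewards),
          "mean reward over the last 50:", np.mean(rewards[-50:]))
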
def trainDistral(file_name="Distral_1col_AC",
                 list_of_envs=[GridworldEnv(4),
                               GridworldEnv(5)],
                 batch_size=128,
                 gamma=0.95,
                 alpha=0.8,
                 beta=5,
                 num_episodes=200,
                 max_num_steps_per_episode=1000,
                 learning_rate=0.001,
                 n_step=1):

    # Specify Environment conditions
    input_size = list_of_envs[0].observation_space.shape[0]
    num_actions = list_of_envs[0].action_space.n
    tasks = len(list_of_envs)

    # Define our set of policies, including distilled one
    models = torch.nn.ModuleList(
        [Policy(input_size, num_actions) for _ in range(tasks)])
    distilled = Distilled(input_size, num_actions, tasks)
    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    opt_distilled = optim.Adam(distilled.parameters(), lr=learning_rate)

    # Store the total rewards
    episode_rewards = [[] for i in range(num_episodes)]
    episode_duration = [[] for i in range(num_episodes)]

    for i_episode in range(num_episodes):
        task_specific_losses = []

        # For each one of the envs
        for i_env, env in enumerate(list_of_envs):

            #Initialize state of envs
            state = env.reset()

            #Store total reward per environment per episode
            total_reward = 0

            # Store duration of each episode per env
            duration = 0

            for t in range(max_num_steps_per_episode):

                # Run our policy
                action = select_action(state, models[i_env], distilled, i_env)

                next_state, reward, done, _ = env.step(action.item())
                models[i_env].rewards.append(reward)
                total_reward += reward
                duration += 1

                if done:
                    break

                #Update state
                state = next_state

            episode_rewards[i_episode].append(total_reward)
            episode_duration[i_episode].append(duration)

            # Get the value estimate of the final state according to equation 7 of the Distral paper:
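            #   V(s) = (1/beta) * log( sum_a pi_0(a|s)^alpha * exp(beta * f(s, a)) )
            # where f are the task-specific action preferences and pi_0 is the distilled
            # policy; max(beta * f) is subtracted inside the exp below to avoid overflow.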
            next_state = torch.from_numpy(np.asarray(next_state)).float()
            _, action_pref_temp = models[i_env](next_state)
            pi_0_temp, _ = distilled(next_state)

            temp_term = beta * action_pref_temp - torch.max(
                beta * action_pref_temp)
            final_state_value = torch.log((torch.pow(pi_0_temp, alpha) *
                                           torch.exp(temp_term)).sum()) / beta

            if done:
                final_state_value = 0

            # Distill for each environment
            task_specific_losses.append(
                task_specific_update(models[i_env], distilled,
                                     optimizers[i_env], alpha, beta, gamma,
                                     final_state_value, i_env, n_step))

        finish_episode(task_specific_losses, models, distilled, opt_distilled,
                       alpha, beta, gamma)

        # if i_episode % args.log_interval == 0:
        for i in range(tasks):
            print('Episode: {}\tEnv: {}\tDuration: {}\tTotal Reward: {:.2f}'.
                  format(i_episode, i, episode_duration[i_episode][i],
                         episode_rewards[i_episode][i]))

    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_duration)

    print('Completed')


if __name__ == '__main__':
    trainDistral(list_of_envs=[GridworldEnv(7),
                               GridworldEnv(8)],
                 learning_rate=0.00005,
                 num_episodes=200)
Example #5
def trainSQL0(file_name="SQL0",
              env=GridworldEnv(1),
              batch_size=128,
              gamma=0.999,
              beta=5,
              eps_start=0.9,
              eps_end=0.05,
              eps_decay=1000,
              is_plot=False,
              num_episodes=200,
              max_num_steps_per_episode=1000,
              learning_rate=0.0001,
              memory_replay_size=10000,
              n_step=10,
              target_update=10):
    """
    Soft Q-learning training routine when observation vector is input
    Retuns rewards and durations logs.
    """

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    target_model = DQN(input_size, num_actions)
    target_model.load_state_dict(model.state_dict())
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size, n_step, gamma)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            exploration_factor = eps_end + (eps_start - eps_end) * \
                math.exp(-1. * steps_done / eps_decay)
            print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                  "exploration factor:", exploration_factor,
                  "reward:", env.episode_total_reward)
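        # The exploration factor printed above follows an exponential decay,
        #   eps(t) = eps_end + (eps_start - eps_end) * exp(-t / eps_decay),
        # with t = steps_done, which is incremented once per finished episode below.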
        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
            -1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(model, target_model, state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, target_model, optimizer, memory, batch_size,
                           gamma, beta)  #### Difference w.r.t DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break
        if i_episode % target_update == 0 and i_episode != 0:
            target_model.load_state_dict(model.state_dict())

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
Example #6


if __name__ == '__main__':
    # trainSQL0(env=GridworldEnv(4), learning_rate=0.00001, max_num_steps_per_episode=100, num_episodes=1000)
    trainSQL0(env=GridworldEnv(8),
              learning_rate=0.001,
              max_num_steps_per_episode=100,
              num_episodes=1000,
              n_step=10,
              target_update=100)
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(5),
                         GridworldEnv(4),
                         GridworldEnv(6)],
           batch_size=128,
           gamma=0.999,
           alpha=0.8,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Retuns rewards and durations logs.
    Plot environment screen
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size, num_actions)
              for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(
            torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
                -1, input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode

        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy
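        # (Step 1 corresponds to the select_action/env.step calls below, step 2 to the
        #  per-env optimize_model call, and step 3 to the optimize_policy call after the
        #  env loop.)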

        for i_env, env in enumerate(list_of_envs):

            # select an action
            action = select_action(states[i_env], policy, models[i_env],
                                   num_actions, eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward,
                                 time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print(
                        "ENV:", i_env, "iter:", episodes_done[i_env],
                        "\treward:{0:.2f}".format(env.episode_total_reward),
                        "\tit:", current_time[i_env], "\texp_factor:",
                        eps_end + (eps_start - eps_end) *
                        math.exp(-1. * episodes_done[i_env] / eps_decay))

                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations


if __name__ == '__main__':
    # trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5),GridworldEnv(6),GridworldEnv(7),GridworldEnv(8)], learning_rate=0.001, max_num_steps_per_episode=100, num_episodes=1000)
    # trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5),GridworldEnv(6),GridworldEnv(7),GridworldEnv(8)], learning_rate=0.001)
    trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           learning_rate=0.00001,
           beta=3)
                final_state_value = 0

            # Distill for each environment
            task_specific_losses.append(
                task_specific_update(models[i_env], distilled,
                                     optimizers[i_env], alpha, beta, gamma,
                                     final_state_value, i_env))

        finish_episode(task_specific_losses, models, distilled, opt_distilled,
                       alpha, beta, gamma)

        # if i_episode % args.log_interval == 0:
        for i in range(tasks):
            print('Episode: {}\tEnv: {}\tDuration: {}\tTotal Reward: {:.2f}'.
                  format(i_episode, i, episode_duration[i_episode][i],
                         episode_rewards[i_episode][i]))

    np.save(file_name + '-distral0-rewards', episode_rewards)
    np.save(file_name + '-distral0-durations', episode_duration)

    print('Completed')


if __name__ == '__main__':
    # trainDistral(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.0001, num_episodes=200)
    trainDistral(list_of_envs=[GridworldEnv(4),
                               GridworldEnv(5)],
                 learning_rate=0.0001,
                 num_episodes=200,
                 beta=5)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        # Perform one step of the optimization on the Distilled policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()

    ## Store Results
    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations


if __name__ == '__main__':
    trainD(list_of_envs=[
        GridworldEnv(4),
        GridworldEnv(5),
        GridworldEnv(6),
        GridworldEnv(7),
        GridworldEnv(8)
    ],
           learning_rate=0.001,
           max_num_steps_per_episode=100,
           num_episodes=1000,
           alpha=1.)
                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size)


        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations

if __name__ == '__main__':
    # trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5),GridworldEnv(6),GridworldEnv(7),GridworldEnv(8)], learning_rate=0.001, max_num_steps_per_episode=100, num_episodes=1000)
    # trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5),GridworldEnv(6),GridworldEnv(7),GridworldEnv(8)], learning_rate=0.001)
    trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5)], learning_rate=0.001, beta=4)
                    default=543,
                    metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--render',
                    action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval',
                    type=int,
                    default=10,
                    metavar='N',
                    help='interval between training status logs (default: 10)')
args = parser.parse_args()

#env = gym.make('CartPole-v0')

env = GridworldEnv(6)
env.seed(args.seed)
#torch.manual_seed(args.seed)

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(3, 128)
        self.action_head = nn.Linear(128, 4)
        self.value_head = nn.Linear(128, 1)

        self.saved_actions = []
        self.rewards = []
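
The snippet is cut off before the forward pass. A sketch of the usual actor-critic forward for this layer layout (an assumption based on the layer names, not necessarily the original code; it presumes torch.nn.functional is imported as F):

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_probs = F.softmax(self.action_head(x), dim=-1)  # categorical policy over the 4 actions
        state_value = self.value_head(x)  # scalar state-value estimate
        return action_probs, state_value
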
                current_time[i_env] = 0

                states[i_env] = torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)


        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results

    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)
    np.save(file_name + '-beta-distilled_logit_norms', distilled_logits_magnitude)
    np.save(file_name + '-beta-policy_logit_norms', policy_logits_magnitude)

    return models, policy, episode_rewards, episode_durations

if __name__ == '__main__':
    # trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5),GridworldEnv(6),GridworldEnv(7),GridworldEnv(8)], learning_rate=0.001, max_num_steps_per_episode=100, num_episodes=1000)
    trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5),GridworldEnv(6),GridworldEnv(7),GridworldEnv(8)], learning_rate=0.001)
    # trainD(list_of_envs=[GridworldEnv(4),GridworldEnv(5)], learning_rate=0.001)
Example #14
            pi_0_temp, _ = distilled(next_state)

            temp_term = beta*action_pref_temp - torch.max(beta*action_pref_temp)
            final_state_value = torch.log((torch.pow(pi_0_temp, alpha) * torch.exp(temp_term)).sum()) / beta            

            if done:
                final_state_value = 0

            # Distill for each environment
            task_specific_losses.append(task_specific_update(models[i_env], distilled,
                                                             optimizers[i_env], alpha,
                                                             beta, gamma, final_state_value,
                                                             i_env))

        finish_episode(task_specific_losses, models, distilled, opt_distilled, alpha, beta, gamma)

        # if i_episode % args.log_interval == 0:
        for i in range(tasks):
            print('Episode: {}\tEnv: {}\tDuration: {}\tTotal Reward: {:.2f}'.format(
                i_episode, i, episode_duration[i_episode][i], episode_rewards[i_episode][i]))


    np.save(file_name + '-distral0-rewards', episode_rewards)
    np.save(file_name + '-distral0-durations', episode_duration)

    print('Completed')

if __name__ == '__main__':
    # trainDistral(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.0001, num_episodes=200)
    trainDistral(list_of_envs=[GridworldEnv(4), GridworldEnv(5)], learning_rate=0.00025, num_episodes=200, beta=5)